diff --git "a/checkpoint-10950/trainer_state.json" "b/checkpoint-10950/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-10950/trainer_state.json" @@ -0,0 +1,76684 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 10950, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009132420091324201, + "grad_norm": 68.06432342529297, + "learning_rate": 0.0, + "loss": 0.9354, + "step": 1 + }, + { + "epoch": 0.0018264840182648401, + "grad_norm": 57.40131759643555, + "learning_rate": 9.1324200913242e-09, + "loss": 1.8321, + "step": 2 + }, + { + "epoch": 0.0027397260273972603, + "grad_norm": 16.884729385375977, + "learning_rate": 1.82648401826484e-08, + "loss": 0.1733, + "step": 3 + }, + { + "epoch": 0.0036529680365296802, + "grad_norm": 132.2845001220703, + "learning_rate": 2.7397260273972606e-08, + "loss": 2.2325, + "step": 4 + }, + { + "epoch": 0.0045662100456621, + "grad_norm": 50.25056076049805, + "learning_rate": 3.65296803652968e-08, + "loss": 0.4262, + "step": 5 + }, + { + "epoch": 0.005479452054794521, + "grad_norm": 139.25555419921875, + "learning_rate": 4.5662100456621004e-08, + "loss": 4.4872, + "step": 6 + }, + { + "epoch": 0.006392694063926941, + "grad_norm": 11.410428047180176, + "learning_rate": 5.479452054794521e-08, + "loss": 0.087, + "step": 7 + }, + { + "epoch": 0.0073059360730593605, + "grad_norm": 26.781200408935547, + "learning_rate": 6.392694063926942e-08, + "loss": 0.2724, + "step": 8 + }, + { + "epoch": 0.00821917808219178, + "grad_norm": 117.16572570800781, + "learning_rate": 7.30593607305936e-08, + "loss": 0.8946, + "step": 9 + }, + { + "epoch": 0.0091324200913242, + "grad_norm": 87.38581848144531, + "learning_rate": 8.219178082191781e-08, + "loss": 2.0773, + "step": 10 + }, + { + "epoch": 0.01004566210045662, + "grad_norm": 38.41877746582031, + "learning_rate": 9.132420091324201e-08, + "loss": 0.341, + "step": 11 + }, + { + "epoch": 0.010958904109589041, + "grad_norm": 18.262998580932617, + "learning_rate": 1.0045662100456622e-07, + "loss": 0.1296, + "step": 12 + }, + { + "epoch": 0.011872146118721462, + "grad_norm": 115.7493896484375, + "learning_rate": 1.0958904109589042e-07, + "loss": 2.4674, + "step": 13 + }, + { + "epoch": 0.012785388127853882, + "grad_norm": 13.028263092041016, + "learning_rate": 1.1872146118721462e-07, + "loss": 0.102, + "step": 14 + }, + { + "epoch": 0.0136986301369863, + "grad_norm": 101.97847747802734, + "learning_rate": 1.2785388127853884e-07, + "loss": 2.599, + "step": 15 + }, + { + "epoch": 0.014611872146118721, + "grad_norm": 53.89338302612305, + "learning_rate": 1.36986301369863e-07, + "loss": 0.7742, + "step": 16 + }, + { + "epoch": 0.015525114155251141, + "grad_norm": 47.47715759277344, + "learning_rate": 1.461187214611872e-07, + "loss": 0.5636, + "step": 17 + }, + { + "epoch": 0.01643835616438356, + "grad_norm": 0.6754053831100464, + "learning_rate": 1.5525114155251144e-07, + "loss": 0.0046, + "step": 18 + }, + { + "epoch": 0.017351598173515982, + "grad_norm": 29.08022689819336, + "learning_rate": 1.6438356164383561e-07, + "loss": 0.2648, + "step": 19 + }, + { + "epoch": 0.0182648401826484, + "grad_norm": 0.16177403926849365, + "learning_rate": 1.7351598173515984e-07, + "loss": 0.0013, + "step": 20 + }, + { + "epoch": 0.019178082191780823, + "grad_norm": 42.24726104736328, + "learning_rate": 1.8264840182648401e-07, + "loss": 0.4695, + "step": 21 + }, + { + "epoch": 0.02009132420091324, + "grad_norm": 106.44953918457031, + "learning_rate": 1.9178082191780824e-07, + "loss": 2.7419, + "step": 22 + }, + { + "epoch": 0.021004566210045664, + "grad_norm": 36.37990188598633, + "learning_rate": 2.0091324200913244e-07, + "loss": 0.2691, + "step": 23 + }, + { + "epoch": 0.021917808219178082, + "grad_norm": 7.422334671020508, + "learning_rate": 2.1004566210045664e-07, + "loss": 0.0516, + "step": 24 + }, + { + "epoch": 0.0228310502283105, + "grad_norm": 56.217288970947266, + "learning_rate": 2.1917808219178084e-07, + "loss": 1.2391, + "step": 25 + }, + { + "epoch": 0.023744292237442923, + "grad_norm": 36.640113830566406, + "learning_rate": 2.2831050228310502e-07, + "loss": 0.4504, + "step": 26 + }, + { + "epoch": 0.024657534246575342, + "grad_norm": 38.681488037109375, + "learning_rate": 2.3744292237442925e-07, + "loss": 0.3517, + "step": 27 + }, + { + "epoch": 0.025570776255707764, + "grad_norm": 87.12687683105469, + "learning_rate": 2.465753424657534e-07, + "loss": 1.3528, + "step": 28 + }, + { + "epoch": 0.026484018264840183, + "grad_norm": 51.191741943359375, + "learning_rate": 2.557077625570777e-07, + "loss": 0.342, + "step": 29 + }, + { + "epoch": 0.0273972602739726, + "grad_norm": 47.60920715332031, + "learning_rate": 2.648401826484018e-07, + "loss": 0.4923, + "step": 30 + }, + { + "epoch": 0.028310502283105023, + "grad_norm": 16.710180282592773, + "learning_rate": 2.73972602739726e-07, + "loss": 0.1836, + "step": 31 + }, + { + "epoch": 0.029223744292237442, + "grad_norm": 3.771493673324585, + "learning_rate": 2.831050228310503e-07, + "loss": 0.0313, + "step": 32 + }, + { + "epoch": 0.030136986301369864, + "grad_norm": 109.36833190917969, + "learning_rate": 2.922374429223744e-07, + "loss": 0.6761, + "step": 33 + }, + { + "epoch": 0.031050228310502283, + "grad_norm": 2.225229024887085, + "learning_rate": 3.013698630136987e-07, + "loss": 0.0164, + "step": 34 + }, + { + "epoch": 0.0319634703196347, + "grad_norm": 28.807327270507812, + "learning_rate": 3.105022831050229e-07, + "loss": 0.296, + "step": 35 + }, + { + "epoch": 0.03287671232876712, + "grad_norm": 85.60076904296875, + "learning_rate": 3.19634703196347e-07, + "loss": 1.4293, + "step": 36 + }, + { + "epoch": 0.033789954337899546, + "grad_norm": 146.47271728515625, + "learning_rate": 3.2876712328767123e-07, + "loss": 6.3253, + "step": 37 + }, + { + "epoch": 0.034703196347031964, + "grad_norm": 117.61624908447266, + "learning_rate": 3.378995433789955e-07, + "loss": 4.8337, + "step": 38 + }, + { + "epoch": 0.03561643835616438, + "grad_norm": 164.93954467773438, + "learning_rate": 3.470319634703197e-07, + "loss": 3.6684, + "step": 39 + }, + { + "epoch": 0.0365296803652968, + "grad_norm": 97.55934143066406, + "learning_rate": 3.561643835616439e-07, + "loss": 3.7719, + "step": 40 + }, + { + "epoch": 0.03744292237442922, + "grad_norm": 76.084716796875, + "learning_rate": 3.6529680365296803e-07, + "loss": 1.0799, + "step": 41 + }, + { + "epoch": 0.038356164383561646, + "grad_norm": 30.32883644104004, + "learning_rate": 3.7442922374429223e-07, + "loss": 0.2191, + "step": 42 + }, + { + "epoch": 0.039269406392694065, + "grad_norm": 60.2735481262207, + "learning_rate": 3.835616438356165e-07, + "loss": 1.0689, + "step": 43 + }, + { + "epoch": 0.04018264840182648, + "grad_norm": 30.514699935913086, + "learning_rate": 3.926940639269407e-07, + "loss": 0.1902, + "step": 44 + }, + { + "epoch": 0.0410958904109589, + "grad_norm": 72.5616683959961, + "learning_rate": 4.018264840182649e-07, + "loss": 1.206, + "step": 45 + }, + { + "epoch": 0.04200913242009133, + "grad_norm": 8.780183792114258, + "learning_rate": 4.1095890410958903e-07, + "loss": 0.0877, + "step": 46 + }, + { + "epoch": 0.042922374429223746, + "grad_norm": 55.44345474243164, + "learning_rate": 4.200913242009133e-07, + "loss": 0.803, + "step": 47 + }, + { + "epoch": 0.043835616438356165, + "grad_norm": 111.92570495605469, + "learning_rate": 4.292237442922375e-07, + "loss": 1.44, + "step": 48 + }, + { + "epoch": 0.04474885844748858, + "grad_norm": 67.82302856445312, + "learning_rate": 4.383561643835617e-07, + "loss": 1.6464, + "step": 49 + }, + { + "epoch": 0.045662100456621, + "grad_norm": 3.3678739070892334, + "learning_rate": 4.474885844748859e-07, + "loss": 0.0284, + "step": 50 + }, + { + "epoch": 0.04657534246575343, + "grad_norm": 109.62472534179688, + "learning_rate": 4.5662100456621004e-07, + "loss": 2.4901, + "step": 51 + }, + { + "epoch": 0.047488584474885846, + "grad_norm": 234.56802368164062, + "learning_rate": 4.657534246575343e-07, + "loss": 6.8259, + "step": 52 + }, + { + "epoch": 0.048401826484018265, + "grad_norm": 79.14295196533203, + "learning_rate": 4.748858447488585e-07, + "loss": 0.8206, + "step": 53 + }, + { + "epoch": 0.049315068493150684, + "grad_norm": 81.86441802978516, + "learning_rate": 4.840182648401827e-07, + "loss": 2.2153, + "step": 54 + }, + { + "epoch": 0.0502283105022831, + "grad_norm": 9.974383354187012, + "learning_rate": 4.931506849315068e-07, + "loss": 0.0872, + "step": 55 + }, + { + "epoch": 0.05114155251141553, + "grad_norm": 147.03695678710938, + "learning_rate": 5.022831050228311e-07, + "loss": 4.6945, + "step": 56 + }, + { + "epoch": 0.052054794520547946, + "grad_norm": 41.140113830566406, + "learning_rate": 5.114155251141553e-07, + "loss": 0.5744, + "step": 57 + }, + { + "epoch": 0.052968036529680365, + "grad_norm": 0.599461555480957, + "learning_rate": 5.205479452054795e-07, + "loss": 0.0045, + "step": 58 + }, + { + "epoch": 0.053881278538812784, + "grad_norm": 42.66651153564453, + "learning_rate": 5.296803652968036e-07, + "loss": 0.4592, + "step": 59 + }, + { + "epoch": 0.0547945205479452, + "grad_norm": 9.464042663574219, + "learning_rate": 5.388127853881279e-07, + "loss": 0.0763, + "step": 60 + }, + { + "epoch": 0.05570776255707763, + "grad_norm": 53.34172058105469, + "learning_rate": 5.47945205479452e-07, + "loss": 0.4607, + "step": 61 + }, + { + "epoch": 0.05662100456621005, + "grad_norm": 19.939786911010742, + "learning_rate": 5.570776255707763e-07, + "loss": 0.2615, + "step": 62 + }, + { + "epoch": 0.057534246575342465, + "grad_norm": 6.353438854217529, + "learning_rate": 5.662100456621006e-07, + "loss": 0.0366, + "step": 63 + }, + { + "epoch": 0.058447488584474884, + "grad_norm": 49.75902557373047, + "learning_rate": 5.753424657534247e-07, + "loss": 0.6928, + "step": 64 + }, + { + "epoch": 0.0593607305936073, + "grad_norm": 87.43708038330078, + "learning_rate": 5.844748858447488e-07, + "loss": 2.8626, + "step": 65 + }, + { + "epoch": 0.06027397260273973, + "grad_norm": 23.2319393157959, + "learning_rate": 5.936073059360731e-07, + "loss": 0.1576, + "step": 66 + }, + { + "epoch": 0.06118721461187215, + "grad_norm": 79.48406982421875, + "learning_rate": 6.027397260273974e-07, + "loss": 0.6596, + "step": 67 + }, + { + "epoch": 0.062100456621004566, + "grad_norm": 1.8611171245574951, + "learning_rate": 6.118721461187215e-07, + "loss": 0.0187, + "step": 68 + }, + { + "epoch": 0.06301369863013699, + "grad_norm": 97.09123229980469, + "learning_rate": 6.210045662100458e-07, + "loss": 2.6354, + "step": 69 + }, + { + "epoch": 0.0639269406392694, + "grad_norm": 37.96271514892578, + "learning_rate": 6.3013698630137e-07, + "loss": 0.4763, + "step": 70 + }, + { + "epoch": 0.06484018264840183, + "grad_norm": 8.596692085266113, + "learning_rate": 6.39269406392694e-07, + "loss": 0.059, + "step": 71 + }, + { + "epoch": 0.06575342465753424, + "grad_norm": 54.65970993041992, + "learning_rate": 6.484018264840183e-07, + "loss": 0.491, + "step": 72 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 86.0518569946289, + "learning_rate": 6.575342465753425e-07, + "loss": 1.7439, + "step": 73 + }, + { + "epoch": 0.06757990867579909, + "grad_norm": 77.19422149658203, + "learning_rate": 6.666666666666667e-07, + "loss": 2.1298, + "step": 74 + }, + { + "epoch": 0.0684931506849315, + "grad_norm": 64.32123565673828, + "learning_rate": 6.75799086757991e-07, + "loss": 0.7369, + "step": 75 + }, + { + "epoch": 0.06940639269406393, + "grad_norm": 51.41822814941406, + "learning_rate": 6.849315068493151e-07, + "loss": 0.6184, + "step": 76 + }, + { + "epoch": 0.07031963470319634, + "grad_norm": 40.94023132324219, + "learning_rate": 6.940639269406394e-07, + "loss": 0.4981, + "step": 77 + }, + { + "epoch": 0.07123287671232877, + "grad_norm": 7.056486129760742, + "learning_rate": 7.031963470319635e-07, + "loss": 0.0476, + "step": 78 + }, + { + "epoch": 0.07214611872146119, + "grad_norm": 118.4101791381836, + "learning_rate": 7.123287671232878e-07, + "loss": 2.7188, + "step": 79 + }, + { + "epoch": 0.0730593607305936, + "grad_norm": 92.7807846069336, + "learning_rate": 7.21461187214612e-07, + "loss": 2.438, + "step": 80 + }, + { + "epoch": 0.07397260273972603, + "grad_norm": 59.1584587097168, + "learning_rate": 7.305936073059361e-07, + "loss": 0.8324, + "step": 81 + }, + { + "epoch": 0.07488584474885844, + "grad_norm": 79.5971450805664, + "learning_rate": 7.397260273972603e-07, + "loss": 2.5712, + "step": 82 + }, + { + "epoch": 0.07579908675799087, + "grad_norm": 53.06120300292969, + "learning_rate": 7.488584474885845e-07, + "loss": 0.8685, + "step": 83 + }, + { + "epoch": 0.07671232876712329, + "grad_norm": 48.890846252441406, + "learning_rate": 7.579908675799087e-07, + "loss": 0.6362, + "step": 84 + }, + { + "epoch": 0.0776255707762557, + "grad_norm": 57.90552520751953, + "learning_rate": 7.67123287671233e-07, + "loss": 1.1068, + "step": 85 + }, + { + "epoch": 0.07853881278538813, + "grad_norm": 63.01938247680664, + "learning_rate": 7.762557077625571e-07, + "loss": 0.8611, + "step": 86 + }, + { + "epoch": 0.07945205479452055, + "grad_norm": 40.329071044921875, + "learning_rate": 7.853881278538814e-07, + "loss": 0.3022, + "step": 87 + }, + { + "epoch": 0.08036529680365297, + "grad_norm": 64.04638671875, + "learning_rate": 7.945205479452056e-07, + "loss": 1.2497, + "step": 88 + }, + { + "epoch": 0.08127853881278539, + "grad_norm": 49.89202117919922, + "learning_rate": 8.036529680365298e-07, + "loss": 0.551, + "step": 89 + }, + { + "epoch": 0.0821917808219178, + "grad_norm": 61.58476257324219, + "learning_rate": 8.12785388127854e-07, + "loss": 2.2082, + "step": 90 + }, + { + "epoch": 0.08310502283105023, + "grad_norm": 214.096435546875, + "learning_rate": 8.219178082191781e-07, + "loss": 1.1688, + "step": 91 + }, + { + "epoch": 0.08401826484018265, + "grad_norm": 89.21487426757812, + "learning_rate": 8.310502283105023e-07, + "loss": 1.3305, + "step": 92 + }, + { + "epoch": 0.08493150684931507, + "grad_norm": 109.57759094238281, + "learning_rate": 8.401826484018266e-07, + "loss": 0.707, + "step": 93 + }, + { + "epoch": 0.08584474885844749, + "grad_norm": 84.60148620605469, + "learning_rate": 8.493150684931507e-07, + "loss": 2.2715, + "step": 94 + }, + { + "epoch": 0.0867579908675799, + "grad_norm": 48.16481399536133, + "learning_rate": 8.58447488584475e-07, + "loss": 0.8938, + "step": 95 + }, + { + "epoch": 0.08767123287671233, + "grad_norm": 94.8746109008789, + "learning_rate": 8.675799086757991e-07, + "loss": 3.6957, + "step": 96 + }, + { + "epoch": 0.08858447488584476, + "grad_norm": 66.32077026367188, + "learning_rate": 8.767123287671234e-07, + "loss": 0.9223, + "step": 97 + }, + { + "epoch": 0.08949771689497717, + "grad_norm": 77.42247772216797, + "learning_rate": 8.858447488584476e-07, + "loss": 2.8834, + "step": 98 + }, + { + "epoch": 0.09041095890410959, + "grad_norm": 55.609073638916016, + "learning_rate": 8.949771689497718e-07, + "loss": 0.7567, + "step": 99 + }, + { + "epoch": 0.091324200913242, + "grad_norm": 88.9837875366211, + "learning_rate": 9.04109589041096e-07, + "loss": 2.8587, + "step": 100 + }, + { + "epoch": 0.09223744292237443, + "grad_norm": 6.118884563446045, + "learning_rate": 9.132420091324201e-07, + "loss": 0.0474, + "step": 101 + }, + { + "epoch": 0.09315068493150686, + "grad_norm": 39.99677276611328, + "learning_rate": 9.223744292237443e-07, + "loss": 0.2746, + "step": 102 + }, + { + "epoch": 0.09406392694063927, + "grad_norm": 80.9307861328125, + "learning_rate": 9.315068493150686e-07, + "loss": 1.4681, + "step": 103 + }, + { + "epoch": 0.09497716894977169, + "grad_norm": 73.21324920654297, + "learning_rate": 9.406392694063927e-07, + "loss": 1.1143, + "step": 104 + }, + { + "epoch": 0.0958904109589041, + "grad_norm": 48.42059326171875, + "learning_rate": 9.49771689497717e-07, + "loss": 0.5037, + "step": 105 + }, + { + "epoch": 0.09680365296803653, + "grad_norm": 43.80384826660156, + "learning_rate": 9.589041095890411e-07, + "loss": 0.4748, + "step": 106 + }, + { + "epoch": 0.09771689497716896, + "grad_norm": 97.79289245605469, + "learning_rate": 9.680365296803654e-07, + "loss": 1.5788, + "step": 107 + }, + { + "epoch": 0.09863013698630137, + "grad_norm": 66.81253814697266, + "learning_rate": 9.771689497716896e-07, + "loss": 1.6654, + "step": 108 + }, + { + "epoch": 0.09954337899543379, + "grad_norm": 46.085086822509766, + "learning_rate": 9.863013698630137e-07, + "loss": 0.2185, + "step": 109 + }, + { + "epoch": 0.1004566210045662, + "grad_norm": 52.37796401977539, + "learning_rate": 9.95433789954338e-07, + "loss": 1.0068, + "step": 110 + }, + { + "epoch": 0.10136986301369863, + "grad_norm": 50.07334518432617, + "learning_rate": 1.0045662100456622e-06, + "loss": 0.7947, + "step": 111 + }, + { + "epoch": 0.10228310502283106, + "grad_norm": 75.27021789550781, + "learning_rate": 1.0136986301369864e-06, + "loss": 1.6411, + "step": 112 + }, + { + "epoch": 0.10319634703196347, + "grad_norm": 67.1604995727539, + "learning_rate": 1.0228310502283107e-06, + "loss": 0.9911, + "step": 113 + }, + { + "epoch": 0.10410958904109589, + "grad_norm": 116.9887466430664, + "learning_rate": 1.0319634703196347e-06, + "loss": 3.9152, + "step": 114 + }, + { + "epoch": 0.1050228310502283, + "grad_norm": 15.431314468383789, + "learning_rate": 1.041095890410959e-06, + "loss": 0.0864, + "step": 115 + }, + { + "epoch": 0.10593607305936073, + "grad_norm": 23.53504753112793, + "learning_rate": 1.050228310502283e-06, + "loss": 0.2627, + "step": 116 + }, + { + "epoch": 0.10684931506849316, + "grad_norm": 75.1446304321289, + "learning_rate": 1.0593607305936073e-06, + "loss": 2.4074, + "step": 117 + }, + { + "epoch": 0.10776255707762557, + "grad_norm": 42.04245376586914, + "learning_rate": 1.0684931506849318e-06, + "loss": 0.5973, + "step": 118 + }, + { + "epoch": 0.108675799086758, + "grad_norm": 82.04195404052734, + "learning_rate": 1.0776255707762558e-06, + "loss": 3.0046, + "step": 119 + }, + { + "epoch": 0.1095890410958904, + "grad_norm": 71.49076080322266, + "learning_rate": 1.08675799086758e-06, + "loss": 0.6572, + "step": 120 + }, + { + "epoch": 0.11050228310502283, + "grad_norm": 127.798583984375, + "learning_rate": 1.095890410958904e-06, + "loss": 5.7049, + "step": 121 + }, + { + "epoch": 0.11141552511415526, + "grad_norm": 33.40270233154297, + "learning_rate": 1.1050228310502283e-06, + "loss": 0.3835, + "step": 122 + }, + { + "epoch": 0.11232876712328767, + "grad_norm": 68.99260711669922, + "learning_rate": 1.1141552511415526e-06, + "loss": 0.8491, + "step": 123 + }, + { + "epoch": 0.1132420091324201, + "grad_norm": 11.04240894317627, + "learning_rate": 1.1232876712328769e-06, + "loss": 0.1226, + "step": 124 + }, + { + "epoch": 0.1141552511415525, + "grad_norm": 86.46871948242188, + "learning_rate": 1.132420091324201e-06, + "loss": 2.6541, + "step": 125 + }, + { + "epoch": 0.11506849315068493, + "grad_norm": 48.634517669677734, + "learning_rate": 1.1415525114155251e-06, + "loss": 0.5923, + "step": 126 + }, + { + "epoch": 0.11598173515981736, + "grad_norm": 6.109269142150879, + "learning_rate": 1.1506849315068494e-06, + "loss": 0.0616, + "step": 127 + }, + { + "epoch": 0.11689497716894977, + "grad_norm": 46.357357025146484, + "learning_rate": 1.1598173515981737e-06, + "loss": 0.5214, + "step": 128 + }, + { + "epoch": 0.1178082191780822, + "grad_norm": 14.676424026489258, + "learning_rate": 1.1689497716894977e-06, + "loss": 0.1588, + "step": 129 + }, + { + "epoch": 0.1187214611872146, + "grad_norm": 72.22183227539062, + "learning_rate": 1.178082191780822e-06, + "loss": 1.1795, + "step": 130 + }, + { + "epoch": 0.11963470319634703, + "grad_norm": 80.57902526855469, + "learning_rate": 1.1872146118721462e-06, + "loss": 0.659, + "step": 131 + }, + { + "epoch": 0.12054794520547946, + "grad_norm": 75.20453643798828, + "learning_rate": 1.1963470319634705e-06, + "loss": 1.4921, + "step": 132 + }, + { + "epoch": 0.12146118721461187, + "grad_norm": 3.32236647605896, + "learning_rate": 1.2054794520547947e-06, + "loss": 0.0292, + "step": 133 + }, + { + "epoch": 0.1223744292237443, + "grad_norm": 104.18318939208984, + "learning_rate": 1.2146118721461188e-06, + "loss": 1.3304, + "step": 134 + }, + { + "epoch": 0.1232876712328767, + "grad_norm": 59.308799743652344, + "learning_rate": 1.223744292237443e-06, + "loss": 1.8819, + "step": 135 + }, + { + "epoch": 0.12420091324200913, + "grad_norm": 33.17453384399414, + "learning_rate": 1.2328767123287673e-06, + "loss": 0.5236, + "step": 136 + }, + { + "epoch": 0.12511415525114156, + "grad_norm": 91.92144775390625, + "learning_rate": 1.2420091324200915e-06, + "loss": 1.2611, + "step": 137 + }, + { + "epoch": 0.12602739726027398, + "grad_norm": 32.85380172729492, + "learning_rate": 1.2511415525114158e-06, + "loss": 0.4261, + "step": 138 + }, + { + "epoch": 0.12694063926940638, + "grad_norm": 12.95295524597168, + "learning_rate": 1.26027397260274e-06, + "loss": 0.162, + "step": 139 + }, + { + "epoch": 0.1278538812785388, + "grad_norm": 14.981559753417969, + "learning_rate": 1.2694063926940639e-06, + "loss": 0.1422, + "step": 140 + }, + { + "epoch": 0.12876712328767123, + "grad_norm": 18.211179733276367, + "learning_rate": 1.278538812785388e-06, + "loss": 0.1814, + "step": 141 + }, + { + "epoch": 0.12968036529680366, + "grad_norm": 1.4554235935211182, + "learning_rate": 1.2876712328767124e-06, + "loss": 0.0137, + "step": 142 + }, + { + "epoch": 0.13059360730593608, + "grad_norm": 96.02511596679688, + "learning_rate": 1.2968036529680366e-06, + "loss": 1.3033, + "step": 143 + }, + { + "epoch": 0.13150684931506848, + "grad_norm": 30.535167694091797, + "learning_rate": 1.3059360730593609e-06, + "loss": 0.4875, + "step": 144 + }, + { + "epoch": 0.1324200913242009, + "grad_norm": 74.86480712890625, + "learning_rate": 1.315068493150685e-06, + "loss": 2.4444, + "step": 145 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 141.13381958007812, + "learning_rate": 1.3242009132420092e-06, + "loss": 1.6792, + "step": 146 + }, + { + "epoch": 0.13424657534246576, + "grad_norm": 80.93357849121094, + "learning_rate": 1.3333333333333334e-06, + "loss": 2.4571, + "step": 147 + }, + { + "epoch": 0.13515981735159818, + "grad_norm": 37.357872009277344, + "learning_rate": 1.3424657534246577e-06, + "loss": 0.5093, + "step": 148 + }, + { + "epoch": 0.13607305936073058, + "grad_norm": 30.145450592041016, + "learning_rate": 1.351598173515982e-06, + "loss": 0.2523, + "step": 149 + }, + { + "epoch": 0.136986301369863, + "grad_norm": 39.232234954833984, + "learning_rate": 1.360730593607306e-06, + "loss": 0.4172, + "step": 150 + }, + { + "epoch": 0.13789954337899543, + "grad_norm": 39.96592330932617, + "learning_rate": 1.3698630136986302e-06, + "loss": 0.247, + "step": 151 + }, + { + "epoch": 0.13881278538812786, + "grad_norm": 6.704331398010254, + "learning_rate": 1.3789954337899545e-06, + "loss": 0.0749, + "step": 152 + }, + { + "epoch": 0.13972602739726028, + "grad_norm": 57.15786361694336, + "learning_rate": 1.3881278538812787e-06, + "loss": 0.4782, + "step": 153 + }, + { + "epoch": 0.14063926940639268, + "grad_norm": 16.199190139770508, + "learning_rate": 1.397260273972603e-06, + "loss": 0.159, + "step": 154 + }, + { + "epoch": 0.1415525114155251, + "grad_norm": 11.56640338897705, + "learning_rate": 1.406392694063927e-06, + "loss": 0.0875, + "step": 155 + }, + { + "epoch": 0.14246575342465753, + "grad_norm": 2.982431173324585, + "learning_rate": 1.4155251141552513e-06, + "loss": 0.0301, + "step": 156 + }, + { + "epoch": 0.14337899543378996, + "grad_norm": 70.60018920898438, + "learning_rate": 1.4246575342465755e-06, + "loss": 2.0773, + "step": 157 + }, + { + "epoch": 0.14429223744292238, + "grad_norm": 24.81955337524414, + "learning_rate": 1.4337899543378998e-06, + "loss": 0.3092, + "step": 158 + }, + { + "epoch": 0.14520547945205478, + "grad_norm": 73.91964721679688, + "learning_rate": 1.442922374429224e-06, + "loss": 1.0631, + "step": 159 + }, + { + "epoch": 0.1461187214611872, + "grad_norm": 50.78304672241211, + "learning_rate": 1.4520547945205479e-06, + "loss": 0.529, + "step": 160 + }, + { + "epoch": 0.14703196347031963, + "grad_norm": 6.584497451782227, + "learning_rate": 1.4611872146118721e-06, + "loss": 0.062, + "step": 161 + }, + { + "epoch": 0.14794520547945206, + "grad_norm": 0.8494138717651367, + "learning_rate": 1.4703196347031964e-06, + "loss": 0.0053, + "step": 162 + }, + { + "epoch": 0.14885844748858448, + "grad_norm": 20.75357437133789, + "learning_rate": 1.4794520547945206e-06, + "loss": 0.2322, + "step": 163 + }, + { + "epoch": 0.14977168949771688, + "grad_norm": 86.63910675048828, + "learning_rate": 1.4885844748858449e-06, + "loss": 0.9693, + "step": 164 + }, + { + "epoch": 0.1506849315068493, + "grad_norm": 78.17677307128906, + "learning_rate": 1.497716894977169e-06, + "loss": 4.6754, + "step": 165 + }, + { + "epoch": 0.15159817351598173, + "grad_norm": 2.3074629306793213, + "learning_rate": 1.5068493150684932e-06, + "loss": 0.0145, + "step": 166 + }, + { + "epoch": 0.15251141552511416, + "grad_norm": 0.526445746421814, + "learning_rate": 1.5159817351598174e-06, + "loss": 0.0047, + "step": 167 + }, + { + "epoch": 0.15342465753424658, + "grad_norm": 11.234533309936523, + "learning_rate": 1.5251141552511417e-06, + "loss": 0.1188, + "step": 168 + }, + { + "epoch": 0.154337899543379, + "grad_norm": 52.97864532470703, + "learning_rate": 1.534246575342466e-06, + "loss": 1.3681, + "step": 169 + }, + { + "epoch": 0.1552511415525114, + "grad_norm": 2.5516161918640137, + "learning_rate": 1.5433789954337902e-06, + "loss": 0.0255, + "step": 170 + }, + { + "epoch": 0.15616438356164383, + "grad_norm": 26.160207748413086, + "learning_rate": 1.5525114155251142e-06, + "loss": 0.3145, + "step": 171 + }, + { + "epoch": 0.15707762557077626, + "grad_norm": 55.623085021972656, + "learning_rate": 1.5616438356164385e-06, + "loss": 1.582, + "step": 172 + }, + { + "epoch": 0.15799086757990868, + "grad_norm": 48.001686096191406, + "learning_rate": 1.5707762557077627e-06, + "loss": 0.633, + "step": 173 + }, + { + "epoch": 0.1589041095890411, + "grad_norm": 52.38616943359375, + "learning_rate": 1.579908675799087e-06, + "loss": 0.3747, + "step": 174 + }, + { + "epoch": 0.1598173515981735, + "grad_norm": 162.98629760742188, + "learning_rate": 1.5890410958904112e-06, + "loss": 0.5404, + "step": 175 + }, + { + "epoch": 0.16073059360730593, + "grad_norm": 36.80109786987305, + "learning_rate": 1.5981735159817353e-06, + "loss": 0.2944, + "step": 176 + }, + { + "epoch": 0.16164383561643836, + "grad_norm": 60.45900344848633, + "learning_rate": 1.6073059360730595e-06, + "loss": 2.1891, + "step": 177 + }, + { + "epoch": 0.16255707762557078, + "grad_norm": 11.884732246398926, + "learning_rate": 1.6164383561643838e-06, + "loss": 0.161, + "step": 178 + }, + { + "epoch": 0.1634703196347032, + "grad_norm": 66.24649810791016, + "learning_rate": 1.625570776255708e-06, + "loss": 0.8631, + "step": 179 + }, + { + "epoch": 0.1643835616438356, + "grad_norm": 4.873290538787842, + "learning_rate": 1.6347031963470323e-06, + "loss": 0.0429, + "step": 180 + }, + { + "epoch": 0.16529680365296803, + "grad_norm": 55.18122863769531, + "learning_rate": 1.6438356164383561e-06, + "loss": 0.7072, + "step": 181 + }, + { + "epoch": 0.16621004566210046, + "grad_norm": 87.83419799804688, + "learning_rate": 1.6529680365296804e-06, + "loss": 4.2224, + "step": 182 + }, + { + "epoch": 0.16712328767123288, + "grad_norm": 30.545448303222656, + "learning_rate": 1.6621004566210046e-06, + "loss": 0.2851, + "step": 183 + }, + { + "epoch": 0.1680365296803653, + "grad_norm": 78.3531723022461, + "learning_rate": 1.671232876712329e-06, + "loss": 1.1169, + "step": 184 + }, + { + "epoch": 0.1689497716894977, + "grad_norm": 12.743905067443848, + "learning_rate": 1.6803652968036531e-06, + "loss": 0.1359, + "step": 185 + }, + { + "epoch": 0.16986301369863013, + "grad_norm": 17.06480598449707, + "learning_rate": 1.6894977168949772e-06, + "loss": 0.115, + "step": 186 + }, + { + "epoch": 0.17077625570776256, + "grad_norm": 39.7038688659668, + "learning_rate": 1.6986301369863014e-06, + "loss": 0.2281, + "step": 187 + }, + { + "epoch": 0.17168949771689498, + "grad_norm": 54.43790054321289, + "learning_rate": 1.7077625570776257e-06, + "loss": 0.5779, + "step": 188 + }, + { + "epoch": 0.1726027397260274, + "grad_norm": 6.408254146575928, + "learning_rate": 1.71689497716895e-06, + "loss": 0.0475, + "step": 189 + }, + { + "epoch": 0.1735159817351598, + "grad_norm": 50.131324768066406, + "learning_rate": 1.7260273972602742e-06, + "loss": 0.641, + "step": 190 + }, + { + "epoch": 0.17442922374429223, + "grad_norm": 12.81390380859375, + "learning_rate": 1.7351598173515982e-06, + "loss": 0.1661, + "step": 191 + }, + { + "epoch": 0.17534246575342466, + "grad_norm": 5.394819736480713, + "learning_rate": 1.7442922374429225e-06, + "loss": 0.061, + "step": 192 + }, + { + "epoch": 0.17625570776255708, + "grad_norm": 65.02466583251953, + "learning_rate": 1.7534246575342468e-06, + "loss": 0.4309, + "step": 193 + }, + { + "epoch": 0.1771689497716895, + "grad_norm": 65.50847625732422, + "learning_rate": 1.762557077625571e-06, + "loss": 0.9212, + "step": 194 + }, + { + "epoch": 0.1780821917808219, + "grad_norm": 11.872905731201172, + "learning_rate": 1.7716894977168953e-06, + "loss": 0.1712, + "step": 195 + }, + { + "epoch": 0.17899543378995433, + "grad_norm": 34.90537643432617, + "learning_rate": 1.7808219178082193e-06, + "loss": 0.4115, + "step": 196 + }, + { + "epoch": 0.17990867579908676, + "grad_norm": 190.03562927246094, + "learning_rate": 1.7899543378995436e-06, + "loss": 1.7555, + "step": 197 + }, + { + "epoch": 0.18082191780821918, + "grad_norm": 62.073646545410156, + "learning_rate": 1.7990867579908678e-06, + "loss": 0.8869, + "step": 198 + }, + { + "epoch": 0.1817351598173516, + "grad_norm": 43.61760711669922, + "learning_rate": 1.808219178082192e-06, + "loss": 0.3214, + "step": 199 + }, + { + "epoch": 0.182648401826484, + "grad_norm": 67.0809097290039, + "learning_rate": 1.8173515981735163e-06, + "loss": 2.0105, + "step": 200 + }, + { + "epoch": 0.18356164383561643, + "grad_norm": 41.15782165527344, + "learning_rate": 1.8264840182648401e-06, + "loss": 0.5847, + "step": 201 + }, + { + "epoch": 0.18447488584474886, + "grad_norm": 5.473720073699951, + "learning_rate": 1.8356164383561644e-06, + "loss": 0.0528, + "step": 202 + }, + { + "epoch": 0.18538812785388128, + "grad_norm": 66.5866928100586, + "learning_rate": 1.8447488584474887e-06, + "loss": 0.9268, + "step": 203 + }, + { + "epoch": 0.1863013698630137, + "grad_norm": 9.59362506866455, + "learning_rate": 1.853881278538813e-06, + "loss": 0.1023, + "step": 204 + }, + { + "epoch": 0.1872146118721461, + "grad_norm": 7.627255916595459, + "learning_rate": 1.8630136986301372e-06, + "loss": 0.0705, + "step": 205 + }, + { + "epoch": 0.18812785388127853, + "grad_norm": 3.5499608516693115, + "learning_rate": 1.8721461187214612e-06, + "loss": 0.0302, + "step": 206 + }, + { + "epoch": 0.18904109589041096, + "grad_norm": 52.38984298706055, + "learning_rate": 1.8812785388127855e-06, + "loss": 0.4341, + "step": 207 + }, + { + "epoch": 0.18995433789954339, + "grad_norm": 40.42197799682617, + "learning_rate": 1.8904109589041097e-06, + "loss": 0.3276, + "step": 208 + }, + { + "epoch": 0.1908675799086758, + "grad_norm": 25.061542510986328, + "learning_rate": 1.899543378995434e-06, + "loss": 0.1835, + "step": 209 + }, + { + "epoch": 0.1917808219178082, + "grad_norm": 80.00957489013672, + "learning_rate": 1.9086757990867582e-06, + "loss": 1.3986, + "step": 210 + }, + { + "epoch": 0.19269406392694063, + "grad_norm": 64.599365234375, + "learning_rate": 1.9178082191780823e-06, + "loss": 1.2601, + "step": 211 + }, + { + "epoch": 0.19360730593607306, + "grad_norm": 94.4041519165039, + "learning_rate": 1.9269406392694063e-06, + "loss": 3.028, + "step": 212 + }, + { + "epoch": 0.19452054794520549, + "grad_norm": 63.52536392211914, + "learning_rate": 1.9360730593607308e-06, + "loss": 0.8085, + "step": 213 + }, + { + "epoch": 0.1954337899543379, + "grad_norm": 69.5842056274414, + "learning_rate": 1.945205479452055e-06, + "loss": 0.9568, + "step": 214 + }, + { + "epoch": 0.1963470319634703, + "grad_norm": 33.89743423461914, + "learning_rate": 1.9543378995433793e-06, + "loss": 0.5217, + "step": 215 + }, + { + "epoch": 0.19726027397260273, + "grad_norm": 8.664738655090332, + "learning_rate": 1.9634703196347033e-06, + "loss": 0.07, + "step": 216 + }, + { + "epoch": 0.19817351598173516, + "grad_norm": 5.221890926361084, + "learning_rate": 1.9726027397260274e-06, + "loss": 0.0339, + "step": 217 + }, + { + "epoch": 0.19908675799086759, + "grad_norm": 88.74034881591797, + "learning_rate": 1.981735159817352e-06, + "loss": 0.8685, + "step": 218 + }, + { + "epoch": 0.2, + "grad_norm": 81.36849212646484, + "learning_rate": 1.990867579908676e-06, + "loss": 1.5525, + "step": 219 + }, + { + "epoch": 0.2009132420091324, + "grad_norm": 74.11175537109375, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.3022, + "step": 220 + }, + { + "epoch": 0.20182648401826483, + "grad_norm": 49.175567626953125, + "learning_rate": 2.0091324200913244e-06, + "loss": 1.0646, + "step": 221 + }, + { + "epoch": 0.20273972602739726, + "grad_norm": 92.80632019042969, + "learning_rate": 2.0182648401826484e-06, + "loss": 2.3931, + "step": 222 + }, + { + "epoch": 0.20365296803652969, + "grad_norm": 6.329792022705078, + "learning_rate": 2.027397260273973e-06, + "loss": 0.0675, + "step": 223 + }, + { + "epoch": 0.2045662100456621, + "grad_norm": 71.43294525146484, + "learning_rate": 2.036529680365297e-06, + "loss": 1.9993, + "step": 224 + }, + { + "epoch": 0.2054794520547945, + "grad_norm": 58.23261642456055, + "learning_rate": 2.0456621004566214e-06, + "loss": 1.5043, + "step": 225 + }, + { + "epoch": 0.20639269406392693, + "grad_norm": 102.34451293945312, + "learning_rate": 2.0547945205479454e-06, + "loss": 3.4558, + "step": 226 + }, + { + "epoch": 0.20730593607305936, + "grad_norm": 17.338762283325195, + "learning_rate": 2.0639269406392695e-06, + "loss": 0.2029, + "step": 227 + }, + { + "epoch": 0.20821917808219179, + "grad_norm": 50.98520278930664, + "learning_rate": 2.073059360730594e-06, + "loss": 0.5506, + "step": 228 + }, + { + "epoch": 0.2091324200913242, + "grad_norm": 39.286251068115234, + "learning_rate": 2.082191780821918e-06, + "loss": 0.3265, + "step": 229 + }, + { + "epoch": 0.2100456621004566, + "grad_norm": 26.45781707763672, + "learning_rate": 2.0913242009132424e-06, + "loss": 0.1556, + "step": 230 + }, + { + "epoch": 0.21095890410958903, + "grad_norm": 49.28672409057617, + "learning_rate": 2.100456621004566e-06, + "loss": 0.5081, + "step": 231 + }, + { + "epoch": 0.21187214611872146, + "grad_norm": 96.94695281982422, + "learning_rate": 2.1095890410958905e-06, + "loss": 1.9073, + "step": 232 + }, + { + "epoch": 0.21278538812785389, + "grad_norm": 14.070869445800781, + "learning_rate": 2.1187214611872146e-06, + "loss": 0.1096, + "step": 233 + }, + { + "epoch": 0.2136986301369863, + "grad_norm": 8.990796089172363, + "learning_rate": 2.127853881278539e-06, + "loss": 0.0438, + "step": 234 + }, + { + "epoch": 0.2146118721461187, + "grad_norm": 22.700773239135742, + "learning_rate": 2.1369863013698635e-06, + "loss": 0.2312, + "step": 235 + }, + { + "epoch": 0.21552511415525114, + "grad_norm": 58.68943405151367, + "learning_rate": 2.146118721461187e-06, + "loss": 1.2884, + "step": 236 + }, + { + "epoch": 0.21643835616438356, + "grad_norm": 73.10874938964844, + "learning_rate": 2.1552511415525116e-06, + "loss": 0.68, + "step": 237 + }, + { + "epoch": 0.217351598173516, + "grad_norm": 12.89918327331543, + "learning_rate": 2.1643835616438356e-06, + "loss": 0.0985, + "step": 238 + }, + { + "epoch": 0.2182648401826484, + "grad_norm": 10.5908203125, + "learning_rate": 2.17351598173516e-06, + "loss": 0.1054, + "step": 239 + }, + { + "epoch": 0.2191780821917808, + "grad_norm": 39.61189270019531, + "learning_rate": 2.182648401826484e-06, + "loss": 0.3549, + "step": 240 + }, + { + "epoch": 0.22009132420091324, + "grad_norm": 66.67745971679688, + "learning_rate": 2.191780821917808e-06, + "loss": 0.8386, + "step": 241 + }, + { + "epoch": 0.22100456621004566, + "grad_norm": 5.478058815002441, + "learning_rate": 2.2009132420091326e-06, + "loss": 0.0509, + "step": 242 + }, + { + "epoch": 0.2219178082191781, + "grad_norm": 53.769561767578125, + "learning_rate": 2.2100456621004567e-06, + "loss": 1.0864, + "step": 243 + }, + { + "epoch": 0.2228310502283105, + "grad_norm": 58.872955322265625, + "learning_rate": 2.219178082191781e-06, + "loss": 1.0596, + "step": 244 + }, + { + "epoch": 0.2237442922374429, + "grad_norm": 33.33201599121094, + "learning_rate": 2.228310502283105e-06, + "loss": 0.5207, + "step": 245 + }, + { + "epoch": 0.22465753424657534, + "grad_norm": 6.420796871185303, + "learning_rate": 2.2374429223744292e-06, + "loss": 0.0393, + "step": 246 + }, + { + "epoch": 0.22557077625570776, + "grad_norm": 60.810813903808594, + "learning_rate": 2.2465753424657537e-06, + "loss": 0.7341, + "step": 247 + }, + { + "epoch": 0.2264840182648402, + "grad_norm": 33.63410186767578, + "learning_rate": 2.2557077625570777e-06, + "loss": 0.4597, + "step": 248 + }, + { + "epoch": 0.2273972602739726, + "grad_norm": 20.35140609741211, + "learning_rate": 2.264840182648402e-06, + "loss": 0.1535, + "step": 249 + }, + { + "epoch": 0.228310502283105, + "grad_norm": 52.55070877075195, + "learning_rate": 2.2739726027397262e-06, + "loss": 1.7785, + "step": 250 + }, + { + "epoch": 0.22922374429223744, + "grad_norm": 3.6097636222839355, + "learning_rate": 2.2831050228310503e-06, + "loss": 0.0146, + "step": 251 + }, + { + "epoch": 0.23013698630136986, + "grad_norm": 37.39328384399414, + "learning_rate": 2.2922374429223748e-06, + "loss": 0.447, + "step": 252 + }, + { + "epoch": 0.2310502283105023, + "grad_norm": 42.44994354248047, + "learning_rate": 2.301369863013699e-06, + "loss": 0.4046, + "step": 253 + }, + { + "epoch": 0.2319634703196347, + "grad_norm": 5.60738468170166, + "learning_rate": 2.3105022831050233e-06, + "loss": 0.0693, + "step": 254 + }, + { + "epoch": 0.2328767123287671, + "grad_norm": 26.629289627075195, + "learning_rate": 2.3196347031963473e-06, + "loss": 0.3251, + "step": 255 + }, + { + "epoch": 0.23378995433789954, + "grad_norm": 6.7685136795043945, + "learning_rate": 2.3287671232876713e-06, + "loss": 0.068, + "step": 256 + }, + { + "epoch": 0.23470319634703196, + "grad_norm": 25.1732177734375, + "learning_rate": 2.3378995433789954e-06, + "loss": 0.3235, + "step": 257 + }, + { + "epoch": 0.2356164383561644, + "grad_norm": 71.6132583618164, + "learning_rate": 2.34703196347032e-06, + "loss": 1.276, + "step": 258 + }, + { + "epoch": 0.2365296803652968, + "grad_norm": 121.56906127929688, + "learning_rate": 2.356164383561644e-06, + "loss": 4.2487, + "step": 259 + }, + { + "epoch": 0.2374429223744292, + "grad_norm": 28.178861618041992, + "learning_rate": 2.3652968036529684e-06, + "loss": 0.453, + "step": 260 + }, + { + "epoch": 0.23835616438356164, + "grad_norm": 34.948265075683594, + "learning_rate": 2.3744292237442924e-06, + "loss": 0.4237, + "step": 261 + }, + { + "epoch": 0.23926940639269406, + "grad_norm": 47.15904235839844, + "learning_rate": 2.3835616438356164e-06, + "loss": 0.6504, + "step": 262 + }, + { + "epoch": 0.2401826484018265, + "grad_norm": 35.1468620300293, + "learning_rate": 2.392694063926941e-06, + "loss": 0.2563, + "step": 263 + }, + { + "epoch": 0.2410958904109589, + "grad_norm": 68.30524444580078, + "learning_rate": 2.401826484018265e-06, + "loss": 1.7037, + "step": 264 + }, + { + "epoch": 0.2420091324200913, + "grad_norm": 25.49103546142578, + "learning_rate": 2.4109589041095894e-06, + "loss": 0.4208, + "step": 265 + }, + { + "epoch": 0.24292237442922374, + "grad_norm": 50.17207336425781, + "learning_rate": 2.4200913242009135e-06, + "loss": 0.9858, + "step": 266 + }, + { + "epoch": 0.24383561643835616, + "grad_norm": 75.93529510498047, + "learning_rate": 2.4292237442922375e-06, + "loss": 2.2316, + "step": 267 + }, + { + "epoch": 0.2447488584474886, + "grad_norm": 92.4083480834961, + "learning_rate": 2.438356164383562e-06, + "loss": 4.0517, + "step": 268 + }, + { + "epoch": 0.245662100456621, + "grad_norm": 27.877607345581055, + "learning_rate": 2.447488584474886e-06, + "loss": 0.3484, + "step": 269 + }, + { + "epoch": 0.2465753424657534, + "grad_norm": 72.68559265136719, + "learning_rate": 2.4566210045662105e-06, + "loss": 3.5966, + "step": 270 + }, + { + "epoch": 0.24748858447488584, + "grad_norm": 64.9039535522461, + "learning_rate": 2.4657534246575345e-06, + "loss": 1.1651, + "step": 271 + }, + { + "epoch": 0.24840182648401826, + "grad_norm": 72.90157318115234, + "learning_rate": 2.4748858447488586e-06, + "loss": 2.2731, + "step": 272 + }, + { + "epoch": 0.2493150684931507, + "grad_norm": 8.874948501586914, + "learning_rate": 2.484018264840183e-06, + "loss": 0.0431, + "step": 273 + }, + { + "epoch": 0.2502283105022831, + "grad_norm": 48.9442138671875, + "learning_rate": 2.493150684931507e-06, + "loss": 0.5992, + "step": 274 + }, + { + "epoch": 0.2511415525114155, + "grad_norm": 52.695980072021484, + "learning_rate": 2.5022831050228315e-06, + "loss": 0.5697, + "step": 275 + }, + { + "epoch": 0.25205479452054796, + "grad_norm": 84.45189666748047, + "learning_rate": 2.511415525114155e-06, + "loss": 1.0985, + "step": 276 + }, + { + "epoch": 0.25296803652968036, + "grad_norm": 29.70526695251465, + "learning_rate": 2.52054794520548e-06, + "loss": 0.3621, + "step": 277 + }, + { + "epoch": 0.25388127853881276, + "grad_norm": 52.05765151977539, + "learning_rate": 2.5296803652968037e-06, + "loss": 0.6436, + "step": 278 + }, + { + "epoch": 0.2547945205479452, + "grad_norm": 8.074180603027344, + "learning_rate": 2.5388127853881277e-06, + "loss": 0.0494, + "step": 279 + }, + { + "epoch": 0.2557077625570776, + "grad_norm": 9.059545516967773, + "learning_rate": 2.547945205479452e-06, + "loss": 0.0873, + "step": 280 + }, + { + "epoch": 0.25662100456621006, + "grad_norm": 14.950732231140137, + "learning_rate": 2.557077625570776e-06, + "loss": 0.1443, + "step": 281 + }, + { + "epoch": 0.25753424657534246, + "grad_norm": 2.0679636001586914, + "learning_rate": 2.5662100456621007e-06, + "loss": 0.0205, + "step": 282 + }, + { + "epoch": 0.25844748858447486, + "grad_norm": 6.873154163360596, + "learning_rate": 2.5753424657534247e-06, + "loss": 0.0482, + "step": 283 + }, + { + "epoch": 0.2593607305936073, + "grad_norm": 316.8436584472656, + "learning_rate": 2.5844748858447488e-06, + "loss": 1.2007, + "step": 284 + }, + { + "epoch": 0.2602739726027397, + "grad_norm": 71.63383483886719, + "learning_rate": 2.5936073059360732e-06, + "loss": 0.4799, + "step": 285 + }, + { + "epoch": 0.26118721461187216, + "grad_norm": 85.48809814453125, + "learning_rate": 2.6027397260273973e-06, + "loss": 2.4392, + "step": 286 + }, + { + "epoch": 0.26210045662100456, + "grad_norm": 74.98529052734375, + "learning_rate": 2.6118721461187217e-06, + "loss": 2.9905, + "step": 287 + }, + { + "epoch": 0.26301369863013696, + "grad_norm": 64.04651641845703, + "learning_rate": 2.6210045662100458e-06, + "loss": 1.5176, + "step": 288 + }, + { + "epoch": 0.2639269406392694, + "grad_norm": 50.00849151611328, + "learning_rate": 2.63013698630137e-06, + "loss": 0.5198, + "step": 289 + }, + { + "epoch": 0.2648401826484018, + "grad_norm": 122.53203582763672, + "learning_rate": 2.6392694063926943e-06, + "loss": 4.2033, + "step": 290 + }, + { + "epoch": 0.26575342465753427, + "grad_norm": 18.808988571166992, + "learning_rate": 2.6484018264840183e-06, + "loss": 0.2335, + "step": 291 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 5.359448432922363, + "learning_rate": 2.6575342465753428e-06, + "loss": 0.0521, + "step": 292 + }, + { + "epoch": 0.26757990867579906, + "grad_norm": 33.21257400512695, + "learning_rate": 2.666666666666667e-06, + "loss": 0.4716, + "step": 293 + }, + { + "epoch": 0.2684931506849315, + "grad_norm": 8.377789497375488, + "learning_rate": 2.675799086757991e-06, + "loss": 0.0879, + "step": 294 + }, + { + "epoch": 0.2694063926940639, + "grad_norm": 39.967010498046875, + "learning_rate": 2.6849315068493153e-06, + "loss": 0.175, + "step": 295 + }, + { + "epoch": 0.27031963470319637, + "grad_norm": 2.923743486404419, + "learning_rate": 2.6940639269406394e-06, + "loss": 0.0236, + "step": 296 + }, + { + "epoch": 0.27123287671232876, + "grad_norm": 23.57546043395996, + "learning_rate": 2.703196347031964e-06, + "loss": 0.2512, + "step": 297 + }, + { + "epoch": 0.27214611872146116, + "grad_norm": 25.691497802734375, + "learning_rate": 2.712328767123288e-06, + "loss": 0.269, + "step": 298 + }, + { + "epoch": 0.2730593607305936, + "grad_norm": 100.08006286621094, + "learning_rate": 2.721461187214612e-06, + "loss": 1.6776, + "step": 299 + }, + { + "epoch": 0.273972602739726, + "grad_norm": 7.796666145324707, + "learning_rate": 2.7305936073059364e-06, + "loss": 0.0766, + "step": 300 + }, + { + "epoch": 0.27488584474885847, + "grad_norm": 15.338479995727539, + "learning_rate": 2.7397260273972604e-06, + "loss": 0.1431, + "step": 301 + }, + { + "epoch": 0.27579908675799086, + "grad_norm": 81.04891204833984, + "learning_rate": 2.748858447488585e-06, + "loss": 0.7442, + "step": 302 + }, + { + "epoch": 0.27671232876712326, + "grad_norm": 71.57987976074219, + "learning_rate": 2.757990867579909e-06, + "loss": 0.254, + "step": 303 + }, + { + "epoch": 0.2776255707762557, + "grad_norm": 54.28063201904297, + "learning_rate": 2.767123287671233e-06, + "loss": 0.5993, + "step": 304 + }, + { + "epoch": 0.2785388127853881, + "grad_norm": 71.85018920898438, + "learning_rate": 2.7762557077625574e-06, + "loss": 0.5883, + "step": 305 + }, + { + "epoch": 0.27945205479452057, + "grad_norm": 45.30476379394531, + "learning_rate": 2.7853881278538815e-06, + "loss": 0.6702, + "step": 306 + }, + { + "epoch": 0.28036529680365296, + "grad_norm": 7.189358711242676, + "learning_rate": 2.794520547945206e-06, + "loss": 0.0755, + "step": 307 + }, + { + "epoch": 0.28127853881278536, + "grad_norm": 45.324459075927734, + "learning_rate": 2.80365296803653e-06, + "loss": 0.4925, + "step": 308 + }, + { + "epoch": 0.2821917808219178, + "grad_norm": 78.59455871582031, + "learning_rate": 2.812785388127854e-06, + "loss": 0.965, + "step": 309 + }, + { + "epoch": 0.2831050228310502, + "grad_norm": 11.569807052612305, + "learning_rate": 2.8219178082191785e-06, + "loss": 0.0825, + "step": 310 + }, + { + "epoch": 0.28401826484018267, + "grad_norm": 30.336713790893555, + "learning_rate": 2.8310502283105025e-06, + "loss": 0.2364, + "step": 311 + }, + { + "epoch": 0.28493150684931506, + "grad_norm": 70.9504165649414, + "learning_rate": 2.840182648401827e-06, + "loss": 2.0262, + "step": 312 + }, + { + "epoch": 0.28584474885844746, + "grad_norm": 21.231395721435547, + "learning_rate": 2.849315068493151e-06, + "loss": 0.1818, + "step": 313 + }, + { + "epoch": 0.2867579908675799, + "grad_norm": 92.81367492675781, + "learning_rate": 2.8584474885844747e-06, + "loss": 1.1668, + "step": 314 + }, + { + "epoch": 0.2876712328767123, + "grad_norm": 28.494413375854492, + "learning_rate": 2.8675799086757996e-06, + "loss": 0.3279, + "step": 315 + }, + { + "epoch": 0.28858447488584477, + "grad_norm": 23.38553810119629, + "learning_rate": 2.876712328767123e-06, + "loss": 0.1869, + "step": 316 + }, + { + "epoch": 0.28949771689497716, + "grad_norm": 42.791160583496094, + "learning_rate": 2.885844748858448e-06, + "loss": 0.5843, + "step": 317 + }, + { + "epoch": 0.29041095890410956, + "grad_norm": 109.8545150756836, + "learning_rate": 2.8949771689497717e-06, + "loss": 7.4271, + "step": 318 + }, + { + "epoch": 0.291324200913242, + "grad_norm": 54.00971221923828, + "learning_rate": 2.9041095890410957e-06, + "loss": 0.9039, + "step": 319 + }, + { + "epoch": 0.2922374429223744, + "grad_norm": 41.557777404785156, + "learning_rate": 2.91324200913242e-06, + "loss": 0.4315, + "step": 320 + }, + { + "epoch": 0.29315068493150687, + "grad_norm": 36.58842086791992, + "learning_rate": 2.9223744292237442e-06, + "loss": 0.2768, + "step": 321 + }, + { + "epoch": 0.29406392694063926, + "grad_norm": 82.61620330810547, + "learning_rate": 2.9315068493150687e-06, + "loss": 1.7031, + "step": 322 + }, + { + "epoch": 0.29497716894977166, + "grad_norm": 41.72526168823242, + "learning_rate": 2.9406392694063927e-06, + "loss": 0.7204, + "step": 323 + }, + { + "epoch": 0.2958904109589041, + "grad_norm": 128.79417419433594, + "learning_rate": 2.9497716894977168e-06, + "loss": 2.0796, + "step": 324 + }, + { + "epoch": 0.2968036529680365, + "grad_norm": 5.327880859375, + "learning_rate": 2.9589041095890413e-06, + "loss": 0.064, + "step": 325 + }, + { + "epoch": 0.29771689497716897, + "grad_norm": 42.95458221435547, + "learning_rate": 2.9680365296803653e-06, + "loss": 0.9589, + "step": 326 + }, + { + "epoch": 0.29863013698630136, + "grad_norm": 41.95772933959961, + "learning_rate": 2.9771689497716898e-06, + "loss": 0.9001, + "step": 327 + }, + { + "epoch": 0.29954337899543376, + "grad_norm": 84.6715087890625, + "learning_rate": 2.986301369863014e-06, + "loss": 1.1605, + "step": 328 + }, + { + "epoch": 0.3004566210045662, + "grad_norm": 30.57842445373535, + "learning_rate": 2.995433789954338e-06, + "loss": 0.3636, + "step": 329 + }, + { + "epoch": 0.3013698630136986, + "grad_norm": 66.43045043945312, + "learning_rate": 3.0045662100456623e-06, + "loss": 1.9304, + "step": 330 + }, + { + "epoch": 0.30228310502283107, + "grad_norm": 98.15336608886719, + "learning_rate": 3.0136986301369864e-06, + "loss": 1.8442, + "step": 331 + }, + { + "epoch": 0.30319634703196346, + "grad_norm": 11.013792037963867, + "learning_rate": 3.022831050228311e-06, + "loss": 0.1412, + "step": 332 + }, + { + "epoch": 0.3041095890410959, + "grad_norm": 18.200904846191406, + "learning_rate": 3.031963470319635e-06, + "loss": 0.2437, + "step": 333 + }, + { + "epoch": 0.3050228310502283, + "grad_norm": 10.723212242126465, + "learning_rate": 3.0410958904109593e-06, + "loss": 0.1164, + "step": 334 + }, + { + "epoch": 0.3059360730593607, + "grad_norm": 31.146650314331055, + "learning_rate": 3.0502283105022834e-06, + "loss": 0.343, + "step": 335 + }, + { + "epoch": 0.30684931506849317, + "grad_norm": 24.454259872436523, + "learning_rate": 3.0593607305936074e-06, + "loss": 0.3708, + "step": 336 + }, + { + "epoch": 0.30776255707762556, + "grad_norm": 36.673770904541016, + "learning_rate": 3.068493150684932e-06, + "loss": 0.3873, + "step": 337 + }, + { + "epoch": 0.308675799086758, + "grad_norm": 92.8531494140625, + "learning_rate": 3.077625570776256e-06, + "loss": 0.2058, + "step": 338 + }, + { + "epoch": 0.3095890410958904, + "grad_norm": 53.884098052978516, + "learning_rate": 3.0867579908675804e-06, + "loss": 1.3316, + "step": 339 + }, + { + "epoch": 0.3105022831050228, + "grad_norm": 30.014461517333984, + "learning_rate": 3.0958904109589044e-06, + "loss": 0.4914, + "step": 340 + }, + { + "epoch": 0.31141552511415527, + "grad_norm": 9.839530944824219, + "learning_rate": 3.1050228310502285e-06, + "loss": 0.1474, + "step": 341 + }, + { + "epoch": 0.31232876712328766, + "grad_norm": 62.871219635009766, + "learning_rate": 3.114155251141553e-06, + "loss": 0.7689, + "step": 342 + }, + { + "epoch": 0.3132420091324201, + "grad_norm": 3.873842477798462, + "learning_rate": 3.123287671232877e-06, + "loss": 0.0184, + "step": 343 + }, + { + "epoch": 0.3141552511415525, + "grad_norm": 8.182830810546875, + "learning_rate": 3.1324200913242014e-06, + "loss": 0.093, + "step": 344 + }, + { + "epoch": 0.3150684931506849, + "grad_norm": 19.894065856933594, + "learning_rate": 3.1415525114155255e-06, + "loss": 0.2357, + "step": 345 + }, + { + "epoch": 0.31598173515981737, + "grad_norm": 30.317304611206055, + "learning_rate": 3.1506849315068495e-06, + "loss": 0.3805, + "step": 346 + }, + { + "epoch": 0.31689497716894977, + "grad_norm": 11.492612838745117, + "learning_rate": 3.159817351598174e-06, + "loss": 0.12, + "step": 347 + }, + { + "epoch": 0.3178082191780822, + "grad_norm": 51.9879035949707, + "learning_rate": 3.168949771689498e-06, + "loss": 0.4983, + "step": 348 + }, + { + "epoch": 0.3187214611872146, + "grad_norm": 54.24248504638672, + "learning_rate": 3.1780821917808225e-06, + "loss": 1.3431, + "step": 349 + }, + { + "epoch": 0.319634703196347, + "grad_norm": 12.117453575134277, + "learning_rate": 3.1872146118721465e-06, + "loss": 0.1584, + "step": 350 + }, + { + "epoch": 0.32054794520547947, + "grad_norm": 7.585262298583984, + "learning_rate": 3.1963470319634706e-06, + "loss": 0.0566, + "step": 351 + }, + { + "epoch": 0.32146118721461187, + "grad_norm": 28.901395797729492, + "learning_rate": 3.205479452054795e-06, + "loss": 0.3165, + "step": 352 + }, + { + "epoch": 0.3223744292237443, + "grad_norm": 105.32510375976562, + "learning_rate": 3.214611872146119e-06, + "loss": 2.2013, + "step": 353 + }, + { + "epoch": 0.3232876712328767, + "grad_norm": 27.528278350830078, + "learning_rate": 3.2237442922374436e-06, + "loss": 0.5366, + "step": 354 + }, + { + "epoch": 0.3242009132420091, + "grad_norm": 15.290724754333496, + "learning_rate": 3.2328767123287676e-06, + "loss": 0.1641, + "step": 355 + }, + { + "epoch": 0.32511415525114157, + "grad_norm": 4.891739368438721, + "learning_rate": 3.242009132420091e-06, + "loss": 0.0617, + "step": 356 + }, + { + "epoch": 0.32602739726027397, + "grad_norm": 68.54436492919922, + "learning_rate": 3.251141552511416e-06, + "loss": 1.4473, + "step": 357 + }, + { + "epoch": 0.3269406392694064, + "grad_norm": 35.48155975341797, + "learning_rate": 3.2602739726027397e-06, + "loss": 0.4421, + "step": 358 + }, + { + "epoch": 0.3278538812785388, + "grad_norm": 80.57593536376953, + "learning_rate": 3.2694063926940646e-06, + "loss": 1.2478, + "step": 359 + }, + { + "epoch": 0.3287671232876712, + "grad_norm": 37.67930603027344, + "learning_rate": 3.2785388127853882e-06, + "loss": 0.3729, + "step": 360 + }, + { + "epoch": 0.32968036529680367, + "grad_norm": 94.0909194946289, + "learning_rate": 3.2876712328767123e-06, + "loss": 3.6053, + "step": 361 + }, + { + "epoch": 0.33059360730593607, + "grad_norm": 66.16034698486328, + "learning_rate": 3.296803652968037e-06, + "loss": 2.5699, + "step": 362 + }, + { + "epoch": 0.3315068493150685, + "grad_norm": 24.935449600219727, + "learning_rate": 3.3059360730593608e-06, + "loss": 0.2576, + "step": 363 + }, + { + "epoch": 0.3324200913242009, + "grad_norm": 14.09774398803711, + "learning_rate": 3.3150684931506857e-06, + "loss": 0.1702, + "step": 364 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 7.069679260253906, + "learning_rate": 3.3242009132420093e-06, + "loss": 0.0727, + "step": 365 + }, + { + "epoch": 0.33424657534246577, + "grad_norm": 15.943455696105957, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.1069, + "step": 366 + }, + { + "epoch": 0.33515981735159817, + "grad_norm": 23.07937240600586, + "learning_rate": 3.342465753424658e-06, + "loss": 0.2361, + "step": 367 + }, + { + "epoch": 0.3360730593607306, + "grad_norm": 67.49198913574219, + "learning_rate": 3.351598173515982e-06, + "loss": 1.154, + "step": 368 + }, + { + "epoch": 0.336986301369863, + "grad_norm": 65.07665252685547, + "learning_rate": 3.3607305936073063e-06, + "loss": 0.7827, + "step": 369 + }, + { + "epoch": 0.3378995433789954, + "grad_norm": 10.846187591552734, + "learning_rate": 3.3698630136986303e-06, + "loss": 0.1076, + "step": 370 + }, + { + "epoch": 0.33881278538812787, + "grad_norm": 145.4097442626953, + "learning_rate": 3.3789954337899544e-06, + "loss": 3.3325, + "step": 371 + }, + { + "epoch": 0.33972602739726027, + "grad_norm": 6.186230182647705, + "learning_rate": 3.388127853881279e-06, + "loss": 0.0381, + "step": 372 + }, + { + "epoch": 0.3406392694063927, + "grad_norm": 75.63530731201172, + "learning_rate": 3.397260273972603e-06, + "loss": 1.1268, + "step": 373 + }, + { + "epoch": 0.3415525114155251, + "grad_norm": 107.10131072998047, + "learning_rate": 3.4063926940639274e-06, + "loss": 5.1749, + "step": 374 + }, + { + "epoch": 0.3424657534246575, + "grad_norm": 19.122350692749023, + "learning_rate": 3.4155251141552514e-06, + "loss": 0.2293, + "step": 375 + }, + { + "epoch": 0.34337899543378997, + "grad_norm": 40.284019470214844, + "learning_rate": 3.4246575342465754e-06, + "loss": 0.4166, + "step": 376 + }, + { + "epoch": 0.34429223744292237, + "grad_norm": 13.691253662109375, + "learning_rate": 3.4337899543379e-06, + "loss": 0.1444, + "step": 377 + }, + { + "epoch": 0.3452054794520548, + "grad_norm": 33.971004486083984, + "learning_rate": 3.442922374429224e-06, + "loss": 0.4653, + "step": 378 + }, + { + "epoch": 0.3461187214611872, + "grad_norm": 74.86805725097656, + "learning_rate": 3.4520547945205484e-06, + "loss": 2.9559, + "step": 379 + }, + { + "epoch": 0.3470319634703196, + "grad_norm": 6.316058158874512, + "learning_rate": 3.4611872146118725e-06, + "loss": 0.0627, + "step": 380 + }, + { + "epoch": 0.34794520547945207, + "grad_norm": 18.111894607543945, + "learning_rate": 3.4703196347031965e-06, + "loss": 0.2882, + "step": 381 + }, + { + "epoch": 0.34885844748858447, + "grad_norm": 7.735611915588379, + "learning_rate": 3.479452054794521e-06, + "loss": 0.0976, + "step": 382 + }, + { + "epoch": 0.3497716894977169, + "grad_norm": 25.34828758239746, + "learning_rate": 3.488584474885845e-06, + "loss": 0.2762, + "step": 383 + }, + { + "epoch": 0.3506849315068493, + "grad_norm": 12.451370239257812, + "learning_rate": 3.4977168949771695e-06, + "loss": 0.1446, + "step": 384 + }, + { + "epoch": 0.3515981735159817, + "grad_norm": 24.91458511352539, + "learning_rate": 3.5068493150684935e-06, + "loss": 0.4222, + "step": 385 + }, + { + "epoch": 0.35251141552511417, + "grad_norm": 28.426197052001953, + "learning_rate": 3.5159817351598176e-06, + "loss": 0.3327, + "step": 386 + }, + { + "epoch": 0.35342465753424657, + "grad_norm": 56.31101608276367, + "learning_rate": 3.525114155251142e-06, + "loss": 1.3277, + "step": 387 + }, + { + "epoch": 0.354337899543379, + "grad_norm": 25.42639923095703, + "learning_rate": 3.534246575342466e-06, + "loss": 0.3781, + "step": 388 + }, + { + "epoch": 0.3552511415525114, + "grad_norm": 79.04695129394531, + "learning_rate": 3.5433789954337905e-06, + "loss": 1.1594, + "step": 389 + }, + { + "epoch": 0.3561643835616438, + "grad_norm": 70.95799255371094, + "learning_rate": 3.5525114155251146e-06, + "loss": 1.3525, + "step": 390 + }, + { + "epoch": 0.35707762557077627, + "grad_norm": 69.90977478027344, + "learning_rate": 3.5616438356164386e-06, + "loss": 1.6278, + "step": 391 + }, + { + "epoch": 0.35799086757990867, + "grad_norm": 7.200101852416992, + "learning_rate": 3.570776255707763e-06, + "loss": 0.0588, + "step": 392 + }, + { + "epoch": 0.3589041095890411, + "grad_norm": 19.019670486450195, + "learning_rate": 3.579908675799087e-06, + "loss": 0.2788, + "step": 393 + }, + { + "epoch": 0.3598173515981735, + "grad_norm": 79.18528747558594, + "learning_rate": 3.5890410958904116e-06, + "loss": 1.9011, + "step": 394 + }, + { + "epoch": 0.3607305936073059, + "grad_norm": 12.586182594299316, + "learning_rate": 3.5981735159817356e-06, + "loss": 0.1172, + "step": 395 + }, + { + "epoch": 0.36164383561643837, + "grad_norm": 9.951542854309082, + "learning_rate": 3.6073059360730597e-06, + "loss": 0.1427, + "step": 396 + }, + { + "epoch": 0.36255707762557077, + "grad_norm": 63.2325439453125, + "learning_rate": 3.616438356164384e-06, + "loss": 0.7346, + "step": 397 + }, + { + "epoch": 0.3634703196347032, + "grad_norm": 38.8760871887207, + "learning_rate": 3.625570776255708e-06, + "loss": 0.2062, + "step": 398 + }, + { + "epoch": 0.3643835616438356, + "grad_norm": 56.02519607543945, + "learning_rate": 3.6347031963470326e-06, + "loss": 1.4904, + "step": 399 + }, + { + "epoch": 0.365296803652968, + "grad_norm": 32.925289154052734, + "learning_rate": 3.6438356164383567e-06, + "loss": 0.4544, + "step": 400 + }, + { + "epoch": 0.36621004566210047, + "grad_norm": 128.21607971191406, + "learning_rate": 3.6529680365296803e-06, + "loss": 0.8585, + "step": 401 + }, + { + "epoch": 0.36712328767123287, + "grad_norm": 30.837268829345703, + "learning_rate": 3.662100456621005e-06, + "loss": 0.4327, + "step": 402 + }, + { + "epoch": 0.3680365296803653, + "grad_norm": 59.48014831542969, + "learning_rate": 3.671232876712329e-06, + "loss": 0.33, + "step": 403 + }, + { + "epoch": 0.3689497716894977, + "grad_norm": 79.51265716552734, + "learning_rate": 3.6803652968036537e-06, + "loss": 1.3256, + "step": 404 + }, + { + "epoch": 0.3698630136986301, + "grad_norm": 40.662410736083984, + "learning_rate": 3.6894977168949773e-06, + "loss": 0.505, + "step": 405 + }, + { + "epoch": 0.37077625570776257, + "grad_norm": 62.09318923950195, + "learning_rate": 3.6986301369863014e-06, + "loss": 1.3473, + "step": 406 + }, + { + "epoch": 0.37168949771689497, + "grad_norm": 63.45012664794922, + "learning_rate": 3.707762557077626e-06, + "loss": 1.2271, + "step": 407 + }, + { + "epoch": 0.3726027397260274, + "grad_norm": 35.762962341308594, + "learning_rate": 3.71689497716895e-06, + "loss": 0.1746, + "step": 408 + }, + { + "epoch": 0.3735159817351598, + "grad_norm": 54.59815979003906, + "learning_rate": 3.7260273972602743e-06, + "loss": 0.6056, + "step": 409 + }, + { + "epoch": 0.3744292237442922, + "grad_norm": 98.3079833984375, + "learning_rate": 3.7351598173515984e-06, + "loss": 1.6847, + "step": 410 + }, + { + "epoch": 0.37534246575342467, + "grad_norm": 45.3715705871582, + "learning_rate": 3.7442922374429224e-06, + "loss": 0.5087, + "step": 411 + }, + { + "epoch": 0.37625570776255707, + "grad_norm": 24.033599853515625, + "learning_rate": 3.753424657534247e-06, + "loss": 0.3063, + "step": 412 + }, + { + "epoch": 0.3771689497716895, + "grad_norm": 72.08406829833984, + "learning_rate": 3.762557077625571e-06, + "loss": 3.0312, + "step": 413 + }, + { + "epoch": 0.3780821917808219, + "grad_norm": 36.68943786621094, + "learning_rate": 3.7716894977168954e-06, + "loss": 0.1702, + "step": 414 + }, + { + "epoch": 0.3789954337899543, + "grad_norm": 8.18693733215332, + "learning_rate": 3.7808219178082194e-06, + "loss": 0.0592, + "step": 415 + }, + { + "epoch": 0.37990867579908677, + "grad_norm": 107.80520629882812, + "learning_rate": 3.7899543378995435e-06, + "loss": 0.6594, + "step": 416 + }, + { + "epoch": 0.38082191780821917, + "grad_norm": 42.67121887207031, + "learning_rate": 3.799086757990868e-06, + "loss": 0.3502, + "step": 417 + }, + { + "epoch": 0.3817351598173516, + "grad_norm": 48.518829345703125, + "learning_rate": 3.808219178082192e-06, + "loss": 0.6395, + "step": 418 + }, + { + "epoch": 0.382648401826484, + "grad_norm": 214.4287109375, + "learning_rate": 3.8173515981735164e-06, + "loss": 1.2045, + "step": 419 + }, + { + "epoch": 0.3835616438356164, + "grad_norm": 1.4027366638183594, + "learning_rate": 3.826484018264841e-06, + "loss": 0.0093, + "step": 420 + }, + { + "epoch": 0.38447488584474887, + "grad_norm": 45.3709602355957, + "learning_rate": 3.8356164383561645e-06, + "loss": 0.2501, + "step": 421 + }, + { + "epoch": 0.38538812785388127, + "grad_norm": 33.827518463134766, + "learning_rate": 3.844748858447489e-06, + "loss": 0.2268, + "step": 422 + }, + { + "epoch": 0.3863013698630137, + "grad_norm": 64.39665222167969, + "learning_rate": 3.853881278538813e-06, + "loss": 3.6976, + "step": 423 + }, + { + "epoch": 0.3872146118721461, + "grad_norm": 28.534883499145508, + "learning_rate": 3.863013698630138e-06, + "loss": 0.3585, + "step": 424 + }, + { + "epoch": 0.3881278538812785, + "grad_norm": 25.455547332763672, + "learning_rate": 3.8721461187214615e-06, + "loss": 0.3872, + "step": 425 + }, + { + "epoch": 0.38904109589041097, + "grad_norm": 97.46215057373047, + "learning_rate": 3.881278538812785e-06, + "loss": 3.1757, + "step": 426 + }, + { + "epoch": 0.38995433789954337, + "grad_norm": 16.55772590637207, + "learning_rate": 3.89041095890411e-06, + "loss": 0.1717, + "step": 427 + }, + { + "epoch": 0.3908675799086758, + "grad_norm": 17.179073333740234, + "learning_rate": 3.899543378995434e-06, + "loss": 0.231, + "step": 428 + }, + { + "epoch": 0.3917808219178082, + "grad_norm": 54.01028823852539, + "learning_rate": 3.9086757990867586e-06, + "loss": 0.5778, + "step": 429 + }, + { + "epoch": 0.3926940639269406, + "grad_norm": 128.09420776367188, + "learning_rate": 3.917808219178082e-06, + "loss": 3.343, + "step": 430 + }, + { + "epoch": 0.39360730593607307, + "grad_norm": 54.18558883666992, + "learning_rate": 3.926940639269407e-06, + "loss": 0.5807, + "step": 431 + }, + { + "epoch": 0.39452054794520547, + "grad_norm": 6.520636558532715, + "learning_rate": 3.936073059360731e-06, + "loss": 0.0713, + "step": 432 + }, + { + "epoch": 0.3954337899543379, + "grad_norm": 5.334735870361328, + "learning_rate": 3.945205479452055e-06, + "loss": 0.0411, + "step": 433 + }, + { + "epoch": 0.3963470319634703, + "grad_norm": 38.09136199951172, + "learning_rate": 3.954337899543379e-06, + "loss": 0.6738, + "step": 434 + }, + { + "epoch": 0.3972602739726027, + "grad_norm": 57.8822021484375, + "learning_rate": 3.963470319634704e-06, + "loss": 1.1389, + "step": 435 + }, + { + "epoch": 0.39817351598173517, + "grad_norm": 40.8767204284668, + "learning_rate": 3.972602739726027e-06, + "loss": 0.7902, + "step": 436 + }, + { + "epoch": 0.39908675799086757, + "grad_norm": 0.40228545665740967, + "learning_rate": 3.981735159817352e-06, + "loss": 0.0036, + "step": 437 + }, + { + "epoch": 0.4, + "grad_norm": 5.973355293273926, + "learning_rate": 3.990867579908676e-06, + "loss": 0.0482, + "step": 438 + }, + { + "epoch": 0.4009132420091324, + "grad_norm": 6.143064975738525, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0565, + "step": 439 + }, + { + "epoch": 0.4018264840182648, + "grad_norm": 47.07587432861328, + "learning_rate": 4.009132420091324e-06, + "loss": 0.4334, + "step": 440 + }, + { + "epoch": 0.40273972602739727, + "grad_norm": 59.78684997558594, + "learning_rate": 4.018264840182649e-06, + "loss": 0.8883, + "step": 441 + }, + { + "epoch": 0.40365296803652967, + "grad_norm": 2.0197486877441406, + "learning_rate": 4.027397260273973e-06, + "loss": 0.0164, + "step": 442 + }, + { + "epoch": 0.4045662100456621, + "grad_norm": 29.954967498779297, + "learning_rate": 4.036529680365297e-06, + "loss": 0.4334, + "step": 443 + }, + { + "epoch": 0.4054794520547945, + "grad_norm": 30.133853912353516, + "learning_rate": 4.045662100456621e-06, + "loss": 0.3707, + "step": 444 + }, + { + "epoch": 0.4063926940639269, + "grad_norm": 2.9897966384887695, + "learning_rate": 4.054794520547946e-06, + "loss": 0.0226, + "step": 445 + }, + { + "epoch": 0.40730593607305937, + "grad_norm": 53.9329833984375, + "learning_rate": 4.063926940639269e-06, + "loss": 1.7447, + "step": 446 + }, + { + "epoch": 0.40821917808219177, + "grad_norm": 32.57020950317383, + "learning_rate": 4.073059360730594e-06, + "loss": 0.3327, + "step": 447 + }, + { + "epoch": 0.4091324200913242, + "grad_norm": 38.643680572509766, + "learning_rate": 4.082191780821918e-06, + "loss": 0.7391, + "step": 448 + }, + { + "epoch": 0.4100456621004566, + "grad_norm": 83.08110046386719, + "learning_rate": 4.091324200913243e-06, + "loss": 1.9547, + "step": 449 + }, + { + "epoch": 0.410958904109589, + "grad_norm": 77.34700012207031, + "learning_rate": 4.100456621004566e-06, + "loss": 2.4234, + "step": 450 + }, + { + "epoch": 0.41187214611872147, + "grad_norm": 16.598493576049805, + "learning_rate": 4.109589041095891e-06, + "loss": 0.1678, + "step": 451 + }, + { + "epoch": 0.41278538812785387, + "grad_norm": 285.70330810546875, + "learning_rate": 4.118721461187215e-06, + "loss": 2.0963, + "step": 452 + }, + { + "epoch": 0.4136986301369863, + "grad_norm": 40.8900146484375, + "learning_rate": 4.127853881278539e-06, + "loss": 0.1633, + "step": 453 + }, + { + "epoch": 0.4146118721461187, + "grad_norm": 96.76689910888672, + "learning_rate": 4.136986301369863e-06, + "loss": 1.6194, + "step": 454 + }, + { + "epoch": 0.4155251141552511, + "grad_norm": 113.47908020019531, + "learning_rate": 4.146118721461188e-06, + "loss": 6.1492, + "step": 455 + }, + { + "epoch": 0.41643835616438357, + "grad_norm": 79.85359191894531, + "learning_rate": 4.1552511415525115e-06, + "loss": 2.8074, + "step": 456 + }, + { + "epoch": 0.41735159817351597, + "grad_norm": 77.24742889404297, + "learning_rate": 4.164383561643836e-06, + "loss": 2.4938, + "step": 457 + }, + { + "epoch": 0.4182648401826484, + "grad_norm": 6.207293510437012, + "learning_rate": 4.1735159817351604e-06, + "loss": 0.0488, + "step": 458 + }, + { + "epoch": 0.4191780821917808, + "grad_norm": 70.53875732421875, + "learning_rate": 4.182648401826485e-06, + "loss": 1.9882, + "step": 459 + }, + { + "epoch": 0.4200913242009132, + "grad_norm": 71.66934204101562, + "learning_rate": 4.1917808219178085e-06, + "loss": 2.8655, + "step": 460 + }, + { + "epoch": 0.42100456621004567, + "grad_norm": 10.640972137451172, + "learning_rate": 4.200913242009132e-06, + "loss": 0.1195, + "step": 461 + }, + { + "epoch": 0.42191780821917807, + "grad_norm": 56.44793701171875, + "learning_rate": 4.2100456621004574e-06, + "loss": 1.5405, + "step": 462 + }, + { + "epoch": 0.4228310502283105, + "grad_norm": 79.12266540527344, + "learning_rate": 4.219178082191781e-06, + "loss": 2.1756, + "step": 463 + }, + { + "epoch": 0.4237442922374429, + "grad_norm": 71.85143280029297, + "learning_rate": 4.2283105022831055e-06, + "loss": 4.2621, + "step": 464 + }, + { + "epoch": 0.4246575342465753, + "grad_norm": 90.15496063232422, + "learning_rate": 4.237442922374429e-06, + "loss": 1.7449, + "step": 465 + }, + { + "epoch": 0.42557077625570777, + "grad_norm": 39.98769760131836, + "learning_rate": 4.246575342465754e-06, + "loss": 0.6585, + "step": 466 + }, + { + "epoch": 0.42648401826484017, + "grad_norm": 81.28450012207031, + "learning_rate": 4.255707762557078e-06, + "loss": 1.4628, + "step": 467 + }, + { + "epoch": 0.4273972602739726, + "grad_norm": 28.13211441040039, + "learning_rate": 4.264840182648402e-06, + "loss": 0.3177, + "step": 468 + }, + { + "epoch": 0.428310502283105, + "grad_norm": 40.72591018676758, + "learning_rate": 4.273972602739727e-06, + "loss": 0.7724, + "step": 469 + }, + { + "epoch": 0.4292237442922374, + "grad_norm": 32.5097770690918, + "learning_rate": 4.283105022831051e-06, + "loss": 0.5624, + "step": 470 + }, + { + "epoch": 0.4301369863013699, + "grad_norm": 92.14881134033203, + "learning_rate": 4.292237442922374e-06, + "loss": 0.8604, + "step": 471 + }, + { + "epoch": 0.43105022831050227, + "grad_norm": 142.38510131835938, + "learning_rate": 4.301369863013699e-06, + "loss": 1.8393, + "step": 472 + }, + { + "epoch": 0.4319634703196347, + "grad_norm": 37.2678337097168, + "learning_rate": 4.310502283105023e-06, + "loss": 0.6993, + "step": 473 + }, + { + "epoch": 0.4328767123287671, + "grad_norm": 83.65821075439453, + "learning_rate": 4.319634703196348e-06, + "loss": 1.7803, + "step": 474 + }, + { + "epoch": 0.4337899543378995, + "grad_norm": 87.20782470703125, + "learning_rate": 4.328767123287671e-06, + "loss": 1.031, + "step": 475 + }, + { + "epoch": 0.434703196347032, + "grad_norm": 6.083211421966553, + "learning_rate": 4.337899543378996e-06, + "loss": 0.0616, + "step": 476 + }, + { + "epoch": 0.43561643835616437, + "grad_norm": 36.945316314697266, + "learning_rate": 4.34703196347032e-06, + "loss": 0.479, + "step": 477 + }, + { + "epoch": 0.4365296803652968, + "grad_norm": 51.733673095703125, + "learning_rate": 4.356164383561644e-06, + "loss": 0.4984, + "step": 478 + }, + { + "epoch": 0.4374429223744292, + "grad_norm": 72.68516540527344, + "learning_rate": 4.365296803652968e-06, + "loss": 1.2078, + "step": 479 + }, + { + "epoch": 0.4383561643835616, + "grad_norm": 77.12810516357422, + "learning_rate": 4.374429223744293e-06, + "loss": 1.1394, + "step": 480 + }, + { + "epoch": 0.4392694063926941, + "grad_norm": 48.645103454589844, + "learning_rate": 4.383561643835616e-06, + "loss": 0.6131, + "step": 481 + }, + { + "epoch": 0.44018264840182647, + "grad_norm": 40.3365478515625, + "learning_rate": 4.392694063926941e-06, + "loss": 0.1206, + "step": 482 + }, + { + "epoch": 0.4410958904109589, + "grad_norm": 56.51421356201172, + "learning_rate": 4.401826484018265e-06, + "loss": 2.4283, + "step": 483 + }, + { + "epoch": 0.4420091324200913, + "grad_norm": 119.43048095703125, + "learning_rate": 4.41095890410959e-06, + "loss": 1.9595, + "step": 484 + }, + { + "epoch": 0.4429223744292237, + "grad_norm": 175.7879180908203, + "learning_rate": 4.420091324200913e-06, + "loss": 2.484, + "step": 485 + }, + { + "epoch": 0.4438356164383562, + "grad_norm": 93.11001586914062, + "learning_rate": 4.429223744292238e-06, + "loss": 1.285, + "step": 486 + }, + { + "epoch": 0.44474885844748857, + "grad_norm": 61.50492477416992, + "learning_rate": 4.438356164383562e-06, + "loss": 1.8684, + "step": 487 + }, + { + "epoch": 0.445662100456621, + "grad_norm": 48.88862991333008, + "learning_rate": 4.447488584474886e-06, + "loss": 1.2828, + "step": 488 + }, + { + "epoch": 0.4465753424657534, + "grad_norm": 66.81449127197266, + "learning_rate": 4.45662100456621e-06, + "loss": 1.3404, + "step": 489 + }, + { + "epoch": 0.4474885844748858, + "grad_norm": 301.2268371582031, + "learning_rate": 4.465753424657535e-06, + "loss": 1.4818, + "step": 490 + }, + { + "epoch": 0.4484018264840183, + "grad_norm": 88.18816375732422, + "learning_rate": 4.4748858447488585e-06, + "loss": 0.7531, + "step": 491 + }, + { + "epoch": 0.44931506849315067, + "grad_norm": 55.539459228515625, + "learning_rate": 4.484018264840183e-06, + "loss": 0.9779, + "step": 492 + }, + { + "epoch": 0.4502283105022831, + "grad_norm": 116.52015686035156, + "learning_rate": 4.493150684931507e-06, + "loss": 2.0441, + "step": 493 + }, + { + "epoch": 0.4511415525114155, + "grad_norm": 106.34709167480469, + "learning_rate": 4.502283105022832e-06, + "loss": 0.3274, + "step": 494 + }, + { + "epoch": 0.4520547945205479, + "grad_norm": 260.5641174316406, + "learning_rate": 4.5114155251141555e-06, + "loss": 0.7259, + "step": 495 + }, + { + "epoch": 0.4529680365296804, + "grad_norm": 84.97603607177734, + "learning_rate": 4.52054794520548e-06, + "loss": 2.3906, + "step": 496 + }, + { + "epoch": 0.45388127853881277, + "grad_norm": 58.20850372314453, + "learning_rate": 4.529680365296804e-06, + "loss": 1.4322, + "step": 497 + }, + { + "epoch": 0.4547945205479452, + "grad_norm": 158.07362365722656, + "learning_rate": 4.538812785388128e-06, + "loss": 0.7413, + "step": 498 + }, + { + "epoch": 0.4557077625570776, + "grad_norm": 21.759971618652344, + "learning_rate": 4.5479452054794525e-06, + "loss": 0.1068, + "step": 499 + }, + { + "epoch": 0.45662100456621, + "grad_norm": 83.2422866821289, + "learning_rate": 4.557077625570777e-06, + "loss": 3.6276, + "step": 500 + }, + { + "epoch": 0.4575342465753425, + "grad_norm": 67.7101821899414, + "learning_rate": 4.566210045662101e-06, + "loss": 0.3293, + "step": 501 + }, + { + "epoch": 0.45844748858447487, + "grad_norm": 152.84710693359375, + "learning_rate": 4.575342465753425e-06, + "loss": 2.4638, + "step": 502 + }, + { + "epoch": 0.4593607305936073, + "grad_norm": 135.95692443847656, + "learning_rate": 4.5844748858447495e-06, + "loss": 2.4171, + "step": 503 + }, + { + "epoch": 0.4602739726027397, + "grad_norm": 41.38825225830078, + "learning_rate": 4.593607305936074e-06, + "loss": 0.4773, + "step": 504 + }, + { + "epoch": 0.4611872146118721, + "grad_norm": 64.04532623291016, + "learning_rate": 4.602739726027398e-06, + "loss": 0.9785, + "step": 505 + }, + { + "epoch": 0.4621004566210046, + "grad_norm": 12.29796314239502, + "learning_rate": 4.611872146118721e-06, + "loss": 0.1143, + "step": 506 + }, + { + "epoch": 0.46301369863013697, + "grad_norm": 22.380569458007812, + "learning_rate": 4.6210045662100465e-06, + "loss": 0.2274, + "step": 507 + }, + { + "epoch": 0.4639269406392694, + "grad_norm": 50.53429412841797, + "learning_rate": 4.63013698630137e-06, + "loss": 0.6229, + "step": 508 + }, + { + "epoch": 0.4648401826484018, + "grad_norm": 66.4316177368164, + "learning_rate": 4.639269406392695e-06, + "loss": 1.0369, + "step": 509 + }, + { + "epoch": 0.4657534246575342, + "grad_norm": 92.88690185546875, + "learning_rate": 4.648401826484018e-06, + "loss": 1.2025, + "step": 510 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 57.06919860839844, + "learning_rate": 4.657534246575343e-06, + "loss": 1.2187, + "step": 511 + }, + { + "epoch": 0.46757990867579907, + "grad_norm": 235.53187561035156, + "learning_rate": 4.666666666666667e-06, + "loss": 1.3066, + "step": 512 + }, + { + "epoch": 0.4684931506849315, + "grad_norm": 23.10707664489746, + "learning_rate": 4.675799086757991e-06, + "loss": 0.2466, + "step": 513 + }, + { + "epoch": 0.4694063926940639, + "grad_norm": 74.56914520263672, + "learning_rate": 4.684931506849315e-06, + "loss": 1.8942, + "step": 514 + }, + { + "epoch": 0.4703196347031963, + "grad_norm": 62.548301696777344, + "learning_rate": 4.69406392694064e-06, + "loss": 1.2762, + "step": 515 + }, + { + "epoch": 0.4712328767123288, + "grad_norm": 277.53692626953125, + "learning_rate": 4.703196347031963e-06, + "loss": 0.4307, + "step": 516 + }, + { + "epoch": 0.47214611872146117, + "grad_norm": 76.259521484375, + "learning_rate": 4.712328767123288e-06, + "loss": 0.1953, + "step": 517 + }, + { + "epoch": 0.4730593607305936, + "grad_norm": 30.028995513916016, + "learning_rate": 4.721461187214612e-06, + "loss": 0.3878, + "step": 518 + }, + { + "epoch": 0.473972602739726, + "grad_norm": 53.572776794433594, + "learning_rate": 4.730593607305937e-06, + "loss": 0.5867, + "step": 519 + }, + { + "epoch": 0.4748858447488584, + "grad_norm": 94.72140502929688, + "learning_rate": 4.73972602739726e-06, + "loss": 0.9318, + "step": 520 + }, + { + "epoch": 0.4757990867579909, + "grad_norm": 57.214561462402344, + "learning_rate": 4.748858447488585e-06, + "loss": 1.0495, + "step": 521 + }, + { + "epoch": 0.4767123287671233, + "grad_norm": 56.35989761352539, + "learning_rate": 4.757990867579909e-06, + "loss": 1.5294, + "step": 522 + }, + { + "epoch": 0.4776255707762557, + "grad_norm": 73.49085235595703, + "learning_rate": 4.767123287671233e-06, + "loss": 1.6441, + "step": 523 + }, + { + "epoch": 0.4785388127853881, + "grad_norm": 37.295440673828125, + "learning_rate": 4.776255707762557e-06, + "loss": 0.3143, + "step": 524 + }, + { + "epoch": 0.4794520547945205, + "grad_norm": 83.23880767822266, + "learning_rate": 4.785388127853882e-06, + "loss": 1.3545, + "step": 525 + }, + { + "epoch": 0.480365296803653, + "grad_norm": 53.468017578125, + "learning_rate": 4.7945205479452054e-06, + "loss": 1.0666, + "step": 526 + }, + { + "epoch": 0.4812785388127854, + "grad_norm": 35.33201217651367, + "learning_rate": 4.80365296803653e-06, + "loss": 0.5089, + "step": 527 + }, + { + "epoch": 0.4821917808219178, + "grad_norm": 18.14586067199707, + "learning_rate": 4.812785388127854e-06, + "loss": 0.2326, + "step": 528 + }, + { + "epoch": 0.4831050228310502, + "grad_norm": 90.72901153564453, + "learning_rate": 4.821917808219179e-06, + "loss": 1.5325, + "step": 529 + }, + { + "epoch": 0.4840182648401826, + "grad_norm": 41.64129638671875, + "learning_rate": 4.8310502283105025e-06, + "loss": 1.0641, + "step": 530 + }, + { + "epoch": 0.4849315068493151, + "grad_norm": 64.13614654541016, + "learning_rate": 4.840182648401827e-06, + "loss": 1.1299, + "step": 531 + }, + { + "epoch": 0.4858447488584475, + "grad_norm": 24.585241317749023, + "learning_rate": 4.849315068493151e-06, + "loss": 0.3823, + "step": 532 + }, + { + "epoch": 0.4867579908675799, + "grad_norm": 41.64512634277344, + "learning_rate": 4.858447488584475e-06, + "loss": 0.858, + "step": 533 + }, + { + "epoch": 0.4876712328767123, + "grad_norm": 81.23416900634766, + "learning_rate": 4.8675799086757995e-06, + "loss": 1.3271, + "step": 534 + }, + { + "epoch": 0.4885844748858447, + "grad_norm": 37.82551193237305, + "learning_rate": 4.876712328767124e-06, + "loss": 0.8122, + "step": 535 + }, + { + "epoch": 0.4894977168949772, + "grad_norm": 52.37285614013672, + "learning_rate": 4.8858447488584476e-06, + "loss": 0.7499, + "step": 536 + }, + { + "epoch": 0.4904109589041096, + "grad_norm": 56.59807586669922, + "learning_rate": 4.894977168949772e-06, + "loss": 2.2483, + "step": 537 + }, + { + "epoch": 0.491324200913242, + "grad_norm": 6.683116436004639, + "learning_rate": 4.9041095890410965e-06, + "loss": 0.0762, + "step": 538 + }, + { + "epoch": 0.4922374429223744, + "grad_norm": 70.1314697265625, + "learning_rate": 4.913242009132421e-06, + "loss": 3.6631, + "step": 539 + }, + { + "epoch": 0.4931506849315068, + "grad_norm": 184.19931030273438, + "learning_rate": 4.9223744292237446e-06, + "loss": 1.7276, + "step": 540 + }, + { + "epoch": 0.4940639269406393, + "grad_norm": 43.94780349731445, + "learning_rate": 4.931506849315069e-06, + "loss": 1.2903, + "step": 541 + }, + { + "epoch": 0.4949771689497717, + "grad_norm": 87.876708984375, + "learning_rate": 4.9406392694063935e-06, + "loss": 1.8646, + "step": 542 + }, + { + "epoch": 0.4958904109589041, + "grad_norm": 69.56128692626953, + "learning_rate": 4.949771689497717e-06, + "loss": 2.0991, + "step": 543 + }, + { + "epoch": 0.4968036529680365, + "grad_norm": 63.239532470703125, + "learning_rate": 4.958904109589042e-06, + "loss": 2.0594, + "step": 544 + }, + { + "epoch": 0.4977168949771689, + "grad_norm": 29.67751693725586, + "learning_rate": 4.968036529680366e-06, + "loss": 0.2907, + "step": 545 + }, + { + "epoch": 0.4986301369863014, + "grad_norm": 3.5098013877868652, + "learning_rate": 4.97716894977169e-06, + "loss": 0.0399, + "step": 546 + }, + { + "epoch": 0.4995433789954338, + "grad_norm": 14.555994033813477, + "learning_rate": 4.986301369863014e-06, + "loss": 0.1431, + "step": 547 + }, + { + "epoch": 0.5004566210045662, + "grad_norm": 76.59325408935547, + "learning_rate": 4.995433789954338e-06, + "loss": 1.3638, + "step": 548 + }, + { + "epoch": 0.5013698630136987, + "grad_norm": 22.4242000579834, + "learning_rate": 5.004566210045663e-06, + "loss": 0.4697, + "step": 549 + }, + { + "epoch": 0.502283105022831, + "grad_norm": 64.79351043701172, + "learning_rate": 5.0136986301369875e-06, + "loss": 1.328, + "step": 550 + }, + { + "epoch": 0.5031963470319635, + "grad_norm": 9.013267517089844, + "learning_rate": 5.02283105022831e-06, + "loss": 0.1191, + "step": 551 + }, + { + "epoch": 0.5041095890410959, + "grad_norm": 48.840614318847656, + "learning_rate": 5.031963470319635e-06, + "loss": 1.3248, + "step": 552 + }, + { + "epoch": 0.5050228310502283, + "grad_norm": 47.725616455078125, + "learning_rate": 5.04109589041096e-06, + "loss": 1.7435, + "step": 553 + }, + { + "epoch": 0.5059360730593607, + "grad_norm": 87.16140747070312, + "learning_rate": 5.050228310502283e-06, + "loss": 3.1934, + "step": 554 + }, + { + "epoch": 0.5068493150684932, + "grad_norm": 78.09439086914062, + "learning_rate": 5.059360730593607e-06, + "loss": 1.3228, + "step": 555 + }, + { + "epoch": 0.5077625570776255, + "grad_norm": 37.910640716552734, + "learning_rate": 5.068493150684932e-06, + "loss": 0.319, + "step": 556 + }, + { + "epoch": 0.508675799086758, + "grad_norm": 30.11316680908203, + "learning_rate": 5.077625570776255e-06, + "loss": 0.3461, + "step": 557 + }, + { + "epoch": 0.5095890410958904, + "grad_norm": 62.06250762939453, + "learning_rate": 5.08675799086758e-06, + "loss": 1.2789, + "step": 558 + }, + { + "epoch": 0.5105022831050229, + "grad_norm": 39.18709182739258, + "learning_rate": 5.095890410958904e-06, + "loss": 0.4872, + "step": 559 + }, + { + "epoch": 0.5114155251141552, + "grad_norm": 22.610260009765625, + "learning_rate": 5.10502283105023e-06, + "loss": 0.3782, + "step": 560 + }, + { + "epoch": 0.5123287671232877, + "grad_norm": 14.112375259399414, + "learning_rate": 5.114155251141552e-06, + "loss": 0.087, + "step": 561 + }, + { + "epoch": 0.5132420091324201, + "grad_norm": 79.7240982055664, + "learning_rate": 5.123287671232877e-06, + "loss": 0.2384, + "step": 562 + }, + { + "epoch": 0.5141552511415525, + "grad_norm": 31.070199966430664, + "learning_rate": 5.132420091324201e-06, + "loss": 0.3266, + "step": 563 + }, + { + "epoch": 0.5150684931506849, + "grad_norm": 34.800357818603516, + "learning_rate": 5.141552511415525e-06, + "loss": 0.4395, + "step": 564 + }, + { + "epoch": 0.5159817351598174, + "grad_norm": 56.41197967529297, + "learning_rate": 5.1506849315068494e-06, + "loss": 1.0599, + "step": 565 + }, + { + "epoch": 0.5168949771689497, + "grad_norm": 85.5616226196289, + "learning_rate": 5.159817351598174e-06, + "loss": 0.8914, + "step": 566 + }, + { + "epoch": 0.5178082191780822, + "grad_norm": 57.68589401245117, + "learning_rate": 5.1689497716894975e-06, + "loss": 2.1123, + "step": 567 + }, + { + "epoch": 0.5187214611872146, + "grad_norm": 127.50444030761719, + "learning_rate": 5.178082191780822e-06, + "loss": 1.6264, + "step": 568 + }, + { + "epoch": 0.5196347031963471, + "grad_norm": 51.08968734741211, + "learning_rate": 5.1872146118721464e-06, + "loss": 0.7132, + "step": 569 + }, + { + "epoch": 0.5205479452054794, + "grad_norm": 43.525108337402344, + "learning_rate": 5.196347031963471e-06, + "loss": 0.9168, + "step": 570 + }, + { + "epoch": 0.5214611872146119, + "grad_norm": 73.32159423828125, + "learning_rate": 5.2054794520547945e-06, + "loss": 1.4898, + "step": 571 + }, + { + "epoch": 0.5223744292237443, + "grad_norm": 18.531721115112305, + "learning_rate": 5.214611872146119e-06, + "loss": 0.3852, + "step": 572 + }, + { + "epoch": 0.5232876712328767, + "grad_norm": 53.15848922729492, + "learning_rate": 5.2237442922374435e-06, + "loss": 0.3744, + "step": 573 + }, + { + "epoch": 0.5242009132420091, + "grad_norm": 60.74750900268555, + "learning_rate": 5.232876712328767e-06, + "loss": 1.3379, + "step": 574 + }, + { + "epoch": 0.5251141552511416, + "grad_norm": 31.92019271850586, + "learning_rate": 5.2420091324200915e-06, + "loss": 0.3908, + "step": 575 + }, + { + "epoch": 0.5260273972602739, + "grad_norm": 0.4140773117542267, + "learning_rate": 5.251141552511416e-06, + "loss": 0.0035, + "step": 576 + }, + { + "epoch": 0.5269406392694064, + "grad_norm": 78.08464050292969, + "learning_rate": 5.26027397260274e-06, + "loss": 1.5057, + "step": 577 + }, + { + "epoch": 0.5278538812785388, + "grad_norm": 74.39046478271484, + "learning_rate": 5.269406392694064e-06, + "loss": 0.5927, + "step": 578 + }, + { + "epoch": 0.5287671232876713, + "grad_norm": 4.943101406097412, + "learning_rate": 5.2785388127853886e-06, + "loss": 0.0417, + "step": 579 + }, + { + "epoch": 0.5296803652968036, + "grad_norm": 22.32513999938965, + "learning_rate": 5.287671232876713e-06, + "loss": 0.3348, + "step": 580 + }, + { + "epoch": 0.5305936073059361, + "grad_norm": 21.722620010375977, + "learning_rate": 5.296803652968037e-06, + "loss": 0.1822, + "step": 581 + }, + { + "epoch": 0.5315068493150685, + "grad_norm": 60.318809509277344, + "learning_rate": 5.305936073059361e-06, + "loss": 1.4401, + "step": 582 + }, + { + "epoch": 0.5324200913242009, + "grad_norm": 37.44599914550781, + "learning_rate": 5.3150684931506856e-06, + "loss": 0.3819, + "step": 583 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 13.201604843139648, + "learning_rate": 5.324200913242009e-06, + "loss": 0.1645, + "step": 584 + }, + { + "epoch": 0.5342465753424658, + "grad_norm": 40.97284698486328, + "learning_rate": 5.333333333333334e-06, + "loss": 1.3687, + "step": 585 + }, + { + "epoch": 0.5351598173515981, + "grad_norm": 23.420198440551758, + "learning_rate": 5.342465753424658e-06, + "loss": 0.3294, + "step": 586 + }, + { + "epoch": 0.5360730593607306, + "grad_norm": 18.84324073791504, + "learning_rate": 5.351598173515982e-06, + "loss": 0.2077, + "step": 587 + }, + { + "epoch": 0.536986301369863, + "grad_norm": 46.553890228271484, + "learning_rate": 5.360730593607306e-06, + "loss": 1.068, + "step": 588 + }, + { + "epoch": 0.5378995433789955, + "grad_norm": 10.080890655517578, + "learning_rate": 5.369863013698631e-06, + "loss": 0.089, + "step": 589 + }, + { + "epoch": 0.5388127853881278, + "grad_norm": 41.5589714050293, + "learning_rate": 5.378995433789955e-06, + "loss": 0.6124, + "step": 590 + }, + { + "epoch": 0.5397260273972603, + "grad_norm": 16.695966720581055, + "learning_rate": 5.388127853881279e-06, + "loss": 0.1924, + "step": 591 + }, + { + "epoch": 0.5406392694063927, + "grad_norm": 51.20433807373047, + "learning_rate": 5.397260273972603e-06, + "loss": 0.7019, + "step": 592 + }, + { + "epoch": 0.5415525114155251, + "grad_norm": 21.87163734436035, + "learning_rate": 5.406392694063928e-06, + "loss": 0.3906, + "step": 593 + }, + { + "epoch": 0.5424657534246575, + "grad_norm": 9.78168773651123, + "learning_rate": 5.415525114155251e-06, + "loss": 0.1208, + "step": 594 + }, + { + "epoch": 0.54337899543379, + "grad_norm": 91.91058349609375, + "learning_rate": 5.424657534246576e-06, + "loss": 1.1218, + "step": 595 + }, + { + "epoch": 0.5442922374429223, + "grad_norm": 8.832018852233887, + "learning_rate": 5.4337899543379e-06, + "loss": 0.1161, + "step": 596 + }, + { + "epoch": 0.5452054794520548, + "grad_norm": 7.5638651847839355, + "learning_rate": 5.442922374429224e-06, + "loss": 0.0824, + "step": 597 + }, + { + "epoch": 0.5461187214611872, + "grad_norm": 51.32774353027344, + "learning_rate": 5.452054794520548e-06, + "loss": 1.0961, + "step": 598 + }, + { + "epoch": 0.5470319634703197, + "grad_norm": 28.25202178955078, + "learning_rate": 5.461187214611873e-06, + "loss": 0.4639, + "step": 599 + }, + { + "epoch": 0.547945205479452, + "grad_norm": 36.279388427734375, + "learning_rate": 5.470319634703197e-06, + "loss": 0.8421, + "step": 600 + }, + { + "epoch": 0.5488584474885845, + "grad_norm": 85.42608642578125, + "learning_rate": 5.479452054794521e-06, + "loss": 1.6631, + "step": 601 + }, + { + "epoch": 0.5497716894977169, + "grad_norm": 43.33143615722656, + "learning_rate": 5.488584474885845e-06, + "loss": 1.0838, + "step": 602 + }, + { + "epoch": 0.5506849315068493, + "grad_norm": 68.16275024414062, + "learning_rate": 5.49771689497717e-06, + "loss": 2.6176, + "step": 603 + }, + { + "epoch": 0.5515981735159817, + "grad_norm": 42.96827697753906, + "learning_rate": 5.506849315068493e-06, + "loss": 1.5563, + "step": 604 + }, + { + "epoch": 0.5525114155251142, + "grad_norm": 25.175565719604492, + "learning_rate": 5.515981735159818e-06, + "loss": 0.4146, + "step": 605 + }, + { + "epoch": 0.5534246575342465, + "grad_norm": 64.73966979980469, + "learning_rate": 5.525114155251142e-06, + "loss": 1.1034, + "step": 606 + }, + { + "epoch": 0.554337899543379, + "grad_norm": 12.092034339904785, + "learning_rate": 5.534246575342466e-06, + "loss": 0.1161, + "step": 607 + }, + { + "epoch": 0.5552511415525114, + "grad_norm": 167.8294219970703, + "learning_rate": 5.5433789954337904e-06, + "loss": 1.6577, + "step": 608 + }, + { + "epoch": 0.5561643835616439, + "grad_norm": 76.42986297607422, + "learning_rate": 5.552511415525115e-06, + "loss": 1.9273, + "step": 609 + }, + { + "epoch": 0.5570776255707762, + "grad_norm": 12.808704376220703, + "learning_rate": 5.561643835616439e-06, + "loss": 0.1636, + "step": 610 + }, + { + "epoch": 0.5579908675799087, + "grad_norm": 150.81614685058594, + "learning_rate": 5.570776255707763e-06, + "loss": 1.7991, + "step": 611 + }, + { + "epoch": 0.5589041095890411, + "grad_norm": 84.90834045410156, + "learning_rate": 5.5799086757990874e-06, + "loss": 3.893, + "step": 612 + }, + { + "epoch": 0.5598173515981735, + "grad_norm": 5.0928168296813965, + "learning_rate": 5.589041095890412e-06, + "loss": 0.0535, + "step": 613 + }, + { + "epoch": 0.5607305936073059, + "grad_norm": 48.974056243896484, + "learning_rate": 5.5981735159817355e-06, + "loss": 0.1706, + "step": 614 + }, + { + "epoch": 0.5616438356164384, + "grad_norm": 28.075958251953125, + "learning_rate": 5.60730593607306e-06, + "loss": 0.3658, + "step": 615 + }, + { + "epoch": 0.5625570776255707, + "grad_norm": 49.270816802978516, + "learning_rate": 5.6164383561643845e-06, + "loss": 1.2624, + "step": 616 + }, + { + "epoch": 0.5634703196347032, + "grad_norm": 27.469465255737305, + "learning_rate": 5.625570776255708e-06, + "loss": 0.435, + "step": 617 + }, + { + "epoch": 0.5643835616438356, + "grad_norm": 41.164703369140625, + "learning_rate": 5.6347031963470325e-06, + "loss": 0.947, + "step": 618 + }, + { + "epoch": 0.5652968036529681, + "grad_norm": 48.250648498535156, + "learning_rate": 5.643835616438357e-06, + "loss": 1.0167, + "step": 619 + }, + { + "epoch": 0.5662100456621004, + "grad_norm": 25.843896865844727, + "learning_rate": 5.6529680365296815e-06, + "loss": 0.321, + "step": 620 + }, + { + "epoch": 0.5671232876712329, + "grad_norm": 53.40678787231445, + "learning_rate": 5.662100456621005e-06, + "loss": 0.9605, + "step": 621 + }, + { + "epoch": 0.5680365296803653, + "grad_norm": 27.423219680786133, + "learning_rate": 5.6712328767123296e-06, + "loss": 0.3334, + "step": 622 + }, + { + "epoch": 0.5689497716894977, + "grad_norm": 3.868077278137207, + "learning_rate": 5.680365296803654e-06, + "loss": 0.0523, + "step": 623 + }, + { + "epoch": 0.5698630136986301, + "grad_norm": 29.393131256103516, + "learning_rate": 5.689497716894977e-06, + "loss": 0.5305, + "step": 624 + }, + { + "epoch": 0.5707762557077626, + "grad_norm": 19.736719131469727, + "learning_rate": 5.698630136986302e-06, + "loss": 0.3656, + "step": 625 + }, + { + "epoch": 0.5716894977168949, + "grad_norm": 2.0738935470581055, + "learning_rate": 5.7077625570776266e-06, + "loss": 0.0189, + "step": 626 + }, + { + "epoch": 0.5726027397260274, + "grad_norm": 24.40214729309082, + "learning_rate": 5.716894977168949e-06, + "loss": 0.3326, + "step": 627 + }, + { + "epoch": 0.5735159817351598, + "grad_norm": 12.220474243164062, + "learning_rate": 5.726027397260274e-06, + "loss": 0.158, + "step": 628 + }, + { + "epoch": 0.5744292237442923, + "grad_norm": 56.92481994628906, + "learning_rate": 5.735159817351599e-06, + "loss": 1.5841, + "step": 629 + }, + { + "epoch": 0.5753424657534246, + "grad_norm": 80.17408752441406, + "learning_rate": 5.744292237442924e-06, + "loss": 2.5803, + "step": 630 + }, + { + "epoch": 0.5762557077625571, + "grad_norm": 57.25596237182617, + "learning_rate": 5.753424657534246e-06, + "loss": 0.9644, + "step": 631 + }, + { + "epoch": 0.5771689497716895, + "grad_norm": 10.503912925720215, + "learning_rate": 5.762557077625572e-06, + "loss": 0.1615, + "step": 632 + }, + { + "epoch": 0.5780821917808219, + "grad_norm": 16.7788028717041, + "learning_rate": 5.771689497716896e-06, + "loss": 0.2602, + "step": 633 + }, + { + "epoch": 0.5789954337899543, + "grad_norm": 86.9013671875, + "learning_rate": 5.780821917808219e-06, + "loss": 1.3376, + "step": 634 + }, + { + "epoch": 0.5799086757990868, + "grad_norm": 33.3195915222168, + "learning_rate": 5.789954337899543e-06, + "loss": 0.7646, + "step": 635 + }, + { + "epoch": 0.5808219178082191, + "grad_norm": 14.07887077331543, + "learning_rate": 5.799086757990869e-06, + "loss": 0.1378, + "step": 636 + }, + { + "epoch": 0.5817351598173516, + "grad_norm": 41.867942810058594, + "learning_rate": 5.8082191780821915e-06, + "loss": 0.8743, + "step": 637 + }, + { + "epoch": 0.582648401826484, + "grad_norm": 100.45879364013672, + "learning_rate": 5.817351598173516e-06, + "loss": 1.2221, + "step": 638 + }, + { + "epoch": 0.5835616438356165, + "grad_norm": 41.24256896972656, + "learning_rate": 5.82648401826484e-06, + "loss": 0.3686, + "step": 639 + }, + { + "epoch": 0.5844748858447488, + "grad_norm": 26.170076370239258, + "learning_rate": 5.835616438356166e-06, + "loss": 0.4008, + "step": 640 + }, + { + "epoch": 0.5853881278538813, + "grad_norm": 25.71246910095215, + "learning_rate": 5.8447488584474885e-06, + "loss": 0.4252, + "step": 641 + }, + { + "epoch": 0.5863013698630137, + "grad_norm": 38.490936279296875, + "learning_rate": 5.853881278538813e-06, + "loss": 0.8594, + "step": 642 + }, + { + "epoch": 0.5872146118721461, + "grad_norm": 64.95326232910156, + "learning_rate": 5.863013698630137e-06, + "loss": 1.6991, + "step": 643 + }, + { + "epoch": 0.5881278538812785, + "grad_norm": 39.928462982177734, + "learning_rate": 5.872146118721461e-06, + "loss": 0.6769, + "step": 644 + }, + { + "epoch": 0.589041095890411, + "grad_norm": 71.31889343261719, + "learning_rate": 5.8812785388127855e-06, + "loss": 4.3555, + "step": 645 + }, + { + "epoch": 0.5899543378995433, + "grad_norm": 87.74031829833984, + "learning_rate": 5.89041095890411e-06, + "loss": 1.5698, + "step": 646 + }, + { + "epoch": 0.5908675799086758, + "grad_norm": 6.561049938201904, + "learning_rate": 5.8995433789954336e-06, + "loss": 0.0671, + "step": 647 + }, + { + "epoch": 0.5917808219178082, + "grad_norm": 8.169573783874512, + "learning_rate": 5.908675799086758e-06, + "loss": 0.0726, + "step": 648 + }, + { + "epoch": 0.5926940639269407, + "grad_norm": 7.941648483276367, + "learning_rate": 5.9178082191780825e-06, + "loss": 0.0774, + "step": 649 + }, + { + "epoch": 0.593607305936073, + "grad_norm": 34.052974700927734, + "learning_rate": 5.926940639269407e-06, + "loss": 0.4664, + "step": 650 + }, + { + "epoch": 0.5945205479452055, + "grad_norm": 23.149728775024414, + "learning_rate": 5.936073059360731e-06, + "loss": 0.2654, + "step": 651 + }, + { + "epoch": 0.5954337899543379, + "grad_norm": 67.1331558227539, + "learning_rate": 5.945205479452055e-06, + "loss": 1.0033, + "step": 652 + }, + { + "epoch": 0.5963470319634703, + "grad_norm": 23.40542984008789, + "learning_rate": 5.9543378995433795e-06, + "loss": 0.1467, + "step": 653 + }, + { + "epoch": 0.5972602739726027, + "grad_norm": 52.0373649597168, + "learning_rate": 5.963470319634703e-06, + "loss": 1.0895, + "step": 654 + }, + { + "epoch": 0.5981735159817352, + "grad_norm": 102.15977478027344, + "learning_rate": 5.972602739726028e-06, + "loss": 0.5142, + "step": 655 + }, + { + "epoch": 0.5990867579908675, + "grad_norm": 11.692145347595215, + "learning_rate": 5.981735159817352e-06, + "loss": 0.1302, + "step": 656 + }, + { + "epoch": 0.6, + "grad_norm": 58.56364059448242, + "learning_rate": 5.990867579908676e-06, + "loss": 1.509, + "step": 657 + }, + { + "epoch": 0.6009132420091324, + "grad_norm": 15.157914161682129, + "learning_rate": 6e-06, + "loss": 0.1322, + "step": 658 + }, + { + "epoch": 0.6018264840182649, + "grad_norm": 75.32354736328125, + "learning_rate": 6.009132420091325e-06, + "loss": 2.2019, + "step": 659 + }, + { + "epoch": 0.6027397260273972, + "grad_norm": 60.34479904174805, + "learning_rate": 6.018264840182649e-06, + "loss": 1.23, + "step": 660 + }, + { + "epoch": 0.6036529680365297, + "grad_norm": 38.08934020996094, + "learning_rate": 6.027397260273973e-06, + "loss": 1.0489, + "step": 661 + }, + { + "epoch": 0.6045662100456621, + "grad_norm": 54.554115295410156, + "learning_rate": 6.036529680365297e-06, + "loss": 2.8587, + "step": 662 + }, + { + "epoch": 0.6054794520547945, + "grad_norm": 1.243954062461853, + "learning_rate": 6.045662100456622e-06, + "loss": 0.0126, + "step": 663 + }, + { + "epoch": 0.6063926940639269, + "grad_norm": 51.95165252685547, + "learning_rate": 6.054794520547945e-06, + "loss": 0.77, + "step": 664 + }, + { + "epoch": 0.6073059360730594, + "grad_norm": 4.158714771270752, + "learning_rate": 6.06392694063927e-06, + "loss": 0.0557, + "step": 665 + }, + { + "epoch": 0.6082191780821918, + "grad_norm": 48.623619079589844, + "learning_rate": 6.073059360730594e-06, + "loss": 0.6484, + "step": 666 + }, + { + "epoch": 0.6091324200913242, + "grad_norm": 28.061548233032227, + "learning_rate": 6.082191780821919e-06, + "loss": 0.3514, + "step": 667 + }, + { + "epoch": 0.6100456621004566, + "grad_norm": 45.88258743286133, + "learning_rate": 6.091324200913242e-06, + "loss": 0.8221, + "step": 668 + }, + { + "epoch": 0.6109589041095891, + "grad_norm": 8.53674602508545, + "learning_rate": 6.100456621004567e-06, + "loss": 0.1029, + "step": 669 + }, + { + "epoch": 0.6118721461187214, + "grad_norm": 9.285322189331055, + "learning_rate": 6.109589041095891e-06, + "loss": 0.1297, + "step": 670 + }, + { + "epoch": 0.6127853881278539, + "grad_norm": 75.65455627441406, + "learning_rate": 6.118721461187215e-06, + "loss": 1.0241, + "step": 671 + }, + { + "epoch": 0.6136986301369863, + "grad_norm": 34.18675231933594, + "learning_rate": 6.127853881278539e-06, + "loss": 0.3853, + "step": 672 + }, + { + "epoch": 0.6146118721461187, + "grad_norm": 28.333385467529297, + "learning_rate": 6.136986301369864e-06, + "loss": 0.4717, + "step": 673 + }, + { + "epoch": 0.6155251141552511, + "grad_norm": 40.999839782714844, + "learning_rate": 6.146118721461187e-06, + "loss": 0.3262, + "step": 674 + }, + { + "epoch": 0.6164383561643836, + "grad_norm": 41.24323272705078, + "learning_rate": 6.155251141552512e-06, + "loss": 0.8035, + "step": 675 + }, + { + "epoch": 0.617351598173516, + "grad_norm": 85.70073699951172, + "learning_rate": 6.164383561643836e-06, + "loss": 2.9794, + "step": 676 + }, + { + "epoch": 0.6182648401826484, + "grad_norm": 40.81912612915039, + "learning_rate": 6.173515981735161e-06, + "loss": 0.7459, + "step": 677 + }, + { + "epoch": 0.6191780821917808, + "grad_norm": 46.340606689453125, + "learning_rate": 6.182648401826484e-06, + "loss": 1.7197, + "step": 678 + }, + { + "epoch": 0.6200913242009133, + "grad_norm": 33.39655303955078, + "learning_rate": 6.191780821917809e-06, + "loss": 0.5516, + "step": 679 + }, + { + "epoch": 0.6210045662100456, + "grad_norm": 34.23274612426758, + "learning_rate": 6.200913242009133e-06, + "loss": 0.4643, + "step": 680 + }, + { + "epoch": 0.6219178082191781, + "grad_norm": 12.680501937866211, + "learning_rate": 6.210045662100457e-06, + "loss": 0.2042, + "step": 681 + }, + { + "epoch": 0.6228310502283105, + "grad_norm": 56.47810745239258, + "learning_rate": 6.219178082191781e-06, + "loss": 2.5065, + "step": 682 + }, + { + "epoch": 0.6237442922374429, + "grad_norm": 61.573974609375, + "learning_rate": 6.228310502283106e-06, + "loss": 3.0424, + "step": 683 + }, + { + "epoch": 0.6246575342465753, + "grad_norm": 78.63492584228516, + "learning_rate": 6.2374429223744295e-06, + "loss": 0.6031, + "step": 684 + }, + { + "epoch": 0.6255707762557078, + "grad_norm": 39.288211822509766, + "learning_rate": 6.246575342465754e-06, + "loss": 0.6266, + "step": 685 + }, + { + "epoch": 0.6264840182648402, + "grad_norm": 64.41303253173828, + "learning_rate": 6.255707762557078e-06, + "loss": 1.4445, + "step": 686 + }, + { + "epoch": 0.6273972602739726, + "grad_norm": 203.01803588867188, + "learning_rate": 6.264840182648403e-06, + "loss": 0.3818, + "step": 687 + }, + { + "epoch": 0.628310502283105, + "grad_norm": 14.263669967651367, + "learning_rate": 6.2739726027397265e-06, + "loss": 0.1972, + "step": 688 + }, + { + "epoch": 0.6292237442922375, + "grad_norm": 30.385074615478516, + "learning_rate": 6.283105022831051e-06, + "loss": 0.7069, + "step": 689 + }, + { + "epoch": 0.6301369863013698, + "grad_norm": 5.43474006652832, + "learning_rate": 6.292237442922375e-06, + "loss": 0.0612, + "step": 690 + }, + { + "epoch": 0.6310502283105023, + "grad_norm": 55.47772216796875, + "learning_rate": 6.301369863013699e-06, + "loss": 1.6097, + "step": 691 + }, + { + "epoch": 0.6319634703196347, + "grad_norm": 58.96402359008789, + "learning_rate": 6.3105022831050235e-06, + "loss": 1.6413, + "step": 692 + }, + { + "epoch": 0.6328767123287671, + "grad_norm": 25.877939224243164, + "learning_rate": 6.319634703196348e-06, + "loss": 0.6713, + "step": 693 + }, + { + "epoch": 0.6337899543378995, + "grad_norm": 81.61888885498047, + "learning_rate": 6.328767123287672e-06, + "loss": 3.3936, + "step": 694 + }, + { + "epoch": 0.634703196347032, + "grad_norm": 12.506178855895996, + "learning_rate": 6.337899543378996e-06, + "loss": 0.102, + "step": 695 + }, + { + "epoch": 0.6356164383561644, + "grad_norm": 59.501216888427734, + "learning_rate": 6.3470319634703205e-06, + "loss": 1.9662, + "step": 696 + }, + { + "epoch": 0.6365296803652968, + "grad_norm": 12.025671005249023, + "learning_rate": 6.356164383561645e-06, + "loss": 0.1787, + "step": 697 + }, + { + "epoch": 0.6374429223744292, + "grad_norm": 56.215675354003906, + "learning_rate": 6.365296803652969e-06, + "loss": 1.4228, + "step": 698 + }, + { + "epoch": 0.6383561643835617, + "grad_norm": 77.6214828491211, + "learning_rate": 6.374429223744293e-06, + "loss": 1.1893, + "step": 699 + }, + { + "epoch": 0.639269406392694, + "grad_norm": 34.36013412475586, + "learning_rate": 6.3835616438356175e-06, + "loss": 0.739, + "step": 700 + }, + { + "epoch": 0.6401826484018265, + "grad_norm": 30.66062355041504, + "learning_rate": 6.392694063926941e-06, + "loss": 0.68, + "step": 701 + }, + { + "epoch": 0.6410958904109589, + "grad_norm": 39.396427154541016, + "learning_rate": 6.401826484018266e-06, + "loss": 0.4677, + "step": 702 + }, + { + "epoch": 0.6420091324200913, + "grad_norm": 51.799442291259766, + "learning_rate": 6.41095890410959e-06, + "loss": 0.4946, + "step": 703 + }, + { + "epoch": 0.6429223744292237, + "grad_norm": 37.70170211791992, + "learning_rate": 6.420091324200914e-06, + "loss": 0.7904, + "step": 704 + }, + { + "epoch": 0.6438356164383562, + "grad_norm": 11.361842155456543, + "learning_rate": 6.429223744292238e-06, + "loss": 0.142, + "step": 705 + }, + { + "epoch": 0.6447488584474886, + "grad_norm": 85.4591293334961, + "learning_rate": 6.438356164383563e-06, + "loss": 4.7522, + "step": 706 + }, + { + "epoch": 0.645662100456621, + "grad_norm": 115.23799896240234, + "learning_rate": 6.447488584474887e-06, + "loss": 1.8467, + "step": 707 + }, + { + "epoch": 0.6465753424657534, + "grad_norm": 41.9813232421875, + "learning_rate": 6.456621004566211e-06, + "loss": 0.7695, + "step": 708 + }, + { + "epoch": 0.6474885844748859, + "grad_norm": 37.97197723388672, + "learning_rate": 6.465753424657535e-06, + "loss": 0.8999, + "step": 709 + }, + { + "epoch": 0.6484018264840182, + "grad_norm": 20.900197982788086, + "learning_rate": 6.47488584474886e-06, + "loss": 0.4266, + "step": 710 + }, + { + "epoch": 0.6493150684931507, + "grad_norm": 19.70432472229004, + "learning_rate": 6.484018264840182e-06, + "loss": 0.1903, + "step": 711 + }, + { + "epoch": 0.6502283105022831, + "grad_norm": 32.039676666259766, + "learning_rate": 6.493150684931508e-06, + "loss": 0.2849, + "step": 712 + }, + { + "epoch": 0.6511415525114155, + "grad_norm": 30.956302642822266, + "learning_rate": 6.502283105022832e-06, + "loss": 0.5905, + "step": 713 + }, + { + "epoch": 0.6520547945205479, + "grad_norm": 7.4852728843688965, + "learning_rate": 6.511415525114155e-06, + "loss": 0.0959, + "step": 714 + }, + { + "epoch": 0.6529680365296804, + "grad_norm": 86.50015258789062, + "learning_rate": 6.5205479452054794e-06, + "loss": 2.0551, + "step": 715 + }, + { + "epoch": 0.6538812785388128, + "grad_norm": 27.341676712036133, + "learning_rate": 6.529680365296805e-06, + "loss": 0.7099, + "step": 716 + }, + { + "epoch": 0.6547945205479452, + "grad_norm": 44.564334869384766, + "learning_rate": 6.538812785388129e-06, + "loss": 1.1181, + "step": 717 + }, + { + "epoch": 0.6557077625570776, + "grad_norm": 27.554262161254883, + "learning_rate": 6.547945205479452e-06, + "loss": 0.5133, + "step": 718 + }, + { + "epoch": 0.6566210045662101, + "grad_norm": 21.147464752197266, + "learning_rate": 6.5570776255707765e-06, + "loss": 0.3198, + "step": 719 + }, + { + "epoch": 0.6575342465753424, + "grad_norm": 43.03898620605469, + "learning_rate": 6.566210045662102e-06, + "loss": 1.0958, + "step": 720 + }, + { + "epoch": 0.6584474885844749, + "grad_norm": 27.572044372558594, + "learning_rate": 6.5753424657534245e-06, + "loss": 0.6076, + "step": 721 + }, + { + "epoch": 0.6593607305936073, + "grad_norm": 57.559226989746094, + "learning_rate": 6.584474885844749e-06, + "loss": 1.5016, + "step": 722 + }, + { + "epoch": 0.6602739726027397, + "grad_norm": 36.52582931518555, + "learning_rate": 6.593607305936074e-06, + "loss": 0.6997, + "step": 723 + }, + { + "epoch": 0.6611872146118721, + "grad_norm": 17.693391799926758, + "learning_rate": 6.602739726027397e-06, + "loss": 0.2368, + "step": 724 + }, + { + "epoch": 0.6621004566210046, + "grad_norm": 81.96202850341797, + "learning_rate": 6.6118721461187215e-06, + "loss": 2.9275, + "step": 725 + }, + { + "epoch": 0.663013698630137, + "grad_norm": 67.50602722167969, + "learning_rate": 6.621004566210046e-06, + "loss": 1.0212, + "step": 726 + }, + { + "epoch": 0.6639269406392694, + "grad_norm": 52.93470764160156, + "learning_rate": 6.630136986301371e-06, + "loss": 1.6375, + "step": 727 + }, + { + "epoch": 0.6648401826484018, + "grad_norm": 45.22749710083008, + "learning_rate": 6.639269406392694e-06, + "loss": 1.2405, + "step": 728 + }, + { + "epoch": 0.6657534246575343, + "grad_norm": 77.43939971923828, + "learning_rate": 6.6484018264840186e-06, + "loss": 3.124, + "step": 729 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 53.92664337158203, + "learning_rate": 6.657534246575343e-06, + "loss": 1.3655, + "step": 730 + }, + { + "epoch": 0.6675799086757991, + "grad_norm": 15.419285774230957, + "learning_rate": 6.666666666666667e-06, + "loss": 0.2719, + "step": 731 + }, + { + "epoch": 0.6684931506849315, + "grad_norm": 39.864418029785156, + "learning_rate": 6.675799086757991e-06, + "loss": 0.7615, + "step": 732 + }, + { + "epoch": 0.6694063926940639, + "grad_norm": 47.259342193603516, + "learning_rate": 6.684931506849316e-06, + "loss": 1.0764, + "step": 733 + }, + { + "epoch": 0.6703196347031963, + "grad_norm": 36.133056640625, + "learning_rate": 6.694063926940639e-06, + "loss": 0.6828, + "step": 734 + }, + { + "epoch": 0.6712328767123288, + "grad_norm": 15.736336708068848, + "learning_rate": 6.703196347031964e-06, + "loss": 0.2648, + "step": 735 + }, + { + "epoch": 0.6721461187214612, + "grad_norm": 50.85316848754883, + "learning_rate": 6.712328767123288e-06, + "loss": 1.9092, + "step": 736 + }, + { + "epoch": 0.6730593607305936, + "grad_norm": 49.499759674072266, + "learning_rate": 6.721461187214613e-06, + "loss": 0.5484, + "step": 737 + }, + { + "epoch": 0.673972602739726, + "grad_norm": 21.92116355895996, + "learning_rate": 6.730593607305936e-06, + "loss": 0.3469, + "step": 738 + }, + { + "epoch": 0.6748858447488585, + "grad_norm": 9.293303489685059, + "learning_rate": 6.739726027397261e-06, + "loss": 0.1403, + "step": 739 + }, + { + "epoch": 0.6757990867579908, + "grad_norm": 75.51750183105469, + "learning_rate": 6.748858447488585e-06, + "loss": 1.8663, + "step": 740 + }, + { + "epoch": 0.6767123287671233, + "grad_norm": 15.18571949005127, + "learning_rate": 6.757990867579909e-06, + "loss": 0.2555, + "step": 741 + }, + { + "epoch": 0.6776255707762557, + "grad_norm": 67.99600982666016, + "learning_rate": 6.767123287671233e-06, + "loss": 1.8033, + "step": 742 + }, + { + "epoch": 0.6785388127853881, + "grad_norm": 26.506702423095703, + "learning_rate": 6.776255707762558e-06, + "loss": 0.4841, + "step": 743 + }, + { + "epoch": 0.6794520547945205, + "grad_norm": 89.83123779296875, + "learning_rate": 6.785388127853881e-06, + "loss": 2.1783, + "step": 744 + }, + { + "epoch": 0.680365296803653, + "grad_norm": 28.967992782592773, + "learning_rate": 6.794520547945206e-06, + "loss": 0.603, + "step": 745 + }, + { + "epoch": 0.6812785388127854, + "grad_norm": 33.974918365478516, + "learning_rate": 6.80365296803653e-06, + "loss": 0.3755, + "step": 746 + }, + { + "epoch": 0.6821917808219178, + "grad_norm": 66.175048828125, + "learning_rate": 6.812785388127855e-06, + "loss": 1.9213, + "step": 747 + }, + { + "epoch": 0.6831050228310502, + "grad_norm": 8.28872299194336, + "learning_rate": 6.821917808219178e-06, + "loss": 0.1043, + "step": 748 + }, + { + "epoch": 0.6840182648401827, + "grad_norm": 52.5512809753418, + "learning_rate": 6.831050228310503e-06, + "loss": 1.6793, + "step": 749 + }, + { + "epoch": 0.684931506849315, + "grad_norm": 27.39432144165039, + "learning_rate": 6.840182648401827e-06, + "loss": 0.4102, + "step": 750 + }, + { + "epoch": 0.6858447488584475, + "grad_norm": 26.348758697509766, + "learning_rate": 6.849315068493151e-06, + "loss": 0.4781, + "step": 751 + }, + { + "epoch": 0.6867579908675799, + "grad_norm": 1.2963130474090576, + "learning_rate": 6.858447488584475e-06, + "loss": 0.0143, + "step": 752 + }, + { + "epoch": 0.6876712328767123, + "grad_norm": 68.62899780273438, + "learning_rate": 6.8675799086758e-06, + "loss": 2.3914, + "step": 753 + }, + { + "epoch": 0.6885844748858447, + "grad_norm": 46.22692108154297, + "learning_rate": 6.876712328767123e-06, + "loss": 1.1883, + "step": 754 + }, + { + "epoch": 0.6894977168949772, + "grad_norm": 29.458988189697266, + "learning_rate": 6.885844748858448e-06, + "loss": 0.4662, + "step": 755 + }, + { + "epoch": 0.6904109589041096, + "grad_norm": 30.232351303100586, + "learning_rate": 6.894977168949772e-06, + "loss": 0.5356, + "step": 756 + }, + { + "epoch": 0.691324200913242, + "grad_norm": 13.450727462768555, + "learning_rate": 6.904109589041097e-06, + "loss": 0.2569, + "step": 757 + }, + { + "epoch": 0.6922374429223744, + "grad_norm": 35.26477813720703, + "learning_rate": 6.9132420091324204e-06, + "loss": 1.4782, + "step": 758 + }, + { + "epoch": 0.6931506849315069, + "grad_norm": 3.49106502532959, + "learning_rate": 6.922374429223745e-06, + "loss": 0.0514, + "step": 759 + }, + { + "epoch": 0.6940639269406392, + "grad_norm": 39.787689208984375, + "learning_rate": 6.931506849315069e-06, + "loss": 0.7768, + "step": 760 + }, + { + "epoch": 0.6949771689497717, + "grad_norm": 56.49861145019531, + "learning_rate": 6.940639269406393e-06, + "loss": 1.7614, + "step": 761 + }, + { + "epoch": 0.6958904109589041, + "grad_norm": 43.74366760253906, + "learning_rate": 6.9497716894977175e-06, + "loss": 0.5504, + "step": 762 + }, + { + "epoch": 0.6968036529680365, + "grad_norm": 63.43455123901367, + "learning_rate": 6.958904109589042e-06, + "loss": 2.2604, + "step": 763 + }, + { + "epoch": 0.6977168949771689, + "grad_norm": 30.571231842041016, + "learning_rate": 6.9680365296803655e-06, + "loss": 0.5208, + "step": 764 + }, + { + "epoch": 0.6986301369863014, + "grad_norm": 3.311459541320801, + "learning_rate": 6.97716894977169e-06, + "loss": 0.0431, + "step": 765 + }, + { + "epoch": 0.6995433789954338, + "grad_norm": 26.849485397338867, + "learning_rate": 6.9863013698630145e-06, + "loss": 0.3184, + "step": 766 + }, + { + "epoch": 0.7004566210045662, + "grad_norm": 17.830631256103516, + "learning_rate": 6.995433789954339e-06, + "loss": 0.2396, + "step": 767 + }, + { + "epoch": 0.7013698630136986, + "grad_norm": 66.7369155883789, + "learning_rate": 7.0045662100456626e-06, + "loss": 2.0643, + "step": 768 + }, + { + "epoch": 0.7022831050228311, + "grad_norm": 33.98820495605469, + "learning_rate": 7.013698630136987e-06, + "loss": 0.8943, + "step": 769 + }, + { + "epoch": 0.7031963470319634, + "grad_norm": 49.114036560058594, + "learning_rate": 7.0228310502283115e-06, + "loss": 1.4868, + "step": 770 + }, + { + "epoch": 0.7041095890410959, + "grad_norm": 20.08656883239746, + "learning_rate": 7.031963470319635e-06, + "loss": 0.3673, + "step": 771 + }, + { + "epoch": 0.7050228310502283, + "grad_norm": 34.71658706665039, + "learning_rate": 7.0410958904109596e-06, + "loss": 0.5329, + "step": 772 + }, + { + "epoch": 0.7059360730593607, + "grad_norm": 33.27152633666992, + "learning_rate": 7.050228310502284e-06, + "loss": 0.8093, + "step": 773 + }, + { + "epoch": 0.7068493150684931, + "grad_norm": 11.89699649810791, + "learning_rate": 7.059360730593608e-06, + "loss": 0.1513, + "step": 774 + }, + { + "epoch": 0.7077625570776256, + "grad_norm": 34.92823791503906, + "learning_rate": 7.068493150684932e-06, + "loss": 1.0036, + "step": 775 + }, + { + "epoch": 0.708675799086758, + "grad_norm": 16.65701675415039, + "learning_rate": 7.077625570776257e-06, + "loss": 0.2449, + "step": 776 + }, + { + "epoch": 0.7095890410958904, + "grad_norm": 45.738243103027344, + "learning_rate": 7.086757990867581e-06, + "loss": 2.0959, + "step": 777 + }, + { + "epoch": 0.7105022831050228, + "grad_norm": 37.01980972290039, + "learning_rate": 7.095890410958905e-06, + "loss": 0.3187, + "step": 778 + }, + { + "epoch": 0.7114155251141553, + "grad_norm": 8.49087142944336, + "learning_rate": 7.105022831050229e-06, + "loss": 0.0912, + "step": 779 + }, + { + "epoch": 0.7123287671232876, + "grad_norm": 17.043176651000977, + "learning_rate": 7.114155251141554e-06, + "loss": 0.2666, + "step": 780 + }, + { + "epoch": 0.7132420091324201, + "grad_norm": 2.672597885131836, + "learning_rate": 7.123287671232877e-06, + "loss": 0.033, + "step": 781 + }, + { + "epoch": 0.7141552511415525, + "grad_norm": 51.582271575927734, + "learning_rate": 7.132420091324202e-06, + "loss": 0.9804, + "step": 782 + }, + { + "epoch": 0.7150684931506849, + "grad_norm": 56.95985412597656, + "learning_rate": 7.141552511415526e-06, + "loss": 0.8468, + "step": 783 + }, + { + "epoch": 0.7159817351598173, + "grad_norm": 51.92981719970703, + "learning_rate": 7.15068493150685e-06, + "loss": 1.6778, + "step": 784 + }, + { + "epoch": 0.7168949771689498, + "grad_norm": 66.26959991455078, + "learning_rate": 7.159817351598174e-06, + "loss": 0.9215, + "step": 785 + }, + { + "epoch": 0.7178082191780822, + "grad_norm": 23.6391658782959, + "learning_rate": 7.168949771689499e-06, + "loss": 0.484, + "step": 786 + }, + { + "epoch": 0.7187214611872146, + "grad_norm": 32.235103607177734, + "learning_rate": 7.178082191780823e-06, + "loss": 0.5127, + "step": 787 + }, + { + "epoch": 0.719634703196347, + "grad_norm": 9.647183418273926, + "learning_rate": 7.187214611872147e-06, + "loss": 0.1365, + "step": 788 + }, + { + "epoch": 0.7205479452054795, + "grad_norm": 51.98509216308594, + "learning_rate": 7.196347031963471e-06, + "loss": 2.5533, + "step": 789 + }, + { + "epoch": 0.7214611872146118, + "grad_norm": 35.9306526184082, + "learning_rate": 7.205479452054796e-06, + "loss": 0.9911, + "step": 790 + }, + { + "epoch": 0.7223744292237443, + "grad_norm": 20.171262741088867, + "learning_rate": 7.214611872146119e-06, + "loss": 0.3002, + "step": 791 + }, + { + "epoch": 0.7232876712328767, + "grad_norm": 30.217788696289062, + "learning_rate": 7.223744292237444e-06, + "loss": 0.4279, + "step": 792 + }, + { + "epoch": 0.7242009132420091, + "grad_norm": 20.92295265197754, + "learning_rate": 7.232876712328768e-06, + "loss": 0.3798, + "step": 793 + }, + { + "epoch": 0.7251141552511415, + "grad_norm": 10.708449363708496, + "learning_rate": 7.242009132420091e-06, + "loss": 0.1148, + "step": 794 + }, + { + "epoch": 0.726027397260274, + "grad_norm": 32.791561126708984, + "learning_rate": 7.251141552511416e-06, + "loss": 0.7247, + "step": 795 + }, + { + "epoch": 0.7269406392694064, + "grad_norm": 20.328840255737305, + "learning_rate": 7.260273972602741e-06, + "loss": 0.5448, + "step": 796 + }, + { + "epoch": 0.7278538812785388, + "grad_norm": 124.81167602539062, + "learning_rate": 7.269406392694065e-06, + "loss": 2.9038, + "step": 797 + }, + { + "epoch": 0.7287671232876712, + "grad_norm": 38.644676208496094, + "learning_rate": 7.278538812785388e-06, + "loss": 1.5484, + "step": 798 + }, + { + "epoch": 0.7296803652968037, + "grad_norm": 22.814355850219727, + "learning_rate": 7.287671232876713e-06, + "loss": 0.4071, + "step": 799 + }, + { + "epoch": 0.730593607305936, + "grad_norm": 32.645896911621094, + "learning_rate": 7.296803652968038e-06, + "loss": 0.5644, + "step": 800 + }, + { + "epoch": 0.7315068493150685, + "grad_norm": 35.33977508544922, + "learning_rate": 7.305936073059361e-06, + "loss": 0.7897, + "step": 801 + }, + { + "epoch": 0.7324200913242009, + "grad_norm": 4.871058940887451, + "learning_rate": 7.315068493150685e-06, + "loss": 0.0687, + "step": 802 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 28.500709533691406, + "learning_rate": 7.32420091324201e-06, + "loss": 0.7204, + "step": 803 + }, + { + "epoch": 0.7342465753424657, + "grad_norm": 38.53377151489258, + "learning_rate": 7.333333333333333e-06, + "loss": 0.814, + "step": 804 + }, + { + "epoch": 0.7351598173515982, + "grad_norm": 9.233534812927246, + "learning_rate": 7.342465753424658e-06, + "loss": 0.1681, + "step": 805 + }, + { + "epoch": 0.7360730593607306, + "grad_norm": 36.417991638183594, + "learning_rate": 7.351598173515982e-06, + "loss": 1.0676, + "step": 806 + }, + { + "epoch": 0.736986301369863, + "grad_norm": 36.96110534667969, + "learning_rate": 7.360730593607307e-06, + "loss": 0.6873, + "step": 807 + }, + { + "epoch": 0.7378995433789954, + "grad_norm": 80.41133117675781, + "learning_rate": 7.36986301369863e-06, + "loss": 1.8354, + "step": 808 + }, + { + "epoch": 0.7388127853881279, + "grad_norm": 10.89206600189209, + "learning_rate": 7.378995433789955e-06, + "loss": 0.1455, + "step": 809 + }, + { + "epoch": 0.7397260273972602, + "grad_norm": 26.012718200683594, + "learning_rate": 7.388127853881279e-06, + "loss": 0.5329, + "step": 810 + }, + { + "epoch": 0.7406392694063927, + "grad_norm": 13.667774200439453, + "learning_rate": 7.397260273972603e-06, + "loss": 0.2969, + "step": 811 + }, + { + "epoch": 0.7415525114155251, + "grad_norm": 29.657264709472656, + "learning_rate": 7.406392694063927e-06, + "loss": 0.5556, + "step": 812 + }, + { + "epoch": 0.7424657534246575, + "grad_norm": 26.82705307006836, + "learning_rate": 7.415525114155252e-06, + "loss": 0.6503, + "step": 813 + }, + { + "epoch": 0.7433789954337899, + "grad_norm": 22.423809051513672, + "learning_rate": 7.424657534246575e-06, + "loss": 0.386, + "step": 814 + }, + { + "epoch": 0.7442922374429224, + "grad_norm": 37.2216682434082, + "learning_rate": 7.4337899543379e-06, + "loss": 0.7262, + "step": 815 + }, + { + "epoch": 0.7452054794520548, + "grad_norm": 25.733165740966797, + "learning_rate": 7.442922374429224e-06, + "loss": 0.5711, + "step": 816 + }, + { + "epoch": 0.7461187214611872, + "grad_norm": 28.319114685058594, + "learning_rate": 7.452054794520549e-06, + "loss": 0.2129, + "step": 817 + }, + { + "epoch": 0.7470319634703196, + "grad_norm": 9.261975288391113, + "learning_rate": 7.461187214611872e-06, + "loss": 0.1519, + "step": 818 + }, + { + "epoch": 0.7479452054794521, + "grad_norm": 55.14284896850586, + "learning_rate": 7.470319634703197e-06, + "loss": 2.1618, + "step": 819 + }, + { + "epoch": 0.7488584474885844, + "grad_norm": 21.096975326538086, + "learning_rate": 7.479452054794521e-06, + "loss": 0.55, + "step": 820 + }, + { + "epoch": 0.7497716894977169, + "grad_norm": 13.69158935546875, + "learning_rate": 7.488584474885845e-06, + "loss": 0.1708, + "step": 821 + }, + { + "epoch": 0.7506849315068493, + "grad_norm": 17.21070671081543, + "learning_rate": 7.497716894977169e-06, + "loss": 0.2352, + "step": 822 + }, + { + "epoch": 0.7515981735159817, + "grad_norm": 38.88177490234375, + "learning_rate": 7.506849315068494e-06, + "loss": 0.5725, + "step": 823 + }, + { + "epoch": 0.7525114155251141, + "grad_norm": 30.292089462280273, + "learning_rate": 7.515981735159817e-06, + "loss": 0.7252, + "step": 824 + }, + { + "epoch": 0.7534246575342466, + "grad_norm": 51.8858528137207, + "learning_rate": 7.525114155251142e-06, + "loss": 2.3166, + "step": 825 + }, + { + "epoch": 0.754337899543379, + "grad_norm": 33.150238037109375, + "learning_rate": 7.534246575342466e-06, + "loss": 0.409, + "step": 826 + }, + { + "epoch": 0.7552511415525114, + "grad_norm": 41.43996047973633, + "learning_rate": 7.543378995433791e-06, + "loss": 0.9229, + "step": 827 + }, + { + "epoch": 0.7561643835616438, + "grad_norm": 11.126212120056152, + "learning_rate": 7.552511415525114e-06, + "loss": 0.1761, + "step": 828 + }, + { + "epoch": 0.7570776255707763, + "grad_norm": 4.169756889343262, + "learning_rate": 7.561643835616439e-06, + "loss": 0.0669, + "step": 829 + }, + { + "epoch": 0.7579908675799086, + "grad_norm": 51.34815216064453, + "learning_rate": 7.570776255707763e-06, + "loss": 1.5751, + "step": 830 + }, + { + "epoch": 0.7589041095890411, + "grad_norm": 6.545751571655273, + "learning_rate": 7.579908675799087e-06, + "loss": 0.1033, + "step": 831 + }, + { + "epoch": 0.7598173515981735, + "grad_norm": 2.9893863201141357, + "learning_rate": 7.589041095890411e-06, + "loss": 0.0343, + "step": 832 + }, + { + "epoch": 0.7607305936073059, + "grad_norm": 47.044498443603516, + "learning_rate": 7.598173515981736e-06, + "loss": 1.2367, + "step": 833 + }, + { + "epoch": 0.7616438356164383, + "grad_norm": 22.53572654724121, + "learning_rate": 7.6073059360730595e-06, + "loss": 0.3005, + "step": 834 + }, + { + "epoch": 0.7625570776255708, + "grad_norm": 26.115236282348633, + "learning_rate": 7.616438356164384e-06, + "loss": 0.53, + "step": 835 + }, + { + "epoch": 0.7634703196347032, + "grad_norm": 17.79840660095215, + "learning_rate": 7.625570776255708e-06, + "loss": 0.4465, + "step": 836 + }, + { + "epoch": 0.7643835616438356, + "grad_norm": 71.9837646484375, + "learning_rate": 7.634703196347033e-06, + "loss": 1.4733, + "step": 837 + }, + { + "epoch": 0.765296803652968, + "grad_norm": 7.2975993156433105, + "learning_rate": 7.643835616438356e-06, + "loss": 0.1093, + "step": 838 + }, + { + "epoch": 0.7662100456621005, + "grad_norm": 38.874324798583984, + "learning_rate": 7.652968036529682e-06, + "loss": 0.6017, + "step": 839 + }, + { + "epoch": 0.7671232876712328, + "grad_norm": 17.191516876220703, + "learning_rate": 7.662100456621005e-06, + "loss": 0.2759, + "step": 840 + }, + { + "epoch": 0.7680365296803653, + "grad_norm": 50.794612884521484, + "learning_rate": 7.671232876712329e-06, + "loss": 1.028, + "step": 841 + }, + { + "epoch": 0.7689497716894977, + "grad_norm": 2.6283376216888428, + "learning_rate": 7.680365296803653e-06, + "loss": 0.0279, + "step": 842 + }, + { + "epoch": 0.7698630136986301, + "grad_norm": 30.493778228759766, + "learning_rate": 7.689497716894978e-06, + "loss": 0.4608, + "step": 843 + }, + { + "epoch": 0.7707762557077625, + "grad_norm": 74.42973327636719, + "learning_rate": 7.698630136986302e-06, + "loss": 4.1302, + "step": 844 + }, + { + "epoch": 0.771689497716895, + "grad_norm": 39.609046936035156, + "learning_rate": 7.707762557077625e-06, + "loss": 0.6254, + "step": 845 + }, + { + "epoch": 0.7726027397260274, + "grad_norm": 3.347531318664551, + "learning_rate": 7.71689497716895e-06, + "loss": 0.0451, + "step": 846 + }, + { + "epoch": 0.7735159817351598, + "grad_norm": 42.03330993652344, + "learning_rate": 7.726027397260276e-06, + "loss": 0.7012, + "step": 847 + }, + { + "epoch": 0.7744292237442922, + "grad_norm": 71.4776382446289, + "learning_rate": 7.735159817351598e-06, + "loss": 2.5083, + "step": 848 + }, + { + "epoch": 0.7753424657534247, + "grad_norm": 21.252649307250977, + "learning_rate": 7.744292237442923e-06, + "loss": 0.2319, + "step": 849 + }, + { + "epoch": 0.776255707762557, + "grad_norm": 53.81922912597656, + "learning_rate": 7.753424657534248e-06, + "loss": 3.5005, + "step": 850 + }, + { + "epoch": 0.7771689497716895, + "grad_norm": 37.77101516723633, + "learning_rate": 7.76255707762557e-06, + "loss": 0.7439, + "step": 851 + }, + { + "epoch": 0.7780821917808219, + "grad_norm": 39.7142333984375, + "learning_rate": 7.771689497716896e-06, + "loss": 1.2041, + "step": 852 + }, + { + "epoch": 0.7789954337899543, + "grad_norm": 42.07859802246094, + "learning_rate": 7.78082191780822e-06, + "loss": 0.7909, + "step": 853 + }, + { + "epoch": 0.7799086757990867, + "grad_norm": 44.454795837402344, + "learning_rate": 7.789954337899543e-06, + "loss": 1.5772, + "step": 854 + }, + { + "epoch": 0.7808219178082192, + "grad_norm": 13.260149002075195, + "learning_rate": 7.799086757990868e-06, + "loss": 0.2257, + "step": 855 + }, + { + "epoch": 0.7817351598173516, + "grad_norm": 9.82217025756836, + "learning_rate": 7.808219178082192e-06, + "loss": 0.1531, + "step": 856 + }, + { + "epoch": 0.782648401826484, + "grad_norm": 7.195595741271973, + "learning_rate": 7.817351598173517e-06, + "loss": 0.0753, + "step": 857 + }, + { + "epoch": 0.7835616438356164, + "grad_norm": 56.82462692260742, + "learning_rate": 7.82648401826484e-06, + "loss": 1.4337, + "step": 858 + }, + { + "epoch": 0.7844748858447489, + "grad_norm": 22.68279266357422, + "learning_rate": 7.835616438356164e-06, + "loss": 0.2758, + "step": 859 + }, + { + "epoch": 0.7853881278538812, + "grad_norm": 24.87026596069336, + "learning_rate": 7.84474885844749e-06, + "loss": 0.2643, + "step": 860 + }, + { + "epoch": 0.7863013698630137, + "grad_norm": 12.982011795043945, + "learning_rate": 7.853881278538813e-06, + "loss": 0.1995, + "step": 861 + }, + { + "epoch": 0.7872146118721461, + "grad_norm": 17.326045989990234, + "learning_rate": 7.863013698630137e-06, + "loss": 0.2519, + "step": 862 + }, + { + "epoch": 0.7881278538812785, + "grad_norm": 15.124853134155273, + "learning_rate": 7.872146118721462e-06, + "loss": 0.3451, + "step": 863 + }, + { + "epoch": 0.7890410958904109, + "grad_norm": 76.3216781616211, + "learning_rate": 7.881278538812786e-06, + "loss": 0.2113, + "step": 864 + }, + { + "epoch": 0.7899543378995434, + "grad_norm": 50.23212814331055, + "learning_rate": 7.89041095890411e-06, + "loss": 1.265, + "step": 865 + }, + { + "epoch": 0.7908675799086758, + "grad_norm": 37.972747802734375, + "learning_rate": 7.899543378995435e-06, + "loss": 1.3894, + "step": 866 + }, + { + "epoch": 0.7917808219178082, + "grad_norm": 27.920305252075195, + "learning_rate": 7.908675799086758e-06, + "loss": 0.505, + "step": 867 + }, + { + "epoch": 0.7926940639269406, + "grad_norm": 42.410919189453125, + "learning_rate": 7.917808219178082e-06, + "loss": 0.9785, + "step": 868 + }, + { + "epoch": 0.7936073059360731, + "grad_norm": 83.62518310546875, + "learning_rate": 7.926940639269407e-06, + "loss": 3.7085, + "step": 869 + }, + { + "epoch": 0.7945205479452054, + "grad_norm": 18.797771453857422, + "learning_rate": 7.936073059360731e-06, + "loss": 0.1843, + "step": 870 + }, + { + "epoch": 0.7954337899543379, + "grad_norm": 28.13274574279785, + "learning_rate": 7.945205479452055e-06, + "loss": 0.7661, + "step": 871 + }, + { + "epoch": 0.7963470319634703, + "grad_norm": 19.194190979003906, + "learning_rate": 7.95433789954338e-06, + "loss": 0.2847, + "step": 872 + }, + { + "epoch": 0.7972602739726027, + "grad_norm": 2.516575336456299, + "learning_rate": 7.963470319634703e-06, + "loss": 0.0282, + "step": 873 + }, + { + "epoch": 0.7981735159817351, + "grad_norm": 37.358760833740234, + "learning_rate": 7.972602739726027e-06, + "loss": 1.3009, + "step": 874 + }, + { + "epoch": 0.7990867579908676, + "grad_norm": 25.282529830932617, + "learning_rate": 7.981735159817352e-06, + "loss": 0.3575, + "step": 875 + }, + { + "epoch": 0.8, + "grad_norm": 39.364837646484375, + "learning_rate": 7.990867579908676e-06, + "loss": 0.8882, + "step": 876 + }, + { + "epoch": 0.8009132420091324, + "grad_norm": 48.18341064453125, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2012, + "step": 877 + }, + { + "epoch": 0.8018264840182648, + "grad_norm": 10.146980285644531, + "learning_rate": 8.009132420091325e-06, + "loss": 0.0876, + "step": 878 + }, + { + "epoch": 0.8027397260273973, + "grad_norm": 11.957255363464355, + "learning_rate": 8.018264840182649e-06, + "loss": 0.1822, + "step": 879 + }, + { + "epoch": 0.8036529680365296, + "grad_norm": 22.307085037231445, + "learning_rate": 8.027397260273974e-06, + "loss": 0.5118, + "step": 880 + }, + { + "epoch": 0.8045662100456621, + "grad_norm": 50.46153259277344, + "learning_rate": 8.036529680365297e-06, + "loss": 1.1337, + "step": 881 + }, + { + "epoch": 0.8054794520547945, + "grad_norm": 20.463123321533203, + "learning_rate": 8.045662100456621e-06, + "loss": 0.4868, + "step": 882 + }, + { + "epoch": 0.806392694063927, + "grad_norm": 35.3681526184082, + "learning_rate": 8.054794520547946e-06, + "loss": 0.74, + "step": 883 + }, + { + "epoch": 0.8073059360730593, + "grad_norm": 10.796250343322754, + "learning_rate": 8.06392694063927e-06, + "loss": 0.1583, + "step": 884 + }, + { + "epoch": 0.8082191780821918, + "grad_norm": 26.035152435302734, + "learning_rate": 8.073059360730594e-06, + "loss": 0.449, + "step": 885 + }, + { + "epoch": 0.8091324200913242, + "grad_norm": 44.30646514892578, + "learning_rate": 8.082191780821919e-06, + "loss": 1.2596, + "step": 886 + }, + { + "epoch": 0.8100456621004566, + "grad_norm": 29.374229431152344, + "learning_rate": 8.091324200913243e-06, + "loss": 0.8028, + "step": 887 + }, + { + "epoch": 0.810958904109589, + "grad_norm": 38.7402229309082, + "learning_rate": 8.100456621004566e-06, + "loss": 0.697, + "step": 888 + }, + { + "epoch": 0.8118721461187215, + "grad_norm": 56.96029281616211, + "learning_rate": 8.109589041095892e-06, + "loss": 2.2649, + "step": 889 + }, + { + "epoch": 0.8127853881278538, + "grad_norm": 65.90884399414062, + "learning_rate": 8.118721461187215e-06, + "loss": 0.8024, + "step": 890 + }, + { + "epoch": 0.8136986301369863, + "grad_norm": 41.85341262817383, + "learning_rate": 8.127853881278539e-06, + "loss": 0.7197, + "step": 891 + }, + { + "epoch": 0.8146118721461187, + "grad_norm": 66.38700103759766, + "learning_rate": 8.136986301369864e-06, + "loss": 3.3021, + "step": 892 + }, + { + "epoch": 0.8155251141552512, + "grad_norm": 10.04651927947998, + "learning_rate": 8.146118721461188e-06, + "loss": 0.1129, + "step": 893 + }, + { + "epoch": 0.8164383561643835, + "grad_norm": 40.937904357910156, + "learning_rate": 8.155251141552513e-06, + "loss": 1.3544, + "step": 894 + }, + { + "epoch": 0.817351598173516, + "grad_norm": 46.061588287353516, + "learning_rate": 8.164383561643837e-06, + "loss": 0.6209, + "step": 895 + }, + { + "epoch": 0.8182648401826484, + "grad_norm": 11.807045936584473, + "learning_rate": 8.17351598173516e-06, + "loss": 0.1967, + "step": 896 + }, + { + "epoch": 0.8191780821917808, + "grad_norm": 41.97748947143555, + "learning_rate": 8.182648401826486e-06, + "loss": 1.0265, + "step": 897 + }, + { + "epoch": 0.8200913242009132, + "grad_norm": 30.67413902282715, + "learning_rate": 8.19178082191781e-06, + "loss": 0.503, + "step": 898 + }, + { + "epoch": 0.8210045662100457, + "grad_norm": 48.82606887817383, + "learning_rate": 8.200913242009133e-06, + "loss": 1.8049, + "step": 899 + }, + { + "epoch": 0.821917808219178, + "grad_norm": 71.51165771484375, + "learning_rate": 8.210045662100458e-06, + "loss": 1.8265, + "step": 900 + }, + { + "epoch": 0.8228310502283105, + "grad_norm": 59.294498443603516, + "learning_rate": 8.219178082191782e-06, + "loss": 2.1649, + "step": 901 + }, + { + "epoch": 0.8237442922374429, + "grad_norm": 41.737178802490234, + "learning_rate": 8.228310502283105e-06, + "loss": 1.2402, + "step": 902 + }, + { + "epoch": 0.8246575342465754, + "grad_norm": 19.350492477416992, + "learning_rate": 8.23744292237443e-06, + "loss": 0.3421, + "step": 903 + }, + { + "epoch": 0.8255707762557077, + "grad_norm": 27.025562286376953, + "learning_rate": 8.246575342465754e-06, + "loss": 0.2417, + "step": 904 + }, + { + "epoch": 0.8264840182648402, + "grad_norm": 1.9074053764343262, + "learning_rate": 8.255707762557078e-06, + "loss": 0.0191, + "step": 905 + }, + { + "epoch": 0.8273972602739726, + "grad_norm": 95.95440673828125, + "learning_rate": 8.264840182648403e-06, + "loss": 2.3231, + "step": 906 + }, + { + "epoch": 0.828310502283105, + "grad_norm": 43.37923812866211, + "learning_rate": 8.273972602739727e-06, + "loss": 1.0924, + "step": 907 + }, + { + "epoch": 0.8292237442922374, + "grad_norm": 14.770596504211426, + "learning_rate": 8.28310502283105e-06, + "loss": 0.2024, + "step": 908 + }, + { + "epoch": 0.8301369863013699, + "grad_norm": 35.26417541503906, + "learning_rate": 8.292237442922376e-06, + "loss": 0.8951, + "step": 909 + }, + { + "epoch": 0.8310502283105022, + "grad_norm": 15.32934284210205, + "learning_rate": 8.3013698630137e-06, + "loss": 0.2395, + "step": 910 + }, + { + "epoch": 0.8319634703196347, + "grad_norm": 40.761940002441406, + "learning_rate": 8.310502283105023e-06, + "loss": 1.346, + "step": 911 + }, + { + "epoch": 0.8328767123287671, + "grad_norm": 8.898496627807617, + "learning_rate": 8.319634703196348e-06, + "loss": 0.1162, + "step": 912 + }, + { + "epoch": 0.8337899543378996, + "grad_norm": 22.878738403320312, + "learning_rate": 8.328767123287672e-06, + "loss": 0.3989, + "step": 913 + }, + { + "epoch": 0.8347031963470319, + "grad_norm": 48.128787994384766, + "learning_rate": 8.337899543378997e-06, + "loss": 1.2748, + "step": 914 + }, + { + "epoch": 0.8356164383561644, + "grad_norm": 4.370965480804443, + "learning_rate": 8.347031963470321e-06, + "loss": 0.061, + "step": 915 + }, + { + "epoch": 0.8365296803652968, + "grad_norm": 46.05537796020508, + "learning_rate": 8.356164383561644e-06, + "loss": 0.6892, + "step": 916 + }, + { + "epoch": 0.8374429223744292, + "grad_norm": 22.959396362304688, + "learning_rate": 8.36529680365297e-06, + "loss": 0.3704, + "step": 917 + }, + { + "epoch": 0.8383561643835616, + "grad_norm": 30.282541275024414, + "learning_rate": 8.374429223744293e-06, + "loss": 0.489, + "step": 918 + }, + { + "epoch": 0.8392694063926941, + "grad_norm": 64.87556457519531, + "learning_rate": 8.383561643835617e-06, + "loss": 1.8517, + "step": 919 + }, + { + "epoch": 0.8401826484018264, + "grad_norm": 46.57028579711914, + "learning_rate": 8.392694063926942e-06, + "loss": 1.4006, + "step": 920 + }, + { + "epoch": 0.8410958904109589, + "grad_norm": 25.618141174316406, + "learning_rate": 8.401826484018264e-06, + "loss": 0.375, + "step": 921 + }, + { + "epoch": 0.8420091324200913, + "grad_norm": 12.060128211975098, + "learning_rate": 8.41095890410959e-06, + "loss": 0.1576, + "step": 922 + }, + { + "epoch": 0.8429223744292238, + "grad_norm": 57.96827697753906, + "learning_rate": 8.420091324200915e-06, + "loss": 1.5422, + "step": 923 + }, + { + "epoch": 0.8438356164383561, + "grad_norm": 42.5572509765625, + "learning_rate": 8.429223744292239e-06, + "loss": 1.2643, + "step": 924 + }, + { + "epoch": 0.8447488584474886, + "grad_norm": 40.76325225830078, + "learning_rate": 8.438356164383562e-06, + "loss": 1.1132, + "step": 925 + }, + { + "epoch": 0.845662100456621, + "grad_norm": 9.233156204223633, + "learning_rate": 8.447488584474887e-06, + "loss": 0.1158, + "step": 926 + }, + { + "epoch": 0.8465753424657534, + "grad_norm": 12.497429847717285, + "learning_rate": 8.456621004566211e-06, + "loss": 0.2032, + "step": 927 + }, + { + "epoch": 0.8474885844748858, + "grad_norm": 20.88695526123047, + "learning_rate": 8.465753424657535e-06, + "loss": 0.5276, + "step": 928 + }, + { + "epoch": 0.8484018264840183, + "grad_norm": 31.950300216674805, + "learning_rate": 8.474885844748858e-06, + "loss": 1.2378, + "step": 929 + }, + { + "epoch": 0.8493150684931506, + "grad_norm": 0.5463946461677551, + "learning_rate": 8.484018264840184e-06, + "loss": 0.0077, + "step": 930 + }, + { + "epoch": 0.8502283105022831, + "grad_norm": 16.138765335083008, + "learning_rate": 8.493150684931507e-06, + "loss": 0.2426, + "step": 931 + }, + { + "epoch": 0.8511415525114155, + "grad_norm": 14.548689842224121, + "learning_rate": 8.50228310502283e-06, + "loss": 0.2095, + "step": 932 + }, + { + "epoch": 0.852054794520548, + "grad_norm": 14.626317024230957, + "learning_rate": 8.511415525114156e-06, + "loss": 0.1797, + "step": 933 + }, + { + "epoch": 0.8529680365296803, + "grad_norm": 7.32077169418335, + "learning_rate": 8.520547945205481e-06, + "loss": 0.0978, + "step": 934 + }, + { + "epoch": 0.8538812785388128, + "grad_norm": 3.743565320968628, + "learning_rate": 8.529680365296803e-06, + "loss": 0.0381, + "step": 935 + }, + { + "epoch": 0.8547945205479452, + "grad_norm": 36.98033905029297, + "learning_rate": 8.538812785388129e-06, + "loss": 0.8671, + "step": 936 + }, + { + "epoch": 0.8557077625570776, + "grad_norm": 66.20729064941406, + "learning_rate": 8.547945205479454e-06, + "loss": 4.0677, + "step": 937 + }, + { + "epoch": 0.85662100456621, + "grad_norm": 58.56326675415039, + "learning_rate": 8.557077625570776e-06, + "loss": 1.3031, + "step": 938 + }, + { + "epoch": 0.8575342465753425, + "grad_norm": 260.5022888183594, + "learning_rate": 8.566210045662101e-06, + "loss": 1.5163, + "step": 939 + }, + { + "epoch": 0.8584474885844748, + "grad_norm": 43.913204193115234, + "learning_rate": 8.575342465753425e-06, + "loss": 0.6801, + "step": 940 + }, + { + "epoch": 0.8593607305936073, + "grad_norm": 41.27130889892578, + "learning_rate": 8.584474885844748e-06, + "loss": 0.6593, + "step": 941 + }, + { + "epoch": 0.8602739726027397, + "grad_norm": 21.91136360168457, + "learning_rate": 8.593607305936074e-06, + "loss": 0.3794, + "step": 942 + }, + { + "epoch": 0.8611872146118722, + "grad_norm": 58.05061340332031, + "learning_rate": 8.602739726027397e-06, + "loss": 2.0651, + "step": 943 + }, + { + "epoch": 0.8621004566210045, + "grad_norm": 111.88894653320312, + "learning_rate": 8.611872146118723e-06, + "loss": 0.5552, + "step": 944 + }, + { + "epoch": 0.863013698630137, + "grad_norm": 4.8136491775512695, + "learning_rate": 8.621004566210046e-06, + "loss": 0.0507, + "step": 945 + }, + { + "epoch": 0.8639269406392694, + "grad_norm": 43.22735595703125, + "learning_rate": 8.63013698630137e-06, + "loss": 0.9271, + "step": 946 + }, + { + "epoch": 0.8648401826484018, + "grad_norm": 19.748376846313477, + "learning_rate": 8.639269406392695e-06, + "loss": 0.237, + "step": 947 + }, + { + "epoch": 0.8657534246575342, + "grad_norm": 47.26508331298828, + "learning_rate": 8.648401826484019e-06, + "loss": 1.488, + "step": 948 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 20.630945205688477, + "learning_rate": 8.657534246575343e-06, + "loss": 0.418, + "step": 949 + }, + { + "epoch": 0.867579908675799, + "grad_norm": 47.636783599853516, + "learning_rate": 8.666666666666668e-06, + "loss": 1.1168, + "step": 950 + }, + { + "epoch": 0.8684931506849315, + "grad_norm": 35.35184860229492, + "learning_rate": 8.675799086757991e-06, + "loss": 0.8006, + "step": 951 + }, + { + "epoch": 0.869406392694064, + "grad_norm": 10.310883522033691, + "learning_rate": 8.684931506849315e-06, + "loss": 0.1054, + "step": 952 + }, + { + "epoch": 0.8703196347031964, + "grad_norm": 31.58631706237793, + "learning_rate": 8.69406392694064e-06, + "loss": 0.5137, + "step": 953 + }, + { + "epoch": 0.8712328767123287, + "grad_norm": 7.111154556274414, + "learning_rate": 8.703196347031964e-06, + "loss": 0.1008, + "step": 954 + }, + { + "epoch": 0.8721461187214612, + "grad_norm": 19.847875595092773, + "learning_rate": 8.712328767123288e-06, + "loss": 0.4195, + "step": 955 + }, + { + "epoch": 0.8730593607305936, + "grad_norm": 7.029451847076416, + "learning_rate": 8.721461187214613e-06, + "loss": 0.0638, + "step": 956 + }, + { + "epoch": 0.873972602739726, + "grad_norm": 40.01822280883789, + "learning_rate": 8.730593607305937e-06, + "loss": 0.771, + "step": 957 + }, + { + "epoch": 0.8748858447488584, + "grad_norm": 40.26274108886719, + "learning_rate": 8.73972602739726e-06, + "loss": 0.7081, + "step": 958 + }, + { + "epoch": 0.8757990867579909, + "grad_norm": 8.959488868713379, + "learning_rate": 8.748858447488585e-06, + "loss": 0.1126, + "step": 959 + }, + { + "epoch": 0.8767123287671232, + "grad_norm": 37.39285659790039, + "learning_rate": 8.757990867579909e-06, + "loss": 0.5727, + "step": 960 + }, + { + "epoch": 0.8776255707762557, + "grad_norm": 63.173519134521484, + "learning_rate": 8.767123287671233e-06, + "loss": 3.8304, + "step": 961 + }, + { + "epoch": 0.8785388127853881, + "grad_norm": 2.3679869174957275, + "learning_rate": 8.776255707762558e-06, + "loss": 0.0268, + "step": 962 + }, + { + "epoch": 0.8794520547945206, + "grad_norm": 48.92608642578125, + "learning_rate": 8.785388127853882e-06, + "loss": 2.5467, + "step": 963 + }, + { + "epoch": 0.8803652968036529, + "grad_norm": 84.56397247314453, + "learning_rate": 8.794520547945207e-06, + "loss": 2.9302, + "step": 964 + }, + { + "epoch": 0.8812785388127854, + "grad_norm": 27.48685646057129, + "learning_rate": 8.80365296803653e-06, + "loss": 0.7301, + "step": 965 + }, + { + "epoch": 0.8821917808219178, + "grad_norm": 41.13814163208008, + "learning_rate": 8.812785388127854e-06, + "loss": 0.5862, + "step": 966 + }, + { + "epoch": 0.8831050228310502, + "grad_norm": 0.5637136697769165, + "learning_rate": 8.82191780821918e-06, + "loss": 0.0081, + "step": 967 + }, + { + "epoch": 0.8840182648401826, + "grad_norm": 5.641241550445557, + "learning_rate": 8.831050228310503e-06, + "loss": 0.0945, + "step": 968 + }, + { + "epoch": 0.8849315068493151, + "grad_norm": 18.359323501586914, + "learning_rate": 8.840182648401827e-06, + "loss": 0.267, + "step": 969 + }, + { + "epoch": 0.8858447488584474, + "grad_norm": 26.75527572631836, + "learning_rate": 8.849315068493152e-06, + "loss": 0.5195, + "step": 970 + }, + { + "epoch": 0.8867579908675799, + "grad_norm": 25.780128479003906, + "learning_rate": 8.858447488584476e-06, + "loss": 0.4239, + "step": 971 + }, + { + "epoch": 0.8876712328767123, + "grad_norm": 35.271568298339844, + "learning_rate": 8.8675799086758e-06, + "loss": 0.4683, + "step": 972 + }, + { + "epoch": 0.8885844748858448, + "grad_norm": 32.13649368286133, + "learning_rate": 8.876712328767125e-06, + "loss": 0.7593, + "step": 973 + }, + { + "epoch": 0.8894977168949771, + "grad_norm": 20.80233383178711, + "learning_rate": 8.885844748858448e-06, + "loss": 0.3003, + "step": 974 + }, + { + "epoch": 0.8904109589041096, + "grad_norm": 16.210113525390625, + "learning_rate": 8.894977168949772e-06, + "loss": 0.2757, + "step": 975 + }, + { + "epoch": 0.891324200913242, + "grad_norm": 59.657752990722656, + "learning_rate": 8.904109589041097e-06, + "loss": 1.2818, + "step": 976 + }, + { + "epoch": 0.8922374429223744, + "grad_norm": 8.657734870910645, + "learning_rate": 8.91324200913242e-06, + "loss": 0.122, + "step": 977 + }, + { + "epoch": 0.8931506849315068, + "grad_norm": 42.92193603515625, + "learning_rate": 8.922374429223744e-06, + "loss": 1.6747, + "step": 978 + }, + { + "epoch": 0.8940639269406393, + "grad_norm": 29.727954864501953, + "learning_rate": 8.93150684931507e-06, + "loss": 0.521, + "step": 979 + }, + { + "epoch": 0.8949771689497716, + "grad_norm": 17.443622589111328, + "learning_rate": 8.940639269406393e-06, + "loss": 0.3032, + "step": 980 + }, + { + "epoch": 0.8958904109589041, + "grad_norm": 52.31827163696289, + "learning_rate": 8.949771689497717e-06, + "loss": 2.2938, + "step": 981 + }, + { + "epoch": 0.8968036529680365, + "grad_norm": 53.39495086669922, + "learning_rate": 8.958904109589042e-06, + "loss": 2.8368, + "step": 982 + }, + { + "epoch": 0.897716894977169, + "grad_norm": 42.071800231933594, + "learning_rate": 8.968036529680366e-06, + "loss": 0.8983, + "step": 983 + }, + { + "epoch": 0.8986301369863013, + "grad_norm": 43.95864486694336, + "learning_rate": 8.977168949771691e-06, + "loss": 0.9598, + "step": 984 + }, + { + "epoch": 0.8995433789954338, + "grad_norm": 35.748680114746094, + "learning_rate": 8.986301369863015e-06, + "loss": 0.8824, + "step": 985 + }, + { + "epoch": 0.9004566210045662, + "grad_norm": 21.15885353088379, + "learning_rate": 8.995433789954338e-06, + "loss": 0.2875, + "step": 986 + }, + { + "epoch": 0.9013698630136986, + "grad_norm": 51.5980339050293, + "learning_rate": 9.004566210045664e-06, + "loss": 1.1653, + "step": 987 + }, + { + "epoch": 0.902283105022831, + "grad_norm": 42.0321159362793, + "learning_rate": 9.013698630136987e-06, + "loss": 0.8284, + "step": 988 + }, + { + "epoch": 0.9031963470319635, + "grad_norm": 29.71866798400879, + "learning_rate": 9.022831050228311e-06, + "loss": 0.8041, + "step": 989 + }, + { + "epoch": 0.9041095890410958, + "grad_norm": 35.47916030883789, + "learning_rate": 9.031963470319636e-06, + "loss": 0.8592, + "step": 990 + }, + { + "epoch": 0.9050228310502283, + "grad_norm": 7.410295009613037, + "learning_rate": 9.04109589041096e-06, + "loss": 0.0577, + "step": 991 + }, + { + "epoch": 0.9059360730593607, + "grad_norm": 45.97003936767578, + "learning_rate": 9.050228310502284e-06, + "loss": 1.5958, + "step": 992 + }, + { + "epoch": 0.9068493150684932, + "grad_norm": 24.428483963012695, + "learning_rate": 9.059360730593609e-06, + "loss": 0.313, + "step": 993 + }, + { + "epoch": 0.9077625570776255, + "grad_norm": 23.651872634887695, + "learning_rate": 9.068493150684932e-06, + "loss": 0.4294, + "step": 994 + }, + { + "epoch": 0.908675799086758, + "grad_norm": 24.528078079223633, + "learning_rate": 9.077625570776256e-06, + "loss": 0.5723, + "step": 995 + }, + { + "epoch": 0.9095890410958904, + "grad_norm": 133.62681579589844, + "learning_rate": 9.086757990867581e-06, + "loss": 1.2882, + "step": 996 + }, + { + "epoch": 0.9105022831050228, + "grad_norm": 34.80030059814453, + "learning_rate": 9.095890410958905e-06, + "loss": 0.7539, + "step": 997 + }, + { + "epoch": 0.9114155251141552, + "grad_norm": 58.51625442504883, + "learning_rate": 9.105022831050229e-06, + "loss": 2.0648, + "step": 998 + }, + { + "epoch": 0.9123287671232877, + "grad_norm": 59.904754638671875, + "learning_rate": 9.114155251141554e-06, + "loss": 1.8941, + "step": 999 + }, + { + "epoch": 0.91324200913242, + "grad_norm": 27.789072036743164, + "learning_rate": 9.123287671232878e-06, + "loss": 0.45, + "step": 1000 + }, + { + "epoch": 0.9141552511415525, + "grad_norm": 12.392375946044922, + "learning_rate": 9.132420091324201e-06, + "loss": 0.2095, + "step": 1001 + }, + { + "epoch": 0.915068493150685, + "grad_norm": 13.652204513549805, + "learning_rate": 9.141552511415526e-06, + "loss": 0.1702, + "step": 1002 + }, + { + "epoch": 0.9159817351598174, + "grad_norm": 31.6411190032959, + "learning_rate": 9.15068493150685e-06, + "loss": 0.5856, + "step": 1003 + }, + { + "epoch": 0.9168949771689497, + "grad_norm": 44.953243255615234, + "learning_rate": 9.159817351598175e-06, + "loss": 1.2529, + "step": 1004 + }, + { + "epoch": 0.9178082191780822, + "grad_norm": 38.62617492675781, + "learning_rate": 9.168949771689499e-06, + "loss": 0.7731, + "step": 1005 + }, + { + "epoch": 0.9187214611872146, + "grad_norm": 7.728457450866699, + "learning_rate": 9.178082191780823e-06, + "loss": 0.0974, + "step": 1006 + }, + { + "epoch": 0.919634703196347, + "grad_norm": 19.81455421447754, + "learning_rate": 9.187214611872148e-06, + "loss": 0.3892, + "step": 1007 + }, + { + "epoch": 0.9205479452054794, + "grad_norm": 41.50185012817383, + "learning_rate": 9.19634703196347e-06, + "loss": 0.5363, + "step": 1008 + }, + { + "epoch": 0.9214611872146119, + "grad_norm": 49.965633392333984, + "learning_rate": 9.205479452054795e-06, + "loss": 1.2936, + "step": 1009 + }, + { + "epoch": 0.9223744292237442, + "grad_norm": 7.674129962921143, + "learning_rate": 9.21461187214612e-06, + "loss": 0.0834, + "step": 1010 + }, + { + "epoch": 0.9232876712328767, + "grad_norm": 45.25316619873047, + "learning_rate": 9.223744292237442e-06, + "loss": 2.3148, + "step": 1011 + }, + { + "epoch": 0.9242009132420091, + "grad_norm": 11.981282234191895, + "learning_rate": 9.232876712328768e-06, + "loss": 0.1723, + "step": 1012 + }, + { + "epoch": 0.9251141552511416, + "grad_norm": 39.65054702758789, + "learning_rate": 9.242009132420093e-06, + "loss": 0.818, + "step": 1013 + }, + { + "epoch": 0.9260273972602739, + "grad_norm": 26.392580032348633, + "learning_rate": 9.251141552511417e-06, + "loss": 0.5125, + "step": 1014 + }, + { + "epoch": 0.9269406392694064, + "grad_norm": 29.119352340698242, + "learning_rate": 9.26027397260274e-06, + "loss": 0.6789, + "step": 1015 + }, + { + "epoch": 0.9278538812785389, + "grad_norm": 68.0001220703125, + "learning_rate": 9.269406392694064e-06, + "loss": 1.9861, + "step": 1016 + }, + { + "epoch": 0.9287671232876712, + "grad_norm": 6.92840576171875, + "learning_rate": 9.27853881278539e-06, + "loss": 0.1258, + "step": 1017 + }, + { + "epoch": 0.9296803652968036, + "grad_norm": 44.54508972167969, + "learning_rate": 9.287671232876713e-06, + "loss": 0.9348, + "step": 1018 + }, + { + "epoch": 0.9305936073059361, + "grad_norm": 47.20721435546875, + "learning_rate": 9.296803652968036e-06, + "loss": 1.4473, + "step": 1019 + }, + { + "epoch": 0.9315068493150684, + "grad_norm": 20.939613342285156, + "learning_rate": 9.305936073059362e-06, + "loss": 0.2564, + "step": 1020 + }, + { + "epoch": 0.9324200913242009, + "grad_norm": 41.08513641357422, + "learning_rate": 9.315068493150685e-06, + "loss": 0.4648, + "step": 1021 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 74.9217758178711, + "learning_rate": 9.324200913242009e-06, + "loss": 1.4167, + "step": 1022 + }, + { + "epoch": 0.9342465753424658, + "grad_norm": 48.378013610839844, + "learning_rate": 9.333333333333334e-06, + "loss": 0.9847, + "step": 1023 + }, + { + "epoch": 0.9351598173515981, + "grad_norm": 30.297513961791992, + "learning_rate": 9.342465753424658e-06, + "loss": 0.5692, + "step": 1024 + }, + { + "epoch": 0.9360730593607306, + "grad_norm": 51.248451232910156, + "learning_rate": 9.351598173515982e-06, + "loss": 1.3891, + "step": 1025 + }, + { + "epoch": 0.936986301369863, + "grad_norm": 19.54279899597168, + "learning_rate": 9.360730593607307e-06, + "loss": 0.3197, + "step": 1026 + }, + { + "epoch": 0.9378995433789954, + "grad_norm": 22.427316665649414, + "learning_rate": 9.36986301369863e-06, + "loss": 0.2777, + "step": 1027 + }, + { + "epoch": 0.9388127853881278, + "grad_norm": 0.9592695236206055, + "learning_rate": 9.378995433789954e-06, + "loss": 0.0103, + "step": 1028 + }, + { + "epoch": 0.9397260273972603, + "grad_norm": 34.64145278930664, + "learning_rate": 9.38812785388128e-06, + "loss": 0.7684, + "step": 1029 + }, + { + "epoch": 0.9406392694063926, + "grad_norm": 15.389307022094727, + "learning_rate": 9.397260273972603e-06, + "loss": 0.2159, + "step": 1030 + }, + { + "epoch": 0.9415525114155251, + "grad_norm": 45.958892822265625, + "learning_rate": 9.406392694063927e-06, + "loss": 1.4095, + "step": 1031 + }, + { + "epoch": 0.9424657534246575, + "grad_norm": 15.546075820922852, + "learning_rate": 9.415525114155252e-06, + "loss": 0.1898, + "step": 1032 + }, + { + "epoch": 0.94337899543379, + "grad_norm": 36.88715744018555, + "learning_rate": 9.424657534246576e-06, + "loss": 0.8986, + "step": 1033 + }, + { + "epoch": 0.9442922374429223, + "grad_norm": 27.952817916870117, + "learning_rate": 9.433789954337901e-06, + "loss": 0.2159, + "step": 1034 + }, + { + "epoch": 0.9452054794520548, + "grad_norm": 4.148837566375732, + "learning_rate": 9.442922374429225e-06, + "loss": 0.0695, + "step": 1035 + }, + { + "epoch": 0.9461187214611873, + "grad_norm": 39.380123138427734, + "learning_rate": 9.452054794520548e-06, + "loss": 0.523, + "step": 1036 + }, + { + "epoch": 0.9470319634703196, + "grad_norm": 8.705638885498047, + "learning_rate": 9.461187214611873e-06, + "loss": 0.1188, + "step": 1037 + }, + { + "epoch": 0.947945205479452, + "grad_norm": 34.61455535888672, + "learning_rate": 9.470319634703197e-06, + "loss": 0.7973, + "step": 1038 + }, + { + "epoch": 0.9488584474885845, + "grad_norm": 50.09312438964844, + "learning_rate": 9.47945205479452e-06, + "loss": 1.0606, + "step": 1039 + }, + { + "epoch": 0.9497716894977168, + "grad_norm": 8.532252311706543, + "learning_rate": 9.488584474885846e-06, + "loss": 0.137, + "step": 1040 + }, + { + "epoch": 0.9506849315068493, + "grad_norm": 5.959938049316406, + "learning_rate": 9.49771689497717e-06, + "loss": 0.0662, + "step": 1041 + }, + { + "epoch": 0.9515981735159817, + "grad_norm": 1.0827172994613647, + "learning_rate": 9.506849315068493e-06, + "loss": 0.0058, + "step": 1042 + }, + { + "epoch": 0.9525114155251142, + "grad_norm": 3.3905909061431885, + "learning_rate": 9.515981735159819e-06, + "loss": 0.0258, + "step": 1043 + }, + { + "epoch": 0.9534246575342465, + "grad_norm": 35.75830078125, + "learning_rate": 9.525114155251142e-06, + "loss": 0.724, + "step": 1044 + }, + { + "epoch": 0.954337899543379, + "grad_norm": 5.112847328186035, + "learning_rate": 9.534246575342466e-06, + "loss": 0.0542, + "step": 1045 + }, + { + "epoch": 0.9552511415525115, + "grad_norm": 18.09858512878418, + "learning_rate": 9.543378995433791e-06, + "loss": 0.1944, + "step": 1046 + }, + { + "epoch": 0.9561643835616438, + "grad_norm": 34.556983947753906, + "learning_rate": 9.552511415525115e-06, + "loss": 0.6588, + "step": 1047 + }, + { + "epoch": 0.9570776255707762, + "grad_norm": 62.830543518066406, + "learning_rate": 9.561643835616438e-06, + "loss": 3.7188, + "step": 1048 + }, + { + "epoch": 0.9579908675799087, + "grad_norm": 42.404991149902344, + "learning_rate": 9.570776255707764e-06, + "loss": 1.1457, + "step": 1049 + }, + { + "epoch": 0.958904109589041, + "grad_norm": 16.603015899658203, + "learning_rate": 9.579908675799087e-06, + "loss": 0.2627, + "step": 1050 + }, + { + "epoch": 0.9598173515981735, + "grad_norm": 51.92247772216797, + "learning_rate": 9.589041095890411e-06, + "loss": 1.5255, + "step": 1051 + }, + { + "epoch": 0.960730593607306, + "grad_norm": 0.06596300005912781, + "learning_rate": 9.598173515981736e-06, + "loss": 0.0007, + "step": 1052 + }, + { + "epoch": 0.9616438356164384, + "grad_norm": 7.2224225997924805, + "learning_rate": 9.60730593607306e-06, + "loss": 0.0779, + "step": 1053 + }, + { + "epoch": 0.9625570776255707, + "grad_norm": 6.8434739112854, + "learning_rate": 9.616438356164385e-06, + "loss": 0.0765, + "step": 1054 + }, + { + "epoch": 0.9634703196347032, + "grad_norm": 4.522412300109863, + "learning_rate": 9.625570776255709e-06, + "loss": 0.0531, + "step": 1055 + }, + { + "epoch": 0.9643835616438357, + "grad_norm": 53.79460144042969, + "learning_rate": 9.634703196347032e-06, + "loss": 2.1834, + "step": 1056 + }, + { + "epoch": 0.965296803652968, + "grad_norm": 2.5256900787353516, + "learning_rate": 9.643835616438358e-06, + "loss": 0.0392, + "step": 1057 + }, + { + "epoch": 0.9662100456621004, + "grad_norm": 9.465861320495605, + "learning_rate": 9.652968036529681e-06, + "loss": 0.1087, + "step": 1058 + }, + { + "epoch": 0.9671232876712329, + "grad_norm": 5.170820713043213, + "learning_rate": 9.662100456621005e-06, + "loss": 0.0812, + "step": 1059 + }, + { + "epoch": 0.9680365296803652, + "grad_norm": 22.73261070251465, + "learning_rate": 9.67123287671233e-06, + "loss": 0.358, + "step": 1060 + }, + { + "epoch": 0.9689497716894977, + "grad_norm": 99.82257843017578, + "learning_rate": 9.680365296803654e-06, + "loss": 2.0249, + "step": 1061 + }, + { + "epoch": 0.9698630136986301, + "grad_norm": 30.074007034301758, + "learning_rate": 9.689497716894977e-06, + "loss": 0.4371, + "step": 1062 + }, + { + "epoch": 0.9707762557077626, + "grad_norm": 57.0470085144043, + "learning_rate": 9.698630136986303e-06, + "loss": 2.4238, + "step": 1063 + }, + { + "epoch": 0.971689497716895, + "grad_norm": 54.115081787109375, + "learning_rate": 9.707762557077626e-06, + "loss": 1.7526, + "step": 1064 + }, + { + "epoch": 0.9726027397260274, + "grad_norm": 15.451104164123535, + "learning_rate": 9.71689497716895e-06, + "loss": 0.3667, + "step": 1065 + }, + { + "epoch": 0.9735159817351599, + "grad_norm": 15.901100158691406, + "learning_rate": 9.726027397260275e-06, + "loss": 0.1961, + "step": 1066 + }, + { + "epoch": 0.9744292237442922, + "grad_norm": 31.556224822998047, + "learning_rate": 9.735159817351599e-06, + "loss": 0.3063, + "step": 1067 + }, + { + "epoch": 0.9753424657534246, + "grad_norm": 32.953277587890625, + "learning_rate": 9.744292237442923e-06, + "loss": 0.4307, + "step": 1068 + }, + { + "epoch": 0.9762557077625571, + "grad_norm": 27.048839569091797, + "learning_rate": 9.753424657534248e-06, + "loss": 0.4849, + "step": 1069 + }, + { + "epoch": 0.9771689497716894, + "grad_norm": 27.89653968811035, + "learning_rate": 9.762557077625571e-06, + "loss": 0.7253, + "step": 1070 + }, + { + "epoch": 0.9780821917808219, + "grad_norm": 44.5951042175293, + "learning_rate": 9.771689497716895e-06, + "loss": 1.0748, + "step": 1071 + }, + { + "epoch": 0.9789954337899544, + "grad_norm": 57.65856170654297, + "learning_rate": 9.78082191780822e-06, + "loss": 3.0942, + "step": 1072 + }, + { + "epoch": 0.9799086757990868, + "grad_norm": 14.681427001953125, + "learning_rate": 9.789954337899544e-06, + "loss": 0.2594, + "step": 1073 + }, + { + "epoch": 0.9808219178082191, + "grad_norm": 39.053646087646484, + "learning_rate": 9.79908675799087e-06, + "loss": 0.6947, + "step": 1074 + }, + { + "epoch": 0.9817351598173516, + "grad_norm": 67.05286407470703, + "learning_rate": 9.808219178082193e-06, + "loss": 3.9007, + "step": 1075 + }, + { + "epoch": 0.982648401826484, + "grad_norm": 24.489797592163086, + "learning_rate": 9.817351598173517e-06, + "loss": 0.2819, + "step": 1076 + }, + { + "epoch": 0.9835616438356164, + "grad_norm": 10.352006912231445, + "learning_rate": 9.826484018264842e-06, + "loss": 0.1535, + "step": 1077 + }, + { + "epoch": 0.9844748858447488, + "grad_norm": 24.944795608520508, + "learning_rate": 9.835616438356166e-06, + "loss": 0.5344, + "step": 1078 + }, + { + "epoch": 0.9853881278538813, + "grad_norm": 42.438385009765625, + "learning_rate": 9.844748858447489e-06, + "loss": 1.907, + "step": 1079 + }, + { + "epoch": 0.9863013698630136, + "grad_norm": 37.1114501953125, + "learning_rate": 9.853881278538814e-06, + "loss": 0.8453, + "step": 1080 + }, + { + "epoch": 0.9872146118721461, + "grad_norm": 18.00391387939453, + "learning_rate": 9.863013698630138e-06, + "loss": 0.2932, + "step": 1081 + }, + { + "epoch": 0.9881278538812786, + "grad_norm": 50.89611053466797, + "learning_rate": 9.872146118721462e-06, + "loss": 1.2029, + "step": 1082 + }, + { + "epoch": 0.989041095890411, + "grad_norm": 330.1656799316406, + "learning_rate": 9.881278538812787e-06, + "loss": 0.3462, + "step": 1083 + }, + { + "epoch": 0.9899543378995433, + "grad_norm": 27.58793830871582, + "learning_rate": 9.89041095890411e-06, + "loss": 0.7806, + "step": 1084 + }, + { + "epoch": 0.9908675799086758, + "grad_norm": 38.77764892578125, + "learning_rate": 9.899543378995434e-06, + "loss": 0.9349, + "step": 1085 + }, + { + "epoch": 0.9917808219178083, + "grad_norm": 4.031241416931152, + "learning_rate": 9.90867579908676e-06, + "loss": 0.0291, + "step": 1086 + }, + { + "epoch": 0.9926940639269406, + "grad_norm": 43.20199966430664, + "learning_rate": 9.917808219178083e-06, + "loss": 0.9786, + "step": 1087 + }, + { + "epoch": 0.993607305936073, + "grad_norm": 15.13979434967041, + "learning_rate": 9.926940639269407e-06, + "loss": 0.2603, + "step": 1088 + }, + { + "epoch": 0.9945205479452055, + "grad_norm": 58.30978012084961, + "learning_rate": 9.936073059360732e-06, + "loss": 1.0497, + "step": 1089 + }, + { + "epoch": 0.9954337899543378, + "grad_norm": 14.354007720947266, + "learning_rate": 9.945205479452056e-06, + "loss": 0.1962, + "step": 1090 + }, + { + "epoch": 0.9963470319634703, + "grad_norm": 19.785078048706055, + "learning_rate": 9.95433789954338e-06, + "loss": 0.2617, + "step": 1091 + }, + { + "epoch": 0.9972602739726028, + "grad_norm": 31.45466423034668, + "learning_rate": 9.963470319634703e-06, + "loss": 0.637, + "step": 1092 + }, + { + "epoch": 0.9981735159817352, + "grad_norm": 32.74040222167969, + "learning_rate": 9.972602739726028e-06, + "loss": 0.8081, + "step": 1093 + }, + { + "epoch": 0.9990867579908675, + "grad_norm": 66.60192108154297, + "learning_rate": 9.981735159817354e-06, + "loss": 2.5766, + "step": 1094 + }, + { + "epoch": 1.0, + "grad_norm": 36.97367858886719, + "learning_rate": 9.990867579908676e-06, + "loss": 1.0699, + "step": 1095 + }, + { + "epoch": 1.0009132420091325, + "grad_norm": 41.73494338989258, + "learning_rate": 1e-05, + "loss": 0.8315, + "step": 1096 + }, + { + "epoch": 1.001826484018265, + "grad_norm": 5.130035877227783, + "learning_rate": 9.998985286656521e-06, + "loss": 0.0574, + "step": 1097 + }, + { + "epoch": 1.0027397260273974, + "grad_norm": 54.90626525878906, + "learning_rate": 9.99797057331304e-06, + "loss": 1.4038, + "step": 1098 + }, + { + "epoch": 1.0036529680365296, + "grad_norm": 4.780283451080322, + "learning_rate": 9.996955859969559e-06, + "loss": 0.0551, + "step": 1099 + }, + { + "epoch": 1.004566210045662, + "grad_norm": 13.630091667175293, + "learning_rate": 9.99594114662608e-06, + "loss": 0.2171, + "step": 1100 + }, + { + "epoch": 1.0054794520547945, + "grad_norm": 18.71612548828125, + "learning_rate": 9.994926433282598e-06, + "loss": 0.3347, + "step": 1101 + }, + { + "epoch": 1.006392694063927, + "grad_norm": 58.860618591308594, + "learning_rate": 9.993911719939117e-06, + "loss": 1.3197, + "step": 1102 + }, + { + "epoch": 1.0073059360730594, + "grad_norm": 2.8231494426727295, + "learning_rate": 9.992897006595638e-06, + "loss": 0.0354, + "step": 1103 + }, + { + "epoch": 1.0082191780821919, + "grad_norm": 22.67633628845215, + "learning_rate": 9.991882293252158e-06, + "loss": 0.3457, + "step": 1104 + }, + { + "epoch": 1.009132420091324, + "grad_norm": 28.52826690673828, + "learning_rate": 9.990867579908676e-06, + "loss": 0.706, + "step": 1105 + }, + { + "epoch": 1.0100456621004565, + "grad_norm": 71.93325805664062, + "learning_rate": 9.989852866565196e-06, + "loss": 2.9128, + "step": 1106 + }, + { + "epoch": 1.010958904109589, + "grad_norm": 103.769287109375, + "learning_rate": 9.988838153221717e-06, + "loss": 1.5303, + "step": 1107 + }, + { + "epoch": 1.0118721461187214, + "grad_norm": 16.21788215637207, + "learning_rate": 9.987823439878235e-06, + "loss": 0.22, + "step": 1108 + }, + { + "epoch": 1.012785388127854, + "grad_norm": 14.558876037597656, + "learning_rate": 9.986808726534754e-06, + "loss": 0.1853, + "step": 1109 + }, + { + "epoch": 1.0136986301369864, + "grad_norm": 15.664973258972168, + "learning_rate": 9.985794013191275e-06, + "loss": 0.1718, + "step": 1110 + }, + { + "epoch": 1.0146118721461188, + "grad_norm": 1.6848019361495972, + "learning_rate": 9.984779299847794e-06, + "loss": 0.0209, + "step": 1111 + }, + { + "epoch": 1.015525114155251, + "grad_norm": 4.810029983520508, + "learning_rate": 9.983764586504313e-06, + "loss": 0.0694, + "step": 1112 + }, + { + "epoch": 1.0164383561643835, + "grad_norm": 42.06098175048828, + "learning_rate": 9.982749873160833e-06, + "loss": 0.7699, + "step": 1113 + }, + { + "epoch": 1.017351598173516, + "grad_norm": 21.307830810546875, + "learning_rate": 9.981735159817354e-06, + "loss": 0.3898, + "step": 1114 + }, + { + "epoch": 1.0182648401826484, + "grad_norm": 36.57358932495117, + "learning_rate": 9.980720446473872e-06, + "loss": 0.8179, + "step": 1115 + }, + { + "epoch": 1.0191780821917809, + "grad_norm": 46.51877975463867, + "learning_rate": 9.979705733130391e-06, + "loss": 0.792, + "step": 1116 + }, + { + "epoch": 1.0200913242009133, + "grad_norm": 17.819929122924805, + "learning_rate": 9.978691019786912e-06, + "loss": 0.1784, + "step": 1117 + }, + { + "epoch": 1.0210045662100458, + "grad_norm": 58.83279800415039, + "learning_rate": 9.97767630644343e-06, + "loss": 1.3236, + "step": 1118 + }, + { + "epoch": 1.021917808219178, + "grad_norm": 25.28819465637207, + "learning_rate": 9.97666159309995e-06, + "loss": 0.4543, + "step": 1119 + }, + { + "epoch": 1.0228310502283104, + "grad_norm": 12.592864990234375, + "learning_rate": 9.97564687975647e-06, + "loss": 0.1562, + "step": 1120 + }, + { + "epoch": 1.023744292237443, + "grad_norm": 41.13899230957031, + "learning_rate": 9.974632166412989e-06, + "loss": 0.932, + "step": 1121 + }, + { + "epoch": 1.0246575342465754, + "grad_norm": 18.961206436157227, + "learning_rate": 9.973617453069508e-06, + "loss": 0.1641, + "step": 1122 + }, + { + "epoch": 1.0255707762557078, + "grad_norm": 67.6712875366211, + "learning_rate": 9.972602739726028e-06, + "loss": 2.2797, + "step": 1123 + }, + { + "epoch": 1.0264840182648403, + "grad_norm": 31.303621292114258, + "learning_rate": 9.971588026382549e-06, + "loss": 0.4387, + "step": 1124 + }, + { + "epoch": 1.0273972602739727, + "grad_norm": 19.241458892822266, + "learning_rate": 9.970573313039068e-06, + "loss": 0.3145, + "step": 1125 + }, + { + "epoch": 1.028310502283105, + "grad_norm": 25.28453254699707, + "learning_rate": 9.969558599695586e-06, + "loss": 0.4734, + "step": 1126 + }, + { + "epoch": 1.0292237442922374, + "grad_norm": 17.40557098388672, + "learning_rate": 9.968543886352107e-06, + "loss": 0.2816, + "step": 1127 + }, + { + "epoch": 1.0301369863013699, + "grad_norm": 63.887474060058594, + "learning_rate": 9.967529173008626e-06, + "loss": 3.0106, + "step": 1128 + }, + { + "epoch": 1.0310502283105023, + "grad_norm": 11.002396583557129, + "learning_rate": 9.966514459665145e-06, + "loss": 0.126, + "step": 1129 + }, + { + "epoch": 1.0319634703196348, + "grad_norm": 38.731719970703125, + "learning_rate": 9.965499746321665e-06, + "loss": 0.6099, + "step": 1130 + }, + { + "epoch": 1.0328767123287672, + "grad_norm": 34.49280548095703, + "learning_rate": 9.964485032978184e-06, + "loss": 0.809, + "step": 1131 + }, + { + "epoch": 1.0337899543378994, + "grad_norm": 23.424684524536133, + "learning_rate": 9.963470319634703e-06, + "loss": 0.3402, + "step": 1132 + }, + { + "epoch": 1.034703196347032, + "grad_norm": 1.6140964031219482, + "learning_rate": 9.962455606291223e-06, + "loss": 0.0239, + "step": 1133 + }, + { + "epoch": 1.0356164383561643, + "grad_norm": 15.017940521240234, + "learning_rate": 9.961440892947744e-06, + "loss": 0.1782, + "step": 1134 + }, + { + "epoch": 1.0365296803652968, + "grad_norm": 14.435965538024902, + "learning_rate": 9.960426179604263e-06, + "loss": 0.2255, + "step": 1135 + }, + { + "epoch": 1.0374429223744293, + "grad_norm": 56.158721923828125, + "learning_rate": 9.959411466260782e-06, + "loss": 0.9746, + "step": 1136 + }, + { + "epoch": 1.0383561643835617, + "grad_norm": 31.676673889160156, + "learning_rate": 9.958396752917302e-06, + "loss": 0.636, + "step": 1137 + }, + { + "epoch": 1.0392694063926942, + "grad_norm": 30.70090103149414, + "learning_rate": 9.957382039573821e-06, + "loss": 0.4555, + "step": 1138 + }, + { + "epoch": 1.0401826484018264, + "grad_norm": 51.653724670410156, + "learning_rate": 9.95636732623034e-06, + "loss": 0.9648, + "step": 1139 + }, + { + "epoch": 1.0410958904109588, + "grad_norm": 31.10382652282715, + "learning_rate": 9.95535261288686e-06, + "loss": 0.3974, + "step": 1140 + }, + { + "epoch": 1.0420091324200913, + "grad_norm": 21.391565322875977, + "learning_rate": 9.95433789954338e-06, + "loss": 0.3152, + "step": 1141 + }, + { + "epoch": 1.0429223744292238, + "grad_norm": 50.56828308105469, + "learning_rate": 9.9533231861999e-06, + "loss": 1.0296, + "step": 1142 + }, + { + "epoch": 1.0438356164383562, + "grad_norm": 9.80189037322998, + "learning_rate": 9.952308472856419e-06, + "loss": 0.0996, + "step": 1143 + }, + { + "epoch": 1.0447488584474887, + "grad_norm": 15.954371452331543, + "learning_rate": 9.95129375951294e-06, + "loss": 0.157, + "step": 1144 + }, + { + "epoch": 1.045662100456621, + "grad_norm": 6.455676555633545, + "learning_rate": 9.950279046169458e-06, + "loss": 0.0889, + "step": 1145 + }, + { + "epoch": 1.0465753424657533, + "grad_norm": 9.065537452697754, + "learning_rate": 9.949264332825977e-06, + "loss": 0.1271, + "step": 1146 + }, + { + "epoch": 1.0474885844748858, + "grad_norm": 15.332294464111328, + "learning_rate": 9.948249619482497e-06, + "loss": 0.1849, + "step": 1147 + }, + { + "epoch": 1.0484018264840183, + "grad_norm": 6.915588855743408, + "learning_rate": 9.947234906139016e-06, + "loss": 0.1029, + "step": 1148 + }, + { + "epoch": 1.0493150684931507, + "grad_norm": 53.33873748779297, + "learning_rate": 9.946220192795535e-06, + "loss": 1.1399, + "step": 1149 + }, + { + "epoch": 1.0502283105022832, + "grad_norm": 3.140573263168335, + "learning_rate": 9.945205479452056e-06, + "loss": 0.0437, + "step": 1150 + }, + { + "epoch": 1.0511415525114156, + "grad_norm": 61.57686233520508, + "learning_rate": 9.944190766108575e-06, + "loss": 1.2633, + "step": 1151 + }, + { + "epoch": 1.0520547945205478, + "grad_norm": 89.97989654541016, + "learning_rate": 9.943176052765095e-06, + "loss": 3.5391, + "step": 1152 + }, + { + "epoch": 1.0529680365296803, + "grad_norm": 10.101164817810059, + "learning_rate": 9.942161339421614e-06, + "loss": 0.1553, + "step": 1153 + }, + { + "epoch": 1.0538812785388127, + "grad_norm": 51.350303649902344, + "learning_rate": 9.941146626078134e-06, + "loss": 1.05, + "step": 1154 + }, + { + "epoch": 1.0547945205479452, + "grad_norm": 35.82583236694336, + "learning_rate": 9.940131912734653e-06, + "loss": 0.6979, + "step": 1155 + }, + { + "epoch": 1.0557077625570777, + "grad_norm": 11.62342357635498, + "learning_rate": 9.939117199391172e-06, + "loss": 0.1846, + "step": 1156 + }, + { + "epoch": 1.05662100456621, + "grad_norm": 3.032097339630127, + "learning_rate": 9.938102486047693e-06, + "loss": 0.0352, + "step": 1157 + }, + { + "epoch": 1.0575342465753426, + "grad_norm": 24.026090621948242, + "learning_rate": 9.937087772704212e-06, + "loss": 0.3156, + "step": 1158 + }, + { + "epoch": 1.0584474885844748, + "grad_norm": 17.078264236450195, + "learning_rate": 9.936073059360732e-06, + "loss": 0.3054, + "step": 1159 + }, + { + "epoch": 1.0593607305936072, + "grad_norm": 66.6882095336914, + "learning_rate": 9.935058346017251e-06, + "loss": 0.7893, + "step": 1160 + }, + { + "epoch": 1.0602739726027397, + "grad_norm": 46.55051803588867, + "learning_rate": 9.93404363267377e-06, + "loss": 1.1405, + "step": 1161 + }, + { + "epoch": 1.0611872146118722, + "grad_norm": 88.59442901611328, + "learning_rate": 9.93302891933029e-06, + "loss": 3.6755, + "step": 1162 + }, + { + "epoch": 1.0621004566210046, + "grad_norm": 2.128161907196045, + "learning_rate": 9.93201420598681e-06, + "loss": 0.0243, + "step": 1163 + }, + { + "epoch": 1.063013698630137, + "grad_norm": 62.743309020996094, + "learning_rate": 9.93099949264333e-06, + "loss": 1.5788, + "step": 1164 + }, + { + "epoch": 1.0639269406392695, + "grad_norm": 25.348119735717773, + "learning_rate": 9.929984779299849e-06, + "loss": 0.4438, + "step": 1165 + }, + { + "epoch": 1.0648401826484017, + "grad_norm": 23.631437301635742, + "learning_rate": 9.928970065956367e-06, + "loss": 0.3763, + "step": 1166 + }, + { + "epoch": 1.0657534246575342, + "grad_norm": 29.899009704589844, + "learning_rate": 9.927955352612888e-06, + "loss": 0.4412, + "step": 1167 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 34.807411193847656, + "learning_rate": 9.926940639269407e-06, + "loss": 0.4931, + "step": 1168 + }, + { + "epoch": 1.067579908675799, + "grad_norm": 60.9229621887207, + "learning_rate": 9.925925925925927e-06, + "loss": 1.2756, + "step": 1169 + }, + { + "epoch": 1.0684931506849316, + "grad_norm": 30.51190757751465, + "learning_rate": 9.924911212582446e-06, + "loss": 0.4934, + "step": 1170 + }, + { + "epoch": 1.069406392694064, + "grad_norm": 23.544187545776367, + "learning_rate": 9.923896499238965e-06, + "loss": 0.2779, + "step": 1171 + }, + { + "epoch": 1.0703196347031962, + "grad_norm": 33.62771987915039, + "learning_rate": 9.922881785895486e-06, + "loss": 0.9348, + "step": 1172 + }, + { + "epoch": 1.0712328767123287, + "grad_norm": 37.377037048339844, + "learning_rate": 9.921867072552004e-06, + "loss": 0.6885, + "step": 1173 + }, + { + "epoch": 1.0721461187214611, + "grad_norm": 19.940296173095703, + "learning_rate": 9.920852359208525e-06, + "loss": 0.2792, + "step": 1174 + }, + { + "epoch": 1.0730593607305936, + "grad_norm": 11.030420303344727, + "learning_rate": 9.919837645865044e-06, + "loss": 0.1086, + "step": 1175 + }, + { + "epoch": 1.073972602739726, + "grad_norm": 35.80003356933594, + "learning_rate": 9.918822932521563e-06, + "loss": 0.6718, + "step": 1176 + }, + { + "epoch": 1.0748858447488585, + "grad_norm": 50.51363754272461, + "learning_rate": 9.917808219178083e-06, + "loss": 0.9444, + "step": 1177 + }, + { + "epoch": 1.075799086757991, + "grad_norm": 30.380199432373047, + "learning_rate": 9.916793505834602e-06, + "loss": 0.4625, + "step": 1178 + }, + { + "epoch": 1.0767123287671232, + "grad_norm": 12.982952117919922, + "learning_rate": 9.915778792491123e-06, + "loss": 0.207, + "step": 1179 + }, + { + "epoch": 1.0776255707762556, + "grad_norm": 46.49965286254883, + "learning_rate": 9.914764079147641e-06, + "loss": 0.8521, + "step": 1180 + }, + { + "epoch": 1.078538812785388, + "grad_norm": 13.341947555541992, + "learning_rate": 9.91374936580416e-06, + "loss": 0.1945, + "step": 1181 + }, + { + "epoch": 1.0794520547945206, + "grad_norm": 58.88418197631836, + "learning_rate": 9.91273465246068e-06, + "loss": 2.0384, + "step": 1182 + }, + { + "epoch": 1.080365296803653, + "grad_norm": 26.028127670288086, + "learning_rate": 9.9117199391172e-06, + "loss": 0.4063, + "step": 1183 + }, + { + "epoch": 1.0812785388127855, + "grad_norm": 5.739835739135742, + "learning_rate": 9.91070522577372e-06, + "loss": 0.0603, + "step": 1184 + }, + { + "epoch": 1.0821917808219177, + "grad_norm": 31.58884620666504, + "learning_rate": 9.909690512430239e-06, + "loss": 0.5303, + "step": 1185 + }, + { + "epoch": 1.0831050228310501, + "grad_norm": 50.5161247253418, + "learning_rate": 9.90867579908676e-06, + "loss": 2.0742, + "step": 1186 + }, + { + "epoch": 1.0840182648401826, + "grad_norm": 29.272647857666016, + "learning_rate": 9.907661085743278e-06, + "loss": 0.5627, + "step": 1187 + }, + { + "epoch": 1.084931506849315, + "grad_norm": 46.7690544128418, + "learning_rate": 9.906646372399797e-06, + "loss": 1.0921, + "step": 1188 + }, + { + "epoch": 1.0858447488584475, + "grad_norm": 18.687820434570312, + "learning_rate": 9.905631659056318e-06, + "loss": 0.2163, + "step": 1189 + }, + { + "epoch": 1.08675799086758, + "grad_norm": 18.631874084472656, + "learning_rate": 9.904616945712837e-06, + "loss": 0.1921, + "step": 1190 + }, + { + "epoch": 1.0876712328767124, + "grad_norm": 0.3660396635532379, + "learning_rate": 9.903602232369355e-06, + "loss": 0.0047, + "step": 1191 + }, + { + "epoch": 1.0885844748858449, + "grad_norm": 1.7191704511642456, + "learning_rate": 9.902587519025876e-06, + "loss": 0.0213, + "step": 1192 + }, + { + "epoch": 1.089497716894977, + "grad_norm": 12.871003150939941, + "learning_rate": 9.901572805682395e-06, + "loss": 0.2014, + "step": 1193 + }, + { + "epoch": 1.0904109589041096, + "grad_norm": 41.756439208984375, + "learning_rate": 9.900558092338915e-06, + "loss": 1.2721, + "step": 1194 + }, + { + "epoch": 1.091324200913242, + "grad_norm": 29.54254722595215, + "learning_rate": 9.899543378995434e-06, + "loss": 0.4698, + "step": 1195 + }, + { + "epoch": 1.0922374429223745, + "grad_norm": 47.38060760498047, + "learning_rate": 9.898528665651955e-06, + "loss": 0.5604, + "step": 1196 + }, + { + "epoch": 1.093150684931507, + "grad_norm": 14.940537452697754, + "learning_rate": 9.897513952308474e-06, + "loss": 0.1764, + "step": 1197 + }, + { + "epoch": 1.0940639269406394, + "grad_norm": 5.219301223754883, + "learning_rate": 9.896499238964992e-06, + "loss": 0.0584, + "step": 1198 + }, + { + "epoch": 1.0949771689497716, + "grad_norm": 47.89594650268555, + "learning_rate": 9.895484525621513e-06, + "loss": 1.0108, + "step": 1199 + }, + { + "epoch": 1.095890410958904, + "grad_norm": 6.625697612762451, + "learning_rate": 9.894469812278032e-06, + "loss": 0.1012, + "step": 1200 + }, + { + "epoch": 1.0968036529680365, + "grad_norm": 18.147422790527344, + "learning_rate": 9.89345509893455e-06, + "loss": 0.2356, + "step": 1201 + }, + { + "epoch": 1.097716894977169, + "grad_norm": 30.85698699951172, + "learning_rate": 9.892440385591071e-06, + "loss": 0.3667, + "step": 1202 + }, + { + "epoch": 1.0986301369863014, + "grad_norm": 84.84284210205078, + "learning_rate": 9.891425672247592e-06, + "loss": 4.248, + "step": 1203 + }, + { + "epoch": 1.0995433789954339, + "grad_norm": 71.85745239257812, + "learning_rate": 9.89041095890411e-06, + "loss": 2.7516, + "step": 1204 + }, + { + "epoch": 1.1004566210045663, + "grad_norm": 7.036884307861328, + "learning_rate": 9.88939624556063e-06, + "loss": 0.1269, + "step": 1205 + }, + { + "epoch": 1.1013698630136985, + "grad_norm": 25.622577667236328, + "learning_rate": 9.88838153221715e-06, + "loss": 0.5413, + "step": 1206 + }, + { + "epoch": 1.102283105022831, + "grad_norm": 73.6036605834961, + "learning_rate": 9.887366818873669e-06, + "loss": 1.9702, + "step": 1207 + }, + { + "epoch": 1.1031963470319635, + "grad_norm": 55.04779052734375, + "learning_rate": 9.886352105530188e-06, + "loss": 1.7292, + "step": 1208 + }, + { + "epoch": 1.104109589041096, + "grad_norm": 41.49099349975586, + "learning_rate": 9.885337392186708e-06, + "loss": 0.6564, + "step": 1209 + }, + { + "epoch": 1.1050228310502284, + "grad_norm": 51.15057373046875, + "learning_rate": 9.884322678843227e-06, + "loss": 1.2158, + "step": 1210 + }, + { + "epoch": 1.1059360730593608, + "grad_norm": 13.288839340209961, + "learning_rate": 9.883307965499746e-06, + "loss": 0.1857, + "step": 1211 + }, + { + "epoch": 1.106849315068493, + "grad_norm": 0.6238160729408264, + "learning_rate": 9.882293252156266e-06, + "loss": 0.0073, + "step": 1212 + }, + { + "epoch": 1.1077625570776255, + "grad_norm": 11.817009925842285, + "learning_rate": 9.881278538812787e-06, + "loss": 0.2093, + "step": 1213 + }, + { + "epoch": 1.108675799086758, + "grad_norm": 62.526248931884766, + "learning_rate": 9.880263825469306e-06, + "loss": 3.1242, + "step": 1214 + }, + { + "epoch": 1.1095890410958904, + "grad_norm": 62.33058166503906, + "learning_rate": 9.879249112125825e-06, + "loss": 1.9686, + "step": 1215 + }, + { + "epoch": 1.1105022831050229, + "grad_norm": 1.1697734594345093, + "learning_rate": 9.878234398782345e-06, + "loss": 0.0135, + "step": 1216 + }, + { + "epoch": 1.1114155251141553, + "grad_norm": 60.77466583251953, + "learning_rate": 9.877219685438864e-06, + "loss": 1.6575, + "step": 1217 + }, + { + "epoch": 1.1123287671232878, + "grad_norm": 51.92008972167969, + "learning_rate": 9.876204972095383e-06, + "loss": 1.0725, + "step": 1218 + }, + { + "epoch": 1.11324200913242, + "grad_norm": 10.140375137329102, + "learning_rate": 9.875190258751903e-06, + "loss": 0.1537, + "step": 1219 + }, + { + "epoch": 1.1141552511415524, + "grad_norm": 41.750999450683594, + "learning_rate": 9.874175545408424e-06, + "loss": 0.993, + "step": 1220 + }, + { + "epoch": 1.115068493150685, + "grad_norm": 37.88629150390625, + "learning_rate": 9.873160832064941e-06, + "loss": 0.3494, + "step": 1221 + }, + { + "epoch": 1.1159817351598174, + "grad_norm": 59.00156784057617, + "learning_rate": 9.872146118721462e-06, + "loss": 2.1927, + "step": 1222 + }, + { + "epoch": 1.1168949771689498, + "grad_norm": 38.91499328613281, + "learning_rate": 9.871131405377982e-06, + "loss": 0.8595, + "step": 1223 + }, + { + "epoch": 1.1178082191780823, + "grad_norm": 10.798958778381348, + "learning_rate": 9.870116692034501e-06, + "loss": 0.1584, + "step": 1224 + }, + { + "epoch": 1.1187214611872145, + "grad_norm": 74.30647277832031, + "learning_rate": 9.86910197869102e-06, + "loss": 6.2651, + "step": 1225 + }, + { + "epoch": 1.119634703196347, + "grad_norm": 24.033767700195312, + "learning_rate": 9.86808726534754e-06, + "loss": 0.6558, + "step": 1226 + }, + { + "epoch": 1.1205479452054794, + "grad_norm": 59.616737365722656, + "learning_rate": 9.86707255200406e-06, + "loss": 2.8227, + "step": 1227 + }, + { + "epoch": 1.1214611872146119, + "grad_norm": 9.474469184875488, + "learning_rate": 9.866057838660578e-06, + "loss": 0.1687, + "step": 1228 + }, + { + "epoch": 1.1223744292237443, + "grad_norm": 25.690513610839844, + "learning_rate": 9.865043125317099e-06, + "loss": 0.7175, + "step": 1229 + }, + { + "epoch": 1.1232876712328768, + "grad_norm": 15.44788932800293, + "learning_rate": 9.86402841197362e-06, + "loss": 0.3099, + "step": 1230 + }, + { + "epoch": 1.1242009132420092, + "grad_norm": 17.656774520874023, + "learning_rate": 9.863013698630138e-06, + "loss": 0.3916, + "step": 1231 + }, + { + "epoch": 1.1251141552511417, + "grad_norm": 60.810176849365234, + "learning_rate": 9.861998985286657e-06, + "loss": 2.1152, + "step": 1232 + }, + { + "epoch": 1.126027397260274, + "grad_norm": 51.88747787475586, + "learning_rate": 9.860984271943177e-06, + "loss": 1.2317, + "step": 1233 + }, + { + "epoch": 1.1269406392694064, + "grad_norm": 22.638025283813477, + "learning_rate": 9.859969558599696e-06, + "loss": 0.4006, + "step": 1234 + }, + { + "epoch": 1.1278538812785388, + "grad_norm": 4.032437324523926, + "learning_rate": 9.858954845256215e-06, + "loss": 0.0537, + "step": 1235 + }, + { + "epoch": 1.1287671232876713, + "grad_norm": 42.04151916503906, + "learning_rate": 9.857940131912736e-06, + "loss": 2.3131, + "step": 1236 + }, + { + "epoch": 1.1296803652968037, + "grad_norm": 50.88027572631836, + "learning_rate": 9.856925418569255e-06, + "loss": 1.6903, + "step": 1237 + }, + { + "epoch": 1.1305936073059362, + "grad_norm": 18.401762008666992, + "learning_rate": 9.855910705225773e-06, + "loss": 0.3451, + "step": 1238 + }, + { + "epoch": 1.1315068493150684, + "grad_norm": 43.150962829589844, + "learning_rate": 9.854895991882294e-06, + "loss": 0.9674, + "step": 1239 + }, + { + "epoch": 1.1324200913242009, + "grad_norm": 43.93855285644531, + "learning_rate": 9.853881278538814e-06, + "loss": 1.6648, + "step": 1240 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 12.312539100646973, + "learning_rate": 9.852866565195333e-06, + "loss": 0.2949, + "step": 1241 + }, + { + "epoch": 1.1342465753424658, + "grad_norm": 19.98088836669922, + "learning_rate": 9.851851851851852e-06, + "loss": 0.3258, + "step": 1242 + }, + { + "epoch": 1.1351598173515982, + "grad_norm": 27.595117568969727, + "learning_rate": 9.850837138508373e-06, + "loss": 0.7255, + "step": 1243 + }, + { + "epoch": 1.1360730593607307, + "grad_norm": 23.96356201171875, + "learning_rate": 9.849822425164892e-06, + "loss": 0.6156, + "step": 1244 + }, + { + "epoch": 1.1369863013698631, + "grad_norm": 15.664968490600586, + "learning_rate": 9.84880771182141e-06, + "loss": 0.2446, + "step": 1245 + }, + { + "epoch": 1.1378995433789953, + "grad_norm": 22.355810165405273, + "learning_rate": 9.847792998477931e-06, + "loss": 0.3398, + "step": 1246 + }, + { + "epoch": 1.1388127853881278, + "grad_norm": 61.15244674682617, + "learning_rate": 9.846778285134451e-06, + "loss": 2.7468, + "step": 1247 + }, + { + "epoch": 1.1397260273972603, + "grad_norm": 23.509841918945312, + "learning_rate": 9.845763571790969e-06, + "loss": 0.645, + "step": 1248 + }, + { + "epoch": 1.1406392694063927, + "grad_norm": 11.276506423950195, + "learning_rate": 9.844748858447489e-06, + "loss": 0.147, + "step": 1249 + }, + { + "epoch": 1.1415525114155252, + "grad_norm": 28.268638610839844, + "learning_rate": 9.84373414510401e-06, + "loss": 0.5892, + "step": 1250 + }, + { + "epoch": 1.1424657534246576, + "grad_norm": 19.872909545898438, + "learning_rate": 9.842719431760529e-06, + "loss": 0.3554, + "step": 1251 + }, + { + "epoch": 1.1433789954337898, + "grad_norm": 54.66286849975586, + "learning_rate": 9.841704718417047e-06, + "loss": 2.3764, + "step": 1252 + }, + { + "epoch": 1.1442922374429223, + "grad_norm": 11.771438598632812, + "learning_rate": 9.840690005073568e-06, + "loss": 0.242, + "step": 1253 + }, + { + "epoch": 1.1452054794520548, + "grad_norm": 26.019107818603516, + "learning_rate": 9.839675291730087e-06, + "loss": 0.8545, + "step": 1254 + }, + { + "epoch": 1.1461187214611872, + "grad_norm": 24.470746994018555, + "learning_rate": 9.838660578386606e-06, + "loss": 0.4171, + "step": 1255 + }, + { + "epoch": 1.1470319634703197, + "grad_norm": 22.891498565673828, + "learning_rate": 9.837645865043126e-06, + "loss": 0.5572, + "step": 1256 + }, + { + "epoch": 1.1479452054794521, + "grad_norm": 9.15639591217041, + "learning_rate": 9.836631151699647e-06, + "loss": 0.1543, + "step": 1257 + }, + { + "epoch": 1.1488584474885846, + "grad_norm": 63.10853958129883, + "learning_rate": 9.835616438356166e-06, + "loss": 3.4953, + "step": 1258 + }, + { + "epoch": 1.1497716894977168, + "grad_norm": 24.7301025390625, + "learning_rate": 9.834601725012684e-06, + "loss": 0.6433, + "step": 1259 + }, + { + "epoch": 1.1506849315068493, + "grad_norm": 68.76799774169922, + "learning_rate": 9.833587011669205e-06, + "loss": 3.9181, + "step": 1260 + }, + { + "epoch": 1.1515981735159817, + "grad_norm": 6.9689154624938965, + "learning_rate": 9.832572298325724e-06, + "loss": 0.0594, + "step": 1261 + }, + { + "epoch": 1.1525114155251142, + "grad_norm": 16.575504302978516, + "learning_rate": 9.831557584982243e-06, + "loss": 0.3563, + "step": 1262 + }, + { + "epoch": 1.1534246575342466, + "grad_norm": 72.49592590332031, + "learning_rate": 9.830542871638763e-06, + "loss": 2.5991, + "step": 1263 + }, + { + "epoch": 1.154337899543379, + "grad_norm": 26.978912353515625, + "learning_rate": 9.829528158295284e-06, + "loss": 0.5208, + "step": 1264 + }, + { + "epoch": 1.1552511415525113, + "grad_norm": 6.887632369995117, + "learning_rate": 9.8285134449518e-06, + "loss": 0.1569, + "step": 1265 + }, + { + "epoch": 1.1561643835616437, + "grad_norm": 45.68442916870117, + "learning_rate": 9.827498731608321e-06, + "loss": 2.018, + "step": 1266 + }, + { + "epoch": 1.1570776255707762, + "grad_norm": 36.839176177978516, + "learning_rate": 9.826484018264842e-06, + "loss": 0.8863, + "step": 1267 + }, + { + "epoch": 1.1579908675799087, + "grad_norm": 19.3278865814209, + "learning_rate": 9.82546930492136e-06, + "loss": 0.3375, + "step": 1268 + }, + { + "epoch": 1.158904109589041, + "grad_norm": 30.88810920715332, + "learning_rate": 9.82445459157788e-06, + "loss": 0.8634, + "step": 1269 + }, + { + "epoch": 1.1598173515981736, + "grad_norm": 32.570533752441406, + "learning_rate": 9.8234398782344e-06, + "loss": 0.4779, + "step": 1270 + }, + { + "epoch": 1.160730593607306, + "grad_norm": 37.32749557495117, + "learning_rate": 9.822425164890919e-06, + "loss": 1.78, + "step": 1271 + }, + { + "epoch": 1.1616438356164385, + "grad_norm": 30.596107482910156, + "learning_rate": 9.821410451547438e-06, + "loss": 1.0863, + "step": 1272 + }, + { + "epoch": 1.1625570776255707, + "grad_norm": 34.664066314697266, + "learning_rate": 9.820395738203958e-06, + "loss": 0.3648, + "step": 1273 + }, + { + "epoch": 1.1634703196347032, + "grad_norm": 25.778499603271484, + "learning_rate": 9.819381024860479e-06, + "loss": 0.6214, + "step": 1274 + }, + { + "epoch": 1.1643835616438356, + "grad_norm": 47.24235153198242, + "learning_rate": 9.818366311516998e-06, + "loss": 2.3884, + "step": 1275 + }, + { + "epoch": 1.165296803652968, + "grad_norm": 49.94087219238281, + "learning_rate": 9.817351598173517e-06, + "loss": 1.2706, + "step": 1276 + }, + { + "epoch": 1.1662100456621005, + "grad_norm": 17.875743865966797, + "learning_rate": 9.816336884830037e-06, + "loss": 0.4478, + "step": 1277 + }, + { + "epoch": 1.167123287671233, + "grad_norm": 16.08768081665039, + "learning_rate": 9.815322171486556e-06, + "loss": 0.4077, + "step": 1278 + }, + { + "epoch": 1.1680365296803652, + "grad_norm": 40.78914260864258, + "learning_rate": 9.814307458143075e-06, + "loss": 0.8733, + "step": 1279 + }, + { + "epoch": 1.1689497716894977, + "grad_norm": 20.561742782592773, + "learning_rate": 9.813292744799595e-06, + "loss": 0.3978, + "step": 1280 + }, + { + "epoch": 1.16986301369863, + "grad_norm": 17.136255264282227, + "learning_rate": 9.812278031456114e-06, + "loss": 0.3137, + "step": 1281 + }, + { + "epoch": 1.1707762557077626, + "grad_norm": 2.770081043243408, + "learning_rate": 9.811263318112633e-06, + "loss": 0.0412, + "step": 1282 + }, + { + "epoch": 1.171689497716895, + "grad_norm": 35.56282043457031, + "learning_rate": 9.810248604769154e-06, + "loss": 1.1608, + "step": 1283 + }, + { + "epoch": 1.1726027397260275, + "grad_norm": 7.968186378479004, + "learning_rate": 9.809233891425674e-06, + "loss": 0.1845, + "step": 1284 + }, + { + "epoch": 1.17351598173516, + "grad_norm": 24.85920524597168, + "learning_rate": 9.808219178082193e-06, + "loss": 0.5709, + "step": 1285 + }, + { + "epoch": 1.1744292237442922, + "grad_norm": 43.20223617553711, + "learning_rate": 9.807204464738712e-06, + "loss": 0.7497, + "step": 1286 + }, + { + "epoch": 1.1753424657534246, + "grad_norm": 21.727054595947266, + "learning_rate": 9.806189751395232e-06, + "loss": 0.4302, + "step": 1287 + }, + { + "epoch": 1.176255707762557, + "grad_norm": 11.69324016571045, + "learning_rate": 9.805175038051751e-06, + "loss": 0.224, + "step": 1288 + }, + { + "epoch": 1.1771689497716895, + "grad_norm": 8.154023170471191, + "learning_rate": 9.80416032470827e-06, + "loss": 0.1706, + "step": 1289 + }, + { + "epoch": 1.178082191780822, + "grad_norm": 33.24765396118164, + "learning_rate": 9.80314561136479e-06, + "loss": 1.0727, + "step": 1290 + }, + { + "epoch": 1.1789954337899544, + "grad_norm": 28.79535675048828, + "learning_rate": 9.80213089802131e-06, + "loss": 0.7039, + "step": 1291 + }, + { + "epoch": 1.1799086757990866, + "grad_norm": 21.41539192199707, + "learning_rate": 9.801116184677828e-06, + "loss": 0.4649, + "step": 1292 + }, + { + "epoch": 1.180821917808219, + "grad_norm": 60.59461975097656, + "learning_rate": 9.800101471334349e-06, + "loss": 2.6781, + "step": 1293 + }, + { + "epoch": 1.1817351598173516, + "grad_norm": 42.04032897949219, + "learning_rate": 9.79908675799087e-06, + "loss": 1.2245, + "step": 1294 + }, + { + "epoch": 1.182648401826484, + "grad_norm": 37.307647705078125, + "learning_rate": 9.798072044647388e-06, + "loss": 0.7889, + "step": 1295 + }, + { + "epoch": 1.1835616438356165, + "grad_norm": 9.517409324645996, + "learning_rate": 9.797057331303907e-06, + "loss": 0.2276, + "step": 1296 + }, + { + "epoch": 1.184474885844749, + "grad_norm": 73.59767150878906, + "learning_rate": 9.796042617960428e-06, + "loss": 1.7719, + "step": 1297 + }, + { + "epoch": 1.1853881278538814, + "grad_norm": 4.462095737457275, + "learning_rate": 9.795027904616946e-06, + "loss": 0.0425, + "step": 1298 + }, + { + "epoch": 1.1863013698630138, + "grad_norm": 3.181561231613159, + "learning_rate": 9.794013191273465e-06, + "loss": 0.0448, + "step": 1299 + }, + { + "epoch": 1.187214611872146, + "grad_norm": 14.322896003723145, + "learning_rate": 9.792998477929986e-06, + "loss": 0.3161, + "step": 1300 + }, + { + "epoch": 1.1881278538812785, + "grad_norm": 33.243621826171875, + "learning_rate": 9.791983764586505e-06, + "loss": 0.594, + "step": 1301 + }, + { + "epoch": 1.189041095890411, + "grad_norm": 8.096213340759277, + "learning_rate": 9.790969051243025e-06, + "loss": 0.1452, + "step": 1302 + }, + { + "epoch": 1.1899543378995434, + "grad_norm": 12.708288192749023, + "learning_rate": 9.789954337899544e-06, + "loss": 0.2702, + "step": 1303 + }, + { + "epoch": 1.1908675799086759, + "grad_norm": 2.683201789855957, + "learning_rate": 9.788939624556065e-06, + "loss": 0.0383, + "step": 1304 + }, + { + "epoch": 1.191780821917808, + "grad_norm": 0.9311400651931763, + "learning_rate": 9.787924911212583e-06, + "loss": 0.0142, + "step": 1305 + }, + { + "epoch": 1.1926940639269406, + "grad_norm": 5.295901298522949, + "learning_rate": 9.786910197869102e-06, + "loss": 0.084, + "step": 1306 + }, + { + "epoch": 1.193607305936073, + "grad_norm": 34.207733154296875, + "learning_rate": 9.785895484525623e-06, + "loss": 0.862, + "step": 1307 + }, + { + "epoch": 1.1945205479452055, + "grad_norm": 14.41832160949707, + "learning_rate": 9.784880771182142e-06, + "loss": 0.2122, + "step": 1308 + }, + { + "epoch": 1.195433789954338, + "grad_norm": 34.648555755615234, + "learning_rate": 9.78386605783866e-06, + "loss": 0.6791, + "step": 1309 + }, + { + "epoch": 1.1963470319634704, + "grad_norm": 34.19355010986328, + "learning_rate": 9.782851344495181e-06, + "loss": 0.9617, + "step": 1310 + }, + { + "epoch": 1.1972602739726028, + "grad_norm": 5.954439163208008, + "learning_rate": 9.7818366311517e-06, + "loss": 0.084, + "step": 1311 + }, + { + "epoch": 1.1981735159817353, + "grad_norm": 12.294219017028809, + "learning_rate": 9.78082191780822e-06, + "loss": 0.1892, + "step": 1312 + }, + { + "epoch": 1.1990867579908675, + "grad_norm": 54.13908767700195, + "learning_rate": 9.77980720446474e-06, + "loss": 2.2258, + "step": 1313 + }, + { + "epoch": 1.2, + "grad_norm": 3.8752927780151367, + "learning_rate": 9.77879249112126e-06, + "loss": 0.0404, + "step": 1314 + }, + { + "epoch": 1.2009132420091324, + "grad_norm": 18.28192901611328, + "learning_rate": 9.777777777777779e-06, + "loss": 0.2791, + "step": 1315 + }, + { + "epoch": 1.2018264840182649, + "grad_norm": 67.26031494140625, + "learning_rate": 9.776763064434297e-06, + "loss": 3.8751, + "step": 1316 + }, + { + "epoch": 1.2027397260273973, + "grad_norm": 13.156109809875488, + "learning_rate": 9.775748351090818e-06, + "loss": 0.2358, + "step": 1317 + }, + { + "epoch": 1.2036529680365298, + "grad_norm": 28.64256477355957, + "learning_rate": 9.774733637747337e-06, + "loss": 0.6248, + "step": 1318 + }, + { + "epoch": 1.204566210045662, + "grad_norm": 30.879308700561523, + "learning_rate": 9.773718924403857e-06, + "loss": 0.4688, + "step": 1319 + }, + { + "epoch": 1.2054794520547945, + "grad_norm": 28.877111434936523, + "learning_rate": 9.772704211060376e-06, + "loss": 0.696, + "step": 1320 + }, + { + "epoch": 1.206392694063927, + "grad_norm": 3.4045639038085938, + "learning_rate": 9.771689497716895e-06, + "loss": 0.0617, + "step": 1321 + }, + { + "epoch": 1.2073059360730594, + "grad_norm": 31.455286026000977, + "learning_rate": 9.770674784373416e-06, + "loss": 0.655, + "step": 1322 + }, + { + "epoch": 1.2082191780821918, + "grad_norm": 49.5270881652832, + "learning_rate": 9.769660071029934e-06, + "loss": 0.7373, + "step": 1323 + }, + { + "epoch": 1.2091324200913243, + "grad_norm": 35.715274810791016, + "learning_rate": 9.768645357686455e-06, + "loss": 0.6967, + "step": 1324 + }, + { + "epoch": 1.2100456621004567, + "grad_norm": 10.225111961364746, + "learning_rate": 9.767630644342974e-06, + "loss": 0.1333, + "step": 1325 + }, + { + "epoch": 1.210958904109589, + "grad_norm": 17.6523380279541, + "learning_rate": 9.766615930999493e-06, + "loss": 0.241, + "step": 1326 + }, + { + "epoch": 1.2118721461187214, + "grad_norm": 17.956205368041992, + "learning_rate": 9.765601217656013e-06, + "loss": 0.3455, + "step": 1327 + }, + { + "epoch": 1.2127853881278539, + "grad_norm": 64.51287078857422, + "learning_rate": 9.764586504312532e-06, + "loss": 3.1251, + "step": 1328 + }, + { + "epoch": 1.2136986301369863, + "grad_norm": 53.118263244628906, + "learning_rate": 9.763571790969053e-06, + "loss": 1.4628, + "step": 1329 + }, + { + "epoch": 1.2146118721461188, + "grad_norm": 34.2100944519043, + "learning_rate": 9.762557077625571e-06, + "loss": 0.4582, + "step": 1330 + }, + { + "epoch": 1.2155251141552512, + "grad_norm": 16.1262264251709, + "learning_rate": 9.76154236428209e-06, + "loss": 0.3037, + "step": 1331 + }, + { + "epoch": 1.2164383561643834, + "grad_norm": 43.19185256958008, + "learning_rate": 9.760527650938611e-06, + "loss": 1.2298, + "step": 1332 + }, + { + "epoch": 1.217351598173516, + "grad_norm": 21.861408233642578, + "learning_rate": 9.75951293759513e-06, + "loss": 0.3021, + "step": 1333 + }, + { + "epoch": 1.2182648401826484, + "grad_norm": 19.625431060791016, + "learning_rate": 9.75849822425165e-06, + "loss": 0.4201, + "step": 1334 + }, + { + "epoch": 1.2191780821917808, + "grad_norm": 1.7968097925186157, + "learning_rate": 9.757483510908169e-06, + "loss": 0.0283, + "step": 1335 + }, + { + "epoch": 1.2200913242009133, + "grad_norm": 6.749724388122559, + "learning_rate": 9.756468797564688e-06, + "loss": 0.1261, + "step": 1336 + }, + { + "epoch": 1.2210045662100457, + "grad_norm": 19.47516632080078, + "learning_rate": 9.755454084221208e-06, + "loss": 0.315, + "step": 1337 + }, + { + "epoch": 1.2219178082191782, + "grad_norm": 9.942461967468262, + "learning_rate": 9.754439370877727e-06, + "loss": 0.2051, + "step": 1338 + }, + { + "epoch": 1.2228310502283106, + "grad_norm": 42.48537063598633, + "learning_rate": 9.753424657534248e-06, + "loss": 1.4939, + "step": 1339 + }, + { + "epoch": 1.2237442922374429, + "grad_norm": 12.771733283996582, + "learning_rate": 9.752409944190767e-06, + "loss": 0.1498, + "step": 1340 + }, + { + "epoch": 1.2246575342465753, + "grad_norm": 68.5490951538086, + "learning_rate": 9.751395230847286e-06, + "loss": 4.5597, + "step": 1341 + }, + { + "epoch": 1.2255707762557078, + "grad_norm": 21.719257354736328, + "learning_rate": 9.750380517503806e-06, + "loss": 0.311, + "step": 1342 + }, + { + "epoch": 1.2264840182648402, + "grad_norm": 45.787445068359375, + "learning_rate": 9.749365804160325e-06, + "loss": 1.0146, + "step": 1343 + }, + { + "epoch": 1.2273972602739727, + "grad_norm": 47.37062072753906, + "learning_rate": 9.748351090816845e-06, + "loss": 1.7158, + "step": 1344 + }, + { + "epoch": 1.228310502283105, + "grad_norm": 12.458727836608887, + "learning_rate": 9.747336377473364e-06, + "loss": 0.2046, + "step": 1345 + }, + { + "epoch": 1.2292237442922374, + "grad_norm": 17.398040771484375, + "learning_rate": 9.746321664129885e-06, + "loss": 0.2651, + "step": 1346 + }, + { + "epoch": 1.2301369863013698, + "grad_norm": 32.49959945678711, + "learning_rate": 9.745306950786404e-06, + "loss": 0.4719, + "step": 1347 + }, + { + "epoch": 1.2310502283105023, + "grad_norm": 4.784490585327148, + "learning_rate": 9.744292237442923e-06, + "loss": 0.0812, + "step": 1348 + }, + { + "epoch": 1.2319634703196347, + "grad_norm": 46.62278747558594, + "learning_rate": 9.743277524099443e-06, + "loss": 1.2272, + "step": 1349 + }, + { + "epoch": 1.2328767123287672, + "grad_norm": 29.004411697387695, + "learning_rate": 9.742262810755962e-06, + "loss": 0.6146, + "step": 1350 + }, + { + "epoch": 1.2337899543378996, + "grad_norm": 22.679218292236328, + "learning_rate": 9.74124809741248e-06, + "loss": 0.3649, + "step": 1351 + }, + { + "epoch": 1.234703196347032, + "grad_norm": 42.56064224243164, + "learning_rate": 9.740233384069001e-06, + "loss": 1.3358, + "step": 1352 + }, + { + "epoch": 1.2356164383561643, + "grad_norm": 30.122631072998047, + "learning_rate": 9.73921867072552e-06, + "loss": 0.5824, + "step": 1353 + }, + { + "epoch": 1.2365296803652968, + "grad_norm": 18.59619903564453, + "learning_rate": 9.73820395738204e-06, + "loss": 0.2156, + "step": 1354 + }, + { + "epoch": 1.2374429223744292, + "grad_norm": 47.616397857666016, + "learning_rate": 9.73718924403856e-06, + "loss": 1.683, + "step": 1355 + }, + { + "epoch": 1.2383561643835617, + "grad_norm": 27.933231353759766, + "learning_rate": 9.73617453069508e-06, + "loss": 0.5605, + "step": 1356 + }, + { + "epoch": 1.2392694063926941, + "grad_norm": 62.41736602783203, + "learning_rate": 9.735159817351599e-06, + "loss": 1.6479, + "step": 1357 + }, + { + "epoch": 1.2401826484018266, + "grad_norm": 33.498939514160156, + "learning_rate": 9.734145104008118e-06, + "loss": 0.7567, + "step": 1358 + }, + { + "epoch": 1.2410958904109588, + "grad_norm": 21.877967834472656, + "learning_rate": 9.733130390664638e-06, + "loss": 0.3916, + "step": 1359 + }, + { + "epoch": 1.2420091324200913, + "grad_norm": 13.661853790283203, + "learning_rate": 9.732115677321157e-06, + "loss": 0.2285, + "step": 1360 + }, + { + "epoch": 1.2429223744292237, + "grad_norm": 29.216812133789062, + "learning_rate": 9.731100963977676e-06, + "loss": 0.5225, + "step": 1361 + }, + { + "epoch": 1.2438356164383562, + "grad_norm": 14.63206958770752, + "learning_rate": 9.730086250634197e-06, + "loss": 0.1635, + "step": 1362 + }, + { + "epoch": 1.2447488584474886, + "grad_norm": 6.082688331604004, + "learning_rate": 9.729071537290717e-06, + "loss": 0.0854, + "step": 1363 + }, + { + "epoch": 1.245662100456621, + "grad_norm": 40.588783264160156, + "learning_rate": 9.728056823947236e-06, + "loss": 1.152, + "step": 1364 + }, + { + "epoch": 1.2465753424657535, + "grad_norm": 2.896165370941162, + "learning_rate": 9.727042110603755e-06, + "loss": 0.0481, + "step": 1365 + }, + { + "epoch": 1.2474885844748858, + "grad_norm": 16.9880428314209, + "learning_rate": 9.726027397260275e-06, + "loss": 0.3086, + "step": 1366 + }, + { + "epoch": 1.2484018264840182, + "grad_norm": 73.4358901977539, + "learning_rate": 9.725012683916794e-06, + "loss": 2.4747, + "step": 1367 + }, + { + "epoch": 1.2493150684931507, + "grad_norm": 22.47654914855957, + "learning_rate": 9.723997970573313e-06, + "loss": 0.4253, + "step": 1368 + }, + { + "epoch": 1.2502283105022831, + "grad_norm": 42.28277587890625, + "learning_rate": 9.722983257229834e-06, + "loss": 1.1296, + "step": 1369 + }, + { + "epoch": 1.2511415525114156, + "grad_norm": 29.762554168701172, + "learning_rate": 9.721968543886352e-06, + "loss": 0.5859, + "step": 1370 + }, + { + "epoch": 1.252054794520548, + "grad_norm": 9.078088760375977, + "learning_rate": 9.720953830542871e-06, + "loss": 0.1637, + "step": 1371 + }, + { + "epoch": 1.2529680365296803, + "grad_norm": 20.324073791503906, + "learning_rate": 9.719939117199392e-06, + "loss": 0.3485, + "step": 1372 + }, + { + "epoch": 1.2538812785388127, + "grad_norm": 13.20418930053711, + "learning_rate": 9.718924403855912e-06, + "loss": 0.1711, + "step": 1373 + }, + { + "epoch": 1.2547945205479452, + "grad_norm": 1.1330840587615967, + "learning_rate": 9.717909690512431e-06, + "loss": 0.011, + "step": 1374 + }, + { + "epoch": 1.2557077625570776, + "grad_norm": 22.714250564575195, + "learning_rate": 9.71689497716895e-06, + "loss": 0.3893, + "step": 1375 + }, + { + "epoch": 1.25662100456621, + "grad_norm": 10.159333229064941, + "learning_rate": 9.71588026382547e-06, + "loss": 0.151, + "step": 1376 + }, + { + "epoch": 1.2575342465753425, + "grad_norm": 63.729766845703125, + "learning_rate": 9.71486555048199e-06, + "loss": 1.3613, + "step": 1377 + }, + { + "epoch": 1.258447488584475, + "grad_norm": 10.70964241027832, + "learning_rate": 9.713850837138508e-06, + "loss": 0.1223, + "step": 1378 + }, + { + "epoch": 1.2593607305936074, + "grad_norm": 65.06957244873047, + "learning_rate": 9.712836123795029e-06, + "loss": 1.8224, + "step": 1379 + }, + { + "epoch": 1.2602739726027397, + "grad_norm": 23.739675521850586, + "learning_rate": 9.711821410451548e-06, + "loss": 0.3868, + "step": 1380 + }, + { + "epoch": 1.261187214611872, + "grad_norm": 21.039806365966797, + "learning_rate": 9.710806697108066e-06, + "loss": 0.3269, + "step": 1381 + }, + { + "epoch": 1.2621004566210046, + "grad_norm": 23.28934669494629, + "learning_rate": 9.709791983764587e-06, + "loss": 0.4285, + "step": 1382 + }, + { + "epoch": 1.263013698630137, + "grad_norm": 2.6861157417297363, + "learning_rate": 9.708777270421108e-06, + "loss": 0.0289, + "step": 1383 + }, + { + "epoch": 1.2639269406392695, + "grad_norm": 33.12947463989258, + "learning_rate": 9.707762557077626e-06, + "loss": 0.6333, + "step": 1384 + }, + { + "epoch": 1.2648401826484017, + "grad_norm": 33.663177490234375, + "learning_rate": 9.706747843734145e-06, + "loss": 0.7387, + "step": 1385 + }, + { + "epoch": 1.2657534246575342, + "grad_norm": 4.892024040222168, + "learning_rate": 9.705733130390666e-06, + "loss": 0.0526, + "step": 1386 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 25.35021209716797, + "learning_rate": 9.704718417047185e-06, + "loss": 0.4126, + "step": 1387 + }, + { + "epoch": 1.267579908675799, + "grad_norm": 18.48503875732422, + "learning_rate": 9.703703703703703e-06, + "loss": 0.2574, + "step": 1388 + }, + { + "epoch": 1.2684931506849315, + "grad_norm": 2.1987931728363037, + "learning_rate": 9.702688990360224e-06, + "loss": 0.0218, + "step": 1389 + }, + { + "epoch": 1.269406392694064, + "grad_norm": 22.297115325927734, + "learning_rate": 9.701674277016745e-06, + "loss": 0.3876, + "step": 1390 + }, + { + "epoch": 1.2703196347031964, + "grad_norm": 51.64357376098633, + "learning_rate": 9.700659563673263e-06, + "loss": 1.8263, + "step": 1391 + }, + { + "epoch": 1.2712328767123289, + "grad_norm": 13.798107147216797, + "learning_rate": 9.699644850329782e-06, + "loss": 0.1685, + "step": 1392 + }, + { + "epoch": 1.272146118721461, + "grad_norm": 10.10886001586914, + "learning_rate": 9.698630136986303e-06, + "loss": 0.1247, + "step": 1393 + }, + { + "epoch": 1.2730593607305936, + "grad_norm": 47.06930923461914, + "learning_rate": 9.697615423642822e-06, + "loss": 0.5886, + "step": 1394 + }, + { + "epoch": 1.273972602739726, + "grad_norm": 41.145626068115234, + "learning_rate": 9.69660071029934e-06, + "loss": 0.7122, + "step": 1395 + }, + { + "epoch": 1.2748858447488585, + "grad_norm": 3.6694250106811523, + "learning_rate": 9.695585996955861e-06, + "loss": 0.0528, + "step": 1396 + }, + { + "epoch": 1.275799086757991, + "grad_norm": 49.31795120239258, + "learning_rate": 9.69457128361238e-06, + "loss": 1.4268, + "step": 1397 + }, + { + "epoch": 1.2767123287671232, + "grad_norm": 55.86100769042969, + "learning_rate": 9.693556570268899e-06, + "loss": 1.8733, + "step": 1398 + }, + { + "epoch": 1.2776255707762556, + "grad_norm": 43.270442962646484, + "learning_rate": 9.69254185692542e-06, + "loss": 0.7456, + "step": 1399 + }, + { + "epoch": 1.278538812785388, + "grad_norm": 26.0916805267334, + "learning_rate": 9.69152714358194e-06, + "loss": 0.4608, + "step": 1400 + }, + { + "epoch": 1.2794520547945205, + "grad_norm": 55.685279846191406, + "learning_rate": 9.690512430238459e-06, + "loss": 1.0923, + "step": 1401 + }, + { + "epoch": 1.280365296803653, + "grad_norm": 17.07508087158203, + "learning_rate": 9.689497716894977e-06, + "loss": 0.2465, + "step": 1402 + }, + { + "epoch": 1.2812785388127854, + "grad_norm": 20.355175018310547, + "learning_rate": 9.688483003551498e-06, + "loss": 0.3179, + "step": 1403 + }, + { + "epoch": 1.2821917808219179, + "grad_norm": 42.15535354614258, + "learning_rate": 9.687468290208017e-06, + "loss": 0.6694, + "step": 1404 + }, + { + "epoch": 1.2831050228310503, + "grad_norm": 38.87717056274414, + "learning_rate": 9.686453576864536e-06, + "loss": 0.714, + "step": 1405 + }, + { + "epoch": 1.2840182648401828, + "grad_norm": 49.489871978759766, + "learning_rate": 9.685438863521056e-06, + "loss": 1.0364, + "step": 1406 + }, + { + "epoch": 1.284931506849315, + "grad_norm": 1.345414400100708, + "learning_rate": 9.684424150177577e-06, + "loss": 0.0133, + "step": 1407 + }, + { + "epoch": 1.2858447488584475, + "grad_norm": 0.8824647665023804, + "learning_rate": 9.683409436834094e-06, + "loss": 0.0123, + "step": 1408 + }, + { + "epoch": 1.28675799086758, + "grad_norm": 19.743803024291992, + "learning_rate": 9.682394723490614e-06, + "loss": 0.4465, + "step": 1409 + }, + { + "epoch": 1.2876712328767124, + "grad_norm": 0.8950315713882446, + "learning_rate": 9.681380010147135e-06, + "loss": 0.0117, + "step": 1410 + }, + { + "epoch": 1.2885844748858448, + "grad_norm": 25.01788330078125, + "learning_rate": 9.680365296803654e-06, + "loss": 0.4085, + "step": 1411 + }, + { + "epoch": 1.289497716894977, + "grad_norm": 66.33527374267578, + "learning_rate": 9.679350583460173e-06, + "loss": 2.7356, + "step": 1412 + }, + { + "epoch": 1.2904109589041095, + "grad_norm": 3.5628650188446045, + "learning_rate": 9.678335870116693e-06, + "loss": 0.0428, + "step": 1413 + }, + { + "epoch": 1.291324200913242, + "grad_norm": 2.682762622833252, + "learning_rate": 9.677321156773212e-06, + "loss": 0.0451, + "step": 1414 + }, + { + "epoch": 1.2922374429223744, + "grad_norm": 49.91830062866211, + "learning_rate": 9.676306443429731e-06, + "loss": 1.327, + "step": 1415 + }, + { + "epoch": 1.2931506849315069, + "grad_norm": 59.67395782470703, + "learning_rate": 9.675291730086251e-06, + "loss": 1.1297, + "step": 1416 + }, + { + "epoch": 1.2940639269406393, + "grad_norm": 23.718154907226562, + "learning_rate": 9.674277016742772e-06, + "loss": 0.511, + "step": 1417 + }, + { + "epoch": 1.2949771689497718, + "grad_norm": 5.683130264282227, + "learning_rate": 9.67326230339929e-06, + "loss": 0.0646, + "step": 1418 + }, + { + "epoch": 1.2958904109589042, + "grad_norm": 58.728721618652344, + "learning_rate": 9.67224759005581e-06, + "loss": 1.6681, + "step": 1419 + }, + { + "epoch": 1.2968036529680365, + "grad_norm": 72.99105072021484, + "learning_rate": 9.67123287671233e-06, + "loss": 3.1088, + "step": 1420 + }, + { + "epoch": 1.297716894977169, + "grad_norm": 33.09652328491211, + "learning_rate": 9.670218163368849e-06, + "loss": 0.7736, + "step": 1421 + }, + { + "epoch": 1.2986301369863014, + "grad_norm": 30.904972076416016, + "learning_rate": 9.669203450025368e-06, + "loss": 0.9249, + "step": 1422 + }, + { + "epoch": 1.2995433789954338, + "grad_norm": 58.64447021484375, + "learning_rate": 9.668188736681888e-06, + "loss": 2.2899, + "step": 1423 + }, + { + "epoch": 1.3004566210045663, + "grad_norm": 71.54244995117188, + "learning_rate": 9.667174023338409e-06, + "loss": 0.8728, + "step": 1424 + }, + { + "epoch": 1.3013698630136985, + "grad_norm": 4.404160499572754, + "learning_rate": 9.666159309994926e-06, + "loss": 0.0534, + "step": 1425 + }, + { + "epoch": 1.302283105022831, + "grad_norm": 13.802167892456055, + "learning_rate": 9.665144596651447e-06, + "loss": 0.3101, + "step": 1426 + }, + { + "epoch": 1.3031963470319634, + "grad_norm": 27.46969223022461, + "learning_rate": 9.664129883307967e-06, + "loss": 0.2479, + "step": 1427 + }, + { + "epoch": 1.3041095890410959, + "grad_norm": 43.584781646728516, + "learning_rate": 9.663115169964486e-06, + "loss": 1.1102, + "step": 1428 + }, + { + "epoch": 1.3050228310502283, + "grad_norm": 42.28525161743164, + "learning_rate": 9.662100456621005e-06, + "loss": 1.2682, + "step": 1429 + }, + { + "epoch": 1.3059360730593608, + "grad_norm": 6.439974308013916, + "learning_rate": 9.661085743277525e-06, + "loss": 0.0691, + "step": 1430 + }, + { + "epoch": 1.3068493150684932, + "grad_norm": 10.394142150878906, + "learning_rate": 9.660071029934044e-06, + "loss": 0.197, + "step": 1431 + }, + { + "epoch": 1.3077625570776257, + "grad_norm": 48.618194580078125, + "learning_rate": 9.659056316590563e-06, + "loss": 0.8464, + "step": 1432 + }, + { + "epoch": 1.3086757990867581, + "grad_norm": 13.243879318237305, + "learning_rate": 9.658041603247084e-06, + "loss": 0.2171, + "step": 1433 + }, + { + "epoch": 1.3095890410958904, + "grad_norm": 33.1775016784668, + "learning_rate": 9.657026889903604e-06, + "loss": 0.585, + "step": 1434 + }, + { + "epoch": 1.3105022831050228, + "grad_norm": 29.087247848510742, + "learning_rate": 9.656012176560123e-06, + "loss": 0.4684, + "step": 1435 + }, + { + "epoch": 1.3114155251141553, + "grad_norm": 45.87799072265625, + "learning_rate": 9.654997463216642e-06, + "loss": 1.2234, + "step": 1436 + }, + { + "epoch": 1.3123287671232877, + "grad_norm": 71.58831787109375, + "learning_rate": 9.653982749873162e-06, + "loss": 2.8581, + "step": 1437 + }, + { + "epoch": 1.3132420091324202, + "grad_norm": 50.051815032958984, + "learning_rate": 9.652968036529681e-06, + "loss": 1.1369, + "step": 1438 + }, + { + "epoch": 1.3141552511415524, + "grad_norm": 26.265668869018555, + "learning_rate": 9.6519533231862e-06, + "loss": 0.7819, + "step": 1439 + }, + { + "epoch": 1.3150684931506849, + "grad_norm": 12.57568073272705, + "learning_rate": 9.65093860984272e-06, + "loss": 0.1806, + "step": 1440 + }, + { + "epoch": 1.3159817351598173, + "grad_norm": 61.04334259033203, + "learning_rate": 9.64992389649924e-06, + "loss": 3.4739, + "step": 1441 + }, + { + "epoch": 1.3168949771689498, + "grad_norm": 48.069068908691406, + "learning_rate": 9.648909183155758e-06, + "loss": 1.0431, + "step": 1442 + }, + { + "epoch": 1.3178082191780822, + "grad_norm": 36.22300720214844, + "learning_rate": 9.647894469812279e-06, + "loss": 0.849, + "step": 1443 + }, + { + "epoch": 1.3187214611872147, + "grad_norm": 5.593836784362793, + "learning_rate": 9.6468797564688e-06, + "loss": 0.1013, + "step": 1444 + }, + { + "epoch": 1.3196347031963471, + "grad_norm": 13.923113822937012, + "learning_rate": 9.645865043125318e-06, + "loss": 0.2192, + "step": 1445 + }, + { + "epoch": 1.3205479452054796, + "grad_norm": 40.375431060791016, + "learning_rate": 9.644850329781837e-06, + "loss": 1.7102, + "step": 1446 + }, + { + "epoch": 1.3214611872146118, + "grad_norm": 26.83962059020996, + "learning_rate": 9.643835616438358e-06, + "loss": 0.5733, + "step": 1447 + }, + { + "epoch": 1.3223744292237443, + "grad_norm": 24.710956573486328, + "learning_rate": 9.642820903094877e-06, + "loss": 0.5362, + "step": 1448 + }, + { + "epoch": 1.3232876712328767, + "grad_norm": 35.9326057434082, + "learning_rate": 9.641806189751395e-06, + "loss": 0.9218, + "step": 1449 + }, + { + "epoch": 1.3242009132420092, + "grad_norm": 9.297933578491211, + "learning_rate": 9.640791476407916e-06, + "loss": 0.1337, + "step": 1450 + }, + { + "epoch": 1.3251141552511416, + "grad_norm": 28.742053985595703, + "learning_rate": 9.639776763064435e-06, + "loss": 0.5523, + "step": 1451 + }, + { + "epoch": 1.3260273972602739, + "grad_norm": 7.808824062347412, + "learning_rate": 9.638762049720954e-06, + "loss": 0.1082, + "step": 1452 + }, + { + "epoch": 1.3269406392694063, + "grad_norm": 14.777586936950684, + "learning_rate": 9.637747336377474e-06, + "loss": 0.2242, + "step": 1453 + }, + { + "epoch": 1.3278538812785388, + "grad_norm": 12.6802978515625, + "learning_rate": 9.636732623033995e-06, + "loss": 0.2237, + "step": 1454 + }, + { + "epoch": 1.3287671232876712, + "grad_norm": 8.94244384765625, + "learning_rate": 9.635717909690514e-06, + "loss": 0.1404, + "step": 1455 + }, + { + "epoch": 1.3296803652968037, + "grad_norm": 42.40574645996094, + "learning_rate": 9.634703196347032e-06, + "loss": 1.3634, + "step": 1456 + }, + { + "epoch": 1.3305936073059361, + "grad_norm": 36.14883804321289, + "learning_rate": 9.633688483003553e-06, + "loss": 0.9918, + "step": 1457 + }, + { + "epoch": 1.3315068493150686, + "grad_norm": 8.181112289428711, + "learning_rate": 9.632673769660072e-06, + "loss": 0.1415, + "step": 1458 + }, + { + "epoch": 1.332420091324201, + "grad_norm": 18.23818588256836, + "learning_rate": 9.63165905631659e-06, + "loss": 0.2308, + "step": 1459 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.31242915987968445, + "learning_rate": 9.630644342973111e-06, + "loss": 0.0039, + "step": 1460 + }, + { + "epoch": 1.3342465753424657, + "grad_norm": 36.6351318359375, + "learning_rate": 9.62962962962963e-06, + "loss": 0.7642, + "step": 1461 + }, + { + "epoch": 1.3351598173515982, + "grad_norm": 27.62470245361328, + "learning_rate": 9.62861491628615e-06, + "loss": 0.841, + "step": 1462 + }, + { + "epoch": 1.3360730593607306, + "grad_norm": 20.4239444732666, + "learning_rate": 9.62760020294267e-06, + "loss": 0.3698, + "step": 1463 + }, + { + "epoch": 1.336986301369863, + "grad_norm": 6.474637508392334, + "learning_rate": 9.62658548959919e-06, + "loss": 0.1044, + "step": 1464 + }, + { + "epoch": 1.3378995433789953, + "grad_norm": 29.797618865966797, + "learning_rate": 9.625570776255709e-06, + "loss": 0.6451, + "step": 1465 + }, + { + "epoch": 1.3388127853881278, + "grad_norm": 49.083351135253906, + "learning_rate": 9.624556062912228e-06, + "loss": 0.8909, + "step": 1466 + }, + { + "epoch": 1.3397260273972602, + "grad_norm": 2.2635443210601807, + "learning_rate": 9.623541349568748e-06, + "loss": 0.041, + "step": 1467 + }, + { + "epoch": 1.3406392694063927, + "grad_norm": 71.11846923828125, + "learning_rate": 9.622526636225267e-06, + "loss": 2.4965, + "step": 1468 + }, + { + "epoch": 1.3415525114155251, + "grad_norm": 57.882415771484375, + "learning_rate": 9.621511922881786e-06, + "loss": 1.6261, + "step": 1469 + }, + { + "epoch": 1.3424657534246576, + "grad_norm": 28.801607131958008, + "learning_rate": 9.620497209538306e-06, + "loss": 0.5976, + "step": 1470 + }, + { + "epoch": 1.34337899543379, + "grad_norm": 41.701446533203125, + "learning_rate": 9.619482496194825e-06, + "loss": 0.8452, + "step": 1471 + }, + { + "epoch": 1.3442922374429225, + "grad_norm": 31.361286163330078, + "learning_rate": 9.618467782851346e-06, + "loss": 0.4684, + "step": 1472 + }, + { + "epoch": 1.345205479452055, + "grad_norm": 30.60369110107422, + "learning_rate": 9.617453069507865e-06, + "loss": 0.8274, + "step": 1473 + }, + { + "epoch": 1.3461187214611872, + "grad_norm": 2.064893960952759, + "learning_rate": 9.616438356164385e-06, + "loss": 0.0208, + "step": 1474 + }, + { + "epoch": 1.3470319634703196, + "grad_norm": 24.43122100830078, + "learning_rate": 9.615423642820904e-06, + "loss": 0.4563, + "step": 1475 + }, + { + "epoch": 1.347945205479452, + "grad_norm": 14.091416358947754, + "learning_rate": 9.614408929477423e-06, + "loss": 0.2053, + "step": 1476 + }, + { + "epoch": 1.3488584474885845, + "grad_norm": 32.81586837768555, + "learning_rate": 9.613394216133943e-06, + "loss": 0.3806, + "step": 1477 + }, + { + "epoch": 1.349771689497717, + "grad_norm": 1.6426706314086914, + "learning_rate": 9.612379502790462e-06, + "loss": 0.0264, + "step": 1478 + }, + { + "epoch": 1.3506849315068492, + "grad_norm": 34.412567138671875, + "learning_rate": 9.611364789446983e-06, + "loss": 1.1683, + "step": 1479 + }, + { + "epoch": 1.3515981735159817, + "grad_norm": 20.966318130493164, + "learning_rate": 9.610350076103502e-06, + "loss": 0.3821, + "step": 1480 + }, + { + "epoch": 1.3525114155251141, + "grad_norm": 39.870887756347656, + "learning_rate": 9.60933536276002e-06, + "loss": 1.2616, + "step": 1481 + }, + { + "epoch": 1.3534246575342466, + "grad_norm": 28.10563850402832, + "learning_rate": 9.608320649416541e-06, + "loss": 0.5629, + "step": 1482 + }, + { + "epoch": 1.354337899543379, + "grad_norm": 34.950931549072266, + "learning_rate": 9.60730593607306e-06, + "loss": 0.9807, + "step": 1483 + }, + { + "epoch": 1.3552511415525115, + "grad_norm": 65.75867462158203, + "learning_rate": 9.60629122272958e-06, + "loss": 3.6203, + "step": 1484 + }, + { + "epoch": 1.356164383561644, + "grad_norm": 47.03571319580078, + "learning_rate": 9.6052765093861e-06, + "loss": 1.2442, + "step": 1485 + }, + { + "epoch": 1.3570776255707764, + "grad_norm": 19.531063079833984, + "learning_rate": 9.604261796042618e-06, + "loss": 0.2834, + "step": 1486 + }, + { + "epoch": 1.3579908675799086, + "grad_norm": 59.54323196411133, + "learning_rate": 9.603247082699139e-06, + "loss": 1.0783, + "step": 1487 + }, + { + "epoch": 1.358904109589041, + "grad_norm": 30.476926803588867, + "learning_rate": 9.602232369355657e-06, + "loss": 0.5907, + "step": 1488 + }, + { + "epoch": 1.3598173515981735, + "grad_norm": 10.693446159362793, + "learning_rate": 9.601217656012178e-06, + "loss": 0.1987, + "step": 1489 + }, + { + "epoch": 1.360730593607306, + "grad_norm": 49.92268371582031, + "learning_rate": 9.600202942668697e-06, + "loss": 1.3322, + "step": 1490 + }, + { + "epoch": 1.3616438356164384, + "grad_norm": 10.584304809570312, + "learning_rate": 9.599188229325216e-06, + "loss": 0.2341, + "step": 1491 + }, + { + "epoch": 1.3625570776255707, + "grad_norm": 2.4408230781555176, + "learning_rate": 9.598173515981736e-06, + "loss": 0.0394, + "step": 1492 + }, + { + "epoch": 1.363470319634703, + "grad_norm": 4.205300331115723, + "learning_rate": 9.597158802638255e-06, + "loss": 0.0531, + "step": 1493 + }, + { + "epoch": 1.3643835616438356, + "grad_norm": 5.087258338928223, + "learning_rate": 9.596144089294776e-06, + "loss": 0.0682, + "step": 1494 + }, + { + "epoch": 1.365296803652968, + "grad_norm": 36.46416473388672, + "learning_rate": 9.595129375951294e-06, + "loss": 0.6205, + "step": 1495 + }, + { + "epoch": 1.3662100456621005, + "grad_norm": 69.49920654296875, + "learning_rate": 9.594114662607813e-06, + "loss": 5.3157, + "step": 1496 + }, + { + "epoch": 1.367123287671233, + "grad_norm": 68.72769165039062, + "learning_rate": 9.593099949264334e-06, + "loss": 3.4061, + "step": 1497 + }, + { + "epoch": 1.3680365296803654, + "grad_norm": 2.5967040061950684, + "learning_rate": 9.592085235920853e-06, + "loss": 0.0345, + "step": 1498 + }, + { + "epoch": 1.3689497716894978, + "grad_norm": 18.941587448120117, + "learning_rate": 9.591070522577373e-06, + "loss": 0.3397, + "step": 1499 + }, + { + "epoch": 1.36986301369863, + "grad_norm": 51.95602035522461, + "learning_rate": 9.590055809233892e-06, + "loss": 2.4274, + "step": 1500 + }, + { + "epoch": 1.3707762557077625, + "grad_norm": 56.73247528076172, + "learning_rate": 9.589041095890411e-06, + "loss": 1.5928, + "step": 1501 + }, + { + "epoch": 1.371689497716895, + "grad_norm": 21.059228897094727, + "learning_rate": 9.588026382546931e-06, + "loss": 0.3887, + "step": 1502 + }, + { + "epoch": 1.3726027397260274, + "grad_norm": 40.32494354248047, + "learning_rate": 9.58701166920345e-06, + "loss": 1.3621, + "step": 1503 + }, + { + "epoch": 1.3735159817351599, + "grad_norm": 34.133514404296875, + "learning_rate": 9.58599695585997e-06, + "loss": 0.8323, + "step": 1504 + }, + { + "epoch": 1.374429223744292, + "grad_norm": 61.046180725097656, + "learning_rate": 9.58498224251649e-06, + "loss": 2.8841, + "step": 1505 + }, + { + "epoch": 1.3753424657534246, + "grad_norm": 40.047706604003906, + "learning_rate": 9.58396752917301e-06, + "loss": 1.48, + "step": 1506 + }, + { + "epoch": 1.376255707762557, + "grad_norm": 17.490833282470703, + "learning_rate": 9.582952815829529e-06, + "loss": 0.3518, + "step": 1507 + }, + { + "epoch": 1.3771689497716895, + "grad_norm": 16.647075653076172, + "learning_rate": 9.581938102486048e-06, + "loss": 0.3375, + "step": 1508 + }, + { + "epoch": 1.378082191780822, + "grad_norm": 10.210820198059082, + "learning_rate": 9.580923389142568e-06, + "loss": 0.1864, + "step": 1509 + }, + { + "epoch": 1.3789954337899544, + "grad_norm": 44.405487060546875, + "learning_rate": 9.579908675799087e-06, + "loss": 2.0496, + "step": 1510 + }, + { + "epoch": 1.3799086757990868, + "grad_norm": 43.68741226196289, + "learning_rate": 9.578893962455606e-06, + "loss": 1.5072, + "step": 1511 + }, + { + "epoch": 1.3808219178082193, + "grad_norm": 38.4391975402832, + "learning_rate": 9.577879249112127e-06, + "loss": 1.2661, + "step": 1512 + }, + { + "epoch": 1.3817351598173517, + "grad_norm": 23.075544357299805, + "learning_rate": 9.576864535768645e-06, + "loss": 0.5746, + "step": 1513 + }, + { + "epoch": 1.382648401826484, + "grad_norm": 21.79705047607422, + "learning_rate": 9.575849822425166e-06, + "loss": 0.5019, + "step": 1514 + }, + { + "epoch": 1.3835616438356164, + "grad_norm": 13.099128723144531, + "learning_rate": 9.574835109081685e-06, + "loss": 0.28, + "step": 1515 + }, + { + "epoch": 1.3844748858447489, + "grad_norm": 19.856470108032227, + "learning_rate": 9.573820395738205e-06, + "loss": 0.3, + "step": 1516 + }, + { + "epoch": 1.3853881278538813, + "grad_norm": 11.160099029541016, + "learning_rate": 9.572805682394724e-06, + "loss": 0.2554, + "step": 1517 + }, + { + "epoch": 1.3863013698630138, + "grad_norm": 39.880035400390625, + "learning_rate": 9.571790969051243e-06, + "loss": 1.4842, + "step": 1518 + }, + { + "epoch": 1.387214611872146, + "grad_norm": 28.36479949951172, + "learning_rate": 9.570776255707764e-06, + "loss": 0.7015, + "step": 1519 + }, + { + "epoch": 1.3881278538812785, + "grad_norm": 14.020882606506348, + "learning_rate": 9.569761542364282e-06, + "loss": 0.2723, + "step": 1520 + }, + { + "epoch": 1.389041095890411, + "grad_norm": 3.3745455741882324, + "learning_rate": 9.568746829020801e-06, + "loss": 0.0497, + "step": 1521 + }, + { + "epoch": 1.3899543378995434, + "grad_norm": 42.23617172241211, + "learning_rate": 9.567732115677322e-06, + "loss": 1.2001, + "step": 1522 + }, + { + "epoch": 1.3908675799086758, + "grad_norm": 38.642616271972656, + "learning_rate": 9.566717402333842e-06, + "loss": 0.8244, + "step": 1523 + }, + { + "epoch": 1.3917808219178083, + "grad_norm": 36.09209442138672, + "learning_rate": 9.565702688990361e-06, + "loss": 0.8586, + "step": 1524 + }, + { + "epoch": 1.3926940639269407, + "grad_norm": 23.85636329650879, + "learning_rate": 9.56468797564688e-06, + "loss": 0.8633, + "step": 1525 + }, + { + "epoch": 1.3936073059360732, + "grad_norm": 52.54085159301758, + "learning_rate": 9.5636732623034e-06, + "loss": 1.892, + "step": 1526 + }, + { + "epoch": 1.3945205479452054, + "grad_norm": 21.67852210998535, + "learning_rate": 9.56265854895992e-06, + "loss": 0.3336, + "step": 1527 + }, + { + "epoch": 1.3954337899543379, + "grad_norm": 12.191291809082031, + "learning_rate": 9.561643835616438e-06, + "loss": 0.2686, + "step": 1528 + }, + { + "epoch": 1.3963470319634703, + "grad_norm": 0.408549427986145, + "learning_rate": 9.560629122272959e-06, + "loss": 0.0059, + "step": 1529 + }, + { + "epoch": 1.3972602739726028, + "grad_norm": 15.58788776397705, + "learning_rate": 9.559614408929478e-06, + "loss": 0.3505, + "step": 1530 + }, + { + "epoch": 1.3981735159817352, + "grad_norm": 35.937904357910156, + "learning_rate": 9.558599695585997e-06, + "loss": 0.7632, + "step": 1531 + }, + { + "epoch": 1.3990867579908675, + "grad_norm": 49.72030258178711, + "learning_rate": 9.557584982242517e-06, + "loss": 0.9698, + "step": 1532 + }, + { + "epoch": 1.4, + "grad_norm": 22.264307022094727, + "learning_rate": 9.556570268899038e-06, + "loss": 0.4657, + "step": 1533 + }, + { + "epoch": 1.4009132420091324, + "grad_norm": 49.791664123535156, + "learning_rate": 9.555555555555556e-06, + "loss": 1.7618, + "step": 1534 + }, + { + "epoch": 1.4018264840182648, + "grad_norm": 25.366289138793945, + "learning_rate": 9.554540842212075e-06, + "loss": 0.6526, + "step": 1535 + }, + { + "epoch": 1.4027397260273973, + "grad_norm": 22.94159698486328, + "learning_rate": 9.553526128868596e-06, + "loss": 0.4485, + "step": 1536 + }, + { + "epoch": 1.4036529680365297, + "grad_norm": 25.193992614746094, + "learning_rate": 9.552511415525115e-06, + "loss": 0.6005, + "step": 1537 + }, + { + "epoch": 1.4045662100456622, + "grad_norm": 61.366600036621094, + "learning_rate": 9.551496702181634e-06, + "loss": 1.1139, + "step": 1538 + }, + { + "epoch": 1.4054794520547946, + "grad_norm": 26.57777976989746, + "learning_rate": 9.550481988838154e-06, + "loss": 0.5509, + "step": 1539 + }, + { + "epoch": 1.4063926940639269, + "grad_norm": 4.3537397384643555, + "learning_rate": 9.549467275494673e-06, + "loss": 0.0633, + "step": 1540 + }, + { + "epoch": 1.4073059360730593, + "grad_norm": 23.456335067749023, + "learning_rate": 9.548452562151192e-06, + "loss": 0.5619, + "step": 1541 + }, + { + "epoch": 1.4082191780821918, + "grad_norm": 27.352781295776367, + "learning_rate": 9.547437848807712e-06, + "loss": 0.3304, + "step": 1542 + }, + { + "epoch": 1.4091324200913242, + "grad_norm": 4.615691184997559, + "learning_rate": 9.546423135464233e-06, + "loss": 0.0644, + "step": 1543 + }, + { + "epoch": 1.4100456621004567, + "grad_norm": 31.63460350036621, + "learning_rate": 9.545408422120752e-06, + "loss": 0.7751, + "step": 1544 + }, + { + "epoch": 1.410958904109589, + "grad_norm": 9.439260482788086, + "learning_rate": 9.54439370877727e-06, + "loss": 0.1238, + "step": 1545 + }, + { + "epoch": 1.4118721461187214, + "grad_norm": 6.572169780731201, + "learning_rate": 9.543378995433791e-06, + "loss": 0.0634, + "step": 1546 + }, + { + "epoch": 1.4127853881278538, + "grad_norm": 17.135822296142578, + "learning_rate": 9.54236428209031e-06, + "loss": 0.2521, + "step": 1547 + }, + { + "epoch": 1.4136986301369863, + "grad_norm": 14.693626403808594, + "learning_rate": 9.541349568746829e-06, + "loss": 0.246, + "step": 1548 + }, + { + "epoch": 1.4146118721461187, + "grad_norm": 38.0050048828125, + "learning_rate": 9.54033485540335e-06, + "loss": 1.0272, + "step": 1549 + }, + { + "epoch": 1.4155251141552512, + "grad_norm": 25.443513870239258, + "learning_rate": 9.53932014205987e-06, + "loss": 0.4717, + "step": 1550 + }, + { + "epoch": 1.4164383561643836, + "grad_norm": 14.427858352661133, + "learning_rate": 9.538305428716389e-06, + "loss": 0.2194, + "step": 1551 + }, + { + "epoch": 1.417351598173516, + "grad_norm": 61.570037841796875, + "learning_rate": 9.537290715372908e-06, + "loss": 3.5519, + "step": 1552 + }, + { + "epoch": 1.4182648401826485, + "grad_norm": 21.17894172668457, + "learning_rate": 9.536276002029428e-06, + "loss": 0.3347, + "step": 1553 + }, + { + "epoch": 1.4191780821917808, + "grad_norm": 19.45554542541504, + "learning_rate": 9.535261288685947e-06, + "loss": 0.391, + "step": 1554 + }, + { + "epoch": 1.4200913242009132, + "grad_norm": 30.781343460083008, + "learning_rate": 9.534246575342466e-06, + "loss": 0.3923, + "step": 1555 + }, + { + "epoch": 1.4210045662100457, + "grad_norm": 29.425912857055664, + "learning_rate": 9.533231861998986e-06, + "loss": 0.6754, + "step": 1556 + }, + { + "epoch": 1.4219178082191781, + "grad_norm": 42.18089294433594, + "learning_rate": 9.532217148655505e-06, + "loss": 1.1248, + "step": 1557 + }, + { + "epoch": 1.4228310502283106, + "grad_norm": 8.929099082946777, + "learning_rate": 9.531202435312024e-06, + "loss": 0.1409, + "step": 1558 + }, + { + "epoch": 1.4237442922374428, + "grad_norm": 17.048194885253906, + "learning_rate": 9.530187721968545e-06, + "loss": 0.3014, + "step": 1559 + }, + { + "epoch": 1.4246575342465753, + "grad_norm": 42.236656188964844, + "learning_rate": 9.529173008625065e-06, + "loss": 2.3603, + "step": 1560 + }, + { + "epoch": 1.4255707762557077, + "grad_norm": 18.779949188232422, + "learning_rate": 9.528158295281584e-06, + "loss": 0.2615, + "step": 1561 + }, + { + "epoch": 1.4264840182648402, + "grad_norm": 36.51250076293945, + "learning_rate": 9.527143581938103e-06, + "loss": 1.1063, + "step": 1562 + }, + { + "epoch": 1.4273972602739726, + "grad_norm": 57.72242736816406, + "learning_rate": 9.526128868594623e-06, + "loss": 3.1185, + "step": 1563 + }, + { + "epoch": 1.428310502283105, + "grad_norm": 51.7912712097168, + "learning_rate": 9.525114155251142e-06, + "loss": 1.6615, + "step": 1564 + }, + { + "epoch": 1.4292237442922375, + "grad_norm": 21.54848861694336, + "learning_rate": 9.524099441907661e-06, + "loss": 0.3446, + "step": 1565 + }, + { + "epoch": 1.43013698630137, + "grad_norm": 45.44028091430664, + "learning_rate": 9.523084728564182e-06, + "loss": 1.0308, + "step": 1566 + }, + { + "epoch": 1.4310502283105022, + "grad_norm": 14.685016632080078, + "learning_rate": 9.522070015220702e-06, + "loss": 0.3034, + "step": 1567 + }, + { + "epoch": 1.4319634703196347, + "grad_norm": 5.308413982391357, + "learning_rate": 9.52105530187722e-06, + "loss": 0.0624, + "step": 1568 + }, + { + "epoch": 1.4328767123287671, + "grad_norm": 46.707847595214844, + "learning_rate": 9.52004058853374e-06, + "loss": 0.5728, + "step": 1569 + }, + { + "epoch": 1.4337899543378996, + "grad_norm": 4.976198196411133, + "learning_rate": 9.51902587519026e-06, + "loss": 0.0611, + "step": 1570 + }, + { + "epoch": 1.434703196347032, + "grad_norm": 9.984073638916016, + "learning_rate": 9.51801116184678e-06, + "loss": 0.1087, + "step": 1571 + }, + { + "epoch": 1.4356164383561643, + "grad_norm": 20.61429786682129, + "learning_rate": 9.516996448503298e-06, + "loss": 0.5392, + "step": 1572 + }, + { + "epoch": 1.4365296803652967, + "grad_norm": 44.6214714050293, + "learning_rate": 9.515981735159819e-06, + "loss": 0.8638, + "step": 1573 + }, + { + "epoch": 1.4374429223744292, + "grad_norm": 7.392256736755371, + "learning_rate": 9.514967021816337e-06, + "loss": 0.1341, + "step": 1574 + }, + { + "epoch": 1.4383561643835616, + "grad_norm": 26.32680320739746, + "learning_rate": 9.513952308472856e-06, + "loss": 0.3303, + "step": 1575 + }, + { + "epoch": 1.439269406392694, + "grad_norm": 32.429569244384766, + "learning_rate": 9.512937595129377e-06, + "loss": 0.8118, + "step": 1576 + }, + { + "epoch": 1.4401826484018265, + "grad_norm": 10.965568542480469, + "learning_rate": 9.511922881785897e-06, + "loss": 0.1804, + "step": 1577 + }, + { + "epoch": 1.441095890410959, + "grad_norm": 16.63292694091797, + "learning_rate": 9.510908168442416e-06, + "loss": 0.2304, + "step": 1578 + }, + { + "epoch": 1.4420091324200914, + "grad_norm": 48.13395690917969, + "learning_rate": 9.509893455098935e-06, + "loss": 0.9143, + "step": 1579 + }, + { + "epoch": 1.4429223744292237, + "grad_norm": 34.774356842041016, + "learning_rate": 9.508878741755456e-06, + "loss": 0.7183, + "step": 1580 + }, + { + "epoch": 1.4438356164383561, + "grad_norm": 13.29495620727539, + "learning_rate": 9.507864028411974e-06, + "loss": 0.3038, + "step": 1581 + }, + { + "epoch": 1.4447488584474886, + "grad_norm": 34.1475830078125, + "learning_rate": 9.506849315068493e-06, + "loss": 0.637, + "step": 1582 + }, + { + "epoch": 1.445662100456621, + "grad_norm": 14.707759857177734, + "learning_rate": 9.505834601725014e-06, + "loss": 0.2564, + "step": 1583 + }, + { + "epoch": 1.4465753424657535, + "grad_norm": 8.18093490600586, + "learning_rate": 9.504819888381533e-06, + "loss": 0.096, + "step": 1584 + }, + { + "epoch": 1.4474885844748857, + "grad_norm": 23.882699966430664, + "learning_rate": 9.503805175038051e-06, + "loss": 0.3374, + "step": 1585 + }, + { + "epoch": 1.4484018264840182, + "grad_norm": 7.437891483306885, + "learning_rate": 9.502790461694572e-06, + "loss": 0.1167, + "step": 1586 + }, + { + "epoch": 1.4493150684931506, + "grad_norm": 4.135950565338135, + "learning_rate": 9.501775748351093e-06, + "loss": 0.069, + "step": 1587 + }, + { + "epoch": 1.450228310502283, + "grad_norm": 14.400588989257812, + "learning_rate": 9.500761035007611e-06, + "loss": 0.2647, + "step": 1588 + }, + { + "epoch": 1.4511415525114155, + "grad_norm": 16.61766242980957, + "learning_rate": 9.49974632166413e-06, + "loss": 0.2851, + "step": 1589 + }, + { + "epoch": 1.452054794520548, + "grad_norm": 3.806891918182373, + "learning_rate": 9.49873160832065e-06, + "loss": 0.0609, + "step": 1590 + }, + { + "epoch": 1.4529680365296804, + "grad_norm": 30.214635848999023, + "learning_rate": 9.49771689497717e-06, + "loss": 0.7528, + "step": 1591 + }, + { + "epoch": 1.4538812785388129, + "grad_norm": 3.6203715801239014, + "learning_rate": 9.496702181633688e-06, + "loss": 0.0536, + "step": 1592 + }, + { + "epoch": 1.4547945205479453, + "grad_norm": 59.90462875366211, + "learning_rate": 9.495687468290209e-06, + "loss": 1.2174, + "step": 1593 + }, + { + "epoch": 1.4557077625570776, + "grad_norm": 2.787414073944092, + "learning_rate": 9.49467275494673e-06, + "loss": 0.0373, + "step": 1594 + }, + { + "epoch": 1.45662100456621, + "grad_norm": 4.813838005065918, + "learning_rate": 9.493658041603248e-06, + "loss": 0.0397, + "step": 1595 + }, + { + "epoch": 1.4575342465753425, + "grad_norm": 0.6255916953086853, + "learning_rate": 9.492643328259767e-06, + "loss": 0.0084, + "step": 1596 + }, + { + "epoch": 1.458447488584475, + "grad_norm": 11.226268768310547, + "learning_rate": 9.491628614916288e-06, + "loss": 0.2018, + "step": 1597 + }, + { + "epoch": 1.4593607305936074, + "grad_norm": 33.30735778808594, + "learning_rate": 9.490613901572807e-06, + "loss": 0.3938, + "step": 1598 + }, + { + "epoch": 1.4602739726027396, + "grad_norm": 0.39837944507598877, + "learning_rate": 9.489599188229325e-06, + "loss": 0.0053, + "step": 1599 + }, + { + "epoch": 1.461187214611872, + "grad_norm": 61.97392272949219, + "learning_rate": 9.488584474885846e-06, + "loss": 1.6036, + "step": 1600 + }, + { + "epoch": 1.4621004566210045, + "grad_norm": 27.624279022216797, + "learning_rate": 9.487569761542365e-06, + "loss": 0.4718, + "step": 1601 + }, + { + "epoch": 1.463013698630137, + "grad_norm": 49.31851577758789, + "learning_rate": 9.486555048198884e-06, + "loss": 1.5961, + "step": 1602 + }, + { + "epoch": 1.4639269406392694, + "grad_norm": 11.32485294342041, + "learning_rate": 9.485540334855404e-06, + "loss": 0.1165, + "step": 1603 + }, + { + "epoch": 1.4648401826484019, + "grad_norm": 1.6487401723861694, + "learning_rate": 9.484525621511925e-06, + "loss": 0.025, + "step": 1604 + }, + { + "epoch": 1.4657534246575343, + "grad_norm": 6.744237422943115, + "learning_rate": 9.483510908168444e-06, + "loss": 0.0919, + "step": 1605 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 63.7646369934082, + "learning_rate": 9.482496194824962e-06, + "loss": 3.3939, + "step": 1606 + }, + { + "epoch": 1.467579908675799, + "grad_norm": 0.5445136427879333, + "learning_rate": 9.481481481481483e-06, + "loss": 0.0066, + "step": 1607 + }, + { + "epoch": 1.4684931506849315, + "grad_norm": 50.0715446472168, + "learning_rate": 9.480466768138002e-06, + "loss": 1.0128, + "step": 1608 + }, + { + "epoch": 1.469406392694064, + "grad_norm": 20.208147048950195, + "learning_rate": 9.47945205479452e-06, + "loss": 0.2953, + "step": 1609 + }, + { + "epoch": 1.4703196347031964, + "grad_norm": 16.686132431030273, + "learning_rate": 9.478437341451041e-06, + "loss": 0.305, + "step": 1610 + }, + { + "epoch": 1.4712328767123288, + "grad_norm": 49.0375862121582, + "learning_rate": 9.47742262810756e-06, + "loss": 1.1058, + "step": 1611 + }, + { + "epoch": 1.472146118721461, + "grad_norm": 20.07720947265625, + "learning_rate": 9.476407914764079e-06, + "loss": 0.3906, + "step": 1612 + }, + { + "epoch": 1.4730593607305935, + "grad_norm": 36.581748962402344, + "learning_rate": 9.4753932014206e-06, + "loss": 0.5741, + "step": 1613 + }, + { + "epoch": 1.473972602739726, + "grad_norm": 67.08309936523438, + "learning_rate": 9.47437848807712e-06, + "loss": 3.1997, + "step": 1614 + }, + { + "epoch": 1.4748858447488584, + "grad_norm": 8.976205825805664, + "learning_rate": 9.473363774733639e-06, + "loss": 0.1375, + "step": 1615 + }, + { + "epoch": 1.4757990867579909, + "grad_norm": 54.1966438293457, + "learning_rate": 9.472349061390158e-06, + "loss": 1.2939, + "step": 1616 + }, + { + "epoch": 1.4767123287671233, + "grad_norm": 43.84584426879883, + "learning_rate": 9.471334348046678e-06, + "loss": 0.9087, + "step": 1617 + }, + { + "epoch": 1.4776255707762558, + "grad_norm": 1.159874677658081, + "learning_rate": 9.470319634703197e-06, + "loss": 0.0175, + "step": 1618 + }, + { + "epoch": 1.4785388127853882, + "grad_norm": 55.37099075317383, + "learning_rate": 9.469304921359716e-06, + "loss": 2.6649, + "step": 1619 + }, + { + "epoch": 1.4794520547945205, + "grad_norm": 0.8394540548324585, + "learning_rate": 9.468290208016236e-06, + "loss": 0.0118, + "step": 1620 + }, + { + "epoch": 1.480365296803653, + "grad_norm": 11.57043170928955, + "learning_rate": 9.467275494672755e-06, + "loss": 0.1651, + "step": 1621 + }, + { + "epoch": 1.4812785388127854, + "grad_norm": 36.46791076660156, + "learning_rate": 9.466260781329276e-06, + "loss": 0.9479, + "step": 1622 + }, + { + "epoch": 1.4821917808219178, + "grad_norm": 17.68412971496582, + "learning_rate": 9.465246067985795e-06, + "loss": 0.2305, + "step": 1623 + }, + { + "epoch": 1.4831050228310503, + "grad_norm": 25.833812713623047, + "learning_rate": 9.464231354642315e-06, + "loss": 0.526, + "step": 1624 + }, + { + "epoch": 1.4840182648401825, + "grad_norm": 32.861732482910156, + "learning_rate": 9.463216641298834e-06, + "loss": 0.4663, + "step": 1625 + }, + { + "epoch": 1.484931506849315, + "grad_norm": 59.560848236083984, + "learning_rate": 9.462201927955353e-06, + "loss": 1.639, + "step": 1626 + }, + { + "epoch": 1.4858447488584474, + "grad_norm": 10.363544464111328, + "learning_rate": 9.461187214611873e-06, + "loss": 0.1918, + "step": 1627 + }, + { + "epoch": 1.4867579908675799, + "grad_norm": 10.207551002502441, + "learning_rate": 9.460172501268392e-06, + "loss": 0.1685, + "step": 1628 + }, + { + "epoch": 1.4876712328767123, + "grad_norm": 6.954619884490967, + "learning_rate": 9.459157787924911e-06, + "loss": 0.0992, + "step": 1629 + }, + { + "epoch": 1.4885844748858448, + "grad_norm": 32.83219909667969, + "learning_rate": 9.458143074581432e-06, + "loss": 0.9268, + "step": 1630 + }, + { + "epoch": 1.4894977168949772, + "grad_norm": 28.138710021972656, + "learning_rate": 9.45712836123795e-06, + "loss": 0.8017, + "step": 1631 + }, + { + "epoch": 1.4904109589041097, + "grad_norm": 33.681434631347656, + "learning_rate": 9.456113647894471e-06, + "loss": 0.7453, + "step": 1632 + }, + { + "epoch": 1.4913242009132421, + "grad_norm": 16.18006706237793, + "learning_rate": 9.45509893455099e-06, + "loss": 0.3212, + "step": 1633 + }, + { + "epoch": 1.4922374429223744, + "grad_norm": 27.588125228881836, + "learning_rate": 9.45408422120751e-06, + "loss": 0.6491, + "step": 1634 + }, + { + "epoch": 1.4931506849315068, + "grad_norm": 10.136930465698242, + "learning_rate": 9.45306950786403e-06, + "loss": 0.188, + "step": 1635 + }, + { + "epoch": 1.4940639269406393, + "grad_norm": 23.05770492553711, + "learning_rate": 9.452054794520548e-06, + "loss": 0.3955, + "step": 1636 + }, + { + "epoch": 1.4949771689497717, + "grad_norm": 47.64874267578125, + "learning_rate": 9.451040081177069e-06, + "loss": 1.1714, + "step": 1637 + }, + { + "epoch": 1.4958904109589042, + "grad_norm": 5.738443374633789, + "learning_rate": 9.450025367833588e-06, + "loss": 0.0698, + "step": 1638 + }, + { + "epoch": 1.4968036529680364, + "grad_norm": 40.70313262939453, + "learning_rate": 9.449010654490108e-06, + "loss": 1.4375, + "step": 1639 + }, + { + "epoch": 1.4977168949771689, + "grad_norm": 23.644397735595703, + "learning_rate": 9.447995941146627e-06, + "loss": 0.3958, + "step": 1640 + }, + { + "epoch": 1.4986301369863013, + "grad_norm": 17.831966400146484, + "learning_rate": 9.446981227803146e-06, + "loss": 0.3267, + "step": 1641 + }, + { + "epoch": 1.4995433789954338, + "grad_norm": 4.26634407043457, + "learning_rate": 9.445966514459666e-06, + "loss": 0.0746, + "step": 1642 + }, + { + "epoch": 1.5004566210045662, + "grad_norm": 33.35226821899414, + "learning_rate": 9.444951801116185e-06, + "loss": 0.9996, + "step": 1643 + }, + { + "epoch": 1.5013698630136987, + "grad_norm": 53.2601432800293, + "learning_rate": 9.443937087772706e-06, + "loss": 1.1167, + "step": 1644 + }, + { + "epoch": 1.5022831050228311, + "grad_norm": 68.43383026123047, + "learning_rate": 9.442922374429225e-06, + "loss": 3.4835, + "step": 1645 + }, + { + "epoch": 1.5031963470319636, + "grad_norm": 24.473005294799805, + "learning_rate": 9.441907661085743e-06, + "loss": 0.287, + "step": 1646 + }, + { + "epoch": 1.504109589041096, + "grad_norm": 55.74113464355469, + "learning_rate": 9.440892947742264e-06, + "loss": 2.6487, + "step": 1647 + }, + { + "epoch": 1.5050228310502283, + "grad_norm": 25.90568733215332, + "learning_rate": 9.439878234398783e-06, + "loss": 0.8044, + "step": 1648 + }, + { + "epoch": 1.5059360730593607, + "grad_norm": 48.251224517822266, + "learning_rate": 9.438863521055303e-06, + "loss": 0.8219, + "step": 1649 + }, + { + "epoch": 1.5068493150684932, + "grad_norm": 11.775310516357422, + "learning_rate": 9.437848807711822e-06, + "loss": 0.2442, + "step": 1650 + }, + { + "epoch": 1.5077625570776254, + "grad_norm": 20.086978912353516, + "learning_rate": 9.436834094368341e-06, + "loss": 0.4377, + "step": 1651 + }, + { + "epoch": 1.5086757990867579, + "grad_norm": 16.55095863342285, + "learning_rate": 9.435819381024862e-06, + "loss": 0.3379, + "step": 1652 + }, + { + "epoch": 1.5095890410958903, + "grad_norm": 36.978946685791016, + "learning_rate": 9.43480466768138e-06, + "loss": 0.7277, + "step": 1653 + }, + { + "epoch": 1.5105022831050228, + "grad_norm": 36.66012954711914, + "learning_rate": 9.433789954337901e-06, + "loss": 0.3936, + "step": 1654 + }, + { + "epoch": 1.5114155251141552, + "grad_norm": 0.3242569863796234, + "learning_rate": 9.43277524099442e-06, + "loss": 0.0041, + "step": 1655 + }, + { + "epoch": 1.5123287671232877, + "grad_norm": 35.98484802246094, + "learning_rate": 9.431760527650939e-06, + "loss": 0.4633, + "step": 1656 + }, + { + "epoch": 1.5132420091324201, + "grad_norm": 5.785026550292969, + "learning_rate": 9.430745814307459e-06, + "loss": 0.0645, + "step": 1657 + }, + { + "epoch": 1.5141552511415526, + "grad_norm": 11.782848358154297, + "learning_rate": 9.429731100963978e-06, + "loss": 0.1469, + "step": 1658 + }, + { + "epoch": 1.515068493150685, + "grad_norm": 62.84830856323242, + "learning_rate": 9.428716387620499e-06, + "loss": 2.163, + "step": 1659 + }, + { + "epoch": 1.5159817351598175, + "grad_norm": 23.533220291137695, + "learning_rate": 9.427701674277017e-06, + "loss": 0.4642, + "step": 1660 + }, + { + "epoch": 1.5168949771689497, + "grad_norm": 50.0546760559082, + "learning_rate": 9.426686960933536e-06, + "loss": 1.5345, + "step": 1661 + }, + { + "epoch": 1.5178082191780822, + "grad_norm": 81.5702133178711, + "learning_rate": 9.425672247590057e-06, + "loss": 1.6202, + "step": 1662 + }, + { + "epoch": 1.5187214611872146, + "grad_norm": 44.003536224365234, + "learning_rate": 9.424657534246576e-06, + "loss": 1.2602, + "step": 1663 + }, + { + "epoch": 1.519634703196347, + "grad_norm": 4.866281509399414, + "learning_rate": 9.423642820903096e-06, + "loss": 0.067, + "step": 1664 + }, + { + "epoch": 1.5205479452054793, + "grad_norm": 33.356903076171875, + "learning_rate": 9.422628107559615e-06, + "loss": 0.5061, + "step": 1665 + }, + { + "epoch": 1.5214611872146118, + "grad_norm": 24.429662704467773, + "learning_rate": 9.421613394216136e-06, + "loss": 0.4757, + "step": 1666 + }, + { + "epoch": 1.5223744292237442, + "grad_norm": 21.597532272338867, + "learning_rate": 9.420598680872654e-06, + "loss": 0.3241, + "step": 1667 + }, + { + "epoch": 1.5232876712328767, + "grad_norm": 20.296390533447266, + "learning_rate": 9.419583967529173e-06, + "loss": 0.3588, + "step": 1668 + }, + { + "epoch": 1.5242009132420091, + "grad_norm": 11.769937515258789, + "learning_rate": 9.418569254185694e-06, + "loss": 0.1833, + "step": 1669 + }, + { + "epoch": 1.5251141552511416, + "grad_norm": 26.924667358398438, + "learning_rate": 9.417554540842213e-06, + "loss": 0.8554, + "step": 1670 + }, + { + "epoch": 1.526027397260274, + "grad_norm": 13.730826377868652, + "learning_rate": 9.416539827498731e-06, + "loss": 0.2506, + "step": 1671 + }, + { + "epoch": 1.5269406392694065, + "grad_norm": 17.87738609313965, + "learning_rate": 9.415525114155252e-06, + "loss": 0.5207, + "step": 1672 + }, + { + "epoch": 1.527853881278539, + "grad_norm": 13.55672836303711, + "learning_rate": 9.41451040081177e-06, + "loss": 0.2319, + "step": 1673 + }, + { + "epoch": 1.5287671232876714, + "grad_norm": 17.61028289794922, + "learning_rate": 9.413495687468291e-06, + "loss": 0.2959, + "step": 1674 + }, + { + "epoch": 1.5296803652968036, + "grad_norm": 52.587196350097656, + "learning_rate": 9.41248097412481e-06, + "loss": 1.2411, + "step": 1675 + }, + { + "epoch": 1.530593607305936, + "grad_norm": 13.398993492126465, + "learning_rate": 9.41146626078133e-06, + "loss": 0.1883, + "step": 1676 + }, + { + "epoch": 1.5315068493150685, + "grad_norm": 27.11541175842285, + "learning_rate": 9.41045154743785e-06, + "loss": 0.6539, + "step": 1677 + }, + { + "epoch": 1.5324200913242008, + "grad_norm": 5.394829750061035, + "learning_rate": 9.409436834094368e-06, + "loss": 0.054, + "step": 1678 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 11.85222053527832, + "learning_rate": 9.408422120750889e-06, + "loss": 0.1896, + "step": 1679 + }, + { + "epoch": 1.5342465753424657, + "grad_norm": 34.591575622558594, + "learning_rate": 9.407407407407408e-06, + "loss": 0.6514, + "step": 1680 + }, + { + "epoch": 1.5351598173515981, + "grad_norm": 3.66583514213562, + "learning_rate": 9.406392694063927e-06, + "loss": 0.0412, + "step": 1681 + }, + { + "epoch": 1.5360730593607306, + "grad_norm": 2.824265241622925, + "learning_rate": 9.405377980720447e-06, + "loss": 0.0346, + "step": 1682 + }, + { + "epoch": 1.536986301369863, + "grad_norm": 18.673667907714844, + "learning_rate": 9.404363267376968e-06, + "loss": 0.2199, + "step": 1683 + }, + { + "epoch": 1.5378995433789955, + "grad_norm": 17.731908798217773, + "learning_rate": 9.403348554033487e-06, + "loss": 0.1641, + "step": 1684 + }, + { + "epoch": 1.538812785388128, + "grad_norm": 36.85445022583008, + "learning_rate": 9.402333840690005e-06, + "loss": 0.6318, + "step": 1685 + }, + { + "epoch": 1.5397260273972604, + "grad_norm": 60.23092269897461, + "learning_rate": 9.401319127346526e-06, + "loss": 1.251, + "step": 1686 + }, + { + "epoch": 1.5406392694063928, + "grad_norm": 35.80023193359375, + "learning_rate": 9.400304414003045e-06, + "loss": 0.8624, + "step": 1687 + }, + { + "epoch": 1.541552511415525, + "grad_norm": 6.882970333099365, + "learning_rate": 9.399289700659564e-06, + "loss": 0.0909, + "step": 1688 + }, + { + "epoch": 1.5424657534246575, + "grad_norm": 48.913002014160156, + "learning_rate": 9.398274987316084e-06, + "loss": 1.0186, + "step": 1689 + }, + { + "epoch": 1.54337899543379, + "grad_norm": 16.297182083129883, + "learning_rate": 9.397260273972603e-06, + "loss": 0.2205, + "step": 1690 + }, + { + "epoch": 1.5442922374429222, + "grad_norm": 52.92391586303711, + "learning_rate": 9.396245560629122e-06, + "loss": 1.7194, + "step": 1691 + }, + { + "epoch": 1.5452054794520547, + "grad_norm": 41.19960403442383, + "learning_rate": 9.395230847285642e-06, + "loss": 0.964, + "step": 1692 + }, + { + "epoch": 1.5461187214611871, + "grad_norm": 15.444424629211426, + "learning_rate": 9.394216133942163e-06, + "loss": 0.1769, + "step": 1693 + }, + { + "epoch": 1.5470319634703196, + "grad_norm": 63.82157897949219, + "learning_rate": 9.393201420598682e-06, + "loss": 1.846, + "step": 1694 + }, + { + "epoch": 1.547945205479452, + "grad_norm": 48.431297302246094, + "learning_rate": 9.3921867072552e-06, + "loss": 0.9929, + "step": 1695 + }, + { + "epoch": 1.5488584474885845, + "grad_norm": 41.727943420410156, + "learning_rate": 9.391171993911721e-06, + "loss": 1.0787, + "step": 1696 + }, + { + "epoch": 1.549771689497717, + "grad_norm": 16.806352615356445, + "learning_rate": 9.39015728056824e-06, + "loss": 0.2618, + "step": 1697 + }, + { + "epoch": 1.5506849315068494, + "grad_norm": 31.286245346069336, + "learning_rate": 9.389142567224759e-06, + "loss": 0.568, + "step": 1698 + }, + { + "epoch": 1.5515981735159818, + "grad_norm": 36.83533477783203, + "learning_rate": 9.38812785388128e-06, + "loss": 0.6784, + "step": 1699 + }, + { + "epoch": 1.5525114155251143, + "grad_norm": 78.82209014892578, + "learning_rate": 9.387113140537798e-06, + "loss": 2.8188, + "step": 1700 + }, + { + "epoch": 1.5534246575342465, + "grad_norm": 30.026592254638672, + "learning_rate": 9.386098427194317e-06, + "loss": 0.6, + "step": 1701 + }, + { + "epoch": 1.554337899543379, + "grad_norm": 49.748104095458984, + "learning_rate": 9.385083713850838e-06, + "loss": 1.088, + "step": 1702 + }, + { + "epoch": 1.5552511415525114, + "grad_norm": 6.495776176452637, + "learning_rate": 9.384069000507358e-06, + "loss": 0.0741, + "step": 1703 + }, + { + "epoch": 1.5561643835616439, + "grad_norm": 11.19111442565918, + "learning_rate": 9.383054287163877e-06, + "loss": 0.1099, + "step": 1704 + }, + { + "epoch": 1.5570776255707761, + "grad_norm": 26.11520767211914, + "learning_rate": 9.382039573820396e-06, + "loss": 0.4383, + "step": 1705 + }, + { + "epoch": 1.5579908675799086, + "grad_norm": 73.17781066894531, + "learning_rate": 9.381024860476916e-06, + "loss": 1.6679, + "step": 1706 + }, + { + "epoch": 1.558904109589041, + "grad_norm": 36.738460540771484, + "learning_rate": 9.380010147133435e-06, + "loss": 1.0696, + "step": 1707 + }, + { + "epoch": 1.5598173515981735, + "grad_norm": 9.031254768371582, + "learning_rate": 9.378995433789954e-06, + "loss": 0.1154, + "step": 1708 + }, + { + "epoch": 1.560730593607306, + "grad_norm": 0.6113491654396057, + "learning_rate": 9.377980720446475e-06, + "loss": 0.0047, + "step": 1709 + }, + { + "epoch": 1.5616438356164384, + "grad_norm": 20.717304229736328, + "learning_rate": 9.376966007102995e-06, + "loss": 0.3521, + "step": 1710 + }, + { + "epoch": 1.5625570776255708, + "grad_norm": 7.516520023345947, + "learning_rate": 9.375951293759512e-06, + "loss": 0.0803, + "step": 1711 + }, + { + "epoch": 1.5634703196347033, + "grad_norm": 15.715941429138184, + "learning_rate": 9.374936580416033e-06, + "loss": 0.1569, + "step": 1712 + }, + { + "epoch": 1.5643835616438357, + "grad_norm": 48.97786331176758, + "learning_rate": 9.373921867072553e-06, + "loss": 1.3814, + "step": 1713 + }, + { + "epoch": 1.5652968036529682, + "grad_norm": 5.753415584564209, + "learning_rate": 9.372907153729072e-06, + "loss": 0.0798, + "step": 1714 + }, + { + "epoch": 1.5662100456621004, + "grad_norm": 3.8961374759674072, + "learning_rate": 9.371892440385591e-06, + "loss": 0.0456, + "step": 1715 + }, + { + "epoch": 1.5671232876712329, + "grad_norm": 37.87871170043945, + "learning_rate": 9.370877727042112e-06, + "loss": 0.5868, + "step": 1716 + }, + { + "epoch": 1.5680365296803653, + "grad_norm": 2.7444331645965576, + "learning_rate": 9.36986301369863e-06, + "loss": 0.0352, + "step": 1717 + }, + { + "epoch": 1.5689497716894976, + "grad_norm": 46.39104461669922, + "learning_rate": 9.36884830035515e-06, + "loss": 0.9085, + "step": 1718 + }, + { + "epoch": 1.56986301369863, + "grad_norm": 55.815101623535156, + "learning_rate": 9.36783358701167e-06, + "loss": 1.3695, + "step": 1719 + }, + { + "epoch": 1.5707762557077625, + "grad_norm": 5.104119777679443, + "learning_rate": 9.36681887366819e-06, + "loss": 0.0602, + "step": 1720 + }, + { + "epoch": 1.571689497716895, + "grad_norm": 6.427612781524658, + "learning_rate": 9.36580416032471e-06, + "loss": 0.0762, + "step": 1721 + }, + { + "epoch": 1.5726027397260274, + "grad_norm": 8.958586692810059, + "learning_rate": 9.364789446981228e-06, + "loss": 0.1585, + "step": 1722 + }, + { + "epoch": 1.5735159817351598, + "grad_norm": 74.4018325805664, + "learning_rate": 9.363774733637749e-06, + "loss": 3.438, + "step": 1723 + }, + { + "epoch": 1.5744292237442923, + "grad_norm": 5.897064208984375, + "learning_rate": 9.362760020294267e-06, + "loss": 0.0769, + "step": 1724 + }, + { + "epoch": 1.5753424657534247, + "grad_norm": 5.034976959228516, + "learning_rate": 9.361745306950786e-06, + "loss": 0.0487, + "step": 1725 + }, + { + "epoch": 1.5762557077625572, + "grad_norm": 46.35821533203125, + "learning_rate": 9.360730593607307e-06, + "loss": 1.2415, + "step": 1726 + }, + { + "epoch": 1.5771689497716896, + "grad_norm": 26.4151554107666, + "learning_rate": 9.359715880263827e-06, + "loss": 0.4743, + "step": 1727 + }, + { + "epoch": 1.5780821917808219, + "grad_norm": 45.506126403808594, + "learning_rate": 9.358701166920345e-06, + "loss": 1.4093, + "step": 1728 + }, + { + "epoch": 1.5789954337899543, + "grad_norm": 6.251513957977295, + "learning_rate": 9.357686453576865e-06, + "loss": 0.07, + "step": 1729 + }, + { + "epoch": 1.5799086757990868, + "grad_norm": 42.128719329833984, + "learning_rate": 9.356671740233386e-06, + "loss": 0.6463, + "step": 1730 + }, + { + "epoch": 1.580821917808219, + "grad_norm": 1.6205029487609863, + "learning_rate": 9.355657026889904e-06, + "loss": 0.017, + "step": 1731 + }, + { + "epoch": 1.5817351598173515, + "grad_norm": 39.43138122558594, + "learning_rate": 9.354642313546423e-06, + "loss": 0.5183, + "step": 1732 + }, + { + "epoch": 1.582648401826484, + "grad_norm": 60.398643493652344, + "learning_rate": 9.353627600202944e-06, + "loss": 1.2947, + "step": 1733 + }, + { + "epoch": 1.5835616438356164, + "grad_norm": 4.821058750152588, + "learning_rate": 9.352612886859463e-06, + "loss": 0.0684, + "step": 1734 + }, + { + "epoch": 1.5844748858447488, + "grad_norm": 26.100467681884766, + "learning_rate": 9.351598173515982e-06, + "loss": 0.395, + "step": 1735 + }, + { + "epoch": 1.5853881278538813, + "grad_norm": 14.35145378112793, + "learning_rate": 9.350583460172502e-06, + "loss": 0.2582, + "step": 1736 + }, + { + "epoch": 1.5863013698630137, + "grad_norm": 40.17828369140625, + "learning_rate": 9.349568746829023e-06, + "loss": 1.2862, + "step": 1737 + }, + { + "epoch": 1.5872146118721462, + "grad_norm": 32.8984260559082, + "learning_rate": 9.348554033485541e-06, + "loss": 0.5686, + "step": 1738 + }, + { + "epoch": 1.5881278538812786, + "grad_norm": 55.4236946105957, + "learning_rate": 9.34753932014206e-06, + "loss": 0.6555, + "step": 1739 + }, + { + "epoch": 1.589041095890411, + "grad_norm": 14.425992012023926, + "learning_rate": 9.346524606798581e-06, + "loss": 0.1584, + "step": 1740 + }, + { + "epoch": 1.5899543378995433, + "grad_norm": 37.10551452636719, + "learning_rate": 9.3455098934551e-06, + "loss": 0.8185, + "step": 1741 + }, + { + "epoch": 1.5908675799086758, + "grad_norm": 7.062150001525879, + "learning_rate": 9.344495180111619e-06, + "loss": 0.1013, + "step": 1742 + }, + { + "epoch": 1.5917808219178082, + "grad_norm": 61.130348205566406, + "learning_rate": 9.343480466768139e-06, + "loss": 1.6821, + "step": 1743 + }, + { + "epoch": 1.5926940639269407, + "grad_norm": 2.751373052597046, + "learning_rate": 9.342465753424658e-06, + "loss": 0.0362, + "step": 1744 + }, + { + "epoch": 1.593607305936073, + "grad_norm": 33.80479049682617, + "learning_rate": 9.341451040081177e-06, + "loss": 0.9376, + "step": 1745 + }, + { + "epoch": 1.5945205479452054, + "grad_norm": 19.281545639038086, + "learning_rate": 9.340436326737697e-06, + "loss": 0.4223, + "step": 1746 + }, + { + "epoch": 1.5954337899543378, + "grad_norm": 45.98407745361328, + "learning_rate": 9.339421613394218e-06, + "loss": 0.9773, + "step": 1747 + }, + { + "epoch": 1.5963470319634703, + "grad_norm": 30.206188201904297, + "learning_rate": 9.338406900050737e-06, + "loss": 0.5088, + "step": 1748 + }, + { + "epoch": 1.5972602739726027, + "grad_norm": 25.25067710876465, + "learning_rate": 9.337392186707256e-06, + "loss": 0.2595, + "step": 1749 + }, + { + "epoch": 1.5981735159817352, + "grad_norm": 63.19554138183594, + "learning_rate": 9.336377473363776e-06, + "loss": 1.7965, + "step": 1750 + }, + { + "epoch": 1.5990867579908676, + "grad_norm": 60.626529693603516, + "learning_rate": 9.335362760020295e-06, + "loss": 1.1522, + "step": 1751 + }, + { + "epoch": 1.6, + "grad_norm": 5.210737705230713, + "learning_rate": 9.334348046676814e-06, + "loss": 0.092, + "step": 1752 + }, + { + "epoch": 1.6009132420091325, + "grad_norm": 60.97710418701172, + "learning_rate": 9.333333333333334e-06, + "loss": 1.3049, + "step": 1753 + }, + { + "epoch": 1.601826484018265, + "grad_norm": 54.84774398803711, + "learning_rate": 9.332318619989855e-06, + "loss": 1.0647, + "step": 1754 + }, + { + "epoch": 1.6027397260273972, + "grad_norm": 0.784363329410553, + "learning_rate": 9.331303906646374e-06, + "loss": 0.0099, + "step": 1755 + }, + { + "epoch": 1.6036529680365297, + "grad_norm": 24.13739585876465, + "learning_rate": 9.330289193302893e-06, + "loss": 0.2823, + "step": 1756 + }, + { + "epoch": 1.6045662100456621, + "grad_norm": 15.494380950927734, + "learning_rate": 9.329274479959413e-06, + "loss": 0.1208, + "step": 1757 + }, + { + "epoch": 1.6054794520547944, + "grad_norm": 4.212805271148682, + "learning_rate": 9.328259766615932e-06, + "loss": 0.0441, + "step": 1758 + }, + { + "epoch": 1.6063926940639268, + "grad_norm": 31.5880184173584, + "learning_rate": 9.32724505327245e-06, + "loss": 0.826, + "step": 1759 + }, + { + "epoch": 1.6073059360730593, + "grad_norm": 25.622682571411133, + "learning_rate": 9.326230339928971e-06, + "loss": 0.3112, + "step": 1760 + }, + { + "epoch": 1.6082191780821917, + "grad_norm": 17.127389907836914, + "learning_rate": 9.32521562658549e-06, + "loss": 0.168, + "step": 1761 + }, + { + "epoch": 1.6091324200913242, + "grad_norm": 0.25331366062164307, + "learning_rate": 9.324200913242009e-06, + "loss": 0.0031, + "step": 1762 + }, + { + "epoch": 1.6100456621004566, + "grad_norm": 46.988704681396484, + "learning_rate": 9.32318619989853e-06, + "loss": 1.0796, + "step": 1763 + }, + { + "epoch": 1.610958904109589, + "grad_norm": 31.460407257080078, + "learning_rate": 9.32217148655505e-06, + "loss": 0.7778, + "step": 1764 + }, + { + "epoch": 1.6118721461187215, + "grad_norm": 63.62187957763672, + "learning_rate": 9.321156773211569e-06, + "loss": 2.4286, + "step": 1765 + }, + { + "epoch": 1.612785388127854, + "grad_norm": 41.34366226196289, + "learning_rate": 9.320142059868088e-06, + "loss": 0.7186, + "step": 1766 + }, + { + "epoch": 1.6136986301369864, + "grad_norm": 7.538887023925781, + "learning_rate": 9.319127346524608e-06, + "loss": 0.0945, + "step": 1767 + }, + { + "epoch": 1.6146118721461187, + "grad_norm": 57.86833953857422, + "learning_rate": 9.318112633181127e-06, + "loss": 1.226, + "step": 1768 + }, + { + "epoch": 1.6155251141552511, + "grad_norm": 1.6795562505722046, + "learning_rate": 9.317097919837646e-06, + "loss": 0.0238, + "step": 1769 + }, + { + "epoch": 1.6164383561643836, + "grad_norm": 31.46211814880371, + "learning_rate": 9.316083206494167e-06, + "loss": 0.4072, + "step": 1770 + }, + { + "epoch": 1.617351598173516, + "grad_norm": 42.30515670776367, + "learning_rate": 9.315068493150685e-06, + "loss": 0.6576, + "step": 1771 + }, + { + "epoch": 1.6182648401826483, + "grad_norm": 4.586367130279541, + "learning_rate": 9.314053779807204e-06, + "loss": 0.0518, + "step": 1772 + }, + { + "epoch": 1.6191780821917807, + "grad_norm": 6.327547550201416, + "learning_rate": 9.313039066463725e-06, + "loss": 0.0751, + "step": 1773 + }, + { + "epoch": 1.6200913242009132, + "grad_norm": 30.056392669677734, + "learning_rate": 9.312024353120245e-06, + "loss": 0.4798, + "step": 1774 + }, + { + "epoch": 1.6210045662100456, + "grad_norm": 15.03119945526123, + "learning_rate": 9.311009639776764e-06, + "loss": 0.2008, + "step": 1775 + }, + { + "epoch": 1.621917808219178, + "grad_norm": 25.749114990234375, + "learning_rate": 9.309994926433283e-06, + "loss": 0.331, + "step": 1776 + }, + { + "epoch": 1.6228310502283105, + "grad_norm": 0.9986948370933533, + "learning_rate": 9.308980213089804e-06, + "loss": 0.011, + "step": 1777 + }, + { + "epoch": 1.623744292237443, + "grad_norm": 14.735180854797363, + "learning_rate": 9.307965499746322e-06, + "loss": 0.1882, + "step": 1778 + }, + { + "epoch": 1.6246575342465754, + "grad_norm": 20.001277923583984, + "learning_rate": 9.306950786402841e-06, + "loss": 0.2603, + "step": 1779 + }, + { + "epoch": 1.625570776255708, + "grad_norm": 65.36467742919922, + "learning_rate": 9.305936073059362e-06, + "loss": 3.5735, + "step": 1780 + }, + { + "epoch": 1.6264840182648403, + "grad_norm": 32.928611755371094, + "learning_rate": 9.30492135971588e-06, + "loss": 0.9156, + "step": 1781 + }, + { + "epoch": 1.6273972602739726, + "grad_norm": 15.53187370300293, + "learning_rate": 9.303906646372401e-06, + "loss": 0.1987, + "step": 1782 + }, + { + "epoch": 1.628310502283105, + "grad_norm": 6.939705848693848, + "learning_rate": 9.30289193302892e-06, + "loss": 0.0882, + "step": 1783 + }, + { + "epoch": 1.6292237442922375, + "grad_norm": 57.74967956542969, + "learning_rate": 9.30187721968544e-06, + "loss": 1.1374, + "step": 1784 + }, + { + "epoch": 1.6301369863013697, + "grad_norm": 13.182069778442383, + "learning_rate": 9.30086250634196e-06, + "loss": 0.1868, + "step": 1785 + }, + { + "epoch": 1.6310502283105022, + "grad_norm": 18.02824592590332, + "learning_rate": 9.299847792998478e-06, + "loss": 0.2901, + "step": 1786 + }, + { + "epoch": 1.6319634703196346, + "grad_norm": 14.722010612487793, + "learning_rate": 9.298833079654999e-06, + "loss": 0.2252, + "step": 1787 + }, + { + "epoch": 1.632876712328767, + "grad_norm": 15.555293083190918, + "learning_rate": 9.297818366311518e-06, + "loss": 0.2436, + "step": 1788 + }, + { + "epoch": 1.6337899543378995, + "grad_norm": 50.9629020690918, + "learning_rate": 9.296803652968036e-06, + "loss": 1.221, + "step": 1789 + }, + { + "epoch": 1.634703196347032, + "grad_norm": 3.5206987857818604, + "learning_rate": 9.295788939624557e-06, + "loss": 0.03, + "step": 1790 + }, + { + "epoch": 1.6356164383561644, + "grad_norm": 14.337251663208008, + "learning_rate": 9.294774226281076e-06, + "loss": 0.12, + "step": 1791 + }, + { + "epoch": 1.636529680365297, + "grad_norm": 5.609238147735596, + "learning_rate": 9.293759512937596e-06, + "loss": 0.0622, + "step": 1792 + }, + { + "epoch": 1.6374429223744293, + "grad_norm": 46.65634536743164, + "learning_rate": 9.292744799594115e-06, + "loss": 0.8142, + "step": 1793 + }, + { + "epoch": 1.6383561643835618, + "grad_norm": 50.55832290649414, + "learning_rate": 9.291730086250636e-06, + "loss": 1.3307, + "step": 1794 + }, + { + "epoch": 1.639269406392694, + "grad_norm": 71.38203430175781, + "learning_rate": 9.290715372907155e-06, + "loss": 4.278, + "step": 1795 + }, + { + "epoch": 1.6401826484018265, + "grad_norm": 84.5038833618164, + "learning_rate": 9.289700659563673e-06, + "loss": 2.8982, + "step": 1796 + }, + { + "epoch": 1.641095890410959, + "grad_norm": 3.9114365577697754, + "learning_rate": 9.288685946220194e-06, + "loss": 0.0365, + "step": 1797 + }, + { + "epoch": 1.6420091324200912, + "grad_norm": 6.063363075256348, + "learning_rate": 9.287671232876713e-06, + "loss": 0.0768, + "step": 1798 + }, + { + "epoch": 1.6429223744292236, + "grad_norm": 26.000301361083984, + "learning_rate": 9.286656519533233e-06, + "loss": 0.3748, + "step": 1799 + }, + { + "epoch": 1.643835616438356, + "grad_norm": 4.718715667724609, + "learning_rate": 9.285641806189752e-06, + "loss": 0.0662, + "step": 1800 + }, + { + "epoch": 1.6447488584474885, + "grad_norm": 27.99122428894043, + "learning_rate": 9.284627092846271e-06, + "loss": 0.576, + "step": 1801 + }, + { + "epoch": 1.645662100456621, + "grad_norm": 18.53284454345703, + "learning_rate": 9.283612379502792e-06, + "loss": 0.3167, + "step": 1802 + }, + { + "epoch": 1.6465753424657534, + "grad_norm": 24.91457176208496, + "learning_rate": 9.28259766615931e-06, + "loss": 0.4579, + "step": 1803 + }, + { + "epoch": 1.6474885844748859, + "grad_norm": 21.282194137573242, + "learning_rate": 9.281582952815831e-06, + "loss": 0.3663, + "step": 1804 + }, + { + "epoch": 1.6484018264840183, + "grad_norm": 50.61882400512695, + "learning_rate": 9.28056823947235e-06, + "loss": 0.8111, + "step": 1805 + }, + { + "epoch": 1.6493150684931508, + "grad_norm": 61.768150329589844, + "learning_rate": 9.279553526128869e-06, + "loss": 0.9359, + "step": 1806 + }, + { + "epoch": 1.6502283105022832, + "grad_norm": 20.30813980102539, + "learning_rate": 9.27853881278539e-06, + "loss": 0.1804, + "step": 1807 + }, + { + "epoch": 1.6511415525114155, + "grad_norm": 13.826722145080566, + "learning_rate": 9.277524099441908e-06, + "loss": 0.1724, + "step": 1808 + }, + { + "epoch": 1.652054794520548, + "grad_norm": 45.621028900146484, + "learning_rate": 9.276509386098429e-06, + "loss": 0.7576, + "step": 1809 + }, + { + "epoch": 1.6529680365296804, + "grad_norm": 1.8096333742141724, + "learning_rate": 9.275494672754947e-06, + "loss": 0.0181, + "step": 1810 + }, + { + "epoch": 1.6538812785388128, + "grad_norm": 58.16930389404297, + "learning_rate": 9.274479959411466e-06, + "loss": 1.3325, + "step": 1811 + }, + { + "epoch": 1.654794520547945, + "grad_norm": 22.640851974487305, + "learning_rate": 9.273465246067987e-06, + "loss": 0.2293, + "step": 1812 + }, + { + "epoch": 1.6557077625570775, + "grad_norm": 51.8997688293457, + "learning_rate": 9.272450532724506e-06, + "loss": 1.262, + "step": 1813 + }, + { + "epoch": 1.65662100456621, + "grad_norm": 25.547183990478516, + "learning_rate": 9.271435819381026e-06, + "loss": 0.4328, + "step": 1814 + }, + { + "epoch": 1.6575342465753424, + "grad_norm": 2.017742872238159, + "learning_rate": 9.270421106037545e-06, + "loss": 0.0276, + "step": 1815 + }, + { + "epoch": 1.6584474885844749, + "grad_norm": 5.049849033355713, + "learning_rate": 9.269406392694064e-06, + "loss": 0.0425, + "step": 1816 + }, + { + "epoch": 1.6593607305936073, + "grad_norm": 9.365861892700195, + "learning_rate": 9.268391679350584e-06, + "loss": 0.0973, + "step": 1817 + }, + { + "epoch": 1.6602739726027398, + "grad_norm": 3.696726083755493, + "learning_rate": 9.267376966007103e-06, + "loss": 0.0536, + "step": 1818 + }, + { + "epoch": 1.6611872146118722, + "grad_norm": 4.552874565124512, + "learning_rate": 9.266362252663624e-06, + "loss": 0.0505, + "step": 1819 + }, + { + "epoch": 1.6621004566210047, + "grad_norm": 32.94573974609375, + "learning_rate": 9.265347539320143e-06, + "loss": 0.5871, + "step": 1820 + }, + { + "epoch": 1.6630136986301371, + "grad_norm": 28.14301872253418, + "learning_rate": 9.264332825976662e-06, + "loss": 0.3624, + "step": 1821 + }, + { + "epoch": 1.6639269406392694, + "grad_norm": 2.3041322231292725, + "learning_rate": 9.263318112633182e-06, + "loss": 0.0307, + "step": 1822 + }, + { + "epoch": 1.6648401826484018, + "grad_norm": 2.0049729347229004, + "learning_rate": 9.262303399289701e-06, + "loss": 0.0232, + "step": 1823 + }, + { + "epoch": 1.6657534246575343, + "grad_norm": 33.63010025024414, + "learning_rate": 9.261288685946221e-06, + "loss": 0.4271, + "step": 1824 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 60.42586135864258, + "learning_rate": 9.26027397260274e-06, + "loss": 0.9306, + "step": 1825 + }, + { + "epoch": 1.667579908675799, + "grad_norm": 37.342071533203125, + "learning_rate": 9.25925925925926e-06, + "loss": 0.5281, + "step": 1826 + }, + { + "epoch": 1.6684931506849314, + "grad_norm": 7.190488338470459, + "learning_rate": 9.25824454591578e-06, + "loss": 0.1153, + "step": 1827 + }, + { + "epoch": 1.6694063926940639, + "grad_norm": 2.1529181003570557, + "learning_rate": 9.257229832572299e-06, + "loss": 0.0213, + "step": 1828 + }, + { + "epoch": 1.6703196347031963, + "grad_norm": 15.884522438049316, + "learning_rate": 9.256215119228819e-06, + "loss": 0.1758, + "step": 1829 + }, + { + "epoch": 1.6712328767123288, + "grad_norm": 43.6501350402832, + "learning_rate": 9.255200405885338e-06, + "loss": 0.4945, + "step": 1830 + }, + { + "epoch": 1.6721461187214612, + "grad_norm": 17.16789436340332, + "learning_rate": 9.254185692541857e-06, + "loss": 0.2352, + "step": 1831 + }, + { + "epoch": 1.6730593607305937, + "grad_norm": 38.093868255615234, + "learning_rate": 9.253170979198377e-06, + "loss": 0.5202, + "step": 1832 + }, + { + "epoch": 1.6739726027397261, + "grad_norm": 0.15458287298679352, + "learning_rate": 9.252156265854896e-06, + "loss": 0.0012, + "step": 1833 + }, + { + "epoch": 1.6748858447488586, + "grad_norm": 51.6360969543457, + "learning_rate": 9.251141552511417e-06, + "loss": 1.3261, + "step": 1834 + }, + { + "epoch": 1.6757990867579908, + "grad_norm": 1.3653427362442017, + "learning_rate": 9.250126839167936e-06, + "loss": 0.0162, + "step": 1835 + }, + { + "epoch": 1.6767123287671233, + "grad_norm": 24.513952255249023, + "learning_rate": 9.249112125824456e-06, + "loss": 0.3682, + "step": 1836 + }, + { + "epoch": 1.6776255707762557, + "grad_norm": 35.86574172973633, + "learning_rate": 9.248097412480975e-06, + "loss": 0.5494, + "step": 1837 + }, + { + "epoch": 1.678538812785388, + "grad_norm": 72.22162628173828, + "learning_rate": 9.247082699137494e-06, + "loss": 3.9638, + "step": 1838 + }, + { + "epoch": 1.6794520547945204, + "grad_norm": 23.4679012298584, + "learning_rate": 9.246067985794014e-06, + "loss": 0.2737, + "step": 1839 + }, + { + "epoch": 1.6803652968036529, + "grad_norm": 6.770679473876953, + "learning_rate": 9.245053272450533e-06, + "loss": 0.1081, + "step": 1840 + }, + { + "epoch": 1.6812785388127853, + "grad_norm": 77.14942932128906, + "learning_rate": 9.244038559107052e-06, + "loss": 2.5911, + "step": 1841 + }, + { + "epoch": 1.6821917808219178, + "grad_norm": 7.051555633544922, + "learning_rate": 9.243023845763573e-06, + "loss": 0.0786, + "step": 1842 + }, + { + "epoch": 1.6831050228310502, + "grad_norm": 28.346994400024414, + "learning_rate": 9.242009132420093e-06, + "loss": 0.378, + "step": 1843 + }, + { + "epoch": 1.6840182648401827, + "grad_norm": 24.94316864013672, + "learning_rate": 9.240994419076612e-06, + "loss": 0.2505, + "step": 1844 + }, + { + "epoch": 1.6849315068493151, + "grad_norm": 23.739004135131836, + "learning_rate": 9.23997970573313e-06, + "loss": 0.3134, + "step": 1845 + }, + { + "epoch": 1.6858447488584476, + "grad_norm": 9.941673278808594, + "learning_rate": 9.238964992389651e-06, + "loss": 0.1119, + "step": 1846 + }, + { + "epoch": 1.68675799086758, + "grad_norm": 36.98472213745117, + "learning_rate": 9.23795027904617e-06, + "loss": 0.6981, + "step": 1847 + }, + { + "epoch": 1.6876712328767123, + "grad_norm": 36.553627014160156, + "learning_rate": 9.236935565702689e-06, + "loss": 0.945, + "step": 1848 + }, + { + "epoch": 1.6885844748858447, + "grad_norm": 4.553706645965576, + "learning_rate": 9.23592085235921e-06, + "loss": 0.0476, + "step": 1849 + }, + { + "epoch": 1.6894977168949772, + "grad_norm": 8.74838924407959, + "learning_rate": 9.234906139015728e-06, + "loss": 0.108, + "step": 1850 + }, + { + "epoch": 1.6904109589041096, + "grad_norm": 18.37128448486328, + "learning_rate": 9.233891425672247e-06, + "loss": 0.1485, + "step": 1851 + }, + { + "epoch": 1.6913242009132419, + "grad_norm": 10.521088600158691, + "learning_rate": 9.232876712328768e-06, + "loss": 0.0988, + "step": 1852 + }, + { + "epoch": 1.6922374429223743, + "grad_norm": 87.95681762695312, + "learning_rate": 9.231861998985288e-06, + "loss": 3.2124, + "step": 1853 + }, + { + "epoch": 1.6931506849315068, + "grad_norm": 27.310626983642578, + "learning_rate": 9.230847285641807e-06, + "loss": 0.3554, + "step": 1854 + }, + { + "epoch": 1.6940639269406392, + "grad_norm": 6.194880485534668, + "learning_rate": 9.229832572298326e-06, + "loss": 0.0457, + "step": 1855 + }, + { + "epoch": 1.6949771689497717, + "grad_norm": 12.369104385375977, + "learning_rate": 9.228817858954847e-06, + "loss": 0.187, + "step": 1856 + }, + { + "epoch": 1.6958904109589041, + "grad_norm": 78.02377319335938, + "learning_rate": 9.227803145611365e-06, + "loss": 1.7571, + "step": 1857 + }, + { + "epoch": 1.6968036529680366, + "grad_norm": 99.61138153076172, + "learning_rate": 9.226788432267884e-06, + "loss": 6.8577, + "step": 1858 + }, + { + "epoch": 1.697716894977169, + "grad_norm": 55.48210906982422, + "learning_rate": 9.225773718924405e-06, + "loss": 2.691, + "step": 1859 + }, + { + "epoch": 1.6986301369863015, + "grad_norm": 34.219627380371094, + "learning_rate": 9.224759005580924e-06, + "loss": 0.4373, + "step": 1860 + }, + { + "epoch": 1.699543378995434, + "grad_norm": 4.740018844604492, + "learning_rate": 9.223744292237442e-06, + "loss": 0.0661, + "step": 1861 + }, + { + "epoch": 1.7004566210045662, + "grad_norm": 36.55588912963867, + "learning_rate": 9.222729578893963e-06, + "loss": 0.5522, + "step": 1862 + }, + { + "epoch": 1.7013698630136986, + "grad_norm": 8.581446647644043, + "learning_rate": 9.221714865550484e-06, + "loss": 0.0793, + "step": 1863 + }, + { + "epoch": 1.702283105022831, + "grad_norm": 21.36312484741211, + "learning_rate": 9.220700152207002e-06, + "loss": 0.5913, + "step": 1864 + }, + { + "epoch": 1.7031963470319633, + "grad_norm": 57.74395751953125, + "learning_rate": 9.219685438863521e-06, + "loss": 1.0969, + "step": 1865 + }, + { + "epoch": 1.7041095890410958, + "grad_norm": 6.691015243530273, + "learning_rate": 9.218670725520042e-06, + "loss": 0.0643, + "step": 1866 + }, + { + "epoch": 1.7050228310502282, + "grad_norm": 12.74386978149414, + "learning_rate": 9.21765601217656e-06, + "loss": 0.2383, + "step": 1867 + }, + { + "epoch": 1.7059360730593607, + "grad_norm": 18.798978805541992, + "learning_rate": 9.21664129883308e-06, + "loss": 0.2438, + "step": 1868 + }, + { + "epoch": 1.7068493150684931, + "grad_norm": 38.78778076171875, + "learning_rate": 9.2156265854896e-06, + "loss": 0.6016, + "step": 1869 + }, + { + "epoch": 1.7077625570776256, + "grad_norm": 12.517702102661133, + "learning_rate": 9.21461187214612e-06, + "loss": 0.2221, + "step": 1870 + }, + { + "epoch": 1.708675799086758, + "grad_norm": 59.607967376708984, + "learning_rate": 9.213597158802638e-06, + "loss": 0.6806, + "step": 1871 + }, + { + "epoch": 1.7095890410958905, + "grad_norm": 3.067878484725952, + "learning_rate": 9.212582445459158e-06, + "loss": 0.0313, + "step": 1872 + }, + { + "epoch": 1.710502283105023, + "grad_norm": 18.44402503967285, + "learning_rate": 9.211567732115679e-06, + "loss": 0.2326, + "step": 1873 + }, + { + "epoch": 1.7114155251141554, + "grad_norm": 3.6779115200042725, + "learning_rate": 9.210553018772198e-06, + "loss": 0.0255, + "step": 1874 + }, + { + "epoch": 1.7123287671232876, + "grad_norm": 64.3386001586914, + "learning_rate": 9.209538305428716e-06, + "loss": 1.9308, + "step": 1875 + }, + { + "epoch": 1.71324200913242, + "grad_norm": 4.581904411315918, + "learning_rate": 9.208523592085237e-06, + "loss": 0.0536, + "step": 1876 + }, + { + "epoch": 1.7141552511415525, + "grad_norm": 29.24956512451172, + "learning_rate": 9.207508878741756e-06, + "loss": 0.4739, + "step": 1877 + }, + { + "epoch": 1.7150684931506848, + "grad_norm": 25.95320701599121, + "learning_rate": 9.206494165398275e-06, + "loss": 0.3311, + "step": 1878 + }, + { + "epoch": 1.7159817351598172, + "grad_norm": 41.02568817138672, + "learning_rate": 9.205479452054795e-06, + "loss": 0.4236, + "step": 1879 + }, + { + "epoch": 1.7168949771689497, + "grad_norm": 49.36776351928711, + "learning_rate": 9.204464738711316e-06, + "loss": 1.0745, + "step": 1880 + }, + { + "epoch": 1.7178082191780821, + "grad_norm": 49.92814254760742, + "learning_rate": 9.203450025367835e-06, + "loss": 1.4461, + "step": 1881 + }, + { + "epoch": 1.7187214611872146, + "grad_norm": 23.710285186767578, + "learning_rate": 9.202435312024353e-06, + "loss": 0.3656, + "step": 1882 + }, + { + "epoch": 1.719634703196347, + "grad_norm": 47.05572509765625, + "learning_rate": 9.201420598680874e-06, + "loss": 1.2656, + "step": 1883 + }, + { + "epoch": 1.7205479452054795, + "grad_norm": 61.87148666381836, + "learning_rate": 9.200405885337393e-06, + "loss": 1.6875, + "step": 1884 + }, + { + "epoch": 1.721461187214612, + "grad_norm": 7.1731343269348145, + "learning_rate": 9.199391171993912e-06, + "loss": 0.0863, + "step": 1885 + }, + { + "epoch": 1.7223744292237444, + "grad_norm": 39.880821228027344, + "learning_rate": 9.198376458650432e-06, + "loss": 0.9619, + "step": 1886 + }, + { + "epoch": 1.7232876712328768, + "grad_norm": 35.586578369140625, + "learning_rate": 9.197361745306953e-06, + "loss": 0.5833, + "step": 1887 + }, + { + "epoch": 1.724200913242009, + "grad_norm": 26.742828369140625, + "learning_rate": 9.19634703196347e-06, + "loss": 0.4928, + "step": 1888 + }, + { + "epoch": 1.7251141552511415, + "grad_norm": 56.261783599853516, + "learning_rate": 9.19533231861999e-06, + "loss": 0.9461, + "step": 1889 + }, + { + "epoch": 1.726027397260274, + "grad_norm": 108.25684356689453, + "learning_rate": 9.194317605276511e-06, + "loss": 0.9836, + "step": 1890 + }, + { + "epoch": 1.7269406392694064, + "grad_norm": 8.400162696838379, + "learning_rate": 9.19330289193303e-06, + "loss": 0.1169, + "step": 1891 + }, + { + "epoch": 1.7278538812785387, + "grad_norm": 38.74379348754883, + "learning_rate": 9.192288178589549e-06, + "loss": 0.8005, + "step": 1892 + }, + { + "epoch": 1.7287671232876711, + "grad_norm": 16.71647071838379, + "learning_rate": 9.19127346524607e-06, + "loss": 0.2625, + "step": 1893 + }, + { + "epoch": 1.7296803652968036, + "grad_norm": 5.197293758392334, + "learning_rate": 9.190258751902588e-06, + "loss": 0.0471, + "step": 1894 + }, + { + "epoch": 1.730593607305936, + "grad_norm": 10.194311141967773, + "learning_rate": 9.189244038559107e-06, + "loss": 0.1391, + "step": 1895 + }, + { + "epoch": 1.7315068493150685, + "grad_norm": 78.04730224609375, + "learning_rate": 9.188229325215627e-06, + "loss": 2.5983, + "step": 1896 + }, + { + "epoch": 1.732420091324201, + "grad_norm": 28.764789581298828, + "learning_rate": 9.187214611872148e-06, + "loss": 0.3106, + "step": 1897 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 1.3356714248657227, + "learning_rate": 9.186199898528667e-06, + "loss": 0.0163, + "step": 1898 + }, + { + "epoch": 1.7342465753424658, + "grad_norm": 46.74583053588867, + "learning_rate": 9.185185185185186e-06, + "loss": 1.3728, + "step": 1899 + }, + { + "epoch": 1.7351598173515983, + "grad_norm": 71.4071044921875, + "learning_rate": 9.184170471841706e-06, + "loss": 1.5235, + "step": 1900 + }, + { + "epoch": 1.7360730593607308, + "grad_norm": 3.5475566387176514, + "learning_rate": 9.183155758498225e-06, + "loss": 0.0499, + "step": 1901 + }, + { + "epoch": 1.736986301369863, + "grad_norm": 22.058671951293945, + "learning_rate": 9.182141045154744e-06, + "loss": 0.411, + "step": 1902 + }, + { + "epoch": 1.7378995433789954, + "grad_norm": 74.29597473144531, + "learning_rate": 9.181126331811264e-06, + "loss": 3.8888, + "step": 1903 + }, + { + "epoch": 1.738812785388128, + "grad_norm": 3.048731803894043, + "learning_rate": 9.180111618467783e-06, + "loss": 0.0484, + "step": 1904 + }, + { + "epoch": 1.7397260273972601, + "grad_norm": 7.721730709075928, + "learning_rate": 9.179096905124302e-06, + "loss": 0.0684, + "step": 1905 + }, + { + "epoch": 1.7406392694063926, + "grad_norm": 20.000749588012695, + "learning_rate": 9.178082191780823e-06, + "loss": 0.1799, + "step": 1906 + }, + { + "epoch": 1.741552511415525, + "grad_norm": 11.852544784545898, + "learning_rate": 9.177067478437343e-06, + "loss": 0.1504, + "step": 1907 + }, + { + "epoch": 1.7424657534246575, + "grad_norm": 46.39354705810547, + "learning_rate": 9.176052765093862e-06, + "loss": 1.3775, + "step": 1908 + }, + { + "epoch": 1.74337899543379, + "grad_norm": 44.149375915527344, + "learning_rate": 9.175038051750381e-06, + "loss": 1.1773, + "step": 1909 + }, + { + "epoch": 1.7442922374429224, + "grad_norm": 13.558700561523438, + "learning_rate": 9.174023338406901e-06, + "loss": 0.1558, + "step": 1910 + }, + { + "epoch": 1.7452054794520548, + "grad_norm": 23.77564239501953, + "learning_rate": 9.17300862506342e-06, + "loss": 0.4407, + "step": 1911 + }, + { + "epoch": 1.7461187214611873, + "grad_norm": 16.42371368408203, + "learning_rate": 9.171993911719939e-06, + "loss": 0.3173, + "step": 1912 + }, + { + "epoch": 1.7470319634703197, + "grad_norm": 1.7326911687850952, + "learning_rate": 9.17097919837646e-06, + "loss": 0.0277, + "step": 1913 + }, + { + "epoch": 1.7479452054794522, + "grad_norm": 16.872018814086914, + "learning_rate": 9.16996448503298e-06, + "loss": 0.2427, + "step": 1914 + }, + { + "epoch": 1.7488584474885844, + "grad_norm": 65.64193725585938, + "learning_rate": 9.168949771689499e-06, + "loss": 2.3854, + "step": 1915 + }, + { + "epoch": 1.7497716894977169, + "grad_norm": 13.89633846282959, + "learning_rate": 9.167935058346018e-06, + "loss": 0.1957, + "step": 1916 + }, + { + "epoch": 1.7506849315068493, + "grad_norm": 33.74470901489258, + "learning_rate": 9.166920345002538e-06, + "loss": 0.6258, + "step": 1917 + }, + { + "epoch": 1.7515981735159816, + "grad_norm": 6.10433292388916, + "learning_rate": 9.165905631659057e-06, + "loss": 0.0967, + "step": 1918 + }, + { + "epoch": 1.752511415525114, + "grad_norm": 20.445646286010742, + "learning_rate": 9.164890918315576e-06, + "loss": 0.3635, + "step": 1919 + }, + { + "epoch": 1.7534246575342465, + "grad_norm": 44.33466339111328, + "learning_rate": 9.163876204972097e-06, + "loss": 2.4905, + "step": 1920 + }, + { + "epoch": 1.754337899543379, + "grad_norm": 43.17270278930664, + "learning_rate": 9.162861491628615e-06, + "loss": 1.0057, + "step": 1921 + }, + { + "epoch": 1.7552511415525114, + "grad_norm": 7.2655253410339355, + "learning_rate": 9.161846778285134e-06, + "loss": 0.0984, + "step": 1922 + }, + { + "epoch": 1.7561643835616438, + "grad_norm": 6.635156154632568, + "learning_rate": 9.160832064941655e-06, + "loss": 0.1202, + "step": 1923 + }, + { + "epoch": 1.7570776255707763, + "grad_norm": 39.477882385253906, + "learning_rate": 9.159817351598175e-06, + "loss": 0.8113, + "step": 1924 + }, + { + "epoch": 1.7579908675799087, + "grad_norm": 6.741147518157959, + "learning_rate": 9.158802638254694e-06, + "loss": 0.1031, + "step": 1925 + }, + { + "epoch": 1.7589041095890412, + "grad_norm": 24.234859466552734, + "learning_rate": 9.157787924911213e-06, + "loss": 0.4583, + "step": 1926 + }, + { + "epoch": 1.7598173515981737, + "grad_norm": 12.373668670654297, + "learning_rate": 9.156773211567734e-06, + "loss": 0.1721, + "step": 1927 + }, + { + "epoch": 1.7607305936073059, + "grad_norm": 35.06208419799805, + "learning_rate": 9.155758498224252e-06, + "loss": 0.6245, + "step": 1928 + }, + { + "epoch": 1.7616438356164383, + "grad_norm": 13.248064994812012, + "learning_rate": 9.154743784880771e-06, + "loss": 0.2398, + "step": 1929 + }, + { + "epoch": 1.7625570776255708, + "grad_norm": 71.98290252685547, + "learning_rate": 9.153729071537292e-06, + "loss": 2.093, + "step": 1930 + }, + { + "epoch": 1.7634703196347032, + "grad_norm": 38.720802307128906, + "learning_rate": 9.15271435819381e-06, + "loss": 0.348, + "step": 1931 + }, + { + "epoch": 1.7643835616438355, + "grad_norm": 11.92423152923584, + "learning_rate": 9.15169964485033e-06, + "loss": 0.152, + "step": 1932 + }, + { + "epoch": 1.765296803652968, + "grad_norm": 29.853839874267578, + "learning_rate": 9.15068493150685e-06, + "loss": 0.344, + "step": 1933 + }, + { + "epoch": 1.7662100456621004, + "grad_norm": 16.946006774902344, + "learning_rate": 9.14967021816337e-06, + "loss": 0.2391, + "step": 1934 + }, + { + "epoch": 1.7671232876712328, + "grad_norm": 21.455095291137695, + "learning_rate": 9.14865550481989e-06, + "loss": 0.4216, + "step": 1935 + }, + { + "epoch": 1.7680365296803653, + "grad_norm": 15.675932884216309, + "learning_rate": 9.147640791476408e-06, + "loss": 0.253, + "step": 1936 + }, + { + "epoch": 1.7689497716894977, + "grad_norm": 20.049278259277344, + "learning_rate": 9.146626078132929e-06, + "loss": 0.3537, + "step": 1937 + }, + { + "epoch": 1.7698630136986302, + "grad_norm": 28.72800636291504, + "learning_rate": 9.145611364789448e-06, + "loss": 0.496, + "step": 1938 + }, + { + "epoch": 1.7707762557077626, + "grad_norm": 35.22303771972656, + "learning_rate": 9.144596651445967e-06, + "loss": 0.8003, + "step": 1939 + }, + { + "epoch": 1.771689497716895, + "grad_norm": 11.705825805664062, + "learning_rate": 9.143581938102487e-06, + "loss": 0.2043, + "step": 1940 + }, + { + "epoch": 1.7726027397260276, + "grad_norm": 53.10376739501953, + "learning_rate": 9.142567224759006e-06, + "loss": 1.052, + "step": 1941 + }, + { + "epoch": 1.7735159817351598, + "grad_norm": 59.93878173828125, + "learning_rate": 9.141552511415526e-06, + "loss": 0.5172, + "step": 1942 + }, + { + "epoch": 1.7744292237442922, + "grad_norm": 56.488704681396484, + "learning_rate": 9.140537798072045e-06, + "loss": 1.6115, + "step": 1943 + }, + { + "epoch": 1.7753424657534247, + "grad_norm": 28.992332458496094, + "learning_rate": 9.139523084728566e-06, + "loss": 0.4459, + "step": 1944 + }, + { + "epoch": 1.776255707762557, + "grad_norm": 3.5406744480133057, + "learning_rate": 9.138508371385085e-06, + "loss": 0.057, + "step": 1945 + }, + { + "epoch": 1.7771689497716894, + "grad_norm": 20.325593948364258, + "learning_rate": 9.137493658041604e-06, + "loss": 0.3641, + "step": 1946 + }, + { + "epoch": 1.7780821917808218, + "grad_norm": 2.5948898792266846, + "learning_rate": 9.136478944698124e-06, + "loss": 0.0348, + "step": 1947 + }, + { + "epoch": 1.7789954337899543, + "grad_norm": 100.69123840332031, + "learning_rate": 9.135464231354643e-06, + "loss": 3.349, + "step": 1948 + }, + { + "epoch": 1.7799086757990867, + "grad_norm": 16.857770919799805, + "learning_rate": 9.134449518011162e-06, + "loss": 0.2538, + "step": 1949 + }, + { + "epoch": 1.7808219178082192, + "grad_norm": 8.248610496520996, + "learning_rate": 9.133434804667682e-06, + "loss": 0.1092, + "step": 1950 + }, + { + "epoch": 1.7817351598173516, + "grad_norm": 20.496505737304688, + "learning_rate": 9.132420091324201e-06, + "loss": 0.2881, + "step": 1951 + }, + { + "epoch": 1.782648401826484, + "grad_norm": 4.066421031951904, + "learning_rate": 9.131405377980722e-06, + "loss": 0.0451, + "step": 1952 + }, + { + "epoch": 1.7835616438356166, + "grad_norm": 4.960690498352051, + "learning_rate": 9.13039066463724e-06, + "loss": 0.0428, + "step": 1953 + }, + { + "epoch": 1.784474885844749, + "grad_norm": 10.683622360229492, + "learning_rate": 9.129375951293761e-06, + "loss": 0.1198, + "step": 1954 + }, + { + "epoch": 1.7853881278538812, + "grad_norm": 22.319787979125977, + "learning_rate": 9.12836123795028e-06, + "loss": 0.3497, + "step": 1955 + }, + { + "epoch": 1.7863013698630137, + "grad_norm": 0.4250911474227905, + "learning_rate": 9.127346524606799e-06, + "loss": 0.0042, + "step": 1956 + }, + { + "epoch": 1.7872146118721461, + "grad_norm": 31.278839111328125, + "learning_rate": 9.12633181126332e-06, + "loss": 0.5355, + "step": 1957 + }, + { + "epoch": 1.7881278538812784, + "grad_norm": 69.46388244628906, + "learning_rate": 9.125317097919838e-06, + "loss": 3.0911, + "step": 1958 + }, + { + "epoch": 1.7890410958904108, + "grad_norm": 16.187314987182617, + "learning_rate": 9.124302384576359e-06, + "loss": 0.1992, + "step": 1959 + }, + { + "epoch": 1.7899543378995433, + "grad_norm": 25.220584869384766, + "learning_rate": 9.123287671232878e-06, + "loss": 0.7933, + "step": 1960 + }, + { + "epoch": 1.7908675799086757, + "grad_norm": 57.38944625854492, + "learning_rate": 9.122272957889396e-06, + "loss": 1.8879, + "step": 1961 + }, + { + "epoch": 1.7917808219178082, + "grad_norm": 59.5839958190918, + "learning_rate": 9.121258244545917e-06, + "loss": 0.963, + "step": 1962 + }, + { + "epoch": 1.7926940639269406, + "grad_norm": 6.68317985534668, + "learning_rate": 9.120243531202436e-06, + "loss": 0.0895, + "step": 1963 + }, + { + "epoch": 1.793607305936073, + "grad_norm": 8.862060546875, + "learning_rate": 9.119228817858956e-06, + "loss": 0.1331, + "step": 1964 + }, + { + "epoch": 1.7945205479452055, + "grad_norm": 28.14803695678711, + "learning_rate": 9.118214104515475e-06, + "loss": 0.6832, + "step": 1965 + }, + { + "epoch": 1.795433789954338, + "grad_norm": 28.824853897094727, + "learning_rate": 9.117199391171994e-06, + "loss": 0.2709, + "step": 1966 + }, + { + "epoch": 1.7963470319634705, + "grad_norm": 14.853056907653809, + "learning_rate": 9.116184677828515e-06, + "loss": 0.2106, + "step": 1967 + }, + { + "epoch": 1.7972602739726027, + "grad_norm": 13.969993591308594, + "learning_rate": 9.115169964485033e-06, + "loss": 0.2686, + "step": 1968 + }, + { + "epoch": 1.7981735159817351, + "grad_norm": 36.67208480834961, + "learning_rate": 9.114155251141554e-06, + "loss": 1.0861, + "step": 1969 + }, + { + "epoch": 1.7990867579908676, + "grad_norm": 43.31771469116211, + "learning_rate": 9.113140537798073e-06, + "loss": 1.0899, + "step": 1970 + }, + { + "epoch": 1.8, + "grad_norm": 11.946415901184082, + "learning_rate": 9.112125824454592e-06, + "loss": 0.1644, + "step": 1971 + }, + { + "epoch": 1.8009132420091323, + "grad_norm": 19.902257919311523, + "learning_rate": 9.111111111111112e-06, + "loss": 0.3128, + "step": 1972 + }, + { + "epoch": 1.8018264840182647, + "grad_norm": 1.1602364778518677, + "learning_rate": 9.110096397767631e-06, + "loss": 0.0121, + "step": 1973 + }, + { + "epoch": 1.8027397260273972, + "grad_norm": 9.811938285827637, + "learning_rate": 9.109081684424152e-06, + "loss": 0.1504, + "step": 1974 + }, + { + "epoch": 1.8036529680365296, + "grad_norm": 2.1686532497406006, + "learning_rate": 9.10806697108067e-06, + "loss": 0.0372, + "step": 1975 + }, + { + "epoch": 1.804566210045662, + "grad_norm": 68.3234634399414, + "learning_rate": 9.10705225773719e-06, + "loss": 1.635, + "step": 1976 + }, + { + "epoch": 1.8054794520547945, + "grad_norm": 13.412549018859863, + "learning_rate": 9.10603754439371e-06, + "loss": 0.2009, + "step": 1977 + }, + { + "epoch": 1.806392694063927, + "grad_norm": 40.58222579956055, + "learning_rate": 9.105022831050229e-06, + "loss": 0.6767, + "step": 1978 + }, + { + "epoch": 1.8073059360730594, + "grad_norm": 36.06586837768555, + "learning_rate": 9.10400811770675e-06, + "loss": 0.4139, + "step": 1979 + }, + { + "epoch": 1.808219178082192, + "grad_norm": 9.179815292358398, + "learning_rate": 9.102993404363268e-06, + "loss": 0.1329, + "step": 1980 + }, + { + "epoch": 1.8091324200913244, + "grad_norm": 19.118297576904297, + "learning_rate": 9.101978691019787e-06, + "loss": 0.3187, + "step": 1981 + }, + { + "epoch": 1.8100456621004566, + "grad_norm": 45.98594284057617, + "learning_rate": 9.100963977676307e-06, + "loss": 2.879, + "step": 1982 + }, + { + "epoch": 1.810958904109589, + "grad_norm": 2.590245008468628, + "learning_rate": 9.099949264332826e-06, + "loss": 0.0359, + "step": 1983 + }, + { + "epoch": 1.8118721461187215, + "grad_norm": 17.265804290771484, + "learning_rate": 9.098934550989347e-06, + "loss": 0.1745, + "step": 1984 + }, + { + "epoch": 1.8127853881278537, + "grad_norm": 13.547858238220215, + "learning_rate": 9.097919837645866e-06, + "loss": 0.1354, + "step": 1985 + }, + { + "epoch": 1.8136986301369862, + "grad_norm": 3.5078399181365967, + "learning_rate": 9.096905124302386e-06, + "loss": 0.0415, + "step": 1986 + }, + { + "epoch": 1.8146118721461186, + "grad_norm": 2.7151894569396973, + "learning_rate": 9.095890410958905e-06, + "loss": 0.0244, + "step": 1987 + }, + { + "epoch": 1.815525114155251, + "grad_norm": 14.006754875183105, + "learning_rate": 9.094875697615424e-06, + "loss": 0.14, + "step": 1988 + }, + { + "epoch": 1.8164383561643835, + "grad_norm": 21.815204620361328, + "learning_rate": 9.093860984271944e-06, + "loss": 0.3214, + "step": 1989 + }, + { + "epoch": 1.817351598173516, + "grad_norm": 0.5984048843383789, + "learning_rate": 9.092846270928463e-06, + "loss": 0.0065, + "step": 1990 + }, + { + "epoch": 1.8182648401826484, + "grad_norm": 49.59217071533203, + "learning_rate": 9.091831557584982e-06, + "loss": 1.1061, + "step": 1991 + }, + { + "epoch": 1.819178082191781, + "grad_norm": 42.74799728393555, + "learning_rate": 9.090816844241503e-06, + "loss": 1.0948, + "step": 1992 + }, + { + "epoch": 1.8200913242009134, + "grad_norm": 0.4261360466480255, + "learning_rate": 9.089802130898021e-06, + "loss": 0.0038, + "step": 1993 + }, + { + "epoch": 1.8210045662100458, + "grad_norm": 21.491924285888672, + "learning_rate": 9.088787417554542e-06, + "loss": 0.3048, + "step": 1994 + }, + { + "epoch": 1.821917808219178, + "grad_norm": 18.246946334838867, + "learning_rate": 9.08777270421106e-06, + "loss": 0.2268, + "step": 1995 + }, + { + "epoch": 1.8228310502283105, + "grad_norm": 16.37883186340332, + "learning_rate": 9.086757990867581e-06, + "loss": 0.2957, + "step": 1996 + }, + { + "epoch": 1.823744292237443, + "grad_norm": 56.557273864746094, + "learning_rate": 9.0857432775241e-06, + "loss": 1.4098, + "step": 1997 + }, + { + "epoch": 1.8246575342465754, + "grad_norm": 15.22604751586914, + "learning_rate": 9.084728564180619e-06, + "loss": 0.3281, + "step": 1998 + }, + { + "epoch": 1.8255707762557076, + "grad_norm": 14.07145881652832, + "learning_rate": 9.08371385083714e-06, + "loss": 0.1838, + "step": 1999 + }, + { + "epoch": 1.82648401826484, + "grad_norm": 31.455408096313477, + "learning_rate": 9.082699137493658e-06, + "loss": 0.6266, + "step": 2000 + }, + { + "epoch": 1.8273972602739725, + "grad_norm": 70.33071899414062, + "learning_rate": 9.081684424150177e-06, + "loss": 2.7874, + "step": 2001 + }, + { + "epoch": 1.828310502283105, + "grad_norm": 46.94278335571289, + "learning_rate": 9.080669710806698e-06, + "loss": 0.9083, + "step": 2002 + }, + { + "epoch": 1.8292237442922374, + "grad_norm": 17.546323776245117, + "learning_rate": 9.079654997463218e-06, + "loss": 0.2695, + "step": 2003 + }, + { + "epoch": 1.83013698630137, + "grad_norm": 75.57548522949219, + "learning_rate": 9.078640284119737e-06, + "loss": 0.4958, + "step": 2004 + }, + { + "epoch": 1.8310502283105023, + "grad_norm": 1.4048138856887817, + "learning_rate": 9.077625570776256e-06, + "loss": 0.0166, + "step": 2005 + }, + { + "epoch": 1.8319634703196348, + "grad_norm": 34.1883544921875, + "learning_rate": 9.076610857432777e-06, + "loss": 0.6514, + "step": 2006 + }, + { + "epoch": 1.8328767123287673, + "grad_norm": 204.56472778320312, + "learning_rate": 9.075596144089295e-06, + "loss": 0.4802, + "step": 2007 + }, + { + "epoch": 1.8337899543378997, + "grad_norm": 34.878604888916016, + "learning_rate": 9.074581430745814e-06, + "loss": 0.7283, + "step": 2008 + }, + { + "epoch": 1.834703196347032, + "grad_norm": 28.823284149169922, + "learning_rate": 9.073566717402335e-06, + "loss": 0.5314, + "step": 2009 + }, + { + "epoch": 1.8356164383561644, + "grad_norm": 0.9354396462440491, + "learning_rate": 9.072552004058854e-06, + "loss": 0.0152, + "step": 2010 + }, + { + "epoch": 1.8365296803652968, + "grad_norm": 46.362911224365234, + "learning_rate": 9.071537290715373e-06, + "loss": 1.1101, + "step": 2011 + }, + { + "epoch": 1.837442922374429, + "grad_norm": 74.9817123413086, + "learning_rate": 9.070522577371893e-06, + "loss": 3.5092, + "step": 2012 + }, + { + "epoch": 1.8383561643835615, + "grad_norm": 2.50156569480896, + "learning_rate": 9.069507864028414e-06, + "loss": 0.0369, + "step": 2013 + }, + { + "epoch": 1.839269406392694, + "grad_norm": 24.64972686767578, + "learning_rate": 9.068493150684932e-06, + "loss": 0.4415, + "step": 2014 + }, + { + "epoch": 1.8401826484018264, + "grad_norm": 65.9261245727539, + "learning_rate": 9.067478437341451e-06, + "loss": 1.9971, + "step": 2015 + }, + { + "epoch": 1.841095890410959, + "grad_norm": 40.373504638671875, + "learning_rate": 9.066463723997972e-06, + "loss": 0.8046, + "step": 2016 + }, + { + "epoch": 1.8420091324200913, + "grad_norm": 15.681282997131348, + "learning_rate": 9.06544901065449e-06, + "loss": 0.3185, + "step": 2017 + }, + { + "epoch": 1.8429223744292238, + "grad_norm": 40.13237762451172, + "learning_rate": 9.06443429731101e-06, + "loss": 0.5315, + "step": 2018 + }, + { + "epoch": 1.8438356164383563, + "grad_norm": 26.856840133666992, + "learning_rate": 9.06341958396753e-06, + "loss": 0.3363, + "step": 2019 + }, + { + "epoch": 1.8447488584474887, + "grad_norm": 15.281487464904785, + "learning_rate": 9.062404870624049e-06, + "loss": 0.2018, + "step": 2020 + }, + { + "epoch": 1.8456621004566212, + "grad_norm": 3.9771058559417725, + "learning_rate": 9.061390157280568e-06, + "loss": 0.0528, + "step": 2021 + }, + { + "epoch": 1.8465753424657534, + "grad_norm": 39.555152893066406, + "learning_rate": 9.060375443937088e-06, + "loss": 0.9436, + "step": 2022 + }, + { + "epoch": 1.8474885844748858, + "grad_norm": 100.9719467163086, + "learning_rate": 9.059360730593609e-06, + "loss": 2.7357, + "step": 2023 + }, + { + "epoch": 1.8484018264840183, + "grad_norm": 65.73405456542969, + "learning_rate": 9.058346017250128e-06, + "loss": 2.3533, + "step": 2024 + }, + { + "epoch": 1.8493150684931505, + "grad_norm": 8.380437850952148, + "learning_rate": 9.057331303906647e-06, + "loss": 0.0821, + "step": 2025 + }, + { + "epoch": 1.850228310502283, + "grad_norm": 55.67951202392578, + "learning_rate": 9.056316590563167e-06, + "loss": 1.076, + "step": 2026 + }, + { + "epoch": 1.8511415525114154, + "grad_norm": 24.52341079711914, + "learning_rate": 9.055301877219686e-06, + "loss": 0.3405, + "step": 2027 + }, + { + "epoch": 1.8520547945205479, + "grad_norm": 55.337501525878906, + "learning_rate": 9.054287163876205e-06, + "loss": 1.8448, + "step": 2028 + }, + { + "epoch": 1.8529680365296803, + "grad_norm": 13.22169017791748, + "learning_rate": 9.053272450532725e-06, + "loss": 0.278, + "step": 2029 + }, + { + "epoch": 1.8538812785388128, + "grad_norm": 44.12617492675781, + "learning_rate": 9.052257737189246e-06, + "loss": 0.958, + "step": 2030 + }, + { + "epoch": 1.8547945205479452, + "grad_norm": 17.560640335083008, + "learning_rate": 9.051243023845763e-06, + "loss": 0.2258, + "step": 2031 + }, + { + "epoch": 1.8557077625570777, + "grad_norm": 12.068150520324707, + "learning_rate": 9.050228310502284e-06, + "loss": 0.1446, + "step": 2032 + }, + { + "epoch": 1.8566210045662102, + "grad_norm": 28.809377670288086, + "learning_rate": 9.049213597158804e-06, + "loss": 0.4602, + "step": 2033 + }, + { + "epoch": 1.8575342465753426, + "grad_norm": 7.441327095031738, + "learning_rate": 9.048198883815323e-06, + "loss": 0.0594, + "step": 2034 + }, + { + "epoch": 1.8584474885844748, + "grad_norm": 69.93152618408203, + "learning_rate": 9.047184170471842e-06, + "loss": 1.5106, + "step": 2035 + }, + { + "epoch": 1.8593607305936073, + "grad_norm": 21.39373016357422, + "learning_rate": 9.046169457128362e-06, + "loss": 0.2231, + "step": 2036 + }, + { + "epoch": 1.8602739726027397, + "grad_norm": 64.36167907714844, + "learning_rate": 9.045154743784881e-06, + "loss": 1.9066, + "step": 2037 + }, + { + "epoch": 1.8611872146118722, + "grad_norm": 50.494384765625, + "learning_rate": 9.0441400304414e-06, + "loss": 0.5917, + "step": 2038 + }, + { + "epoch": 1.8621004566210044, + "grad_norm": 18.602489471435547, + "learning_rate": 9.04312531709792e-06, + "loss": 0.2501, + "step": 2039 + }, + { + "epoch": 1.8630136986301369, + "grad_norm": 27.38548469543457, + "learning_rate": 9.042110603754441e-06, + "loss": 0.5158, + "step": 2040 + }, + { + "epoch": 1.8639269406392693, + "grad_norm": 28.934738159179688, + "learning_rate": 9.04109589041096e-06, + "loss": 0.5284, + "step": 2041 + }, + { + "epoch": 1.8648401826484018, + "grad_norm": 52.0392951965332, + "learning_rate": 9.040081177067479e-06, + "loss": 1.0893, + "step": 2042 + }, + { + "epoch": 1.8657534246575342, + "grad_norm": 59.191829681396484, + "learning_rate": 9.039066463724e-06, + "loss": 1.3229, + "step": 2043 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 20.985498428344727, + "learning_rate": 9.038051750380518e-06, + "loss": 0.3804, + "step": 2044 + }, + { + "epoch": 1.8675799086757991, + "grad_norm": 54.19367599487305, + "learning_rate": 9.037037037037037e-06, + "loss": 1.9642, + "step": 2045 + }, + { + "epoch": 1.8684931506849316, + "grad_norm": 3.6913812160491943, + "learning_rate": 9.036022323693558e-06, + "loss": 0.0472, + "step": 2046 + }, + { + "epoch": 1.869406392694064, + "grad_norm": 29.66805648803711, + "learning_rate": 9.035007610350078e-06, + "loss": 0.3142, + "step": 2047 + }, + { + "epoch": 1.8703196347031965, + "grad_norm": 36.06983947753906, + "learning_rate": 9.033992897006595e-06, + "loss": 0.346, + "step": 2048 + }, + { + "epoch": 1.8712328767123287, + "grad_norm": 33.085262298583984, + "learning_rate": 9.032978183663116e-06, + "loss": 0.4368, + "step": 2049 + }, + { + "epoch": 1.8721461187214612, + "grad_norm": 4.680069923400879, + "learning_rate": 9.031963470319636e-06, + "loss": 0.0756, + "step": 2050 + }, + { + "epoch": 1.8730593607305936, + "grad_norm": 41.681602478027344, + "learning_rate": 9.030948756976155e-06, + "loss": 1.3873, + "step": 2051 + }, + { + "epoch": 1.8739726027397259, + "grad_norm": 137.02345275878906, + "learning_rate": 9.029934043632674e-06, + "loss": 2.0485, + "step": 2052 + }, + { + "epoch": 1.8748858447488583, + "grad_norm": 8.320183753967285, + "learning_rate": 9.028919330289195e-06, + "loss": 0.0789, + "step": 2053 + }, + { + "epoch": 1.8757990867579908, + "grad_norm": 42.6152458190918, + "learning_rate": 9.027904616945713e-06, + "loss": 1.1837, + "step": 2054 + }, + { + "epoch": 1.8767123287671232, + "grad_norm": 47.040382385253906, + "learning_rate": 9.026889903602232e-06, + "loss": 0.8524, + "step": 2055 + }, + { + "epoch": 1.8776255707762557, + "grad_norm": 19.774805068969727, + "learning_rate": 9.025875190258753e-06, + "loss": 0.3276, + "step": 2056 + }, + { + "epoch": 1.8785388127853881, + "grad_norm": 123.4826431274414, + "learning_rate": 9.024860476915273e-06, + "loss": 0.831, + "step": 2057 + }, + { + "epoch": 1.8794520547945206, + "grad_norm": 14.978431701660156, + "learning_rate": 9.023845763571792e-06, + "loss": 0.2766, + "step": 2058 + }, + { + "epoch": 1.880365296803653, + "grad_norm": 23.409420013427734, + "learning_rate": 9.022831050228311e-06, + "loss": 0.3624, + "step": 2059 + }, + { + "epoch": 1.8812785388127855, + "grad_norm": 39.75717544555664, + "learning_rate": 9.021816336884832e-06, + "loss": 0.9354, + "step": 2060 + }, + { + "epoch": 1.882191780821918, + "grad_norm": 36.96929168701172, + "learning_rate": 9.02080162354135e-06, + "loss": 0.4811, + "step": 2061 + }, + { + "epoch": 1.8831050228310502, + "grad_norm": 6.65207576751709, + "learning_rate": 9.01978691019787e-06, + "loss": 0.0732, + "step": 2062 + }, + { + "epoch": 1.8840182648401826, + "grad_norm": 52.666473388671875, + "learning_rate": 9.01877219685439e-06, + "loss": 0.9499, + "step": 2063 + }, + { + "epoch": 1.884931506849315, + "grad_norm": 70.7039794921875, + "learning_rate": 9.017757483510909e-06, + "loss": 1.6141, + "step": 2064 + }, + { + "epoch": 1.8858447488584473, + "grad_norm": 55.88430404663086, + "learning_rate": 9.016742770167427e-06, + "loss": 0.9634, + "step": 2065 + }, + { + "epoch": 1.8867579908675798, + "grad_norm": 34.373226165771484, + "learning_rate": 9.015728056823948e-06, + "loss": 0.5767, + "step": 2066 + }, + { + "epoch": 1.8876712328767122, + "grad_norm": 43.35969924926758, + "learning_rate": 9.014713343480469e-06, + "loss": 1.3142, + "step": 2067 + }, + { + "epoch": 1.8885844748858447, + "grad_norm": 17.524457931518555, + "learning_rate": 9.013698630136987e-06, + "loss": 0.2982, + "step": 2068 + }, + { + "epoch": 1.8894977168949771, + "grad_norm": 28.574953079223633, + "learning_rate": 9.012683916793506e-06, + "loss": 0.6096, + "step": 2069 + }, + { + "epoch": 1.8904109589041096, + "grad_norm": 72.46683502197266, + "learning_rate": 9.011669203450027e-06, + "loss": 3.3692, + "step": 2070 + }, + { + "epoch": 1.891324200913242, + "grad_norm": 38.02703857421875, + "learning_rate": 9.010654490106546e-06, + "loss": 0.8111, + "step": 2071 + }, + { + "epoch": 1.8922374429223745, + "grad_norm": 6.054198741912842, + "learning_rate": 9.009639776763064e-06, + "loss": 0.0494, + "step": 2072 + }, + { + "epoch": 1.893150684931507, + "grad_norm": 24.778661727905273, + "learning_rate": 9.008625063419585e-06, + "loss": 0.4145, + "step": 2073 + }, + { + "epoch": 1.8940639269406394, + "grad_norm": 54.945926666259766, + "learning_rate": 9.007610350076106e-06, + "loss": 1.3484, + "step": 2074 + }, + { + "epoch": 1.8949771689497716, + "grad_norm": 10.86800765991211, + "learning_rate": 9.006595636732623e-06, + "loss": 0.1485, + "step": 2075 + }, + { + "epoch": 1.895890410958904, + "grad_norm": 14.317304611206055, + "learning_rate": 9.005580923389143e-06, + "loss": 0.1705, + "step": 2076 + }, + { + "epoch": 1.8968036529680365, + "grad_norm": 31.4681396484375, + "learning_rate": 9.004566210045664e-06, + "loss": 0.4773, + "step": 2077 + }, + { + "epoch": 1.897716894977169, + "grad_norm": 9.712669372558594, + "learning_rate": 9.003551496702183e-06, + "loss": 0.1529, + "step": 2078 + }, + { + "epoch": 1.8986301369863012, + "grad_norm": 70.6971435546875, + "learning_rate": 9.002536783358701e-06, + "loss": 1.8552, + "step": 2079 + }, + { + "epoch": 1.8995433789954337, + "grad_norm": 47.28068542480469, + "learning_rate": 9.001522070015222e-06, + "loss": 1.088, + "step": 2080 + }, + { + "epoch": 1.9004566210045661, + "grad_norm": 42.041481018066406, + "learning_rate": 9.00050735667174e-06, + "loss": 0.2137, + "step": 2081 + }, + { + "epoch": 1.9013698630136986, + "grad_norm": 43.9588623046875, + "learning_rate": 8.99949264332826e-06, + "loss": 1.6487, + "step": 2082 + }, + { + "epoch": 1.902283105022831, + "grad_norm": 13.210415840148926, + "learning_rate": 8.99847792998478e-06, + "loss": 0.2004, + "step": 2083 + }, + { + "epoch": 1.9031963470319635, + "grad_norm": 1.165274977684021, + "learning_rate": 8.9974632166413e-06, + "loss": 0.0106, + "step": 2084 + }, + { + "epoch": 1.904109589041096, + "grad_norm": 7.808990001678467, + "learning_rate": 8.99644850329782e-06, + "loss": 0.0911, + "step": 2085 + }, + { + "epoch": 1.9050228310502284, + "grad_norm": 0.929504930973053, + "learning_rate": 8.995433789954338e-06, + "loss": 0.0127, + "step": 2086 + }, + { + "epoch": 1.9059360730593609, + "grad_norm": 51.62894821166992, + "learning_rate": 8.994419076610859e-06, + "loss": 2.0851, + "step": 2087 + }, + { + "epoch": 1.9068493150684933, + "grad_norm": 15.606156349182129, + "learning_rate": 8.993404363267378e-06, + "loss": 0.2462, + "step": 2088 + }, + { + "epoch": 1.9077625570776255, + "grad_norm": 69.82485961914062, + "learning_rate": 8.992389649923897e-06, + "loss": 1.932, + "step": 2089 + }, + { + "epoch": 1.908675799086758, + "grad_norm": 63.79825210571289, + "learning_rate": 8.991374936580417e-06, + "loss": 1.7144, + "step": 2090 + }, + { + "epoch": 1.9095890410958904, + "grad_norm": 35.01719665527344, + "learning_rate": 8.990360223236936e-06, + "loss": 0.8646, + "step": 2091 + }, + { + "epoch": 1.9105022831050227, + "grad_norm": 41.35883331298828, + "learning_rate": 8.989345509893455e-06, + "loss": 0.8017, + "step": 2092 + }, + { + "epoch": 1.9114155251141551, + "grad_norm": 19.447975158691406, + "learning_rate": 8.988330796549975e-06, + "loss": 0.2628, + "step": 2093 + }, + { + "epoch": 1.9123287671232876, + "grad_norm": 4.440453052520752, + "learning_rate": 8.987316083206496e-06, + "loss": 0.0629, + "step": 2094 + }, + { + "epoch": 1.91324200913242, + "grad_norm": 24.451797485351562, + "learning_rate": 8.986301369863015e-06, + "loss": 0.4252, + "step": 2095 + }, + { + "epoch": 1.9141552511415525, + "grad_norm": 33.458126068115234, + "learning_rate": 8.985286656519534e-06, + "loss": 0.5512, + "step": 2096 + }, + { + "epoch": 1.915068493150685, + "grad_norm": 110.45926666259766, + "learning_rate": 8.984271943176054e-06, + "loss": 0.6218, + "step": 2097 + }, + { + "epoch": 1.9159817351598174, + "grad_norm": 9.672475814819336, + "learning_rate": 8.983257229832573e-06, + "loss": 0.084, + "step": 2098 + }, + { + "epoch": 1.9168949771689499, + "grad_norm": 37.185794830322266, + "learning_rate": 8.982242516489092e-06, + "loss": 1.004, + "step": 2099 + }, + { + "epoch": 1.9178082191780823, + "grad_norm": 29.3632869720459, + "learning_rate": 8.981227803145612e-06, + "loss": 0.5699, + "step": 2100 + }, + { + "epoch": 1.9187214611872148, + "grad_norm": 50.07635498046875, + "learning_rate": 8.980213089802131e-06, + "loss": 1.0903, + "step": 2101 + }, + { + "epoch": 1.919634703196347, + "grad_norm": 58.36561584472656, + "learning_rate": 8.979198376458652e-06, + "loss": 1.7723, + "step": 2102 + }, + { + "epoch": 1.9205479452054794, + "grad_norm": 40.47492218017578, + "learning_rate": 8.97818366311517e-06, + "loss": 0.7659, + "step": 2103 + }, + { + "epoch": 1.921461187214612, + "grad_norm": 2.4550492763519287, + "learning_rate": 8.977168949771691e-06, + "loss": 0.0249, + "step": 2104 + }, + { + "epoch": 1.9223744292237441, + "grad_norm": 29.055448532104492, + "learning_rate": 8.97615423642821e-06, + "loss": 0.4271, + "step": 2105 + }, + { + "epoch": 1.9232876712328766, + "grad_norm": 51.16083908081055, + "learning_rate": 8.975139523084729e-06, + "loss": 0.5123, + "step": 2106 + }, + { + "epoch": 1.924200913242009, + "grad_norm": 48.239707946777344, + "learning_rate": 8.97412480974125e-06, + "loss": 1.1029, + "step": 2107 + }, + { + "epoch": 1.9251141552511415, + "grad_norm": 42.04818344116211, + "learning_rate": 8.973110096397768e-06, + "loss": 1.4491, + "step": 2108 + }, + { + "epoch": 1.926027397260274, + "grad_norm": 28.59501838684082, + "learning_rate": 8.972095383054287e-06, + "loss": 0.7901, + "step": 2109 + }, + { + "epoch": 1.9269406392694064, + "grad_norm": 6.921403408050537, + "learning_rate": 8.971080669710808e-06, + "loss": 0.0505, + "step": 2110 + }, + { + "epoch": 1.9278538812785389, + "grad_norm": 10.205168724060059, + "learning_rate": 8.970065956367326e-06, + "loss": 0.118, + "step": 2111 + }, + { + "epoch": 1.9287671232876713, + "grad_norm": 21.482807159423828, + "learning_rate": 8.969051243023847e-06, + "loss": 0.3754, + "step": 2112 + }, + { + "epoch": 1.9296803652968038, + "grad_norm": 22.863189697265625, + "learning_rate": 8.968036529680366e-06, + "loss": 0.3211, + "step": 2113 + }, + { + "epoch": 1.9305936073059362, + "grad_norm": 44.54662322998047, + "learning_rate": 8.967021816336886e-06, + "loss": 0.693, + "step": 2114 + }, + { + "epoch": 1.9315068493150684, + "grad_norm": 40.12987518310547, + "learning_rate": 8.966007102993405e-06, + "loss": 0.4465, + "step": 2115 + }, + { + "epoch": 1.932420091324201, + "grad_norm": 9.40845775604248, + "learning_rate": 8.964992389649924e-06, + "loss": 0.1575, + "step": 2116 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 42.69550323486328, + "learning_rate": 8.963977676306445e-06, + "loss": 1.0768, + "step": 2117 + }, + { + "epoch": 1.9342465753424658, + "grad_norm": 23.66881561279297, + "learning_rate": 8.962962962962963e-06, + "loss": 0.3359, + "step": 2118 + }, + { + "epoch": 1.935159817351598, + "grad_norm": 1.9381041526794434, + "learning_rate": 8.961948249619484e-06, + "loss": 0.024, + "step": 2119 + }, + { + "epoch": 1.9360730593607305, + "grad_norm": 6.825874328613281, + "learning_rate": 8.960933536276003e-06, + "loss": 0.0793, + "step": 2120 + }, + { + "epoch": 1.936986301369863, + "grad_norm": 3.840965986251831, + "learning_rate": 8.959918822932522e-06, + "loss": 0.0445, + "step": 2121 + }, + { + "epoch": 1.9378995433789954, + "grad_norm": 2.266350269317627, + "learning_rate": 8.958904109589042e-06, + "loss": 0.026, + "step": 2122 + }, + { + "epoch": 1.9388127853881278, + "grad_norm": 1.3120416402816772, + "learning_rate": 8.957889396245561e-06, + "loss": 0.0111, + "step": 2123 + }, + { + "epoch": 1.9397260273972603, + "grad_norm": 53.53962326049805, + "learning_rate": 8.956874682902082e-06, + "loss": 1.1581, + "step": 2124 + }, + { + "epoch": 1.9406392694063928, + "grad_norm": 4.481299877166748, + "learning_rate": 8.9558599695586e-06, + "loss": 0.0682, + "step": 2125 + }, + { + "epoch": 1.9415525114155252, + "grad_norm": 32.83872985839844, + "learning_rate": 8.95484525621512e-06, + "loss": 0.3854, + "step": 2126 + }, + { + "epoch": 1.9424657534246577, + "grad_norm": 60.76952362060547, + "learning_rate": 8.95383054287164e-06, + "loss": 0.9885, + "step": 2127 + }, + { + "epoch": 1.9433789954337901, + "grad_norm": 21.06499671936035, + "learning_rate": 8.952815829528159e-06, + "loss": 0.3698, + "step": 2128 + }, + { + "epoch": 1.9442922374429223, + "grad_norm": 4.1351752281188965, + "learning_rate": 8.95180111618468e-06, + "loss": 0.0433, + "step": 2129 + }, + { + "epoch": 1.9452054794520548, + "grad_norm": 18.273405075073242, + "learning_rate": 8.950786402841198e-06, + "loss": 0.3646, + "step": 2130 + }, + { + "epoch": 1.9461187214611873, + "grad_norm": 55.38542938232422, + "learning_rate": 8.949771689497717e-06, + "loss": 0.8881, + "step": 2131 + }, + { + "epoch": 1.9470319634703195, + "grad_norm": 27.814796447753906, + "learning_rate": 8.948756976154237e-06, + "loss": 0.6249, + "step": 2132 + }, + { + "epoch": 1.947945205479452, + "grad_norm": 7.2405924797058105, + "learning_rate": 8.947742262810756e-06, + "loss": 0.0935, + "step": 2133 + }, + { + "epoch": 1.9488584474885844, + "grad_norm": 4.455573081970215, + "learning_rate": 8.946727549467277e-06, + "loss": 0.0739, + "step": 2134 + }, + { + "epoch": 1.9497716894977168, + "grad_norm": 60.802913665771484, + "learning_rate": 8.945712836123796e-06, + "loss": 1.0727, + "step": 2135 + }, + { + "epoch": 1.9506849315068493, + "grad_norm": 4.6139421463012695, + "learning_rate": 8.944698122780315e-06, + "loss": 0.0485, + "step": 2136 + }, + { + "epoch": 1.9515981735159817, + "grad_norm": 26.4127197265625, + "learning_rate": 8.943683409436835e-06, + "loss": 0.4805, + "step": 2137 + }, + { + "epoch": 1.9525114155251142, + "grad_norm": 31.85651969909668, + "learning_rate": 8.942668696093354e-06, + "loss": 0.7371, + "step": 2138 + }, + { + "epoch": 1.9534246575342467, + "grad_norm": 4.407130241394043, + "learning_rate": 8.941653982749874e-06, + "loss": 0.0705, + "step": 2139 + }, + { + "epoch": 1.954337899543379, + "grad_norm": 8.277033805847168, + "learning_rate": 8.940639269406393e-06, + "loss": 0.0793, + "step": 2140 + }, + { + "epoch": 1.9552511415525116, + "grad_norm": 5.344384670257568, + "learning_rate": 8.939624556062912e-06, + "loss": 0.0786, + "step": 2141 + }, + { + "epoch": 1.9561643835616438, + "grad_norm": 15.2696533203125, + "learning_rate": 8.938609842719433e-06, + "loss": 0.2231, + "step": 2142 + }, + { + "epoch": 1.9570776255707762, + "grad_norm": 32.44679260253906, + "learning_rate": 8.937595129375952e-06, + "loss": 0.4112, + "step": 2143 + }, + { + "epoch": 1.9579908675799087, + "grad_norm": 50.55205154418945, + "learning_rate": 8.936580416032472e-06, + "loss": 1.0925, + "step": 2144 + }, + { + "epoch": 1.958904109589041, + "grad_norm": 42.377357482910156, + "learning_rate": 8.935565702688991e-06, + "loss": 0.6436, + "step": 2145 + }, + { + "epoch": 1.9598173515981734, + "grad_norm": 26.620758056640625, + "learning_rate": 8.934550989345511e-06, + "loss": 0.3409, + "step": 2146 + }, + { + "epoch": 1.9607305936073058, + "grad_norm": 51.99641418457031, + "learning_rate": 8.93353627600203e-06, + "loss": 1.075, + "step": 2147 + }, + { + "epoch": 1.9616438356164383, + "grad_norm": 45.58644104003906, + "learning_rate": 8.93252156265855e-06, + "loss": 0.7528, + "step": 2148 + }, + { + "epoch": 1.9625570776255707, + "grad_norm": 88.09746551513672, + "learning_rate": 8.93150684931507e-06, + "loss": 2.8063, + "step": 2149 + }, + { + "epoch": 1.9634703196347032, + "grad_norm": 0.29491081833839417, + "learning_rate": 8.930492135971589e-06, + "loss": 0.0029, + "step": 2150 + }, + { + "epoch": 1.9643835616438357, + "grad_norm": 1.9316858053207397, + "learning_rate": 8.929477422628107e-06, + "loss": 0.0213, + "step": 2151 + }, + { + "epoch": 1.965296803652968, + "grad_norm": 56.81181716918945, + "learning_rate": 8.928462709284628e-06, + "loss": 1.4913, + "step": 2152 + }, + { + "epoch": 1.9662100456621006, + "grad_norm": 23.609575271606445, + "learning_rate": 8.927447995941147e-06, + "loss": 0.3465, + "step": 2153 + }, + { + "epoch": 1.967123287671233, + "grad_norm": 22.45415496826172, + "learning_rate": 8.926433282597667e-06, + "loss": 0.2475, + "step": 2154 + }, + { + "epoch": 1.9680365296803652, + "grad_norm": 3.2570364475250244, + "learning_rate": 8.925418569254186e-06, + "loss": 0.0399, + "step": 2155 + }, + { + "epoch": 1.9689497716894977, + "grad_norm": 1.7164921760559082, + "learning_rate": 8.924403855910707e-06, + "loss": 0.022, + "step": 2156 + }, + { + "epoch": 1.9698630136986301, + "grad_norm": 39.47987365722656, + "learning_rate": 8.923389142567226e-06, + "loss": 0.4952, + "step": 2157 + }, + { + "epoch": 1.9707762557077626, + "grad_norm": 11.822546005249023, + "learning_rate": 8.922374429223744e-06, + "loss": 0.1359, + "step": 2158 + }, + { + "epoch": 1.9716894977168948, + "grad_norm": 1.2347729206085205, + "learning_rate": 8.921359715880265e-06, + "loss": 0.012, + "step": 2159 + }, + { + "epoch": 1.9726027397260273, + "grad_norm": 28.711843490600586, + "learning_rate": 8.920345002536784e-06, + "loss": 0.213, + "step": 2160 + }, + { + "epoch": 1.9735159817351597, + "grad_norm": 50.72312545776367, + "learning_rate": 8.919330289193303e-06, + "loss": 1.3125, + "step": 2161 + }, + { + "epoch": 1.9744292237442922, + "grad_norm": 10.346537590026855, + "learning_rate": 8.918315575849823e-06, + "loss": 0.1152, + "step": 2162 + }, + { + "epoch": 1.9753424657534246, + "grad_norm": 9.035018920898438, + "learning_rate": 8.917300862506344e-06, + "loss": 0.1684, + "step": 2163 + }, + { + "epoch": 1.976255707762557, + "grad_norm": 9.06872272491455, + "learning_rate": 8.916286149162863e-06, + "loss": 0.1077, + "step": 2164 + }, + { + "epoch": 1.9771689497716896, + "grad_norm": 0.4022529721260071, + "learning_rate": 8.915271435819381e-06, + "loss": 0.0043, + "step": 2165 + }, + { + "epoch": 1.978082191780822, + "grad_norm": 53.73905563354492, + "learning_rate": 8.914256722475902e-06, + "loss": 1.272, + "step": 2166 + }, + { + "epoch": 1.9789954337899545, + "grad_norm": 4.98986291885376, + "learning_rate": 8.91324200913242e-06, + "loss": 0.0618, + "step": 2167 + }, + { + "epoch": 1.979908675799087, + "grad_norm": 48.99872970581055, + "learning_rate": 8.91222729578894e-06, + "loss": 0.9604, + "step": 2168 + }, + { + "epoch": 1.9808219178082191, + "grad_norm": 20.25933837890625, + "learning_rate": 8.91121258244546e-06, + "loss": 0.2615, + "step": 2169 + }, + { + "epoch": 1.9817351598173516, + "grad_norm": 26.927989959716797, + "learning_rate": 8.910197869101979e-06, + "loss": 0.2806, + "step": 2170 + }, + { + "epoch": 1.982648401826484, + "grad_norm": 62.73289108276367, + "learning_rate": 8.909183155758498e-06, + "loss": 1.9353, + "step": 2171 + }, + { + "epoch": 1.9835616438356163, + "grad_norm": 46.96379470825195, + "learning_rate": 8.908168442415018e-06, + "loss": 0.9189, + "step": 2172 + }, + { + "epoch": 1.9844748858447487, + "grad_norm": 8.685837745666504, + "learning_rate": 8.907153729071539e-06, + "loss": 0.0762, + "step": 2173 + }, + { + "epoch": 1.9853881278538812, + "grad_norm": 43.844581604003906, + "learning_rate": 8.906139015728058e-06, + "loss": 0.7823, + "step": 2174 + }, + { + "epoch": 1.9863013698630136, + "grad_norm": 1.2047873735427856, + "learning_rate": 8.905124302384577e-06, + "loss": 0.0155, + "step": 2175 + }, + { + "epoch": 1.987214611872146, + "grad_norm": 48.845672607421875, + "learning_rate": 8.904109589041097e-06, + "loss": 0.8899, + "step": 2176 + }, + { + "epoch": 1.9881278538812786, + "grad_norm": 1.179038643836975, + "learning_rate": 8.903094875697616e-06, + "loss": 0.0177, + "step": 2177 + }, + { + "epoch": 1.989041095890411, + "grad_norm": 70.23085021972656, + "learning_rate": 8.902080162354135e-06, + "loss": 2.1398, + "step": 2178 + }, + { + "epoch": 1.9899543378995435, + "grad_norm": 46.47139358520508, + "learning_rate": 8.901065449010655e-06, + "loss": 1.0614, + "step": 2179 + }, + { + "epoch": 1.990867579908676, + "grad_norm": 99.44682312011719, + "learning_rate": 8.900050735667174e-06, + "loss": 2.7078, + "step": 2180 + }, + { + "epoch": 1.9917808219178084, + "grad_norm": 56.76755905151367, + "learning_rate": 8.899036022323693e-06, + "loss": 1.3175, + "step": 2181 + }, + { + "epoch": 1.9926940639269406, + "grad_norm": 28.05853843688965, + "learning_rate": 8.898021308980214e-06, + "loss": 0.6848, + "step": 2182 + }, + { + "epoch": 1.993607305936073, + "grad_norm": 20.71917724609375, + "learning_rate": 8.897006595636734e-06, + "loss": 0.2504, + "step": 2183 + }, + { + "epoch": 1.9945205479452055, + "grad_norm": 2.4107418060302734, + "learning_rate": 8.895991882293253e-06, + "loss": 0.0133, + "step": 2184 + }, + { + "epoch": 1.9954337899543377, + "grad_norm": 9.706146240234375, + "learning_rate": 8.894977168949772e-06, + "loss": 0.1058, + "step": 2185 + }, + { + "epoch": 1.9963470319634702, + "grad_norm": 7.268063068389893, + "learning_rate": 8.893962455606292e-06, + "loss": 0.0877, + "step": 2186 + }, + { + "epoch": 1.9972602739726026, + "grad_norm": 66.63713836669922, + "learning_rate": 8.892947742262811e-06, + "loss": 0.8643, + "step": 2187 + }, + { + "epoch": 1.998173515981735, + "grad_norm": 55.78333282470703, + "learning_rate": 8.89193302891933e-06, + "loss": 0.4041, + "step": 2188 + }, + { + "epoch": 1.9990867579908675, + "grad_norm": 41.5733642578125, + "learning_rate": 8.89091831557585e-06, + "loss": 1.2627, + "step": 2189 + }, + { + "epoch": 2.0, + "grad_norm": 8.025931358337402, + "learning_rate": 8.889903602232371e-06, + "loss": 0.0667, + "step": 2190 + }, + { + "epoch": 2.0009132420091325, + "grad_norm": 45.63469314575195, + "learning_rate": 8.888888888888888e-06, + "loss": 0.9003, + "step": 2191 + }, + { + "epoch": 2.001826484018265, + "grad_norm": 8.092175483703613, + "learning_rate": 8.887874175545409e-06, + "loss": 0.1372, + "step": 2192 + }, + { + "epoch": 2.0027397260273974, + "grad_norm": 7.72491979598999, + "learning_rate": 8.88685946220193e-06, + "loss": 0.0844, + "step": 2193 + }, + { + "epoch": 2.00365296803653, + "grad_norm": 31.150463104248047, + "learning_rate": 8.885844748858448e-06, + "loss": 0.3022, + "step": 2194 + }, + { + "epoch": 2.0045662100456623, + "grad_norm": 59.10044860839844, + "learning_rate": 8.884830035514967e-06, + "loss": 0.7436, + "step": 2195 + }, + { + "epoch": 2.0054794520547947, + "grad_norm": 35.776466369628906, + "learning_rate": 8.883815322171488e-06, + "loss": 0.548, + "step": 2196 + }, + { + "epoch": 2.0063926940639267, + "grad_norm": 25.448070526123047, + "learning_rate": 8.882800608828006e-06, + "loss": 0.3331, + "step": 2197 + }, + { + "epoch": 2.007305936073059, + "grad_norm": 7.6725592613220215, + "learning_rate": 8.881785895484525e-06, + "loss": 0.0818, + "step": 2198 + }, + { + "epoch": 2.0082191780821916, + "grad_norm": 18.666629791259766, + "learning_rate": 8.880771182141046e-06, + "loss": 0.2901, + "step": 2199 + }, + { + "epoch": 2.009132420091324, + "grad_norm": 1.0407016277313232, + "learning_rate": 8.879756468797566e-06, + "loss": 0.0103, + "step": 2200 + }, + { + "epoch": 2.0100456621004565, + "grad_norm": 11.550631523132324, + "learning_rate": 8.878741755454085e-06, + "loss": 0.1344, + "step": 2201 + }, + { + "epoch": 2.010958904109589, + "grad_norm": 0.8595427870750427, + "learning_rate": 8.877727042110604e-06, + "loss": 0.0091, + "step": 2202 + }, + { + "epoch": 2.0118721461187214, + "grad_norm": 2.4590206146240234, + "learning_rate": 8.876712328767125e-06, + "loss": 0.0139, + "step": 2203 + }, + { + "epoch": 2.012785388127854, + "grad_norm": 7.625921726226807, + "learning_rate": 8.875697615423643e-06, + "loss": 0.1002, + "step": 2204 + }, + { + "epoch": 2.0136986301369864, + "grad_norm": 32.457618713378906, + "learning_rate": 8.874682902080162e-06, + "loss": 0.6248, + "step": 2205 + }, + { + "epoch": 2.014611872146119, + "grad_norm": 4.914506912231445, + "learning_rate": 8.873668188736683e-06, + "loss": 0.0684, + "step": 2206 + }, + { + "epoch": 2.0155251141552513, + "grad_norm": 24.09096908569336, + "learning_rate": 8.872653475393203e-06, + "loss": 0.3371, + "step": 2207 + }, + { + "epoch": 2.0164383561643837, + "grad_norm": 50.653358459472656, + "learning_rate": 8.87163876204972e-06, + "loss": 0.7019, + "step": 2208 + }, + { + "epoch": 2.017351598173516, + "grad_norm": 17.300846099853516, + "learning_rate": 8.870624048706241e-06, + "loss": 0.2629, + "step": 2209 + }, + { + "epoch": 2.018264840182648, + "grad_norm": 3.9494071006774902, + "learning_rate": 8.869609335362762e-06, + "loss": 0.0384, + "step": 2210 + }, + { + "epoch": 2.0191780821917806, + "grad_norm": 5.754113674163818, + "learning_rate": 8.86859462201928e-06, + "loss": 0.0542, + "step": 2211 + }, + { + "epoch": 2.020091324200913, + "grad_norm": 17.592899322509766, + "learning_rate": 8.8675799086758e-06, + "loss": 0.1851, + "step": 2212 + }, + { + "epoch": 2.0210045662100455, + "grad_norm": 7.555271148681641, + "learning_rate": 8.86656519533232e-06, + "loss": 0.0795, + "step": 2213 + }, + { + "epoch": 2.021917808219178, + "grad_norm": 16.812400817871094, + "learning_rate": 8.865550481988839e-06, + "loss": 0.1145, + "step": 2214 + }, + { + "epoch": 2.0228310502283104, + "grad_norm": 82.88603973388672, + "learning_rate": 8.864535768645358e-06, + "loss": 4.4075, + "step": 2215 + }, + { + "epoch": 2.023744292237443, + "grad_norm": 110.70137786865234, + "learning_rate": 8.863521055301878e-06, + "loss": 2.4579, + "step": 2216 + }, + { + "epoch": 2.0246575342465754, + "grad_norm": 88.93843078613281, + "learning_rate": 8.862506341958399e-06, + "loss": 3.8932, + "step": 2217 + }, + { + "epoch": 2.025570776255708, + "grad_norm": 0.9171050786972046, + "learning_rate": 8.861491628614917e-06, + "loss": 0.0086, + "step": 2218 + }, + { + "epoch": 2.0264840182648403, + "grad_norm": 3.7509026527404785, + "learning_rate": 8.860476915271436e-06, + "loss": 0.0375, + "step": 2219 + }, + { + "epoch": 2.0273972602739727, + "grad_norm": 2.5152952671051025, + "learning_rate": 8.859462201927957e-06, + "loss": 0.0222, + "step": 2220 + }, + { + "epoch": 2.028310502283105, + "grad_norm": 71.3983383178711, + "learning_rate": 8.858447488584476e-06, + "loss": 1.3413, + "step": 2221 + }, + { + "epoch": 2.0292237442922376, + "grad_norm": 0.9106821417808533, + "learning_rate": 8.857432775240995e-06, + "loss": 0.0092, + "step": 2222 + }, + { + "epoch": 2.03013698630137, + "grad_norm": 64.71249389648438, + "learning_rate": 8.856418061897515e-06, + "loss": 0.4888, + "step": 2223 + }, + { + "epoch": 2.031050228310502, + "grad_norm": 13.037842750549316, + "learning_rate": 8.855403348554034e-06, + "loss": 0.1522, + "step": 2224 + }, + { + "epoch": 2.0319634703196345, + "grad_norm": 7.658244609832764, + "learning_rate": 8.854388635210553e-06, + "loss": 0.095, + "step": 2225 + }, + { + "epoch": 2.032876712328767, + "grad_norm": 15.299314498901367, + "learning_rate": 8.853373921867073e-06, + "loss": 0.2031, + "step": 2226 + }, + { + "epoch": 2.0337899543378994, + "grad_norm": 2.5588245391845703, + "learning_rate": 8.852359208523594e-06, + "loss": 0.0294, + "step": 2227 + }, + { + "epoch": 2.034703196347032, + "grad_norm": 6.0897321701049805, + "learning_rate": 8.851344495180113e-06, + "loss": 0.0749, + "step": 2228 + }, + { + "epoch": 2.0356164383561643, + "grad_norm": 17.339998245239258, + "learning_rate": 8.850329781836632e-06, + "loss": 0.3632, + "step": 2229 + }, + { + "epoch": 2.036529680365297, + "grad_norm": 11.612112998962402, + "learning_rate": 8.849315068493152e-06, + "loss": 0.1646, + "step": 2230 + }, + { + "epoch": 2.0374429223744293, + "grad_norm": 23.702648162841797, + "learning_rate": 8.848300355149671e-06, + "loss": 0.3979, + "step": 2231 + }, + { + "epoch": 2.0383561643835617, + "grad_norm": 20.77340316772461, + "learning_rate": 8.84728564180619e-06, + "loss": 0.3555, + "step": 2232 + }, + { + "epoch": 2.039269406392694, + "grad_norm": 23.478239059448242, + "learning_rate": 8.84627092846271e-06, + "loss": 0.3638, + "step": 2233 + }, + { + "epoch": 2.0401826484018266, + "grad_norm": 16.110736846923828, + "learning_rate": 8.84525621511923e-06, + "loss": 0.2488, + "step": 2234 + }, + { + "epoch": 2.041095890410959, + "grad_norm": 18.49338150024414, + "learning_rate": 8.844241501775748e-06, + "loss": 0.3211, + "step": 2235 + }, + { + "epoch": 2.0420091324200915, + "grad_norm": 31.17902374267578, + "learning_rate": 8.843226788432269e-06, + "loss": 0.3316, + "step": 2236 + }, + { + "epoch": 2.0429223744292235, + "grad_norm": 5.485903739929199, + "learning_rate": 8.842212075088789e-06, + "loss": 0.0667, + "step": 2237 + }, + { + "epoch": 2.043835616438356, + "grad_norm": 15.04597282409668, + "learning_rate": 8.841197361745308e-06, + "loss": 0.2098, + "step": 2238 + }, + { + "epoch": 2.0447488584474884, + "grad_norm": 31.933670043945312, + "learning_rate": 8.840182648401827e-06, + "loss": 0.724, + "step": 2239 + }, + { + "epoch": 2.045662100456621, + "grad_norm": 7.834482669830322, + "learning_rate": 8.839167935058347e-06, + "loss": 0.1046, + "step": 2240 + }, + { + "epoch": 2.0465753424657533, + "grad_norm": 1.1873388290405273, + "learning_rate": 8.838153221714866e-06, + "loss": 0.0107, + "step": 2241 + }, + { + "epoch": 2.047488584474886, + "grad_norm": 1.3374353647232056, + "learning_rate": 8.837138508371385e-06, + "loss": 0.0181, + "step": 2242 + }, + { + "epoch": 2.0484018264840183, + "grad_norm": 8.083165168762207, + "learning_rate": 8.836123795027906e-06, + "loss": 0.0941, + "step": 2243 + }, + { + "epoch": 2.0493150684931507, + "grad_norm": 72.96696472167969, + "learning_rate": 8.835109081684426e-06, + "loss": 1.1676, + "step": 2244 + }, + { + "epoch": 2.050228310502283, + "grad_norm": 20.56219482421875, + "learning_rate": 8.834094368340945e-06, + "loss": 0.3013, + "step": 2245 + }, + { + "epoch": 2.0511415525114156, + "grad_norm": 3.789736032485962, + "learning_rate": 8.833079654997464e-06, + "loss": 0.0391, + "step": 2246 + }, + { + "epoch": 2.052054794520548, + "grad_norm": 53.66828918457031, + "learning_rate": 8.832064941653984e-06, + "loss": 1.4982, + "step": 2247 + }, + { + "epoch": 2.0529680365296805, + "grad_norm": 13.110556602478027, + "learning_rate": 8.831050228310503e-06, + "loss": 0.122, + "step": 2248 + }, + { + "epoch": 2.053881278538813, + "grad_norm": 11.473655700683594, + "learning_rate": 8.830035514967022e-06, + "loss": 0.1141, + "step": 2249 + }, + { + "epoch": 2.0547945205479454, + "grad_norm": 20.443531036376953, + "learning_rate": 8.829020801623543e-06, + "loss": 0.3316, + "step": 2250 + }, + { + "epoch": 2.0557077625570774, + "grad_norm": 34.195587158203125, + "learning_rate": 8.828006088280061e-06, + "loss": 0.5567, + "step": 2251 + }, + { + "epoch": 2.05662100456621, + "grad_norm": 4.849049091339111, + "learning_rate": 8.82699137493658e-06, + "loss": 0.0394, + "step": 2252 + }, + { + "epoch": 2.0575342465753423, + "grad_norm": 23.617429733276367, + "learning_rate": 8.8259766615931e-06, + "loss": 0.3491, + "step": 2253 + }, + { + "epoch": 2.058447488584475, + "grad_norm": 0.8062554597854614, + "learning_rate": 8.824961948249621e-06, + "loss": 0.0085, + "step": 2254 + }, + { + "epoch": 2.0593607305936072, + "grad_norm": 43.33163070678711, + "learning_rate": 8.82394723490614e-06, + "loss": 0.9742, + "step": 2255 + }, + { + "epoch": 2.0602739726027397, + "grad_norm": 8.575739860534668, + "learning_rate": 8.822932521562659e-06, + "loss": 0.1136, + "step": 2256 + }, + { + "epoch": 2.061187214611872, + "grad_norm": 8.394306182861328, + "learning_rate": 8.82191780821918e-06, + "loss": 0.119, + "step": 2257 + }, + { + "epoch": 2.0621004566210046, + "grad_norm": 29.557575225830078, + "learning_rate": 8.820903094875698e-06, + "loss": 0.4562, + "step": 2258 + }, + { + "epoch": 2.063013698630137, + "grad_norm": 82.86088562011719, + "learning_rate": 8.819888381532217e-06, + "loss": 6.1609, + "step": 2259 + }, + { + "epoch": 2.0639269406392695, + "grad_norm": 23.313156127929688, + "learning_rate": 8.818873668188738e-06, + "loss": 0.1484, + "step": 2260 + }, + { + "epoch": 2.064840182648402, + "grad_norm": 3.65464186668396, + "learning_rate": 8.817858954845257e-06, + "loss": 0.055, + "step": 2261 + }, + { + "epoch": 2.0657534246575344, + "grad_norm": 2.975421905517578, + "learning_rate": 8.816844241501777e-06, + "loss": 0.0275, + "step": 2262 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 8.197759628295898, + "learning_rate": 8.815829528158296e-06, + "loss": 0.1187, + "step": 2263 + }, + { + "epoch": 2.067579908675799, + "grad_norm": 22.734737396240234, + "learning_rate": 8.814814814814817e-06, + "loss": 0.348, + "step": 2264 + }, + { + "epoch": 2.0684931506849313, + "grad_norm": 7.389542579650879, + "learning_rate": 8.813800101471335e-06, + "loss": 0.0812, + "step": 2265 + }, + { + "epoch": 2.069406392694064, + "grad_norm": 15.192793846130371, + "learning_rate": 8.812785388127854e-06, + "loss": 0.2075, + "step": 2266 + }, + { + "epoch": 2.0703196347031962, + "grad_norm": 5.353795528411865, + "learning_rate": 8.811770674784375e-06, + "loss": 0.0686, + "step": 2267 + }, + { + "epoch": 2.0712328767123287, + "grad_norm": 15.80949878692627, + "learning_rate": 8.810755961440894e-06, + "loss": 0.2895, + "step": 2268 + }, + { + "epoch": 2.072146118721461, + "grad_norm": 0.6923192143440247, + "learning_rate": 8.809741248097412e-06, + "loss": 0.007, + "step": 2269 + }, + { + "epoch": 2.0730593607305936, + "grad_norm": 47.099082946777344, + "learning_rate": 8.808726534753933e-06, + "loss": 0.6433, + "step": 2270 + }, + { + "epoch": 2.073972602739726, + "grad_norm": 5.944159507751465, + "learning_rate": 8.807711821410452e-06, + "loss": 0.0618, + "step": 2271 + }, + { + "epoch": 2.0748858447488585, + "grad_norm": 2.2993931770324707, + "learning_rate": 8.806697108066972e-06, + "loss": 0.0219, + "step": 2272 + }, + { + "epoch": 2.075799086757991, + "grad_norm": 2.3433456420898438, + "learning_rate": 8.805682394723491e-06, + "loss": 0.0404, + "step": 2273 + }, + { + "epoch": 2.0767123287671234, + "grad_norm": 3.2806525230407715, + "learning_rate": 8.804667681380012e-06, + "loss": 0.0406, + "step": 2274 + }, + { + "epoch": 2.077625570776256, + "grad_norm": 27.95182228088379, + "learning_rate": 8.80365296803653e-06, + "loss": 0.4801, + "step": 2275 + }, + { + "epoch": 2.0785388127853883, + "grad_norm": 84.80361938476562, + "learning_rate": 8.80263825469305e-06, + "loss": 5.137, + "step": 2276 + }, + { + "epoch": 2.0794520547945208, + "grad_norm": 36.49735641479492, + "learning_rate": 8.80162354134957e-06, + "loss": 1.0121, + "step": 2277 + }, + { + "epoch": 2.080365296803653, + "grad_norm": 47.45792007446289, + "learning_rate": 8.800608828006089e-06, + "loss": 0.9483, + "step": 2278 + }, + { + "epoch": 2.0812785388127852, + "grad_norm": 45.64060974121094, + "learning_rate": 8.799594114662608e-06, + "loss": 0.6528, + "step": 2279 + }, + { + "epoch": 2.0821917808219177, + "grad_norm": 28.979110717773438, + "learning_rate": 8.798579401319128e-06, + "loss": 0.3313, + "step": 2280 + }, + { + "epoch": 2.08310502283105, + "grad_norm": 31.54657554626465, + "learning_rate": 8.797564687975647e-06, + "loss": 0.8262, + "step": 2281 + }, + { + "epoch": 2.0840182648401826, + "grad_norm": 22.40372085571289, + "learning_rate": 8.796549974632168e-06, + "loss": 0.2795, + "step": 2282 + }, + { + "epoch": 2.084931506849315, + "grad_norm": 28.319671630859375, + "learning_rate": 8.795535261288686e-06, + "loss": 0.3116, + "step": 2283 + }, + { + "epoch": 2.0858447488584475, + "grad_norm": 2.6901209354400635, + "learning_rate": 8.794520547945207e-06, + "loss": 0.0282, + "step": 2284 + }, + { + "epoch": 2.08675799086758, + "grad_norm": 7.214332580566406, + "learning_rate": 8.793505834601726e-06, + "loss": 0.052, + "step": 2285 + }, + { + "epoch": 2.0876712328767124, + "grad_norm": 20.22762107849121, + "learning_rate": 8.792491121258245e-06, + "loss": 0.2691, + "step": 2286 + }, + { + "epoch": 2.088584474885845, + "grad_norm": 48.72825622558594, + "learning_rate": 8.791476407914765e-06, + "loss": 1.4279, + "step": 2287 + }, + { + "epoch": 2.0894977168949773, + "grad_norm": 35.47091293334961, + "learning_rate": 8.790461694571284e-06, + "loss": 0.7009, + "step": 2288 + }, + { + "epoch": 2.0904109589041098, + "grad_norm": 34.72886657714844, + "learning_rate": 8.789446981227805e-06, + "loss": 0.2645, + "step": 2289 + }, + { + "epoch": 2.091324200913242, + "grad_norm": 31.303110122680664, + "learning_rate": 8.788432267884323e-06, + "loss": 0.6412, + "step": 2290 + }, + { + "epoch": 2.0922374429223742, + "grad_norm": 0.7096964120864868, + "learning_rate": 8.787417554540842e-06, + "loss": 0.0065, + "step": 2291 + }, + { + "epoch": 2.0931506849315067, + "grad_norm": 38.92616271972656, + "learning_rate": 8.786402841197363e-06, + "loss": 0.8394, + "step": 2292 + }, + { + "epoch": 2.094063926940639, + "grad_norm": 37.57209396362305, + "learning_rate": 8.785388127853882e-06, + "loss": 0.6199, + "step": 2293 + }, + { + "epoch": 2.0949771689497716, + "grad_norm": 18.873756408691406, + "learning_rate": 8.784373414510402e-06, + "loss": 0.1742, + "step": 2294 + }, + { + "epoch": 2.095890410958904, + "grad_norm": 21.88759994506836, + "learning_rate": 8.783358701166921e-06, + "loss": 0.265, + "step": 2295 + }, + { + "epoch": 2.0968036529680365, + "grad_norm": 51.55546951293945, + "learning_rate": 8.78234398782344e-06, + "loss": 1.3756, + "step": 2296 + }, + { + "epoch": 2.097716894977169, + "grad_norm": 4.63505220413208, + "learning_rate": 8.78132927447996e-06, + "loss": 0.0499, + "step": 2297 + }, + { + "epoch": 2.0986301369863014, + "grad_norm": 5.776541709899902, + "learning_rate": 8.78031456113648e-06, + "loss": 0.0668, + "step": 2298 + }, + { + "epoch": 2.099543378995434, + "grad_norm": 4.5936479568481445, + "learning_rate": 8.779299847793e-06, + "loss": 0.0594, + "step": 2299 + }, + { + "epoch": 2.1004566210045663, + "grad_norm": 12.244194984436035, + "learning_rate": 8.778285134449519e-06, + "loss": 0.1292, + "step": 2300 + }, + { + "epoch": 2.1013698630136988, + "grad_norm": 69.3301010131836, + "learning_rate": 8.777270421106037e-06, + "loss": 1.6644, + "step": 2301 + }, + { + "epoch": 2.1022831050228312, + "grad_norm": 37.00226593017578, + "learning_rate": 8.776255707762558e-06, + "loss": 0.8655, + "step": 2302 + }, + { + "epoch": 2.1031963470319637, + "grad_norm": 1.9968937635421753, + "learning_rate": 8.775240994419077e-06, + "loss": 0.0318, + "step": 2303 + }, + { + "epoch": 2.1041095890410957, + "grad_norm": 39.762367248535156, + "learning_rate": 8.774226281075597e-06, + "loss": 0.3807, + "step": 2304 + }, + { + "epoch": 2.105022831050228, + "grad_norm": 27.392995834350586, + "learning_rate": 8.773211567732116e-06, + "loss": 0.3957, + "step": 2305 + }, + { + "epoch": 2.1059360730593606, + "grad_norm": 18.58510971069336, + "learning_rate": 8.772196854388637e-06, + "loss": 0.2244, + "step": 2306 + }, + { + "epoch": 2.106849315068493, + "grad_norm": 24.63326644897461, + "learning_rate": 8.771182141045156e-06, + "loss": 0.4402, + "step": 2307 + }, + { + "epoch": 2.1077625570776255, + "grad_norm": 14.8113431930542, + "learning_rate": 8.770167427701674e-06, + "loss": 0.1384, + "step": 2308 + }, + { + "epoch": 2.108675799086758, + "grad_norm": 11.059378623962402, + "learning_rate": 8.769152714358195e-06, + "loss": 0.1371, + "step": 2309 + }, + { + "epoch": 2.1095890410958904, + "grad_norm": 4.040293216705322, + "learning_rate": 8.768138001014714e-06, + "loss": 0.0508, + "step": 2310 + }, + { + "epoch": 2.110502283105023, + "grad_norm": 18.840560913085938, + "learning_rate": 8.767123287671233e-06, + "loss": 0.1734, + "step": 2311 + }, + { + "epoch": 2.1114155251141553, + "grad_norm": 30.230104446411133, + "learning_rate": 8.766108574327753e-06, + "loss": 0.7127, + "step": 2312 + }, + { + "epoch": 2.1123287671232878, + "grad_norm": 50.8250617980957, + "learning_rate": 8.765093860984272e-06, + "loss": 1.0995, + "step": 2313 + }, + { + "epoch": 2.11324200913242, + "grad_norm": 74.09111022949219, + "learning_rate": 8.764079147640793e-06, + "loss": 2.7381, + "step": 2314 + }, + { + "epoch": 2.1141552511415527, + "grad_norm": 5.536567687988281, + "learning_rate": 8.763064434297311e-06, + "loss": 0.0638, + "step": 2315 + }, + { + "epoch": 2.115068493150685, + "grad_norm": 40.03697204589844, + "learning_rate": 8.762049720953832e-06, + "loss": 0.603, + "step": 2316 + }, + { + "epoch": 2.115981735159817, + "grad_norm": 4.707198143005371, + "learning_rate": 8.761035007610351e-06, + "loss": 0.0593, + "step": 2317 + }, + { + "epoch": 2.1168949771689496, + "grad_norm": 87.72502899169922, + "learning_rate": 8.76002029426687e-06, + "loss": 2.9295, + "step": 2318 + }, + { + "epoch": 2.117808219178082, + "grad_norm": 27.52436065673828, + "learning_rate": 8.75900558092339e-06, + "loss": 0.2984, + "step": 2319 + }, + { + "epoch": 2.1187214611872145, + "grad_norm": 20.998449325561523, + "learning_rate": 8.757990867579909e-06, + "loss": 0.2354, + "step": 2320 + }, + { + "epoch": 2.119634703196347, + "grad_norm": 31.233707427978516, + "learning_rate": 8.756976154236428e-06, + "loss": 0.4301, + "step": 2321 + }, + { + "epoch": 2.1205479452054794, + "grad_norm": 0.6371352672576904, + "learning_rate": 8.755961440892948e-06, + "loss": 0.0049, + "step": 2322 + }, + { + "epoch": 2.121461187214612, + "grad_norm": 12.005119323730469, + "learning_rate": 8.754946727549469e-06, + "loss": 0.1445, + "step": 2323 + }, + { + "epoch": 2.1223744292237443, + "grad_norm": 71.5086898803711, + "learning_rate": 8.753932014205988e-06, + "loss": 1.4607, + "step": 2324 + }, + { + "epoch": 2.1232876712328768, + "grad_norm": 0.8359043002128601, + "learning_rate": 8.752917300862507e-06, + "loss": 0.0089, + "step": 2325 + }, + { + "epoch": 2.124200913242009, + "grad_norm": 0.5089883208274841, + "learning_rate": 8.751902587519027e-06, + "loss": 0.006, + "step": 2326 + }, + { + "epoch": 2.1251141552511417, + "grad_norm": 7.509218215942383, + "learning_rate": 8.750887874175546e-06, + "loss": 0.0805, + "step": 2327 + }, + { + "epoch": 2.126027397260274, + "grad_norm": 1.039494514465332, + "learning_rate": 8.749873160832065e-06, + "loss": 0.011, + "step": 2328 + }, + { + "epoch": 2.1269406392694066, + "grad_norm": 3.6633458137512207, + "learning_rate": 8.748858447488585e-06, + "loss": 0.0347, + "step": 2329 + }, + { + "epoch": 2.127853881278539, + "grad_norm": 26.848947525024414, + "learning_rate": 8.747843734145104e-06, + "loss": 0.3287, + "step": 2330 + }, + { + "epoch": 2.128767123287671, + "grad_norm": 14.931839942932129, + "learning_rate": 8.746829020801623e-06, + "loss": 0.161, + "step": 2331 + }, + { + "epoch": 2.1296803652968035, + "grad_norm": 32.15043258666992, + "learning_rate": 8.745814307458144e-06, + "loss": 0.4705, + "step": 2332 + }, + { + "epoch": 2.130593607305936, + "grad_norm": 38.9244270324707, + "learning_rate": 8.744799594114664e-06, + "loss": 0.6821, + "step": 2333 + }, + { + "epoch": 2.1315068493150684, + "grad_norm": 11.608969688415527, + "learning_rate": 8.743784880771183e-06, + "loss": 0.1203, + "step": 2334 + }, + { + "epoch": 2.132420091324201, + "grad_norm": 77.22737884521484, + "learning_rate": 8.742770167427702e-06, + "loss": 1.2664, + "step": 2335 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 3.7683231830596924, + "learning_rate": 8.741755454084222e-06, + "loss": 0.0599, + "step": 2336 + }, + { + "epoch": 2.1342465753424658, + "grad_norm": 61.02764892578125, + "learning_rate": 8.740740740740741e-06, + "loss": 1.4999, + "step": 2337 + }, + { + "epoch": 2.135159817351598, + "grad_norm": 2.6975436210632324, + "learning_rate": 8.73972602739726e-06, + "loss": 0.0345, + "step": 2338 + }, + { + "epoch": 2.1360730593607307, + "grad_norm": 18.23945426940918, + "learning_rate": 8.73871131405378e-06, + "loss": 0.2867, + "step": 2339 + }, + { + "epoch": 2.136986301369863, + "grad_norm": 41.18519973754883, + "learning_rate": 8.7376966007103e-06, + "loss": 1.1038, + "step": 2340 + }, + { + "epoch": 2.1378995433789956, + "grad_norm": 3.3781490325927734, + "learning_rate": 8.736681887366818e-06, + "loss": 0.0564, + "step": 2341 + }, + { + "epoch": 2.138812785388128, + "grad_norm": 6.411795139312744, + "learning_rate": 8.735667174023339e-06, + "loss": 0.0645, + "step": 2342 + }, + { + "epoch": 2.1397260273972605, + "grad_norm": 5.428591251373291, + "learning_rate": 8.73465246067986e-06, + "loss": 0.0614, + "step": 2343 + }, + { + "epoch": 2.1406392694063925, + "grad_norm": 8.027497291564941, + "learning_rate": 8.733637747336378e-06, + "loss": 0.088, + "step": 2344 + }, + { + "epoch": 2.141552511415525, + "grad_norm": 2.7638981342315674, + "learning_rate": 8.732623033992897e-06, + "loss": 0.0232, + "step": 2345 + }, + { + "epoch": 2.1424657534246574, + "grad_norm": 4.429522514343262, + "learning_rate": 8.731608320649418e-06, + "loss": 0.0571, + "step": 2346 + }, + { + "epoch": 2.14337899543379, + "grad_norm": 6.770659446716309, + "learning_rate": 8.730593607305937e-06, + "loss": 0.0829, + "step": 2347 + }, + { + "epoch": 2.1442922374429223, + "grad_norm": 18.74239158630371, + "learning_rate": 8.729578893962455e-06, + "loss": 0.3177, + "step": 2348 + }, + { + "epoch": 2.1452054794520548, + "grad_norm": 12.126999855041504, + "learning_rate": 8.728564180618976e-06, + "loss": 0.1717, + "step": 2349 + }, + { + "epoch": 2.146118721461187, + "grad_norm": 5.817458629608154, + "learning_rate": 8.727549467275496e-06, + "loss": 0.065, + "step": 2350 + }, + { + "epoch": 2.1470319634703197, + "grad_norm": 12.04076862335205, + "learning_rate": 8.726534753932014e-06, + "loss": 0.1068, + "step": 2351 + }, + { + "epoch": 2.147945205479452, + "grad_norm": 66.51536560058594, + "learning_rate": 8.725520040588534e-06, + "loss": 2.0818, + "step": 2352 + }, + { + "epoch": 2.1488584474885846, + "grad_norm": 12.905122756958008, + "learning_rate": 8.724505327245055e-06, + "loss": 0.163, + "step": 2353 + }, + { + "epoch": 2.149771689497717, + "grad_norm": 0.679485559463501, + "learning_rate": 8.723490613901574e-06, + "loss": 0.0034, + "step": 2354 + }, + { + "epoch": 2.1506849315068495, + "grad_norm": 23.456323623657227, + "learning_rate": 8.722475900558092e-06, + "loss": 0.3233, + "step": 2355 + }, + { + "epoch": 2.151598173515982, + "grad_norm": 46.86351013183594, + "learning_rate": 8.721461187214613e-06, + "loss": 0.4924, + "step": 2356 + }, + { + "epoch": 2.1525114155251144, + "grad_norm": 3.6552557945251465, + "learning_rate": 8.720446473871132e-06, + "loss": 0.0476, + "step": 2357 + }, + { + "epoch": 2.1534246575342464, + "grad_norm": 18.557411193847656, + "learning_rate": 8.71943176052765e-06, + "loss": 0.2607, + "step": 2358 + }, + { + "epoch": 2.154337899543379, + "grad_norm": 0.7075247764587402, + "learning_rate": 8.718417047184171e-06, + "loss": 0.0055, + "step": 2359 + }, + { + "epoch": 2.1552511415525113, + "grad_norm": 15.82413387298584, + "learning_rate": 8.717402333840692e-06, + "loss": 0.2252, + "step": 2360 + }, + { + "epoch": 2.1561643835616437, + "grad_norm": 2.352525234222412, + "learning_rate": 8.71638762049721e-06, + "loss": 0.0234, + "step": 2361 + }, + { + "epoch": 2.157077625570776, + "grad_norm": 16.970657348632812, + "learning_rate": 8.71537290715373e-06, + "loss": 0.2167, + "step": 2362 + }, + { + "epoch": 2.1579908675799087, + "grad_norm": 7.944981575012207, + "learning_rate": 8.71435819381025e-06, + "loss": 0.0774, + "step": 2363 + }, + { + "epoch": 2.158904109589041, + "grad_norm": 115.69821166992188, + "learning_rate": 8.713343480466769e-06, + "loss": 0.8875, + "step": 2364 + }, + { + "epoch": 2.1598173515981736, + "grad_norm": 11.285526275634766, + "learning_rate": 8.712328767123288e-06, + "loss": 0.1632, + "step": 2365 + }, + { + "epoch": 2.160730593607306, + "grad_norm": 17.417407989501953, + "learning_rate": 8.711314053779808e-06, + "loss": 0.2418, + "step": 2366 + }, + { + "epoch": 2.1616438356164385, + "grad_norm": 17.864118576049805, + "learning_rate": 8.710299340436329e-06, + "loss": 0.2359, + "step": 2367 + }, + { + "epoch": 2.162557077625571, + "grad_norm": 1.6118557453155518, + "learning_rate": 8.709284627092846e-06, + "loss": 0.0204, + "step": 2368 + }, + { + "epoch": 2.1634703196347034, + "grad_norm": 19.0137882232666, + "learning_rate": 8.708269913749366e-06, + "loss": 0.2559, + "step": 2369 + }, + { + "epoch": 2.1643835616438354, + "grad_norm": 13.817403793334961, + "learning_rate": 8.707255200405887e-06, + "loss": 0.1319, + "step": 2370 + }, + { + "epoch": 2.165296803652968, + "grad_norm": 19.294809341430664, + "learning_rate": 8.706240487062406e-06, + "loss": 0.1571, + "step": 2371 + }, + { + "epoch": 2.1662100456621003, + "grad_norm": 2.148376941680908, + "learning_rate": 8.705225773718925e-06, + "loss": 0.0288, + "step": 2372 + }, + { + "epoch": 2.1671232876712327, + "grad_norm": 66.64163970947266, + "learning_rate": 8.704211060375445e-06, + "loss": 1.245, + "step": 2373 + }, + { + "epoch": 2.168036529680365, + "grad_norm": 81.16895294189453, + "learning_rate": 8.703196347031964e-06, + "loss": 2.9863, + "step": 2374 + }, + { + "epoch": 2.1689497716894977, + "grad_norm": 17.258771896362305, + "learning_rate": 8.702181633688483e-06, + "loss": 0.2609, + "step": 2375 + }, + { + "epoch": 2.16986301369863, + "grad_norm": 0.7045344710350037, + "learning_rate": 8.701166920345003e-06, + "loss": 0.0063, + "step": 2376 + }, + { + "epoch": 2.1707762557077626, + "grad_norm": 9.62130069732666, + "learning_rate": 8.700152207001524e-06, + "loss": 0.091, + "step": 2377 + }, + { + "epoch": 2.171689497716895, + "grad_norm": 18.1558837890625, + "learning_rate": 8.699137493658043e-06, + "loss": 0.2362, + "step": 2378 + }, + { + "epoch": 2.1726027397260275, + "grad_norm": 5.992794036865234, + "learning_rate": 8.698122780314562e-06, + "loss": 0.0466, + "step": 2379 + }, + { + "epoch": 2.17351598173516, + "grad_norm": 2.960843086242676, + "learning_rate": 8.697108066971082e-06, + "loss": 0.0368, + "step": 2380 + }, + { + "epoch": 2.1744292237442924, + "grad_norm": 35.6290283203125, + "learning_rate": 8.696093353627601e-06, + "loss": 0.4791, + "step": 2381 + }, + { + "epoch": 2.175342465753425, + "grad_norm": 53.11851119995117, + "learning_rate": 8.69507864028412e-06, + "loss": 0.7208, + "step": 2382 + }, + { + "epoch": 2.1762557077625573, + "grad_norm": 3.4597091674804688, + "learning_rate": 8.69406392694064e-06, + "loss": 0.0117, + "step": 2383 + }, + { + "epoch": 2.1771689497716897, + "grad_norm": 45.916778564453125, + "learning_rate": 8.69304921359716e-06, + "loss": 0.6558, + "step": 2384 + }, + { + "epoch": 2.1780821917808217, + "grad_norm": 11.718358039855957, + "learning_rate": 8.692034500253678e-06, + "loss": 0.1128, + "step": 2385 + }, + { + "epoch": 2.178995433789954, + "grad_norm": 0.7710853815078735, + "learning_rate": 8.691019786910199e-06, + "loss": 0.0082, + "step": 2386 + }, + { + "epoch": 2.1799086757990866, + "grad_norm": 6.511626720428467, + "learning_rate": 8.690005073566719e-06, + "loss": 0.0744, + "step": 2387 + }, + { + "epoch": 2.180821917808219, + "grad_norm": 9.516329765319824, + "learning_rate": 8.688990360223238e-06, + "loss": 0.1021, + "step": 2388 + }, + { + "epoch": 2.1817351598173516, + "grad_norm": 64.71952819824219, + "learning_rate": 8.687975646879757e-06, + "loss": 1.1967, + "step": 2389 + }, + { + "epoch": 2.182648401826484, + "grad_norm": 6.1615400314331055, + "learning_rate": 8.686960933536277e-06, + "loss": 0.0734, + "step": 2390 + }, + { + "epoch": 2.1835616438356165, + "grad_norm": 4.245400905609131, + "learning_rate": 8.685946220192796e-06, + "loss": 0.0431, + "step": 2391 + }, + { + "epoch": 2.184474885844749, + "grad_norm": 92.30901336669922, + "learning_rate": 8.684931506849315e-06, + "loss": 2.3967, + "step": 2392 + }, + { + "epoch": 2.1853881278538814, + "grad_norm": 4.96692419052124, + "learning_rate": 8.683916793505836e-06, + "loss": 0.0381, + "step": 2393 + }, + { + "epoch": 2.186301369863014, + "grad_norm": 1.5251747369766235, + "learning_rate": 8.682902080162356e-06, + "loss": 0.0099, + "step": 2394 + }, + { + "epoch": 2.1872146118721463, + "grad_norm": 0.8557568192481995, + "learning_rate": 8.681887366818873e-06, + "loss": 0.0043, + "step": 2395 + }, + { + "epoch": 2.1881278538812787, + "grad_norm": 46.55625534057617, + "learning_rate": 8.680872653475394e-06, + "loss": 0.768, + "step": 2396 + }, + { + "epoch": 2.1890410958904107, + "grad_norm": 37.159793853759766, + "learning_rate": 8.679857940131914e-06, + "loss": 0.7693, + "step": 2397 + }, + { + "epoch": 2.189954337899543, + "grad_norm": 34.30898666381836, + "learning_rate": 8.678843226788433e-06, + "loss": 0.5757, + "step": 2398 + }, + { + "epoch": 2.1908675799086756, + "grad_norm": 6.654991149902344, + "learning_rate": 8.677828513444952e-06, + "loss": 0.0716, + "step": 2399 + }, + { + "epoch": 2.191780821917808, + "grad_norm": 0.5525369048118591, + "learning_rate": 8.676813800101473e-06, + "loss": 0.0069, + "step": 2400 + }, + { + "epoch": 2.1926940639269406, + "grad_norm": 16.0076904296875, + "learning_rate": 8.675799086757991e-06, + "loss": 0.1643, + "step": 2401 + }, + { + "epoch": 2.193607305936073, + "grad_norm": 16.825511932373047, + "learning_rate": 8.67478437341451e-06, + "loss": 0.1836, + "step": 2402 + }, + { + "epoch": 2.1945205479452055, + "grad_norm": 100.20521545410156, + "learning_rate": 8.67376966007103e-06, + "loss": 2.6957, + "step": 2403 + }, + { + "epoch": 2.195433789954338, + "grad_norm": 33.35700225830078, + "learning_rate": 8.672754946727551e-06, + "loss": 0.3809, + "step": 2404 + }, + { + "epoch": 2.1963470319634704, + "grad_norm": 2.8637709617614746, + "learning_rate": 8.67174023338407e-06, + "loss": 0.0407, + "step": 2405 + }, + { + "epoch": 2.197260273972603, + "grad_norm": 7.062195301055908, + "learning_rate": 8.670725520040589e-06, + "loss": 0.0607, + "step": 2406 + }, + { + "epoch": 2.1981735159817353, + "grad_norm": 4.075605392456055, + "learning_rate": 8.66971080669711e-06, + "loss": 0.0535, + "step": 2407 + }, + { + "epoch": 2.1990867579908677, + "grad_norm": 3.933199405670166, + "learning_rate": 8.668696093353628e-06, + "loss": 0.0306, + "step": 2408 + }, + { + "epoch": 2.2, + "grad_norm": 24.790237426757812, + "learning_rate": 8.667681380010147e-06, + "loss": 0.2275, + "step": 2409 + }, + { + "epoch": 2.2009132420091326, + "grad_norm": 7.80652379989624, + "learning_rate": 8.666666666666668e-06, + "loss": 0.0775, + "step": 2410 + }, + { + "epoch": 2.2018264840182646, + "grad_norm": 36.266639709472656, + "learning_rate": 8.665651953323187e-06, + "loss": 0.5542, + "step": 2411 + }, + { + "epoch": 2.202739726027397, + "grad_norm": 6.6564106941223145, + "learning_rate": 8.664637239979706e-06, + "loss": 0.0519, + "step": 2412 + }, + { + "epoch": 2.2036529680365295, + "grad_norm": 19.617307662963867, + "learning_rate": 8.663622526636226e-06, + "loss": 0.3169, + "step": 2413 + }, + { + "epoch": 2.204566210045662, + "grad_norm": 11.541341781616211, + "learning_rate": 8.662607813292747e-06, + "loss": 0.1362, + "step": 2414 + }, + { + "epoch": 2.2054794520547945, + "grad_norm": 9.774374961853027, + "learning_rate": 8.661593099949265e-06, + "loss": 0.0962, + "step": 2415 + }, + { + "epoch": 2.206392694063927, + "grad_norm": 33.94244384765625, + "learning_rate": 8.660578386605784e-06, + "loss": 0.2418, + "step": 2416 + }, + { + "epoch": 2.2073059360730594, + "grad_norm": 22.73053741455078, + "learning_rate": 8.659563673262305e-06, + "loss": 0.2037, + "step": 2417 + }, + { + "epoch": 2.208219178082192, + "grad_norm": 97.52508544921875, + "learning_rate": 8.658548959918824e-06, + "loss": 2.3292, + "step": 2418 + }, + { + "epoch": 2.2091324200913243, + "grad_norm": 16.156827926635742, + "learning_rate": 8.657534246575343e-06, + "loss": 0.1121, + "step": 2419 + }, + { + "epoch": 2.2100456621004567, + "grad_norm": 25.874643325805664, + "learning_rate": 8.656519533231863e-06, + "loss": 0.3268, + "step": 2420 + }, + { + "epoch": 2.210958904109589, + "grad_norm": 27.777057647705078, + "learning_rate": 8.655504819888382e-06, + "loss": 0.3854, + "step": 2421 + }, + { + "epoch": 2.2118721461187216, + "grad_norm": 8.711482048034668, + "learning_rate": 8.654490106544902e-06, + "loss": 0.0873, + "step": 2422 + }, + { + "epoch": 2.212785388127854, + "grad_norm": 34.30756378173828, + "learning_rate": 8.653475393201421e-06, + "loss": 0.2699, + "step": 2423 + }, + { + "epoch": 2.213698630136986, + "grad_norm": 2.8007800579071045, + "learning_rate": 8.652460679857942e-06, + "loss": 0.023, + "step": 2424 + }, + { + "epoch": 2.2146118721461185, + "grad_norm": 33.672943115234375, + "learning_rate": 8.65144596651446e-06, + "loss": 0.5924, + "step": 2425 + }, + { + "epoch": 2.215525114155251, + "grad_norm": 74.95576477050781, + "learning_rate": 8.65043125317098e-06, + "loss": 0.7891, + "step": 2426 + }, + { + "epoch": 2.2164383561643834, + "grad_norm": 10.385817527770996, + "learning_rate": 8.6494165398275e-06, + "loss": 0.0865, + "step": 2427 + }, + { + "epoch": 2.217351598173516, + "grad_norm": 9.591035842895508, + "learning_rate": 8.648401826484019e-06, + "loss": 0.0581, + "step": 2428 + }, + { + "epoch": 2.2182648401826484, + "grad_norm": 55.1292610168457, + "learning_rate": 8.647387113140538e-06, + "loss": 1.0454, + "step": 2429 + }, + { + "epoch": 2.219178082191781, + "grad_norm": 38.48688507080078, + "learning_rate": 8.646372399797058e-06, + "loss": 0.3057, + "step": 2430 + }, + { + "epoch": 2.2200913242009133, + "grad_norm": 30.678316116333008, + "learning_rate": 8.645357686453577e-06, + "loss": 0.4022, + "step": 2431 + }, + { + "epoch": 2.2210045662100457, + "grad_norm": 68.80884552001953, + "learning_rate": 8.644342973110098e-06, + "loss": 2.1517, + "step": 2432 + }, + { + "epoch": 2.221917808219178, + "grad_norm": 31.008066177368164, + "learning_rate": 8.643328259766617e-06, + "loss": 0.303, + "step": 2433 + }, + { + "epoch": 2.2228310502283106, + "grad_norm": 67.28533935546875, + "learning_rate": 8.642313546423137e-06, + "loss": 1.425, + "step": 2434 + }, + { + "epoch": 2.223744292237443, + "grad_norm": 32.93142318725586, + "learning_rate": 8.641298833079656e-06, + "loss": 0.3398, + "step": 2435 + }, + { + "epoch": 2.2246575342465755, + "grad_norm": 26.35773277282715, + "learning_rate": 8.640284119736175e-06, + "loss": 0.343, + "step": 2436 + }, + { + "epoch": 2.225570776255708, + "grad_norm": 41.266544342041016, + "learning_rate": 8.639269406392695e-06, + "loss": 0.2086, + "step": 2437 + }, + { + "epoch": 2.22648401826484, + "grad_norm": 48.4571647644043, + "learning_rate": 8.638254693049214e-06, + "loss": 0.7046, + "step": 2438 + }, + { + "epoch": 2.2273972602739724, + "grad_norm": 17.340442657470703, + "learning_rate": 8.637239979705733e-06, + "loss": 0.0611, + "step": 2439 + }, + { + "epoch": 2.228310502283105, + "grad_norm": 214.38241577148438, + "learning_rate": 8.636225266362254e-06, + "loss": 0.9995, + "step": 2440 + }, + { + "epoch": 2.2292237442922374, + "grad_norm": 217.89306640625, + "learning_rate": 8.635210553018772e-06, + "loss": 1.0913, + "step": 2441 + }, + { + "epoch": 2.23013698630137, + "grad_norm": 32.342796325683594, + "learning_rate": 8.634195839675293e-06, + "loss": 0.3613, + "step": 2442 + }, + { + "epoch": 2.2310502283105023, + "grad_norm": 24.00410270690918, + "learning_rate": 8.633181126331812e-06, + "loss": 0.2844, + "step": 2443 + }, + { + "epoch": 2.2319634703196347, + "grad_norm": 6.826633930206299, + "learning_rate": 8.632166412988332e-06, + "loss": 0.0626, + "step": 2444 + }, + { + "epoch": 2.232876712328767, + "grad_norm": 1.3984366655349731, + "learning_rate": 8.631151699644851e-06, + "loss": 0.0117, + "step": 2445 + }, + { + "epoch": 2.2337899543378996, + "grad_norm": 22.326032638549805, + "learning_rate": 8.63013698630137e-06, + "loss": 0.1637, + "step": 2446 + }, + { + "epoch": 2.234703196347032, + "grad_norm": 17.513853073120117, + "learning_rate": 8.62912227295789e-06, + "loss": 0.225, + "step": 2447 + }, + { + "epoch": 2.2356164383561645, + "grad_norm": 1.5506482124328613, + "learning_rate": 8.62810755961441e-06, + "loss": 0.0154, + "step": 2448 + }, + { + "epoch": 2.236529680365297, + "grad_norm": 5.171593189239502, + "learning_rate": 8.62709284627093e-06, + "loss": 0.0533, + "step": 2449 + }, + { + "epoch": 2.237442922374429, + "grad_norm": 91.62384796142578, + "learning_rate": 8.626078132927449e-06, + "loss": 1.5812, + "step": 2450 + }, + { + "epoch": 2.2383561643835614, + "grad_norm": 27.26803970336914, + "learning_rate": 8.625063419583968e-06, + "loss": 0.3038, + "step": 2451 + }, + { + "epoch": 2.239269406392694, + "grad_norm": 11.299261093139648, + "learning_rate": 8.624048706240488e-06, + "loss": 0.1382, + "step": 2452 + }, + { + "epoch": 2.2401826484018263, + "grad_norm": 12.083605766296387, + "learning_rate": 8.623033992897007e-06, + "loss": 0.1558, + "step": 2453 + }, + { + "epoch": 2.241095890410959, + "grad_norm": 19.495601654052734, + "learning_rate": 8.622019279553528e-06, + "loss": 0.155, + "step": 2454 + }, + { + "epoch": 2.2420091324200913, + "grad_norm": 55.157264709472656, + "learning_rate": 8.621004566210046e-06, + "loss": 0.9264, + "step": 2455 + }, + { + "epoch": 2.2429223744292237, + "grad_norm": 20.250322341918945, + "learning_rate": 8.619989852866565e-06, + "loss": 0.1999, + "step": 2456 + }, + { + "epoch": 2.243835616438356, + "grad_norm": 42.36797332763672, + "learning_rate": 8.618975139523086e-06, + "loss": 0.75, + "step": 2457 + }, + { + "epoch": 2.2447488584474886, + "grad_norm": 56.07738494873047, + "learning_rate": 8.617960426179605e-06, + "loss": 0.7569, + "step": 2458 + }, + { + "epoch": 2.245662100456621, + "grad_norm": 2.907106876373291, + "learning_rate": 8.616945712836125e-06, + "loss": 0.0193, + "step": 2459 + }, + { + "epoch": 2.2465753424657535, + "grad_norm": 73.21109008789062, + "learning_rate": 8.615930999492644e-06, + "loss": 2.0594, + "step": 2460 + }, + { + "epoch": 2.247488584474886, + "grad_norm": 39.634796142578125, + "learning_rate": 8.614916286149163e-06, + "loss": 0.37, + "step": 2461 + }, + { + "epoch": 2.2484018264840184, + "grad_norm": 9.39201831817627, + "learning_rate": 8.613901572805683e-06, + "loss": 0.1009, + "step": 2462 + }, + { + "epoch": 2.249315068493151, + "grad_norm": 29.542251586914062, + "learning_rate": 8.612886859462202e-06, + "loss": 0.8135, + "step": 2463 + }, + { + "epoch": 2.2502283105022833, + "grad_norm": 11.01793384552002, + "learning_rate": 8.611872146118723e-06, + "loss": 0.103, + "step": 2464 + }, + { + "epoch": 2.2511415525114153, + "grad_norm": 1.089413046836853, + "learning_rate": 8.610857432775242e-06, + "loss": 0.008, + "step": 2465 + }, + { + "epoch": 2.252054794520548, + "grad_norm": 16.028453826904297, + "learning_rate": 8.609842719431762e-06, + "loss": 0.2565, + "step": 2466 + }, + { + "epoch": 2.2529680365296803, + "grad_norm": 4.2744059562683105, + "learning_rate": 8.608828006088281e-06, + "loss": 0.0343, + "step": 2467 + }, + { + "epoch": 2.2538812785388127, + "grad_norm": 38.262020111083984, + "learning_rate": 8.6078132927448e-06, + "loss": 0.7189, + "step": 2468 + }, + { + "epoch": 2.254794520547945, + "grad_norm": 22.292856216430664, + "learning_rate": 8.60679857940132e-06, + "loss": 0.2362, + "step": 2469 + }, + { + "epoch": 2.2557077625570776, + "grad_norm": 6.176025867462158, + "learning_rate": 8.60578386605784e-06, + "loss": 0.0373, + "step": 2470 + }, + { + "epoch": 2.25662100456621, + "grad_norm": 74.72148132324219, + "learning_rate": 8.604769152714358e-06, + "loss": 4.4792, + "step": 2471 + }, + { + "epoch": 2.2575342465753425, + "grad_norm": 5.742807865142822, + "learning_rate": 8.603754439370879e-06, + "loss": 0.0495, + "step": 2472 + }, + { + "epoch": 2.258447488584475, + "grad_norm": 1.5260032415390015, + "learning_rate": 8.602739726027397e-06, + "loss": 0.014, + "step": 2473 + }, + { + "epoch": 2.2593607305936074, + "grad_norm": 15.377545356750488, + "learning_rate": 8.601725012683918e-06, + "loss": 0.2306, + "step": 2474 + }, + { + "epoch": 2.26027397260274, + "grad_norm": 72.84937286376953, + "learning_rate": 8.600710299340437e-06, + "loss": 0.1464, + "step": 2475 + }, + { + "epoch": 2.2611872146118723, + "grad_norm": 18.12810707092285, + "learning_rate": 8.599695585996957e-06, + "loss": 0.2411, + "step": 2476 + }, + { + "epoch": 2.2621004566210043, + "grad_norm": 1.296041488647461, + "learning_rate": 8.598680872653476e-06, + "loss": 0.01, + "step": 2477 + }, + { + "epoch": 2.263013698630137, + "grad_norm": 24.56373405456543, + "learning_rate": 8.597666159309995e-06, + "loss": 0.3447, + "step": 2478 + }, + { + "epoch": 2.2639269406392692, + "grad_norm": 1.7323980331420898, + "learning_rate": 8.596651445966516e-06, + "loss": 0.0155, + "step": 2479 + }, + { + "epoch": 2.2648401826484017, + "grad_norm": 39.896183013916016, + "learning_rate": 8.595636732623034e-06, + "loss": 0.545, + "step": 2480 + }, + { + "epoch": 2.265753424657534, + "grad_norm": 2.95975399017334, + "learning_rate": 8.594622019279553e-06, + "loss": 0.0336, + "step": 2481 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 0.554725170135498, + "learning_rate": 8.593607305936074e-06, + "loss": 0.0051, + "step": 2482 + }, + { + "epoch": 2.267579908675799, + "grad_norm": 20.296300888061523, + "learning_rate": 8.592592592592593e-06, + "loss": 0.2011, + "step": 2483 + }, + { + "epoch": 2.2684931506849315, + "grad_norm": 67.0678482055664, + "learning_rate": 8.591577879249113e-06, + "loss": 1.8297, + "step": 2484 + }, + { + "epoch": 2.269406392694064, + "grad_norm": 27.936630249023438, + "learning_rate": 8.590563165905632e-06, + "loss": 0.3167, + "step": 2485 + }, + { + "epoch": 2.2703196347031964, + "grad_norm": 9.453453063964844, + "learning_rate": 8.589548452562153e-06, + "loss": 0.094, + "step": 2486 + }, + { + "epoch": 2.271232876712329, + "grad_norm": 24.065174102783203, + "learning_rate": 8.588533739218671e-06, + "loss": 0.2911, + "step": 2487 + }, + { + "epoch": 2.2721461187214613, + "grad_norm": 44.28615951538086, + "learning_rate": 8.58751902587519e-06, + "loss": 0.7856, + "step": 2488 + }, + { + "epoch": 2.273059360730594, + "grad_norm": 22.359777450561523, + "learning_rate": 8.58650431253171e-06, + "loss": 0.2298, + "step": 2489 + }, + { + "epoch": 2.2739726027397262, + "grad_norm": 23.08578109741211, + "learning_rate": 8.58548959918823e-06, + "loss": 0.4391, + "step": 2490 + }, + { + "epoch": 2.2748858447488587, + "grad_norm": 66.94755554199219, + "learning_rate": 8.584474885844748e-06, + "loss": 1.0808, + "step": 2491 + }, + { + "epoch": 2.2757990867579907, + "grad_norm": 36.23536682128906, + "learning_rate": 8.583460172501269e-06, + "loss": 0.526, + "step": 2492 + }, + { + "epoch": 2.276712328767123, + "grad_norm": 22.811065673828125, + "learning_rate": 8.58244545915779e-06, + "loss": 0.2995, + "step": 2493 + }, + { + "epoch": 2.2776255707762556, + "grad_norm": 51.52692794799805, + "learning_rate": 8.581430745814308e-06, + "loss": 0.7994, + "step": 2494 + }, + { + "epoch": 2.278538812785388, + "grad_norm": 16.293066024780273, + "learning_rate": 8.580416032470827e-06, + "loss": 0.1788, + "step": 2495 + }, + { + "epoch": 2.2794520547945205, + "grad_norm": 11.566032409667969, + "learning_rate": 8.579401319127348e-06, + "loss": 0.1505, + "step": 2496 + }, + { + "epoch": 2.280365296803653, + "grad_norm": 0.3246036767959595, + "learning_rate": 8.578386605783867e-06, + "loss": 0.0021, + "step": 2497 + }, + { + "epoch": 2.2812785388127854, + "grad_norm": 7.979886054992676, + "learning_rate": 8.577371892440385e-06, + "loss": 0.0896, + "step": 2498 + }, + { + "epoch": 2.282191780821918, + "grad_norm": 97.38020324707031, + "learning_rate": 8.576357179096906e-06, + "loss": 1.0619, + "step": 2499 + }, + { + "epoch": 2.2831050228310503, + "grad_norm": 4.804965019226074, + "learning_rate": 8.575342465753425e-06, + "loss": 0.0452, + "step": 2500 + }, + { + "epoch": 2.2840182648401828, + "grad_norm": 16.47871208190918, + "learning_rate": 8.574327752409944e-06, + "loss": 0.1354, + "step": 2501 + }, + { + "epoch": 2.2849315068493152, + "grad_norm": 116.62307739257812, + "learning_rate": 8.573313039066464e-06, + "loss": 5.3118, + "step": 2502 + }, + { + "epoch": 2.2858447488584472, + "grad_norm": 28.51523208618164, + "learning_rate": 8.572298325722985e-06, + "loss": 0.2864, + "step": 2503 + }, + { + "epoch": 2.2867579908675797, + "grad_norm": 97.35140991210938, + "learning_rate": 8.571283612379504e-06, + "loss": 2.3623, + "step": 2504 + }, + { + "epoch": 2.287671232876712, + "grad_norm": 39.22638702392578, + "learning_rate": 8.570268899036022e-06, + "loss": 0.6379, + "step": 2505 + }, + { + "epoch": 2.2885844748858446, + "grad_norm": 83.85657501220703, + "learning_rate": 8.569254185692543e-06, + "loss": 1.95, + "step": 2506 + }, + { + "epoch": 2.289497716894977, + "grad_norm": 5.238973140716553, + "learning_rate": 8.568239472349062e-06, + "loss": 0.0524, + "step": 2507 + }, + { + "epoch": 2.2904109589041095, + "grad_norm": 6.3516669273376465, + "learning_rate": 8.56722475900558e-06, + "loss": 0.0705, + "step": 2508 + }, + { + "epoch": 2.291324200913242, + "grad_norm": 29.124011993408203, + "learning_rate": 8.566210045662101e-06, + "loss": 0.2747, + "step": 2509 + }, + { + "epoch": 2.2922374429223744, + "grad_norm": 37.78256607055664, + "learning_rate": 8.565195332318622e-06, + "loss": 0.5195, + "step": 2510 + }, + { + "epoch": 2.293150684931507, + "grad_norm": 60.615718841552734, + "learning_rate": 8.564180618975139e-06, + "loss": 1.0581, + "step": 2511 + }, + { + "epoch": 2.2940639269406393, + "grad_norm": 66.2877426147461, + "learning_rate": 8.56316590563166e-06, + "loss": 1.3082, + "step": 2512 + }, + { + "epoch": 2.2949771689497718, + "grad_norm": 14.597664833068848, + "learning_rate": 8.56215119228818e-06, + "loss": 0.1482, + "step": 2513 + }, + { + "epoch": 2.2958904109589042, + "grad_norm": 15.945693016052246, + "learning_rate": 8.561136478944699e-06, + "loss": 0.2215, + "step": 2514 + }, + { + "epoch": 2.2968036529680367, + "grad_norm": 5.7241973876953125, + "learning_rate": 8.560121765601218e-06, + "loss": 0.0506, + "step": 2515 + }, + { + "epoch": 2.297716894977169, + "grad_norm": 7.385769367218018, + "learning_rate": 8.559107052257738e-06, + "loss": 0.0825, + "step": 2516 + }, + { + "epoch": 2.2986301369863016, + "grad_norm": 62.43697738647461, + "learning_rate": 8.558092338914257e-06, + "loss": 0.4029, + "step": 2517 + }, + { + "epoch": 2.2995433789954336, + "grad_norm": 31.72388458251953, + "learning_rate": 8.557077625570776e-06, + "loss": 0.3319, + "step": 2518 + }, + { + "epoch": 2.300456621004566, + "grad_norm": 9.59335994720459, + "learning_rate": 8.556062912227296e-06, + "loss": 0.088, + "step": 2519 + }, + { + "epoch": 2.3013698630136985, + "grad_norm": 35.66094970703125, + "learning_rate": 8.555048198883817e-06, + "loss": 0.3802, + "step": 2520 + }, + { + "epoch": 2.302283105022831, + "grad_norm": 10.6251220703125, + "learning_rate": 8.554033485540336e-06, + "loss": 0.0858, + "step": 2521 + }, + { + "epoch": 2.3031963470319634, + "grad_norm": 15.084885597229004, + "learning_rate": 8.553018772196855e-06, + "loss": 0.2103, + "step": 2522 + }, + { + "epoch": 2.304109589041096, + "grad_norm": 2.833773612976074, + "learning_rate": 8.552004058853375e-06, + "loss": 0.0393, + "step": 2523 + }, + { + "epoch": 2.3050228310502283, + "grad_norm": 57.790077209472656, + "learning_rate": 8.550989345509894e-06, + "loss": 1.3947, + "step": 2524 + }, + { + "epoch": 2.3059360730593608, + "grad_norm": 43.976444244384766, + "learning_rate": 8.549974632166413e-06, + "loss": 0.8234, + "step": 2525 + }, + { + "epoch": 2.3068493150684932, + "grad_norm": 35.979923248291016, + "learning_rate": 8.548959918822933e-06, + "loss": 0.9239, + "step": 2526 + }, + { + "epoch": 2.3077625570776257, + "grad_norm": 54.662864685058594, + "learning_rate": 8.547945205479454e-06, + "loss": 0.7052, + "step": 2527 + }, + { + "epoch": 2.308675799086758, + "grad_norm": 17.7056941986084, + "learning_rate": 8.546930492135971e-06, + "loss": 0.2703, + "step": 2528 + }, + { + "epoch": 2.3095890410958906, + "grad_norm": 22.95075225830078, + "learning_rate": 8.545915778792492e-06, + "loss": 0.1682, + "step": 2529 + }, + { + "epoch": 2.3105022831050226, + "grad_norm": 1.6539087295532227, + "learning_rate": 8.544901065449012e-06, + "loss": 0.0172, + "step": 2530 + }, + { + "epoch": 2.311415525114155, + "grad_norm": 7.595870494842529, + "learning_rate": 8.543886352105531e-06, + "loss": 0.0581, + "step": 2531 + }, + { + "epoch": 2.3123287671232875, + "grad_norm": 51.124351501464844, + "learning_rate": 8.54287163876205e-06, + "loss": 1.0177, + "step": 2532 + }, + { + "epoch": 2.31324200913242, + "grad_norm": 12.623218536376953, + "learning_rate": 8.54185692541857e-06, + "loss": 0.1035, + "step": 2533 + }, + { + "epoch": 2.3141552511415524, + "grad_norm": 7.650071620941162, + "learning_rate": 8.54084221207509e-06, + "loss": 0.0837, + "step": 2534 + }, + { + "epoch": 2.315068493150685, + "grad_norm": 4.104850769042969, + "learning_rate": 8.539827498731608e-06, + "loss": 0.0287, + "step": 2535 + }, + { + "epoch": 2.3159817351598173, + "grad_norm": 21.494104385375977, + "learning_rate": 8.538812785388129e-06, + "loss": 0.2366, + "step": 2536 + }, + { + "epoch": 2.3168949771689498, + "grad_norm": 1.3706049919128418, + "learning_rate": 8.53779807204465e-06, + "loss": 0.0135, + "step": 2537 + }, + { + "epoch": 2.317808219178082, + "grad_norm": 17.670623779296875, + "learning_rate": 8.536783358701168e-06, + "loss": 0.3505, + "step": 2538 + }, + { + "epoch": 2.3187214611872147, + "grad_norm": 8.227909088134766, + "learning_rate": 8.535768645357687e-06, + "loss": 0.0672, + "step": 2539 + }, + { + "epoch": 2.319634703196347, + "grad_norm": 8.898208618164062, + "learning_rate": 8.534753932014207e-06, + "loss": 0.1017, + "step": 2540 + }, + { + "epoch": 2.3205479452054796, + "grad_norm": 54.90418243408203, + "learning_rate": 8.533739218670726e-06, + "loss": 0.8073, + "step": 2541 + }, + { + "epoch": 2.321461187214612, + "grad_norm": 18.218969345092773, + "learning_rate": 8.532724505327245e-06, + "loss": 0.2066, + "step": 2542 + }, + { + "epoch": 2.3223744292237445, + "grad_norm": 7.022598743438721, + "learning_rate": 8.531709791983766e-06, + "loss": 0.1168, + "step": 2543 + }, + { + "epoch": 2.323287671232877, + "grad_norm": 40.33091354370117, + "learning_rate": 8.530695078640285e-06, + "loss": 0.6757, + "step": 2544 + }, + { + "epoch": 2.324200913242009, + "grad_norm": 24.82576560974121, + "learning_rate": 8.529680365296803e-06, + "loss": 0.2095, + "step": 2545 + }, + { + "epoch": 2.3251141552511414, + "grad_norm": 13.451592445373535, + "learning_rate": 8.528665651953324e-06, + "loss": 0.1481, + "step": 2546 + }, + { + "epoch": 2.326027397260274, + "grad_norm": 10.416699409484863, + "learning_rate": 8.527650938609844e-06, + "loss": 0.1275, + "step": 2547 + }, + { + "epoch": 2.3269406392694063, + "grad_norm": 12.251401901245117, + "learning_rate": 8.526636225266363e-06, + "loss": 0.1245, + "step": 2548 + }, + { + "epoch": 2.3278538812785388, + "grad_norm": 59.48749542236328, + "learning_rate": 8.525621511922882e-06, + "loss": 1.2211, + "step": 2549 + }, + { + "epoch": 2.328767123287671, + "grad_norm": 35.97823715209961, + "learning_rate": 8.524606798579403e-06, + "loss": 0.7375, + "step": 2550 + }, + { + "epoch": 2.3296803652968037, + "grad_norm": 27.389890670776367, + "learning_rate": 8.523592085235922e-06, + "loss": 0.3558, + "step": 2551 + }, + { + "epoch": 2.330593607305936, + "grad_norm": 21.23909568786621, + "learning_rate": 8.52257737189244e-06, + "loss": 0.2363, + "step": 2552 + }, + { + "epoch": 2.3315068493150686, + "grad_norm": 22.51871681213379, + "learning_rate": 8.521562658548961e-06, + "loss": 0.3633, + "step": 2553 + }, + { + "epoch": 2.332420091324201, + "grad_norm": 7.164186954498291, + "learning_rate": 8.520547945205481e-06, + "loss": 0.0643, + "step": 2554 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 33.92820358276367, + "learning_rate": 8.519533231861999e-06, + "loss": 0.572, + "step": 2555 + }, + { + "epoch": 2.334246575342466, + "grad_norm": 2.8569979667663574, + "learning_rate": 8.518518518518519e-06, + "loss": 0.0383, + "step": 2556 + }, + { + "epoch": 2.335159817351598, + "grad_norm": 9.5850830078125, + "learning_rate": 8.51750380517504e-06, + "loss": 0.0818, + "step": 2557 + }, + { + "epoch": 2.3360730593607304, + "grad_norm": 23.453548431396484, + "learning_rate": 8.516489091831559e-06, + "loss": 0.2341, + "step": 2558 + }, + { + "epoch": 2.336986301369863, + "grad_norm": 10.279058456420898, + "learning_rate": 8.515474378488077e-06, + "loss": 0.0988, + "step": 2559 + }, + { + "epoch": 2.3378995433789953, + "grad_norm": 4.855576515197754, + "learning_rate": 8.514459665144598e-06, + "loss": 0.0612, + "step": 2560 + }, + { + "epoch": 2.3388127853881278, + "grad_norm": 34.258148193359375, + "learning_rate": 8.513444951801117e-06, + "loss": 0.5572, + "step": 2561 + }, + { + "epoch": 2.33972602739726, + "grad_norm": 26.81546401977539, + "learning_rate": 8.512430238457636e-06, + "loss": 0.3462, + "step": 2562 + }, + { + "epoch": 2.3406392694063927, + "grad_norm": 43.61079025268555, + "learning_rate": 8.511415525114156e-06, + "loss": 0.7081, + "step": 2563 + }, + { + "epoch": 2.341552511415525, + "grad_norm": 38.401100158691406, + "learning_rate": 8.510400811770677e-06, + "loss": 0.6696, + "step": 2564 + }, + { + "epoch": 2.3424657534246576, + "grad_norm": 93.31207275390625, + "learning_rate": 8.509386098427196e-06, + "loss": 1.5782, + "step": 2565 + }, + { + "epoch": 2.34337899543379, + "grad_norm": 15.269840240478516, + "learning_rate": 8.508371385083714e-06, + "loss": 0.1832, + "step": 2566 + }, + { + "epoch": 2.3442922374429225, + "grad_norm": 21.6315860748291, + "learning_rate": 8.507356671740235e-06, + "loss": 0.2671, + "step": 2567 + }, + { + "epoch": 2.345205479452055, + "grad_norm": 1.5377742052078247, + "learning_rate": 8.506341958396754e-06, + "loss": 0.0124, + "step": 2568 + }, + { + "epoch": 2.3461187214611874, + "grad_norm": 10.304574966430664, + "learning_rate": 8.505327245053273e-06, + "loss": 0.0975, + "step": 2569 + }, + { + "epoch": 2.34703196347032, + "grad_norm": 76.87966918945312, + "learning_rate": 8.504312531709793e-06, + "loss": 2.0125, + "step": 2570 + }, + { + "epoch": 2.3479452054794523, + "grad_norm": 41.43619155883789, + "learning_rate": 8.503297818366312e-06, + "loss": 0.7732, + "step": 2571 + }, + { + "epoch": 2.3488584474885843, + "grad_norm": 24.396278381347656, + "learning_rate": 8.50228310502283e-06, + "loss": 0.2456, + "step": 2572 + }, + { + "epoch": 2.3497716894977168, + "grad_norm": 56.82029724121094, + "learning_rate": 8.501268391679351e-06, + "loss": 0.9676, + "step": 2573 + }, + { + "epoch": 2.350684931506849, + "grad_norm": 14.897598266601562, + "learning_rate": 8.500253678335872e-06, + "loss": 0.1888, + "step": 2574 + }, + { + "epoch": 2.3515981735159817, + "grad_norm": 4.411530494689941, + "learning_rate": 8.49923896499239e-06, + "loss": 0.0485, + "step": 2575 + }, + { + "epoch": 2.352511415525114, + "grad_norm": 13.339557647705078, + "learning_rate": 8.49822425164891e-06, + "loss": 0.0905, + "step": 2576 + }, + { + "epoch": 2.3534246575342466, + "grad_norm": 7.402656555175781, + "learning_rate": 8.49720953830543e-06, + "loss": 0.0752, + "step": 2577 + }, + { + "epoch": 2.354337899543379, + "grad_norm": 52.366947174072266, + "learning_rate": 8.496194824961949e-06, + "loss": 1.5175, + "step": 2578 + }, + { + "epoch": 2.3552511415525115, + "grad_norm": 23.853784561157227, + "learning_rate": 8.495180111618468e-06, + "loss": 0.3055, + "step": 2579 + }, + { + "epoch": 2.356164383561644, + "grad_norm": 5.731657981872559, + "learning_rate": 8.494165398274988e-06, + "loss": 0.0555, + "step": 2580 + }, + { + "epoch": 2.3570776255707764, + "grad_norm": 13.033509254455566, + "learning_rate": 8.493150684931507e-06, + "loss": 0.0984, + "step": 2581 + }, + { + "epoch": 2.357990867579909, + "grad_norm": 0.5984832644462585, + "learning_rate": 8.492135971588028e-06, + "loss": 0.0032, + "step": 2582 + }, + { + "epoch": 2.3589041095890413, + "grad_norm": 13.269862174987793, + "learning_rate": 8.491121258244547e-06, + "loss": 0.1858, + "step": 2583 + }, + { + "epoch": 2.3598173515981733, + "grad_norm": 2.735943078994751, + "learning_rate": 8.490106544901067e-06, + "loss": 0.031, + "step": 2584 + }, + { + "epoch": 2.3607305936073057, + "grad_norm": 2.7740471363067627, + "learning_rate": 8.489091831557586e-06, + "loss": 0.0349, + "step": 2585 + }, + { + "epoch": 2.361643835616438, + "grad_norm": 6.614674091339111, + "learning_rate": 8.488077118214105e-06, + "loss": 0.0625, + "step": 2586 + }, + { + "epoch": 2.3625570776255707, + "grad_norm": 71.26132202148438, + "learning_rate": 8.487062404870625e-06, + "loss": 1.8956, + "step": 2587 + }, + { + "epoch": 2.363470319634703, + "grad_norm": 4.389730453491211, + "learning_rate": 8.486047691527144e-06, + "loss": 0.0377, + "step": 2588 + }, + { + "epoch": 2.3643835616438356, + "grad_norm": 18.56266975402832, + "learning_rate": 8.485032978183663e-06, + "loss": 0.1326, + "step": 2589 + }, + { + "epoch": 2.365296803652968, + "grad_norm": 16.902164459228516, + "learning_rate": 8.484018264840184e-06, + "loss": 0.1454, + "step": 2590 + }, + { + "epoch": 2.3662100456621005, + "grad_norm": 7.4359869956970215, + "learning_rate": 8.483003551496702e-06, + "loss": 0.0761, + "step": 2591 + }, + { + "epoch": 2.367123287671233, + "grad_norm": 36.49121856689453, + "learning_rate": 8.481988838153223e-06, + "loss": 0.6271, + "step": 2592 + }, + { + "epoch": 2.3680365296803654, + "grad_norm": 5.046899795532227, + "learning_rate": 8.480974124809742e-06, + "loss": 0.0423, + "step": 2593 + }, + { + "epoch": 2.368949771689498, + "grad_norm": 48.76486587524414, + "learning_rate": 8.479959411466262e-06, + "loss": 0.8713, + "step": 2594 + }, + { + "epoch": 2.3698630136986303, + "grad_norm": 16.764951705932617, + "learning_rate": 8.478944698122781e-06, + "loss": 0.2275, + "step": 2595 + }, + { + "epoch": 2.3707762557077627, + "grad_norm": 25.81731414794922, + "learning_rate": 8.4779299847793e-06, + "loss": 0.3392, + "step": 2596 + }, + { + "epoch": 2.371689497716895, + "grad_norm": 3.1424009799957275, + "learning_rate": 8.47691527143582e-06, + "loss": 0.0372, + "step": 2597 + }, + { + "epoch": 2.3726027397260276, + "grad_norm": 27.430540084838867, + "learning_rate": 8.47590055809234e-06, + "loss": 0.2349, + "step": 2598 + }, + { + "epoch": 2.3735159817351597, + "grad_norm": 2.192211389541626, + "learning_rate": 8.474885844748858e-06, + "loss": 0.0177, + "step": 2599 + }, + { + "epoch": 2.374429223744292, + "grad_norm": 38.63566207885742, + "learning_rate": 8.473871131405379e-06, + "loss": 0.4687, + "step": 2600 + }, + { + "epoch": 2.3753424657534246, + "grad_norm": 61.828224182128906, + "learning_rate": 8.472856418061898e-06, + "loss": 1.1157, + "step": 2601 + }, + { + "epoch": 2.376255707762557, + "grad_norm": 44.94215393066406, + "learning_rate": 8.471841704718418e-06, + "loss": 0.8097, + "step": 2602 + }, + { + "epoch": 2.3771689497716895, + "grad_norm": 35.639678955078125, + "learning_rate": 8.470826991374937e-06, + "loss": 0.7122, + "step": 2603 + }, + { + "epoch": 2.378082191780822, + "grad_norm": 1.0172736644744873, + "learning_rate": 8.469812278031458e-06, + "loss": 0.0111, + "step": 2604 + }, + { + "epoch": 2.3789954337899544, + "grad_norm": 28.700725555419922, + "learning_rate": 8.468797564687976e-06, + "loss": 0.443, + "step": 2605 + }, + { + "epoch": 2.379908675799087, + "grad_norm": 2.8080434799194336, + "learning_rate": 8.467782851344495e-06, + "loss": 0.0366, + "step": 2606 + }, + { + "epoch": 2.3808219178082193, + "grad_norm": 18.297073364257812, + "learning_rate": 8.466768138001016e-06, + "loss": 0.1639, + "step": 2607 + }, + { + "epoch": 2.3817351598173517, + "grad_norm": 18.1204776763916, + "learning_rate": 8.465753424657535e-06, + "loss": 0.1465, + "step": 2608 + }, + { + "epoch": 2.382648401826484, + "grad_norm": 12.552175521850586, + "learning_rate": 8.464738711314055e-06, + "loss": 0.1132, + "step": 2609 + }, + { + "epoch": 2.383561643835616, + "grad_norm": 23.569944381713867, + "learning_rate": 8.463723997970574e-06, + "loss": 0.2408, + "step": 2610 + }, + { + "epoch": 2.3844748858447486, + "grad_norm": 56.35377502441406, + "learning_rate": 8.462709284627093e-06, + "loss": 0.6837, + "step": 2611 + }, + { + "epoch": 2.385388127853881, + "grad_norm": 5.958493232727051, + "learning_rate": 8.461694571283613e-06, + "loss": 0.0481, + "step": 2612 + }, + { + "epoch": 2.3863013698630136, + "grad_norm": 13.23412036895752, + "learning_rate": 8.460679857940132e-06, + "loss": 0.2547, + "step": 2613 + }, + { + "epoch": 2.387214611872146, + "grad_norm": 57.03971481323242, + "learning_rate": 8.459665144596653e-06, + "loss": 1.8369, + "step": 2614 + }, + { + "epoch": 2.3881278538812785, + "grad_norm": 38.753623962402344, + "learning_rate": 8.458650431253172e-06, + "loss": 0.5749, + "step": 2615 + }, + { + "epoch": 2.389041095890411, + "grad_norm": 5.369328022003174, + "learning_rate": 8.45763571790969e-06, + "loss": 0.04, + "step": 2616 + }, + { + "epoch": 2.3899543378995434, + "grad_norm": 10.712569236755371, + "learning_rate": 8.456621004566211e-06, + "loss": 0.1334, + "step": 2617 + }, + { + "epoch": 2.390867579908676, + "grad_norm": 27.73740005493164, + "learning_rate": 8.45560629122273e-06, + "loss": 0.3587, + "step": 2618 + }, + { + "epoch": 2.3917808219178083, + "grad_norm": 43.29261779785156, + "learning_rate": 8.45459157787925e-06, + "loss": 0.3301, + "step": 2619 + }, + { + "epoch": 2.3926940639269407, + "grad_norm": 13.756902694702148, + "learning_rate": 8.45357686453577e-06, + "loss": 0.1482, + "step": 2620 + }, + { + "epoch": 2.393607305936073, + "grad_norm": 66.64447021484375, + "learning_rate": 8.452562151192288e-06, + "loss": 1.492, + "step": 2621 + }, + { + "epoch": 2.3945205479452056, + "grad_norm": 6.40380859375, + "learning_rate": 8.451547437848809e-06, + "loss": 0.0544, + "step": 2622 + }, + { + "epoch": 2.395433789954338, + "grad_norm": 0.5340489149093628, + "learning_rate": 8.450532724505328e-06, + "loss": 0.0054, + "step": 2623 + }, + { + "epoch": 2.3963470319634705, + "grad_norm": 13.62696647644043, + "learning_rate": 8.449518011161848e-06, + "loss": 0.1568, + "step": 2624 + }, + { + "epoch": 2.3972602739726026, + "grad_norm": 61.63732147216797, + "learning_rate": 8.448503297818367e-06, + "loss": 1.6704, + "step": 2625 + }, + { + "epoch": 2.398173515981735, + "grad_norm": 24.835927963256836, + "learning_rate": 8.447488584474887e-06, + "loss": 0.2211, + "step": 2626 + }, + { + "epoch": 2.3990867579908675, + "grad_norm": 16.208232879638672, + "learning_rate": 8.446473871131406e-06, + "loss": 0.1426, + "step": 2627 + }, + { + "epoch": 2.4, + "grad_norm": 32.6987419128418, + "learning_rate": 8.445459157787925e-06, + "loss": 0.3347, + "step": 2628 + }, + { + "epoch": 2.4009132420091324, + "grad_norm": 24.62846565246582, + "learning_rate": 8.444444444444446e-06, + "loss": 0.1982, + "step": 2629 + }, + { + "epoch": 2.401826484018265, + "grad_norm": 1.6118731498718262, + "learning_rate": 8.443429731100965e-06, + "loss": 0.0137, + "step": 2630 + }, + { + "epoch": 2.4027397260273973, + "grad_norm": 7.035207271575928, + "learning_rate": 8.442415017757483e-06, + "loss": 0.0875, + "step": 2631 + }, + { + "epoch": 2.4036529680365297, + "grad_norm": 19.472564697265625, + "learning_rate": 8.441400304414004e-06, + "loss": 0.1854, + "step": 2632 + }, + { + "epoch": 2.404566210045662, + "grad_norm": 20.37861442565918, + "learning_rate": 8.440385591070523e-06, + "loss": 0.1563, + "step": 2633 + }, + { + "epoch": 2.4054794520547946, + "grad_norm": 3.6155683994293213, + "learning_rate": 8.439370877727043e-06, + "loss": 0.0413, + "step": 2634 + }, + { + "epoch": 2.406392694063927, + "grad_norm": 2.082277297973633, + "learning_rate": 8.438356164383562e-06, + "loss": 0.0225, + "step": 2635 + }, + { + "epoch": 2.4073059360730595, + "grad_norm": 13.226804733276367, + "learning_rate": 8.437341451040083e-06, + "loss": 0.0974, + "step": 2636 + }, + { + "epoch": 2.4082191780821915, + "grad_norm": 46.35184860229492, + "learning_rate": 8.436326737696602e-06, + "loss": 0.7124, + "step": 2637 + }, + { + "epoch": 2.409132420091324, + "grad_norm": 15.939962387084961, + "learning_rate": 8.43531202435312e-06, + "loss": 0.1823, + "step": 2638 + }, + { + "epoch": 2.4100456621004565, + "grad_norm": 9.95329475402832, + "learning_rate": 8.434297311009641e-06, + "loss": 0.0901, + "step": 2639 + }, + { + "epoch": 2.410958904109589, + "grad_norm": 45.39498519897461, + "learning_rate": 8.43328259766616e-06, + "loss": 0.6564, + "step": 2640 + }, + { + "epoch": 2.4118721461187214, + "grad_norm": 11.99289608001709, + "learning_rate": 8.432267884322679e-06, + "loss": 0.1393, + "step": 2641 + }, + { + "epoch": 2.412785388127854, + "grad_norm": 27.21029281616211, + "learning_rate": 8.431253170979199e-06, + "loss": 0.2587, + "step": 2642 + }, + { + "epoch": 2.4136986301369863, + "grad_norm": 61.79631805419922, + "learning_rate": 8.430238457635718e-06, + "loss": 1.4053, + "step": 2643 + }, + { + "epoch": 2.4146118721461187, + "grad_norm": 56.75800323486328, + "learning_rate": 8.429223744292239e-06, + "loss": 1.202, + "step": 2644 + }, + { + "epoch": 2.415525114155251, + "grad_norm": 7.621906757354736, + "learning_rate": 8.428209030948757e-06, + "loss": 0.071, + "step": 2645 + }, + { + "epoch": 2.4164383561643836, + "grad_norm": 0.10764028877019882, + "learning_rate": 8.427194317605278e-06, + "loss": 0.0013, + "step": 2646 + }, + { + "epoch": 2.417351598173516, + "grad_norm": 44.684879302978516, + "learning_rate": 8.426179604261797e-06, + "loss": 0.6012, + "step": 2647 + }, + { + "epoch": 2.4182648401826485, + "grad_norm": 1.7938376665115356, + "learning_rate": 8.425164890918316e-06, + "loss": 0.0172, + "step": 2648 + }, + { + "epoch": 2.419178082191781, + "grad_norm": 4.508960723876953, + "learning_rate": 8.424150177574836e-06, + "loss": 0.0516, + "step": 2649 + }, + { + "epoch": 2.4200913242009134, + "grad_norm": 54.250545501708984, + "learning_rate": 8.423135464231355e-06, + "loss": 1.0917, + "step": 2650 + }, + { + "epoch": 2.421004566210046, + "grad_norm": 3.033536195755005, + "learning_rate": 8.422120750887874e-06, + "loss": 0.025, + "step": 2651 + }, + { + "epoch": 2.421917808219178, + "grad_norm": 1.3633770942687988, + "learning_rate": 8.421106037544394e-06, + "loss": 0.0088, + "step": 2652 + }, + { + "epoch": 2.4228310502283104, + "grad_norm": 1.2213168144226074, + "learning_rate": 8.420091324200915e-06, + "loss": 0.0157, + "step": 2653 + }, + { + "epoch": 2.423744292237443, + "grad_norm": 7.108090877532959, + "learning_rate": 8.419076610857434e-06, + "loss": 0.0436, + "step": 2654 + }, + { + "epoch": 2.4246575342465753, + "grad_norm": 6.768992900848389, + "learning_rate": 8.418061897513953e-06, + "loss": 0.0463, + "step": 2655 + }, + { + "epoch": 2.4255707762557077, + "grad_norm": 5.4637885093688965, + "learning_rate": 8.417047184170473e-06, + "loss": 0.0608, + "step": 2656 + }, + { + "epoch": 2.42648401826484, + "grad_norm": 2.0356247425079346, + "learning_rate": 8.416032470826992e-06, + "loss": 0.0219, + "step": 2657 + }, + { + "epoch": 2.4273972602739726, + "grad_norm": 6.600022792816162, + "learning_rate": 8.41501775748351e-06, + "loss": 0.0834, + "step": 2658 + }, + { + "epoch": 2.428310502283105, + "grad_norm": 74.02056884765625, + "learning_rate": 8.414003044140031e-06, + "loss": 1.6222, + "step": 2659 + }, + { + "epoch": 2.4292237442922375, + "grad_norm": 9.870484352111816, + "learning_rate": 8.41298833079655e-06, + "loss": 0.1067, + "step": 2660 + }, + { + "epoch": 2.43013698630137, + "grad_norm": 11.687897682189941, + "learning_rate": 8.411973617453069e-06, + "loss": 0.1208, + "step": 2661 + }, + { + "epoch": 2.4310502283105024, + "grad_norm": 15.59425163269043, + "learning_rate": 8.41095890410959e-06, + "loss": 0.1327, + "step": 2662 + }, + { + "epoch": 2.431963470319635, + "grad_norm": 10.092334747314453, + "learning_rate": 8.40994419076611e-06, + "loss": 0.1176, + "step": 2663 + }, + { + "epoch": 2.432876712328767, + "grad_norm": 6.752047061920166, + "learning_rate": 8.408929477422629e-06, + "loss": 0.0665, + "step": 2664 + }, + { + "epoch": 2.4337899543378994, + "grad_norm": 37.04958724975586, + "learning_rate": 8.407914764079148e-06, + "loss": 0.3794, + "step": 2665 + }, + { + "epoch": 2.434703196347032, + "grad_norm": 2.5792903900146484, + "learning_rate": 8.406900050735668e-06, + "loss": 0.0289, + "step": 2666 + }, + { + "epoch": 2.4356164383561643, + "grad_norm": 8.589284896850586, + "learning_rate": 8.405885337392187e-06, + "loss": 0.084, + "step": 2667 + }, + { + "epoch": 2.4365296803652967, + "grad_norm": 48.2645263671875, + "learning_rate": 8.404870624048706e-06, + "loss": 0.7054, + "step": 2668 + }, + { + "epoch": 2.437442922374429, + "grad_norm": 53.561771392822266, + "learning_rate": 8.403855910705227e-06, + "loss": 0.8283, + "step": 2669 + }, + { + "epoch": 2.4383561643835616, + "grad_norm": 9.828437805175781, + "learning_rate": 8.402841197361747e-06, + "loss": 0.1023, + "step": 2670 + }, + { + "epoch": 2.439269406392694, + "grad_norm": 29.345176696777344, + "learning_rate": 8.401826484018264e-06, + "loss": 0.3757, + "step": 2671 + }, + { + "epoch": 2.4401826484018265, + "grad_norm": 8.882883071899414, + "learning_rate": 8.400811770674785e-06, + "loss": 0.0733, + "step": 2672 + }, + { + "epoch": 2.441095890410959, + "grad_norm": 20.851537704467773, + "learning_rate": 8.399797057331305e-06, + "loss": 0.2084, + "step": 2673 + }, + { + "epoch": 2.4420091324200914, + "grad_norm": 29.0509033203125, + "learning_rate": 8.398782343987824e-06, + "loss": 0.2294, + "step": 2674 + }, + { + "epoch": 2.442922374429224, + "grad_norm": 4.511923313140869, + "learning_rate": 8.397767630644343e-06, + "loss": 0.0399, + "step": 2675 + }, + { + "epoch": 2.4438356164383563, + "grad_norm": 14.718377113342285, + "learning_rate": 8.396752917300864e-06, + "loss": 0.1388, + "step": 2676 + }, + { + "epoch": 2.444748858447489, + "grad_norm": 29.290224075317383, + "learning_rate": 8.395738203957382e-06, + "loss": 0.3474, + "step": 2677 + }, + { + "epoch": 2.4456621004566212, + "grad_norm": 17.324918746948242, + "learning_rate": 8.394723490613901e-06, + "loss": 0.228, + "step": 2678 + }, + { + "epoch": 2.4465753424657533, + "grad_norm": 4.285766124725342, + "learning_rate": 8.393708777270422e-06, + "loss": 0.0318, + "step": 2679 + }, + { + "epoch": 2.4474885844748857, + "grad_norm": 1.7056492567062378, + "learning_rate": 8.392694063926942e-06, + "loss": 0.0172, + "step": 2680 + }, + { + "epoch": 2.448401826484018, + "grad_norm": 5.613767623901367, + "learning_rate": 8.391679350583461e-06, + "loss": 0.0465, + "step": 2681 + }, + { + "epoch": 2.4493150684931506, + "grad_norm": 33.188541412353516, + "learning_rate": 8.39066463723998e-06, + "loss": 0.427, + "step": 2682 + }, + { + "epoch": 2.450228310502283, + "grad_norm": 76.30276489257812, + "learning_rate": 8.3896499238965e-06, + "loss": 0.9989, + "step": 2683 + }, + { + "epoch": 2.4511415525114155, + "grad_norm": 6.418438911437988, + "learning_rate": 8.38863521055302e-06, + "loss": 0.0496, + "step": 2684 + }, + { + "epoch": 2.452054794520548, + "grad_norm": 12.079424858093262, + "learning_rate": 8.387620497209538e-06, + "loss": 0.1366, + "step": 2685 + }, + { + "epoch": 2.4529680365296804, + "grad_norm": 72.07073974609375, + "learning_rate": 8.386605783866059e-06, + "loss": 2.5518, + "step": 2686 + }, + { + "epoch": 2.453881278538813, + "grad_norm": 56.30064010620117, + "learning_rate": 8.385591070522578e-06, + "loss": 0.8349, + "step": 2687 + }, + { + "epoch": 2.4547945205479453, + "grad_norm": 20.112092971801758, + "learning_rate": 8.384576357179096e-06, + "loss": 0.2356, + "step": 2688 + }, + { + "epoch": 2.455707762557078, + "grad_norm": 12.27629280090332, + "learning_rate": 8.383561643835617e-06, + "loss": 0.1859, + "step": 2689 + }, + { + "epoch": 2.45662100456621, + "grad_norm": 6.0922722816467285, + "learning_rate": 8.382546930492138e-06, + "loss": 0.0615, + "step": 2690 + }, + { + "epoch": 2.4575342465753423, + "grad_norm": 46.47526550292969, + "learning_rate": 8.381532217148656e-06, + "loss": 0.3897, + "step": 2691 + }, + { + "epoch": 2.4584474885844747, + "grad_norm": 9.209482192993164, + "learning_rate": 8.380517503805175e-06, + "loss": 0.0664, + "step": 2692 + }, + { + "epoch": 2.459360730593607, + "grad_norm": 11.075854301452637, + "learning_rate": 8.379502790461696e-06, + "loss": 0.1274, + "step": 2693 + }, + { + "epoch": 2.4602739726027396, + "grad_norm": 14.703022956848145, + "learning_rate": 8.378488077118215e-06, + "loss": 0.2107, + "step": 2694 + }, + { + "epoch": 2.461187214611872, + "grad_norm": 12.104707717895508, + "learning_rate": 8.377473363774733e-06, + "loss": 0.1063, + "step": 2695 + }, + { + "epoch": 2.4621004566210045, + "grad_norm": 31.809188842773438, + "learning_rate": 8.376458650431254e-06, + "loss": 0.4077, + "step": 2696 + }, + { + "epoch": 2.463013698630137, + "grad_norm": 9.573740005493164, + "learning_rate": 8.375443937087775e-06, + "loss": 0.0792, + "step": 2697 + }, + { + "epoch": 2.4639269406392694, + "grad_norm": 9.475218772888184, + "learning_rate": 8.374429223744293e-06, + "loss": 0.0973, + "step": 2698 + }, + { + "epoch": 2.464840182648402, + "grad_norm": 0.4318276643753052, + "learning_rate": 8.373414510400812e-06, + "loss": 0.004, + "step": 2699 + }, + { + "epoch": 2.4657534246575343, + "grad_norm": 16.085918426513672, + "learning_rate": 8.372399797057333e-06, + "loss": 0.1695, + "step": 2700 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 1.2622133493423462, + "learning_rate": 8.371385083713852e-06, + "loss": 0.0068, + "step": 2701 + }, + { + "epoch": 2.4675799086757992, + "grad_norm": 80.40673828125, + "learning_rate": 8.37037037037037e-06, + "loss": 2.064, + "step": 2702 + }, + { + "epoch": 2.4684931506849317, + "grad_norm": 75.78958129882812, + "learning_rate": 8.369355657026891e-06, + "loss": 1.2078, + "step": 2703 + }, + { + "epoch": 2.469406392694064, + "grad_norm": 29.229589462280273, + "learning_rate": 8.36834094368341e-06, + "loss": 0.2306, + "step": 2704 + }, + { + "epoch": 2.470319634703196, + "grad_norm": 0.46337375044822693, + "learning_rate": 8.367326230339929e-06, + "loss": 0.0043, + "step": 2705 + }, + { + "epoch": 2.4712328767123286, + "grad_norm": 41.77325439453125, + "learning_rate": 8.36631151699645e-06, + "loss": 0.4753, + "step": 2706 + }, + { + "epoch": 2.472146118721461, + "grad_norm": 11.46943473815918, + "learning_rate": 8.36529680365297e-06, + "loss": 0.1039, + "step": 2707 + }, + { + "epoch": 2.4730593607305935, + "grad_norm": 59.442447662353516, + "learning_rate": 8.364282090309489e-06, + "loss": 0.8298, + "step": 2708 + }, + { + "epoch": 2.473972602739726, + "grad_norm": 14.889369010925293, + "learning_rate": 8.363267376966007e-06, + "loss": 0.1796, + "step": 2709 + }, + { + "epoch": 2.4748858447488584, + "grad_norm": 1.0185943841934204, + "learning_rate": 8.362252663622528e-06, + "loss": 0.0079, + "step": 2710 + }, + { + "epoch": 2.475799086757991, + "grad_norm": 59.614933013916016, + "learning_rate": 8.361237950279047e-06, + "loss": 1.1118, + "step": 2711 + }, + { + "epoch": 2.4767123287671233, + "grad_norm": 21.11859130859375, + "learning_rate": 8.360223236935566e-06, + "loss": 0.2339, + "step": 2712 + }, + { + "epoch": 2.477625570776256, + "grad_norm": 3.268998146057129, + "learning_rate": 8.359208523592086e-06, + "loss": 0.0286, + "step": 2713 + }, + { + "epoch": 2.4785388127853882, + "grad_norm": 77.48931121826172, + "learning_rate": 8.358193810248607e-06, + "loss": 1.9156, + "step": 2714 + }, + { + "epoch": 2.4794520547945207, + "grad_norm": 13.012415885925293, + "learning_rate": 8.357179096905124e-06, + "loss": 0.121, + "step": 2715 + }, + { + "epoch": 2.480365296803653, + "grad_norm": 20.251296997070312, + "learning_rate": 8.356164383561644e-06, + "loss": 0.1224, + "step": 2716 + }, + { + "epoch": 2.481278538812785, + "grad_norm": 2.0045394897460938, + "learning_rate": 8.355149670218165e-06, + "loss": 0.0123, + "step": 2717 + }, + { + "epoch": 2.4821917808219176, + "grad_norm": 24.715843200683594, + "learning_rate": 8.354134956874684e-06, + "loss": 0.1903, + "step": 2718 + }, + { + "epoch": 2.48310502283105, + "grad_norm": 7.311784267425537, + "learning_rate": 8.353120243531203e-06, + "loss": 0.0843, + "step": 2719 + }, + { + "epoch": 2.4840182648401825, + "grad_norm": 16.103408813476562, + "learning_rate": 8.352105530187723e-06, + "loss": 0.1521, + "step": 2720 + }, + { + "epoch": 2.484931506849315, + "grad_norm": 50.80609893798828, + "learning_rate": 8.351090816844242e-06, + "loss": 0.6423, + "step": 2721 + }, + { + "epoch": 2.4858447488584474, + "grad_norm": 0.35935378074645996, + "learning_rate": 8.350076103500761e-06, + "loss": 0.0023, + "step": 2722 + }, + { + "epoch": 2.48675799086758, + "grad_norm": 29.30653953552246, + "learning_rate": 8.349061390157281e-06, + "loss": 0.4134, + "step": 2723 + }, + { + "epoch": 2.4876712328767123, + "grad_norm": 55.785804748535156, + "learning_rate": 8.348046676813802e-06, + "loss": 0.7501, + "step": 2724 + }, + { + "epoch": 2.4885844748858448, + "grad_norm": 23.66931915283203, + "learning_rate": 8.347031963470321e-06, + "loss": 0.2651, + "step": 2725 + }, + { + "epoch": 2.4894977168949772, + "grad_norm": 1.4225407838821411, + "learning_rate": 8.34601725012684e-06, + "loss": 0.0102, + "step": 2726 + }, + { + "epoch": 2.4904109589041097, + "grad_norm": 0.1034230962395668, + "learning_rate": 8.34500253678336e-06, + "loss": 0.0012, + "step": 2727 + }, + { + "epoch": 2.491324200913242, + "grad_norm": 23.933752059936523, + "learning_rate": 8.343987823439879e-06, + "loss": 0.3086, + "step": 2728 + }, + { + "epoch": 2.4922374429223746, + "grad_norm": 18.508281707763672, + "learning_rate": 8.342973110096398e-06, + "loss": 0.1103, + "step": 2729 + }, + { + "epoch": 2.493150684931507, + "grad_norm": 23.058855056762695, + "learning_rate": 8.341958396752918e-06, + "loss": 0.2246, + "step": 2730 + }, + { + "epoch": 2.4940639269406395, + "grad_norm": 93.01765441894531, + "learning_rate": 8.340943683409437e-06, + "loss": 1.7961, + "step": 2731 + }, + { + "epoch": 2.4949771689497715, + "grad_norm": 31.28460121154785, + "learning_rate": 8.339928970065956e-06, + "loss": 0.3612, + "step": 2732 + }, + { + "epoch": 2.495890410958904, + "grad_norm": 41.646148681640625, + "learning_rate": 8.338914256722477e-06, + "loss": 0.6193, + "step": 2733 + }, + { + "epoch": 2.4968036529680364, + "grad_norm": 12.266534805297852, + "learning_rate": 8.337899543378997e-06, + "loss": 0.0761, + "step": 2734 + }, + { + "epoch": 2.497716894977169, + "grad_norm": 3.68121075630188, + "learning_rate": 8.336884830035516e-06, + "loss": 0.0305, + "step": 2735 + }, + { + "epoch": 2.4986301369863013, + "grad_norm": 2.187771797180176, + "learning_rate": 8.335870116692035e-06, + "loss": 0.0168, + "step": 2736 + }, + { + "epoch": 2.4995433789954338, + "grad_norm": 73.39777374267578, + "learning_rate": 8.334855403348555e-06, + "loss": 1.6792, + "step": 2737 + }, + { + "epoch": 2.5004566210045662, + "grad_norm": 0.10653827339410782, + "learning_rate": 8.333840690005074e-06, + "loss": 0.0009, + "step": 2738 + }, + { + "epoch": 2.5013698630136987, + "grad_norm": 9.925699234008789, + "learning_rate": 8.332825976661593e-06, + "loss": 0.0826, + "step": 2739 + }, + { + "epoch": 2.502283105022831, + "grad_norm": 7.170538425445557, + "learning_rate": 8.331811263318114e-06, + "loss": 0.072, + "step": 2740 + }, + { + "epoch": 2.5031963470319636, + "grad_norm": 8.047054290771484, + "learning_rate": 8.330796549974633e-06, + "loss": 0.0789, + "step": 2741 + }, + { + "epoch": 2.504109589041096, + "grad_norm": 59.355648040771484, + "learning_rate": 8.329781836631153e-06, + "loss": 0.78, + "step": 2742 + }, + { + "epoch": 2.505022831050228, + "grad_norm": 8.21023941040039, + "learning_rate": 8.328767123287672e-06, + "loss": 0.0807, + "step": 2743 + }, + { + "epoch": 2.5059360730593605, + "grad_norm": 0.620217502117157, + "learning_rate": 8.327752409944192e-06, + "loss": 0.0063, + "step": 2744 + }, + { + "epoch": 2.506849315068493, + "grad_norm": 38.21015548706055, + "learning_rate": 8.326737696600711e-06, + "loss": 0.4065, + "step": 2745 + }, + { + "epoch": 2.5077625570776254, + "grad_norm": 67.07665252685547, + "learning_rate": 8.32572298325723e-06, + "loss": 1.748, + "step": 2746 + }, + { + "epoch": 2.508675799086758, + "grad_norm": 58.56071853637695, + "learning_rate": 8.32470826991375e-06, + "loss": 1.163, + "step": 2747 + }, + { + "epoch": 2.5095890410958903, + "grad_norm": 33.4600830078125, + "learning_rate": 8.32369355657027e-06, + "loss": 0.3921, + "step": 2748 + }, + { + "epoch": 2.5105022831050228, + "grad_norm": 8.62390422821045, + "learning_rate": 8.322678843226788e-06, + "loss": 0.0718, + "step": 2749 + }, + { + "epoch": 2.5114155251141552, + "grad_norm": 20.687862396240234, + "learning_rate": 8.321664129883309e-06, + "loss": 0.23, + "step": 2750 + }, + { + "epoch": 2.5123287671232877, + "grad_norm": 62.50138473510742, + "learning_rate": 8.320649416539828e-06, + "loss": 0.7787, + "step": 2751 + }, + { + "epoch": 2.51324200913242, + "grad_norm": 54.9876594543457, + "learning_rate": 8.319634703196348e-06, + "loss": 0.5335, + "step": 2752 + }, + { + "epoch": 2.5141552511415526, + "grad_norm": 11.534967422485352, + "learning_rate": 8.318619989852867e-06, + "loss": 0.1089, + "step": 2753 + }, + { + "epoch": 2.515068493150685, + "grad_norm": 13.687469482421875, + "learning_rate": 8.317605276509388e-06, + "loss": 0.1643, + "step": 2754 + }, + { + "epoch": 2.5159817351598175, + "grad_norm": 11.179917335510254, + "learning_rate": 8.316590563165907e-06, + "loss": 0.1444, + "step": 2755 + }, + { + "epoch": 2.51689497716895, + "grad_norm": 13.334508895874023, + "learning_rate": 8.315575849822425e-06, + "loss": 0.1478, + "step": 2756 + }, + { + "epoch": 2.5178082191780824, + "grad_norm": 22.4639835357666, + "learning_rate": 8.314561136478946e-06, + "loss": 0.2329, + "step": 2757 + }, + { + "epoch": 2.518721461187215, + "grad_norm": 77.99217224121094, + "learning_rate": 8.313546423135465e-06, + "loss": 1.9181, + "step": 2758 + }, + { + "epoch": 2.5196347031963473, + "grad_norm": 10.758180618286133, + "learning_rate": 8.312531709791984e-06, + "loss": 0.0707, + "step": 2759 + }, + { + "epoch": 2.5205479452054793, + "grad_norm": 62.135616302490234, + "learning_rate": 8.311516996448504e-06, + "loss": 0.9215, + "step": 2760 + }, + { + "epoch": 2.5214611872146118, + "grad_norm": 15.853436470031738, + "learning_rate": 8.310502283105023e-06, + "loss": 0.1268, + "step": 2761 + }, + { + "epoch": 2.522374429223744, + "grad_norm": 20.395179748535156, + "learning_rate": 8.309487569761544e-06, + "loss": 0.1488, + "step": 2762 + }, + { + "epoch": 2.5232876712328767, + "grad_norm": 56.18986511230469, + "learning_rate": 8.308472856418062e-06, + "loss": 0.6758, + "step": 2763 + }, + { + "epoch": 2.524200913242009, + "grad_norm": 38.42169952392578, + "learning_rate": 8.307458143074583e-06, + "loss": 0.4002, + "step": 2764 + }, + { + "epoch": 2.5251141552511416, + "grad_norm": 96.90222930908203, + "learning_rate": 8.306443429731102e-06, + "loss": 3.2418, + "step": 2765 + }, + { + "epoch": 2.526027397260274, + "grad_norm": 11.168106079101562, + "learning_rate": 8.30542871638762e-06, + "loss": 0.1298, + "step": 2766 + }, + { + "epoch": 2.5269406392694065, + "grad_norm": 52.7137451171875, + "learning_rate": 8.304414003044141e-06, + "loss": 0.5587, + "step": 2767 + }, + { + "epoch": 2.527853881278539, + "grad_norm": 34.87836456298828, + "learning_rate": 8.30339928970066e-06, + "loss": 0.6347, + "step": 2768 + }, + { + "epoch": 2.5287671232876714, + "grad_norm": 59.58406066894531, + "learning_rate": 8.30238457635718e-06, + "loss": 0.859, + "step": 2769 + }, + { + "epoch": 2.5296803652968034, + "grad_norm": 24.404123306274414, + "learning_rate": 8.3013698630137e-06, + "loss": 0.5437, + "step": 2770 + }, + { + "epoch": 2.530593607305936, + "grad_norm": 72.8993148803711, + "learning_rate": 8.300355149670218e-06, + "loss": 3.0296, + "step": 2771 + }, + { + "epoch": 2.5315068493150683, + "grad_norm": 7.019927024841309, + "learning_rate": 8.299340436326739e-06, + "loss": 0.1022, + "step": 2772 + }, + { + "epoch": 2.5324200913242008, + "grad_norm": 6.204838752746582, + "learning_rate": 8.298325722983258e-06, + "loss": 0.0583, + "step": 2773 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 4.378254413604736, + "learning_rate": 8.297311009639778e-06, + "loss": 0.0283, + "step": 2774 + }, + { + "epoch": 2.5342465753424657, + "grad_norm": 19.76360511779785, + "learning_rate": 8.296296296296297e-06, + "loss": 0.2253, + "step": 2775 + }, + { + "epoch": 2.535159817351598, + "grad_norm": 23.70954704284668, + "learning_rate": 8.295281582952816e-06, + "loss": 0.3057, + "step": 2776 + }, + { + "epoch": 2.5360730593607306, + "grad_norm": 5.8491291999816895, + "learning_rate": 8.294266869609336e-06, + "loss": 0.0536, + "step": 2777 + }, + { + "epoch": 2.536986301369863, + "grad_norm": 10.406301498413086, + "learning_rate": 8.293252156265855e-06, + "loss": 0.0937, + "step": 2778 + }, + { + "epoch": 2.5378995433789955, + "grad_norm": 29.73343276977539, + "learning_rate": 8.292237442922376e-06, + "loss": 0.3028, + "step": 2779 + }, + { + "epoch": 2.538812785388128, + "grad_norm": 6.507935047149658, + "learning_rate": 8.291222729578895e-06, + "loss": 0.0489, + "step": 2780 + }, + { + "epoch": 2.5397260273972604, + "grad_norm": 19.46573257446289, + "learning_rate": 8.290208016235413e-06, + "loss": 0.1584, + "step": 2781 + }, + { + "epoch": 2.540639269406393, + "grad_norm": 89.6505126953125, + "learning_rate": 8.289193302891934e-06, + "loss": 1.4467, + "step": 2782 + }, + { + "epoch": 2.5415525114155253, + "grad_norm": 21.781070709228516, + "learning_rate": 8.288178589548453e-06, + "loss": 0.2426, + "step": 2783 + }, + { + "epoch": 2.5424657534246577, + "grad_norm": 61.01799011230469, + "learning_rate": 8.287163876204973e-06, + "loss": 1.2703, + "step": 2784 + }, + { + "epoch": 2.54337899543379, + "grad_norm": 1.8585435152053833, + "learning_rate": 8.286149162861492e-06, + "loss": 0.0199, + "step": 2785 + }, + { + "epoch": 2.544292237442922, + "grad_norm": 0.6630399823188782, + "learning_rate": 8.285134449518013e-06, + "loss": 0.0046, + "step": 2786 + }, + { + "epoch": 2.5452054794520547, + "grad_norm": 2.8765268325805664, + "learning_rate": 8.284119736174532e-06, + "loss": 0.0213, + "step": 2787 + }, + { + "epoch": 2.546118721461187, + "grad_norm": 8.302046775817871, + "learning_rate": 8.28310502283105e-06, + "loss": 0.0725, + "step": 2788 + }, + { + "epoch": 2.5470319634703196, + "grad_norm": 41.93244934082031, + "learning_rate": 8.282090309487571e-06, + "loss": 0.3963, + "step": 2789 + }, + { + "epoch": 2.547945205479452, + "grad_norm": 8.02572250366211, + "learning_rate": 8.28107559614409e-06, + "loss": 0.0674, + "step": 2790 + }, + { + "epoch": 2.5488584474885845, + "grad_norm": 11.768741607666016, + "learning_rate": 8.280060882800609e-06, + "loss": 0.073, + "step": 2791 + }, + { + "epoch": 2.549771689497717, + "grad_norm": 47.31782913208008, + "learning_rate": 8.27904616945713e-06, + "loss": 0.7792, + "step": 2792 + }, + { + "epoch": 2.5506849315068494, + "grad_norm": 8.346504211425781, + "learning_rate": 8.278031456113648e-06, + "loss": 0.0664, + "step": 2793 + }, + { + "epoch": 2.551598173515982, + "grad_norm": 1.4273167848587036, + "learning_rate": 8.277016742770169e-06, + "loss": 0.0119, + "step": 2794 + }, + { + "epoch": 2.5525114155251143, + "grad_norm": 1.9210302829742432, + "learning_rate": 8.276002029426687e-06, + "loss": 0.0128, + "step": 2795 + }, + { + "epoch": 2.5534246575342463, + "grad_norm": 65.39762878417969, + "learning_rate": 8.274987316083208e-06, + "loss": 1.2465, + "step": 2796 + }, + { + "epoch": 2.5543378995433788, + "grad_norm": 18.987443923950195, + "learning_rate": 8.273972602739727e-06, + "loss": 0.2024, + "step": 2797 + }, + { + "epoch": 2.555251141552511, + "grad_norm": 53.15958023071289, + "learning_rate": 8.272957889396246e-06, + "loss": 1.0993, + "step": 2798 + }, + { + "epoch": 2.5561643835616437, + "grad_norm": 1.0118749141693115, + "learning_rate": 8.271943176052766e-06, + "loss": 0.0082, + "step": 2799 + }, + { + "epoch": 2.557077625570776, + "grad_norm": 56.69356155395508, + "learning_rate": 8.270928462709285e-06, + "loss": 0.4744, + "step": 2800 + }, + { + "epoch": 2.5579908675799086, + "grad_norm": 5.466797351837158, + "learning_rate": 8.269913749365804e-06, + "loss": 0.0341, + "step": 2801 + }, + { + "epoch": 2.558904109589041, + "grad_norm": 56.51985549926758, + "learning_rate": 8.268899036022324e-06, + "loss": 0.938, + "step": 2802 + }, + { + "epoch": 2.5598173515981735, + "grad_norm": 3.997755765914917, + "learning_rate": 8.267884322678843e-06, + "loss": 0.0359, + "step": 2803 + }, + { + "epoch": 2.560730593607306, + "grad_norm": 74.29866027832031, + "learning_rate": 8.266869609335364e-06, + "loss": 1.5884, + "step": 2804 + }, + { + "epoch": 2.5616438356164384, + "grad_norm": 44.02268600463867, + "learning_rate": 8.265854895991883e-06, + "loss": 1.1233, + "step": 2805 + }, + { + "epoch": 2.562557077625571, + "grad_norm": 5.894692897796631, + "learning_rate": 8.264840182648403e-06, + "loss": 0.0664, + "step": 2806 + }, + { + "epoch": 2.5634703196347033, + "grad_norm": 24.51699447631836, + "learning_rate": 8.263825469304922e-06, + "loss": 0.2313, + "step": 2807 + }, + { + "epoch": 2.5643835616438357, + "grad_norm": 67.05191802978516, + "learning_rate": 8.262810755961441e-06, + "loss": 1.2275, + "step": 2808 + }, + { + "epoch": 2.565296803652968, + "grad_norm": 28.235090255737305, + "learning_rate": 8.261796042617961e-06, + "loss": 0.2291, + "step": 2809 + }, + { + "epoch": 2.5662100456621006, + "grad_norm": 4.24524450302124, + "learning_rate": 8.26078132927448e-06, + "loss": 0.0453, + "step": 2810 + }, + { + "epoch": 2.567123287671233, + "grad_norm": 17.42947006225586, + "learning_rate": 8.259766615930999e-06, + "loss": 0.1627, + "step": 2811 + }, + { + "epoch": 2.5680365296803656, + "grad_norm": 24.65912437438965, + "learning_rate": 8.25875190258752e-06, + "loss": 0.3106, + "step": 2812 + }, + { + "epoch": 2.5689497716894976, + "grad_norm": 63.009918212890625, + "learning_rate": 8.25773718924404e-06, + "loss": 1.1897, + "step": 2813 + }, + { + "epoch": 2.56986301369863, + "grad_norm": 82.48648071289062, + "learning_rate": 8.256722475900559e-06, + "loss": 3.2574, + "step": 2814 + }, + { + "epoch": 2.5707762557077625, + "grad_norm": 102.63710021972656, + "learning_rate": 8.255707762557078e-06, + "loss": 3.3847, + "step": 2815 + }, + { + "epoch": 2.571689497716895, + "grad_norm": 37.830562591552734, + "learning_rate": 8.254693049213598e-06, + "loss": 0.1464, + "step": 2816 + }, + { + "epoch": 2.5726027397260274, + "grad_norm": 18.08279037475586, + "learning_rate": 8.253678335870117e-06, + "loss": 0.2104, + "step": 2817 + }, + { + "epoch": 2.57351598173516, + "grad_norm": 17.800739288330078, + "learning_rate": 8.252663622526636e-06, + "loss": 0.1885, + "step": 2818 + }, + { + "epoch": 2.5744292237442923, + "grad_norm": 41.7382926940918, + "learning_rate": 8.251648909183157e-06, + "loss": 0.4553, + "step": 2819 + }, + { + "epoch": 2.5753424657534247, + "grad_norm": 47.331687927246094, + "learning_rate": 8.250634195839676e-06, + "loss": 0.4322, + "step": 2820 + }, + { + "epoch": 2.576255707762557, + "grad_norm": 61.48670196533203, + "learning_rate": 8.249619482496194e-06, + "loss": 0.7212, + "step": 2821 + }, + { + "epoch": 2.5771689497716896, + "grad_norm": 4.450453758239746, + "learning_rate": 8.248604769152715e-06, + "loss": 0.0327, + "step": 2822 + }, + { + "epoch": 2.5780821917808217, + "grad_norm": 4.936711311340332, + "learning_rate": 8.247590055809235e-06, + "loss": 0.0649, + "step": 2823 + }, + { + "epoch": 2.578995433789954, + "grad_norm": 23.266584396362305, + "learning_rate": 8.246575342465754e-06, + "loss": 0.3151, + "step": 2824 + }, + { + "epoch": 2.5799086757990866, + "grad_norm": 55.16679382324219, + "learning_rate": 8.245560629122273e-06, + "loss": 1.0252, + "step": 2825 + }, + { + "epoch": 2.580821917808219, + "grad_norm": 2.744443416595459, + "learning_rate": 8.244545915778794e-06, + "loss": 0.0345, + "step": 2826 + }, + { + "epoch": 2.5817351598173515, + "grad_norm": 68.33212280273438, + "learning_rate": 8.243531202435313e-06, + "loss": 1.4139, + "step": 2827 + }, + { + "epoch": 2.582648401826484, + "grad_norm": 2.157315254211426, + "learning_rate": 8.242516489091831e-06, + "loss": 0.0272, + "step": 2828 + }, + { + "epoch": 2.5835616438356164, + "grad_norm": 95.36726379394531, + "learning_rate": 8.241501775748352e-06, + "loss": 2.4004, + "step": 2829 + }, + { + "epoch": 2.584474885844749, + "grad_norm": 15.945535659790039, + "learning_rate": 8.240487062404872e-06, + "loss": 0.2193, + "step": 2830 + }, + { + "epoch": 2.5853881278538813, + "grad_norm": 4.24946928024292, + "learning_rate": 8.23947234906139e-06, + "loss": 0.0391, + "step": 2831 + }, + { + "epoch": 2.5863013698630137, + "grad_norm": 12.57540225982666, + "learning_rate": 8.23845763571791e-06, + "loss": 0.1278, + "step": 2832 + }, + { + "epoch": 2.587214611872146, + "grad_norm": 6.705111026763916, + "learning_rate": 8.23744292237443e-06, + "loss": 0.0572, + "step": 2833 + }, + { + "epoch": 2.5881278538812786, + "grad_norm": 9.917808532714844, + "learning_rate": 8.23642820903095e-06, + "loss": 0.0999, + "step": 2834 + }, + { + "epoch": 2.589041095890411, + "grad_norm": 45.629852294921875, + "learning_rate": 8.235413495687468e-06, + "loss": 0.6841, + "step": 2835 + }, + { + "epoch": 2.5899543378995435, + "grad_norm": 65.3563461303711, + "learning_rate": 8.234398782343989e-06, + "loss": 3.002, + "step": 2836 + }, + { + "epoch": 2.590867579908676, + "grad_norm": 25.260419845581055, + "learning_rate": 8.233384069000508e-06, + "loss": 0.3529, + "step": 2837 + }, + { + "epoch": 2.5917808219178085, + "grad_norm": 6.146728515625, + "learning_rate": 8.232369355657027e-06, + "loss": 0.0532, + "step": 2838 + }, + { + "epoch": 2.592694063926941, + "grad_norm": 32.807838439941406, + "learning_rate": 8.231354642313547e-06, + "loss": 0.4769, + "step": 2839 + }, + { + "epoch": 2.593607305936073, + "grad_norm": 38.914920806884766, + "learning_rate": 8.230339928970068e-06, + "loss": 0.7339, + "step": 2840 + }, + { + "epoch": 2.5945205479452054, + "grad_norm": 17.93787956237793, + "learning_rate": 8.229325215626586e-06, + "loss": 0.169, + "step": 2841 + }, + { + "epoch": 2.595433789954338, + "grad_norm": 66.27507781982422, + "learning_rate": 8.228310502283105e-06, + "loss": 1.5637, + "step": 2842 + }, + { + "epoch": 2.5963470319634703, + "grad_norm": 16.01495361328125, + "learning_rate": 8.227295788939626e-06, + "loss": 0.2061, + "step": 2843 + }, + { + "epoch": 2.5972602739726027, + "grad_norm": 75.90478515625, + "learning_rate": 8.226281075596145e-06, + "loss": 0.7511, + "step": 2844 + }, + { + "epoch": 2.598173515981735, + "grad_norm": 49.67019271850586, + "learning_rate": 8.225266362252664e-06, + "loss": 0.4761, + "step": 2845 + }, + { + "epoch": 2.5990867579908676, + "grad_norm": 14.212100982666016, + "learning_rate": 8.224251648909184e-06, + "loss": 0.2229, + "step": 2846 + }, + { + "epoch": 2.6, + "grad_norm": 13.04800796508789, + "learning_rate": 8.223236935565703e-06, + "loss": 0.1472, + "step": 2847 + }, + { + "epoch": 2.6009132420091325, + "grad_norm": 0.9872356653213501, + "learning_rate": 8.222222222222222e-06, + "loss": 0.0101, + "step": 2848 + }, + { + "epoch": 2.601826484018265, + "grad_norm": 42.962615966796875, + "learning_rate": 8.221207508878742e-06, + "loss": 0.5364, + "step": 2849 + }, + { + "epoch": 2.602739726027397, + "grad_norm": 8.096575736999512, + "learning_rate": 8.220192795535263e-06, + "loss": 0.0972, + "step": 2850 + }, + { + "epoch": 2.6036529680365295, + "grad_norm": 58.8614616394043, + "learning_rate": 8.219178082191782e-06, + "loss": 1.1101, + "step": 2851 + }, + { + "epoch": 2.604566210045662, + "grad_norm": 85.11378479003906, + "learning_rate": 8.2181633688483e-06, + "loss": 1.8561, + "step": 2852 + }, + { + "epoch": 2.6054794520547944, + "grad_norm": 4.514827251434326, + "learning_rate": 8.217148655504821e-06, + "loss": 0.0568, + "step": 2853 + }, + { + "epoch": 2.606392694063927, + "grad_norm": 18.182069778442383, + "learning_rate": 8.21613394216134e-06, + "loss": 0.2388, + "step": 2854 + }, + { + "epoch": 2.6073059360730593, + "grad_norm": 36.600791931152344, + "learning_rate": 8.215119228817859e-06, + "loss": 0.4983, + "step": 2855 + }, + { + "epoch": 2.6082191780821917, + "grad_norm": 7.503384113311768, + "learning_rate": 8.21410451547438e-06, + "loss": 0.0958, + "step": 2856 + }, + { + "epoch": 2.609132420091324, + "grad_norm": 62.89133071899414, + "learning_rate": 8.2130898021309e-06, + "loss": 0.907, + "step": 2857 + }, + { + "epoch": 2.6100456621004566, + "grad_norm": 19.595354080200195, + "learning_rate": 8.212075088787419e-06, + "loss": 0.2456, + "step": 2858 + }, + { + "epoch": 2.610958904109589, + "grad_norm": 68.60964965820312, + "learning_rate": 8.211060375443938e-06, + "loss": 3.285, + "step": 2859 + }, + { + "epoch": 2.6118721461187215, + "grad_norm": 55.22373580932617, + "learning_rate": 8.210045662100458e-06, + "loss": 0.5625, + "step": 2860 + }, + { + "epoch": 2.612785388127854, + "grad_norm": 15.362253189086914, + "learning_rate": 8.209030948756977e-06, + "loss": 0.1994, + "step": 2861 + }, + { + "epoch": 2.6136986301369864, + "grad_norm": 17.142133712768555, + "learning_rate": 8.208016235413496e-06, + "loss": 0.1937, + "step": 2862 + }, + { + "epoch": 2.614611872146119, + "grad_norm": 22.338891983032227, + "learning_rate": 8.207001522070016e-06, + "loss": 0.317, + "step": 2863 + }, + { + "epoch": 2.6155251141552514, + "grad_norm": 11.859549522399902, + "learning_rate": 8.205986808726535e-06, + "loss": 0.1161, + "step": 2864 + }, + { + "epoch": 2.616438356164384, + "grad_norm": 36.30438232421875, + "learning_rate": 8.204972095383054e-06, + "loss": 0.6452, + "step": 2865 + }, + { + "epoch": 2.6173515981735163, + "grad_norm": 60.57075500488281, + "learning_rate": 8.203957382039575e-06, + "loss": 0.9706, + "step": 2866 + }, + { + "epoch": 2.6182648401826483, + "grad_norm": 5.628260612487793, + "learning_rate": 8.202942668696095e-06, + "loss": 0.0624, + "step": 2867 + }, + { + "epoch": 2.6191780821917807, + "grad_norm": 41.340633392333984, + "learning_rate": 8.201927955352614e-06, + "loss": 1.0611, + "step": 2868 + }, + { + "epoch": 2.620091324200913, + "grad_norm": 40.182376861572266, + "learning_rate": 8.200913242009133e-06, + "loss": 0.5876, + "step": 2869 + }, + { + "epoch": 2.6210045662100456, + "grad_norm": 73.98869323730469, + "learning_rate": 8.199898528665653e-06, + "loss": 2.2058, + "step": 2870 + }, + { + "epoch": 2.621917808219178, + "grad_norm": 23.342012405395508, + "learning_rate": 8.198883815322172e-06, + "loss": 0.3856, + "step": 2871 + }, + { + "epoch": 2.6228310502283105, + "grad_norm": 28.1170654296875, + "learning_rate": 8.197869101978691e-06, + "loss": 0.3086, + "step": 2872 + }, + { + "epoch": 2.623744292237443, + "grad_norm": 20.881399154663086, + "learning_rate": 8.196854388635212e-06, + "loss": 0.2787, + "step": 2873 + }, + { + "epoch": 2.6246575342465754, + "grad_norm": 21.525951385498047, + "learning_rate": 8.195839675291732e-06, + "loss": 0.2924, + "step": 2874 + }, + { + "epoch": 2.625570776255708, + "grad_norm": 54.85594940185547, + "learning_rate": 8.19482496194825e-06, + "loss": 1.1928, + "step": 2875 + }, + { + "epoch": 2.6264840182648403, + "grad_norm": 9.421939849853516, + "learning_rate": 8.19381024860477e-06, + "loss": 0.1048, + "step": 2876 + }, + { + "epoch": 2.6273972602739724, + "grad_norm": 2.8741822242736816, + "learning_rate": 8.19279553526129e-06, + "loss": 0.0337, + "step": 2877 + }, + { + "epoch": 2.628310502283105, + "grad_norm": 3.9550845623016357, + "learning_rate": 8.19178082191781e-06, + "loss": 0.0419, + "step": 2878 + }, + { + "epoch": 2.6292237442922373, + "grad_norm": 5.44850492477417, + "learning_rate": 8.190766108574328e-06, + "loss": 0.0407, + "step": 2879 + }, + { + "epoch": 2.6301369863013697, + "grad_norm": 52.0770263671875, + "learning_rate": 8.189751395230849e-06, + "loss": 1.0201, + "step": 2880 + }, + { + "epoch": 2.631050228310502, + "grad_norm": 16.625043869018555, + "learning_rate": 8.188736681887367e-06, + "loss": 0.2799, + "step": 2881 + }, + { + "epoch": 2.6319634703196346, + "grad_norm": 51.39398193359375, + "learning_rate": 8.187721968543886e-06, + "loss": 0.9166, + "step": 2882 + }, + { + "epoch": 2.632876712328767, + "grad_norm": 41.31390380859375, + "learning_rate": 8.186707255200407e-06, + "loss": 0.6038, + "step": 2883 + }, + { + "epoch": 2.6337899543378995, + "grad_norm": 8.023261070251465, + "learning_rate": 8.185692541856927e-06, + "loss": 0.1063, + "step": 2884 + }, + { + "epoch": 2.634703196347032, + "grad_norm": 25.082393646240234, + "learning_rate": 8.184677828513446e-06, + "loss": 0.2652, + "step": 2885 + }, + { + "epoch": 2.6356164383561644, + "grad_norm": 8.001657485961914, + "learning_rate": 8.183663115169965e-06, + "loss": 0.0941, + "step": 2886 + }, + { + "epoch": 2.636529680365297, + "grad_norm": 38.287166595458984, + "learning_rate": 8.182648401826486e-06, + "loss": 0.9338, + "step": 2887 + }, + { + "epoch": 2.6374429223744293, + "grad_norm": 9.263327598571777, + "learning_rate": 8.181633688483004e-06, + "loss": 0.1417, + "step": 2888 + }, + { + "epoch": 2.638356164383562, + "grad_norm": 4.206135272979736, + "learning_rate": 8.180618975139523e-06, + "loss": 0.0432, + "step": 2889 + }, + { + "epoch": 2.6392694063926943, + "grad_norm": 15.628210067749023, + "learning_rate": 8.179604261796044e-06, + "loss": 0.2282, + "step": 2890 + }, + { + "epoch": 2.6401826484018267, + "grad_norm": 47.42488479614258, + "learning_rate": 8.178589548452563e-06, + "loss": 1.1085, + "step": 2891 + }, + { + "epoch": 2.641095890410959, + "grad_norm": 27.771259307861328, + "learning_rate": 8.177574835109081e-06, + "loss": 0.4215, + "step": 2892 + }, + { + "epoch": 2.642009132420091, + "grad_norm": 0.8659820556640625, + "learning_rate": 8.176560121765602e-06, + "loss": 0.0071, + "step": 2893 + }, + { + "epoch": 2.6429223744292236, + "grad_norm": 49.7021369934082, + "learning_rate": 8.175545408422123e-06, + "loss": 0.6707, + "step": 2894 + }, + { + "epoch": 2.643835616438356, + "grad_norm": 14.629914283752441, + "learning_rate": 8.174530695078641e-06, + "loss": 0.1473, + "step": 2895 + }, + { + "epoch": 2.6447488584474885, + "grad_norm": 52.19961166381836, + "learning_rate": 8.17351598173516e-06, + "loss": 1.1486, + "step": 2896 + }, + { + "epoch": 2.645662100456621, + "grad_norm": 3.3537521362304688, + "learning_rate": 8.17250126839168e-06, + "loss": 0.0459, + "step": 2897 + }, + { + "epoch": 2.6465753424657534, + "grad_norm": 10.264307975769043, + "learning_rate": 8.1714865550482e-06, + "loss": 0.1296, + "step": 2898 + }, + { + "epoch": 2.647488584474886, + "grad_norm": 12.675682067871094, + "learning_rate": 8.170471841704718e-06, + "loss": 0.152, + "step": 2899 + }, + { + "epoch": 2.6484018264840183, + "grad_norm": 10.470269203186035, + "learning_rate": 8.169457128361239e-06, + "loss": 0.1152, + "step": 2900 + }, + { + "epoch": 2.649315068493151, + "grad_norm": 42.5329704284668, + "learning_rate": 8.168442415017758e-06, + "loss": 0.5415, + "step": 2901 + }, + { + "epoch": 2.6502283105022832, + "grad_norm": 28.97399139404297, + "learning_rate": 8.167427701674278e-06, + "loss": 0.3827, + "step": 2902 + }, + { + "epoch": 2.6511415525114153, + "grad_norm": 57.499671936035156, + "learning_rate": 8.166412988330797e-06, + "loss": 2.9544, + "step": 2903 + }, + { + "epoch": 2.6520547945205477, + "grad_norm": 9.530143737792969, + "learning_rate": 8.165398274987318e-06, + "loss": 0.0951, + "step": 2904 + }, + { + "epoch": 2.65296803652968, + "grad_norm": 45.93037796020508, + "learning_rate": 8.164383561643837e-06, + "loss": 1.3609, + "step": 2905 + }, + { + "epoch": 2.6538812785388126, + "grad_norm": 52.25910949707031, + "learning_rate": 8.163368848300355e-06, + "loss": 1.8342, + "step": 2906 + }, + { + "epoch": 2.654794520547945, + "grad_norm": 25.069894790649414, + "learning_rate": 8.162354134956876e-06, + "loss": 0.3693, + "step": 2907 + }, + { + "epoch": 2.6557077625570775, + "grad_norm": 18.698040008544922, + "learning_rate": 8.161339421613395e-06, + "loss": 0.4107, + "step": 2908 + }, + { + "epoch": 2.65662100456621, + "grad_norm": 3.100306510925293, + "learning_rate": 8.160324708269914e-06, + "loss": 0.0299, + "step": 2909 + }, + { + "epoch": 2.6575342465753424, + "grad_norm": 78.31378173828125, + "learning_rate": 8.159309994926434e-06, + "loss": 2.054, + "step": 2910 + }, + { + "epoch": 2.658447488584475, + "grad_norm": 51.62287139892578, + "learning_rate": 8.158295281582953e-06, + "loss": 0.9074, + "step": 2911 + }, + { + "epoch": 2.6593607305936073, + "grad_norm": 201.46945190429688, + "learning_rate": 8.157280568239474e-06, + "loss": 1.4519, + "step": 2912 + }, + { + "epoch": 2.66027397260274, + "grad_norm": 0.6692739725112915, + "learning_rate": 8.156265854895992e-06, + "loss": 0.0083, + "step": 2913 + }, + { + "epoch": 2.6611872146118722, + "grad_norm": 15.828566551208496, + "learning_rate": 8.155251141552513e-06, + "loss": 0.2095, + "step": 2914 + }, + { + "epoch": 2.6621004566210047, + "grad_norm": 52.45486068725586, + "learning_rate": 8.154236428209032e-06, + "loss": 1.2072, + "step": 2915 + }, + { + "epoch": 2.663013698630137, + "grad_norm": 1.6354520320892334, + "learning_rate": 8.15322171486555e-06, + "loss": 0.019, + "step": 2916 + }, + { + "epoch": 2.6639269406392696, + "grad_norm": 21.961181640625, + "learning_rate": 8.152207001522071e-06, + "loss": 0.4466, + "step": 2917 + }, + { + "epoch": 2.664840182648402, + "grad_norm": 88.7099609375, + "learning_rate": 8.15119228817859e-06, + "loss": 4.1494, + "step": 2918 + }, + { + "epoch": 2.6657534246575345, + "grad_norm": 22.65300750732422, + "learning_rate": 8.150177574835109e-06, + "loss": 0.4345, + "step": 2919 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 7.227492332458496, + "learning_rate": 8.14916286149163e-06, + "loss": 0.0684, + "step": 2920 + }, + { + "epoch": 2.667579908675799, + "grad_norm": 7.695145606994629, + "learning_rate": 8.148148148148148e-06, + "loss": 0.0783, + "step": 2921 + }, + { + "epoch": 2.6684931506849314, + "grad_norm": 15.250795364379883, + "learning_rate": 8.147133434804669e-06, + "loss": 0.234, + "step": 2922 + }, + { + "epoch": 2.669406392694064, + "grad_norm": 23.701704025268555, + "learning_rate": 8.146118721461188e-06, + "loss": 0.3791, + "step": 2923 + }, + { + "epoch": 2.6703196347031963, + "grad_norm": 59.55739212036133, + "learning_rate": 8.145104008117708e-06, + "loss": 3.8779, + "step": 2924 + }, + { + "epoch": 2.671232876712329, + "grad_norm": 4.629158020019531, + "learning_rate": 8.144089294774227e-06, + "loss": 0.0634, + "step": 2925 + }, + { + "epoch": 2.6721461187214612, + "grad_norm": 23.956632614135742, + "learning_rate": 8.143074581430746e-06, + "loss": 0.3614, + "step": 2926 + }, + { + "epoch": 2.6730593607305937, + "grad_norm": 6.706465244293213, + "learning_rate": 8.142059868087266e-06, + "loss": 0.0596, + "step": 2927 + }, + { + "epoch": 2.673972602739726, + "grad_norm": 22.08623695373535, + "learning_rate": 8.141045154743785e-06, + "loss": 0.2463, + "step": 2928 + }, + { + "epoch": 2.6748858447488586, + "grad_norm": 41.438392639160156, + "learning_rate": 8.140030441400306e-06, + "loss": 0.816, + "step": 2929 + }, + { + "epoch": 2.6757990867579906, + "grad_norm": 11.215436935424805, + "learning_rate": 8.139015728056825e-06, + "loss": 0.1278, + "step": 2930 + }, + { + "epoch": 2.676712328767123, + "grad_norm": 26.934465408325195, + "learning_rate": 8.138001014713344e-06, + "loss": 0.3552, + "step": 2931 + }, + { + "epoch": 2.6776255707762555, + "grad_norm": 8.381523132324219, + "learning_rate": 8.136986301369864e-06, + "loss": 0.1063, + "step": 2932 + }, + { + "epoch": 2.678538812785388, + "grad_norm": 7.209593296051025, + "learning_rate": 8.135971588026383e-06, + "loss": 0.0735, + "step": 2933 + }, + { + "epoch": 2.6794520547945204, + "grad_norm": 27.570337295532227, + "learning_rate": 8.134956874682903e-06, + "loss": 0.5419, + "step": 2934 + }, + { + "epoch": 2.680365296803653, + "grad_norm": 14.208805084228516, + "learning_rate": 8.133942161339422e-06, + "loss": 0.1521, + "step": 2935 + }, + { + "epoch": 2.6812785388127853, + "grad_norm": 16.124542236328125, + "learning_rate": 8.132927447995941e-06, + "loss": 0.2238, + "step": 2936 + }, + { + "epoch": 2.682191780821918, + "grad_norm": 23.347461700439453, + "learning_rate": 8.131912734652462e-06, + "loss": 0.4085, + "step": 2937 + }, + { + "epoch": 2.6831050228310502, + "grad_norm": 13.98337173461914, + "learning_rate": 8.13089802130898e-06, + "loss": 0.1238, + "step": 2938 + }, + { + "epoch": 2.6840182648401827, + "grad_norm": 24.11298942565918, + "learning_rate": 8.129883307965501e-06, + "loss": 0.3877, + "step": 2939 + }, + { + "epoch": 2.684931506849315, + "grad_norm": 19.159873962402344, + "learning_rate": 8.12886859462202e-06, + "loss": 0.2635, + "step": 2940 + }, + { + "epoch": 2.6858447488584476, + "grad_norm": 0.4685268700122833, + "learning_rate": 8.127853881278539e-06, + "loss": 0.0032, + "step": 2941 + }, + { + "epoch": 2.68675799086758, + "grad_norm": 37.736419677734375, + "learning_rate": 8.12683916793506e-06, + "loss": 0.6834, + "step": 2942 + }, + { + "epoch": 2.6876712328767125, + "grad_norm": 12.275391578674316, + "learning_rate": 8.125824454591578e-06, + "loss": 0.0878, + "step": 2943 + }, + { + "epoch": 2.688584474885845, + "grad_norm": 21.986501693725586, + "learning_rate": 8.124809741248099e-06, + "loss": 0.3587, + "step": 2944 + }, + { + "epoch": 2.6894977168949774, + "grad_norm": 43.480003356933594, + "learning_rate": 8.123795027904618e-06, + "loss": 0.9057, + "step": 2945 + }, + { + "epoch": 2.69041095890411, + "grad_norm": 6.284364223480225, + "learning_rate": 8.122780314561138e-06, + "loss": 0.0675, + "step": 2946 + }, + { + "epoch": 2.691324200913242, + "grad_norm": 8.983999252319336, + "learning_rate": 8.121765601217657e-06, + "loss": 0.0875, + "step": 2947 + }, + { + "epoch": 2.6922374429223743, + "grad_norm": 7.600605010986328, + "learning_rate": 8.120750887874176e-06, + "loss": 0.073, + "step": 2948 + }, + { + "epoch": 2.6931506849315068, + "grad_norm": 0.7426122426986694, + "learning_rate": 8.119736174530696e-06, + "loss": 0.0053, + "step": 2949 + }, + { + "epoch": 2.6940639269406392, + "grad_norm": 0.5879680514335632, + "learning_rate": 8.118721461187215e-06, + "loss": 0.0066, + "step": 2950 + }, + { + "epoch": 2.6949771689497717, + "grad_norm": 1.2741496562957764, + "learning_rate": 8.117706747843734e-06, + "loss": 0.016, + "step": 2951 + }, + { + "epoch": 2.695890410958904, + "grad_norm": 56.49313735961914, + "learning_rate": 8.116692034500255e-06, + "loss": 1.5275, + "step": 2952 + }, + { + "epoch": 2.6968036529680366, + "grad_norm": 13.591636657714844, + "learning_rate": 8.115677321156773e-06, + "loss": 0.1466, + "step": 2953 + }, + { + "epoch": 2.697716894977169, + "grad_norm": 3.4322571754455566, + "learning_rate": 8.114662607813294e-06, + "loss": 0.0395, + "step": 2954 + }, + { + "epoch": 2.6986301369863015, + "grad_norm": 23.81235694885254, + "learning_rate": 8.113647894469813e-06, + "loss": 0.325, + "step": 2955 + }, + { + "epoch": 2.699543378995434, + "grad_norm": 26.732877731323242, + "learning_rate": 8.112633181126333e-06, + "loss": 0.4884, + "step": 2956 + }, + { + "epoch": 2.700456621004566, + "grad_norm": 6.327198505401611, + "learning_rate": 8.111618467782852e-06, + "loss": 0.0588, + "step": 2957 + }, + { + "epoch": 2.7013698630136984, + "grad_norm": 0.38398870825767517, + "learning_rate": 8.110603754439371e-06, + "loss": 0.0036, + "step": 2958 + }, + { + "epoch": 2.702283105022831, + "grad_norm": 13.692051887512207, + "learning_rate": 8.109589041095892e-06, + "loss": 0.233, + "step": 2959 + }, + { + "epoch": 2.7031963470319633, + "grad_norm": 51.10465621948242, + "learning_rate": 8.10857432775241e-06, + "loss": 1.5642, + "step": 2960 + }, + { + "epoch": 2.7041095890410958, + "grad_norm": 16.6375789642334, + "learning_rate": 8.10755961440893e-06, + "loss": 0.1459, + "step": 2961 + }, + { + "epoch": 2.7050228310502282, + "grad_norm": 8.278741836547852, + "learning_rate": 8.10654490106545e-06, + "loss": 0.0715, + "step": 2962 + }, + { + "epoch": 2.7059360730593607, + "grad_norm": 17.168180465698242, + "learning_rate": 8.105530187721969e-06, + "loss": 0.2804, + "step": 2963 + }, + { + "epoch": 2.706849315068493, + "grad_norm": 4.615281581878662, + "learning_rate": 8.104515474378489e-06, + "loss": 0.0375, + "step": 2964 + }, + { + "epoch": 2.7077625570776256, + "grad_norm": 69.18186950683594, + "learning_rate": 8.103500761035008e-06, + "loss": 2.0179, + "step": 2965 + }, + { + "epoch": 2.708675799086758, + "grad_norm": 7.289391040802002, + "learning_rate": 8.102486047691529e-06, + "loss": 0.1172, + "step": 2966 + }, + { + "epoch": 2.7095890410958905, + "grad_norm": 62.73874282836914, + "learning_rate": 8.101471334348047e-06, + "loss": 0.6444, + "step": 2967 + }, + { + "epoch": 2.710502283105023, + "grad_norm": 10.788712501525879, + "learning_rate": 8.100456621004566e-06, + "loss": 0.1597, + "step": 2968 + }, + { + "epoch": 2.7114155251141554, + "grad_norm": 40.0704460144043, + "learning_rate": 8.099441907661087e-06, + "loss": 0.4317, + "step": 2969 + }, + { + "epoch": 2.712328767123288, + "grad_norm": 44.55440139770508, + "learning_rate": 8.098427194317606e-06, + "loss": 1.3644, + "step": 2970 + }, + { + "epoch": 2.7132420091324203, + "grad_norm": 4.825766086578369, + "learning_rate": 8.097412480974124e-06, + "loss": 0.0467, + "step": 2971 + }, + { + "epoch": 2.7141552511415528, + "grad_norm": 69.48554229736328, + "learning_rate": 8.096397767630645e-06, + "loss": 1.4138, + "step": 2972 + }, + { + "epoch": 2.7150684931506848, + "grad_norm": 6.398422718048096, + "learning_rate": 8.095383054287166e-06, + "loss": 0.0555, + "step": 2973 + }, + { + "epoch": 2.7159817351598172, + "grad_norm": 99.86685180664062, + "learning_rate": 8.094368340943684e-06, + "loss": 4.9383, + "step": 2974 + }, + { + "epoch": 2.7168949771689497, + "grad_norm": 25.40868377685547, + "learning_rate": 8.093353627600203e-06, + "loss": 0.2234, + "step": 2975 + }, + { + "epoch": 2.717808219178082, + "grad_norm": 48.7321662902832, + "learning_rate": 8.092338914256724e-06, + "loss": 0.9116, + "step": 2976 + }, + { + "epoch": 2.7187214611872146, + "grad_norm": 35.12198257446289, + "learning_rate": 8.091324200913243e-06, + "loss": 0.6419, + "step": 2977 + }, + { + "epoch": 2.719634703196347, + "grad_norm": 15.225481033325195, + "learning_rate": 8.090309487569761e-06, + "loss": 0.141, + "step": 2978 + }, + { + "epoch": 2.7205479452054795, + "grad_norm": 32.79735565185547, + "learning_rate": 8.089294774226282e-06, + "loss": 0.6877, + "step": 2979 + }, + { + "epoch": 2.721461187214612, + "grad_norm": 82.72187042236328, + "learning_rate": 8.0882800608828e-06, + "loss": 2.3675, + "step": 2980 + }, + { + "epoch": 2.7223744292237444, + "grad_norm": 62.823143005371094, + "learning_rate": 8.08726534753932e-06, + "loss": 0.9645, + "step": 2981 + }, + { + "epoch": 2.723287671232877, + "grad_norm": 15.822604179382324, + "learning_rate": 8.08625063419584e-06, + "loss": 0.1665, + "step": 2982 + }, + { + "epoch": 2.724200913242009, + "grad_norm": 20.124752044677734, + "learning_rate": 8.08523592085236e-06, + "loss": 0.2073, + "step": 2983 + }, + { + "epoch": 2.7251141552511413, + "grad_norm": 5.293281555175781, + "learning_rate": 8.08422120750888e-06, + "loss": 0.0539, + "step": 2984 + }, + { + "epoch": 2.7260273972602738, + "grad_norm": 40.05672073364258, + "learning_rate": 8.083206494165398e-06, + "loss": 0.3339, + "step": 2985 + }, + { + "epoch": 2.726940639269406, + "grad_norm": 2.7517430782318115, + "learning_rate": 8.082191780821919e-06, + "loss": 0.0261, + "step": 2986 + }, + { + "epoch": 2.7278538812785387, + "grad_norm": 1.4772974252700806, + "learning_rate": 8.081177067478438e-06, + "loss": 0.0138, + "step": 2987 + }, + { + "epoch": 2.728767123287671, + "grad_norm": 11.093177795410156, + "learning_rate": 8.080162354134957e-06, + "loss": 0.1232, + "step": 2988 + }, + { + "epoch": 2.7296803652968036, + "grad_norm": 28.603351593017578, + "learning_rate": 8.079147640791477e-06, + "loss": 0.3879, + "step": 2989 + }, + { + "epoch": 2.730593607305936, + "grad_norm": 118.74646759033203, + "learning_rate": 8.078132927447998e-06, + "loss": 2.1241, + "step": 2990 + }, + { + "epoch": 2.7315068493150685, + "grad_norm": 9.599401473999023, + "learning_rate": 8.077118214104515e-06, + "loss": 0.0866, + "step": 2991 + }, + { + "epoch": 2.732420091324201, + "grad_norm": 4.118143081665039, + "learning_rate": 8.076103500761035e-06, + "loss": 0.0443, + "step": 2992 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 12.01229476928711, + "learning_rate": 8.075088787417556e-06, + "loss": 0.1441, + "step": 2993 + }, + { + "epoch": 2.734246575342466, + "grad_norm": 12.775846481323242, + "learning_rate": 8.074074074074075e-06, + "loss": 0.0819, + "step": 2994 + }, + { + "epoch": 2.7351598173515983, + "grad_norm": 15.658886909484863, + "learning_rate": 8.073059360730594e-06, + "loss": 0.0879, + "step": 2995 + }, + { + "epoch": 2.7360730593607308, + "grad_norm": 8.022988319396973, + "learning_rate": 8.072044647387114e-06, + "loss": 0.0983, + "step": 2996 + }, + { + "epoch": 2.736986301369863, + "grad_norm": 12.21670913696289, + "learning_rate": 8.071029934043633e-06, + "loss": 0.1838, + "step": 2997 + }, + { + "epoch": 2.7378995433789957, + "grad_norm": 10.241971969604492, + "learning_rate": 8.070015220700152e-06, + "loss": 0.1108, + "step": 2998 + }, + { + "epoch": 2.738812785388128, + "grad_norm": 1.465744972229004, + "learning_rate": 8.069000507356672e-06, + "loss": 0.0141, + "step": 2999 + }, + { + "epoch": 2.73972602739726, + "grad_norm": 4.166306972503662, + "learning_rate": 8.067985794013193e-06, + "loss": 0.0373, + "step": 3000 + }, + { + "epoch": 2.7406392694063926, + "grad_norm": 11.585968017578125, + "learning_rate": 8.066971080669712e-06, + "loss": 0.2131, + "step": 3001 + }, + { + "epoch": 2.741552511415525, + "grad_norm": 16.37436294555664, + "learning_rate": 8.06595636732623e-06, + "loss": 0.1356, + "step": 3002 + }, + { + "epoch": 2.7424657534246575, + "grad_norm": 61.958377838134766, + "learning_rate": 8.064941653982751e-06, + "loss": 0.6885, + "step": 3003 + }, + { + "epoch": 2.74337899543379, + "grad_norm": 41.36397933959961, + "learning_rate": 8.06392694063927e-06, + "loss": 0.6962, + "step": 3004 + }, + { + "epoch": 2.7442922374429224, + "grad_norm": 1.6594418287277222, + "learning_rate": 8.062912227295789e-06, + "loss": 0.0163, + "step": 3005 + }, + { + "epoch": 2.745205479452055, + "grad_norm": 8.35507583618164, + "learning_rate": 8.06189751395231e-06, + "loss": 0.1141, + "step": 3006 + }, + { + "epoch": 2.7461187214611873, + "grad_norm": 9.902356147766113, + "learning_rate": 8.060882800608828e-06, + "loss": 0.0625, + "step": 3007 + }, + { + "epoch": 2.7470319634703197, + "grad_norm": 20.731889724731445, + "learning_rate": 8.059868087265347e-06, + "loss": 0.2022, + "step": 3008 + }, + { + "epoch": 2.747945205479452, + "grad_norm": 11.830646514892578, + "learning_rate": 8.058853373921868e-06, + "loss": 0.1333, + "step": 3009 + }, + { + "epoch": 2.748858447488584, + "grad_norm": 23.337848663330078, + "learning_rate": 8.057838660578388e-06, + "loss": 0.2587, + "step": 3010 + }, + { + "epoch": 2.7497716894977167, + "grad_norm": 36.61185073852539, + "learning_rate": 8.056823947234907e-06, + "loss": 0.7687, + "step": 3011 + }, + { + "epoch": 2.750684931506849, + "grad_norm": 29.144695281982422, + "learning_rate": 8.055809233891426e-06, + "loss": 0.3671, + "step": 3012 + }, + { + "epoch": 2.7515981735159816, + "grad_norm": 30.362627029418945, + "learning_rate": 8.054794520547946e-06, + "loss": 0.5566, + "step": 3013 + }, + { + "epoch": 2.752511415525114, + "grad_norm": 5.262637138366699, + "learning_rate": 8.053779807204465e-06, + "loss": 0.0426, + "step": 3014 + }, + { + "epoch": 2.7534246575342465, + "grad_norm": 24.24032974243164, + "learning_rate": 8.052765093860984e-06, + "loss": 0.4081, + "step": 3015 + }, + { + "epoch": 2.754337899543379, + "grad_norm": 12.07713794708252, + "learning_rate": 8.051750380517505e-06, + "loss": 0.1501, + "step": 3016 + }, + { + "epoch": 2.7552511415525114, + "grad_norm": 9.117316246032715, + "learning_rate": 8.050735667174025e-06, + "loss": 0.0645, + "step": 3017 + }, + { + "epoch": 2.756164383561644, + "grad_norm": 82.37154388427734, + "learning_rate": 8.049720953830542e-06, + "loss": 2.3894, + "step": 3018 + }, + { + "epoch": 2.7570776255707763, + "grad_norm": 50.310001373291016, + "learning_rate": 8.048706240487063e-06, + "loss": 1.3138, + "step": 3019 + }, + { + "epoch": 2.7579908675799087, + "grad_norm": 11.22953987121582, + "learning_rate": 8.047691527143583e-06, + "loss": 0.1182, + "step": 3020 + }, + { + "epoch": 2.758904109589041, + "grad_norm": 2.2795863151550293, + "learning_rate": 8.046676813800102e-06, + "loss": 0.023, + "step": 3021 + }, + { + "epoch": 2.7598173515981737, + "grad_norm": 8.864395141601562, + "learning_rate": 8.045662100456621e-06, + "loss": 0.0919, + "step": 3022 + }, + { + "epoch": 2.760730593607306, + "grad_norm": 30.504920959472656, + "learning_rate": 8.044647387113142e-06, + "loss": 0.2936, + "step": 3023 + }, + { + "epoch": 2.7616438356164386, + "grad_norm": 5.070019245147705, + "learning_rate": 8.04363267376966e-06, + "loss": 0.0573, + "step": 3024 + }, + { + "epoch": 2.762557077625571, + "grad_norm": 27.782201766967773, + "learning_rate": 8.04261796042618e-06, + "loss": 0.4365, + "step": 3025 + }, + { + "epoch": 2.7634703196347035, + "grad_norm": 36.762542724609375, + "learning_rate": 8.0416032470827e-06, + "loss": 0.7338, + "step": 3026 + }, + { + "epoch": 2.7643835616438355, + "grad_norm": 9.038432121276855, + "learning_rate": 8.04058853373922e-06, + "loss": 0.1486, + "step": 3027 + }, + { + "epoch": 2.765296803652968, + "grad_norm": 64.13188171386719, + "learning_rate": 8.03957382039574e-06, + "loss": 1.4692, + "step": 3028 + }, + { + "epoch": 2.7662100456621004, + "grad_norm": 16.393220901489258, + "learning_rate": 8.038559107052258e-06, + "loss": 0.2383, + "step": 3029 + }, + { + "epoch": 2.767123287671233, + "grad_norm": 3.402338743209839, + "learning_rate": 8.037544393708779e-06, + "loss": 0.0459, + "step": 3030 + }, + { + "epoch": 2.7680365296803653, + "grad_norm": 25.38410186767578, + "learning_rate": 8.036529680365297e-06, + "loss": 0.3207, + "step": 3031 + }, + { + "epoch": 2.7689497716894977, + "grad_norm": 67.30159759521484, + "learning_rate": 8.035514967021816e-06, + "loss": 2.6989, + "step": 3032 + }, + { + "epoch": 2.76986301369863, + "grad_norm": 51.55864334106445, + "learning_rate": 8.034500253678337e-06, + "loss": 0.603, + "step": 3033 + }, + { + "epoch": 2.7707762557077626, + "grad_norm": 7.236391544342041, + "learning_rate": 8.033485540334857e-06, + "loss": 0.0742, + "step": 3034 + }, + { + "epoch": 2.771689497716895, + "grad_norm": 10.39455509185791, + "learning_rate": 8.032470826991375e-06, + "loss": 0.1278, + "step": 3035 + }, + { + "epoch": 2.7726027397260276, + "grad_norm": 7.902087688446045, + "learning_rate": 8.031456113647895e-06, + "loss": 0.0685, + "step": 3036 + }, + { + "epoch": 2.7735159817351596, + "grad_norm": 12.022306442260742, + "learning_rate": 8.030441400304416e-06, + "loss": 0.1219, + "step": 3037 + }, + { + "epoch": 2.774429223744292, + "grad_norm": 78.17046356201172, + "learning_rate": 8.029426686960934e-06, + "loss": 1.6695, + "step": 3038 + }, + { + "epoch": 2.7753424657534245, + "grad_norm": 22.8796329498291, + "learning_rate": 8.028411973617453e-06, + "loss": 0.237, + "step": 3039 + }, + { + "epoch": 2.776255707762557, + "grad_norm": 9.921940803527832, + "learning_rate": 8.027397260273974e-06, + "loss": 0.1469, + "step": 3040 + }, + { + "epoch": 2.7771689497716894, + "grad_norm": 5.6445441246032715, + "learning_rate": 8.026382546930493e-06, + "loss": 0.0727, + "step": 3041 + }, + { + "epoch": 2.778082191780822, + "grad_norm": 33.55414581298828, + "learning_rate": 8.025367833587012e-06, + "loss": 0.2982, + "step": 3042 + }, + { + "epoch": 2.7789954337899543, + "grad_norm": 27.896360397338867, + "learning_rate": 8.024353120243532e-06, + "loss": 0.2893, + "step": 3043 + }, + { + "epoch": 2.7799086757990867, + "grad_norm": 36.70811462402344, + "learning_rate": 8.023338406900053e-06, + "loss": 0.2783, + "step": 3044 + }, + { + "epoch": 2.780821917808219, + "grad_norm": 29.436491012573242, + "learning_rate": 8.022323693556571e-06, + "loss": 0.4172, + "step": 3045 + }, + { + "epoch": 2.7817351598173516, + "grad_norm": 3.1886353492736816, + "learning_rate": 8.02130898021309e-06, + "loss": 0.0401, + "step": 3046 + }, + { + "epoch": 2.782648401826484, + "grad_norm": 1.2836374044418335, + "learning_rate": 8.020294266869611e-06, + "loss": 0.0111, + "step": 3047 + }, + { + "epoch": 2.7835616438356166, + "grad_norm": 19.420228958129883, + "learning_rate": 8.01927955352613e-06, + "loss": 0.2427, + "step": 3048 + }, + { + "epoch": 2.784474885844749, + "grad_norm": 51.74895095825195, + "learning_rate": 8.018264840182649e-06, + "loss": 0.8678, + "step": 3049 + }, + { + "epoch": 2.7853881278538815, + "grad_norm": 26.62108039855957, + "learning_rate": 8.017250126839169e-06, + "loss": 0.3537, + "step": 3050 + }, + { + "epoch": 2.786301369863014, + "grad_norm": 33.27643585205078, + "learning_rate": 8.016235413495688e-06, + "loss": 0.6308, + "step": 3051 + }, + { + "epoch": 2.7872146118721464, + "grad_norm": 69.25408935546875, + "learning_rate": 8.015220700152207e-06, + "loss": 1.4673, + "step": 3052 + }, + { + "epoch": 2.7881278538812784, + "grad_norm": 6.946503639221191, + "learning_rate": 8.014205986808727e-06, + "loss": 0.0674, + "step": 3053 + }, + { + "epoch": 2.789041095890411, + "grad_norm": 9.659690856933594, + "learning_rate": 8.013191273465248e-06, + "loss": 0.0917, + "step": 3054 + }, + { + "epoch": 2.7899543378995433, + "grad_norm": 1.2217824459075928, + "learning_rate": 8.012176560121767e-06, + "loss": 0.0139, + "step": 3055 + }, + { + "epoch": 2.7908675799086757, + "grad_norm": 3.7603914737701416, + "learning_rate": 8.011161846778286e-06, + "loss": 0.0491, + "step": 3056 + }, + { + "epoch": 2.791780821917808, + "grad_norm": 43.9328498840332, + "learning_rate": 8.010147133434806e-06, + "loss": 0.8585, + "step": 3057 + }, + { + "epoch": 2.7926940639269406, + "grad_norm": 39.31818389892578, + "learning_rate": 8.009132420091325e-06, + "loss": 0.6404, + "step": 3058 + }, + { + "epoch": 2.793607305936073, + "grad_norm": 0.5397601127624512, + "learning_rate": 8.008117706747844e-06, + "loss": 0.0057, + "step": 3059 + }, + { + "epoch": 2.7945205479452055, + "grad_norm": 41.467891693115234, + "learning_rate": 8.007102993404364e-06, + "loss": 0.6574, + "step": 3060 + }, + { + "epoch": 2.795433789954338, + "grad_norm": 44.4515380859375, + "learning_rate": 8.006088280060883e-06, + "loss": 0.7105, + "step": 3061 + }, + { + "epoch": 2.7963470319634705, + "grad_norm": 59.94812774658203, + "learning_rate": 8.005073566717404e-06, + "loss": 3.0023, + "step": 3062 + }, + { + "epoch": 2.7972602739726025, + "grad_norm": 34.104759216308594, + "learning_rate": 8.004058853373923e-06, + "loss": 0.546, + "step": 3063 + }, + { + "epoch": 2.798173515981735, + "grad_norm": 46.50446319580078, + "learning_rate": 8.003044140030443e-06, + "loss": 1.2493, + "step": 3064 + }, + { + "epoch": 2.7990867579908674, + "grad_norm": 14.682753562927246, + "learning_rate": 8.002029426686962e-06, + "loss": 0.1016, + "step": 3065 + }, + { + "epoch": 2.8, + "grad_norm": 37.964237213134766, + "learning_rate": 8.00101471334348e-06, + "loss": 0.373, + "step": 3066 + }, + { + "epoch": 2.8009132420091323, + "grad_norm": 4.369853973388672, + "learning_rate": 8.000000000000001e-06, + "loss": 0.0479, + "step": 3067 + }, + { + "epoch": 2.8018264840182647, + "grad_norm": 14.39913272857666, + "learning_rate": 7.99898528665652e-06, + "loss": 0.2648, + "step": 3068 + }, + { + "epoch": 2.802739726027397, + "grad_norm": 32.394989013671875, + "learning_rate": 7.997970573313039e-06, + "loss": 0.4532, + "step": 3069 + }, + { + "epoch": 2.8036529680365296, + "grad_norm": 1.6877996921539307, + "learning_rate": 7.99695585996956e-06, + "loss": 0.0135, + "step": 3070 + }, + { + "epoch": 2.804566210045662, + "grad_norm": 8.766670227050781, + "learning_rate": 7.995941146626078e-06, + "loss": 0.0667, + "step": 3071 + }, + { + "epoch": 2.8054794520547945, + "grad_norm": 7.932440280914307, + "learning_rate": 7.994926433282599e-06, + "loss": 0.0927, + "step": 3072 + }, + { + "epoch": 2.806392694063927, + "grad_norm": 20.410654067993164, + "learning_rate": 7.993911719939118e-06, + "loss": 0.2882, + "step": 3073 + }, + { + "epoch": 2.8073059360730594, + "grad_norm": 61.35655212402344, + "learning_rate": 7.992897006595637e-06, + "loss": 4.5035, + "step": 3074 + }, + { + "epoch": 2.808219178082192, + "grad_norm": 64.97151947021484, + "learning_rate": 7.991882293252157e-06, + "loss": 2.1254, + "step": 3075 + }, + { + "epoch": 2.8091324200913244, + "grad_norm": 23.54462432861328, + "learning_rate": 7.990867579908676e-06, + "loss": 0.3509, + "step": 3076 + }, + { + "epoch": 2.810045662100457, + "grad_norm": 23.49645233154297, + "learning_rate": 7.989852866565197e-06, + "loss": 0.2309, + "step": 3077 + }, + { + "epoch": 2.8109589041095893, + "grad_norm": 14.621864318847656, + "learning_rate": 7.988838153221715e-06, + "loss": 0.1397, + "step": 3078 + }, + { + "epoch": 2.8118721461187217, + "grad_norm": 27.848947525024414, + "learning_rate": 7.987823439878234e-06, + "loss": 0.5267, + "step": 3079 + }, + { + "epoch": 2.8127853881278537, + "grad_norm": 47.04505157470703, + "learning_rate": 7.986808726534755e-06, + "loss": 0.7245, + "step": 3080 + }, + { + "epoch": 2.813698630136986, + "grad_norm": 10.97865104675293, + "learning_rate": 7.985794013191274e-06, + "loss": 0.1667, + "step": 3081 + }, + { + "epoch": 2.8146118721461186, + "grad_norm": 51.286094665527344, + "learning_rate": 7.984779299847794e-06, + "loss": 1.1043, + "step": 3082 + }, + { + "epoch": 2.815525114155251, + "grad_norm": 4.798668384552002, + "learning_rate": 7.983764586504313e-06, + "loss": 0.0659, + "step": 3083 + }, + { + "epoch": 2.8164383561643835, + "grad_norm": 9.428156852722168, + "learning_rate": 7.982749873160832e-06, + "loss": 0.1303, + "step": 3084 + }, + { + "epoch": 2.817351598173516, + "grad_norm": 16.631973266601562, + "learning_rate": 7.981735159817352e-06, + "loss": 0.2557, + "step": 3085 + }, + { + "epoch": 2.8182648401826484, + "grad_norm": 33.126590728759766, + "learning_rate": 7.980720446473871e-06, + "loss": 0.3484, + "step": 3086 + }, + { + "epoch": 2.819178082191781, + "grad_norm": 26.653919219970703, + "learning_rate": 7.979705733130392e-06, + "loss": 0.5004, + "step": 3087 + }, + { + "epoch": 2.8200913242009134, + "grad_norm": 63.17335510253906, + "learning_rate": 7.97869101978691e-06, + "loss": 1.2814, + "step": 3088 + }, + { + "epoch": 2.821004566210046, + "grad_norm": 4.289588928222656, + "learning_rate": 7.977676306443431e-06, + "loss": 0.0479, + "step": 3089 + }, + { + "epoch": 2.821917808219178, + "grad_norm": 20.496793746948242, + "learning_rate": 7.97666159309995e-06, + "loss": 0.2343, + "step": 3090 + }, + { + "epoch": 2.8228310502283103, + "grad_norm": 6.326916217803955, + "learning_rate": 7.975646879756469e-06, + "loss": 0.0929, + "step": 3091 + }, + { + "epoch": 2.8237442922374427, + "grad_norm": 77.16057586669922, + "learning_rate": 7.97463216641299e-06, + "loss": 0.5648, + "step": 3092 + }, + { + "epoch": 2.824657534246575, + "grad_norm": 9.96139144897461, + "learning_rate": 7.973617453069508e-06, + "loss": 0.1275, + "step": 3093 + }, + { + "epoch": 2.8255707762557076, + "grad_norm": 28.553197860717773, + "learning_rate": 7.972602739726027e-06, + "loss": 0.4305, + "step": 3094 + }, + { + "epoch": 2.82648401826484, + "grad_norm": 7.191932201385498, + "learning_rate": 7.971588026382548e-06, + "loss": 0.0612, + "step": 3095 + }, + { + "epoch": 2.8273972602739725, + "grad_norm": 2.7752697467803955, + "learning_rate": 7.970573313039066e-06, + "loss": 0.0137, + "step": 3096 + }, + { + "epoch": 2.828310502283105, + "grad_norm": 12.41064453125, + "learning_rate": 7.969558599695587e-06, + "loss": 0.134, + "step": 3097 + }, + { + "epoch": 2.8292237442922374, + "grad_norm": 20.607746124267578, + "learning_rate": 7.968543886352106e-06, + "loss": 0.3665, + "step": 3098 + }, + { + "epoch": 2.83013698630137, + "grad_norm": 15.165226936340332, + "learning_rate": 7.967529173008626e-06, + "loss": 0.2334, + "step": 3099 + }, + { + "epoch": 2.8310502283105023, + "grad_norm": 21.647249221801758, + "learning_rate": 7.966514459665145e-06, + "loss": 0.315, + "step": 3100 + }, + { + "epoch": 2.831963470319635, + "grad_norm": 64.48255920410156, + "learning_rate": 7.965499746321664e-06, + "loss": 1.4602, + "step": 3101 + }, + { + "epoch": 2.8328767123287673, + "grad_norm": 7.007905960083008, + "learning_rate": 7.964485032978185e-06, + "loss": 0.0785, + "step": 3102 + }, + { + "epoch": 2.8337899543378997, + "grad_norm": 9.063576698303223, + "learning_rate": 7.963470319634703e-06, + "loss": 0.0811, + "step": 3103 + }, + { + "epoch": 2.834703196347032, + "grad_norm": 2.466517448425293, + "learning_rate": 7.962455606291222e-06, + "loss": 0.0348, + "step": 3104 + }, + { + "epoch": 2.8356164383561646, + "grad_norm": 68.73615264892578, + "learning_rate": 7.961440892947743e-06, + "loss": 1.9915, + "step": 3105 + }, + { + "epoch": 2.836529680365297, + "grad_norm": 64.19255828857422, + "learning_rate": 7.960426179604263e-06, + "loss": 1.3509, + "step": 3106 + }, + { + "epoch": 2.837442922374429, + "grad_norm": 26.148189544677734, + "learning_rate": 7.959411466260782e-06, + "loss": 0.389, + "step": 3107 + }, + { + "epoch": 2.8383561643835615, + "grad_norm": 10.742561340332031, + "learning_rate": 7.958396752917301e-06, + "loss": 0.1632, + "step": 3108 + }, + { + "epoch": 2.839269406392694, + "grad_norm": 2.822744131088257, + "learning_rate": 7.957382039573822e-06, + "loss": 0.0272, + "step": 3109 + }, + { + "epoch": 2.8401826484018264, + "grad_norm": 22.131433486938477, + "learning_rate": 7.95636732623034e-06, + "loss": 0.3557, + "step": 3110 + }, + { + "epoch": 2.841095890410959, + "grad_norm": 41.77159118652344, + "learning_rate": 7.95535261288686e-06, + "loss": 0.508, + "step": 3111 + }, + { + "epoch": 2.8420091324200913, + "grad_norm": 15.627573013305664, + "learning_rate": 7.95433789954338e-06, + "loss": 0.2105, + "step": 3112 + }, + { + "epoch": 2.842922374429224, + "grad_norm": 13.608348846435547, + "learning_rate": 7.953323186199899e-06, + "loss": 0.1455, + "step": 3113 + }, + { + "epoch": 2.8438356164383563, + "grad_norm": 13.29078197479248, + "learning_rate": 7.952308472856418e-06, + "loss": 0.1909, + "step": 3114 + }, + { + "epoch": 2.8447488584474887, + "grad_norm": 23.698272705078125, + "learning_rate": 7.951293759512938e-06, + "loss": 0.2129, + "step": 3115 + }, + { + "epoch": 2.845662100456621, + "grad_norm": 45.69972229003906, + "learning_rate": 7.950279046169459e-06, + "loss": 0.6982, + "step": 3116 + }, + { + "epoch": 2.846575342465753, + "grad_norm": 1.0200293064117432, + "learning_rate": 7.949264332825977e-06, + "loss": 0.0062, + "step": 3117 + }, + { + "epoch": 2.8474885844748856, + "grad_norm": 34.28459167480469, + "learning_rate": 7.948249619482496e-06, + "loss": 0.3164, + "step": 3118 + }, + { + "epoch": 2.848401826484018, + "grad_norm": 23.41339111328125, + "learning_rate": 7.947234906139017e-06, + "loss": 0.1727, + "step": 3119 + }, + { + "epoch": 2.8493150684931505, + "grad_norm": 43.83945846557617, + "learning_rate": 7.946220192795536e-06, + "loss": 0.5061, + "step": 3120 + }, + { + "epoch": 2.850228310502283, + "grad_norm": 69.60856628417969, + "learning_rate": 7.945205479452055e-06, + "loss": 1.8275, + "step": 3121 + }, + { + "epoch": 2.8511415525114154, + "grad_norm": 51.99209213256836, + "learning_rate": 7.944190766108575e-06, + "loss": 1.0266, + "step": 3122 + }, + { + "epoch": 2.852054794520548, + "grad_norm": 77.13423919677734, + "learning_rate": 7.943176052765094e-06, + "loss": 1.1593, + "step": 3123 + }, + { + "epoch": 2.8529680365296803, + "grad_norm": 10.37430477142334, + "learning_rate": 7.942161339421613e-06, + "loss": 0.0999, + "step": 3124 + }, + { + "epoch": 2.853881278538813, + "grad_norm": 76.4687728881836, + "learning_rate": 7.941146626078133e-06, + "loss": 2.0698, + "step": 3125 + }, + { + "epoch": 2.8547945205479452, + "grad_norm": 3.2218871116638184, + "learning_rate": 7.940131912734654e-06, + "loss": 0.0271, + "step": 3126 + }, + { + "epoch": 2.8557077625570777, + "grad_norm": 51.73987579345703, + "learning_rate": 7.939117199391173e-06, + "loss": 2.0796, + "step": 3127 + }, + { + "epoch": 2.85662100456621, + "grad_norm": 66.05243682861328, + "learning_rate": 7.938102486047692e-06, + "loss": 1.7101, + "step": 3128 + }, + { + "epoch": 2.8575342465753426, + "grad_norm": 29.20876693725586, + "learning_rate": 7.937087772704212e-06, + "loss": 0.4342, + "step": 3129 + }, + { + "epoch": 2.858447488584475, + "grad_norm": 2.863192558288574, + "learning_rate": 7.936073059360731e-06, + "loss": 0.0274, + "step": 3130 + }, + { + "epoch": 2.8593607305936075, + "grad_norm": 1.2382991313934326, + "learning_rate": 7.93505834601725e-06, + "loss": 0.009, + "step": 3131 + }, + { + "epoch": 2.86027397260274, + "grad_norm": 2.334897041320801, + "learning_rate": 7.93404363267377e-06, + "loss": 0.0212, + "step": 3132 + }, + { + "epoch": 2.8611872146118724, + "grad_norm": 62.49687576293945, + "learning_rate": 7.93302891933029e-06, + "loss": 1.0588, + "step": 3133 + }, + { + "epoch": 2.8621004566210044, + "grad_norm": 0.18166369199752808, + "learning_rate": 7.932014205986808e-06, + "loss": 0.0014, + "step": 3134 + }, + { + "epoch": 2.863013698630137, + "grad_norm": 58.386817932128906, + "learning_rate": 7.930999492643329e-06, + "loss": 1.1506, + "step": 3135 + }, + { + "epoch": 2.8639269406392693, + "grad_norm": 90.24066925048828, + "learning_rate": 7.929984779299849e-06, + "loss": 2.1896, + "step": 3136 + }, + { + "epoch": 2.864840182648402, + "grad_norm": 31.145767211914062, + "learning_rate": 7.928970065956368e-06, + "loss": 0.4622, + "step": 3137 + }, + { + "epoch": 2.8657534246575342, + "grad_norm": 21.630088806152344, + "learning_rate": 7.927955352612887e-06, + "loss": 0.2003, + "step": 3138 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 5.361281871795654, + "learning_rate": 7.926940639269407e-06, + "loss": 0.0558, + "step": 3139 + }, + { + "epoch": 2.867579908675799, + "grad_norm": 58.64895248413086, + "learning_rate": 7.925925925925926e-06, + "loss": 0.6333, + "step": 3140 + }, + { + "epoch": 2.8684931506849316, + "grad_norm": 16.31788444519043, + "learning_rate": 7.924911212582445e-06, + "loss": 0.2041, + "step": 3141 + }, + { + "epoch": 2.869406392694064, + "grad_norm": 49.112205505371094, + "learning_rate": 7.923896499238966e-06, + "loss": 0.7473, + "step": 3142 + }, + { + "epoch": 2.8703196347031965, + "grad_norm": 23.17085075378418, + "learning_rate": 7.922881785895486e-06, + "loss": 0.2988, + "step": 3143 + }, + { + "epoch": 2.8712328767123285, + "grad_norm": 56.35826873779297, + "learning_rate": 7.921867072552005e-06, + "loss": 1.1006, + "step": 3144 + }, + { + "epoch": 2.872146118721461, + "grad_norm": 4.81275749206543, + "learning_rate": 7.920852359208524e-06, + "loss": 0.0424, + "step": 3145 + }, + { + "epoch": 2.8730593607305934, + "grad_norm": 79.0522689819336, + "learning_rate": 7.919837645865044e-06, + "loss": 0.8535, + "step": 3146 + }, + { + "epoch": 2.873972602739726, + "grad_norm": 1.8562755584716797, + "learning_rate": 7.918822932521563e-06, + "loss": 0.02, + "step": 3147 + }, + { + "epoch": 2.8748858447488583, + "grad_norm": 82.50406646728516, + "learning_rate": 7.917808219178082e-06, + "loss": 2.5136, + "step": 3148 + }, + { + "epoch": 2.875799086757991, + "grad_norm": 4.429163455963135, + "learning_rate": 7.916793505834603e-06, + "loss": 0.052, + "step": 3149 + }, + { + "epoch": 2.8767123287671232, + "grad_norm": 67.08084869384766, + "learning_rate": 7.915778792491123e-06, + "loss": 0.7528, + "step": 3150 + }, + { + "epoch": 2.8776255707762557, + "grad_norm": 89.16523742675781, + "learning_rate": 7.91476407914764e-06, + "loss": 0.8909, + "step": 3151 + }, + { + "epoch": 2.878538812785388, + "grad_norm": 8.522201538085938, + "learning_rate": 7.91374936580416e-06, + "loss": 0.102, + "step": 3152 + }, + { + "epoch": 2.8794520547945206, + "grad_norm": 6.13755464553833, + "learning_rate": 7.912734652460681e-06, + "loss": 0.0505, + "step": 3153 + }, + { + "epoch": 2.880365296803653, + "grad_norm": 10.52943229675293, + "learning_rate": 7.9117199391172e-06, + "loss": 0.1124, + "step": 3154 + }, + { + "epoch": 2.8812785388127855, + "grad_norm": 20.672069549560547, + "learning_rate": 7.910705225773719e-06, + "loss": 0.149, + "step": 3155 + }, + { + "epoch": 2.882191780821918, + "grad_norm": 60.779136657714844, + "learning_rate": 7.90969051243024e-06, + "loss": 1.2618, + "step": 3156 + }, + { + "epoch": 2.8831050228310504, + "grad_norm": 36.05194091796875, + "learning_rate": 7.908675799086758e-06, + "loss": 0.6956, + "step": 3157 + }, + { + "epoch": 2.884018264840183, + "grad_norm": 6.255834102630615, + "learning_rate": 7.907661085743277e-06, + "loss": 0.0656, + "step": 3158 + }, + { + "epoch": 2.8849315068493153, + "grad_norm": 24.092731475830078, + "learning_rate": 7.906646372399798e-06, + "loss": 0.2308, + "step": 3159 + }, + { + "epoch": 2.8858447488584473, + "grad_norm": 1.0531953573226929, + "learning_rate": 7.905631659056318e-06, + "loss": 0.0108, + "step": 3160 + }, + { + "epoch": 2.88675799086758, + "grad_norm": 44.55252456665039, + "learning_rate": 7.904616945712837e-06, + "loss": 0.4047, + "step": 3161 + }, + { + "epoch": 2.8876712328767122, + "grad_norm": 3.8411712646484375, + "learning_rate": 7.903602232369356e-06, + "loss": 0.029, + "step": 3162 + }, + { + "epoch": 2.8885844748858447, + "grad_norm": 1.892507791519165, + "learning_rate": 7.902587519025877e-06, + "loss": 0.014, + "step": 3163 + }, + { + "epoch": 2.889497716894977, + "grad_norm": 0.18025518953800201, + "learning_rate": 7.901572805682395e-06, + "loss": 0.0016, + "step": 3164 + }, + { + "epoch": 2.8904109589041096, + "grad_norm": 8.7930269241333, + "learning_rate": 7.900558092338914e-06, + "loss": 0.0644, + "step": 3165 + }, + { + "epoch": 2.891324200913242, + "grad_norm": 32.25492858886719, + "learning_rate": 7.899543378995435e-06, + "loss": 0.2937, + "step": 3166 + }, + { + "epoch": 2.8922374429223745, + "grad_norm": 72.72599029541016, + "learning_rate": 7.898528665651954e-06, + "loss": 0.9685, + "step": 3167 + }, + { + "epoch": 2.893150684931507, + "grad_norm": 29.699684143066406, + "learning_rate": 7.897513952308472e-06, + "loss": 0.3963, + "step": 3168 + }, + { + "epoch": 2.8940639269406394, + "grad_norm": 15.971379280090332, + "learning_rate": 7.896499238964993e-06, + "loss": 0.1617, + "step": 3169 + }, + { + "epoch": 2.8949771689497714, + "grad_norm": 33.22615051269531, + "learning_rate": 7.895484525621514e-06, + "loss": 0.5571, + "step": 3170 + }, + { + "epoch": 2.895890410958904, + "grad_norm": 10.724595069885254, + "learning_rate": 7.894469812278032e-06, + "loss": 0.1, + "step": 3171 + }, + { + "epoch": 2.8968036529680363, + "grad_norm": 2.0476107597351074, + "learning_rate": 7.893455098934551e-06, + "loss": 0.0255, + "step": 3172 + }, + { + "epoch": 2.8977168949771688, + "grad_norm": 11.601114273071289, + "learning_rate": 7.892440385591072e-06, + "loss": 0.1164, + "step": 3173 + }, + { + "epoch": 2.8986301369863012, + "grad_norm": 18.04137420654297, + "learning_rate": 7.89142567224759e-06, + "loss": 0.225, + "step": 3174 + }, + { + "epoch": 2.8995433789954337, + "grad_norm": 12.724217414855957, + "learning_rate": 7.89041095890411e-06, + "loss": 0.1154, + "step": 3175 + }, + { + "epoch": 2.900456621004566, + "grad_norm": 6.78741979598999, + "learning_rate": 7.88939624556063e-06, + "loss": 0.0612, + "step": 3176 + }, + { + "epoch": 2.9013698630136986, + "grad_norm": 33.45146560668945, + "learning_rate": 7.88838153221715e-06, + "loss": 0.3564, + "step": 3177 + }, + { + "epoch": 2.902283105022831, + "grad_norm": 41.81211471557617, + "learning_rate": 7.887366818873668e-06, + "loss": 1.0468, + "step": 3178 + }, + { + "epoch": 2.9031963470319635, + "grad_norm": 0.5690261125564575, + "learning_rate": 7.886352105530188e-06, + "loss": 0.0066, + "step": 3179 + }, + { + "epoch": 2.904109589041096, + "grad_norm": 32.82957077026367, + "learning_rate": 7.885337392186709e-06, + "loss": 0.3454, + "step": 3180 + }, + { + "epoch": 2.9050228310502284, + "grad_norm": 0.4620952010154724, + "learning_rate": 7.884322678843228e-06, + "loss": 0.0054, + "step": 3181 + }, + { + "epoch": 2.905936073059361, + "grad_norm": 31.309860229492188, + "learning_rate": 7.883307965499746e-06, + "loss": 0.2527, + "step": 3182 + }, + { + "epoch": 2.9068493150684933, + "grad_norm": 26.704090118408203, + "learning_rate": 7.882293252156267e-06, + "loss": 0.2875, + "step": 3183 + }, + { + "epoch": 2.9077625570776258, + "grad_norm": 62.264801025390625, + "learning_rate": 7.881278538812786e-06, + "loss": 1.483, + "step": 3184 + }, + { + "epoch": 2.908675799086758, + "grad_norm": 1.4738080501556396, + "learning_rate": 7.880263825469305e-06, + "loss": 0.0094, + "step": 3185 + }, + { + "epoch": 2.9095890410958907, + "grad_norm": 43.04144287109375, + "learning_rate": 7.879249112125825e-06, + "loss": 0.4586, + "step": 3186 + }, + { + "epoch": 2.9105022831050227, + "grad_norm": 15.962101936340332, + "learning_rate": 7.878234398782346e-06, + "loss": 0.1548, + "step": 3187 + }, + { + "epoch": 2.911415525114155, + "grad_norm": 16.796476364135742, + "learning_rate": 7.877219685438865e-06, + "loss": 0.1138, + "step": 3188 + }, + { + "epoch": 2.9123287671232876, + "grad_norm": 5.363128662109375, + "learning_rate": 7.876204972095383e-06, + "loss": 0.0467, + "step": 3189 + }, + { + "epoch": 2.91324200913242, + "grad_norm": 2.1115944385528564, + "learning_rate": 7.875190258751904e-06, + "loss": 0.0171, + "step": 3190 + }, + { + "epoch": 2.9141552511415525, + "grad_norm": 58.45020294189453, + "learning_rate": 7.874175545408423e-06, + "loss": 0.9248, + "step": 3191 + }, + { + "epoch": 2.915068493150685, + "grad_norm": 8.666828155517578, + "learning_rate": 7.873160832064942e-06, + "loss": 0.0589, + "step": 3192 + }, + { + "epoch": 2.9159817351598174, + "grad_norm": 9.891768455505371, + "learning_rate": 7.872146118721462e-06, + "loss": 0.0751, + "step": 3193 + }, + { + "epoch": 2.91689497716895, + "grad_norm": 0.3444499373435974, + "learning_rate": 7.871131405377981e-06, + "loss": 0.0034, + "step": 3194 + }, + { + "epoch": 2.9178082191780823, + "grad_norm": 79.40483093261719, + "learning_rate": 7.8701166920345e-06, + "loss": 3.3612, + "step": 3195 + }, + { + "epoch": 2.9187214611872148, + "grad_norm": 17.837697982788086, + "learning_rate": 7.86910197869102e-06, + "loss": 0.17, + "step": 3196 + }, + { + "epoch": 2.9196347031963468, + "grad_norm": 15.137681007385254, + "learning_rate": 7.868087265347541e-06, + "loss": 0.1413, + "step": 3197 + }, + { + "epoch": 2.9205479452054792, + "grad_norm": 15.481264114379883, + "learning_rate": 7.86707255200406e-06, + "loss": 0.1734, + "step": 3198 + }, + { + "epoch": 2.9214611872146117, + "grad_norm": 0.2312554568052292, + "learning_rate": 7.866057838660579e-06, + "loss": 0.002, + "step": 3199 + }, + { + "epoch": 2.922374429223744, + "grad_norm": 24.98311996459961, + "learning_rate": 7.8650431253171e-06, + "loss": 0.2732, + "step": 3200 + }, + { + "epoch": 2.9232876712328766, + "grad_norm": 8.84883975982666, + "learning_rate": 7.864028411973618e-06, + "loss": 0.0604, + "step": 3201 + }, + { + "epoch": 2.924200913242009, + "grad_norm": 3.6488749980926514, + "learning_rate": 7.863013698630137e-06, + "loss": 0.027, + "step": 3202 + }, + { + "epoch": 2.9251141552511415, + "grad_norm": 4.990022659301758, + "learning_rate": 7.861998985286657e-06, + "loss": 0.0331, + "step": 3203 + }, + { + "epoch": 2.926027397260274, + "grad_norm": 1.5021251440048218, + "learning_rate": 7.860984271943176e-06, + "loss": 0.0136, + "step": 3204 + }, + { + "epoch": 2.9269406392694064, + "grad_norm": 31.164955139160156, + "learning_rate": 7.859969558599697e-06, + "loss": 0.3371, + "step": 3205 + }, + { + "epoch": 2.927853881278539, + "grad_norm": 3.312035322189331, + "learning_rate": 7.858954845256216e-06, + "loss": 0.0222, + "step": 3206 + }, + { + "epoch": 2.9287671232876713, + "grad_norm": 2.9034905433654785, + "learning_rate": 7.857940131912736e-06, + "loss": 0.0312, + "step": 3207 + }, + { + "epoch": 2.9296803652968038, + "grad_norm": 47.165855407714844, + "learning_rate": 7.856925418569255e-06, + "loss": 0.6179, + "step": 3208 + }, + { + "epoch": 2.930593607305936, + "grad_norm": 22.775562286376953, + "learning_rate": 7.855910705225774e-06, + "loss": 0.2665, + "step": 3209 + }, + { + "epoch": 2.9315068493150687, + "grad_norm": 82.63949584960938, + "learning_rate": 7.854895991882294e-06, + "loss": 1.5608, + "step": 3210 + }, + { + "epoch": 2.932420091324201, + "grad_norm": 46.8050422668457, + "learning_rate": 7.853881278538813e-06, + "loss": 0.4904, + "step": 3211 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 1.1557191610336304, + "learning_rate": 7.852866565195332e-06, + "loss": 0.0091, + "step": 3212 + }, + { + "epoch": 2.934246575342466, + "grad_norm": 3.8494346141815186, + "learning_rate": 7.851851851851853e-06, + "loss": 0.0298, + "step": 3213 + }, + { + "epoch": 2.935159817351598, + "grad_norm": 7.675119400024414, + "learning_rate": 7.850837138508372e-06, + "loss": 0.0758, + "step": 3214 + }, + { + "epoch": 2.9360730593607305, + "grad_norm": 30.51547622680664, + "learning_rate": 7.849822425164892e-06, + "loss": 0.0947, + "step": 3215 + }, + { + "epoch": 2.936986301369863, + "grad_norm": 4.152851104736328, + "learning_rate": 7.848807711821411e-06, + "loss": 0.036, + "step": 3216 + }, + { + "epoch": 2.9378995433789954, + "grad_norm": 60.107093811035156, + "learning_rate": 7.847792998477931e-06, + "loss": 0.9212, + "step": 3217 + }, + { + "epoch": 2.938812785388128, + "grad_norm": 71.23222351074219, + "learning_rate": 7.84677828513445e-06, + "loss": 1.2282, + "step": 3218 + }, + { + "epoch": 2.9397260273972603, + "grad_norm": 46.852195739746094, + "learning_rate": 7.845763571790969e-06, + "loss": 0.5671, + "step": 3219 + }, + { + "epoch": 2.9406392694063928, + "grad_norm": 2.952425241470337, + "learning_rate": 7.84474885844749e-06, + "loss": 0.0265, + "step": 3220 + }, + { + "epoch": 2.941552511415525, + "grad_norm": 15.122941017150879, + "learning_rate": 7.843734145104008e-06, + "loss": 0.1306, + "step": 3221 + }, + { + "epoch": 2.9424657534246577, + "grad_norm": 21.04680633544922, + "learning_rate": 7.842719431760527e-06, + "loss": 0.2318, + "step": 3222 + }, + { + "epoch": 2.94337899543379, + "grad_norm": 98.2325210571289, + "learning_rate": 7.841704718417048e-06, + "loss": 2.0453, + "step": 3223 + }, + { + "epoch": 2.944292237442922, + "grad_norm": 0.30369290709495544, + "learning_rate": 7.840690005073567e-06, + "loss": 0.0031, + "step": 3224 + }, + { + "epoch": 2.9452054794520546, + "grad_norm": 0.8364961743354797, + "learning_rate": 7.839675291730087e-06, + "loss": 0.0047, + "step": 3225 + }, + { + "epoch": 2.946118721461187, + "grad_norm": 2.7298941612243652, + "learning_rate": 7.838660578386606e-06, + "loss": 0.0279, + "step": 3226 + }, + { + "epoch": 2.9470319634703195, + "grad_norm": 127.12773895263672, + "learning_rate": 7.837645865043127e-06, + "loss": 7.7623, + "step": 3227 + }, + { + "epoch": 2.947945205479452, + "grad_norm": 18.630483627319336, + "learning_rate": 7.836631151699645e-06, + "loss": 0.255, + "step": 3228 + }, + { + "epoch": 2.9488584474885844, + "grad_norm": 3.939215898513794, + "learning_rate": 7.835616438356164e-06, + "loss": 0.0221, + "step": 3229 + }, + { + "epoch": 2.949771689497717, + "grad_norm": 78.66751861572266, + "learning_rate": 7.834601725012685e-06, + "loss": 1.1899, + "step": 3230 + }, + { + "epoch": 2.9506849315068493, + "grad_norm": 39.1065673828125, + "learning_rate": 7.833587011669204e-06, + "loss": 0.3485, + "step": 3231 + }, + { + "epoch": 2.9515981735159817, + "grad_norm": 54.553871154785156, + "learning_rate": 7.832572298325724e-06, + "loss": 0.8952, + "step": 3232 + }, + { + "epoch": 2.952511415525114, + "grad_norm": 8.567354202270508, + "learning_rate": 7.831557584982243e-06, + "loss": 0.0748, + "step": 3233 + }, + { + "epoch": 2.9534246575342467, + "grad_norm": 42.8589973449707, + "learning_rate": 7.830542871638762e-06, + "loss": 0.6421, + "step": 3234 + }, + { + "epoch": 2.954337899543379, + "grad_norm": 19.834062576293945, + "learning_rate": 7.829528158295282e-06, + "loss": 0.1763, + "step": 3235 + }, + { + "epoch": 2.9552511415525116, + "grad_norm": 14.896092414855957, + "learning_rate": 7.828513444951801e-06, + "loss": 0.1152, + "step": 3236 + }, + { + "epoch": 2.956164383561644, + "grad_norm": 2.2884938716888428, + "learning_rate": 7.827498731608322e-06, + "loss": 0.0167, + "step": 3237 + }, + { + "epoch": 2.9570776255707765, + "grad_norm": 7.848386764526367, + "learning_rate": 7.82648401826484e-06, + "loss": 0.0874, + "step": 3238 + }, + { + "epoch": 2.957990867579909, + "grad_norm": 15.774504661560059, + "learning_rate": 7.82546930492136e-06, + "loss": 0.1751, + "step": 3239 + }, + { + "epoch": 2.958904109589041, + "grad_norm": 45.44812774658203, + "learning_rate": 7.82445459157788e-06, + "loss": 0.7174, + "step": 3240 + }, + { + "epoch": 2.9598173515981734, + "grad_norm": 59.47959518432617, + "learning_rate": 7.823439878234399e-06, + "loss": 1.2928, + "step": 3241 + }, + { + "epoch": 2.960730593607306, + "grad_norm": 7.830738544464111, + "learning_rate": 7.82242516489092e-06, + "loss": 0.0924, + "step": 3242 + }, + { + "epoch": 2.9616438356164383, + "grad_norm": 3.8612406253814697, + "learning_rate": 7.821410451547438e-06, + "loss": 0.0343, + "step": 3243 + }, + { + "epoch": 2.9625570776255707, + "grad_norm": 1.2385950088500977, + "learning_rate": 7.820395738203957e-06, + "loss": 0.0119, + "step": 3244 + }, + { + "epoch": 2.963470319634703, + "grad_norm": 0.3307510018348694, + "learning_rate": 7.819381024860478e-06, + "loss": 0.004, + "step": 3245 + }, + { + "epoch": 2.9643835616438357, + "grad_norm": 68.89839935302734, + "learning_rate": 7.818366311516997e-06, + "loss": 0.4537, + "step": 3246 + }, + { + "epoch": 2.965296803652968, + "grad_norm": 5.570034980773926, + "learning_rate": 7.817351598173517e-06, + "loss": 0.0568, + "step": 3247 + }, + { + "epoch": 2.9662100456621006, + "grad_norm": 14.139936447143555, + "learning_rate": 7.816336884830036e-06, + "loss": 0.112, + "step": 3248 + }, + { + "epoch": 2.967123287671233, + "grad_norm": 16.887123107910156, + "learning_rate": 7.815322171486556e-06, + "loss": 0.1278, + "step": 3249 + }, + { + "epoch": 2.968036529680365, + "grad_norm": 11.108536720275879, + "learning_rate": 7.814307458143075e-06, + "loss": 0.0754, + "step": 3250 + }, + { + "epoch": 2.9689497716894975, + "grad_norm": 21.4368839263916, + "learning_rate": 7.813292744799594e-06, + "loss": 0.243, + "step": 3251 + }, + { + "epoch": 2.96986301369863, + "grad_norm": 52.76861572265625, + "learning_rate": 7.812278031456115e-06, + "loss": 0.5395, + "step": 3252 + }, + { + "epoch": 2.9707762557077624, + "grad_norm": 7.942706108093262, + "learning_rate": 7.811263318112634e-06, + "loss": 0.0679, + "step": 3253 + }, + { + "epoch": 2.971689497716895, + "grad_norm": 15.26542854309082, + "learning_rate": 7.810248604769152e-06, + "loss": 0.1329, + "step": 3254 + }, + { + "epoch": 2.9726027397260273, + "grad_norm": 42.865806579589844, + "learning_rate": 7.809233891425673e-06, + "loss": 0.3993, + "step": 3255 + }, + { + "epoch": 2.9735159817351597, + "grad_norm": 71.90483856201172, + "learning_rate": 7.808219178082192e-06, + "loss": 1.652, + "step": 3256 + }, + { + "epoch": 2.974429223744292, + "grad_norm": 19.545324325561523, + "learning_rate": 7.807204464738712e-06, + "loss": 0.2658, + "step": 3257 + }, + { + "epoch": 2.9753424657534246, + "grad_norm": 18.254114151000977, + "learning_rate": 7.806189751395231e-06, + "loss": 0.1997, + "step": 3258 + }, + { + "epoch": 2.976255707762557, + "grad_norm": 0.16255956888198853, + "learning_rate": 7.805175038051752e-06, + "loss": 0.0018, + "step": 3259 + }, + { + "epoch": 2.9771689497716896, + "grad_norm": 24.044254302978516, + "learning_rate": 7.80416032470827e-06, + "loss": 0.267, + "step": 3260 + }, + { + "epoch": 2.978082191780822, + "grad_norm": 6.822444438934326, + "learning_rate": 7.80314561136479e-06, + "loss": 0.0508, + "step": 3261 + }, + { + "epoch": 2.9789954337899545, + "grad_norm": 11.900712966918945, + "learning_rate": 7.80213089802131e-06, + "loss": 0.1254, + "step": 3262 + }, + { + "epoch": 2.979908675799087, + "grad_norm": 18.833654403686523, + "learning_rate": 7.801116184677829e-06, + "loss": 0.214, + "step": 3263 + }, + { + "epoch": 2.9808219178082194, + "grad_norm": 1.138232946395874, + "learning_rate": 7.800101471334348e-06, + "loss": 0.0106, + "step": 3264 + }, + { + "epoch": 2.981735159817352, + "grad_norm": 26.815479278564453, + "learning_rate": 7.799086757990868e-06, + "loss": 0.3759, + "step": 3265 + }, + { + "epoch": 2.9826484018264843, + "grad_norm": 12.89944839477539, + "learning_rate": 7.798072044647389e-06, + "loss": 0.1229, + "step": 3266 + }, + { + "epoch": 2.9835616438356163, + "grad_norm": 9.030132293701172, + "learning_rate": 7.797057331303908e-06, + "loss": 0.1027, + "step": 3267 + }, + { + "epoch": 2.9844748858447487, + "grad_norm": 43.9536247253418, + "learning_rate": 7.796042617960426e-06, + "loss": 0.3937, + "step": 3268 + }, + { + "epoch": 2.985388127853881, + "grad_norm": 53.7775764465332, + "learning_rate": 7.795027904616947e-06, + "loss": 0.9665, + "step": 3269 + }, + { + "epoch": 2.9863013698630136, + "grad_norm": 5.54789400100708, + "learning_rate": 7.794013191273466e-06, + "loss": 0.0495, + "step": 3270 + }, + { + "epoch": 2.987214611872146, + "grad_norm": 31.575658798217773, + "learning_rate": 7.792998477929985e-06, + "loss": 0.332, + "step": 3271 + }, + { + "epoch": 2.9881278538812786, + "grad_norm": 84.67100524902344, + "learning_rate": 7.791983764586505e-06, + "loss": 3.458, + "step": 3272 + }, + { + "epoch": 2.989041095890411, + "grad_norm": 4.416590690612793, + "learning_rate": 7.790969051243024e-06, + "loss": 0.0376, + "step": 3273 + }, + { + "epoch": 2.9899543378995435, + "grad_norm": 15.614153861999512, + "learning_rate": 7.789954337899543e-06, + "loss": 0.1376, + "step": 3274 + }, + { + "epoch": 2.990867579908676, + "grad_norm": 1.2185002565383911, + "learning_rate": 7.788939624556063e-06, + "loss": 0.0134, + "step": 3275 + }, + { + "epoch": 2.9917808219178084, + "grad_norm": 49.77379608154297, + "learning_rate": 7.787924911212584e-06, + "loss": 1.0812, + "step": 3276 + }, + { + "epoch": 2.9926940639269404, + "grad_norm": 1.4294081926345825, + "learning_rate": 7.786910197869103e-06, + "loss": 0.0163, + "step": 3277 + }, + { + "epoch": 2.993607305936073, + "grad_norm": 7.635512828826904, + "learning_rate": 7.785895484525622e-06, + "loss": 0.0715, + "step": 3278 + }, + { + "epoch": 2.9945205479452053, + "grad_norm": 0.420058012008667, + "learning_rate": 7.784880771182142e-06, + "loss": 0.0042, + "step": 3279 + }, + { + "epoch": 2.9954337899543377, + "grad_norm": 75.74231719970703, + "learning_rate": 7.783866057838661e-06, + "loss": 2.3144, + "step": 3280 + }, + { + "epoch": 2.99634703196347, + "grad_norm": 11.533279418945312, + "learning_rate": 7.78285134449518e-06, + "loss": 0.1023, + "step": 3281 + }, + { + "epoch": 2.9972602739726026, + "grad_norm": 14.197988510131836, + "learning_rate": 7.7818366311517e-06, + "loss": 0.1089, + "step": 3282 + }, + { + "epoch": 2.998173515981735, + "grad_norm": 66.04591369628906, + "learning_rate": 7.78082191780822e-06, + "loss": 2.4271, + "step": 3283 + }, + { + "epoch": 2.9990867579908675, + "grad_norm": 14.031656265258789, + "learning_rate": 7.779807204464738e-06, + "loss": 0.1524, + "step": 3284 + }, + { + "epoch": 3.0, + "grad_norm": 35.00727844238281, + "learning_rate": 7.778792491121259e-06, + "loss": 0.3765, + "step": 3285 + }, + { + "epoch": 3.0009132420091325, + "grad_norm": 1.124720573425293, + "learning_rate": 7.77777777777778e-06, + "loss": 0.0143, + "step": 3286 + }, + { + "epoch": 3.001826484018265, + "grad_norm": 62.71676254272461, + "learning_rate": 7.776763064434298e-06, + "loss": 1.5267, + "step": 3287 + }, + { + "epoch": 3.0027397260273974, + "grad_norm": 5.655252456665039, + "learning_rate": 7.775748351090817e-06, + "loss": 0.0507, + "step": 3288 + }, + { + "epoch": 3.00365296803653, + "grad_norm": 0.24385951459407806, + "learning_rate": 7.774733637747337e-06, + "loss": 0.0023, + "step": 3289 + }, + { + "epoch": 3.0045662100456623, + "grad_norm": 6.278900623321533, + "learning_rate": 7.773718924403856e-06, + "loss": 0.0752, + "step": 3290 + }, + { + "epoch": 3.0054794520547947, + "grad_norm": 71.92940521240234, + "learning_rate": 7.772704211060375e-06, + "loss": 1.56, + "step": 3291 + }, + { + "epoch": 3.0063926940639267, + "grad_norm": 5.098449230194092, + "learning_rate": 7.771689497716896e-06, + "loss": 0.0603, + "step": 3292 + }, + { + "epoch": 3.007305936073059, + "grad_norm": 29.94342041015625, + "learning_rate": 7.770674784373416e-06, + "loss": 0.4751, + "step": 3293 + }, + { + "epoch": 3.0082191780821916, + "grad_norm": 42.115509033203125, + "learning_rate": 7.769660071029933e-06, + "loss": 0.6498, + "step": 3294 + }, + { + "epoch": 3.009132420091324, + "grad_norm": 7.050472736358643, + "learning_rate": 7.768645357686454e-06, + "loss": 0.0846, + "step": 3295 + }, + { + "epoch": 3.0100456621004565, + "grad_norm": 5.791268825531006, + "learning_rate": 7.767630644342974e-06, + "loss": 0.0626, + "step": 3296 + }, + { + "epoch": 3.010958904109589, + "grad_norm": 30.589218139648438, + "learning_rate": 7.766615930999493e-06, + "loss": 0.3347, + "step": 3297 + }, + { + "epoch": 3.0118721461187214, + "grad_norm": 58.423255920410156, + "learning_rate": 7.765601217656012e-06, + "loss": 1.7025, + "step": 3298 + }, + { + "epoch": 3.012785388127854, + "grad_norm": 13.282039642333984, + "learning_rate": 7.764586504312533e-06, + "loss": 0.144, + "step": 3299 + }, + { + "epoch": 3.0136986301369864, + "grad_norm": 2.5507850646972656, + "learning_rate": 7.763571790969051e-06, + "loss": 0.0224, + "step": 3300 + }, + { + "epoch": 3.014611872146119, + "grad_norm": 1.538385033607483, + "learning_rate": 7.76255707762557e-06, + "loss": 0.0201, + "step": 3301 + }, + { + "epoch": 3.0155251141552513, + "grad_norm": 1.0569857358932495, + "learning_rate": 7.761542364282091e-06, + "loss": 0.0112, + "step": 3302 + }, + { + "epoch": 3.0164383561643837, + "grad_norm": 9.0682373046875, + "learning_rate": 7.760527650938611e-06, + "loss": 0.1119, + "step": 3303 + }, + { + "epoch": 3.017351598173516, + "grad_norm": 74.96056365966797, + "learning_rate": 7.75951293759513e-06, + "loss": 2.172, + "step": 3304 + }, + { + "epoch": 3.018264840182648, + "grad_norm": 1.2546732425689697, + "learning_rate": 7.758498224251649e-06, + "loss": 0.0108, + "step": 3305 + }, + { + "epoch": 3.0191780821917806, + "grad_norm": 12.12769889831543, + "learning_rate": 7.75748351090817e-06, + "loss": 0.1052, + "step": 3306 + }, + { + "epoch": 3.020091324200913, + "grad_norm": 14.98312759399414, + "learning_rate": 7.756468797564688e-06, + "loss": 0.1525, + "step": 3307 + }, + { + "epoch": 3.0210045662100455, + "grad_norm": 40.85886764526367, + "learning_rate": 7.755454084221207e-06, + "loss": 0.5298, + "step": 3308 + }, + { + "epoch": 3.021917808219178, + "grad_norm": 1.8579397201538086, + "learning_rate": 7.754439370877728e-06, + "loss": 0.0176, + "step": 3309 + }, + { + "epoch": 3.0228310502283104, + "grad_norm": 24.349349975585938, + "learning_rate": 7.753424657534248e-06, + "loss": 0.1791, + "step": 3310 + }, + { + "epoch": 3.023744292237443, + "grad_norm": 80.60004425048828, + "learning_rate": 7.752409944190766e-06, + "loss": 1.227, + "step": 3311 + }, + { + "epoch": 3.0246575342465754, + "grad_norm": 39.102684020996094, + "learning_rate": 7.751395230847286e-06, + "loss": 0.6765, + "step": 3312 + }, + { + "epoch": 3.025570776255708, + "grad_norm": 13.762646675109863, + "learning_rate": 7.750380517503807e-06, + "loss": 0.1557, + "step": 3313 + }, + { + "epoch": 3.0264840182648403, + "grad_norm": 9.281769752502441, + "learning_rate": 7.749365804160325e-06, + "loss": 0.0984, + "step": 3314 + }, + { + "epoch": 3.0273972602739727, + "grad_norm": 23.197301864624023, + "learning_rate": 7.748351090816844e-06, + "loss": 0.2474, + "step": 3315 + }, + { + "epoch": 3.028310502283105, + "grad_norm": 4.146899700164795, + "learning_rate": 7.747336377473365e-06, + "loss": 0.0268, + "step": 3316 + }, + { + "epoch": 3.0292237442922376, + "grad_norm": 7.674070358276367, + "learning_rate": 7.746321664129884e-06, + "loss": 0.1067, + "step": 3317 + }, + { + "epoch": 3.03013698630137, + "grad_norm": 33.518287658691406, + "learning_rate": 7.745306950786403e-06, + "loss": 0.4665, + "step": 3318 + }, + { + "epoch": 3.031050228310502, + "grad_norm": 14.366250038146973, + "learning_rate": 7.744292237442923e-06, + "loss": 0.1694, + "step": 3319 + }, + { + "epoch": 3.0319634703196345, + "grad_norm": 72.21148681640625, + "learning_rate": 7.743277524099444e-06, + "loss": 1.9468, + "step": 3320 + }, + { + "epoch": 3.032876712328767, + "grad_norm": 10.926424026489258, + "learning_rate": 7.742262810755962e-06, + "loss": 0.1291, + "step": 3321 + }, + { + "epoch": 3.0337899543378994, + "grad_norm": 20.164268493652344, + "learning_rate": 7.741248097412481e-06, + "loss": 0.3594, + "step": 3322 + }, + { + "epoch": 3.034703196347032, + "grad_norm": 44.34687423706055, + "learning_rate": 7.740233384069002e-06, + "loss": 0.7421, + "step": 3323 + }, + { + "epoch": 3.0356164383561643, + "grad_norm": 41.129920959472656, + "learning_rate": 7.73921867072552e-06, + "loss": 0.7871, + "step": 3324 + }, + { + "epoch": 3.036529680365297, + "grad_norm": 21.977678298950195, + "learning_rate": 7.73820395738204e-06, + "loss": 0.317, + "step": 3325 + }, + { + "epoch": 3.0374429223744293, + "grad_norm": 3.315148115158081, + "learning_rate": 7.73718924403856e-06, + "loss": 0.0344, + "step": 3326 + }, + { + "epoch": 3.0383561643835617, + "grad_norm": 21.71779441833496, + "learning_rate": 7.736174530695079e-06, + "loss": 0.2151, + "step": 3327 + }, + { + "epoch": 3.039269406392694, + "grad_norm": 4.540500164031982, + "learning_rate": 7.735159817351598e-06, + "loss": 0.0459, + "step": 3328 + }, + { + "epoch": 3.0401826484018266, + "grad_norm": 34.39149475097656, + "learning_rate": 7.734145104008118e-06, + "loss": 0.7393, + "step": 3329 + }, + { + "epoch": 3.041095890410959, + "grad_norm": 29.581851959228516, + "learning_rate": 7.733130390664639e-06, + "loss": 0.4237, + "step": 3330 + }, + { + "epoch": 3.0420091324200915, + "grad_norm": 9.673309326171875, + "learning_rate": 7.732115677321158e-06, + "loss": 0.1261, + "step": 3331 + }, + { + "epoch": 3.0429223744292235, + "grad_norm": 4.3571882247924805, + "learning_rate": 7.731100963977677e-06, + "loss": 0.0382, + "step": 3332 + }, + { + "epoch": 3.043835616438356, + "grad_norm": 7.215229511260986, + "learning_rate": 7.730086250634197e-06, + "loss": 0.0825, + "step": 3333 + }, + { + "epoch": 3.0447488584474884, + "grad_norm": 43.21517562866211, + "learning_rate": 7.729071537290716e-06, + "loss": 0.4694, + "step": 3334 + }, + { + "epoch": 3.045662100456621, + "grad_norm": 2.7083144187927246, + "learning_rate": 7.728056823947235e-06, + "loss": 0.025, + "step": 3335 + }, + { + "epoch": 3.0465753424657533, + "grad_norm": 33.8132438659668, + "learning_rate": 7.727042110603755e-06, + "loss": 0.3214, + "step": 3336 + }, + { + "epoch": 3.047488584474886, + "grad_norm": 7.755730152130127, + "learning_rate": 7.726027397260276e-06, + "loss": 0.091, + "step": 3337 + }, + { + "epoch": 3.0484018264840183, + "grad_norm": 22.137853622436523, + "learning_rate": 7.725012683916793e-06, + "loss": 0.2873, + "step": 3338 + }, + { + "epoch": 3.0493150684931507, + "grad_norm": 13.151230812072754, + "learning_rate": 7.723997970573314e-06, + "loss": 0.1441, + "step": 3339 + }, + { + "epoch": 3.050228310502283, + "grad_norm": 12.517772674560547, + "learning_rate": 7.722983257229834e-06, + "loss": 0.0892, + "step": 3340 + }, + { + "epoch": 3.0511415525114156, + "grad_norm": 28.747806549072266, + "learning_rate": 7.721968543886353e-06, + "loss": 0.4672, + "step": 3341 + }, + { + "epoch": 3.052054794520548, + "grad_norm": 13.619359970092773, + "learning_rate": 7.720953830542872e-06, + "loss": 0.1532, + "step": 3342 + }, + { + "epoch": 3.0529680365296805, + "grad_norm": 0.6897098422050476, + "learning_rate": 7.719939117199392e-06, + "loss": 0.0068, + "step": 3343 + }, + { + "epoch": 3.053881278538813, + "grad_norm": 1.0780397653579712, + "learning_rate": 7.718924403855911e-06, + "loss": 0.0098, + "step": 3344 + }, + { + "epoch": 3.0547945205479454, + "grad_norm": 66.2275390625, + "learning_rate": 7.71790969051243e-06, + "loss": 1.1227, + "step": 3345 + }, + { + "epoch": 3.0557077625570774, + "grad_norm": 0.7069715261459351, + "learning_rate": 7.71689497716895e-06, + "loss": 0.0054, + "step": 3346 + }, + { + "epoch": 3.05662100456621, + "grad_norm": 1.782910943031311, + "learning_rate": 7.715880263825471e-06, + "loss": 0.0205, + "step": 3347 + }, + { + "epoch": 3.0575342465753423, + "grad_norm": 45.55470657348633, + "learning_rate": 7.71486555048199e-06, + "loss": 1.1839, + "step": 3348 + }, + { + "epoch": 3.058447488584475, + "grad_norm": 31.388355255126953, + "learning_rate": 7.713850837138509e-06, + "loss": 0.2978, + "step": 3349 + }, + { + "epoch": 3.0593607305936072, + "grad_norm": 26.690956115722656, + "learning_rate": 7.71283612379503e-06, + "loss": 0.6797, + "step": 3350 + }, + { + "epoch": 3.0602739726027397, + "grad_norm": 3.5128040313720703, + "learning_rate": 7.711821410451548e-06, + "loss": 0.0464, + "step": 3351 + }, + { + "epoch": 3.061187214611872, + "grad_norm": 12.030360221862793, + "learning_rate": 7.710806697108067e-06, + "loss": 0.0763, + "step": 3352 + }, + { + "epoch": 3.0621004566210046, + "grad_norm": 20.79634666442871, + "learning_rate": 7.709791983764588e-06, + "loss": 0.2148, + "step": 3353 + }, + { + "epoch": 3.063013698630137, + "grad_norm": 17.242521286010742, + "learning_rate": 7.708777270421106e-06, + "loss": 0.2386, + "step": 3354 + }, + { + "epoch": 3.0639269406392695, + "grad_norm": 6.533170223236084, + "learning_rate": 7.707762557077625e-06, + "loss": 0.0488, + "step": 3355 + }, + { + "epoch": 3.064840182648402, + "grad_norm": 0.8924813866615295, + "learning_rate": 7.706747843734146e-06, + "loss": 0.0123, + "step": 3356 + }, + { + "epoch": 3.0657534246575344, + "grad_norm": 2.7396905422210693, + "learning_rate": 7.705733130390666e-06, + "loss": 0.0246, + "step": 3357 + }, + { + "epoch": 3.066666666666667, + "grad_norm": 0.4651915431022644, + "learning_rate": 7.704718417047185e-06, + "loss": 0.0055, + "step": 3358 + }, + { + "epoch": 3.067579908675799, + "grad_norm": 8.008918762207031, + "learning_rate": 7.703703703703704e-06, + "loss": 0.0627, + "step": 3359 + }, + { + "epoch": 3.0684931506849313, + "grad_norm": 22.957313537597656, + "learning_rate": 7.702688990360225e-06, + "loss": 0.474, + "step": 3360 + }, + { + "epoch": 3.069406392694064, + "grad_norm": 39.18450927734375, + "learning_rate": 7.701674277016743e-06, + "loss": 0.478, + "step": 3361 + }, + { + "epoch": 3.0703196347031962, + "grad_norm": 18.00519371032715, + "learning_rate": 7.700659563673262e-06, + "loss": 0.0822, + "step": 3362 + }, + { + "epoch": 3.0712328767123287, + "grad_norm": 8.73715877532959, + "learning_rate": 7.699644850329783e-06, + "loss": 0.089, + "step": 3363 + }, + { + "epoch": 3.072146118721461, + "grad_norm": 30.467164993286133, + "learning_rate": 7.698630136986302e-06, + "loss": 0.2942, + "step": 3364 + }, + { + "epoch": 3.0730593607305936, + "grad_norm": 71.09735107421875, + "learning_rate": 7.697615423642822e-06, + "loss": 2.9905, + "step": 3365 + }, + { + "epoch": 3.073972602739726, + "grad_norm": 0.6421196460723877, + "learning_rate": 7.696600710299341e-06, + "loss": 0.0052, + "step": 3366 + }, + { + "epoch": 3.0748858447488585, + "grad_norm": 0.3885412812232971, + "learning_rate": 7.695585996955862e-06, + "loss": 0.0051, + "step": 3367 + }, + { + "epoch": 3.075799086757991, + "grad_norm": 5.971086025238037, + "learning_rate": 7.69457128361238e-06, + "loss": 0.0542, + "step": 3368 + }, + { + "epoch": 3.0767123287671234, + "grad_norm": 32.89466094970703, + "learning_rate": 7.6935565702689e-06, + "loss": 0.7348, + "step": 3369 + }, + { + "epoch": 3.077625570776256, + "grad_norm": 14.820106506347656, + "learning_rate": 7.69254185692542e-06, + "loss": 0.1686, + "step": 3370 + }, + { + "epoch": 3.0785388127853883, + "grad_norm": 10.313182830810547, + "learning_rate": 7.691527143581939e-06, + "loss": 0.0819, + "step": 3371 + }, + { + "epoch": 3.0794520547945208, + "grad_norm": 25.976545333862305, + "learning_rate": 7.690512430238457e-06, + "loss": 0.2499, + "step": 3372 + }, + { + "epoch": 3.080365296803653, + "grad_norm": 6.274001598358154, + "learning_rate": 7.689497716894978e-06, + "loss": 0.0721, + "step": 3373 + }, + { + "epoch": 3.0812785388127852, + "grad_norm": 34.575138092041016, + "learning_rate": 7.688483003551497e-06, + "loss": 0.4118, + "step": 3374 + }, + { + "epoch": 3.0821917808219177, + "grad_norm": 6.704229354858398, + "learning_rate": 7.687468290208017e-06, + "loss": 0.0725, + "step": 3375 + }, + { + "epoch": 3.08310502283105, + "grad_norm": 25.467632293701172, + "learning_rate": 7.686453576864536e-06, + "loss": 0.2655, + "step": 3376 + }, + { + "epoch": 3.0840182648401826, + "grad_norm": 18.170711517333984, + "learning_rate": 7.685438863521057e-06, + "loss": 0.3718, + "step": 3377 + }, + { + "epoch": 3.084931506849315, + "grad_norm": 24.876968383789062, + "learning_rate": 7.684424150177576e-06, + "loss": 0.3891, + "step": 3378 + }, + { + "epoch": 3.0858447488584475, + "grad_norm": 21.736587524414062, + "learning_rate": 7.683409436834094e-06, + "loss": 0.337, + "step": 3379 + }, + { + "epoch": 3.08675799086758, + "grad_norm": 23.55972671508789, + "learning_rate": 7.682394723490615e-06, + "loss": 0.4304, + "step": 3380 + }, + { + "epoch": 3.0876712328767124, + "grad_norm": 14.68620777130127, + "learning_rate": 7.681380010147134e-06, + "loss": 0.1582, + "step": 3381 + }, + { + "epoch": 3.088584474885845, + "grad_norm": 14.707161903381348, + "learning_rate": 7.680365296803653e-06, + "loss": 0.1511, + "step": 3382 + }, + { + "epoch": 3.0894977168949773, + "grad_norm": 0.4962908625602722, + "learning_rate": 7.679350583460173e-06, + "loss": 0.0051, + "step": 3383 + }, + { + "epoch": 3.0904109589041098, + "grad_norm": 10.5215425491333, + "learning_rate": 7.678335870116692e-06, + "loss": 0.0979, + "step": 3384 + }, + { + "epoch": 3.091324200913242, + "grad_norm": 28.326093673706055, + "learning_rate": 7.677321156773213e-06, + "loss": 0.2327, + "step": 3385 + }, + { + "epoch": 3.0922374429223742, + "grad_norm": 25.23579216003418, + "learning_rate": 7.676306443429731e-06, + "loss": 0.2052, + "step": 3386 + }, + { + "epoch": 3.0931506849315067, + "grad_norm": 16.03923797607422, + "learning_rate": 7.675291730086252e-06, + "loss": 0.1509, + "step": 3387 + }, + { + "epoch": 3.094063926940639, + "grad_norm": 3.8286609649658203, + "learning_rate": 7.67427701674277e-06, + "loss": 0.0284, + "step": 3388 + }, + { + "epoch": 3.0949771689497716, + "grad_norm": 6.851012229919434, + "learning_rate": 7.67326230339929e-06, + "loss": 0.074, + "step": 3389 + }, + { + "epoch": 3.095890410958904, + "grad_norm": 10.3844575881958, + "learning_rate": 7.67224759005581e-06, + "loss": 0.0989, + "step": 3390 + }, + { + "epoch": 3.0968036529680365, + "grad_norm": 2.5858395099639893, + "learning_rate": 7.671232876712329e-06, + "loss": 0.0313, + "step": 3391 + }, + { + "epoch": 3.097716894977169, + "grad_norm": 4.3933024406433105, + "learning_rate": 7.67021816336885e-06, + "loss": 0.0483, + "step": 3392 + }, + { + "epoch": 3.0986301369863014, + "grad_norm": 63.89727783203125, + "learning_rate": 7.669203450025368e-06, + "loss": 2.0521, + "step": 3393 + }, + { + "epoch": 3.099543378995434, + "grad_norm": 0.831802487373352, + "learning_rate": 7.668188736681887e-06, + "loss": 0.0102, + "step": 3394 + }, + { + "epoch": 3.1004566210045663, + "grad_norm": 69.02181243896484, + "learning_rate": 7.667174023338408e-06, + "loss": 0.9281, + "step": 3395 + }, + { + "epoch": 3.1013698630136988, + "grad_norm": 2.1638097763061523, + "learning_rate": 7.666159309994927e-06, + "loss": 0.0153, + "step": 3396 + }, + { + "epoch": 3.1022831050228312, + "grad_norm": 63.31378936767578, + "learning_rate": 7.665144596651447e-06, + "loss": 1.8104, + "step": 3397 + }, + { + "epoch": 3.1031963470319637, + "grad_norm": 240.322509765625, + "learning_rate": 7.664129883307966e-06, + "loss": 0.0948, + "step": 3398 + }, + { + "epoch": 3.1041095890410957, + "grad_norm": 20.482358932495117, + "learning_rate": 7.663115169964485e-06, + "loss": 0.2329, + "step": 3399 + }, + { + "epoch": 3.105022831050228, + "grad_norm": 1.2089966535568237, + "learning_rate": 7.662100456621005e-06, + "loss": 0.0095, + "step": 3400 + }, + { + "epoch": 3.1059360730593606, + "grad_norm": 49.86781692504883, + "learning_rate": 7.661085743277524e-06, + "loss": 0.601, + "step": 3401 + }, + { + "epoch": 3.106849315068493, + "grad_norm": 49.3849983215332, + "learning_rate": 7.660071029934045e-06, + "loss": 1.2734, + "step": 3402 + }, + { + "epoch": 3.1077625570776255, + "grad_norm": 43.58937072753906, + "learning_rate": 7.659056316590564e-06, + "loss": 0.7463, + "step": 3403 + }, + { + "epoch": 3.108675799086758, + "grad_norm": 15.75900936126709, + "learning_rate": 7.658041603247083e-06, + "loss": 0.1826, + "step": 3404 + }, + { + "epoch": 3.1095890410958904, + "grad_norm": 26.68613052368164, + "learning_rate": 7.657026889903603e-06, + "loss": 0.3644, + "step": 3405 + }, + { + "epoch": 3.110502283105023, + "grad_norm": 69.89656066894531, + "learning_rate": 7.656012176560122e-06, + "loss": 4.9837, + "step": 3406 + }, + { + "epoch": 3.1114155251141553, + "grad_norm": 0.8717828392982483, + "learning_rate": 7.654997463216642e-06, + "loss": 0.0056, + "step": 3407 + }, + { + "epoch": 3.1123287671232878, + "grad_norm": 28.202533721923828, + "learning_rate": 7.653982749873161e-06, + "loss": 0.187, + "step": 3408 + }, + { + "epoch": 3.11324200913242, + "grad_norm": 19.049625396728516, + "learning_rate": 7.652968036529682e-06, + "loss": 0.1717, + "step": 3409 + }, + { + "epoch": 3.1141552511415527, + "grad_norm": 1.905434250831604, + "learning_rate": 7.6519533231862e-06, + "loss": 0.0155, + "step": 3410 + }, + { + "epoch": 3.115068493150685, + "grad_norm": 1.4224799871444702, + "learning_rate": 7.65093860984272e-06, + "loss": 0.0164, + "step": 3411 + }, + { + "epoch": 3.115981735159817, + "grad_norm": 67.15062713623047, + "learning_rate": 7.64992389649924e-06, + "loss": 0.6441, + "step": 3412 + }, + { + "epoch": 3.1168949771689496, + "grad_norm": 26.43345069885254, + "learning_rate": 7.648909183155759e-06, + "loss": 0.2305, + "step": 3413 + }, + { + "epoch": 3.117808219178082, + "grad_norm": 3.400460958480835, + "learning_rate": 7.647894469812278e-06, + "loss": 0.0323, + "step": 3414 + }, + { + "epoch": 3.1187214611872145, + "grad_norm": 56.218109130859375, + "learning_rate": 7.646879756468798e-06, + "loss": 0.592, + "step": 3415 + }, + { + "epoch": 3.119634703196347, + "grad_norm": 56.98219299316406, + "learning_rate": 7.645865043125317e-06, + "loss": 1.1656, + "step": 3416 + }, + { + "epoch": 3.1205479452054794, + "grad_norm": 23.883716583251953, + "learning_rate": 7.644850329781838e-06, + "loss": 0.316, + "step": 3417 + }, + { + "epoch": 3.121461187214612, + "grad_norm": 33.753658294677734, + "learning_rate": 7.643835616438356e-06, + "loss": 0.5589, + "step": 3418 + }, + { + "epoch": 3.1223744292237443, + "grad_norm": 73.76075744628906, + "learning_rate": 7.642820903094877e-06, + "loss": 2.6169, + "step": 3419 + }, + { + "epoch": 3.1232876712328768, + "grad_norm": 39.49466323852539, + "learning_rate": 7.641806189751396e-06, + "loss": 0.4804, + "step": 3420 + }, + { + "epoch": 3.124200913242009, + "grad_norm": 11.170639991760254, + "learning_rate": 7.640791476407915e-06, + "loss": 0.1203, + "step": 3421 + }, + { + "epoch": 3.1251141552511417, + "grad_norm": 14.695128440856934, + "learning_rate": 7.639776763064435e-06, + "loss": 0.1731, + "step": 3422 + }, + { + "epoch": 3.126027397260274, + "grad_norm": 4.188926696777344, + "learning_rate": 7.638762049720954e-06, + "loss": 0.0608, + "step": 3423 + }, + { + "epoch": 3.1269406392694066, + "grad_norm": 13.778498649597168, + "learning_rate": 7.637747336377473e-06, + "loss": 0.1742, + "step": 3424 + }, + { + "epoch": 3.127853881278539, + "grad_norm": 35.60322952270508, + "learning_rate": 7.636732623033993e-06, + "loss": 0.7482, + "step": 3425 + }, + { + "epoch": 3.128767123287671, + "grad_norm": 0.3911658227443695, + "learning_rate": 7.635717909690512e-06, + "loss": 0.0043, + "step": 3426 + }, + { + "epoch": 3.1296803652968035, + "grad_norm": 2.8678112030029297, + "learning_rate": 7.634703196347033e-06, + "loss": 0.0319, + "step": 3427 + }, + { + "epoch": 3.130593607305936, + "grad_norm": 5.244919776916504, + "learning_rate": 7.633688483003552e-06, + "loss": 0.064, + "step": 3428 + }, + { + "epoch": 3.1315068493150684, + "grad_norm": 1.7655904293060303, + "learning_rate": 7.632673769660072e-06, + "loss": 0.0198, + "step": 3429 + }, + { + "epoch": 3.132420091324201, + "grad_norm": 1.1342931985855103, + "learning_rate": 7.631659056316591e-06, + "loss": 0.0143, + "step": 3430 + }, + { + "epoch": 3.1333333333333333, + "grad_norm": 0.45797985792160034, + "learning_rate": 7.63064434297311e-06, + "loss": 0.0044, + "step": 3431 + }, + { + "epoch": 3.1342465753424658, + "grad_norm": 2.8702993392944336, + "learning_rate": 7.62962962962963e-06, + "loss": 0.0416, + "step": 3432 + }, + { + "epoch": 3.135159817351598, + "grad_norm": 32.64763259887695, + "learning_rate": 7.62861491628615e-06, + "loss": 0.5029, + "step": 3433 + }, + { + "epoch": 3.1360730593607307, + "grad_norm": 21.119638442993164, + "learning_rate": 7.627600202942669e-06, + "loss": 0.2813, + "step": 3434 + }, + { + "epoch": 3.136986301369863, + "grad_norm": 15.279278755187988, + "learning_rate": 7.626585489599189e-06, + "loss": 0.2261, + "step": 3435 + }, + { + "epoch": 3.1378995433789956, + "grad_norm": 0.9460284113883972, + "learning_rate": 7.625570776255708e-06, + "loss": 0.0074, + "step": 3436 + }, + { + "epoch": 3.138812785388128, + "grad_norm": 7.892376899719238, + "learning_rate": 7.624556062912228e-06, + "loss": 0.0837, + "step": 3437 + }, + { + "epoch": 3.1397260273972605, + "grad_norm": 48.23689651489258, + "learning_rate": 7.623541349568747e-06, + "loss": 1.5374, + "step": 3438 + }, + { + "epoch": 3.1406392694063925, + "grad_norm": 0.048983048647642136, + "learning_rate": 7.622526636225267e-06, + "loss": 0.0005, + "step": 3439 + }, + { + "epoch": 3.141552511415525, + "grad_norm": 11.001269340515137, + "learning_rate": 7.621511922881787e-06, + "loss": 0.0916, + "step": 3440 + }, + { + "epoch": 3.1424657534246574, + "grad_norm": 12.659975051879883, + "learning_rate": 7.620497209538306e-06, + "loss": 0.1412, + "step": 3441 + }, + { + "epoch": 3.14337899543379, + "grad_norm": 1.5726265907287598, + "learning_rate": 7.619482496194826e-06, + "loss": 0.02, + "step": 3442 + }, + { + "epoch": 3.1442922374429223, + "grad_norm": 3.5385849475860596, + "learning_rate": 7.618467782851345e-06, + "loss": 0.0513, + "step": 3443 + }, + { + "epoch": 3.1452054794520548, + "grad_norm": 49.55977249145508, + "learning_rate": 7.617453069507864e-06, + "loss": 1.7506, + "step": 3444 + }, + { + "epoch": 3.146118721461187, + "grad_norm": 14.859722137451172, + "learning_rate": 7.616438356164384e-06, + "loss": 0.1502, + "step": 3445 + }, + { + "epoch": 3.1470319634703197, + "grad_norm": 7.21556282043457, + "learning_rate": 7.615423642820904e-06, + "loss": 0.0563, + "step": 3446 + }, + { + "epoch": 3.147945205479452, + "grad_norm": 9.731677055358887, + "learning_rate": 7.614408929477423e-06, + "loss": 0.1173, + "step": 3447 + }, + { + "epoch": 3.1488584474885846, + "grad_norm": 32.7054328918457, + "learning_rate": 7.613394216133942e-06, + "loss": 0.6513, + "step": 3448 + }, + { + "epoch": 3.149771689497717, + "grad_norm": 0.21388676762580872, + "learning_rate": 7.612379502790463e-06, + "loss": 0.0014, + "step": 3449 + }, + { + "epoch": 3.1506849315068495, + "grad_norm": 36.091278076171875, + "learning_rate": 7.611364789446982e-06, + "loss": 0.4158, + "step": 3450 + }, + { + "epoch": 3.151598173515982, + "grad_norm": 13.597611427307129, + "learning_rate": 7.610350076103501e-06, + "loss": 0.1971, + "step": 3451 + }, + { + "epoch": 3.1525114155251144, + "grad_norm": 30.057085037231445, + "learning_rate": 7.609335362760021e-06, + "loss": 0.4119, + "step": 3452 + }, + { + "epoch": 3.1534246575342464, + "grad_norm": 2.3247897624969482, + "learning_rate": 7.608320649416541e-06, + "loss": 0.0222, + "step": 3453 + }, + { + "epoch": 3.154337899543379, + "grad_norm": 2.1495249271392822, + "learning_rate": 7.6073059360730595e-06, + "loss": 0.0329, + "step": 3454 + }, + { + "epoch": 3.1552511415525113, + "grad_norm": 44.68389892578125, + "learning_rate": 7.606291222729579e-06, + "loss": 0.8151, + "step": 3455 + }, + { + "epoch": 3.1561643835616437, + "grad_norm": 66.24957275390625, + "learning_rate": 7.605276509386099e-06, + "loss": 3.1942, + "step": 3456 + }, + { + "epoch": 3.157077625570776, + "grad_norm": 39.0673828125, + "learning_rate": 7.604261796042619e-06, + "loss": 0.7301, + "step": 3457 + }, + { + "epoch": 3.1579908675799087, + "grad_norm": 10.576455116271973, + "learning_rate": 7.603247082699137e-06, + "loss": 0.1455, + "step": 3458 + }, + { + "epoch": 3.158904109589041, + "grad_norm": 14.59505558013916, + "learning_rate": 7.602232369355658e-06, + "loss": 0.0975, + "step": 3459 + }, + { + "epoch": 3.1598173515981736, + "grad_norm": 14.154467582702637, + "learning_rate": 7.601217656012178e-06, + "loss": 0.1651, + "step": 3460 + }, + { + "epoch": 3.160730593607306, + "grad_norm": 46.89035415649414, + "learning_rate": 7.6002029426686965e-06, + "loss": 0.6737, + "step": 3461 + }, + { + "epoch": 3.1616438356164385, + "grad_norm": 22.47974395751953, + "learning_rate": 7.599188229325216e-06, + "loss": 0.2907, + "step": 3462 + }, + { + "epoch": 3.162557077625571, + "grad_norm": 43.193267822265625, + "learning_rate": 7.598173515981736e-06, + "loss": 0.8143, + "step": 3463 + }, + { + "epoch": 3.1634703196347034, + "grad_norm": 40.80278015136719, + "learning_rate": 7.597158802638255e-06, + "loss": 0.4009, + "step": 3464 + }, + { + "epoch": 3.1643835616438354, + "grad_norm": 1.2435517311096191, + "learning_rate": 7.596144089294774e-06, + "loss": 0.01, + "step": 3465 + }, + { + "epoch": 3.165296803652968, + "grad_norm": 25.4896240234375, + "learning_rate": 7.595129375951294e-06, + "loss": 0.3911, + "step": 3466 + }, + { + "epoch": 3.1662100456621003, + "grad_norm": 38.0204963684082, + "learning_rate": 7.594114662607815e-06, + "loss": 0.9038, + "step": 3467 + }, + { + "epoch": 3.1671232876712327, + "grad_norm": 3.922563076019287, + "learning_rate": 7.5930999492643335e-06, + "loss": 0.0523, + "step": 3468 + }, + { + "epoch": 3.168036529680365, + "grad_norm": 25.26481819152832, + "learning_rate": 7.592085235920853e-06, + "loss": 0.2551, + "step": 3469 + }, + { + "epoch": 3.1689497716894977, + "grad_norm": 1.872641921043396, + "learning_rate": 7.591070522577373e-06, + "loss": 0.0191, + "step": 3470 + }, + { + "epoch": 3.16986301369863, + "grad_norm": 2.9875595569610596, + "learning_rate": 7.590055809233892e-06, + "loss": 0.0212, + "step": 3471 + }, + { + "epoch": 3.1707762557077626, + "grad_norm": 19.218124389648438, + "learning_rate": 7.589041095890411e-06, + "loss": 0.2372, + "step": 3472 + }, + { + "epoch": 3.171689497716895, + "grad_norm": 45.9088249206543, + "learning_rate": 7.588026382546931e-06, + "loss": 0.7726, + "step": 3473 + }, + { + "epoch": 3.1726027397260275, + "grad_norm": 4.793603420257568, + "learning_rate": 7.58701166920345e-06, + "loss": 0.0467, + "step": 3474 + }, + { + "epoch": 3.17351598173516, + "grad_norm": 21.902189254760742, + "learning_rate": 7.58599695585997e-06, + "loss": 0.2783, + "step": 3475 + }, + { + "epoch": 3.1744292237442924, + "grad_norm": 12.75236701965332, + "learning_rate": 7.58498224251649e-06, + "loss": 0.1324, + "step": 3476 + }, + { + "epoch": 3.175342465753425, + "grad_norm": 6.28772497177124, + "learning_rate": 7.58396752917301e-06, + "loss": 0.0705, + "step": 3477 + }, + { + "epoch": 3.1762557077625573, + "grad_norm": 34.690399169921875, + "learning_rate": 7.582952815829529e-06, + "loss": 0.4243, + "step": 3478 + }, + { + "epoch": 3.1771689497716897, + "grad_norm": 24.493379592895508, + "learning_rate": 7.581938102486048e-06, + "loss": 0.2686, + "step": 3479 + }, + { + "epoch": 3.1780821917808217, + "grad_norm": 0.4346271753311157, + "learning_rate": 7.580923389142568e-06, + "loss": 0.0047, + "step": 3480 + }, + { + "epoch": 3.178995433789954, + "grad_norm": 89.44664764404297, + "learning_rate": 7.579908675799087e-06, + "loss": 1.812, + "step": 3481 + }, + { + "epoch": 3.1799086757990866, + "grad_norm": 3.940136432647705, + "learning_rate": 7.578893962455607e-06, + "loss": 0.0453, + "step": 3482 + }, + { + "epoch": 3.180821917808219, + "grad_norm": 1.2753896713256836, + "learning_rate": 7.577879249112126e-06, + "loss": 0.0175, + "step": 3483 + }, + { + "epoch": 3.1817351598173516, + "grad_norm": 2.2602195739746094, + "learning_rate": 7.576864535768645e-06, + "loss": 0.0211, + "step": 3484 + }, + { + "epoch": 3.182648401826484, + "grad_norm": 0.3498055338859558, + "learning_rate": 7.575849822425166e-06, + "loss": 0.003, + "step": 3485 + }, + { + "epoch": 3.1835616438356165, + "grad_norm": 35.02100372314453, + "learning_rate": 7.574835109081685e-06, + "loss": 0.3089, + "step": 3486 + }, + { + "epoch": 3.184474885844749, + "grad_norm": 13.03968334197998, + "learning_rate": 7.573820395738205e-06, + "loss": 0.1152, + "step": 3487 + }, + { + "epoch": 3.1853881278538814, + "grad_norm": 36.713584899902344, + "learning_rate": 7.572805682394724e-06, + "loss": 0.529, + "step": 3488 + }, + { + "epoch": 3.186301369863014, + "grad_norm": 2.048720598220825, + "learning_rate": 7.571790969051244e-06, + "loss": 0.0216, + "step": 3489 + }, + { + "epoch": 3.1872146118721463, + "grad_norm": 2.115630626678467, + "learning_rate": 7.570776255707763e-06, + "loss": 0.0133, + "step": 3490 + }, + { + "epoch": 3.1881278538812787, + "grad_norm": 24.846534729003906, + "learning_rate": 7.569761542364282e-06, + "loss": 0.1116, + "step": 3491 + }, + { + "epoch": 3.1890410958904107, + "grad_norm": 0.3958185017108917, + "learning_rate": 7.568746829020802e-06, + "loss": 0.0039, + "step": 3492 + }, + { + "epoch": 3.189954337899543, + "grad_norm": 52.8867301940918, + "learning_rate": 7.567732115677322e-06, + "loss": 0.5788, + "step": 3493 + }, + { + "epoch": 3.1908675799086756, + "grad_norm": 10.844905853271484, + "learning_rate": 7.56671740233384e-06, + "loss": 0.1076, + "step": 3494 + }, + { + "epoch": 3.191780821917808, + "grad_norm": 16.679485321044922, + "learning_rate": 7.565702688990361e-06, + "loss": 0.2132, + "step": 3495 + }, + { + "epoch": 3.1926940639269406, + "grad_norm": 0.7525991201400757, + "learning_rate": 7.564687975646881e-06, + "loss": 0.0072, + "step": 3496 + }, + { + "epoch": 3.193607305936073, + "grad_norm": 2.604631185531616, + "learning_rate": 7.5636732623034e-06, + "loss": 0.0211, + "step": 3497 + }, + { + "epoch": 3.1945205479452055, + "grad_norm": 1.2738513946533203, + "learning_rate": 7.562658548959919e-06, + "loss": 0.0113, + "step": 3498 + }, + { + "epoch": 3.195433789954338, + "grad_norm": 12.312764167785645, + "learning_rate": 7.561643835616439e-06, + "loss": 0.1448, + "step": 3499 + }, + { + "epoch": 3.1963470319634704, + "grad_norm": 4.854381561279297, + "learning_rate": 7.5606291222729585e-06, + "loss": 0.041, + "step": 3500 + }, + { + "epoch": 3.197260273972603, + "grad_norm": 0.584735631942749, + "learning_rate": 7.559614408929477e-06, + "loss": 0.0043, + "step": 3501 + }, + { + "epoch": 3.1981735159817353, + "grad_norm": 30.510759353637695, + "learning_rate": 7.558599695585997e-06, + "loss": 0.4646, + "step": 3502 + }, + { + "epoch": 3.1990867579908677, + "grad_norm": 0.1504766196012497, + "learning_rate": 7.557584982242518e-06, + "loss": 0.0011, + "step": 3503 + }, + { + "epoch": 3.2, + "grad_norm": 4.328934669494629, + "learning_rate": 7.5565702688990365e-06, + "loss": 0.0428, + "step": 3504 + }, + { + "epoch": 3.2009132420091326, + "grad_norm": 2.133570671081543, + "learning_rate": 7.555555555555556e-06, + "loss": 0.0187, + "step": 3505 + }, + { + "epoch": 3.2018264840182646, + "grad_norm": 17.265993118286133, + "learning_rate": 7.554540842212076e-06, + "loss": 0.2084, + "step": 3506 + }, + { + "epoch": 3.202739726027397, + "grad_norm": 4.200481414794922, + "learning_rate": 7.5535261288685955e-06, + "loss": 0.0351, + "step": 3507 + }, + { + "epoch": 3.2036529680365295, + "grad_norm": 12.485267639160156, + "learning_rate": 7.552511415525114e-06, + "loss": 0.1179, + "step": 3508 + }, + { + "epoch": 3.204566210045662, + "grad_norm": 1.4229270219802856, + "learning_rate": 7.551496702181634e-06, + "loss": 0.0121, + "step": 3509 + }, + { + "epoch": 3.2054794520547945, + "grad_norm": 73.47805786132812, + "learning_rate": 7.550481988838155e-06, + "loss": 2.0438, + "step": 3510 + }, + { + "epoch": 3.206392694063927, + "grad_norm": 11.41339111328125, + "learning_rate": 7.549467275494673e-06, + "loss": 0.0859, + "step": 3511 + }, + { + "epoch": 3.2073059360730594, + "grad_norm": 9.861115455627441, + "learning_rate": 7.548452562151193e-06, + "loss": 0.1197, + "step": 3512 + }, + { + "epoch": 3.208219178082192, + "grad_norm": 2.2843332290649414, + "learning_rate": 7.547437848807713e-06, + "loss": 0.0204, + "step": 3513 + }, + { + "epoch": 3.2091324200913243, + "grad_norm": 8.437621116638184, + "learning_rate": 7.546423135464232e-06, + "loss": 0.0711, + "step": 3514 + }, + { + "epoch": 3.2100456621004567, + "grad_norm": 8.041351318359375, + "learning_rate": 7.545408422120751e-06, + "loss": 0.0746, + "step": 3515 + }, + { + "epoch": 3.210958904109589, + "grad_norm": 2.0196192264556885, + "learning_rate": 7.544393708777271e-06, + "loss": 0.0186, + "step": 3516 + }, + { + "epoch": 3.2118721461187216, + "grad_norm": 10.212859153747559, + "learning_rate": 7.543378995433791e-06, + "loss": 0.0645, + "step": 3517 + }, + { + "epoch": 3.212785388127854, + "grad_norm": 5.333665370941162, + "learning_rate": 7.54236428209031e-06, + "loss": 0.0636, + "step": 3518 + }, + { + "epoch": 3.213698630136986, + "grad_norm": 25.253433227539062, + "learning_rate": 7.541349568746829e-06, + "loss": 0.2908, + "step": 3519 + }, + { + "epoch": 3.2146118721461185, + "grad_norm": 3.7076854705810547, + "learning_rate": 7.54033485540335e-06, + "loss": 0.035, + "step": 3520 + }, + { + "epoch": 3.215525114155251, + "grad_norm": 15.350374221801758, + "learning_rate": 7.539320142059869e-06, + "loss": 0.1248, + "step": 3521 + }, + { + "epoch": 3.2164383561643834, + "grad_norm": 0.32652878761291504, + "learning_rate": 7.538305428716388e-06, + "loss": 0.0026, + "step": 3522 + }, + { + "epoch": 3.217351598173516, + "grad_norm": 36.59679412841797, + "learning_rate": 7.537290715372908e-06, + "loss": 0.3834, + "step": 3523 + }, + { + "epoch": 3.2182648401826484, + "grad_norm": 7.8363800048828125, + "learning_rate": 7.536276002029427e-06, + "loss": 0.0704, + "step": 3524 + }, + { + "epoch": 3.219178082191781, + "grad_norm": 70.27706146240234, + "learning_rate": 7.535261288685947e-06, + "loss": 1.7223, + "step": 3525 + }, + { + "epoch": 3.2200913242009133, + "grad_norm": 4.425665855407715, + "learning_rate": 7.534246575342466e-06, + "loss": 0.0453, + "step": 3526 + }, + { + "epoch": 3.2210045662100457, + "grad_norm": 14.2524995803833, + "learning_rate": 7.533231861998986e-06, + "loss": 0.1542, + "step": 3527 + }, + { + "epoch": 3.221917808219178, + "grad_norm": 1.79550302028656, + "learning_rate": 7.532217148655505e-06, + "loss": 0.0153, + "step": 3528 + }, + { + "epoch": 3.2228310502283106, + "grad_norm": 17.944549560546875, + "learning_rate": 7.531202435312025e-06, + "loss": 0.1665, + "step": 3529 + }, + { + "epoch": 3.223744292237443, + "grad_norm": 0.28567901253700256, + "learning_rate": 7.530187721968545e-06, + "loss": 0.0029, + "step": 3530 + }, + { + "epoch": 3.2246575342465755, + "grad_norm": 17.404109954833984, + "learning_rate": 7.529173008625064e-06, + "loss": 0.1221, + "step": 3531 + }, + { + "epoch": 3.225570776255708, + "grad_norm": 2.586944103240967, + "learning_rate": 7.528158295281584e-06, + "loss": 0.0294, + "step": 3532 + }, + { + "epoch": 3.22648401826484, + "grad_norm": 16.415508270263672, + "learning_rate": 7.527143581938103e-06, + "loss": 0.1593, + "step": 3533 + }, + { + "epoch": 3.2273972602739724, + "grad_norm": 11.84469985961914, + "learning_rate": 7.526128868594622e-06, + "loss": 0.0769, + "step": 3534 + }, + { + "epoch": 3.228310502283105, + "grad_norm": 5.364933490753174, + "learning_rate": 7.525114155251142e-06, + "loss": 0.0543, + "step": 3535 + }, + { + "epoch": 3.2292237442922374, + "grad_norm": 55.78830337524414, + "learning_rate": 7.5240994419076615e-06, + "loss": 0.9031, + "step": 3536 + }, + { + "epoch": 3.23013698630137, + "grad_norm": 17.22977066040039, + "learning_rate": 7.523084728564182e-06, + "loss": 0.2111, + "step": 3537 + }, + { + "epoch": 3.2310502283105023, + "grad_norm": 7.829189777374268, + "learning_rate": 7.5220700152207e-06, + "loss": 0.0545, + "step": 3538 + }, + { + "epoch": 3.2319634703196347, + "grad_norm": 1.7348815202713013, + "learning_rate": 7.521055301877221e-06, + "loss": 0.02, + "step": 3539 + }, + { + "epoch": 3.232876712328767, + "grad_norm": 3.6387507915496826, + "learning_rate": 7.52004058853374e-06, + "loss": 0.0223, + "step": 3540 + }, + { + "epoch": 3.2337899543378996, + "grad_norm": 35.38957214355469, + "learning_rate": 7.519025875190259e-06, + "loss": 0.3954, + "step": 3541 + }, + { + "epoch": 3.234703196347032, + "grad_norm": 20.19486427307129, + "learning_rate": 7.518011161846779e-06, + "loss": 0.1497, + "step": 3542 + }, + { + "epoch": 3.2356164383561645, + "grad_norm": 53.942138671875, + "learning_rate": 7.5169964485032985e-06, + "loss": 1.0082, + "step": 3543 + }, + { + "epoch": 3.236529680365297, + "grad_norm": 15.1071195602417, + "learning_rate": 7.515981735159817e-06, + "loss": 0.1542, + "step": 3544 + }, + { + "epoch": 3.237442922374429, + "grad_norm": 26.46891975402832, + "learning_rate": 7.514967021816337e-06, + "loss": 0.2381, + "step": 3545 + }, + { + "epoch": 3.2383561643835614, + "grad_norm": 0.74367356300354, + "learning_rate": 7.513952308472857e-06, + "loss": 0.006, + "step": 3546 + }, + { + "epoch": 3.239269406392694, + "grad_norm": 79.33977508544922, + "learning_rate": 7.512937595129377e-06, + "loss": 1.5031, + "step": 3547 + }, + { + "epoch": 3.2401826484018263, + "grad_norm": 2.433579206466675, + "learning_rate": 7.511922881785896e-06, + "loss": 0.0222, + "step": 3548 + }, + { + "epoch": 3.241095890410959, + "grad_norm": 3.4793763160705566, + "learning_rate": 7.510908168442416e-06, + "loss": 0.0276, + "step": 3549 + }, + { + "epoch": 3.2420091324200913, + "grad_norm": 9.86294937133789, + "learning_rate": 7.5098934550989355e-06, + "loss": 0.0774, + "step": 3550 + }, + { + "epoch": 3.2429223744292237, + "grad_norm": 46.51811599731445, + "learning_rate": 7.508878741755454e-06, + "loss": 0.7455, + "step": 3551 + }, + { + "epoch": 3.243835616438356, + "grad_norm": 10.628626823425293, + "learning_rate": 7.507864028411974e-06, + "loss": 0.0985, + "step": 3552 + }, + { + "epoch": 3.2447488584474886, + "grad_norm": 3.3174920082092285, + "learning_rate": 7.506849315068494e-06, + "loss": 0.0304, + "step": 3553 + }, + { + "epoch": 3.245662100456621, + "grad_norm": 4.353148937225342, + "learning_rate": 7.505834601725013e-06, + "loss": 0.0282, + "step": 3554 + }, + { + "epoch": 3.2465753424657535, + "grad_norm": 21.35881233215332, + "learning_rate": 7.504819888381532e-06, + "loss": 0.1561, + "step": 3555 + }, + { + "epoch": 3.247488584474886, + "grad_norm": 35.09859848022461, + "learning_rate": 7.503805175038053e-06, + "loss": 0.4471, + "step": 3556 + }, + { + "epoch": 3.2484018264840184, + "grad_norm": 18.99220848083496, + "learning_rate": 7.5027904616945725e-06, + "loss": 0.1909, + "step": 3557 + }, + { + "epoch": 3.249315068493151, + "grad_norm": 2.532351016998291, + "learning_rate": 7.501775748351091e-06, + "loss": 0.024, + "step": 3558 + }, + { + "epoch": 3.2502283105022833, + "grad_norm": 29.798688888549805, + "learning_rate": 7.500761035007611e-06, + "loss": 0.5776, + "step": 3559 + }, + { + "epoch": 3.2511415525114153, + "grad_norm": 13.038468360900879, + "learning_rate": 7.499746321664131e-06, + "loss": 0.0953, + "step": 3560 + }, + { + "epoch": 3.252054794520548, + "grad_norm": 3.116807222366333, + "learning_rate": 7.49873160832065e-06, + "loss": 0.028, + "step": 3561 + }, + { + "epoch": 3.2529680365296803, + "grad_norm": 36.433441162109375, + "learning_rate": 7.497716894977169e-06, + "loss": 0.4054, + "step": 3562 + }, + { + "epoch": 3.2538812785388127, + "grad_norm": 7.381392002105713, + "learning_rate": 7.496702181633689e-06, + "loss": 0.0521, + "step": 3563 + }, + { + "epoch": 3.254794520547945, + "grad_norm": 1.1776835918426514, + "learning_rate": 7.495687468290208e-06, + "loss": 0.0061, + "step": 3564 + }, + { + "epoch": 3.2557077625570776, + "grad_norm": 2.2982635498046875, + "learning_rate": 7.494672754946728e-06, + "loss": 0.0224, + "step": 3565 + }, + { + "epoch": 3.25662100456621, + "grad_norm": 1.5317697525024414, + "learning_rate": 7.493658041603248e-06, + "loss": 0.0132, + "step": 3566 + }, + { + "epoch": 3.2575342465753425, + "grad_norm": 3.9572842121124268, + "learning_rate": 7.492643328259768e-06, + "loss": 0.0349, + "step": 3567 + }, + { + "epoch": 3.258447488584475, + "grad_norm": 16.557687759399414, + "learning_rate": 7.491628614916287e-06, + "loss": 0.137, + "step": 3568 + }, + { + "epoch": 3.2593607305936074, + "grad_norm": 65.80517578125, + "learning_rate": 7.490613901572806e-06, + "loss": 1.2247, + "step": 3569 + }, + { + "epoch": 3.26027397260274, + "grad_norm": 0.17612913250923157, + "learning_rate": 7.489599188229326e-06, + "loss": 0.0015, + "step": 3570 + }, + { + "epoch": 3.2611872146118723, + "grad_norm": 12.335405349731445, + "learning_rate": 7.488584474885845e-06, + "loss": 0.1026, + "step": 3571 + }, + { + "epoch": 3.2621004566210043, + "grad_norm": 6.797485828399658, + "learning_rate": 7.4875697615423645e-06, + "loss": 0.0413, + "step": 3572 + }, + { + "epoch": 3.263013698630137, + "grad_norm": 18.5383243560791, + "learning_rate": 7.486555048198885e-06, + "loss": 0.2042, + "step": 3573 + }, + { + "epoch": 3.2639269406392692, + "grad_norm": 37.07294845581055, + "learning_rate": 7.485540334855403e-06, + "loss": 0.3871, + "step": 3574 + }, + { + "epoch": 3.2648401826484017, + "grad_norm": 32.66383743286133, + "learning_rate": 7.484525621511924e-06, + "loss": 0.2318, + "step": 3575 + }, + { + "epoch": 3.265753424657534, + "grad_norm": 32.96186828613281, + "learning_rate": 7.483510908168443e-06, + "loss": 0.3158, + "step": 3576 + }, + { + "epoch": 3.2666666666666666, + "grad_norm": 3.4811105728149414, + "learning_rate": 7.482496194824963e-06, + "loss": 0.0293, + "step": 3577 + }, + { + "epoch": 3.267579908675799, + "grad_norm": 28.78657341003418, + "learning_rate": 7.481481481481482e-06, + "loss": 0.3909, + "step": 3578 + }, + { + "epoch": 3.2684931506849315, + "grad_norm": 26.912212371826172, + "learning_rate": 7.4804667681380015e-06, + "loss": 0.1491, + "step": 3579 + }, + { + "epoch": 3.269406392694064, + "grad_norm": 3.486870527267456, + "learning_rate": 7.479452054794521e-06, + "loss": 0.0325, + "step": 3580 + }, + { + "epoch": 3.2703196347031964, + "grad_norm": 40.132354736328125, + "learning_rate": 7.47843734145104e-06, + "loss": 0.4598, + "step": 3581 + }, + { + "epoch": 3.271232876712329, + "grad_norm": 1.715198278427124, + "learning_rate": 7.47742262810756e-06, + "loss": 0.0124, + "step": 3582 + }, + { + "epoch": 3.2721461187214613, + "grad_norm": 5.215312957763672, + "learning_rate": 7.47640791476408e-06, + "loss": 0.0568, + "step": 3583 + }, + { + "epoch": 3.273059360730594, + "grad_norm": 2.3393399715423584, + "learning_rate": 7.475393201420599e-06, + "loss": 0.0162, + "step": 3584 + }, + { + "epoch": 3.2739726027397262, + "grad_norm": 0.4026965796947479, + "learning_rate": 7.474378488077119e-06, + "loss": 0.0046, + "step": 3585 + }, + { + "epoch": 3.2748858447488587, + "grad_norm": 5.444977283477783, + "learning_rate": 7.4733637747336385e-06, + "loss": 0.0454, + "step": 3586 + }, + { + "epoch": 3.2757990867579907, + "grad_norm": 27.651521682739258, + "learning_rate": 7.472349061390158e-06, + "loss": 0.3684, + "step": 3587 + }, + { + "epoch": 3.276712328767123, + "grad_norm": 0.43439605832099915, + "learning_rate": 7.471334348046677e-06, + "loss": 0.0043, + "step": 3588 + }, + { + "epoch": 3.2776255707762556, + "grad_norm": 9.587159156799316, + "learning_rate": 7.470319634703197e-06, + "loss": 0.0798, + "step": 3589 + }, + { + "epoch": 3.278538812785388, + "grad_norm": 21.593505859375, + "learning_rate": 7.469304921359717e-06, + "loss": 0.2096, + "step": 3590 + }, + { + "epoch": 3.2794520547945205, + "grad_norm": 2.391850709915161, + "learning_rate": 7.468290208016235e-06, + "loss": 0.0276, + "step": 3591 + }, + { + "epoch": 3.280365296803653, + "grad_norm": 4.429548263549805, + "learning_rate": 7.467275494672756e-06, + "loss": 0.0339, + "step": 3592 + }, + { + "epoch": 3.2812785388127854, + "grad_norm": 11.501486778259277, + "learning_rate": 7.4662607813292755e-06, + "loss": 0.0805, + "step": 3593 + }, + { + "epoch": 3.282191780821918, + "grad_norm": 2.608569383621216, + "learning_rate": 7.465246067985794e-06, + "loss": 0.0187, + "step": 3594 + }, + { + "epoch": 3.2831050228310503, + "grad_norm": 1.5774757862091064, + "learning_rate": 7.464231354642314e-06, + "loss": 0.0142, + "step": 3595 + }, + { + "epoch": 3.2840182648401828, + "grad_norm": 21.461973190307617, + "learning_rate": 7.463216641298834e-06, + "loss": 0.1697, + "step": 3596 + }, + { + "epoch": 3.2849315068493152, + "grad_norm": 22.068880081176758, + "learning_rate": 7.4622019279553534e-06, + "loss": 0.1916, + "step": 3597 + }, + { + "epoch": 3.2858447488584472, + "grad_norm": 20.466014862060547, + "learning_rate": 7.461187214611872e-06, + "loss": 0.1648, + "step": 3598 + }, + { + "epoch": 3.2867579908675797, + "grad_norm": 4.599705219268799, + "learning_rate": 7.460172501268392e-06, + "loss": 0.0447, + "step": 3599 + }, + { + "epoch": 3.287671232876712, + "grad_norm": 15.694513320922852, + "learning_rate": 7.4591577879249125e-06, + "loss": 0.1155, + "step": 3600 + }, + { + "epoch": 3.2885844748858446, + "grad_norm": 2.018209218978882, + "learning_rate": 7.458143074581431e-06, + "loss": 0.0166, + "step": 3601 + }, + { + "epoch": 3.289497716894977, + "grad_norm": 3.1981008052825928, + "learning_rate": 7.457128361237951e-06, + "loss": 0.0222, + "step": 3602 + }, + { + "epoch": 3.2904109589041095, + "grad_norm": 19.474105834960938, + "learning_rate": 7.456113647894471e-06, + "loss": 0.1942, + "step": 3603 + }, + { + "epoch": 3.291324200913242, + "grad_norm": 5.143622875213623, + "learning_rate": 7.4550989345509896e-06, + "loss": 0.0379, + "step": 3604 + }, + { + "epoch": 3.2922374429223744, + "grad_norm": 79.49713897705078, + "learning_rate": 7.454084221207509e-06, + "loss": 1.245, + "step": 3605 + }, + { + "epoch": 3.293150684931507, + "grad_norm": 13.878167152404785, + "learning_rate": 7.453069507864029e-06, + "loss": 0.0881, + "step": 3606 + }, + { + "epoch": 3.2940639269406393, + "grad_norm": 14.716658592224121, + "learning_rate": 7.452054794520549e-06, + "loss": 0.1855, + "step": 3607 + }, + { + "epoch": 3.2949771689497718, + "grad_norm": 78.50464630126953, + "learning_rate": 7.4510400811770675e-06, + "loss": 2.1185, + "step": 3608 + }, + { + "epoch": 3.2958904109589042, + "grad_norm": 14.834837913513184, + "learning_rate": 7.450025367833588e-06, + "loss": 0.1523, + "step": 3609 + }, + { + "epoch": 3.2968036529680367, + "grad_norm": 10.69626235961914, + "learning_rate": 7.449010654490108e-06, + "loss": 0.0958, + "step": 3610 + }, + { + "epoch": 3.297716894977169, + "grad_norm": 37.739112854003906, + "learning_rate": 7.4479959411466266e-06, + "loss": 0.6527, + "step": 3611 + }, + { + "epoch": 3.2986301369863016, + "grad_norm": 4.720743656158447, + "learning_rate": 7.446981227803146e-06, + "loss": 0.0331, + "step": 3612 + }, + { + "epoch": 3.2995433789954336, + "grad_norm": 77.88298797607422, + "learning_rate": 7.445966514459666e-06, + "loss": 1.8776, + "step": 3613 + }, + { + "epoch": 3.300456621004566, + "grad_norm": 10.700353622436523, + "learning_rate": 7.444951801116185e-06, + "loss": 0.0846, + "step": 3614 + }, + { + "epoch": 3.3013698630136985, + "grad_norm": 44.7128791809082, + "learning_rate": 7.4439370877727045e-06, + "loss": 0.3063, + "step": 3615 + }, + { + "epoch": 3.302283105022831, + "grad_norm": 7.858157634735107, + "learning_rate": 7.442922374429224e-06, + "loss": 0.066, + "step": 3616 + }, + { + "epoch": 3.3031963470319634, + "grad_norm": 0.9851614236831665, + "learning_rate": 7.441907661085745e-06, + "loss": 0.006, + "step": 3617 + }, + { + "epoch": 3.304109589041096, + "grad_norm": 1.0160948038101196, + "learning_rate": 7.440892947742263e-06, + "loss": 0.0074, + "step": 3618 + }, + { + "epoch": 3.3050228310502283, + "grad_norm": 11.484540939331055, + "learning_rate": 7.439878234398783e-06, + "loss": 0.0867, + "step": 3619 + }, + { + "epoch": 3.3059360730593608, + "grad_norm": 74.7523422241211, + "learning_rate": 7.438863521055303e-06, + "loss": 1.0107, + "step": 3620 + }, + { + "epoch": 3.3068493150684932, + "grad_norm": 12.995521545410156, + "learning_rate": 7.437848807711822e-06, + "loss": 0.1186, + "step": 3621 + }, + { + "epoch": 3.3077625570776257, + "grad_norm": 8.54312801361084, + "learning_rate": 7.4368340943683415e-06, + "loss": 0.0754, + "step": 3622 + }, + { + "epoch": 3.308675799086758, + "grad_norm": 18.064481735229492, + "learning_rate": 7.435819381024861e-06, + "loss": 0.1418, + "step": 3623 + }, + { + "epoch": 3.3095890410958906, + "grad_norm": 43.95150375366211, + "learning_rate": 7.43480466768138e-06, + "loss": 0.5964, + "step": 3624 + }, + { + "epoch": 3.3105022831050226, + "grad_norm": 27.575672149658203, + "learning_rate": 7.4337899543379e-06, + "loss": 0.2306, + "step": 3625 + }, + { + "epoch": 3.311415525114155, + "grad_norm": 4.444556713104248, + "learning_rate": 7.432775240994419e-06, + "loss": 0.0296, + "step": 3626 + }, + { + "epoch": 3.3123287671232875, + "grad_norm": 9.68759536743164, + "learning_rate": 7.43176052765094e-06, + "loss": 0.0921, + "step": 3627 + }, + { + "epoch": 3.31324200913242, + "grad_norm": 10.669509887695312, + "learning_rate": 7.430745814307459e-06, + "loss": 0.0633, + "step": 3628 + }, + { + "epoch": 3.3141552511415524, + "grad_norm": 2.11781907081604, + "learning_rate": 7.4297311009639785e-06, + "loss": 0.0198, + "step": 3629 + }, + { + "epoch": 3.315068493150685, + "grad_norm": 3.2055585384368896, + "learning_rate": 7.428716387620498e-06, + "loss": 0.0303, + "step": 3630 + }, + { + "epoch": 3.3159817351598173, + "grad_norm": 1.2806391716003418, + "learning_rate": 7.427701674277017e-06, + "loss": 0.0127, + "step": 3631 + }, + { + "epoch": 3.3168949771689498, + "grad_norm": 25.999774932861328, + "learning_rate": 7.426686960933537e-06, + "loss": 0.2356, + "step": 3632 + }, + { + "epoch": 3.317808219178082, + "grad_norm": 76.9455337524414, + "learning_rate": 7.425672247590056e-06, + "loss": 3.9323, + "step": 3633 + }, + { + "epoch": 3.3187214611872147, + "grad_norm": 28.526582717895508, + "learning_rate": 7.424657534246575e-06, + "loss": 0.3303, + "step": 3634 + }, + { + "epoch": 3.319634703196347, + "grad_norm": 14.929758071899414, + "learning_rate": 7.423642820903095e-06, + "loss": 0.1515, + "step": 3635 + }, + { + "epoch": 3.3205479452054796, + "grad_norm": 41.02516555786133, + "learning_rate": 7.4226281075596155e-06, + "loss": 0.4559, + "step": 3636 + }, + { + "epoch": 3.321461187214612, + "grad_norm": 9.71328353881836, + "learning_rate": 7.421613394216135e-06, + "loss": 0.0631, + "step": 3637 + }, + { + "epoch": 3.3223744292237445, + "grad_norm": 0.10615668445825577, + "learning_rate": 7.420598680872654e-06, + "loss": 0.0008, + "step": 3638 + }, + { + "epoch": 3.323287671232877, + "grad_norm": 29.232860565185547, + "learning_rate": 7.419583967529174e-06, + "loss": 0.218, + "step": 3639 + }, + { + "epoch": 3.324200913242009, + "grad_norm": 13.149770736694336, + "learning_rate": 7.418569254185693e-06, + "loss": 0.0838, + "step": 3640 + }, + { + "epoch": 3.3251141552511414, + "grad_norm": 12.68301010131836, + "learning_rate": 7.417554540842212e-06, + "loss": 0.1026, + "step": 3641 + }, + { + "epoch": 3.326027397260274, + "grad_norm": 4.3785223960876465, + "learning_rate": 7.416539827498732e-06, + "loss": 0.0379, + "step": 3642 + }, + { + "epoch": 3.3269406392694063, + "grad_norm": 68.14091491699219, + "learning_rate": 7.415525114155252e-06, + "loss": 0.9539, + "step": 3643 + }, + { + "epoch": 3.3278538812785388, + "grad_norm": 2.1319973468780518, + "learning_rate": 7.4145104008117705e-06, + "loss": 0.0184, + "step": 3644 + }, + { + "epoch": 3.328767123287671, + "grad_norm": 1.690772294998169, + "learning_rate": 7.413495687468291e-06, + "loss": 0.0133, + "step": 3645 + }, + { + "epoch": 3.3296803652968037, + "grad_norm": 3.0462629795074463, + "learning_rate": 7.412480974124811e-06, + "loss": 0.0208, + "step": 3646 + }, + { + "epoch": 3.330593607305936, + "grad_norm": 2.034606456756592, + "learning_rate": 7.41146626078133e-06, + "loss": 0.0161, + "step": 3647 + }, + { + "epoch": 3.3315068493150686, + "grad_norm": 0.9330837726593018, + "learning_rate": 7.410451547437849e-06, + "loss": 0.0064, + "step": 3648 + }, + { + "epoch": 3.332420091324201, + "grad_norm": 94.87248992919922, + "learning_rate": 7.409436834094369e-06, + "loss": 0.8226, + "step": 3649 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.6692129969596863, + "learning_rate": 7.408422120750889e-06, + "loss": 0.0041, + "step": 3650 + }, + { + "epoch": 3.334246575342466, + "grad_norm": 43.048362731933594, + "learning_rate": 7.4074074074074075e-06, + "loss": 0.6751, + "step": 3651 + }, + { + "epoch": 3.335159817351598, + "grad_norm": 2.2803244590759277, + "learning_rate": 7.406392694063927e-06, + "loss": 0.0126, + "step": 3652 + }, + { + "epoch": 3.3360730593607304, + "grad_norm": 1.3087080717086792, + "learning_rate": 7.405377980720448e-06, + "loss": 0.0112, + "step": 3653 + }, + { + "epoch": 3.336986301369863, + "grad_norm": 8.86911392211914, + "learning_rate": 7.404363267376966e-06, + "loss": 0.053, + "step": 3654 + }, + { + "epoch": 3.3378995433789953, + "grad_norm": 50.219215393066406, + "learning_rate": 7.403348554033486e-06, + "loss": 0.8506, + "step": 3655 + }, + { + "epoch": 3.3388127853881278, + "grad_norm": 1.4910390377044678, + "learning_rate": 7.402333840690006e-06, + "loss": 0.0104, + "step": 3656 + }, + { + "epoch": 3.33972602739726, + "grad_norm": 2.5245440006256104, + "learning_rate": 7.401319127346526e-06, + "loss": 0.0159, + "step": 3657 + }, + { + "epoch": 3.3406392694063927, + "grad_norm": 1.7285356521606445, + "learning_rate": 7.4003044140030445e-06, + "loss": 0.0152, + "step": 3658 + }, + { + "epoch": 3.341552511415525, + "grad_norm": 0.7006621360778809, + "learning_rate": 7.399289700659564e-06, + "loss": 0.0064, + "step": 3659 + }, + { + "epoch": 3.3424657534246576, + "grad_norm": 6.2133307456970215, + "learning_rate": 7.398274987316084e-06, + "loss": 0.0142, + "step": 3660 + }, + { + "epoch": 3.34337899543379, + "grad_norm": 12.006552696228027, + "learning_rate": 7.397260273972603e-06, + "loss": 0.0967, + "step": 3661 + }, + { + "epoch": 3.3442922374429225, + "grad_norm": 3.4876954555511475, + "learning_rate": 7.396245560629122e-06, + "loss": 0.0233, + "step": 3662 + }, + { + "epoch": 3.345205479452055, + "grad_norm": 47.15279769897461, + "learning_rate": 7.395230847285643e-06, + "loss": 0.4911, + "step": 3663 + }, + { + "epoch": 3.3461187214611874, + "grad_norm": 0.11928242444992065, + "learning_rate": 7.394216133942162e-06, + "loss": 0.0009, + "step": 3664 + }, + { + "epoch": 3.34703196347032, + "grad_norm": 34.49175262451172, + "learning_rate": 7.3932014205986815e-06, + "loss": 0.1596, + "step": 3665 + }, + { + "epoch": 3.3479452054794523, + "grad_norm": 0.2472149133682251, + "learning_rate": 7.392186707255201e-06, + "loss": 0.0021, + "step": 3666 + }, + { + "epoch": 3.3488584474885843, + "grad_norm": 0.31829604506492615, + "learning_rate": 7.391171993911721e-06, + "loss": 0.0031, + "step": 3667 + }, + { + "epoch": 3.3497716894977168, + "grad_norm": 9.841808319091797, + "learning_rate": 7.39015728056824e-06, + "loss": 0.1, + "step": 3668 + }, + { + "epoch": 3.350684931506849, + "grad_norm": 19.456308364868164, + "learning_rate": 7.389142567224759e-06, + "loss": 0.1921, + "step": 3669 + }, + { + "epoch": 3.3515981735159817, + "grad_norm": 76.02880859375, + "learning_rate": 7.388127853881279e-06, + "loss": 1.1532, + "step": 3670 + }, + { + "epoch": 3.352511415525114, + "grad_norm": 8.1799898147583, + "learning_rate": 7.387113140537798e-06, + "loss": 0.0916, + "step": 3671 + }, + { + "epoch": 3.3534246575342466, + "grad_norm": 8.138143539428711, + "learning_rate": 7.3860984271943185e-06, + "loss": 0.0819, + "step": 3672 + }, + { + "epoch": 3.354337899543379, + "grad_norm": 44.29637908935547, + "learning_rate": 7.385083713850838e-06, + "loss": 0.511, + "step": 3673 + }, + { + "epoch": 3.3552511415525115, + "grad_norm": 7.990817070007324, + "learning_rate": 7.384069000507357e-06, + "loss": 0.0706, + "step": 3674 + }, + { + "epoch": 3.356164383561644, + "grad_norm": 2.1027023792266846, + "learning_rate": 7.383054287163877e-06, + "loss": 0.0114, + "step": 3675 + }, + { + "epoch": 3.3570776255707764, + "grad_norm": 15.210891723632812, + "learning_rate": 7.382039573820396e-06, + "loss": 0.0946, + "step": 3676 + }, + { + "epoch": 3.357990867579909, + "grad_norm": 8.807382583618164, + "learning_rate": 7.381024860476916e-06, + "loss": 0.0814, + "step": 3677 + }, + { + "epoch": 3.3589041095890413, + "grad_norm": 1.0962740182876587, + "learning_rate": 7.380010147133435e-06, + "loss": 0.0091, + "step": 3678 + }, + { + "epoch": 3.3598173515981733, + "grad_norm": 2.0246317386627197, + "learning_rate": 7.378995433789955e-06, + "loss": 0.0173, + "step": 3679 + }, + { + "epoch": 3.3607305936073057, + "grad_norm": 3.9636058807373047, + "learning_rate": 7.377980720446475e-06, + "loss": 0.0194, + "step": 3680 + }, + { + "epoch": 3.361643835616438, + "grad_norm": 13.404891967773438, + "learning_rate": 7.376966007102994e-06, + "loss": 0.0827, + "step": 3681 + }, + { + "epoch": 3.3625570776255707, + "grad_norm": 98.78411102294922, + "learning_rate": 7.375951293759514e-06, + "loss": 1.1256, + "step": 3682 + }, + { + "epoch": 3.363470319634703, + "grad_norm": 32.00291061401367, + "learning_rate": 7.374936580416033e-06, + "loss": 0.2668, + "step": 3683 + }, + { + "epoch": 3.3643835616438356, + "grad_norm": 4.3558430671691895, + "learning_rate": 7.373921867072552e-06, + "loss": 0.042, + "step": 3684 + }, + { + "epoch": 3.365296803652968, + "grad_norm": 35.261497497558594, + "learning_rate": 7.372907153729072e-06, + "loss": 0.2668, + "step": 3685 + }, + { + "epoch": 3.3662100456621005, + "grad_norm": 0.8153903484344482, + "learning_rate": 7.371892440385592e-06, + "loss": 0.0079, + "step": 3686 + }, + { + "epoch": 3.367123287671233, + "grad_norm": 24.81711196899414, + "learning_rate": 7.370877727042111e-06, + "loss": 0.1708, + "step": 3687 + }, + { + "epoch": 3.3680365296803654, + "grad_norm": 32.83957290649414, + "learning_rate": 7.36986301369863e-06, + "loss": 0.3356, + "step": 3688 + }, + { + "epoch": 3.368949771689498, + "grad_norm": 48.84785461425781, + "learning_rate": 7.368848300355151e-06, + "loss": 0.4026, + "step": 3689 + }, + { + "epoch": 3.3698630136986303, + "grad_norm": 11.289193153381348, + "learning_rate": 7.36783358701167e-06, + "loss": 0.0615, + "step": 3690 + }, + { + "epoch": 3.3707762557077627, + "grad_norm": 9.09987735748291, + "learning_rate": 7.366818873668189e-06, + "loss": 0.0901, + "step": 3691 + }, + { + "epoch": 3.371689497716895, + "grad_norm": 1.154522180557251, + "learning_rate": 7.365804160324709e-06, + "loss": 0.007, + "step": 3692 + }, + { + "epoch": 3.3726027397260276, + "grad_norm": 84.81550598144531, + "learning_rate": 7.364789446981229e-06, + "loss": 0.9875, + "step": 3693 + }, + { + "epoch": 3.3735159817351597, + "grad_norm": 13.37501335144043, + "learning_rate": 7.3637747336377475e-06, + "loss": 0.117, + "step": 3694 + }, + { + "epoch": 3.374429223744292, + "grad_norm": 31.37482452392578, + "learning_rate": 7.362760020294267e-06, + "loss": 0.3591, + "step": 3695 + }, + { + "epoch": 3.3753424657534246, + "grad_norm": 75.90715026855469, + "learning_rate": 7.361745306950787e-06, + "loss": 0.4819, + "step": 3696 + }, + { + "epoch": 3.376255707762557, + "grad_norm": 4.04400634765625, + "learning_rate": 7.360730593607307e-06, + "loss": 0.0267, + "step": 3697 + }, + { + "epoch": 3.3771689497716895, + "grad_norm": 77.6044921875, + "learning_rate": 7.359715880263825e-06, + "loss": 0.6247, + "step": 3698 + }, + { + "epoch": 3.378082191780822, + "grad_norm": 12.47231388092041, + "learning_rate": 7.358701166920346e-06, + "loss": 0.104, + "step": 3699 + }, + { + "epoch": 3.3789954337899544, + "grad_norm": 88.83605194091797, + "learning_rate": 7.357686453576866e-06, + "loss": 2.1296, + "step": 3700 + }, + { + "epoch": 3.379908675799087, + "grad_norm": 100.72651672363281, + "learning_rate": 7.3566717402333845e-06, + "loss": 4.3216, + "step": 3701 + }, + { + "epoch": 3.3808219178082193, + "grad_norm": 10.100130081176758, + "learning_rate": 7.355657026889904e-06, + "loss": 0.0691, + "step": 3702 + }, + { + "epoch": 3.3817351598173517, + "grad_norm": 4.334282398223877, + "learning_rate": 7.354642313546424e-06, + "loss": 0.0321, + "step": 3703 + }, + { + "epoch": 3.382648401826484, + "grad_norm": 7.886394023895264, + "learning_rate": 7.353627600202943e-06, + "loss": 0.066, + "step": 3704 + }, + { + "epoch": 3.383561643835616, + "grad_norm": 45.271934509277344, + "learning_rate": 7.352612886859462e-06, + "loss": 0.369, + "step": 3705 + }, + { + "epoch": 3.3844748858447486, + "grad_norm": 125.6565933227539, + "learning_rate": 7.351598173515982e-06, + "loss": 3.3545, + "step": 3706 + }, + { + "epoch": 3.385388127853881, + "grad_norm": 33.0079460144043, + "learning_rate": 7.350583460172503e-06, + "loss": 0.5109, + "step": 3707 + }, + { + "epoch": 3.3863013698630136, + "grad_norm": 73.44644927978516, + "learning_rate": 7.3495687468290215e-06, + "loss": 0.7355, + "step": 3708 + }, + { + "epoch": 3.387214611872146, + "grad_norm": 8.962692260742188, + "learning_rate": 7.348554033485541e-06, + "loss": 0.0749, + "step": 3709 + }, + { + "epoch": 3.3881278538812785, + "grad_norm": 0.9368263483047485, + "learning_rate": 7.347539320142061e-06, + "loss": 0.0057, + "step": 3710 + }, + { + "epoch": 3.389041095890411, + "grad_norm": 6.582111358642578, + "learning_rate": 7.34652460679858e-06, + "loss": 0.0423, + "step": 3711 + }, + { + "epoch": 3.3899543378995434, + "grad_norm": 12.929895401000977, + "learning_rate": 7.345509893455099e-06, + "loss": 0.1326, + "step": 3712 + }, + { + "epoch": 3.390867579908676, + "grad_norm": 66.59628295898438, + "learning_rate": 7.344495180111619e-06, + "loss": 0.3823, + "step": 3713 + }, + { + "epoch": 3.3917808219178083, + "grad_norm": 17.82769012451172, + "learning_rate": 7.343480466768138e-06, + "loss": 0.1279, + "step": 3714 + }, + { + "epoch": 3.3926940639269407, + "grad_norm": 52.607608795166016, + "learning_rate": 7.342465753424658e-06, + "loss": 0.3172, + "step": 3715 + }, + { + "epoch": 3.393607305936073, + "grad_norm": 3.589524507522583, + "learning_rate": 7.341451040081178e-06, + "loss": 0.0281, + "step": 3716 + }, + { + "epoch": 3.3945205479452056, + "grad_norm": 72.50357818603516, + "learning_rate": 7.340436326737698e-06, + "loss": 1.0669, + "step": 3717 + }, + { + "epoch": 3.395433789954338, + "grad_norm": 8.398853302001953, + "learning_rate": 7.339421613394217e-06, + "loss": 0.0701, + "step": 3718 + }, + { + "epoch": 3.3963470319634705, + "grad_norm": 57.733253479003906, + "learning_rate": 7.338406900050736e-06, + "loss": 0.8343, + "step": 3719 + }, + { + "epoch": 3.3972602739726026, + "grad_norm": 29.902555465698242, + "learning_rate": 7.337392186707256e-06, + "loss": 0.229, + "step": 3720 + }, + { + "epoch": 3.398173515981735, + "grad_norm": 86.1796875, + "learning_rate": 7.336377473363775e-06, + "loss": 1.0109, + "step": 3721 + }, + { + "epoch": 3.3990867579908675, + "grad_norm": 60.246253967285156, + "learning_rate": 7.335362760020295e-06, + "loss": 0.883, + "step": 3722 + }, + { + "epoch": 3.4, + "grad_norm": 106.66954040527344, + "learning_rate": 7.334348046676814e-06, + "loss": 1.8537, + "step": 3723 + }, + { + "epoch": 3.4009132420091324, + "grad_norm": 6.590406894683838, + "learning_rate": 7.333333333333333e-06, + "loss": 0.0398, + "step": 3724 + }, + { + "epoch": 3.401826484018265, + "grad_norm": 35.09221267700195, + "learning_rate": 7.332318619989854e-06, + "loss": 0.2227, + "step": 3725 + }, + { + "epoch": 3.4027397260273973, + "grad_norm": 65.13378143310547, + "learning_rate": 7.331303906646373e-06, + "loss": 0.4786, + "step": 3726 + }, + { + "epoch": 3.4036529680365297, + "grad_norm": 10.791385650634766, + "learning_rate": 7.330289193302893e-06, + "loss": 0.1107, + "step": 3727 + }, + { + "epoch": 3.404566210045662, + "grad_norm": 3.161222219467163, + "learning_rate": 7.329274479959412e-06, + "loss": 0.0242, + "step": 3728 + }, + { + "epoch": 3.4054794520547946, + "grad_norm": 2.2444350719451904, + "learning_rate": 7.328259766615932e-06, + "loss": 0.0205, + "step": 3729 + }, + { + "epoch": 3.406392694063927, + "grad_norm": 1.6596577167510986, + "learning_rate": 7.327245053272451e-06, + "loss": 0.0126, + "step": 3730 + }, + { + "epoch": 3.4073059360730595, + "grad_norm": 34.61076736450195, + "learning_rate": 7.32623033992897e-06, + "loss": 0.4013, + "step": 3731 + }, + { + "epoch": 3.4082191780821915, + "grad_norm": 80.0744857788086, + "learning_rate": 7.32521562658549e-06, + "loss": 0.626, + "step": 3732 + }, + { + "epoch": 3.409132420091324, + "grad_norm": 23.68175506591797, + "learning_rate": 7.32420091324201e-06, + "loss": 0.1899, + "step": 3733 + }, + { + "epoch": 3.4100456621004565, + "grad_norm": 41.74819564819336, + "learning_rate": 7.323186199898528e-06, + "loss": 0.3092, + "step": 3734 + }, + { + "epoch": 3.410958904109589, + "grad_norm": 61.07679748535156, + "learning_rate": 7.322171486555049e-06, + "loss": 0.6049, + "step": 3735 + }, + { + "epoch": 3.4118721461187214, + "grad_norm": 63.00529098510742, + "learning_rate": 7.321156773211569e-06, + "loss": 0.384, + "step": 3736 + }, + { + "epoch": 3.412785388127854, + "grad_norm": 89.58842468261719, + "learning_rate": 7.320142059868088e-06, + "loss": 1.3843, + "step": 3737 + }, + { + "epoch": 3.4136986301369863, + "grad_norm": 12.406987190246582, + "learning_rate": 7.319127346524607e-06, + "loss": 0.0646, + "step": 3738 + }, + { + "epoch": 3.4146118721461187, + "grad_norm": 2.349290609359741, + "learning_rate": 7.318112633181127e-06, + "loss": 0.0209, + "step": 3739 + }, + { + "epoch": 3.415525114155251, + "grad_norm": 21.688228607177734, + "learning_rate": 7.3170979198376465e-06, + "loss": 0.2049, + "step": 3740 + }, + { + "epoch": 3.4164383561643836, + "grad_norm": 20.18193244934082, + "learning_rate": 7.316083206494165e-06, + "loss": 0.1457, + "step": 3741 + }, + { + "epoch": 3.417351598173516, + "grad_norm": 16.84245491027832, + "learning_rate": 7.315068493150685e-06, + "loss": 0.1652, + "step": 3742 + }, + { + "epoch": 3.4182648401826485, + "grad_norm": 1.2702491283416748, + "learning_rate": 7.314053779807206e-06, + "loss": 0.0108, + "step": 3743 + }, + { + "epoch": 3.419178082191781, + "grad_norm": 39.25830841064453, + "learning_rate": 7.3130390664637244e-06, + "loss": 0.4649, + "step": 3744 + }, + { + "epoch": 3.4200913242009134, + "grad_norm": 33.41330337524414, + "learning_rate": 7.312024353120244e-06, + "loss": 0.3848, + "step": 3745 + }, + { + "epoch": 3.421004566210046, + "grad_norm": 87.26612854003906, + "learning_rate": 7.311009639776764e-06, + "loss": 0.8527, + "step": 3746 + }, + { + "epoch": 3.421917808219178, + "grad_norm": 15.412285804748535, + "learning_rate": 7.3099949264332835e-06, + "loss": 0.0842, + "step": 3747 + }, + { + "epoch": 3.4228310502283104, + "grad_norm": 31.928558349609375, + "learning_rate": 7.308980213089802e-06, + "loss": 0.2972, + "step": 3748 + }, + { + "epoch": 3.423744292237443, + "grad_norm": 25.243492126464844, + "learning_rate": 7.307965499746322e-06, + "loss": 0.2404, + "step": 3749 + }, + { + "epoch": 3.4246575342465753, + "grad_norm": 30.23314094543457, + "learning_rate": 7.306950786402842e-06, + "loss": 0.1743, + "step": 3750 + }, + { + "epoch": 3.4255707762557077, + "grad_norm": 24.982467651367188, + "learning_rate": 7.305936073059361e-06, + "loss": 0.2102, + "step": 3751 + }, + { + "epoch": 3.42648401826484, + "grad_norm": 10.010111808776855, + "learning_rate": 7.304921359715881e-06, + "loss": 0.0775, + "step": 3752 + }, + { + "epoch": 3.4273972602739726, + "grad_norm": 68.42272186279297, + "learning_rate": 7.303906646372401e-06, + "loss": 0.7454, + "step": 3753 + }, + { + "epoch": 3.428310502283105, + "grad_norm": 1.923167109489441, + "learning_rate": 7.30289193302892e-06, + "loss": 0.0183, + "step": 3754 + }, + { + "epoch": 3.4292237442922375, + "grad_norm": 12.223678588867188, + "learning_rate": 7.301877219685439e-06, + "loss": 0.127, + "step": 3755 + }, + { + "epoch": 3.43013698630137, + "grad_norm": 72.10932922363281, + "learning_rate": 7.300862506341959e-06, + "loss": 0.8367, + "step": 3756 + }, + { + "epoch": 3.4310502283105024, + "grad_norm": 84.68018341064453, + "learning_rate": 7.299847792998479e-06, + "loss": 1.4332, + "step": 3757 + }, + { + "epoch": 3.431963470319635, + "grad_norm": 2.4010162353515625, + "learning_rate": 7.298833079654998e-06, + "loss": 0.0191, + "step": 3758 + }, + { + "epoch": 3.432876712328767, + "grad_norm": 22.2031192779541, + "learning_rate": 7.297818366311517e-06, + "loss": 0.102, + "step": 3759 + }, + { + "epoch": 3.4337899543378994, + "grad_norm": 53.11552429199219, + "learning_rate": 7.296803652968038e-06, + "loss": 0.338, + "step": 3760 + }, + { + "epoch": 3.434703196347032, + "grad_norm": 0.10757207870483398, + "learning_rate": 7.295788939624557e-06, + "loss": 0.0006, + "step": 3761 + }, + { + "epoch": 3.4356164383561643, + "grad_norm": 0.10518930852413177, + "learning_rate": 7.294774226281076e-06, + "loss": 0.0008, + "step": 3762 + }, + { + "epoch": 3.4365296803652967, + "grad_norm": 62.0676155090332, + "learning_rate": 7.293759512937596e-06, + "loss": 0.3015, + "step": 3763 + }, + { + "epoch": 3.437442922374429, + "grad_norm": 0.12484054267406464, + "learning_rate": 7.292744799594115e-06, + "loss": 0.0006, + "step": 3764 + }, + { + "epoch": 3.4383561643835616, + "grad_norm": 25.775836944580078, + "learning_rate": 7.291730086250635e-06, + "loss": 0.1027, + "step": 3765 + }, + { + "epoch": 3.439269406392694, + "grad_norm": 4.177459239959717, + "learning_rate": 7.290715372907154e-06, + "loss": 0.0325, + "step": 3766 + }, + { + "epoch": 3.4401826484018265, + "grad_norm": 0.3062857985496521, + "learning_rate": 7.289700659563674e-06, + "loss": 0.0017, + "step": 3767 + }, + { + "epoch": 3.441095890410959, + "grad_norm": 4.872993469238281, + "learning_rate": 7.288685946220193e-06, + "loss": 0.0356, + "step": 3768 + }, + { + "epoch": 3.4420091324200914, + "grad_norm": 1.8937301635742188, + "learning_rate": 7.287671232876713e-06, + "loss": 0.0135, + "step": 3769 + }, + { + "epoch": 3.442922374429224, + "grad_norm": 11.737838745117188, + "learning_rate": 7.286656519533233e-06, + "loss": 0.0742, + "step": 3770 + }, + { + "epoch": 3.4438356164383563, + "grad_norm": 26.676191329956055, + "learning_rate": 7.285641806189752e-06, + "loss": 0.3435, + "step": 3771 + }, + { + "epoch": 3.444748858447489, + "grad_norm": 52.98796081542969, + "learning_rate": 7.284627092846272e-06, + "loss": 0.1976, + "step": 3772 + }, + { + "epoch": 3.4456621004566212, + "grad_norm": 5.995681285858154, + "learning_rate": 7.283612379502791e-06, + "loss": 0.049, + "step": 3773 + }, + { + "epoch": 3.4465753424657533, + "grad_norm": 22.733205795288086, + "learning_rate": 7.28259766615931e-06, + "loss": 0.155, + "step": 3774 + }, + { + "epoch": 3.4474885844748857, + "grad_norm": 32.6937255859375, + "learning_rate": 7.28158295281583e-06, + "loss": 0.3175, + "step": 3775 + }, + { + "epoch": 3.448401826484018, + "grad_norm": 54.10548400878906, + "learning_rate": 7.2805682394723495e-06, + "loss": 0.3465, + "step": 3776 + }, + { + "epoch": 3.4493150684931506, + "grad_norm": 76.88023376464844, + "learning_rate": 7.27955352612887e-06, + "loss": 1.1688, + "step": 3777 + }, + { + "epoch": 3.450228310502283, + "grad_norm": 5.1652984619140625, + "learning_rate": 7.278538812785388e-06, + "loss": 0.0383, + "step": 3778 + }, + { + "epoch": 3.4511415525114155, + "grad_norm": 11.179449081420898, + "learning_rate": 7.277524099441909e-06, + "loss": 0.0648, + "step": 3779 + }, + { + "epoch": 3.452054794520548, + "grad_norm": 2.0969603061676025, + "learning_rate": 7.276509386098428e-06, + "loss": 0.0089, + "step": 3780 + }, + { + "epoch": 3.4529680365296804, + "grad_norm": 3.0927321910858154, + "learning_rate": 7.275494672754947e-06, + "loss": 0.0196, + "step": 3781 + }, + { + "epoch": 3.453881278538813, + "grad_norm": 68.31182861328125, + "learning_rate": 7.274479959411467e-06, + "loss": 0.7148, + "step": 3782 + }, + { + "epoch": 3.4547945205479453, + "grad_norm": 6.6343092918396, + "learning_rate": 7.2734652460679865e-06, + "loss": 0.0504, + "step": 3783 + }, + { + "epoch": 3.455707762557078, + "grad_norm": 7.793300151824951, + "learning_rate": 7.272450532724505e-06, + "loss": 0.0459, + "step": 3784 + }, + { + "epoch": 3.45662100456621, + "grad_norm": 1.9552077054977417, + "learning_rate": 7.271435819381025e-06, + "loss": 0.0131, + "step": 3785 + }, + { + "epoch": 3.4575342465753423, + "grad_norm": 125.25228881835938, + "learning_rate": 7.270421106037545e-06, + "loss": 1.2114, + "step": 3786 + }, + { + "epoch": 3.4584474885844747, + "grad_norm": 111.69233703613281, + "learning_rate": 7.269406392694065e-06, + "loss": 1.8548, + "step": 3787 + }, + { + "epoch": 3.459360730593607, + "grad_norm": 63.71443557739258, + "learning_rate": 7.268391679350584e-06, + "loss": 0.9247, + "step": 3788 + }, + { + "epoch": 3.4602739726027396, + "grad_norm": 50.90168762207031, + "learning_rate": 7.267376966007104e-06, + "loss": 0.1919, + "step": 3789 + }, + { + "epoch": 3.461187214611872, + "grad_norm": 75.64427947998047, + "learning_rate": 7.2663622526636235e-06, + "loss": 1.6786, + "step": 3790 + }, + { + "epoch": 3.4621004566210045, + "grad_norm": 1.5558748245239258, + "learning_rate": 7.265347539320142e-06, + "loss": 0.0108, + "step": 3791 + }, + { + "epoch": 3.463013698630137, + "grad_norm": 1.2314780950546265, + "learning_rate": 7.264332825976662e-06, + "loss": 0.0137, + "step": 3792 + }, + { + "epoch": 3.4639269406392694, + "grad_norm": 79.94889068603516, + "learning_rate": 7.263318112633182e-06, + "loss": 1.8883, + "step": 3793 + }, + { + "epoch": 3.464840182648402, + "grad_norm": 2.1431400775909424, + "learning_rate": 7.2623033992897006e-06, + "loss": 0.0112, + "step": 3794 + }, + { + "epoch": 3.4657534246575343, + "grad_norm": 125.04882049560547, + "learning_rate": 7.26128868594622e-06, + "loss": 1.6847, + "step": 3795 + }, + { + "epoch": 3.466666666666667, + "grad_norm": 105.94441223144531, + "learning_rate": 7.260273972602741e-06, + "loss": 1.8273, + "step": 3796 + }, + { + "epoch": 3.4675799086757992, + "grad_norm": 27.199825286865234, + "learning_rate": 7.2592592592592605e-06, + "loss": 0.2507, + "step": 3797 + }, + { + "epoch": 3.4684931506849317, + "grad_norm": 11.024593353271484, + "learning_rate": 7.258244545915779e-06, + "loss": 0.066, + "step": 3798 + }, + { + "epoch": 3.469406392694064, + "grad_norm": 2.549783706665039, + "learning_rate": 7.257229832572299e-06, + "loss": 0.021, + "step": 3799 + }, + { + "epoch": 3.470319634703196, + "grad_norm": 5.815380573272705, + "learning_rate": 7.256215119228819e-06, + "loss": 0.0627, + "step": 3800 + }, + { + "epoch": 3.4712328767123286, + "grad_norm": 5.442661285400391, + "learning_rate": 7.2552004058853376e-06, + "loss": 0.0289, + "step": 3801 + }, + { + "epoch": 3.472146118721461, + "grad_norm": 1.2726701498031616, + "learning_rate": 7.254185692541857e-06, + "loss": 0.0115, + "step": 3802 + }, + { + "epoch": 3.4730593607305935, + "grad_norm": 6.768798828125, + "learning_rate": 7.253170979198377e-06, + "loss": 0.0485, + "step": 3803 + }, + { + "epoch": 3.473972602739726, + "grad_norm": 36.20145034790039, + "learning_rate": 7.252156265854896e-06, + "loss": 0.314, + "step": 3804 + }, + { + "epoch": 3.4748858447488584, + "grad_norm": 0.7726436257362366, + "learning_rate": 7.251141552511416e-06, + "loss": 0.0057, + "step": 3805 + }, + { + "epoch": 3.475799086757991, + "grad_norm": 14.488198280334473, + "learning_rate": 7.250126839167936e-06, + "loss": 0.1129, + "step": 3806 + }, + { + "epoch": 3.4767123287671233, + "grad_norm": 5.180599689483643, + "learning_rate": 7.249112125824456e-06, + "loss": 0.0354, + "step": 3807 + }, + { + "epoch": 3.477625570776256, + "grad_norm": 77.6166000366211, + "learning_rate": 7.2480974124809746e-06, + "loss": 2.2974, + "step": 3808 + }, + { + "epoch": 3.4785388127853882, + "grad_norm": 37.98282241821289, + "learning_rate": 7.247082699137494e-06, + "loss": 0.1754, + "step": 3809 + }, + { + "epoch": 3.4794520547945207, + "grad_norm": 68.57730865478516, + "learning_rate": 7.246067985794014e-06, + "loss": 2.2246, + "step": 3810 + }, + { + "epoch": 3.480365296803653, + "grad_norm": 12.524876594543457, + "learning_rate": 7.245053272450533e-06, + "loss": 0.1311, + "step": 3811 + }, + { + "epoch": 3.481278538812785, + "grad_norm": 16.802921295166016, + "learning_rate": 7.2440385591070525e-06, + "loss": 0.1074, + "step": 3812 + }, + { + "epoch": 3.4821917808219176, + "grad_norm": 0.8948838710784912, + "learning_rate": 7.243023845763573e-06, + "loss": 0.0075, + "step": 3813 + }, + { + "epoch": 3.48310502283105, + "grad_norm": 10.657308578491211, + "learning_rate": 7.242009132420091e-06, + "loss": 0.0944, + "step": 3814 + }, + { + "epoch": 3.4840182648401825, + "grad_norm": 2.215744733810425, + "learning_rate": 7.2409944190766116e-06, + "loss": 0.0194, + "step": 3815 + }, + { + "epoch": 3.484931506849315, + "grad_norm": 8.610310554504395, + "learning_rate": 7.239979705733131e-06, + "loss": 0.0766, + "step": 3816 + }, + { + "epoch": 3.4858447488584474, + "grad_norm": 38.25651931762695, + "learning_rate": 7.238964992389651e-06, + "loss": 0.4317, + "step": 3817 + }, + { + "epoch": 3.48675799086758, + "grad_norm": 14.514307022094727, + "learning_rate": 7.23795027904617e-06, + "loss": 0.1186, + "step": 3818 + }, + { + "epoch": 3.4876712328767123, + "grad_norm": 15.912886619567871, + "learning_rate": 7.2369355657026895e-06, + "loss": 0.1952, + "step": 3819 + }, + { + "epoch": 3.4885844748858448, + "grad_norm": 0.7034034729003906, + "learning_rate": 7.235920852359209e-06, + "loss": 0.0044, + "step": 3820 + }, + { + "epoch": 3.4894977168949772, + "grad_norm": 59.039695739746094, + "learning_rate": 7.234906139015728e-06, + "loss": 0.6438, + "step": 3821 + }, + { + "epoch": 3.4904109589041097, + "grad_norm": 9.998018264770508, + "learning_rate": 7.233891425672248e-06, + "loss": 0.0661, + "step": 3822 + }, + { + "epoch": 3.491324200913242, + "grad_norm": 25.003034591674805, + "learning_rate": 7.232876712328768e-06, + "loss": 0.1597, + "step": 3823 + }, + { + "epoch": 3.4922374429223746, + "grad_norm": 24.713308334350586, + "learning_rate": 7.231861998985287e-06, + "loss": 0.4559, + "step": 3824 + }, + { + "epoch": 3.493150684931507, + "grad_norm": 20.804616928100586, + "learning_rate": 7.230847285641807e-06, + "loss": 0.1733, + "step": 3825 + }, + { + "epoch": 3.4940639269406395, + "grad_norm": 6.773073196411133, + "learning_rate": 7.2298325722983265e-06, + "loss": 0.0845, + "step": 3826 + }, + { + "epoch": 3.4949771689497715, + "grad_norm": 9.275197982788086, + "learning_rate": 7.228817858954846e-06, + "loss": 0.0647, + "step": 3827 + }, + { + "epoch": 3.495890410958904, + "grad_norm": 1.2430436611175537, + "learning_rate": 7.227803145611365e-06, + "loss": 0.0082, + "step": 3828 + }, + { + "epoch": 3.4968036529680364, + "grad_norm": 3.0306644439697266, + "learning_rate": 7.226788432267885e-06, + "loss": 0.0284, + "step": 3829 + }, + { + "epoch": 3.497716894977169, + "grad_norm": 2.8547921180725098, + "learning_rate": 7.225773718924404e-06, + "loss": 0.0276, + "step": 3830 + }, + { + "epoch": 3.4986301369863013, + "grad_norm": 67.93802642822266, + "learning_rate": 7.224759005580923e-06, + "loss": 0.7111, + "step": 3831 + }, + { + "epoch": 3.4995433789954338, + "grad_norm": 2.4605908393859863, + "learning_rate": 7.223744292237444e-06, + "loss": 0.0165, + "step": 3832 + }, + { + "epoch": 3.5004566210045662, + "grad_norm": 22.651033401489258, + "learning_rate": 7.2227295788939635e-06, + "loss": 0.2708, + "step": 3833 + }, + { + "epoch": 3.5013698630136987, + "grad_norm": 33.822689056396484, + "learning_rate": 7.221714865550482e-06, + "loss": 0.2574, + "step": 3834 + }, + { + "epoch": 3.502283105022831, + "grad_norm": 5.664705276489258, + "learning_rate": 7.220700152207002e-06, + "loss": 0.044, + "step": 3835 + }, + { + "epoch": 3.5031963470319636, + "grad_norm": 8.25391960144043, + "learning_rate": 7.219685438863522e-06, + "loss": 0.0559, + "step": 3836 + }, + { + "epoch": 3.504109589041096, + "grad_norm": 9.642809867858887, + "learning_rate": 7.218670725520041e-06, + "loss": 0.0835, + "step": 3837 + }, + { + "epoch": 3.505022831050228, + "grad_norm": 6.856866836547852, + "learning_rate": 7.21765601217656e-06, + "loss": 0.0616, + "step": 3838 + }, + { + "epoch": 3.5059360730593605, + "grad_norm": 20.46308708190918, + "learning_rate": 7.21664129883308e-06, + "loss": 0.1956, + "step": 3839 + }, + { + "epoch": 3.506849315068493, + "grad_norm": 12.180232048034668, + "learning_rate": 7.2156265854896005e-06, + "loss": 0.1017, + "step": 3840 + }, + { + "epoch": 3.5077625570776254, + "grad_norm": 20.182315826416016, + "learning_rate": 7.214611872146119e-06, + "loss": 0.1611, + "step": 3841 + }, + { + "epoch": 3.508675799086758, + "grad_norm": 27.45448112487793, + "learning_rate": 7.213597158802639e-06, + "loss": 0.2062, + "step": 3842 + }, + { + "epoch": 3.5095890410958903, + "grad_norm": 1.9321335554122925, + "learning_rate": 7.212582445459159e-06, + "loss": 0.0143, + "step": 3843 + }, + { + "epoch": 3.5105022831050228, + "grad_norm": 2.637571334838867, + "learning_rate": 7.2115677321156776e-06, + "loss": 0.0253, + "step": 3844 + }, + { + "epoch": 3.5114155251141552, + "grad_norm": 16.213499069213867, + "learning_rate": 7.210553018772197e-06, + "loss": 0.1861, + "step": 3845 + }, + { + "epoch": 3.5123287671232877, + "grad_norm": 23.258949279785156, + "learning_rate": 7.209538305428717e-06, + "loss": 0.1178, + "step": 3846 + }, + { + "epoch": 3.51324200913242, + "grad_norm": 71.92507934570312, + "learning_rate": 7.208523592085237e-06, + "loss": 0.6933, + "step": 3847 + }, + { + "epoch": 3.5141552511415526, + "grad_norm": 27.511844635009766, + "learning_rate": 7.2075088787417555e-06, + "loss": 0.2311, + "step": 3848 + }, + { + "epoch": 3.515068493150685, + "grad_norm": 2.766828775405884, + "learning_rate": 7.206494165398276e-06, + "loss": 0.0224, + "step": 3849 + }, + { + "epoch": 3.5159817351598175, + "grad_norm": 47.66574478149414, + "learning_rate": 7.205479452054796e-06, + "loss": 0.4754, + "step": 3850 + }, + { + "epoch": 3.51689497716895, + "grad_norm": 12.636367797851562, + "learning_rate": 7.2044647387113146e-06, + "loss": 0.0861, + "step": 3851 + }, + { + "epoch": 3.5178082191780824, + "grad_norm": 69.69611358642578, + "learning_rate": 7.203450025367834e-06, + "loss": 0.7413, + "step": 3852 + }, + { + "epoch": 3.518721461187215, + "grad_norm": 7.054023742675781, + "learning_rate": 7.202435312024354e-06, + "loss": 0.0361, + "step": 3853 + }, + { + "epoch": 3.5196347031963473, + "grad_norm": 30.754072189331055, + "learning_rate": 7.201420598680873e-06, + "loss": 0.3206, + "step": 3854 + }, + { + "epoch": 3.5205479452054793, + "grad_norm": 2.4979054927825928, + "learning_rate": 7.2004058853373925e-06, + "loss": 0.015, + "step": 3855 + }, + { + "epoch": 3.5214611872146118, + "grad_norm": 13.669219970703125, + "learning_rate": 7.199391171993912e-06, + "loss": 0.1, + "step": 3856 + }, + { + "epoch": 3.522374429223744, + "grad_norm": 2.5342884063720703, + "learning_rate": 7.198376458650433e-06, + "loss": 0.0212, + "step": 3857 + }, + { + "epoch": 3.5232876712328767, + "grad_norm": 5.246401786804199, + "learning_rate": 7.197361745306951e-06, + "loss": 0.0467, + "step": 3858 + }, + { + "epoch": 3.524200913242009, + "grad_norm": 0.5654072761535645, + "learning_rate": 7.196347031963471e-06, + "loss": 0.0043, + "step": 3859 + }, + { + "epoch": 3.5251141552511416, + "grad_norm": 11.775298118591309, + "learning_rate": 7.195332318619991e-06, + "loss": 0.1453, + "step": 3860 + }, + { + "epoch": 3.526027397260274, + "grad_norm": 6.9141845703125, + "learning_rate": 7.19431760527651e-06, + "loss": 0.0687, + "step": 3861 + }, + { + "epoch": 3.5269406392694065, + "grad_norm": 19.786001205444336, + "learning_rate": 7.1933028919330295e-06, + "loss": 0.2175, + "step": 3862 + }, + { + "epoch": 3.527853881278539, + "grad_norm": 9.731376647949219, + "learning_rate": 7.192288178589549e-06, + "loss": 0.0794, + "step": 3863 + }, + { + "epoch": 3.5287671232876714, + "grad_norm": 15.69041633605957, + "learning_rate": 7.191273465246068e-06, + "loss": 0.0799, + "step": 3864 + }, + { + "epoch": 3.5296803652968034, + "grad_norm": 3.4210407733917236, + "learning_rate": 7.190258751902588e-06, + "loss": 0.0314, + "step": 3865 + }, + { + "epoch": 3.530593607305936, + "grad_norm": 12.234436988830566, + "learning_rate": 7.189244038559107e-06, + "loss": 0.1392, + "step": 3866 + }, + { + "epoch": 3.5315068493150683, + "grad_norm": 11.938841819763184, + "learning_rate": 7.188229325215628e-06, + "loss": 0.0827, + "step": 3867 + }, + { + "epoch": 3.5324200913242008, + "grad_norm": 91.5467529296875, + "learning_rate": 7.187214611872147e-06, + "loss": 1.2088, + "step": 3868 + }, + { + "epoch": 3.533333333333333, + "grad_norm": 1.4177132844924927, + "learning_rate": 7.1861998985286665e-06, + "loss": 0.0113, + "step": 3869 + }, + { + "epoch": 3.5342465753424657, + "grad_norm": 5.213834762573242, + "learning_rate": 7.185185185185186e-06, + "loss": 0.0567, + "step": 3870 + }, + { + "epoch": 3.535159817351598, + "grad_norm": 9.331609725952148, + "learning_rate": 7.184170471841705e-06, + "loss": 0.0806, + "step": 3871 + }, + { + "epoch": 3.5360730593607306, + "grad_norm": 8.708220481872559, + "learning_rate": 7.183155758498225e-06, + "loss": 0.0591, + "step": 3872 + }, + { + "epoch": 3.536986301369863, + "grad_norm": 26.432912826538086, + "learning_rate": 7.182141045154744e-06, + "loss": 0.3382, + "step": 3873 + }, + { + "epoch": 3.5378995433789955, + "grad_norm": 13.571451187133789, + "learning_rate": 7.181126331811263e-06, + "loss": 0.1716, + "step": 3874 + }, + { + "epoch": 3.538812785388128, + "grad_norm": 78.60059356689453, + "learning_rate": 7.180111618467783e-06, + "loss": 0.9985, + "step": 3875 + }, + { + "epoch": 3.5397260273972604, + "grad_norm": 1.3147553205490112, + "learning_rate": 7.1790969051243035e-06, + "loss": 0.0064, + "step": 3876 + }, + { + "epoch": 3.540639269406393, + "grad_norm": 10.48000431060791, + "learning_rate": 7.178082191780823e-06, + "loss": 0.1154, + "step": 3877 + }, + { + "epoch": 3.5415525114155253, + "grad_norm": 148.652099609375, + "learning_rate": 7.177067478437342e-06, + "loss": 3.9663, + "step": 3878 + }, + { + "epoch": 3.5424657534246577, + "grad_norm": 65.07691192626953, + "learning_rate": 7.176052765093862e-06, + "loss": 0.8561, + "step": 3879 + }, + { + "epoch": 3.54337899543379, + "grad_norm": 7.5022687911987305, + "learning_rate": 7.175038051750381e-06, + "loss": 0.0539, + "step": 3880 + }, + { + "epoch": 3.544292237442922, + "grad_norm": 2.1281161308288574, + "learning_rate": 7.1740233384069e-06, + "loss": 0.0141, + "step": 3881 + }, + { + "epoch": 3.5452054794520547, + "grad_norm": 0.018969759345054626, + "learning_rate": 7.17300862506342e-06, + "loss": 0.0002, + "step": 3882 + }, + { + "epoch": 3.546118721461187, + "grad_norm": 23.401819229125977, + "learning_rate": 7.17199391171994e-06, + "loss": 0.2819, + "step": 3883 + }, + { + "epoch": 3.5470319634703196, + "grad_norm": 14.992055892944336, + "learning_rate": 7.1709791983764585e-06, + "loss": 0.1153, + "step": 3884 + }, + { + "epoch": 3.547945205479452, + "grad_norm": 17.318939208984375, + "learning_rate": 7.169964485032979e-06, + "loss": 0.0675, + "step": 3885 + }, + { + "epoch": 3.5488584474885845, + "grad_norm": 0.26669374108314514, + "learning_rate": 7.168949771689499e-06, + "loss": 0.0021, + "step": 3886 + }, + { + "epoch": 3.549771689497717, + "grad_norm": 45.82871627807617, + "learning_rate": 7.167935058346018e-06, + "loss": 0.7239, + "step": 3887 + }, + { + "epoch": 3.5506849315068494, + "grad_norm": 47.900657653808594, + "learning_rate": 7.166920345002537e-06, + "loss": 0.9108, + "step": 3888 + }, + { + "epoch": 3.551598173515982, + "grad_norm": 2.867262601852417, + "learning_rate": 7.165905631659057e-06, + "loss": 0.0267, + "step": 3889 + }, + { + "epoch": 3.5525114155251143, + "grad_norm": 7.2443156242370605, + "learning_rate": 7.164890918315577e-06, + "loss": 0.0547, + "step": 3890 + }, + { + "epoch": 3.5534246575342463, + "grad_norm": 9.392186164855957, + "learning_rate": 7.1638762049720955e-06, + "loss": 0.1098, + "step": 3891 + }, + { + "epoch": 3.5543378995433788, + "grad_norm": 26.70610237121582, + "learning_rate": 7.162861491628615e-06, + "loss": 0.1703, + "step": 3892 + }, + { + "epoch": 3.555251141552511, + "grad_norm": 21.693159103393555, + "learning_rate": 7.161846778285136e-06, + "loss": 0.1853, + "step": 3893 + }, + { + "epoch": 3.5561643835616437, + "grad_norm": 87.85321807861328, + "learning_rate": 7.160832064941654e-06, + "loss": 1.4921, + "step": 3894 + }, + { + "epoch": 3.557077625570776, + "grad_norm": 22.062326431274414, + "learning_rate": 7.159817351598174e-06, + "loss": 0.1533, + "step": 3895 + }, + { + "epoch": 3.5579908675799086, + "grad_norm": 3.827669382095337, + "learning_rate": 7.158802638254694e-06, + "loss": 0.0397, + "step": 3896 + }, + { + "epoch": 3.558904109589041, + "grad_norm": 57.047157287597656, + "learning_rate": 7.157787924911214e-06, + "loss": 0.8305, + "step": 3897 + }, + { + "epoch": 3.5598173515981735, + "grad_norm": 4.078414440155029, + "learning_rate": 7.1567732115677325e-06, + "loss": 0.0503, + "step": 3898 + }, + { + "epoch": 3.560730593607306, + "grad_norm": 15.594172477722168, + "learning_rate": 7.155758498224252e-06, + "loss": 0.1634, + "step": 3899 + }, + { + "epoch": 3.5616438356164384, + "grad_norm": 28.94975471496582, + "learning_rate": 7.154743784880772e-06, + "loss": 0.2481, + "step": 3900 + }, + { + "epoch": 3.562557077625571, + "grad_norm": 20.204540252685547, + "learning_rate": 7.153729071537291e-06, + "loss": 0.1857, + "step": 3901 + }, + { + "epoch": 3.5634703196347033, + "grad_norm": 32.77769088745117, + "learning_rate": 7.15271435819381e-06, + "loss": 0.3112, + "step": 3902 + }, + { + "epoch": 3.5643835616438357, + "grad_norm": 6.813997268676758, + "learning_rate": 7.151699644850331e-06, + "loss": 0.0667, + "step": 3903 + }, + { + "epoch": 3.565296803652968, + "grad_norm": 76.70555114746094, + "learning_rate": 7.15068493150685e-06, + "loss": 1.1868, + "step": 3904 + }, + { + "epoch": 3.5662100456621006, + "grad_norm": 7.449581623077393, + "learning_rate": 7.1496702181633695e-06, + "loss": 0.0645, + "step": 3905 + }, + { + "epoch": 3.567123287671233, + "grad_norm": 52.817970275878906, + "learning_rate": 7.148655504819889e-06, + "loss": 0.6461, + "step": 3906 + }, + { + "epoch": 3.5680365296803656, + "grad_norm": 58.2159309387207, + "learning_rate": 7.147640791476409e-06, + "loss": 0.6045, + "step": 3907 + }, + { + "epoch": 3.5689497716894976, + "grad_norm": 2.178591012954712, + "learning_rate": 7.146626078132928e-06, + "loss": 0.0204, + "step": 3908 + }, + { + "epoch": 3.56986301369863, + "grad_norm": 8.917287826538086, + "learning_rate": 7.145611364789447e-06, + "loss": 0.0665, + "step": 3909 + }, + { + "epoch": 3.5707762557077625, + "grad_norm": 19.0910701751709, + "learning_rate": 7.144596651445967e-06, + "loss": 0.1219, + "step": 3910 + }, + { + "epoch": 3.571689497716895, + "grad_norm": 35.11634826660156, + "learning_rate": 7.143581938102486e-06, + "loss": 0.504, + "step": 3911 + }, + { + "epoch": 3.5726027397260274, + "grad_norm": 28.527645111083984, + "learning_rate": 7.1425672247590065e-06, + "loss": 0.5051, + "step": 3912 + }, + { + "epoch": 3.57351598173516, + "grad_norm": 6.287680149078369, + "learning_rate": 7.141552511415526e-06, + "loss": 0.053, + "step": 3913 + }, + { + "epoch": 3.5744292237442923, + "grad_norm": 1.570504903793335, + "learning_rate": 7.140537798072045e-06, + "loss": 0.0177, + "step": 3914 + }, + { + "epoch": 3.5753424657534247, + "grad_norm": 39.06342315673828, + "learning_rate": 7.139523084728565e-06, + "loss": 0.2672, + "step": 3915 + }, + { + "epoch": 3.576255707762557, + "grad_norm": 3.662600517272949, + "learning_rate": 7.138508371385084e-06, + "loss": 0.0475, + "step": 3916 + }, + { + "epoch": 3.5771689497716896, + "grad_norm": 44.383365631103516, + "learning_rate": 7.137493658041604e-06, + "loss": 0.1132, + "step": 3917 + }, + { + "epoch": 3.5780821917808217, + "grad_norm": 2.7766833305358887, + "learning_rate": 7.136478944698123e-06, + "loss": 0.0242, + "step": 3918 + }, + { + "epoch": 3.578995433789954, + "grad_norm": 1.0503308773040771, + "learning_rate": 7.135464231354643e-06, + "loss": 0.008, + "step": 3919 + }, + { + "epoch": 3.5799086757990866, + "grad_norm": 27.71277618408203, + "learning_rate": 7.134449518011163e-06, + "loss": 0.2833, + "step": 3920 + }, + { + "epoch": 3.580821917808219, + "grad_norm": 3.718998670578003, + "learning_rate": 7.133434804667682e-06, + "loss": 0.0388, + "step": 3921 + }, + { + "epoch": 3.5817351598173515, + "grad_norm": 20.25920867919922, + "learning_rate": 7.132420091324202e-06, + "loss": 0.1766, + "step": 3922 + }, + { + "epoch": 3.582648401826484, + "grad_norm": 18.6181697845459, + "learning_rate": 7.131405377980721e-06, + "loss": 0.1596, + "step": 3923 + }, + { + "epoch": 3.5835616438356164, + "grad_norm": 19.3155517578125, + "learning_rate": 7.13039066463724e-06, + "loss": 0.1403, + "step": 3924 + }, + { + "epoch": 3.584474885844749, + "grad_norm": 7.594532489776611, + "learning_rate": 7.12937595129376e-06, + "loss": 0.0961, + "step": 3925 + }, + { + "epoch": 3.5853881278538813, + "grad_norm": 5.1033101081848145, + "learning_rate": 7.12836123795028e-06, + "loss": 0.0382, + "step": 3926 + }, + { + "epoch": 3.5863013698630137, + "grad_norm": 37.73222351074219, + "learning_rate": 7.127346524606799e-06, + "loss": 0.4446, + "step": 3927 + }, + { + "epoch": 3.587214611872146, + "grad_norm": 6.55682897567749, + "learning_rate": 7.126331811263318e-06, + "loss": 0.038, + "step": 3928 + }, + { + "epoch": 3.5881278538812786, + "grad_norm": 16.5572509765625, + "learning_rate": 7.125317097919839e-06, + "loss": 0.13, + "step": 3929 + }, + { + "epoch": 3.589041095890411, + "grad_norm": 24.27862548828125, + "learning_rate": 7.124302384576358e-06, + "loss": 0.3162, + "step": 3930 + }, + { + "epoch": 3.5899543378995435, + "grad_norm": 56.422359466552734, + "learning_rate": 7.123287671232877e-06, + "loss": 0.8316, + "step": 3931 + }, + { + "epoch": 3.590867579908676, + "grad_norm": 7.018030643463135, + "learning_rate": 7.122272957889397e-06, + "loss": 0.0474, + "step": 3932 + }, + { + "epoch": 3.5917808219178085, + "grad_norm": 6.704410552978516, + "learning_rate": 7.121258244545917e-06, + "loss": 0.0601, + "step": 3933 + }, + { + "epoch": 3.592694063926941, + "grad_norm": 0.40939465165138245, + "learning_rate": 7.1202435312024354e-06, + "loss": 0.0025, + "step": 3934 + }, + { + "epoch": 3.593607305936073, + "grad_norm": 41.846195220947266, + "learning_rate": 7.119228817858955e-06, + "loss": 0.6421, + "step": 3935 + }, + { + "epoch": 3.5945205479452054, + "grad_norm": 0.11038219183683395, + "learning_rate": 7.118214104515475e-06, + "loss": 0.0011, + "step": 3936 + }, + { + "epoch": 3.595433789954338, + "grad_norm": 2.431227207183838, + "learning_rate": 7.117199391171995e-06, + "loss": 0.0196, + "step": 3937 + }, + { + "epoch": 3.5963470319634703, + "grad_norm": 108.02498626708984, + "learning_rate": 7.116184677828513e-06, + "loss": 2.9489, + "step": 3938 + }, + { + "epoch": 3.5972602739726027, + "grad_norm": 4.685796737670898, + "learning_rate": 7.115169964485034e-06, + "loss": 0.0351, + "step": 3939 + }, + { + "epoch": 3.598173515981735, + "grad_norm": 30.26410675048828, + "learning_rate": 7.114155251141554e-06, + "loss": 0.1963, + "step": 3940 + }, + { + "epoch": 3.5990867579908676, + "grad_norm": 2.463937759399414, + "learning_rate": 7.1131405377980724e-06, + "loss": 0.0188, + "step": 3941 + }, + { + "epoch": 3.6, + "grad_norm": 78.02005767822266, + "learning_rate": 7.112125824454592e-06, + "loss": 1.8361, + "step": 3942 + }, + { + "epoch": 3.6009132420091325, + "grad_norm": 7.698858737945557, + "learning_rate": 7.111111111111112e-06, + "loss": 0.0521, + "step": 3943 + }, + { + "epoch": 3.601826484018265, + "grad_norm": 0.8851739764213562, + "learning_rate": 7.110096397767631e-06, + "loss": 0.0083, + "step": 3944 + }, + { + "epoch": 3.602739726027397, + "grad_norm": 39.52031326293945, + "learning_rate": 7.10908168442415e-06, + "loss": 0.4901, + "step": 3945 + }, + { + "epoch": 3.6036529680365295, + "grad_norm": 37.186405181884766, + "learning_rate": 7.10806697108067e-06, + "loss": 0.2446, + "step": 3946 + }, + { + "epoch": 3.604566210045662, + "grad_norm": 13.94626522064209, + "learning_rate": 7.107052257737191e-06, + "loss": 0.1446, + "step": 3947 + }, + { + "epoch": 3.6054794520547944, + "grad_norm": 32.27663040161133, + "learning_rate": 7.1060375443937094e-06, + "loss": 0.5941, + "step": 3948 + }, + { + "epoch": 3.606392694063927, + "grad_norm": 35.18804168701172, + "learning_rate": 7.105022831050229e-06, + "loss": 0.2583, + "step": 3949 + }, + { + "epoch": 3.6073059360730593, + "grad_norm": 0.20201706886291504, + "learning_rate": 7.104008117706749e-06, + "loss": 0.0016, + "step": 3950 + }, + { + "epoch": 3.6082191780821917, + "grad_norm": 159.42420959472656, + "learning_rate": 7.102993404363268e-06, + "loss": 0.1219, + "step": 3951 + }, + { + "epoch": 3.609132420091324, + "grad_norm": 22.462963104248047, + "learning_rate": 7.101978691019787e-06, + "loss": 0.1682, + "step": 3952 + }, + { + "epoch": 3.6100456621004566, + "grad_norm": 23.34221076965332, + "learning_rate": 7.100963977676307e-06, + "loss": 0.2919, + "step": 3953 + }, + { + "epoch": 3.610958904109589, + "grad_norm": 22.886873245239258, + "learning_rate": 7.099949264332826e-06, + "loss": 0.3281, + "step": 3954 + }, + { + "epoch": 3.6118721461187215, + "grad_norm": 16.15188980102539, + "learning_rate": 7.098934550989346e-06, + "loss": 0.1666, + "step": 3955 + }, + { + "epoch": 3.612785388127854, + "grad_norm": 10.357332229614258, + "learning_rate": 7.097919837645866e-06, + "loss": 0.0945, + "step": 3956 + }, + { + "epoch": 3.6136986301369864, + "grad_norm": 4.821236610412598, + "learning_rate": 7.096905124302386e-06, + "loss": 0.0328, + "step": 3957 + }, + { + "epoch": 3.614611872146119, + "grad_norm": 27.3601016998291, + "learning_rate": 7.095890410958905e-06, + "loss": 0.261, + "step": 3958 + }, + { + "epoch": 3.6155251141552514, + "grad_norm": 3.315312623977661, + "learning_rate": 7.094875697615424e-06, + "loss": 0.0226, + "step": 3959 + }, + { + "epoch": 3.616438356164384, + "grad_norm": 17.354019165039062, + "learning_rate": 7.093860984271944e-06, + "loss": 0.1481, + "step": 3960 + }, + { + "epoch": 3.6173515981735163, + "grad_norm": 11.857542037963867, + "learning_rate": 7.092846270928463e-06, + "loss": 0.0927, + "step": 3961 + }, + { + "epoch": 3.6182648401826483, + "grad_norm": 1.0208712816238403, + "learning_rate": 7.091831557584983e-06, + "loss": 0.0107, + "step": 3962 + }, + { + "epoch": 3.6191780821917807, + "grad_norm": 0.4303695559501648, + "learning_rate": 7.090816844241502e-06, + "loss": 0.0041, + "step": 3963 + }, + { + "epoch": 3.620091324200913, + "grad_norm": 60.32521057128906, + "learning_rate": 7.089802130898021e-06, + "loss": 0.7848, + "step": 3964 + }, + { + "epoch": 3.6210045662100456, + "grad_norm": 52.885711669921875, + "learning_rate": 7.088787417554542e-06, + "loss": 0.793, + "step": 3965 + }, + { + "epoch": 3.621917808219178, + "grad_norm": 1.3028759956359863, + "learning_rate": 7.087772704211061e-06, + "loss": 0.0109, + "step": 3966 + }, + { + "epoch": 3.6228310502283105, + "grad_norm": 10.593269348144531, + "learning_rate": 7.086757990867581e-06, + "loss": 0.1318, + "step": 3967 + }, + { + "epoch": 3.623744292237443, + "grad_norm": 7.147963523864746, + "learning_rate": 7.0857432775241e-06, + "loss": 0.0572, + "step": 3968 + }, + { + "epoch": 3.6246575342465754, + "grad_norm": 25.000865936279297, + "learning_rate": 7.08472856418062e-06, + "loss": 0.2569, + "step": 3969 + }, + { + "epoch": 3.625570776255708, + "grad_norm": 21.897747039794922, + "learning_rate": 7.083713850837139e-06, + "loss": 0.1841, + "step": 3970 + }, + { + "epoch": 3.6264840182648403, + "grad_norm": 17.670181274414062, + "learning_rate": 7.082699137493658e-06, + "loss": 0.1083, + "step": 3971 + }, + { + "epoch": 3.6273972602739724, + "grad_norm": 29.462202072143555, + "learning_rate": 7.081684424150178e-06, + "loss": 0.123, + "step": 3972 + }, + { + "epoch": 3.628310502283105, + "grad_norm": 3.3760859966278076, + "learning_rate": 7.080669710806698e-06, + "loss": 0.0362, + "step": 3973 + }, + { + "epoch": 3.6292237442922373, + "grad_norm": 5.151828289031982, + "learning_rate": 7.079654997463216e-06, + "loss": 0.044, + "step": 3974 + }, + { + "epoch": 3.6301369863013697, + "grad_norm": 0.5407745242118835, + "learning_rate": 7.078640284119737e-06, + "loss": 0.0031, + "step": 3975 + }, + { + "epoch": 3.631050228310502, + "grad_norm": 39.5467529296875, + "learning_rate": 7.077625570776257e-06, + "loss": 0.317, + "step": 3976 + }, + { + "epoch": 3.6319634703196346, + "grad_norm": 16.02716064453125, + "learning_rate": 7.076610857432776e-06, + "loss": 0.1809, + "step": 3977 + }, + { + "epoch": 3.632876712328767, + "grad_norm": 85.01050567626953, + "learning_rate": 7.075596144089295e-06, + "loss": 1.5819, + "step": 3978 + }, + { + "epoch": 3.6337899543378995, + "grad_norm": 12.150839805603027, + "learning_rate": 7.074581430745815e-06, + "loss": 0.0944, + "step": 3979 + }, + { + "epoch": 3.634703196347032, + "grad_norm": 1.1354063749313354, + "learning_rate": 7.0735667174023345e-06, + "loss": 0.0102, + "step": 3980 + }, + { + "epoch": 3.6356164383561644, + "grad_norm": 55.14291763305664, + "learning_rate": 7.072552004058853e-06, + "loss": 0.7103, + "step": 3981 + }, + { + "epoch": 3.636529680365297, + "grad_norm": 88.41465759277344, + "learning_rate": 7.071537290715373e-06, + "loss": 2.2635, + "step": 3982 + }, + { + "epoch": 3.6374429223744293, + "grad_norm": 4.577165603637695, + "learning_rate": 7.070522577371894e-06, + "loss": 0.0295, + "step": 3983 + }, + { + "epoch": 3.638356164383562, + "grad_norm": 0.6544575691223145, + "learning_rate": 7.069507864028412e-06, + "loss": 0.0069, + "step": 3984 + }, + { + "epoch": 3.6392694063926943, + "grad_norm": 10.192462921142578, + "learning_rate": 7.068493150684932e-06, + "loss": 0.0672, + "step": 3985 + }, + { + "epoch": 3.6401826484018267, + "grad_norm": 2.699558973312378, + "learning_rate": 7.067478437341452e-06, + "loss": 0.0293, + "step": 3986 + }, + { + "epoch": 3.641095890410959, + "grad_norm": 1.516311764717102, + "learning_rate": 7.0664637239979715e-06, + "loss": 0.0122, + "step": 3987 + }, + { + "epoch": 3.642009132420091, + "grad_norm": 4.844880104064941, + "learning_rate": 7.06544901065449e-06, + "loss": 0.0459, + "step": 3988 + }, + { + "epoch": 3.6429223744292236, + "grad_norm": 3.5934691429138184, + "learning_rate": 7.06443429731101e-06, + "loss": 0.021, + "step": 3989 + }, + { + "epoch": 3.643835616438356, + "grad_norm": 5.554154872894287, + "learning_rate": 7.06341958396753e-06, + "loss": 0.0548, + "step": 3990 + }, + { + "epoch": 3.6447488584474885, + "grad_norm": 8.18395709991455, + "learning_rate": 7.0624048706240486e-06, + "loss": 0.0612, + "step": 3991 + }, + { + "epoch": 3.645662100456621, + "grad_norm": 16.42348861694336, + "learning_rate": 7.061390157280569e-06, + "loss": 0.1503, + "step": 3992 + }, + { + "epoch": 3.6465753424657534, + "grad_norm": 43.452781677246094, + "learning_rate": 7.060375443937089e-06, + "loss": 0.4715, + "step": 3993 + }, + { + "epoch": 3.647488584474886, + "grad_norm": 22.929311752319336, + "learning_rate": 7.059360730593608e-06, + "loss": 0.2891, + "step": 3994 + }, + { + "epoch": 3.6484018264840183, + "grad_norm": 9.050600051879883, + "learning_rate": 7.058346017250127e-06, + "loss": 0.0607, + "step": 3995 + }, + { + "epoch": 3.649315068493151, + "grad_norm": 60.30252456665039, + "learning_rate": 7.057331303906647e-06, + "loss": 0.1842, + "step": 3996 + }, + { + "epoch": 3.6502283105022832, + "grad_norm": 1.0533430576324463, + "learning_rate": 7.056316590563167e-06, + "loss": 0.0123, + "step": 3997 + }, + { + "epoch": 3.6511415525114153, + "grad_norm": 27.760929107666016, + "learning_rate": 7.0553018772196856e-06, + "loss": 0.1809, + "step": 3998 + }, + { + "epoch": 3.6520547945205477, + "grad_norm": 45.95585632324219, + "learning_rate": 7.054287163876205e-06, + "loss": 0.3923, + "step": 3999 + }, + { + "epoch": 3.65296803652968, + "grad_norm": 43.91360092163086, + "learning_rate": 7.053272450532726e-06, + "loss": 0.3694, + "step": 4000 + }, + { + "epoch": 3.6538812785388126, + "grad_norm": 38.87520217895508, + "learning_rate": 7.052257737189245e-06, + "loss": 0.7311, + "step": 4001 + }, + { + "epoch": 3.654794520547945, + "grad_norm": 37.25546646118164, + "learning_rate": 7.051243023845764e-06, + "loss": 0.4008, + "step": 4002 + }, + { + "epoch": 3.6557077625570775, + "grad_norm": 81.3643798828125, + "learning_rate": 7.050228310502284e-06, + "loss": 1.5193, + "step": 4003 + }, + { + "epoch": 3.65662100456621, + "grad_norm": 10.102080345153809, + "learning_rate": 7.049213597158803e-06, + "loss": 0.0823, + "step": 4004 + }, + { + "epoch": 3.6575342465753424, + "grad_norm": 67.89530181884766, + "learning_rate": 7.0481988838153226e-06, + "loss": 1.4826, + "step": 4005 + }, + { + "epoch": 3.658447488584475, + "grad_norm": 1.1841387748718262, + "learning_rate": 7.047184170471842e-06, + "loss": 0.0091, + "step": 4006 + }, + { + "epoch": 3.6593607305936073, + "grad_norm": 26.711162567138672, + "learning_rate": 7.046169457128362e-06, + "loss": 0.2502, + "step": 4007 + }, + { + "epoch": 3.66027397260274, + "grad_norm": 28.602663040161133, + "learning_rate": 7.045154743784881e-06, + "loss": 0.2311, + "step": 4008 + }, + { + "epoch": 3.6611872146118722, + "grad_norm": 4.836391448974609, + "learning_rate": 7.044140030441401e-06, + "loss": 0.0327, + "step": 4009 + }, + { + "epoch": 3.6621004566210047, + "grad_norm": 9.57752513885498, + "learning_rate": 7.043125317097921e-06, + "loss": 0.0287, + "step": 4010 + }, + { + "epoch": 3.663013698630137, + "grad_norm": 9.717257499694824, + "learning_rate": 7.04211060375444e-06, + "loss": 0.1134, + "step": 4011 + }, + { + "epoch": 3.6639269406392696, + "grad_norm": 177.47886657714844, + "learning_rate": 7.0410958904109596e-06, + "loss": 2.9607, + "step": 4012 + }, + { + "epoch": 3.664840182648402, + "grad_norm": 4.676396369934082, + "learning_rate": 7.040081177067479e-06, + "loss": 0.0443, + "step": 4013 + }, + { + "epoch": 3.6657534246575345, + "grad_norm": 2.407611131668091, + "learning_rate": 7.039066463723998e-06, + "loss": 0.014, + "step": 4014 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 9.386616706848145, + "learning_rate": 7.038051750380518e-06, + "loss": 0.0928, + "step": 4015 + }, + { + "epoch": 3.667579908675799, + "grad_norm": 14.781816482543945, + "learning_rate": 7.0370370370370375e-06, + "loss": 0.1041, + "step": 4016 + }, + { + "epoch": 3.6684931506849314, + "grad_norm": 12.154622077941895, + "learning_rate": 7.036022323693558e-06, + "loss": 0.1093, + "step": 4017 + }, + { + "epoch": 3.669406392694064, + "grad_norm": 83.732177734375, + "learning_rate": 7.035007610350076e-06, + "loss": 0.8852, + "step": 4018 + }, + { + "epoch": 3.6703196347031963, + "grad_norm": 111.54467010498047, + "learning_rate": 7.0339928970065966e-06, + "loss": 0.881, + "step": 4019 + }, + { + "epoch": 3.671232876712329, + "grad_norm": 36.96257781982422, + "learning_rate": 7.032978183663116e-06, + "loss": 0.3309, + "step": 4020 + }, + { + "epoch": 3.6721461187214612, + "grad_norm": 65.57998657226562, + "learning_rate": 7.031963470319635e-06, + "loss": 2.5761, + "step": 4021 + }, + { + "epoch": 3.6730593607305937, + "grad_norm": 2.7769887447357178, + "learning_rate": 7.030948756976155e-06, + "loss": 0.0159, + "step": 4022 + }, + { + "epoch": 3.673972602739726, + "grad_norm": 8.379192352294922, + "learning_rate": 7.0299340436326745e-06, + "loss": 0.0487, + "step": 4023 + }, + { + "epoch": 3.6748858447488586, + "grad_norm": 12.61352252960205, + "learning_rate": 7.028919330289193e-06, + "loss": 0.1034, + "step": 4024 + }, + { + "epoch": 3.6757990867579906, + "grad_norm": 6.9824538230896, + "learning_rate": 7.027904616945713e-06, + "loss": 0.055, + "step": 4025 + }, + { + "epoch": 3.676712328767123, + "grad_norm": 7.793228626251221, + "learning_rate": 7.026889903602233e-06, + "loss": 0.0778, + "step": 4026 + }, + { + "epoch": 3.6776255707762555, + "grad_norm": 41.00870895385742, + "learning_rate": 7.025875190258753e-06, + "loss": 0.7043, + "step": 4027 + }, + { + "epoch": 3.678538812785388, + "grad_norm": 0.18722328543663025, + "learning_rate": 7.024860476915272e-06, + "loss": 0.0017, + "step": 4028 + }, + { + "epoch": 3.6794520547945204, + "grad_norm": 96.05824279785156, + "learning_rate": 7.023845763571792e-06, + "loss": 3.7296, + "step": 4029 + }, + { + "epoch": 3.680365296803653, + "grad_norm": 4.1015825271606445, + "learning_rate": 7.0228310502283115e-06, + "loss": 0.0341, + "step": 4030 + }, + { + "epoch": 3.6812785388127853, + "grad_norm": 14.045299530029297, + "learning_rate": 7.02181633688483e-06, + "loss": 0.1778, + "step": 4031 + }, + { + "epoch": 3.682191780821918, + "grad_norm": 4.976772785186768, + "learning_rate": 7.02080162354135e-06, + "loss": 0.03, + "step": 4032 + }, + { + "epoch": 3.6831050228310502, + "grad_norm": 57.8084716796875, + "learning_rate": 7.01978691019787e-06, + "loss": 0.6751, + "step": 4033 + }, + { + "epoch": 3.6840182648401827, + "grad_norm": 1.349110722541809, + "learning_rate": 7.0187721968543886e-06, + "loss": 0.009, + "step": 4034 + }, + { + "epoch": 3.684931506849315, + "grad_norm": 1.1153082847595215, + "learning_rate": 7.017757483510908e-06, + "loss": 0.0094, + "step": 4035 + }, + { + "epoch": 3.6858447488584476, + "grad_norm": 101.41547393798828, + "learning_rate": 7.016742770167429e-06, + "loss": 2.2474, + "step": 4036 + }, + { + "epoch": 3.68675799086758, + "grad_norm": 24.089948654174805, + "learning_rate": 7.0157280568239485e-06, + "loss": 0.267, + "step": 4037 + }, + { + "epoch": 3.6876712328767125, + "grad_norm": 13.753326416015625, + "learning_rate": 7.014713343480467e-06, + "loss": 0.1442, + "step": 4038 + }, + { + "epoch": 3.688584474885845, + "grad_norm": 100.21710968017578, + "learning_rate": 7.013698630136987e-06, + "loss": 2.0011, + "step": 4039 + }, + { + "epoch": 3.6894977168949774, + "grad_norm": 10.61806583404541, + "learning_rate": 7.012683916793507e-06, + "loss": 0.0967, + "step": 4040 + }, + { + "epoch": 3.69041095890411, + "grad_norm": 17.041580200195312, + "learning_rate": 7.0116692034500256e-06, + "loss": 0.2468, + "step": 4041 + }, + { + "epoch": 3.691324200913242, + "grad_norm": 50.68961715698242, + "learning_rate": 7.010654490106545e-06, + "loss": 0.907, + "step": 4042 + }, + { + "epoch": 3.6922374429223743, + "grad_norm": 63.067928314208984, + "learning_rate": 7.009639776763065e-06, + "loss": 1.2679, + "step": 4043 + }, + { + "epoch": 3.6931506849315068, + "grad_norm": 33.84656524658203, + "learning_rate": 7.008625063419584e-06, + "loss": 0.2817, + "step": 4044 + }, + { + "epoch": 3.6940639269406392, + "grad_norm": 8.709909439086914, + "learning_rate": 7.007610350076104e-06, + "loss": 0.069, + "step": 4045 + }, + { + "epoch": 3.6949771689497717, + "grad_norm": 38.1615104675293, + "learning_rate": 7.006595636732624e-06, + "loss": 0.5807, + "step": 4046 + }, + { + "epoch": 3.695890410958904, + "grad_norm": 48.45713806152344, + "learning_rate": 7.005580923389144e-06, + "loss": 0.0807, + "step": 4047 + }, + { + "epoch": 3.6968036529680366, + "grad_norm": 83.36092376708984, + "learning_rate": 7.0045662100456626e-06, + "loss": 0.5589, + "step": 4048 + }, + { + "epoch": 3.697716894977169, + "grad_norm": 47.22810745239258, + "learning_rate": 7.003551496702182e-06, + "loss": 0.7189, + "step": 4049 + }, + { + "epoch": 3.6986301369863015, + "grad_norm": 18.372392654418945, + "learning_rate": 7.002536783358702e-06, + "loss": 0.1418, + "step": 4050 + }, + { + "epoch": 3.699543378995434, + "grad_norm": 22.558198928833008, + "learning_rate": 7.001522070015221e-06, + "loss": 0.2757, + "step": 4051 + }, + { + "epoch": 3.700456621004566, + "grad_norm": 5.520749568939209, + "learning_rate": 7.0005073566717405e-06, + "loss": 0.0432, + "step": 4052 + }, + { + "epoch": 3.7013698630136984, + "grad_norm": 5.785286903381348, + "learning_rate": 6.999492643328261e-06, + "loss": 0.0507, + "step": 4053 + }, + { + "epoch": 3.702283105022831, + "grad_norm": 8.824065208435059, + "learning_rate": 6.998477929984779e-06, + "loss": 0.0906, + "step": 4054 + }, + { + "epoch": 3.7031963470319633, + "grad_norm": 91.46139526367188, + "learning_rate": 6.9974632166412995e-06, + "loss": 0.8155, + "step": 4055 + }, + { + "epoch": 3.7041095890410958, + "grad_norm": 29.944211959838867, + "learning_rate": 6.996448503297819e-06, + "loss": 0.1729, + "step": 4056 + }, + { + "epoch": 3.7050228310502282, + "grad_norm": 42.497779846191406, + "learning_rate": 6.995433789954339e-06, + "loss": 0.5519, + "step": 4057 + }, + { + "epoch": 3.7059360730593607, + "grad_norm": 8.29946517944336, + "learning_rate": 6.994419076610858e-06, + "loss": 0.0705, + "step": 4058 + }, + { + "epoch": 3.706849315068493, + "grad_norm": 5.346439361572266, + "learning_rate": 6.9934043632673775e-06, + "loss": 0.0411, + "step": 4059 + }, + { + "epoch": 3.7077625570776256, + "grad_norm": 8.891144752502441, + "learning_rate": 6.992389649923897e-06, + "loss": 0.0782, + "step": 4060 + }, + { + "epoch": 3.708675799086758, + "grad_norm": 2.1491708755493164, + "learning_rate": 6.991374936580416e-06, + "loss": 0.0141, + "step": 4061 + }, + { + "epoch": 3.7095890410958905, + "grad_norm": 7.490489959716797, + "learning_rate": 6.990360223236936e-06, + "loss": 0.0611, + "step": 4062 + }, + { + "epoch": 3.710502283105023, + "grad_norm": 34.87735366821289, + "learning_rate": 6.989345509893456e-06, + "loss": 0.3097, + "step": 4063 + }, + { + "epoch": 3.7114155251141554, + "grad_norm": 0.369791716337204, + "learning_rate": 6.988330796549975e-06, + "loss": 0.0029, + "step": 4064 + }, + { + "epoch": 3.712328767123288, + "grad_norm": 4.45798921585083, + "learning_rate": 6.987316083206495e-06, + "loss": 0.0452, + "step": 4065 + }, + { + "epoch": 3.7132420091324203, + "grad_norm": 38.75932312011719, + "learning_rate": 6.9863013698630145e-06, + "loss": 0.427, + "step": 4066 + }, + { + "epoch": 3.7141552511415528, + "grad_norm": 122.8971939086914, + "learning_rate": 6.985286656519534e-06, + "loss": 1.9841, + "step": 4067 + }, + { + "epoch": 3.7150684931506848, + "grad_norm": 3.660996198654175, + "learning_rate": 6.984271943176053e-06, + "loss": 0.0223, + "step": 4068 + }, + { + "epoch": 3.7159817351598172, + "grad_norm": 27.754501342773438, + "learning_rate": 6.983257229832573e-06, + "loss": 0.1632, + "step": 4069 + }, + { + "epoch": 3.7168949771689497, + "grad_norm": 65.46509552001953, + "learning_rate": 6.982242516489092e-06, + "loss": 0.7564, + "step": 4070 + }, + { + "epoch": 3.717808219178082, + "grad_norm": 58.20396423339844, + "learning_rate": 6.981227803145611e-06, + "loss": 1.158, + "step": 4071 + }, + { + "epoch": 3.7187214611872146, + "grad_norm": 8.242439270019531, + "learning_rate": 6.980213089802132e-06, + "loss": 0.0561, + "step": 4072 + }, + { + "epoch": 3.719634703196347, + "grad_norm": 3.418930768966675, + "learning_rate": 6.9791983764586515e-06, + "loss": 0.0316, + "step": 4073 + }, + { + "epoch": 3.7205479452054795, + "grad_norm": 4.627779006958008, + "learning_rate": 6.97818366311517e-06, + "loss": 0.0337, + "step": 4074 + }, + { + "epoch": 3.721461187214612, + "grad_norm": 19.766698837280273, + "learning_rate": 6.97716894977169e-06, + "loss": 0.1291, + "step": 4075 + }, + { + "epoch": 3.7223744292237444, + "grad_norm": 19.141319274902344, + "learning_rate": 6.97615423642821e-06, + "loss": 0.1387, + "step": 4076 + }, + { + "epoch": 3.723287671232877, + "grad_norm": 32.72361755371094, + "learning_rate": 6.975139523084729e-06, + "loss": 0.3386, + "step": 4077 + }, + { + "epoch": 3.724200913242009, + "grad_norm": 9.295126914978027, + "learning_rate": 6.974124809741248e-06, + "loss": 0.0995, + "step": 4078 + }, + { + "epoch": 3.7251141552511413, + "grad_norm": 13.847743034362793, + "learning_rate": 6.973110096397768e-06, + "loss": 0.1311, + "step": 4079 + }, + { + "epoch": 3.7260273972602738, + "grad_norm": 3.4769251346588135, + "learning_rate": 6.9720953830542885e-06, + "loss": 0.0197, + "step": 4080 + }, + { + "epoch": 3.726940639269406, + "grad_norm": 100.58331298828125, + "learning_rate": 6.9710806697108065e-06, + "loss": 1.1829, + "step": 4081 + }, + { + "epoch": 3.7278538812785387, + "grad_norm": 101.59220123291016, + "learning_rate": 6.970065956367327e-06, + "loss": 0.8263, + "step": 4082 + }, + { + "epoch": 3.728767123287671, + "grad_norm": 2.0198493003845215, + "learning_rate": 6.969051243023847e-06, + "loss": 0.0188, + "step": 4083 + }, + { + "epoch": 3.7296803652968036, + "grad_norm": 3.3662269115448, + "learning_rate": 6.9680365296803655e-06, + "loss": 0.0257, + "step": 4084 + }, + { + "epoch": 3.730593607305936, + "grad_norm": 104.607666015625, + "learning_rate": 6.967021816336885e-06, + "loss": 3.3429, + "step": 4085 + }, + { + "epoch": 3.7315068493150685, + "grad_norm": 60.70170974731445, + "learning_rate": 6.966007102993405e-06, + "loss": 0.7707, + "step": 4086 + }, + { + "epoch": 3.732420091324201, + "grad_norm": 10.866700172424316, + "learning_rate": 6.964992389649925e-06, + "loss": 0.0524, + "step": 4087 + }, + { + "epoch": 3.7333333333333334, + "grad_norm": 3.078634262084961, + "learning_rate": 6.9639776763064435e-06, + "loss": 0.0323, + "step": 4088 + }, + { + "epoch": 3.734246575342466, + "grad_norm": 14.761197090148926, + "learning_rate": 6.962962962962964e-06, + "loss": 0.1302, + "step": 4089 + }, + { + "epoch": 3.7351598173515983, + "grad_norm": 43.965091705322266, + "learning_rate": 6.961948249619484e-06, + "loss": 0.4923, + "step": 4090 + }, + { + "epoch": 3.7360730593607308, + "grad_norm": 100.3169174194336, + "learning_rate": 6.9609335362760025e-06, + "loss": 3.7678, + "step": 4091 + }, + { + "epoch": 3.736986301369863, + "grad_norm": 31.005741119384766, + "learning_rate": 6.959918822932522e-06, + "loss": 0.3655, + "step": 4092 + }, + { + "epoch": 3.7378995433789957, + "grad_norm": 27.72931480407715, + "learning_rate": 6.958904109589042e-06, + "loss": 0.2515, + "step": 4093 + }, + { + "epoch": 3.738812785388128, + "grad_norm": 62.60251998901367, + "learning_rate": 6.957889396245561e-06, + "loss": 0.8526, + "step": 4094 + }, + { + "epoch": 3.73972602739726, + "grad_norm": 60.11124801635742, + "learning_rate": 6.9568746829020805e-06, + "loss": 0.7138, + "step": 4095 + }, + { + "epoch": 3.7406392694063926, + "grad_norm": 30.912822723388672, + "learning_rate": 6.9558599695586e-06, + "loss": 0.1695, + "step": 4096 + }, + { + "epoch": 3.741552511415525, + "grad_norm": 12.43880844116211, + "learning_rate": 6.954845256215121e-06, + "loss": 0.0761, + "step": 4097 + }, + { + "epoch": 3.7424657534246575, + "grad_norm": 13.621712684631348, + "learning_rate": 6.953830542871639e-06, + "loss": 0.139, + "step": 4098 + }, + { + "epoch": 3.74337899543379, + "grad_norm": 2.9703009128570557, + "learning_rate": 6.952815829528159e-06, + "loss": 0.0237, + "step": 4099 + }, + { + "epoch": 3.7442922374429224, + "grad_norm": 93.09142303466797, + "learning_rate": 6.951801116184679e-06, + "loss": 5.096, + "step": 4100 + }, + { + "epoch": 3.745205479452055, + "grad_norm": 7.204035758972168, + "learning_rate": 6.950786402841198e-06, + "loss": 0.0491, + "step": 4101 + }, + { + "epoch": 3.7461187214611873, + "grad_norm": 2.7288734912872314, + "learning_rate": 6.9497716894977175e-06, + "loss": 0.0178, + "step": 4102 + }, + { + "epoch": 3.7470319634703197, + "grad_norm": 97.48639678955078, + "learning_rate": 6.948756976154237e-06, + "loss": 0.4591, + "step": 4103 + }, + { + "epoch": 3.747945205479452, + "grad_norm": 22.326244354248047, + "learning_rate": 6.947742262810756e-06, + "loss": 0.2474, + "step": 4104 + }, + { + "epoch": 3.748858447488584, + "grad_norm": 27.69082260131836, + "learning_rate": 6.946727549467276e-06, + "loss": 0.3537, + "step": 4105 + }, + { + "epoch": 3.7497716894977167, + "grad_norm": 67.82952880859375, + "learning_rate": 6.945712836123795e-06, + "loss": 2.4936, + "step": 4106 + }, + { + "epoch": 3.750684931506849, + "grad_norm": 53.81013488769531, + "learning_rate": 6.944698122780316e-06, + "loss": 0.821, + "step": 4107 + }, + { + "epoch": 3.7515981735159816, + "grad_norm": 33.139774322509766, + "learning_rate": 6.943683409436835e-06, + "loss": 0.4335, + "step": 4108 + }, + { + "epoch": 3.752511415525114, + "grad_norm": 43.5211067199707, + "learning_rate": 6.9426686960933545e-06, + "loss": 0.4656, + "step": 4109 + }, + { + "epoch": 3.7534246575342465, + "grad_norm": 5.425942897796631, + "learning_rate": 6.941653982749874e-06, + "loss": 0.0374, + "step": 4110 + }, + { + "epoch": 3.754337899543379, + "grad_norm": 89.70516967773438, + "learning_rate": 6.940639269406393e-06, + "loss": 0.8684, + "step": 4111 + }, + { + "epoch": 3.7552511415525114, + "grad_norm": 47.58772659301758, + "learning_rate": 6.939624556062913e-06, + "loss": 0.5801, + "step": 4112 + }, + { + "epoch": 3.756164383561644, + "grad_norm": 41.39839553833008, + "learning_rate": 6.938609842719432e-06, + "loss": 0.4959, + "step": 4113 + }, + { + "epoch": 3.7570776255707763, + "grad_norm": 3.8638617992401123, + "learning_rate": 6.937595129375951e-06, + "loss": 0.0241, + "step": 4114 + }, + { + "epoch": 3.7579908675799087, + "grad_norm": 12.70724868774414, + "learning_rate": 6.936580416032471e-06, + "loss": 0.057, + "step": 4115 + }, + { + "epoch": 3.758904109589041, + "grad_norm": 38.83985137939453, + "learning_rate": 6.9355657026889914e-06, + "loss": 0.6589, + "step": 4116 + }, + { + "epoch": 3.7598173515981737, + "grad_norm": 2.552283763885498, + "learning_rate": 6.934550989345511e-06, + "loss": 0.0198, + "step": 4117 + }, + { + "epoch": 3.760730593607306, + "grad_norm": 25.230274200439453, + "learning_rate": 6.93353627600203e-06, + "loss": 0.2811, + "step": 4118 + }, + { + "epoch": 3.7616438356164386, + "grad_norm": 4.280491352081299, + "learning_rate": 6.93252156265855e-06, + "loss": 0.0436, + "step": 4119 + }, + { + "epoch": 3.762557077625571, + "grad_norm": 18.130826950073242, + "learning_rate": 6.931506849315069e-06, + "loss": 0.1336, + "step": 4120 + }, + { + "epoch": 3.7634703196347035, + "grad_norm": 10.276589393615723, + "learning_rate": 6.930492135971588e-06, + "loss": 0.1048, + "step": 4121 + }, + { + "epoch": 3.7643835616438355, + "grad_norm": 72.60216522216797, + "learning_rate": 6.929477422628108e-06, + "loss": 0.6905, + "step": 4122 + }, + { + "epoch": 3.765296803652968, + "grad_norm": 67.39918518066406, + "learning_rate": 6.928462709284628e-06, + "loss": 1.7141, + "step": 4123 + }, + { + "epoch": 3.7662100456621004, + "grad_norm": 50.55080032348633, + "learning_rate": 6.9274479959411464e-06, + "loss": 0.5516, + "step": 4124 + }, + { + "epoch": 3.767123287671233, + "grad_norm": 57.02503967285156, + "learning_rate": 6.926433282597667e-06, + "loss": 0.2902, + "step": 4125 + }, + { + "epoch": 3.7680365296803653, + "grad_norm": 81.5577621459961, + "learning_rate": 6.925418569254187e-06, + "loss": 0.9442, + "step": 4126 + }, + { + "epoch": 3.7689497716894977, + "grad_norm": 13.466780662536621, + "learning_rate": 6.924403855910706e-06, + "loss": 0.0966, + "step": 4127 + }, + { + "epoch": 3.76986301369863, + "grad_norm": 5.203613758087158, + "learning_rate": 6.923389142567225e-06, + "loss": 0.0368, + "step": 4128 + }, + { + "epoch": 3.7707762557077626, + "grad_norm": 7.041478633880615, + "learning_rate": 6.922374429223745e-06, + "loss": 0.053, + "step": 4129 + }, + { + "epoch": 3.771689497716895, + "grad_norm": 34.57265853881836, + "learning_rate": 6.921359715880265e-06, + "loss": 0.5573, + "step": 4130 + }, + { + "epoch": 3.7726027397260276, + "grad_norm": 67.06427001953125, + "learning_rate": 6.9203450025367834e-06, + "loss": 0.8817, + "step": 4131 + }, + { + "epoch": 3.7735159817351596, + "grad_norm": 27.10072898864746, + "learning_rate": 6.919330289193303e-06, + "loss": 0.2506, + "step": 4132 + }, + { + "epoch": 3.774429223744292, + "grad_norm": 15.03530502319336, + "learning_rate": 6.918315575849824e-06, + "loss": 0.1373, + "step": 4133 + }, + { + "epoch": 3.7753424657534245, + "grad_norm": 13.9747896194458, + "learning_rate": 6.917300862506342e-06, + "loss": 0.1239, + "step": 4134 + }, + { + "epoch": 3.776255707762557, + "grad_norm": 7.104930400848389, + "learning_rate": 6.916286149162862e-06, + "loss": 0.0676, + "step": 4135 + }, + { + "epoch": 3.7771689497716894, + "grad_norm": 42.40242004394531, + "learning_rate": 6.915271435819382e-06, + "loss": 0.6283, + "step": 4136 + }, + { + "epoch": 3.778082191780822, + "grad_norm": 71.12675476074219, + "learning_rate": 6.914256722475902e-06, + "loss": 1.9308, + "step": 4137 + }, + { + "epoch": 3.7789954337899543, + "grad_norm": 22.905731201171875, + "learning_rate": 6.9132420091324204e-06, + "loss": 0.2487, + "step": 4138 + }, + { + "epoch": 3.7799086757990867, + "grad_norm": 2.0129411220550537, + "learning_rate": 6.91222729578894e-06, + "loss": 0.016, + "step": 4139 + }, + { + "epoch": 3.780821917808219, + "grad_norm": 103.38575744628906, + "learning_rate": 6.91121258244546e-06, + "loss": 1.5068, + "step": 4140 + }, + { + "epoch": 3.7817351598173516, + "grad_norm": 17.0504150390625, + "learning_rate": 6.910197869101979e-06, + "loss": 0.155, + "step": 4141 + }, + { + "epoch": 3.782648401826484, + "grad_norm": 4.239623069763184, + "learning_rate": 6.909183155758498e-06, + "loss": 0.0391, + "step": 4142 + }, + { + "epoch": 3.7835616438356166, + "grad_norm": 28.492765426635742, + "learning_rate": 6.908168442415019e-06, + "loss": 0.2222, + "step": 4143 + }, + { + "epoch": 3.784474885844749, + "grad_norm": 31.74612808227539, + "learning_rate": 6.907153729071538e-06, + "loss": 0.2525, + "step": 4144 + }, + { + "epoch": 3.7853881278538815, + "grad_norm": 0.7095952033996582, + "learning_rate": 6.9061390157280574e-06, + "loss": 0.0059, + "step": 4145 + }, + { + "epoch": 3.786301369863014, + "grad_norm": 48.371482849121094, + "learning_rate": 6.905124302384577e-06, + "loss": 0.6214, + "step": 4146 + }, + { + "epoch": 3.7872146118721464, + "grad_norm": 21.31192970275879, + "learning_rate": 6.904109589041097e-06, + "loss": 0.1476, + "step": 4147 + }, + { + "epoch": 3.7881278538812784, + "grad_norm": 4.7344536781311035, + "learning_rate": 6.903094875697616e-06, + "loss": 0.0484, + "step": 4148 + }, + { + "epoch": 3.789041095890411, + "grad_norm": 49.21636199951172, + "learning_rate": 6.902080162354135e-06, + "loss": 0.1468, + "step": 4149 + }, + { + "epoch": 3.7899543378995433, + "grad_norm": 7.588986396789551, + "learning_rate": 6.901065449010655e-06, + "loss": 0.0596, + "step": 4150 + }, + { + "epoch": 3.7908675799086757, + "grad_norm": 11.906952857971191, + "learning_rate": 6.900050735667174e-06, + "loss": 0.0952, + "step": 4151 + }, + { + "epoch": 3.791780821917808, + "grad_norm": 11.389811515808105, + "learning_rate": 6.8990360223236944e-06, + "loss": 0.1135, + "step": 4152 + }, + { + "epoch": 3.7926940639269406, + "grad_norm": 9.768601417541504, + "learning_rate": 6.898021308980214e-06, + "loss": 0.0873, + "step": 4153 + }, + { + "epoch": 3.793607305936073, + "grad_norm": 81.614013671875, + "learning_rate": 6.897006595636733e-06, + "loss": 0.8236, + "step": 4154 + }, + { + "epoch": 3.7945205479452055, + "grad_norm": 0.14118674397468567, + "learning_rate": 6.895991882293253e-06, + "loss": 0.0016, + "step": 4155 + }, + { + "epoch": 3.795433789954338, + "grad_norm": 90.93199920654297, + "learning_rate": 6.894977168949772e-06, + "loss": 0.7253, + "step": 4156 + }, + { + "epoch": 3.7963470319634705, + "grad_norm": 19.0283203125, + "learning_rate": 6.893962455606292e-06, + "loss": 0.1385, + "step": 4157 + }, + { + "epoch": 3.7972602739726025, + "grad_norm": 51.81993103027344, + "learning_rate": 6.892947742262811e-06, + "loss": 0.545, + "step": 4158 + }, + { + "epoch": 3.798173515981735, + "grad_norm": 48.28123474121094, + "learning_rate": 6.891933028919331e-06, + "loss": 0.3251, + "step": 4159 + }, + { + "epoch": 3.7990867579908674, + "grad_norm": 50.32973861694336, + "learning_rate": 6.890918315575851e-06, + "loss": 0.8023, + "step": 4160 + }, + { + "epoch": 3.8, + "grad_norm": 66.84577178955078, + "learning_rate": 6.889903602232369e-06, + "loss": 0.406, + "step": 4161 + }, + { + "epoch": 3.8009132420091323, + "grad_norm": 11.802168846130371, + "learning_rate": 6.88888888888889e-06, + "loss": 0.1242, + "step": 4162 + }, + { + "epoch": 3.8018264840182647, + "grad_norm": 30.63341522216797, + "learning_rate": 6.887874175545409e-06, + "loss": 0.4132, + "step": 4163 + }, + { + "epoch": 3.802739726027397, + "grad_norm": 39.16127014160156, + "learning_rate": 6.886859462201928e-06, + "loss": 0.4794, + "step": 4164 + }, + { + "epoch": 3.8036529680365296, + "grad_norm": 81.10557556152344, + "learning_rate": 6.885844748858448e-06, + "loss": 0.6374, + "step": 4165 + }, + { + "epoch": 3.804566210045662, + "grad_norm": 5.324373722076416, + "learning_rate": 6.884830035514968e-06, + "loss": 0.0532, + "step": 4166 + }, + { + "epoch": 3.8054794520547945, + "grad_norm": 14.080344200134277, + "learning_rate": 6.883815322171487e-06, + "loss": 0.1114, + "step": 4167 + }, + { + "epoch": 3.806392694063927, + "grad_norm": 32.56422424316406, + "learning_rate": 6.882800608828006e-06, + "loss": 0.2973, + "step": 4168 + }, + { + "epoch": 3.8073059360730594, + "grad_norm": 96.63970184326172, + "learning_rate": 6.881785895484527e-06, + "loss": 0.6242, + "step": 4169 + }, + { + "epoch": 3.808219178082192, + "grad_norm": 6.857370853424072, + "learning_rate": 6.880771182141046e-06, + "loss": 0.0467, + "step": 4170 + }, + { + "epoch": 3.8091324200913244, + "grad_norm": 0.08474837243556976, + "learning_rate": 6.879756468797565e-06, + "loss": 0.0008, + "step": 4171 + }, + { + "epoch": 3.810045662100457, + "grad_norm": 80.91785430908203, + "learning_rate": 6.878741755454085e-06, + "loss": 0.3092, + "step": 4172 + }, + { + "epoch": 3.8109589041095893, + "grad_norm": 38.17671585083008, + "learning_rate": 6.877727042110605e-06, + "loss": 0.2474, + "step": 4173 + }, + { + "epoch": 3.8118721461187217, + "grad_norm": 111.60735321044922, + "learning_rate": 6.876712328767123e-06, + "loss": 0.8631, + "step": 4174 + }, + { + "epoch": 3.8127853881278537, + "grad_norm": 53.80270004272461, + "learning_rate": 6.875697615423643e-06, + "loss": 0.4674, + "step": 4175 + }, + { + "epoch": 3.813698630136986, + "grad_norm": 88.536865234375, + "learning_rate": 6.874682902080163e-06, + "loss": 0.9183, + "step": 4176 + }, + { + "epoch": 3.8146118721461186, + "grad_norm": 75.49354553222656, + "learning_rate": 6.873668188736683e-06, + "loss": 0.6961, + "step": 4177 + }, + { + "epoch": 3.815525114155251, + "grad_norm": 0.6336660981178284, + "learning_rate": 6.872653475393201e-06, + "loss": 0.0053, + "step": 4178 + }, + { + "epoch": 3.8164383561643835, + "grad_norm": 4.276423931121826, + "learning_rate": 6.871638762049722e-06, + "loss": 0.0323, + "step": 4179 + }, + { + "epoch": 3.817351598173516, + "grad_norm": 74.5279769897461, + "learning_rate": 6.8706240487062416e-06, + "loss": 0.5371, + "step": 4180 + }, + { + "epoch": 3.8182648401826484, + "grad_norm": 5.601336479187012, + "learning_rate": 6.86960933536276e-06, + "loss": 0.0407, + "step": 4181 + }, + { + "epoch": 3.819178082191781, + "grad_norm": 1.9987716674804688, + "learning_rate": 6.86859462201928e-06, + "loss": 0.0208, + "step": 4182 + }, + { + "epoch": 3.8200913242009134, + "grad_norm": 3.0787851810455322, + "learning_rate": 6.8675799086758e-06, + "loss": 0.023, + "step": 4183 + }, + { + "epoch": 3.821004566210046, + "grad_norm": 9.664642333984375, + "learning_rate": 6.866565195332319e-06, + "loss": 0.0269, + "step": 4184 + }, + { + "epoch": 3.821917808219178, + "grad_norm": 60.211708068847656, + "learning_rate": 6.865550481988838e-06, + "loss": 0.6387, + "step": 4185 + }, + { + "epoch": 3.8228310502283103, + "grad_norm": 2.9773316383361816, + "learning_rate": 6.864535768645358e-06, + "loss": 0.0222, + "step": 4186 + }, + { + "epoch": 3.8237442922374427, + "grad_norm": 85.91699981689453, + "learning_rate": 6.8635210553018786e-06, + "loss": 2.5961, + "step": 4187 + }, + { + "epoch": 3.824657534246575, + "grad_norm": 0.855254054069519, + "learning_rate": 6.862506341958397e-06, + "loss": 0.0057, + "step": 4188 + }, + { + "epoch": 3.8255707762557076, + "grad_norm": 277.4908447265625, + "learning_rate": 6.861491628614917e-06, + "loss": 0.5282, + "step": 4189 + }, + { + "epoch": 3.82648401826484, + "grad_norm": 1.867969036102295, + "learning_rate": 6.860476915271437e-06, + "loss": 0.0125, + "step": 4190 + }, + { + "epoch": 3.8273972602739725, + "grad_norm": 80.42984771728516, + "learning_rate": 6.859462201927956e-06, + "loss": 1.4073, + "step": 4191 + }, + { + "epoch": 3.828310502283105, + "grad_norm": 37.31423568725586, + "learning_rate": 6.858447488584475e-06, + "loss": 0.3299, + "step": 4192 + }, + { + "epoch": 3.8292237442922374, + "grad_norm": 5.631241321563721, + "learning_rate": 6.857432775240995e-06, + "loss": 0.0389, + "step": 4193 + }, + { + "epoch": 3.83013698630137, + "grad_norm": 13.508816719055176, + "learning_rate": 6.856418061897514e-06, + "loss": 0.0985, + "step": 4194 + }, + { + "epoch": 3.8310502283105023, + "grad_norm": 1.3727188110351562, + "learning_rate": 6.8554033485540336e-06, + "loss": 0.0105, + "step": 4195 + }, + { + "epoch": 3.831963470319635, + "grad_norm": 7.146186351776123, + "learning_rate": 6.854388635210554e-06, + "loss": 0.0527, + "step": 4196 + }, + { + "epoch": 3.8328767123287673, + "grad_norm": 8.883960723876953, + "learning_rate": 6.853373921867074e-06, + "loss": 0.0928, + "step": 4197 + }, + { + "epoch": 3.8337899543378997, + "grad_norm": 0.892481803894043, + "learning_rate": 6.852359208523593e-06, + "loss": 0.0063, + "step": 4198 + }, + { + "epoch": 3.834703196347032, + "grad_norm": 2.113551616668701, + "learning_rate": 6.851344495180112e-06, + "loss": 0.0194, + "step": 4199 + }, + { + "epoch": 3.8356164383561646, + "grad_norm": 30.46528434753418, + "learning_rate": 6.850329781836632e-06, + "loss": 0.2386, + "step": 4200 + }, + { + "epoch": 3.836529680365297, + "grad_norm": 28.855754852294922, + "learning_rate": 6.849315068493151e-06, + "loss": 0.256, + "step": 4201 + }, + { + "epoch": 3.837442922374429, + "grad_norm": 10.826168060302734, + "learning_rate": 6.8483003551496706e-06, + "loss": 0.0903, + "step": 4202 + }, + { + "epoch": 3.8383561643835615, + "grad_norm": 10.874220848083496, + "learning_rate": 6.84728564180619e-06, + "loss": 0.0688, + "step": 4203 + }, + { + "epoch": 3.839269406392694, + "grad_norm": 80.93440246582031, + "learning_rate": 6.846270928462709e-06, + "loss": 2.8431, + "step": 4204 + }, + { + "epoch": 3.8401826484018264, + "grad_norm": 67.75418853759766, + "learning_rate": 6.84525621511923e-06, + "loss": 1.3814, + "step": 4205 + }, + { + "epoch": 3.841095890410959, + "grad_norm": 7.1154279708862305, + "learning_rate": 6.844241501775749e-06, + "loss": 0.0653, + "step": 4206 + }, + { + "epoch": 3.8420091324200913, + "grad_norm": 4.789438724517822, + "learning_rate": 6.843226788432269e-06, + "loss": 0.0336, + "step": 4207 + }, + { + "epoch": 3.842922374429224, + "grad_norm": 15.806500434875488, + "learning_rate": 6.842212075088788e-06, + "loss": 0.1022, + "step": 4208 + }, + { + "epoch": 3.8438356164383563, + "grad_norm": 8.38022232055664, + "learning_rate": 6.8411973617453076e-06, + "loss": 0.0881, + "step": 4209 + }, + { + "epoch": 3.8447488584474887, + "grad_norm": 7.858353614807129, + "learning_rate": 6.840182648401827e-06, + "loss": 0.0413, + "step": 4210 + }, + { + "epoch": 3.845662100456621, + "grad_norm": 37.5943717956543, + "learning_rate": 6.839167935058346e-06, + "loss": 0.3872, + "step": 4211 + }, + { + "epoch": 3.846575342465753, + "grad_norm": 10.368494033813477, + "learning_rate": 6.838153221714866e-06, + "loss": 0.0798, + "step": 4212 + }, + { + "epoch": 3.8474885844748856, + "grad_norm": 1.6636970043182373, + "learning_rate": 6.837138508371386e-06, + "loss": 0.0087, + "step": 4213 + }, + { + "epoch": 3.848401826484018, + "grad_norm": 18.152326583862305, + "learning_rate": 6.836123795027904e-06, + "loss": 0.1798, + "step": 4214 + }, + { + "epoch": 3.8493150684931505, + "grad_norm": 14.31734561920166, + "learning_rate": 6.835109081684425e-06, + "loss": 0.0991, + "step": 4215 + }, + { + "epoch": 3.850228310502283, + "grad_norm": 0.3662695288658142, + "learning_rate": 6.8340943683409446e-06, + "loss": 0.0023, + "step": 4216 + }, + { + "epoch": 3.8511415525114154, + "grad_norm": 6.867833137512207, + "learning_rate": 6.833079654997464e-06, + "loss": 0.062, + "step": 4217 + }, + { + "epoch": 3.852054794520548, + "grad_norm": 26.68232536315918, + "learning_rate": 6.832064941653983e-06, + "loss": 0.1983, + "step": 4218 + }, + { + "epoch": 3.8529680365296803, + "grad_norm": 94.62554931640625, + "learning_rate": 6.831050228310503e-06, + "loss": 2.2307, + "step": 4219 + }, + { + "epoch": 3.853881278538813, + "grad_norm": 65.51835632324219, + "learning_rate": 6.8300355149670225e-06, + "loss": 1.6007, + "step": 4220 + }, + { + "epoch": 3.8547945205479452, + "grad_norm": 80.72303771972656, + "learning_rate": 6.829020801623541e-06, + "loss": 2.1323, + "step": 4221 + }, + { + "epoch": 3.8557077625570777, + "grad_norm": 21.59514617919922, + "learning_rate": 6.828006088280061e-06, + "loss": 0.1553, + "step": 4222 + }, + { + "epoch": 3.85662100456621, + "grad_norm": 7.938109397888184, + "learning_rate": 6.8269913749365816e-06, + "loss": 0.0606, + "step": 4223 + }, + { + "epoch": 3.8575342465753426, + "grad_norm": 39.692813873291016, + "learning_rate": 6.8259766615931e-06, + "loss": 0.446, + "step": 4224 + }, + { + "epoch": 3.858447488584475, + "grad_norm": 81.62430572509766, + "learning_rate": 6.82496194824962e-06, + "loss": 1.3954, + "step": 4225 + }, + { + "epoch": 3.8593607305936075, + "grad_norm": 65.53504943847656, + "learning_rate": 6.82394723490614e-06, + "loss": 1.3144, + "step": 4226 + }, + { + "epoch": 3.86027397260274, + "grad_norm": 6.922418117523193, + "learning_rate": 6.8229325215626595e-06, + "loss": 0.0524, + "step": 4227 + }, + { + "epoch": 3.8611872146118724, + "grad_norm": 17.13482093811035, + "learning_rate": 6.821917808219178e-06, + "loss": 0.1582, + "step": 4228 + }, + { + "epoch": 3.8621004566210044, + "grad_norm": 8.93890380859375, + "learning_rate": 6.820903094875698e-06, + "loss": 0.0941, + "step": 4229 + }, + { + "epoch": 3.863013698630137, + "grad_norm": 38.4453125, + "learning_rate": 6.819888381532218e-06, + "loss": 0.3926, + "step": 4230 + }, + { + "epoch": 3.8639269406392693, + "grad_norm": 7.556799411773682, + "learning_rate": 6.8188736681887366e-06, + "loss": 0.0722, + "step": 4231 + }, + { + "epoch": 3.864840182648402, + "grad_norm": 23.75188636779785, + "learning_rate": 6.817858954845257e-06, + "loss": 0.2251, + "step": 4232 + }, + { + "epoch": 3.8657534246575342, + "grad_norm": 87.07123565673828, + "learning_rate": 6.816844241501777e-06, + "loss": 1.8912, + "step": 4233 + }, + { + "epoch": 3.8666666666666667, + "grad_norm": 21.695959091186523, + "learning_rate": 6.815829528158296e-06, + "loss": 0.1316, + "step": 4234 + }, + { + "epoch": 3.867579908675799, + "grad_norm": 0.4320295751094818, + "learning_rate": 6.814814814814815e-06, + "loss": 0.0047, + "step": 4235 + }, + { + "epoch": 3.8684931506849316, + "grad_norm": 6.295693874359131, + "learning_rate": 6.813800101471335e-06, + "loss": 0.047, + "step": 4236 + }, + { + "epoch": 3.869406392694064, + "grad_norm": 43.295528411865234, + "learning_rate": 6.812785388127855e-06, + "loss": 0.4172, + "step": 4237 + }, + { + "epoch": 3.8703196347031965, + "grad_norm": 10.469178199768066, + "learning_rate": 6.8117706747843736e-06, + "loss": 0.1002, + "step": 4238 + }, + { + "epoch": 3.8712328767123285, + "grad_norm": 78.80437469482422, + "learning_rate": 6.810755961440893e-06, + "loss": 2.2272, + "step": 4239 + }, + { + "epoch": 3.872146118721461, + "grad_norm": 0.46431002020835876, + "learning_rate": 6.809741248097414e-06, + "loss": 0.0044, + "step": 4240 + }, + { + "epoch": 3.8730593607305934, + "grad_norm": 10.994425773620605, + "learning_rate": 6.808726534753932e-06, + "loss": 0.1036, + "step": 4241 + }, + { + "epoch": 3.873972602739726, + "grad_norm": 13.867098808288574, + "learning_rate": 6.807711821410452e-06, + "loss": 0.0896, + "step": 4242 + }, + { + "epoch": 3.8748858447488583, + "grad_norm": 14.171052932739258, + "learning_rate": 6.806697108066972e-06, + "loss": 0.1356, + "step": 4243 + }, + { + "epoch": 3.875799086757991, + "grad_norm": 5.951180458068848, + "learning_rate": 6.805682394723491e-06, + "loss": 0.0427, + "step": 4244 + }, + { + "epoch": 3.8767123287671232, + "grad_norm": 7.920841217041016, + "learning_rate": 6.8046676813800105e-06, + "loss": 0.0852, + "step": 4245 + }, + { + "epoch": 3.8776255707762557, + "grad_norm": 66.16240692138672, + "learning_rate": 6.80365296803653e-06, + "loss": 0.6279, + "step": 4246 + }, + { + "epoch": 3.878538812785388, + "grad_norm": 25.445606231689453, + "learning_rate": 6.80263825469305e-06, + "loss": 0.3469, + "step": 4247 + }, + { + "epoch": 3.8794520547945206, + "grad_norm": 65.69315338134766, + "learning_rate": 6.801623541349569e-06, + "loss": 0.9893, + "step": 4248 + }, + { + "epoch": 3.880365296803653, + "grad_norm": 87.05208587646484, + "learning_rate": 6.800608828006089e-06, + "loss": 1.1211, + "step": 4249 + }, + { + "epoch": 3.8812785388127855, + "grad_norm": 82.93859100341797, + "learning_rate": 6.799594114662609e-06, + "loss": 0.4593, + "step": 4250 + }, + { + "epoch": 3.882191780821918, + "grad_norm": 20.92125701904297, + "learning_rate": 6.798579401319128e-06, + "loss": 0.1845, + "step": 4251 + }, + { + "epoch": 3.8831050228310504, + "grad_norm": 15.88623332977295, + "learning_rate": 6.7975646879756475e-06, + "loss": 0.1844, + "step": 4252 + }, + { + "epoch": 3.884018264840183, + "grad_norm": 1.3300421237945557, + "learning_rate": 6.796549974632167e-06, + "loss": 0.0104, + "step": 4253 + }, + { + "epoch": 3.8849315068493153, + "grad_norm": 2.3516130447387695, + "learning_rate": 6.795535261288686e-06, + "loss": 0.0139, + "step": 4254 + }, + { + "epoch": 3.8858447488584473, + "grad_norm": 31.851987838745117, + "learning_rate": 6.794520547945206e-06, + "loss": 0.2418, + "step": 4255 + }, + { + "epoch": 3.88675799086758, + "grad_norm": 11.601852416992188, + "learning_rate": 6.7935058346017255e-06, + "loss": 0.1088, + "step": 4256 + }, + { + "epoch": 3.8876712328767122, + "grad_norm": 62.182525634765625, + "learning_rate": 6.792491121258246e-06, + "loss": 0.5271, + "step": 4257 + }, + { + "epoch": 3.8885844748858447, + "grad_norm": 0.8803709149360657, + "learning_rate": 6.791476407914764e-06, + "loss": 0.0067, + "step": 4258 + }, + { + "epoch": 3.889497716894977, + "grad_norm": 34.51132583618164, + "learning_rate": 6.7904616945712845e-06, + "loss": 0.4087, + "step": 4259 + }, + { + "epoch": 3.8904109589041096, + "grad_norm": 0.24422675371170044, + "learning_rate": 6.789446981227804e-06, + "loss": 0.0016, + "step": 4260 + }, + { + "epoch": 3.891324200913242, + "grad_norm": 9.694807052612305, + "learning_rate": 6.788432267884323e-06, + "loss": 0.0894, + "step": 4261 + }, + { + "epoch": 3.8922374429223745, + "grad_norm": 10.546720504760742, + "learning_rate": 6.787417554540843e-06, + "loss": 0.0826, + "step": 4262 + }, + { + "epoch": 3.893150684931507, + "grad_norm": 91.55973052978516, + "learning_rate": 6.7864028411973625e-06, + "loss": 0.9804, + "step": 4263 + }, + { + "epoch": 3.8940639269406394, + "grad_norm": 145.73501586914062, + "learning_rate": 6.785388127853881e-06, + "loss": 1.3537, + "step": 4264 + }, + { + "epoch": 3.8949771689497714, + "grad_norm": 87.34558868408203, + "learning_rate": 6.784373414510401e-06, + "loss": 1.1207, + "step": 4265 + }, + { + "epoch": 3.895890410958904, + "grad_norm": 28.515798568725586, + "learning_rate": 6.783358701166921e-06, + "loss": 0.3008, + "step": 4266 + }, + { + "epoch": 3.8968036529680363, + "grad_norm": 34.35063171386719, + "learning_rate": 6.782343987823441e-06, + "loss": 0.2381, + "step": 4267 + }, + { + "epoch": 3.8977168949771688, + "grad_norm": 19.35005760192871, + "learning_rate": 6.78132927447996e-06, + "loss": 0.1946, + "step": 4268 + }, + { + "epoch": 3.8986301369863012, + "grad_norm": 25.33070182800293, + "learning_rate": 6.78031456113648e-06, + "loss": 0.1853, + "step": 4269 + }, + { + "epoch": 3.8995433789954337, + "grad_norm": 32.388118743896484, + "learning_rate": 6.7792998477929995e-06, + "loss": 0.1849, + "step": 4270 + }, + { + "epoch": 3.900456621004566, + "grad_norm": 36.979393005371094, + "learning_rate": 6.778285134449518e-06, + "loss": 0.2766, + "step": 4271 + }, + { + "epoch": 3.9013698630136986, + "grad_norm": 40.81441116333008, + "learning_rate": 6.777270421106038e-06, + "loss": 0.6241, + "step": 4272 + }, + { + "epoch": 3.902283105022831, + "grad_norm": 7.545085906982422, + "learning_rate": 6.776255707762558e-06, + "loss": 0.0412, + "step": 4273 + }, + { + "epoch": 3.9031963470319635, + "grad_norm": 11.973668098449707, + "learning_rate": 6.7752409944190765e-06, + "loss": 0.1021, + "step": 4274 + }, + { + "epoch": 3.904109589041096, + "grad_norm": 76.30376434326172, + "learning_rate": 6.774226281075596e-06, + "loss": 0.2782, + "step": 4275 + }, + { + "epoch": 3.9050228310502284, + "grad_norm": 4.77261209487915, + "learning_rate": 6.773211567732117e-06, + "loss": 0.0306, + "step": 4276 + }, + { + "epoch": 3.905936073059361, + "grad_norm": 48.74757766723633, + "learning_rate": 6.7721968543886365e-06, + "loss": 0.4541, + "step": 4277 + }, + { + "epoch": 3.9068493150684933, + "grad_norm": 26.886003494262695, + "learning_rate": 6.771182141045155e-06, + "loss": 0.2916, + "step": 4278 + }, + { + "epoch": 3.9077625570776258, + "grad_norm": 8.2391939163208, + "learning_rate": 6.770167427701675e-06, + "loss": 0.0768, + "step": 4279 + }, + { + "epoch": 3.908675799086758, + "grad_norm": 19.315074920654297, + "learning_rate": 6.769152714358195e-06, + "loss": 0.1871, + "step": 4280 + }, + { + "epoch": 3.9095890410958907, + "grad_norm": 5.4190192222595215, + "learning_rate": 6.7681380010147135e-06, + "loss": 0.0602, + "step": 4281 + }, + { + "epoch": 3.9105022831050227, + "grad_norm": 119.4616928100586, + "learning_rate": 6.767123287671233e-06, + "loss": 4.112, + "step": 4282 + }, + { + "epoch": 3.911415525114155, + "grad_norm": 7.274829387664795, + "learning_rate": 6.766108574327753e-06, + "loss": 0.0444, + "step": 4283 + }, + { + "epoch": 3.9123287671232876, + "grad_norm": 24.031593322753906, + "learning_rate": 6.765093860984272e-06, + "loss": 0.2027, + "step": 4284 + }, + { + "epoch": 3.91324200913242, + "grad_norm": 67.71493530273438, + "learning_rate": 6.7640791476407915e-06, + "loss": 0.6614, + "step": 4285 + }, + { + "epoch": 3.9141552511415525, + "grad_norm": 36.378414154052734, + "learning_rate": 6.763064434297312e-06, + "loss": 0.4806, + "step": 4286 + }, + { + "epoch": 3.915068493150685, + "grad_norm": 23.688112258911133, + "learning_rate": 6.762049720953832e-06, + "loss": 0.29, + "step": 4287 + }, + { + "epoch": 3.9159817351598174, + "grad_norm": 24.015331268310547, + "learning_rate": 6.7610350076103505e-06, + "loss": 0.1682, + "step": 4288 + }, + { + "epoch": 3.91689497716895, + "grad_norm": 4.418943405151367, + "learning_rate": 6.76002029426687e-06, + "loss": 0.0388, + "step": 4289 + }, + { + "epoch": 3.9178082191780823, + "grad_norm": 1.044451117515564, + "learning_rate": 6.75900558092339e-06, + "loss": 0.0074, + "step": 4290 + }, + { + "epoch": 3.9187214611872148, + "grad_norm": 0.6328316330909729, + "learning_rate": 6.757990867579909e-06, + "loss": 0.005, + "step": 4291 + }, + { + "epoch": 3.9196347031963468, + "grad_norm": 10.499855041503906, + "learning_rate": 6.7569761542364285e-06, + "loss": 0.0823, + "step": 4292 + }, + { + "epoch": 3.9205479452054792, + "grad_norm": 6.150994777679443, + "learning_rate": 6.755961440892949e-06, + "loss": 0.034, + "step": 4293 + }, + { + "epoch": 3.9214611872146117, + "grad_norm": 19.57874298095703, + "learning_rate": 6.754946727549467e-06, + "loss": 0.1468, + "step": 4294 + }, + { + "epoch": 3.922374429223744, + "grad_norm": 40.14896774291992, + "learning_rate": 6.7539320142059875e-06, + "loss": 0.1455, + "step": 4295 + }, + { + "epoch": 3.9232876712328766, + "grad_norm": 28.922378540039062, + "learning_rate": 6.752917300862507e-06, + "loss": 0.24, + "step": 4296 + }, + { + "epoch": 3.924200913242009, + "grad_norm": 105.78565216064453, + "learning_rate": 6.751902587519027e-06, + "loss": 1.7377, + "step": 4297 + }, + { + "epoch": 3.9251141552511415, + "grad_norm": 21.722854614257812, + "learning_rate": 6.750887874175546e-06, + "loss": 0.1717, + "step": 4298 + }, + { + "epoch": 3.926027397260274, + "grad_norm": 39.59685134887695, + "learning_rate": 6.7498731608320655e-06, + "loss": 0.1845, + "step": 4299 + }, + { + "epoch": 3.9269406392694064, + "grad_norm": 1.2466681003570557, + "learning_rate": 6.748858447488585e-06, + "loss": 0.0116, + "step": 4300 + }, + { + "epoch": 3.927853881278539, + "grad_norm": 31.58193016052246, + "learning_rate": 6.747843734145104e-06, + "loss": 0.3089, + "step": 4301 + }, + { + "epoch": 3.9287671232876713, + "grad_norm": 36.916297912597656, + "learning_rate": 6.746829020801624e-06, + "loss": 0.3519, + "step": 4302 + }, + { + "epoch": 3.9296803652968038, + "grad_norm": 3.133134603500366, + "learning_rate": 6.745814307458144e-06, + "loss": 0.023, + "step": 4303 + }, + { + "epoch": 3.930593607305936, + "grad_norm": 32.42827224731445, + "learning_rate": 6.744799594114663e-06, + "loss": 0.2679, + "step": 4304 + }, + { + "epoch": 3.9315068493150687, + "grad_norm": 11.868208885192871, + "learning_rate": 6.743784880771183e-06, + "loss": 0.1424, + "step": 4305 + }, + { + "epoch": 3.932420091324201, + "grad_norm": 80.53726196289062, + "learning_rate": 6.7427701674277024e-06, + "loss": 0.4972, + "step": 4306 + }, + { + "epoch": 3.9333333333333336, + "grad_norm": 3.488868474960327, + "learning_rate": 6.741755454084222e-06, + "loss": 0.0337, + "step": 4307 + }, + { + "epoch": 3.934246575342466, + "grad_norm": 7.605586528778076, + "learning_rate": 6.740740740740741e-06, + "loss": 0.0388, + "step": 4308 + }, + { + "epoch": 3.935159817351598, + "grad_norm": 7.6224541664123535, + "learning_rate": 6.739726027397261e-06, + "loss": 0.0441, + "step": 4309 + }, + { + "epoch": 3.9360730593607305, + "grad_norm": 1.0749619007110596, + "learning_rate": 6.73871131405378e-06, + "loss": 0.0079, + "step": 4310 + }, + { + "epoch": 3.936986301369863, + "grad_norm": 71.8794937133789, + "learning_rate": 6.737696600710299e-06, + "loss": 1.0752, + "step": 4311 + }, + { + "epoch": 3.9378995433789954, + "grad_norm": 22.104257583618164, + "learning_rate": 6.73668188736682e-06, + "loss": 0.1504, + "step": 4312 + }, + { + "epoch": 3.938812785388128, + "grad_norm": 19.212215423583984, + "learning_rate": 6.7356671740233394e-06, + "loss": 0.157, + "step": 4313 + }, + { + "epoch": 3.9397260273972603, + "grad_norm": 91.52999877929688, + "learning_rate": 6.734652460679858e-06, + "loss": 0.7026, + "step": 4314 + }, + { + "epoch": 3.9406392694063928, + "grad_norm": 1.034664273262024, + "learning_rate": 6.733637747336378e-06, + "loss": 0.0101, + "step": 4315 + }, + { + "epoch": 3.941552511415525, + "grad_norm": 19.3087158203125, + "learning_rate": 6.732623033992898e-06, + "loss": 0.2195, + "step": 4316 + }, + { + "epoch": 3.9424657534246577, + "grad_norm": 0.6011020541191101, + "learning_rate": 6.731608320649417e-06, + "loss": 0.0053, + "step": 4317 + }, + { + "epoch": 3.94337899543379, + "grad_norm": 44.321414947509766, + "learning_rate": 6.730593607305936e-06, + "loss": 0.2572, + "step": 4318 + }, + { + "epoch": 3.944292237442922, + "grad_norm": 20.596702575683594, + "learning_rate": 6.729578893962456e-06, + "loss": 0.1728, + "step": 4319 + }, + { + "epoch": 3.9452054794520546, + "grad_norm": 30.044042587280273, + "learning_rate": 6.7285641806189764e-06, + "loss": 0.2433, + "step": 4320 + }, + { + "epoch": 3.946118721461187, + "grad_norm": 2.2727675437927246, + "learning_rate": 6.7275494672754944e-06, + "loss": 0.0224, + "step": 4321 + }, + { + "epoch": 3.9470319634703195, + "grad_norm": 0.6288885474205017, + "learning_rate": 6.726534753932015e-06, + "loss": 0.0044, + "step": 4322 + }, + { + "epoch": 3.947945205479452, + "grad_norm": 0.2791077196598053, + "learning_rate": 6.725520040588535e-06, + "loss": 0.0022, + "step": 4323 + }, + { + "epoch": 3.9488584474885844, + "grad_norm": 13.12707233428955, + "learning_rate": 6.7245053272450535e-06, + "loss": 0.0947, + "step": 4324 + }, + { + "epoch": 3.949771689497717, + "grad_norm": 18.0816593170166, + "learning_rate": 6.723490613901573e-06, + "loss": 0.1634, + "step": 4325 + }, + { + "epoch": 3.9506849315068493, + "grad_norm": 0.5879494547843933, + "learning_rate": 6.722475900558093e-06, + "loss": 0.0047, + "step": 4326 + }, + { + "epoch": 3.9515981735159817, + "grad_norm": 5.563528060913086, + "learning_rate": 6.721461187214613e-06, + "loss": 0.0498, + "step": 4327 + }, + { + "epoch": 3.952511415525114, + "grad_norm": 4.876616477966309, + "learning_rate": 6.7204464738711314e-06, + "loss": 0.0489, + "step": 4328 + }, + { + "epoch": 3.9534246575342467, + "grad_norm": 6.339224338531494, + "learning_rate": 6.719431760527652e-06, + "loss": 0.0382, + "step": 4329 + }, + { + "epoch": 3.954337899543379, + "grad_norm": 1.7836486101150513, + "learning_rate": 6.718417047184172e-06, + "loss": 0.0179, + "step": 4330 + }, + { + "epoch": 3.9552511415525116, + "grad_norm": 1.3074904680252075, + "learning_rate": 6.7174023338406905e-06, + "loss": 0.0103, + "step": 4331 + }, + { + "epoch": 3.956164383561644, + "grad_norm": 11.216903686523438, + "learning_rate": 6.71638762049721e-06, + "loss": 0.077, + "step": 4332 + }, + { + "epoch": 3.9570776255707765, + "grad_norm": 9.743718147277832, + "learning_rate": 6.71537290715373e-06, + "loss": 0.0545, + "step": 4333 + }, + { + "epoch": 3.957990867579909, + "grad_norm": 63.19379425048828, + "learning_rate": 6.714358193810249e-06, + "loss": 0.9966, + "step": 4334 + }, + { + "epoch": 3.958904109589041, + "grad_norm": 36.86085891723633, + "learning_rate": 6.7133434804667684e-06, + "loss": 0.2633, + "step": 4335 + }, + { + "epoch": 3.9598173515981734, + "grad_norm": 2.444204092025757, + "learning_rate": 6.712328767123288e-06, + "loss": 0.0179, + "step": 4336 + }, + { + "epoch": 3.960730593607306, + "grad_norm": 47.57561492919922, + "learning_rate": 6.711314053779809e-06, + "loss": 0.8144, + "step": 4337 + }, + { + "epoch": 3.9616438356164383, + "grad_norm": 2.150628089904785, + "learning_rate": 6.710299340436327e-06, + "loss": 0.0142, + "step": 4338 + }, + { + "epoch": 3.9625570776255707, + "grad_norm": 1.4354990720748901, + "learning_rate": 6.709284627092847e-06, + "loss": 0.0119, + "step": 4339 + }, + { + "epoch": 3.963470319634703, + "grad_norm": 132.86793518066406, + "learning_rate": 6.708269913749367e-06, + "loss": 0.6347, + "step": 4340 + }, + { + "epoch": 3.9643835616438357, + "grad_norm": 2.479013204574585, + "learning_rate": 6.707255200405886e-06, + "loss": 0.0132, + "step": 4341 + }, + { + "epoch": 3.965296803652968, + "grad_norm": 3.937727451324463, + "learning_rate": 6.7062404870624054e-06, + "loss": 0.0281, + "step": 4342 + }, + { + "epoch": 3.9662100456621006, + "grad_norm": 48.693668365478516, + "learning_rate": 6.705225773718925e-06, + "loss": 0.3122, + "step": 4343 + }, + { + "epoch": 3.967123287671233, + "grad_norm": 5.347801685333252, + "learning_rate": 6.704211060375444e-06, + "loss": 0.0303, + "step": 4344 + }, + { + "epoch": 3.968036529680365, + "grad_norm": 9.527832984924316, + "learning_rate": 6.703196347031964e-06, + "loss": 0.0668, + "step": 4345 + }, + { + "epoch": 3.9689497716894975, + "grad_norm": 39.94521713256836, + "learning_rate": 6.702181633688483e-06, + "loss": 0.2408, + "step": 4346 + }, + { + "epoch": 3.96986301369863, + "grad_norm": 48.19981002807617, + "learning_rate": 6.701166920345004e-06, + "loss": 0.8693, + "step": 4347 + }, + { + "epoch": 3.9707762557077624, + "grad_norm": 12.122493743896484, + "learning_rate": 6.700152207001523e-06, + "loss": 0.1206, + "step": 4348 + }, + { + "epoch": 3.971689497716895, + "grad_norm": 44.496768951416016, + "learning_rate": 6.6991374936580424e-06, + "loss": 0.3438, + "step": 4349 + }, + { + "epoch": 3.9726027397260273, + "grad_norm": 81.839599609375, + "learning_rate": 6.698122780314562e-06, + "loss": 0.3867, + "step": 4350 + }, + { + "epoch": 3.9735159817351597, + "grad_norm": 0.6325887441635132, + "learning_rate": 6.697108066971081e-06, + "loss": 0.0059, + "step": 4351 + }, + { + "epoch": 3.974429223744292, + "grad_norm": 86.35652923583984, + "learning_rate": 6.696093353627601e-06, + "loss": 0.0897, + "step": 4352 + }, + { + "epoch": 3.9753424657534246, + "grad_norm": 87.46035766601562, + "learning_rate": 6.69507864028412e-06, + "loss": 1.3576, + "step": 4353 + }, + { + "epoch": 3.976255707762557, + "grad_norm": 16.7125186920166, + "learning_rate": 6.694063926940639e-06, + "loss": 0.0914, + "step": 4354 + }, + { + "epoch": 3.9771689497716896, + "grad_norm": 2.724289894104004, + "learning_rate": 6.693049213597159e-06, + "loss": 0.0124, + "step": 4355 + }, + { + "epoch": 3.978082191780822, + "grad_norm": 5.211586952209473, + "learning_rate": 6.6920345002536794e-06, + "loss": 0.0377, + "step": 4356 + }, + { + "epoch": 3.9789954337899545, + "grad_norm": 8.124407768249512, + "learning_rate": 6.691019786910199e-06, + "loss": 0.061, + "step": 4357 + }, + { + "epoch": 3.979908675799087, + "grad_norm": 98.47388458251953, + "learning_rate": 6.690005073566718e-06, + "loss": 2.1717, + "step": 4358 + }, + { + "epoch": 3.9808219178082194, + "grad_norm": 21.272846221923828, + "learning_rate": 6.688990360223238e-06, + "loss": 0.1271, + "step": 4359 + }, + { + "epoch": 3.981735159817352, + "grad_norm": 0.27824869751930237, + "learning_rate": 6.687975646879757e-06, + "loss": 0.002, + "step": 4360 + }, + { + "epoch": 3.9826484018264843, + "grad_norm": 5.492011070251465, + "learning_rate": 6.686960933536276e-06, + "loss": 0.0332, + "step": 4361 + }, + { + "epoch": 3.9835616438356163, + "grad_norm": 44.3621711730957, + "learning_rate": 6.685946220192796e-06, + "loss": 0.2909, + "step": 4362 + }, + { + "epoch": 3.9844748858447487, + "grad_norm": 34.36272430419922, + "learning_rate": 6.684931506849316e-06, + "loss": 0.4014, + "step": 4363 + }, + { + "epoch": 3.985388127853881, + "grad_norm": 3.666189193725586, + "learning_rate": 6.683916793505834e-06, + "loss": 0.0232, + "step": 4364 + }, + { + "epoch": 3.9863013698630136, + "grad_norm": 90.56001281738281, + "learning_rate": 6.682902080162354e-06, + "loss": 1.7625, + "step": 4365 + }, + { + "epoch": 3.987214611872146, + "grad_norm": 39.0978889465332, + "learning_rate": 6.681887366818875e-06, + "loss": 0.2042, + "step": 4366 + }, + { + "epoch": 3.9881278538812786, + "grad_norm": 25.471769332885742, + "learning_rate": 6.680872653475394e-06, + "loss": 0.2673, + "step": 4367 + }, + { + "epoch": 3.989041095890411, + "grad_norm": 59.59672164916992, + "learning_rate": 6.679857940131913e-06, + "loss": 0.8788, + "step": 4368 + }, + { + "epoch": 3.9899543378995435, + "grad_norm": 19.202037811279297, + "learning_rate": 6.678843226788433e-06, + "loss": 0.066, + "step": 4369 + }, + { + "epoch": 3.990867579908676, + "grad_norm": 12.094440460205078, + "learning_rate": 6.6778285134449526e-06, + "loss": 0.0766, + "step": 4370 + }, + { + "epoch": 3.9917808219178084, + "grad_norm": 0.09415345638990402, + "learning_rate": 6.676813800101471e-06, + "loss": 0.0007, + "step": 4371 + }, + { + "epoch": 3.9926940639269404, + "grad_norm": 25.924060821533203, + "learning_rate": 6.675799086757991e-06, + "loss": 0.1814, + "step": 4372 + }, + { + "epoch": 3.993607305936073, + "grad_norm": 1.611132264137268, + "learning_rate": 6.674784373414512e-06, + "loss": 0.0114, + "step": 4373 + }, + { + "epoch": 3.9945205479452053, + "grad_norm": 30.711837768554688, + "learning_rate": 6.67376966007103e-06, + "loss": 0.2685, + "step": 4374 + }, + { + "epoch": 3.9954337899543377, + "grad_norm": 101.3351821899414, + "learning_rate": 6.67275494672755e-06, + "loss": 2.3578, + "step": 4375 + }, + { + "epoch": 3.99634703196347, + "grad_norm": 12.098511695861816, + "learning_rate": 6.67174023338407e-06, + "loss": 0.1183, + "step": 4376 + }, + { + "epoch": 3.9972602739726026, + "grad_norm": 20.30687713623047, + "learning_rate": 6.6707255200405896e-06, + "loss": 0.2433, + "step": 4377 + }, + { + "epoch": 3.998173515981735, + "grad_norm": 1.6079951524734497, + "learning_rate": 6.669710806697108e-06, + "loss": 0.01, + "step": 4378 + }, + { + "epoch": 3.9990867579908675, + "grad_norm": 2.8325774669647217, + "learning_rate": 6.668696093353628e-06, + "loss": 0.0262, + "step": 4379 + }, + { + "epoch": 4.0, + "grad_norm": 0.45723164081573486, + "learning_rate": 6.667681380010148e-06, + "loss": 0.0036, + "step": 4380 + }, + { + "epoch": 4.0009132420091325, + "grad_norm": 2.435805320739746, + "learning_rate": 6.666666666666667e-06, + "loss": 0.0243, + "step": 4381 + }, + { + "epoch": 4.001826484018265, + "grad_norm": 58.344215393066406, + "learning_rate": 6.665651953323186e-06, + "loss": 0.7065, + "step": 4382 + }, + { + "epoch": 4.002739726027397, + "grad_norm": 27.90064811706543, + "learning_rate": 6.664637239979707e-06, + "loss": 0.2876, + "step": 4383 + }, + { + "epoch": 4.00365296803653, + "grad_norm": 16.34041404724121, + "learning_rate": 6.663622526636226e-06, + "loss": 0.0721, + "step": 4384 + }, + { + "epoch": 4.004566210045662, + "grad_norm": 5.769531726837158, + "learning_rate": 6.662607813292745e-06, + "loss": 0.0318, + "step": 4385 + }, + { + "epoch": 4.005479452054795, + "grad_norm": 30.308813095092773, + "learning_rate": 6.661593099949265e-06, + "loss": 0.2018, + "step": 4386 + }, + { + "epoch": 4.006392694063927, + "grad_norm": 0.739010214805603, + "learning_rate": 6.660578386605785e-06, + "loss": 0.004, + "step": 4387 + }, + { + "epoch": 4.00730593607306, + "grad_norm": 0.822601318359375, + "learning_rate": 6.659563673262304e-06, + "loss": 0.005, + "step": 4388 + }, + { + "epoch": 4.008219178082192, + "grad_norm": 3.0794565677642822, + "learning_rate": 6.658548959918823e-06, + "loss": 0.0251, + "step": 4389 + }, + { + "epoch": 4.0091324200913245, + "grad_norm": 5.869495868682861, + "learning_rate": 6.657534246575343e-06, + "loss": 0.0412, + "step": 4390 + }, + { + "epoch": 4.010045662100457, + "grad_norm": 12.77481460571289, + "learning_rate": 6.656519533231862e-06, + "loss": 0.0676, + "step": 4391 + }, + { + "epoch": 4.010958904109589, + "grad_norm": 2.483219861984253, + "learning_rate": 6.655504819888382e-06, + "loss": 0.0152, + "step": 4392 + }, + { + "epoch": 4.011872146118722, + "grad_norm": 159.28160095214844, + "learning_rate": 6.654490106544902e-06, + "loss": 1.4445, + "step": 4393 + }, + { + "epoch": 4.0127853881278535, + "grad_norm": 75.76805114746094, + "learning_rate": 6.653475393201421e-06, + "loss": 0.9614, + "step": 4394 + }, + { + "epoch": 4.013698630136986, + "grad_norm": 4.696766376495361, + "learning_rate": 6.652460679857941e-06, + "loss": 0.0251, + "step": 4395 + }, + { + "epoch": 4.014611872146118, + "grad_norm": 22.216842651367188, + "learning_rate": 6.65144596651446e-06, + "loss": 0.201, + "step": 4396 + }, + { + "epoch": 4.015525114155251, + "grad_norm": 7.928744316101074, + "learning_rate": 6.65043125317098e-06, + "loss": 0.0907, + "step": 4397 + }, + { + "epoch": 4.016438356164383, + "grad_norm": 23.277019500732422, + "learning_rate": 6.649416539827499e-06, + "loss": 0.1497, + "step": 4398 + }, + { + "epoch": 4.017351598173516, + "grad_norm": 22.602384567260742, + "learning_rate": 6.6484018264840186e-06, + "loss": 0.2443, + "step": 4399 + }, + { + "epoch": 4.018264840182648, + "grad_norm": 2.2280287742614746, + "learning_rate": 6.647387113140539e-06, + "loss": 0.012, + "step": 4400 + }, + { + "epoch": 4.019178082191781, + "grad_norm": 2.4238452911376953, + "learning_rate": 6.646372399797057e-06, + "loss": 0.0257, + "step": 4401 + }, + { + "epoch": 4.020091324200913, + "grad_norm": 55.62525177001953, + "learning_rate": 6.645357686453578e-06, + "loss": 0.7403, + "step": 4402 + }, + { + "epoch": 4.0210045662100455, + "grad_norm": 0.348141610622406, + "learning_rate": 6.644342973110097e-06, + "loss": 0.003, + "step": 4403 + }, + { + "epoch": 4.021917808219178, + "grad_norm": 1.5674104690551758, + "learning_rate": 6.643328259766616e-06, + "loss": 0.012, + "step": 4404 + }, + { + "epoch": 4.0228310502283104, + "grad_norm": 9.88326644897461, + "learning_rate": 6.642313546423136e-06, + "loss": 0.0761, + "step": 4405 + }, + { + "epoch": 4.023744292237443, + "grad_norm": 44.893768310546875, + "learning_rate": 6.6412988330796556e-06, + "loss": 0.6358, + "step": 4406 + }, + { + "epoch": 4.024657534246575, + "grad_norm": 0.9071488976478577, + "learning_rate": 6.640284119736175e-06, + "loss": 0.0094, + "step": 4407 + }, + { + "epoch": 4.025570776255708, + "grad_norm": 21.2736873626709, + "learning_rate": 6.639269406392694e-06, + "loss": 0.2153, + "step": 4408 + }, + { + "epoch": 4.02648401826484, + "grad_norm": 7.915892601013184, + "learning_rate": 6.638254693049215e-06, + "loss": 0.0766, + "step": 4409 + }, + { + "epoch": 4.027397260273973, + "grad_norm": 39.61974334716797, + "learning_rate": 6.637239979705734e-06, + "loss": 0.463, + "step": 4410 + }, + { + "epoch": 4.028310502283105, + "grad_norm": 42.4293212890625, + "learning_rate": 6.636225266362253e-06, + "loss": 0.394, + "step": 4411 + }, + { + "epoch": 4.029223744292238, + "grad_norm": 0.052751168608665466, + "learning_rate": 6.635210553018773e-06, + "loss": 0.0004, + "step": 4412 + }, + { + "epoch": 4.03013698630137, + "grad_norm": 4.407172679901123, + "learning_rate": 6.6341958396752926e-06, + "loss": 0.0414, + "step": 4413 + }, + { + "epoch": 4.0310502283105025, + "grad_norm": 30.33489990234375, + "learning_rate": 6.633181126331811e-06, + "loss": 0.2897, + "step": 4414 + }, + { + "epoch": 4.031963470319635, + "grad_norm": 6.516455173492432, + "learning_rate": 6.632166412988331e-06, + "loss": 0.0526, + "step": 4415 + }, + { + "epoch": 4.032876712328767, + "grad_norm": 0.5170145034790039, + "learning_rate": 6.631151699644851e-06, + "loss": 0.0042, + "step": 4416 + }, + { + "epoch": 4.0337899543379, + "grad_norm": 1.3840211629867554, + "learning_rate": 6.630136986301371e-06, + "loss": 0.0136, + "step": 4417 + }, + { + "epoch": 4.034703196347032, + "grad_norm": 0.15211573243141174, + "learning_rate": 6.629122272957889e-06, + "loss": 0.001, + "step": 4418 + }, + { + "epoch": 4.035616438356165, + "grad_norm": 17.914194107055664, + "learning_rate": 6.62810755961441e-06, + "loss": 0.1327, + "step": 4419 + }, + { + "epoch": 4.036529680365296, + "grad_norm": 40.19512176513672, + "learning_rate": 6.6270928462709296e-06, + "loss": 0.5376, + "step": 4420 + }, + { + "epoch": 4.037442922374429, + "grad_norm": 4.277857303619385, + "learning_rate": 6.626078132927448e-06, + "loss": 0.0363, + "step": 4421 + }, + { + "epoch": 4.038356164383561, + "grad_norm": 9.8043851852417, + "learning_rate": 6.625063419583968e-06, + "loss": 0.1042, + "step": 4422 + }, + { + "epoch": 4.039269406392694, + "grad_norm": 1.533724308013916, + "learning_rate": 6.624048706240488e-06, + "loss": 0.0127, + "step": 4423 + }, + { + "epoch": 4.040182648401826, + "grad_norm": 24.74752426147461, + "learning_rate": 6.623033992897007e-06, + "loss": 0.2379, + "step": 4424 + }, + { + "epoch": 4.041095890410959, + "grad_norm": 26.298107147216797, + "learning_rate": 6.622019279553526e-06, + "loss": 0.2109, + "step": 4425 + }, + { + "epoch": 4.042009132420091, + "grad_norm": 6.738763332366943, + "learning_rate": 6.621004566210046e-06, + "loss": 0.0467, + "step": 4426 + }, + { + "epoch": 4.0429223744292235, + "grad_norm": 16.888765335083008, + "learning_rate": 6.6199898528665666e-06, + "loss": 0.2146, + "step": 4427 + }, + { + "epoch": 4.043835616438356, + "grad_norm": 4.286468505859375, + "learning_rate": 6.618975139523085e-06, + "loss": 0.0422, + "step": 4428 + }, + { + "epoch": 4.044748858447488, + "grad_norm": 0.3127930462360382, + "learning_rate": 6.617960426179605e-06, + "loss": 0.0032, + "step": 4429 + }, + { + "epoch": 4.045662100456621, + "grad_norm": 0.9827511310577393, + "learning_rate": 6.616945712836125e-06, + "loss": 0.0064, + "step": 4430 + }, + { + "epoch": 4.046575342465753, + "grad_norm": 13.976661682128906, + "learning_rate": 6.615930999492644e-06, + "loss": 0.1281, + "step": 4431 + }, + { + "epoch": 4.047488584474886, + "grad_norm": 36.08419418334961, + "learning_rate": 6.614916286149163e-06, + "loss": 0.384, + "step": 4432 + }, + { + "epoch": 4.048401826484018, + "grad_norm": 6.909160614013672, + "learning_rate": 6.613901572805683e-06, + "loss": 0.0575, + "step": 4433 + }, + { + "epoch": 4.049315068493151, + "grad_norm": 8.666584968566895, + "learning_rate": 6.612886859462202e-06, + "loss": 0.0947, + "step": 4434 + }, + { + "epoch": 4.050228310502283, + "grad_norm": 1.2464118003845215, + "learning_rate": 6.6118721461187215e-06, + "loss": 0.0088, + "step": 4435 + }, + { + "epoch": 4.051141552511416, + "grad_norm": 66.56172943115234, + "learning_rate": 6.610857432775242e-06, + "loss": 0.3433, + "step": 4436 + }, + { + "epoch": 4.052054794520548, + "grad_norm": 1.5193458795547485, + "learning_rate": 6.609842719431762e-06, + "loss": 0.0114, + "step": 4437 + }, + { + "epoch": 4.0529680365296805, + "grad_norm": 1.2864638566970825, + "learning_rate": 6.608828006088281e-06, + "loss": 0.0107, + "step": 4438 + }, + { + "epoch": 4.053881278538813, + "grad_norm": 9.245170593261719, + "learning_rate": 6.6078132927448e-06, + "loss": 0.0774, + "step": 4439 + }, + { + "epoch": 4.054794520547945, + "grad_norm": 18.059717178344727, + "learning_rate": 6.60679857940132e-06, + "loss": 0.1603, + "step": 4440 + }, + { + "epoch": 4.055707762557078, + "grad_norm": 10.395149230957031, + "learning_rate": 6.605783866057839e-06, + "loss": 0.0566, + "step": 4441 + }, + { + "epoch": 4.05662100456621, + "grad_norm": 41.8426399230957, + "learning_rate": 6.6047691527143585e-06, + "loss": 0.1851, + "step": 4442 + }, + { + "epoch": 4.057534246575343, + "grad_norm": 2.2139511108398438, + "learning_rate": 6.603754439370878e-06, + "loss": 0.0145, + "step": 4443 + }, + { + "epoch": 4.058447488584475, + "grad_norm": 115.05686950683594, + "learning_rate": 6.602739726027397e-06, + "loss": 1.5622, + "step": 4444 + }, + { + "epoch": 4.059360730593608, + "grad_norm": 0.8994469046592712, + "learning_rate": 6.601725012683917e-06, + "loss": 0.0066, + "step": 4445 + }, + { + "epoch": 4.06027397260274, + "grad_norm": 39.098636627197266, + "learning_rate": 6.600710299340437e-06, + "loss": 0.2958, + "step": 4446 + }, + { + "epoch": 4.061187214611872, + "grad_norm": 0.1298443078994751, + "learning_rate": 6.599695585996957e-06, + "loss": 0.0011, + "step": 4447 + }, + { + "epoch": 4.062100456621004, + "grad_norm": 0.10445152968168259, + "learning_rate": 6.598680872653476e-06, + "loss": 0.0012, + "step": 4448 + }, + { + "epoch": 4.063013698630137, + "grad_norm": 5.583752155303955, + "learning_rate": 6.5976661593099955e-06, + "loss": 0.0367, + "step": 4449 + }, + { + "epoch": 4.063926940639269, + "grad_norm": 9.341440200805664, + "learning_rate": 6.596651445966515e-06, + "loss": 0.0778, + "step": 4450 + }, + { + "epoch": 4.0648401826484015, + "grad_norm": 33.624542236328125, + "learning_rate": 6.595636732623034e-06, + "loss": 0.3397, + "step": 4451 + }, + { + "epoch": 4.065753424657534, + "grad_norm": 30.285396575927734, + "learning_rate": 6.594622019279554e-06, + "loss": 0.1568, + "step": 4452 + }, + { + "epoch": 4.066666666666666, + "grad_norm": 113.89126586914062, + "learning_rate": 6.593607305936074e-06, + "loss": 1.6226, + "step": 4453 + }, + { + "epoch": 4.067579908675799, + "grad_norm": 113.54317474365234, + "learning_rate": 6.592592592592592e-06, + "loss": 6.1082, + "step": 4454 + }, + { + "epoch": 4.068493150684931, + "grad_norm": 80.27813720703125, + "learning_rate": 6.591577879249113e-06, + "loss": 0.3892, + "step": 4455 + }, + { + "epoch": 4.069406392694064, + "grad_norm": 4.648252010345459, + "learning_rate": 6.5905631659056325e-06, + "loss": 0.0432, + "step": 4456 + }, + { + "epoch": 4.070319634703196, + "grad_norm": 5.4394612312316895, + "learning_rate": 6.589548452562152e-06, + "loss": 0.0395, + "step": 4457 + }, + { + "epoch": 4.071232876712329, + "grad_norm": 61.03096008300781, + "learning_rate": 6.588533739218671e-06, + "loss": 1.1193, + "step": 4458 + }, + { + "epoch": 4.072146118721461, + "grad_norm": 2.304340124130249, + "learning_rate": 6.587519025875191e-06, + "loss": 0.0193, + "step": 4459 + }, + { + "epoch": 4.073059360730594, + "grad_norm": 9.054872512817383, + "learning_rate": 6.5865043125317105e-06, + "loss": 0.0607, + "step": 4460 + }, + { + "epoch": 4.073972602739726, + "grad_norm": 18.113128662109375, + "learning_rate": 6.585489599188229e-06, + "loss": 0.1592, + "step": 4461 + }, + { + "epoch": 4.0748858447488585, + "grad_norm": 18.196331024169922, + "learning_rate": 6.584474885844749e-06, + "loss": 0.0273, + "step": 4462 + }, + { + "epoch": 4.075799086757991, + "grad_norm": 35.978511810302734, + "learning_rate": 6.5834601725012695e-06, + "loss": 0.3368, + "step": 4463 + }, + { + "epoch": 4.076712328767123, + "grad_norm": 36.508358001708984, + "learning_rate": 6.582445459157788e-06, + "loss": 0.3849, + "step": 4464 + }, + { + "epoch": 4.077625570776256, + "grad_norm": 20.21786880493164, + "learning_rate": 6.581430745814308e-06, + "loss": 0.1867, + "step": 4465 + }, + { + "epoch": 4.078538812785388, + "grad_norm": 2.141177177429199, + "learning_rate": 6.580416032470828e-06, + "loss": 0.02, + "step": 4466 + }, + { + "epoch": 4.079452054794521, + "grad_norm": 5.260298728942871, + "learning_rate": 6.5794013191273475e-06, + "loss": 0.0416, + "step": 4467 + }, + { + "epoch": 4.080365296803653, + "grad_norm": 7.634228706359863, + "learning_rate": 6.578386605783866e-06, + "loss": 0.0439, + "step": 4468 + }, + { + "epoch": 4.081278538812786, + "grad_norm": 17.862014770507812, + "learning_rate": 6.577371892440386e-06, + "loss": 0.1408, + "step": 4469 + }, + { + "epoch": 4.082191780821918, + "grad_norm": 1.393771767616272, + "learning_rate": 6.576357179096906e-06, + "loss": 0.01, + "step": 4470 + }, + { + "epoch": 4.083105022831051, + "grad_norm": 7.689780235290527, + "learning_rate": 6.5753424657534245e-06, + "loss": 0.062, + "step": 4471 + }, + { + "epoch": 4.084018264840183, + "grad_norm": 0.11017690598964691, + "learning_rate": 6.574327752409945e-06, + "loss": 0.0009, + "step": 4472 + }, + { + "epoch": 4.0849315068493155, + "grad_norm": 30.870521545410156, + "learning_rate": 6.573313039066465e-06, + "loss": 0.2007, + "step": 4473 + }, + { + "epoch": 4.085844748858447, + "grad_norm": 18.450910568237305, + "learning_rate": 6.572298325722984e-06, + "loss": 0.1924, + "step": 4474 + }, + { + "epoch": 4.0867579908675795, + "grad_norm": 11.562594413757324, + "learning_rate": 6.571283612379503e-06, + "loss": 0.0957, + "step": 4475 + }, + { + "epoch": 4.087671232876712, + "grad_norm": 10.449548721313477, + "learning_rate": 6.570268899036023e-06, + "loss": 0.0712, + "step": 4476 + }, + { + "epoch": 4.088584474885844, + "grad_norm": 6.1408867835998535, + "learning_rate": 6.569254185692543e-06, + "loss": 0.0609, + "step": 4477 + }, + { + "epoch": 4.089497716894977, + "grad_norm": 3.923980474472046, + "learning_rate": 6.5682394723490615e-06, + "loss": 0.0366, + "step": 4478 + }, + { + "epoch": 4.090410958904109, + "grad_norm": 5.030934810638428, + "learning_rate": 6.567224759005581e-06, + "loss": 0.0318, + "step": 4479 + }, + { + "epoch": 4.091324200913242, + "grad_norm": 0.5549654960632324, + "learning_rate": 6.566210045662102e-06, + "loss": 0.005, + "step": 4480 + }, + { + "epoch": 4.092237442922374, + "grad_norm": 17.293142318725586, + "learning_rate": 6.56519533231862e-06, + "loss": 0.1462, + "step": 4481 + }, + { + "epoch": 4.093150684931507, + "grad_norm": 4.267709255218506, + "learning_rate": 6.56418061897514e-06, + "loss": 0.031, + "step": 4482 + }, + { + "epoch": 4.094063926940639, + "grad_norm": 5.625371932983398, + "learning_rate": 6.56316590563166e-06, + "loss": 0.0555, + "step": 4483 + }, + { + "epoch": 4.094977168949772, + "grad_norm": 26.965133666992188, + "learning_rate": 6.562151192288179e-06, + "loss": 0.1773, + "step": 4484 + }, + { + "epoch": 4.095890410958904, + "grad_norm": 23.677810668945312, + "learning_rate": 6.5611364789446985e-06, + "loss": 0.2065, + "step": 4485 + }, + { + "epoch": 4.0968036529680365, + "grad_norm": 1.1376606225967407, + "learning_rate": 6.560121765601218e-06, + "loss": 0.0102, + "step": 4486 + }, + { + "epoch": 4.097716894977169, + "grad_norm": 1.2317384481430054, + "learning_rate": 6.559107052257738e-06, + "loss": 0.01, + "step": 4487 + }, + { + "epoch": 4.098630136986301, + "grad_norm": 3.4474759101867676, + "learning_rate": 6.558092338914257e-06, + "loss": 0.028, + "step": 4488 + }, + { + "epoch": 4.099543378995434, + "grad_norm": 2.4673070907592773, + "learning_rate": 6.5570776255707765e-06, + "loss": 0.0271, + "step": 4489 + }, + { + "epoch": 4.100456621004566, + "grad_norm": 12.144601821899414, + "learning_rate": 6.556062912227297e-06, + "loss": 0.094, + "step": 4490 + }, + { + "epoch": 4.101369863013699, + "grad_norm": 8.067458152770996, + "learning_rate": 6.555048198883816e-06, + "loss": 0.0605, + "step": 4491 + }, + { + "epoch": 4.102283105022831, + "grad_norm": 44.41965866088867, + "learning_rate": 6.5540334855403355e-06, + "loss": 1.1534, + "step": 4492 + }, + { + "epoch": 4.103196347031964, + "grad_norm": 13.958203315734863, + "learning_rate": 6.553018772196855e-06, + "loss": 0.2246, + "step": 4493 + }, + { + "epoch": 4.104109589041096, + "grad_norm": 17.53349494934082, + "learning_rate": 6.552004058853374e-06, + "loss": 0.1598, + "step": 4494 + }, + { + "epoch": 4.105022831050229, + "grad_norm": 4.0198259353637695, + "learning_rate": 6.550989345509894e-06, + "loss": 0.0279, + "step": 4495 + }, + { + "epoch": 4.105936073059361, + "grad_norm": 29.197580337524414, + "learning_rate": 6.5499746321664134e-06, + "loss": 0.2816, + "step": 4496 + }, + { + "epoch": 4.1068493150684935, + "grad_norm": 39.48283004760742, + "learning_rate": 6.548959918822934e-06, + "loss": 0.2907, + "step": 4497 + }, + { + "epoch": 4.107762557077626, + "grad_norm": 24.12481117248535, + "learning_rate": 6.547945205479452e-06, + "loss": 0.1453, + "step": 4498 + }, + { + "epoch": 4.108675799086758, + "grad_norm": 1.867784023284912, + "learning_rate": 6.5469304921359725e-06, + "loss": 0.0206, + "step": 4499 + }, + { + "epoch": 4.109589041095891, + "grad_norm": 0.5020431280136108, + "learning_rate": 6.545915778792492e-06, + "loss": 0.0027, + "step": 4500 + }, + { + "epoch": 4.110502283105022, + "grad_norm": 18.50476837158203, + "learning_rate": 6.544901065449011e-06, + "loss": 0.1239, + "step": 4501 + }, + { + "epoch": 4.111415525114155, + "grad_norm": 0.27403533458709717, + "learning_rate": 6.543886352105531e-06, + "loss": 0.0017, + "step": 4502 + }, + { + "epoch": 4.112328767123287, + "grad_norm": 4.71080207824707, + "learning_rate": 6.5428716387620504e-06, + "loss": 0.0385, + "step": 4503 + }, + { + "epoch": 4.11324200913242, + "grad_norm": 31.19354820251465, + "learning_rate": 6.541856925418569e-06, + "loss": 0.1418, + "step": 4504 + }, + { + "epoch": 4.114155251141552, + "grad_norm": 37.2147216796875, + "learning_rate": 6.540842212075089e-06, + "loss": 0.2856, + "step": 4505 + }, + { + "epoch": 4.115068493150685, + "grad_norm": 5.798498630523682, + "learning_rate": 6.539827498731609e-06, + "loss": 0.0532, + "step": 4506 + }, + { + "epoch": 4.115981735159817, + "grad_norm": 4.647637844085693, + "learning_rate": 6.538812785388129e-06, + "loss": 0.0542, + "step": 4507 + }, + { + "epoch": 4.11689497716895, + "grad_norm": 1.5229458808898926, + "learning_rate": 6.537798072044648e-06, + "loss": 0.014, + "step": 4508 + }, + { + "epoch": 4.117808219178082, + "grad_norm": 5.459377765655518, + "learning_rate": 6.536783358701168e-06, + "loss": 0.0432, + "step": 4509 + }, + { + "epoch": 4.1187214611872145, + "grad_norm": 4.097538948059082, + "learning_rate": 6.5357686453576874e-06, + "loss": 0.0307, + "step": 4510 + }, + { + "epoch": 4.119634703196347, + "grad_norm": 33.15443420410156, + "learning_rate": 6.534753932014206e-06, + "loss": 0.312, + "step": 4511 + }, + { + "epoch": 4.120547945205479, + "grad_norm": 9.414664268493652, + "learning_rate": 6.533739218670726e-06, + "loss": 0.0805, + "step": 4512 + }, + { + "epoch": 4.121461187214612, + "grad_norm": 6.385096549987793, + "learning_rate": 6.532724505327246e-06, + "loss": 0.0546, + "step": 4513 + }, + { + "epoch": 4.122374429223744, + "grad_norm": 2.9895715713500977, + "learning_rate": 6.5317097919837645e-06, + "loss": 0.0221, + "step": 4514 + }, + { + "epoch": 4.123287671232877, + "grad_norm": 64.1574478149414, + "learning_rate": 6.530695078640284e-06, + "loss": 0.7101, + "step": 4515 + }, + { + "epoch": 4.124200913242009, + "grad_norm": 0.9229259490966797, + "learning_rate": 6.529680365296805e-06, + "loss": 0.0093, + "step": 4516 + }, + { + "epoch": 4.125114155251142, + "grad_norm": 20.12781524658203, + "learning_rate": 6.5286656519533244e-06, + "loss": 0.2177, + "step": 4517 + }, + { + "epoch": 4.126027397260274, + "grad_norm": 2.6945931911468506, + "learning_rate": 6.527650938609843e-06, + "loss": 0.0211, + "step": 4518 + }, + { + "epoch": 4.126940639269407, + "grad_norm": 16.08707046508789, + "learning_rate": 6.526636225266363e-06, + "loss": 0.1518, + "step": 4519 + }, + { + "epoch": 4.127853881278539, + "grad_norm": 7.809907913208008, + "learning_rate": 6.525621511922883e-06, + "loss": 0.0594, + "step": 4520 + }, + { + "epoch": 4.1287671232876715, + "grad_norm": 5.5574421882629395, + "learning_rate": 6.5246067985794015e-06, + "loss": 0.0343, + "step": 4521 + }, + { + "epoch": 4.129680365296804, + "grad_norm": 50.32450485229492, + "learning_rate": 6.523592085235921e-06, + "loss": 0.619, + "step": 4522 + }, + { + "epoch": 4.130593607305936, + "grad_norm": 11.167019844055176, + "learning_rate": 6.522577371892441e-06, + "loss": 0.0625, + "step": 4523 + }, + { + "epoch": 4.131506849315069, + "grad_norm": 3.2540690898895264, + "learning_rate": 6.52156265854896e-06, + "loss": 0.022, + "step": 4524 + }, + { + "epoch": 4.132420091324201, + "grad_norm": 56.501548767089844, + "learning_rate": 6.5205479452054794e-06, + "loss": 0.5163, + "step": 4525 + }, + { + "epoch": 4.133333333333334, + "grad_norm": 0.23581387102603912, + "learning_rate": 6.519533231862e-06, + "loss": 0.0022, + "step": 4526 + }, + { + "epoch": 4.134246575342466, + "grad_norm": 30.1176700592041, + "learning_rate": 6.51851851851852e-06, + "loss": 0.2466, + "step": 4527 + }, + { + "epoch": 4.135159817351598, + "grad_norm": 0.4412866234779358, + "learning_rate": 6.5175038051750385e-06, + "loss": 0.0037, + "step": 4528 + }, + { + "epoch": 4.13607305936073, + "grad_norm": 32.85404586791992, + "learning_rate": 6.516489091831558e-06, + "loss": 0.2472, + "step": 4529 + }, + { + "epoch": 4.136986301369863, + "grad_norm": 1.036070466041565, + "learning_rate": 6.515474378488078e-06, + "loss": 0.0082, + "step": 4530 + }, + { + "epoch": 4.137899543378995, + "grad_norm": 1.421667456626892, + "learning_rate": 6.514459665144597e-06, + "loss": 0.0092, + "step": 4531 + }, + { + "epoch": 4.138812785388128, + "grad_norm": 23.806921005249023, + "learning_rate": 6.5134449518011164e-06, + "loss": 0.3556, + "step": 4532 + }, + { + "epoch": 4.13972602739726, + "grad_norm": 32.564090728759766, + "learning_rate": 6.512430238457637e-06, + "loss": 0.2285, + "step": 4533 + }, + { + "epoch": 4.1406392694063925, + "grad_norm": 8.095531463623047, + "learning_rate": 6.511415525114155e-06, + "loss": 0.0581, + "step": 4534 + }, + { + "epoch": 4.141552511415525, + "grad_norm": 65.14067077636719, + "learning_rate": 6.5104008117706755e-06, + "loss": 0.6312, + "step": 4535 + }, + { + "epoch": 4.142465753424657, + "grad_norm": 7.733774185180664, + "learning_rate": 6.509386098427195e-06, + "loss": 0.0569, + "step": 4536 + }, + { + "epoch": 4.14337899543379, + "grad_norm": 7.461920738220215, + "learning_rate": 6.508371385083715e-06, + "loss": 0.0535, + "step": 4537 + }, + { + "epoch": 4.144292237442922, + "grad_norm": 74.86632537841797, + "learning_rate": 6.507356671740234e-06, + "loss": 1.1409, + "step": 4538 + }, + { + "epoch": 4.145205479452055, + "grad_norm": 33.13275146484375, + "learning_rate": 6.5063419583967534e-06, + "loss": 0.3087, + "step": 4539 + }, + { + "epoch": 4.146118721461187, + "grad_norm": 29.868070602416992, + "learning_rate": 6.505327245053273e-06, + "loss": 0.2175, + "step": 4540 + }, + { + "epoch": 4.14703196347032, + "grad_norm": 27.403226852416992, + "learning_rate": 6.504312531709792e-06, + "loss": 0.1754, + "step": 4541 + }, + { + "epoch": 4.147945205479452, + "grad_norm": 13.595090866088867, + "learning_rate": 6.503297818366312e-06, + "loss": 0.1254, + "step": 4542 + }, + { + "epoch": 4.148858447488585, + "grad_norm": 17.18593978881836, + "learning_rate": 6.502283105022832e-06, + "loss": 0.1104, + "step": 4543 + }, + { + "epoch": 4.149771689497717, + "grad_norm": 7.916858196258545, + "learning_rate": 6.501268391679351e-06, + "loss": 0.0394, + "step": 4544 + }, + { + "epoch": 4.1506849315068495, + "grad_norm": 0.7441664338111877, + "learning_rate": 6.500253678335871e-06, + "loss": 0.0066, + "step": 4545 + }, + { + "epoch": 4.151598173515982, + "grad_norm": 3.907395124435425, + "learning_rate": 6.4992389649923904e-06, + "loss": 0.0338, + "step": 4546 + }, + { + "epoch": 4.152511415525114, + "grad_norm": 20.055885314941406, + "learning_rate": 6.49822425164891e-06, + "loss": 0.1781, + "step": 4547 + }, + { + "epoch": 4.153424657534247, + "grad_norm": 4.1514434814453125, + "learning_rate": 6.497209538305429e-06, + "loss": 0.0404, + "step": 4548 + }, + { + "epoch": 4.154337899543379, + "grad_norm": 2.105046272277832, + "learning_rate": 6.496194824961949e-06, + "loss": 0.011, + "step": 4549 + }, + { + "epoch": 4.155251141552512, + "grad_norm": 54.67424011230469, + "learning_rate": 6.495180111618468e-06, + "loss": 0.6202, + "step": 4550 + }, + { + "epoch": 4.156164383561644, + "grad_norm": 0.6449389457702637, + "learning_rate": 6.494165398274987e-06, + "loss": 0.006, + "step": 4551 + }, + { + "epoch": 4.157077625570777, + "grad_norm": 23.336048126220703, + "learning_rate": 6.493150684931508e-06, + "loss": 0.1975, + "step": 4552 + }, + { + "epoch": 4.157990867579909, + "grad_norm": 72.10836791992188, + "learning_rate": 6.4921359715880274e-06, + "loss": 0.8188, + "step": 4553 + }, + { + "epoch": 4.1589041095890416, + "grad_norm": 10.553226470947266, + "learning_rate": 6.491121258244546e-06, + "loss": 0.1016, + "step": 4554 + }, + { + "epoch": 4.159817351598173, + "grad_norm": 0.7782381772994995, + "learning_rate": 6.490106544901066e-06, + "loss": 0.0066, + "step": 4555 + }, + { + "epoch": 4.160730593607306, + "grad_norm": 2.082488775253296, + "learning_rate": 6.489091831557586e-06, + "loss": 0.0171, + "step": 4556 + }, + { + "epoch": 4.161643835616438, + "grad_norm": 48.258277893066406, + "learning_rate": 6.488077118214105e-06, + "loss": 0.458, + "step": 4557 + }, + { + "epoch": 4.1625570776255705, + "grad_norm": 0.582051157951355, + "learning_rate": 6.487062404870624e-06, + "loss": 0.0036, + "step": 4558 + }, + { + "epoch": 4.163470319634703, + "grad_norm": 19.036006927490234, + "learning_rate": 6.486047691527144e-06, + "loss": 0.1278, + "step": 4559 + }, + { + "epoch": 4.164383561643835, + "grad_norm": 2.5531556606292725, + "learning_rate": 6.485032978183664e-06, + "loss": 0.0186, + "step": 4560 + }, + { + "epoch": 4.165296803652968, + "grad_norm": 0.7332303524017334, + "learning_rate": 6.484018264840182e-06, + "loss": 0.0056, + "step": 4561 + }, + { + "epoch": 4.1662100456621, + "grad_norm": 1.4610965251922607, + "learning_rate": 6.483003551496703e-06, + "loss": 0.0133, + "step": 4562 + }, + { + "epoch": 4.167123287671233, + "grad_norm": 35.40577697753906, + "learning_rate": 6.481988838153223e-06, + "loss": 0.2259, + "step": 4563 + }, + { + "epoch": 4.168036529680365, + "grad_norm": 46.01250076293945, + "learning_rate": 6.4809741248097415e-06, + "loss": 0.2166, + "step": 4564 + }, + { + "epoch": 4.168949771689498, + "grad_norm": 16.01497459411621, + "learning_rate": 6.479959411466261e-06, + "loss": 0.1466, + "step": 4565 + }, + { + "epoch": 4.16986301369863, + "grad_norm": 2.822516679763794, + "learning_rate": 6.478944698122781e-06, + "loss": 0.0298, + "step": 4566 + }, + { + "epoch": 4.170776255707763, + "grad_norm": 0.8645080924034119, + "learning_rate": 6.4779299847793006e-06, + "loss": 0.0056, + "step": 4567 + }, + { + "epoch": 4.171689497716895, + "grad_norm": 1.8179057836532593, + "learning_rate": 6.476915271435819e-06, + "loss": 0.0097, + "step": 4568 + }, + { + "epoch": 4.1726027397260275, + "grad_norm": 4.713949203491211, + "learning_rate": 6.475900558092339e-06, + "loss": 0.0352, + "step": 4569 + }, + { + "epoch": 4.17351598173516, + "grad_norm": 0.39442798495292664, + "learning_rate": 6.47488584474886e-06, + "loss": 0.0022, + "step": 4570 + }, + { + "epoch": 4.174429223744292, + "grad_norm": 23.985898971557617, + "learning_rate": 6.4738711314053785e-06, + "loss": 0.3231, + "step": 4571 + }, + { + "epoch": 4.175342465753425, + "grad_norm": 3.42883038520813, + "learning_rate": 6.472856418061898e-06, + "loss": 0.0238, + "step": 4572 + }, + { + "epoch": 4.176255707762557, + "grad_norm": 4.594439506530762, + "learning_rate": 6.471841704718418e-06, + "loss": 0.0371, + "step": 4573 + }, + { + "epoch": 4.17716894977169, + "grad_norm": 7.657230854034424, + "learning_rate": 6.470826991374937e-06, + "loss": 0.0547, + "step": 4574 + }, + { + "epoch": 4.178082191780822, + "grad_norm": 12.283493995666504, + "learning_rate": 6.469812278031456e-06, + "loss": 0.1384, + "step": 4575 + }, + { + "epoch": 4.178995433789955, + "grad_norm": 60.91883087158203, + "learning_rate": 6.468797564687976e-06, + "loss": 0.6544, + "step": 4576 + }, + { + "epoch": 4.179908675799087, + "grad_norm": 6.642535209655762, + "learning_rate": 6.467782851344497e-06, + "loss": 0.0476, + "step": 4577 + }, + { + "epoch": 4.1808219178082195, + "grad_norm": 8.948246002197266, + "learning_rate": 6.466768138001015e-06, + "loss": 0.0606, + "step": 4578 + }, + { + "epoch": 4.181735159817352, + "grad_norm": 5.84262752532959, + "learning_rate": 6.465753424657535e-06, + "loss": 0.0434, + "step": 4579 + }, + { + "epoch": 4.182648401826484, + "grad_norm": 65.27162170410156, + "learning_rate": 6.464738711314055e-06, + "loss": 0.5055, + "step": 4580 + }, + { + "epoch": 4.183561643835616, + "grad_norm": 15.817388534545898, + "learning_rate": 6.463723997970574e-06, + "loss": 0.1008, + "step": 4581 + }, + { + "epoch": 4.1844748858447485, + "grad_norm": 47.80996322631836, + "learning_rate": 6.462709284627093e-06, + "loss": 0.3689, + "step": 4582 + }, + { + "epoch": 4.185388127853881, + "grad_norm": 0.31925731897354126, + "learning_rate": 6.461694571283613e-06, + "loss": 0.0025, + "step": 4583 + }, + { + "epoch": 4.186301369863013, + "grad_norm": 2.446183681488037, + "learning_rate": 6.460679857940132e-06, + "loss": 0.0159, + "step": 4584 + }, + { + "epoch": 4.187214611872146, + "grad_norm": 47.25514221191406, + "learning_rate": 6.459665144596652e-06, + "loss": 0.3618, + "step": 4585 + }, + { + "epoch": 4.188127853881278, + "grad_norm": 1.2495988607406616, + "learning_rate": 6.458650431253171e-06, + "loss": 0.0091, + "step": 4586 + }, + { + "epoch": 4.189041095890411, + "grad_norm": 46.53325271606445, + "learning_rate": 6.457635717909692e-06, + "loss": 0.5238, + "step": 4587 + }, + { + "epoch": 4.189954337899543, + "grad_norm": 12.103004455566406, + "learning_rate": 6.456621004566211e-06, + "loss": 0.1117, + "step": 4588 + }, + { + "epoch": 4.190867579908676, + "grad_norm": 10.00370979309082, + "learning_rate": 6.45560629122273e-06, + "loss": 0.0531, + "step": 4589 + }, + { + "epoch": 4.191780821917808, + "grad_norm": 4.915951251983643, + "learning_rate": 6.45459157787925e-06, + "loss": 0.0385, + "step": 4590 + }, + { + "epoch": 4.1926940639269406, + "grad_norm": 1.7599152326583862, + "learning_rate": 6.453576864535769e-06, + "loss": 0.0125, + "step": 4591 + }, + { + "epoch": 4.193607305936073, + "grad_norm": 13.598397254943848, + "learning_rate": 6.452562151192289e-06, + "loss": 0.0643, + "step": 4592 + }, + { + "epoch": 4.1945205479452055, + "grad_norm": 2.245572328567505, + "learning_rate": 6.451547437848808e-06, + "loss": 0.0244, + "step": 4593 + }, + { + "epoch": 4.195433789954338, + "grad_norm": 2.0079658031463623, + "learning_rate": 6.450532724505327e-06, + "loss": 0.0171, + "step": 4594 + }, + { + "epoch": 4.19634703196347, + "grad_norm": 24.565954208374023, + "learning_rate": 6.449518011161847e-06, + "loss": 0.1561, + "step": 4595 + }, + { + "epoch": 4.197260273972603, + "grad_norm": 0.754324197769165, + "learning_rate": 6.448503297818367e-06, + "loss": 0.0054, + "step": 4596 + }, + { + "epoch": 4.198173515981735, + "grad_norm": 1.7054839134216309, + "learning_rate": 6.447488584474887e-06, + "loss": 0.0126, + "step": 4597 + }, + { + "epoch": 4.199086757990868, + "grad_norm": 4.289804935455322, + "learning_rate": 6.446473871131406e-06, + "loss": 0.0375, + "step": 4598 + }, + { + "epoch": 4.2, + "grad_norm": 60.323299407958984, + "learning_rate": 6.445459157787926e-06, + "loss": 0.4342, + "step": 4599 + }, + { + "epoch": 4.200913242009133, + "grad_norm": 3.8570590019226074, + "learning_rate": 6.444444444444445e-06, + "loss": 0.0184, + "step": 4600 + }, + { + "epoch": 4.201826484018265, + "grad_norm": 2.4641294479370117, + "learning_rate": 6.443429731100964e-06, + "loss": 0.021, + "step": 4601 + }, + { + "epoch": 4.2027397260273975, + "grad_norm": 27.23026466369629, + "learning_rate": 6.442415017757484e-06, + "loss": 0.2066, + "step": 4602 + }, + { + "epoch": 4.20365296803653, + "grad_norm": 19.353057861328125, + "learning_rate": 6.4414003044140036e-06, + "loss": 0.1989, + "step": 4603 + }, + { + "epoch": 4.2045662100456624, + "grad_norm": 0.3278482258319855, + "learning_rate": 6.440385591070522e-06, + "loss": 0.0018, + "step": 4604 + }, + { + "epoch": 4.205479452054795, + "grad_norm": 29.92876434326172, + "learning_rate": 6.439370877727042e-06, + "loss": 0.1143, + "step": 4605 + }, + { + "epoch": 4.206392694063927, + "grad_norm": 23.81500244140625, + "learning_rate": 6.438356164383563e-06, + "loss": 0.1944, + "step": 4606 + }, + { + "epoch": 4.207305936073059, + "grad_norm": 103.01128387451172, + "learning_rate": 6.437341451040082e-06, + "loss": 5.9906, + "step": 4607 + }, + { + "epoch": 4.208219178082191, + "grad_norm": 42.32953643798828, + "learning_rate": 6.436326737696601e-06, + "loss": 0.3204, + "step": 4608 + }, + { + "epoch": 4.209132420091324, + "grad_norm": 64.22208404541016, + "learning_rate": 6.435312024353121e-06, + "loss": 1.7152, + "step": 4609 + }, + { + "epoch": 4.210045662100456, + "grad_norm": 13.186112403869629, + "learning_rate": 6.4342973110096406e-06, + "loss": 0.1036, + "step": 4610 + }, + { + "epoch": 4.210958904109589, + "grad_norm": 0.0876799151301384, + "learning_rate": 6.433282597666159e-06, + "loss": 0.0006, + "step": 4611 + }, + { + "epoch": 4.211872146118721, + "grad_norm": 8.589727401733398, + "learning_rate": 6.432267884322679e-06, + "loss": 0.0761, + "step": 4612 + }, + { + "epoch": 4.212785388127854, + "grad_norm": 71.62464904785156, + "learning_rate": 6.4312531709792e-06, + "loss": 1.7039, + "step": 4613 + }, + { + "epoch": 4.213698630136986, + "grad_norm": 68.05785369873047, + "learning_rate": 6.430238457635718e-06, + "loss": 0.7098, + "step": 4614 + }, + { + "epoch": 4.2146118721461185, + "grad_norm": 52.61067581176758, + "learning_rate": 6.429223744292238e-06, + "loss": 0.5389, + "step": 4615 + }, + { + "epoch": 4.215525114155251, + "grad_norm": 2.7903575897216797, + "learning_rate": 6.428209030948758e-06, + "loss": 0.0181, + "step": 4616 + }, + { + "epoch": 4.2164383561643834, + "grad_norm": 3.225630044937134, + "learning_rate": 6.4271943176052776e-06, + "loss": 0.0212, + "step": 4617 + }, + { + "epoch": 4.217351598173516, + "grad_norm": 58.77482986450195, + "learning_rate": 6.426179604261796e-06, + "loss": 0.6272, + "step": 4618 + }, + { + "epoch": 4.218264840182648, + "grad_norm": 3.4132890701293945, + "learning_rate": 6.425164890918316e-06, + "loss": 0.0179, + "step": 4619 + }, + { + "epoch": 4.219178082191781, + "grad_norm": 3.6728665828704834, + "learning_rate": 6.424150177574836e-06, + "loss": 0.0327, + "step": 4620 + }, + { + "epoch": 4.220091324200913, + "grad_norm": 26.290355682373047, + "learning_rate": 6.423135464231355e-06, + "loss": 0.2375, + "step": 4621 + }, + { + "epoch": 4.221004566210046, + "grad_norm": 0.9011427164077759, + "learning_rate": 6.422120750887874e-06, + "loss": 0.0078, + "step": 4622 + }, + { + "epoch": 4.221917808219178, + "grad_norm": 49.5380973815918, + "learning_rate": 6.421106037544395e-06, + "loss": 0.5297, + "step": 4623 + }, + { + "epoch": 4.222831050228311, + "grad_norm": 28.5705509185791, + "learning_rate": 6.420091324200914e-06, + "loss": 0.2682, + "step": 4624 + }, + { + "epoch": 4.223744292237443, + "grad_norm": 0.14069043099880219, + "learning_rate": 6.419076610857433e-06, + "loss": 0.0013, + "step": 4625 + }, + { + "epoch": 4.2246575342465755, + "grad_norm": 2.432973861694336, + "learning_rate": 6.418061897513953e-06, + "loss": 0.0191, + "step": 4626 + }, + { + "epoch": 4.225570776255708, + "grad_norm": 12.229848861694336, + "learning_rate": 6.417047184170473e-06, + "loss": 0.1605, + "step": 4627 + }, + { + "epoch": 4.22648401826484, + "grad_norm": 0.3053395748138428, + "learning_rate": 6.416032470826992e-06, + "loss": 0.0028, + "step": 4628 + }, + { + "epoch": 4.227397260273973, + "grad_norm": 56.31455993652344, + "learning_rate": 6.415017757483511e-06, + "loss": 0.3757, + "step": 4629 + }, + { + "epoch": 4.228310502283105, + "grad_norm": 10.205220222473145, + "learning_rate": 6.414003044140031e-06, + "loss": 0.112, + "step": 4630 + }, + { + "epoch": 4.229223744292238, + "grad_norm": 0.11238788068294525, + "learning_rate": 6.41298833079655e-06, + "loss": 0.001, + "step": 4631 + }, + { + "epoch": 4.23013698630137, + "grad_norm": 10.245647430419922, + "learning_rate": 6.41197361745307e-06, + "loss": 0.1137, + "step": 4632 + }, + { + "epoch": 4.231050228310503, + "grad_norm": 80.11329650878906, + "learning_rate": 6.41095890410959e-06, + "loss": 1.3284, + "step": 4633 + }, + { + "epoch": 4.231963470319634, + "grad_norm": 30.44449806213379, + "learning_rate": 6.409944190766109e-06, + "loss": 0.2668, + "step": 4634 + }, + { + "epoch": 4.232876712328767, + "grad_norm": 0.3327291011810303, + "learning_rate": 6.408929477422629e-06, + "loss": 0.0029, + "step": 4635 + }, + { + "epoch": 4.233789954337899, + "grad_norm": 29.665565490722656, + "learning_rate": 6.407914764079148e-06, + "loss": 0.2346, + "step": 4636 + }, + { + "epoch": 4.234703196347032, + "grad_norm": 0.5424866080284119, + "learning_rate": 6.406900050735668e-06, + "loss": 0.0052, + "step": 4637 + }, + { + "epoch": 4.235616438356164, + "grad_norm": 35.04395294189453, + "learning_rate": 6.405885337392187e-06, + "loss": 0.2596, + "step": 4638 + }, + { + "epoch": 4.2365296803652965, + "grad_norm": 5.730775833129883, + "learning_rate": 6.4048706240487065e-06, + "loss": 0.0475, + "step": 4639 + }, + { + "epoch": 4.237442922374429, + "grad_norm": 5.528679370880127, + "learning_rate": 6.403855910705227e-06, + "loss": 0.0368, + "step": 4640 + }, + { + "epoch": 4.238356164383561, + "grad_norm": 108.11963653564453, + "learning_rate": 6.402841197361745e-06, + "loss": 0.9906, + "step": 4641 + }, + { + "epoch": 4.239269406392694, + "grad_norm": 77.06529235839844, + "learning_rate": 6.401826484018266e-06, + "loss": 1.6971, + "step": 4642 + }, + { + "epoch": 4.240182648401826, + "grad_norm": 1.0025508403778076, + "learning_rate": 6.400811770674785e-06, + "loss": 0.0088, + "step": 4643 + }, + { + "epoch": 4.241095890410959, + "grad_norm": 6.194265842437744, + "learning_rate": 6.399797057331304e-06, + "loss": 0.04, + "step": 4644 + }, + { + "epoch": 4.242009132420091, + "grad_norm": 14.260992050170898, + "learning_rate": 6.398782343987824e-06, + "loss": 0.1245, + "step": 4645 + }, + { + "epoch": 4.242922374429224, + "grad_norm": 40.02146911621094, + "learning_rate": 6.3977676306443435e-06, + "loss": 0.4127, + "step": 4646 + }, + { + "epoch": 4.243835616438356, + "grad_norm": 1316.828125, + "learning_rate": 6.396752917300863e-06, + "loss": 0.6137, + "step": 4647 + }, + { + "epoch": 4.244748858447489, + "grad_norm": 0.3183004558086395, + "learning_rate": 6.395738203957382e-06, + "loss": 0.0027, + "step": 4648 + }, + { + "epoch": 4.245662100456621, + "grad_norm": 1.3824635744094849, + "learning_rate": 6.394723490613902e-06, + "loss": 0.0125, + "step": 4649 + }, + { + "epoch": 4.2465753424657535, + "grad_norm": 9.736000061035156, + "learning_rate": 6.393708777270422e-06, + "loss": 0.0887, + "step": 4650 + }, + { + "epoch": 4.247488584474886, + "grad_norm": 0.6881074905395508, + "learning_rate": 6.392694063926941e-06, + "loss": 0.0064, + "step": 4651 + }, + { + "epoch": 4.248401826484018, + "grad_norm": 28.57099151611328, + "learning_rate": 6.391679350583461e-06, + "loss": 0.3424, + "step": 4652 + }, + { + "epoch": 4.249315068493151, + "grad_norm": 9.264402389526367, + "learning_rate": 6.3906646372399805e-06, + "loss": 0.0565, + "step": 4653 + }, + { + "epoch": 4.250228310502283, + "grad_norm": 92.48439025878906, + "learning_rate": 6.389649923896499e-06, + "loss": 1.5695, + "step": 4654 + }, + { + "epoch": 4.251141552511416, + "grad_norm": 1.0450857877731323, + "learning_rate": 6.388635210553019e-06, + "loss": 0.0065, + "step": 4655 + }, + { + "epoch": 4.252054794520548, + "grad_norm": 2.471742868423462, + "learning_rate": 6.387620497209539e-06, + "loss": 0.0165, + "step": 4656 + }, + { + "epoch": 4.252968036529681, + "grad_norm": 9.062052726745605, + "learning_rate": 6.386605783866059e-06, + "loss": 0.0691, + "step": 4657 + }, + { + "epoch": 4.253881278538813, + "grad_norm": 19.034103393554688, + "learning_rate": 6.385591070522577e-06, + "loss": 0.2005, + "step": 4658 + }, + { + "epoch": 4.254794520547946, + "grad_norm": 2.3786253929138184, + "learning_rate": 6.384576357179098e-06, + "loss": 0.0213, + "step": 4659 + }, + { + "epoch": 4.255707762557078, + "grad_norm": 1.8888232707977295, + "learning_rate": 6.3835616438356175e-06, + "loss": 0.0176, + "step": 4660 + }, + { + "epoch": 4.25662100456621, + "grad_norm": 4.969609260559082, + "learning_rate": 6.382546930492136e-06, + "loss": 0.0366, + "step": 4661 + }, + { + "epoch": 4.257534246575342, + "grad_norm": 3.7179760932922363, + "learning_rate": 6.381532217148656e-06, + "loss": 0.0252, + "step": 4662 + }, + { + "epoch": 4.2584474885844745, + "grad_norm": 26.133310317993164, + "learning_rate": 6.380517503805176e-06, + "loss": 0.2914, + "step": 4663 + }, + { + "epoch": 4.259360730593607, + "grad_norm": 0.3901888430118561, + "learning_rate": 6.379502790461695e-06, + "loss": 0.0029, + "step": 4664 + }, + { + "epoch": 4.260273972602739, + "grad_norm": 73.99212646484375, + "learning_rate": 6.378488077118214e-06, + "loss": 1.1186, + "step": 4665 + }, + { + "epoch": 4.261187214611872, + "grad_norm": 8.302350997924805, + "learning_rate": 6.377473363774734e-06, + "loss": 0.0643, + "step": 4666 + }, + { + "epoch": 4.262100456621004, + "grad_norm": 15.009247779846191, + "learning_rate": 6.3764586504312545e-06, + "loss": 0.0396, + "step": 4667 + }, + { + "epoch": 4.263013698630137, + "grad_norm": 124.6875228881836, + "learning_rate": 6.375443937087773e-06, + "loss": 1.7726, + "step": 4668 + }, + { + "epoch": 4.263926940639269, + "grad_norm": 3.651212692260742, + "learning_rate": 6.374429223744293e-06, + "loss": 0.028, + "step": 4669 + }, + { + "epoch": 4.264840182648402, + "grad_norm": 46.82889938354492, + "learning_rate": 6.373414510400813e-06, + "loss": 0.2438, + "step": 4670 + }, + { + "epoch": 4.265753424657534, + "grad_norm": 3.9010419845581055, + "learning_rate": 6.372399797057332e-06, + "loss": 0.0365, + "step": 4671 + }, + { + "epoch": 4.266666666666667, + "grad_norm": 6.4378342628479, + "learning_rate": 6.371385083713851e-06, + "loss": 0.0477, + "step": 4672 + }, + { + "epoch": 4.267579908675799, + "grad_norm": 3.3285045623779297, + "learning_rate": 6.370370370370371e-06, + "loss": 0.025, + "step": 4673 + }, + { + "epoch": 4.2684931506849315, + "grad_norm": 42.963356018066406, + "learning_rate": 6.36935565702689e-06, + "loss": 0.5123, + "step": 4674 + }, + { + "epoch": 4.269406392694064, + "grad_norm": 89.96503448486328, + "learning_rate": 6.3683409436834095e-06, + "loss": 2.2843, + "step": 4675 + }, + { + "epoch": 4.270319634703196, + "grad_norm": 1.426408052444458, + "learning_rate": 6.36732623033993e-06, + "loss": 0.0124, + "step": 4676 + }, + { + "epoch": 4.271232876712329, + "grad_norm": 4.92905855178833, + "learning_rate": 6.36631151699645e-06, + "loss": 0.0442, + "step": 4677 + }, + { + "epoch": 4.272146118721461, + "grad_norm": 0.3701678216457367, + "learning_rate": 6.365296803652969e-06, + "loss": 0.0026, + "step": 4678 + }, + { + "epoch": 4.273059360730594, + "grad_norm": 0.21238063275814056, + "learning_rate": 6.364282090309488e-06, + "loss": 0.0016, + "step": 4679 + }, + { + "epoch": 4.273972602739726, + "grad_norm": 6.612258434295654, + "learning_rate": 6.363267376966008e-06, + "loss": 0.0575, + "step": 4680 + }, + { + "epoch": 4.274885844748859, + "grad_norm": 20.356666564941406, + "learning_rate": 6.362252663622527e-06, + "loss": 0.2031, + "step": 4681 + }, + { + "epoch": 4.275799086757991, + "grad_norm": 9.300827026367188, + "learning_rate": 6.3612379502790465e-06, + "loss": 0.0605, + "step": 4682 + }, + { + "epoch": 4.276712328767124, + "grad_norm": 16.953767776489258, + "learning_rate": 6.360223236935566e-06, + "loss": 0.1876, + "step": 4683 + }, + { + "epoch": 4.277625570776256, + "grad_norm": 104.20226287841797, + "learning_rate": 6.359208523592085e-06, + "loss": 0.7258, + "step": 4684 + }, + { + "epoch": 4.2785388127853885, + "grad_norm": 7.348443508148193, + "learning_rate": 6.358193810248605e-06, + "loss": 0.0461, + "step": 4685 + }, + { + "epoch": 4.279452054794521, + "grad_norm": 50.969669342041016, + "learning_rate": 6.357179096905125e-06, + "loss": 0.5057, + "step": 4686 + }, + { + "epoch": 4.280365296803653, + "grad_norm": 3.4861087799072266, + "learning_rate": 6.356164383561645e-06, + "loss": 0.0122, + "step": 4687 + }, + { + "epoch": 4.281278538812785, + "grad_norm": 0.5218837857246399, + "learning_rate": 6.355149670218164e-06, + "loss": 0.0058, + "step": 4688 + }, + { + "epoch": 4.282191780821917, + "grad_norm": 10.703332901000977, + "learning_rate": 6.3541349568746835e-06, + "loss": 0.1085, + "step": 4689 + }, + { + "epoch": 4.28310502283105, + "grad_norm": 14.562235832214355, + "learning_rate": 6.353120243531203e-06, + "loss": 0.0426, + "step": 4690 + }, + { + "epoch": 4.284018264840182, + "grad_norm": 0.4716612696647644, + "learning_rate": 6.352105530187722e-06, + "loss": 0.0039, + "step": 4691 + }, + { + "epoch": 4.284931506849315, + "grad_norm": 8.0724515914917, + "learning_rate": 6.351090816844242e-06, + "loss": 0.0611, + "step": 4692 + }, + { + "epoch": 4.285844748858447, + "grad_norm": 9.253839492797852, + "learning_rate": 6.3500761035007614e-06, + "loss": 0.0758, + "step": 4693 + }, + { + "epoch": 4.28675799086758, + "grad_norm": 4.691112518310547, + "learning_rate": 6.34906139015728e-06, + "loss": 0.0429, + "step": 4694 + }, + { + "epoch": 4.287671232876712, + "grad_norm": 88.08259582519531, + "learning_rate": 6.348046676813801e-06, + "loss": 0.3771, + "step": 4695 + }, + { + "epoch": 4.288584474885845, + "grad_norm": 64.28394317626953, + "learning_rate": 6.3470319634703205e-06, + "loss": 0.7956, + "step": 4696 + }, + { + "epoch": 4.289497716894977, + "grad_norm": 35.900421142578125, + "learning_rate": 6.34601725012684e-06, + "loss": 0.3691, + "step": 4697 + }, + { + "epoch": 4.2904109589041095, + "grad_norm": 0.7793693542480469, + "learning_rate": 6.345002536783359e-06, + "loss": 0.0068, + "step": 4698 + }, + { + "epoch": 4.291324200913242, + "grad_norm": 0.2405690848827362, + "learning_rate": 6.343987823439879e-06, + "loss": 0.0026, + "step": 4699 + }, + { + "epoch": 4.292237442922374, + "grad_norm": 0.7307930588722229, + "learning_rate": 6.3429731100963984e-06, + "loss": 0.0062, + "step": 4700 + }, + { + "epoch": 4.293150684931507, + "grad_norm": 29.723247528076172, + "learning_rate": 6.341958396752917e-06, + "loss": 0.2854, + "step": 4701 + }, + { + "epoch": 4.294063926940639, + "grad_norm": 57.67272186279297, + "learning_rate": 6.340943683409437e-06, + "loss": 0.9366, + "step": 4702 + }, + { + "epoch": 4.294977168949772, + "grad_norm": 0.9962139129638672, + "learning_rate": 6.3399289700659575e-06, + "loss": 0.0089, + "step": 4703 + }, + { + "epoch": 4.295890410958904, + "grad_norm": 3.6348297595977783, + "learning_rate": 6.338914256722476e-06, + "loss": 0.0181, + "step": 4704 + }, + { + "epoch": 4.296803652968037, + "grad_norm": 3.360567569732666, + "learning_rate": 6.337899543378996e-06, + "loss": 0.0245, + "step": 4705 + }, + { + "epoch": 4.297716894977169, + "grad_norm": 1.1486908197402954, + "learning_rate": 6.336884830035516e-06, + "loss": 0.0106, + "step": 4706 + }, + { + "epoch": 4.298630136986302, + "grad_norm": 2.9081289768218994, + "learning_rate": 6.3358701166920354e-06, + "loss": 0.0172, + "step": 4707 + }, + { + "epoch": 4.299543378995434, + "grad_norm": 0.6556829214096069, + "learning_rate": 6.334855403348554e-06, + "loss": 0.004, + "step": 4708 + }, + { + "epoch": 4.3004566210045665, + "grad_norm": 0.4261699318885803, + "learning_rate": 6.333840690005074e-06, + "loss": 0.0025, + "step": 4709 + }, + { + "epoch": 4.301369863013699, + "grad_norm": 2.2324378490448, + "learning_rate": 6.332825976661594e-06, + "loss": 0.0183, + "step": 4710 + }, + { + "epoch": 4.302283105022831, + "grad_norm": 27.92601776123047, + "learning_rate": 6.3318112633181125e-06, + "loss": 0.23, + "step": 4711 + }, + { + "epoch": 4.303196347031964, + "grad_norm": 17.88141632080078, + "learning_rate": 6.330796549974633e-06, + "loss": 0.1666, + "step": 4712 + }, + { + "epoch": 4.304109589041096, + "grad_norm": 64.5567626953125, + "learning_rate": 6.329781836631153e-06, + "loss": 0.2413, + "step": 4713 + }, + { + "epoch": 4.305022831050229, + "grad_norm": 61.956939697265625, + "learning_rate": 6.328767123287672e-06, + "loss": 1.5384, + "step": 4714 + }, + { + "epoch": 4.30593607305936, + "grad_norm": 58.72237777709961, + "learning_rate": 6.327752409944191e-06, + "loss": 0.7413, + "step": 4715 + }, + { + "epoch": 4.306849315068493, + "grad_norm": 10.538629531860352, + "learning_rate": 6.326737696600711e-06, + "loss": 0.0955, + "step": 4716 + }, + { + "epoch": 4.307762557077625, + "grad_norm": 124.40076446533203, + "learning_rate": 6.325722983257231e-06, + "loss": 1.459, + "step": 4717 + }, + { + "epoch": 4.308675799086758, + "grad_norm": 70.29277801513672, + "learning_rate": 6.3247082699137495e-06, + "loss": 1.9464, + "step": 4718 + }, + { + "epoch": 4.30958904109589, + "grad_norm": 19.945070266723633, + "learning_rate": 6.323693556570269e-06, + "loss": 0.1273, + "step": 4719 + }, + { + "epoch": 4.310502283105023, + "grad_norm": 9.135456085205078, + "learning_rate": 6.32267884322679e-06, + "loss": 0.0835, + "step": 4720 + }, + { + "epoch": 4.311415525114155, + "grad_norm": 26.185001373291016, + "learning_rate": 6.321664129883308e-06, + "loss": 0.2709, + "step": 4721 + }, + { + "epoch": 4.3123287671232875, + "grad_norm": 36.56275939941406, + "learning_rate": 6.320649416539828e-06, + "loss": 0.3719, + "step": 4722 + }, + { + "epoch": 4.31324200913242, + "grad_norm": 2.5232694149017334, + "learning_rate": 6.319634703196348e-06, + "loss": 0.0159, + "step": 4723 + }, + { + "epoch": 4.314155251141552, + "grad_norm": 13.461603164672852, + "learning_rate": 6.318619989852867e-06, + "loss": 0.1142, + "step": 4724 + }, + { + "epoch": 4.315068493150685, + "grad_norm": 5.87761926651001, + "learning_rate": 6.3176052765093865e-06, + "loss": 0.0526, + "step": 4725 + }, + { + "epoch": 4.315981735159817, + "grad_norm": 9.315469741821289, + "learning_rate": 6.316590563165906e-06, + "loss": 0.0867, + "step": 4726 + }, + { + "epoch": 4.31689497716895, + "grad_norm": 30.11777687072754, + "learning_rate": 6.315575849822426e-06, + "loss": 0.368, + "step": 4727 + }, + { + "epoch": 4.317808219178082, + "grad_norm": 8.166757583618164, + "learning_rate": 6.314561136478945e-06, + "loss": 0.0955, + "step": 4728 + }, + { + "epoch": 4.318721461187215, + "grad_norm": 1.3047641515731812, + "learning_rate": 6.3135464231354644e-06, + "loss": 0.0127, + "step": 4729 + }, + { + "epoch": 4.319634703196347, + "grad_norm": 3.882798433303833, + "learning_rate": 6.312531709791985e-06, + "loss": 0.0358, + "step": 4730 + }, + { + "epoch": 4.32054794520548, + "grad_norm": 6.646291255950928, + "learning_rate": 6.311516996448504e-06, + "loss": 0.016, + "step": 4731 + }, + { + "epoch": 4.321461187214612, + "grad_norm": 15.458346366882324, + "learning_rate": 6.3105022831050235e-06, + "loss": 0.1083, + "step": 4732 + }, + { + "epoch": 4.3223744292237445, + "grad_norm": 53.56071853637695, + "learning_rate": 6.309487569761543e-06, + "loss": 1.0398, + "step": 4733 + }, + { + "epoch": 4.323287671232877, + "grad_norm": 0.06264549493789673, + "learning_rate": 6.308472856418062e-06, + "loss": 0.0004, + "step": 4734 + }, + { + "epoch": 4.324200913242009, + "grad_norm": 9.277984619140625, + "learning_rate": 6.307458143074582e-06, + "loss": 0.084, + "step": 4735 + }, + { + "epoch": 4.325114155251142, + "grad_norm": 7.663239002227783, + "learning_rate": 6.3064434297311014e-06, + "loss": 0.0661, + "step": 4736 + }, + { + "epoch": 4.326027397260274, + "grad_norm": 0.8765352964401245, + "learning_rate": 6.305428716387622e-06, + "loss": 0.0067, + "step": 4737 + }, + { + "epoch": 4.326940639269407, + "grad_norm": 13.288926124572754, + "learning_rate": 6.30441400304414e-06, + "loss": 0.0983, + "step": 4738 + }, + { + "epoch": 4.327853881278539, + "grad_norm": 47.01072311401367, + "learning_rate": 6.3033992897006605e-06, + "loss": 0.4953, + "step": 4739 + }, + { + "epoch": 4.328767123287671, + "grad_norm": 22.91814613342285, + "learning_rate": 6.30238457635718e-06, + "loss": 0.2749, + "step": 4740 + }, + { + "epoch": 4.329680365296804, + "grad_norm": 14.015368461608887, + "learning_rate": 6.301369863013699e-06, + "loss": 0.1305, + "step": 4741 + }, + { + "epoch": 4.330593607305936, + "grad_norm": 74.41735076904297, + "learning_rate": 6.300355149670219e-06, + "loss": 0.6655, + "step": 4742 + }, + { + "epoch": 4.331506849315068, + "grad_norm": 0.3744845688343048, + "learning_rate": 6.2993404363267384e-06, + "loss": 0.0029, + "step": 4743 + }, + { + "epoch": 4.332420091324201, + "grad_norm": 1.602512240409851, + "learning_rate": 6.298325722983257e-06, + "loss": 0.0136, + "step": 4744 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 1.8858500719070435, + "learning_rate": 6.297311009639777e-06, + "loss": 0.0178, + "step": 4745 + }, + { + "epoch": 4.3342465753424655, + "grad_norm": 83.28115844726562, + "learning_rate": 6.296296296296297e-06, + "loss": 1.7472, + "step": 4746 + }, + { + "epoch": 4.335159817351598, + "grad_norm": 0.28219011425971985, + "learning_rate": 6.295281582952817e-06, + "loss": 0.002, + "step": 4747 + }, + { + "epoch": 4.33607305936073, + "grad_norm": 12.193062782287598, + "learning_rate": 6.294266869609336e-06, + "loss": 0.1042, + "step": 4748 + }, + { + "epoch": 4.336986301369863, + "grad_norm": 61.98012924194336, + "learning_rate": 6.293252156265856e-06, + "loss": 1.2094, + "step": 4749 + }, + { + "epoch": 4.337899543378995, + "grad_norm": 24.87941551208496, + "learning_rate": 6.292237442922375e-06, + "loss": 0.1408, + "step": 4750 + }, + { + "epoch": 4.338812785388128, + "grad_norm": 15.394709587097168, + "learning_rate": 6.291222729578894e-06, + "loss": 0.0825, + "step": 4751 + }, + { + "epoch": 4.33972602739726, + "grad_norm": 148.4625244140625, + "learning_rate": 6.290208016235414e-06, + "loss": 0.5745, + "step": 4752 + }, + { + "epoch": 4.340639269406393, + "grad_norm": 73.7815170288086, + "learning_rate": 6.289193302891934e-06, + "loss": 1.0998, + "step": 4753 + }, + { + "epoch": 4.341552511415525, + "grad_norm": 11.528754234313965, + "learning_rate": 6.2881785895484525e-06, + "loss": 0.1024, + "step": 4754 + }, + { + "epoch": 4.342465753424658, + "grad_norm": 40.11244201660156, + "learning_rate": 6.287163876204972e-06, + "loss": 0.4755, + "step": 4755 + }, + { + "epoch": 4.34337899543379, + "grad_norm": 10.793607711791992, + "learning_rate": 6.286149162861493e-06, + "loss": 0.0893, + "step": 4756 + }, + { + "epoch": 4.3442922374429225, + "grad_norm": 9.867579460144043, + "learning_rate": 6.285134449518012e-06, + "loss": 0.0839, + "step": 4757 + }, + { + "epoch": 4.345205479452055, + "grad_norm": 49.850345611572266, + "learning_rate": 6.284119736174531e-06, + "loss": 0.5456, + "step": 4758 + }, + { + "epoch": 4.346118721461187, + "grad_norm": 59.16559600830078, + "learning_rate": 6.283105022831051e-06, + "loss": 0.7745, + "step": 4759 + }, + { + "epoch": 4.34703196347032, + "grad_norm": 0.12657774984836578, + "learning_rate": 6.282090309487571e-06, + "loss": 0.0013, + "step": 4760 + }, + { + "epoch": 4.347945205479452, + "grad_norm": 18.49146842956543, + "learning_rate": 6.2810755961440895e-06, + "loss": 0.1885, + "step": 4761 + }, + { + "epoch": 4.348858447488585, + "grad_norm": 0.03422616422176361, + "learning_rate": 6.280060882800609e-06, + "loss": 0.0002, + "step": 4762 + }, + { + "epoch": 4.349771689497717, + "grad_norm": 2.21702241897583, + "learning_rate": 6.279046169457129e-06, + "loss": 0.0125, + "step": 4763 + }, + { + "epoch": 4.35068493150685, + "grad_norm": 5.7882890701293945, + "learning_rate": 6.278031456113648e-06, + "loss": 0.0567, + "step": 4764 + }, + { + "epoch": 4.351598173515982, + "grad_norm": 22.411466598510742, + "learning_rate": 6.277016742770167e-06, + "loss": 0.326, + "step": 4765 + }, + { + "epoch": 4.352511415525115, + "grad_norm": 88.02349853515625, + "learning_rate": 6.276002029426688e-06, + "loss": 1.5607, + "step": 4766 + }, + { + "epoch": 4.353424657534246, + "grad_norm": 1.594599723815918, + "learning_rate": 6.274987316083208e-06, + "loss": 0.0107, + "step": 4767 + }, + { + "epoch": 4.3543378995433795, + "grad_norm": 4.438618183135986, + "learning_rate": 6.2739726027397265e-06, + "loss": 0.0348, + "step": 4768 + }, + { + "epoch": 4.355251141552511, + "grad_norm": 0.6216772198677063, + "learning_rate": 6.272957889396246e-06, + "loss": 0.0045, + "step": 4769 + }, + { + "epoch": 4.3561643835616435, + "grad_norm": 1.450791358947754, + "learning_rate": 6.271943176052766e-06, + "loss": 0.0145, + "step": 4770 + }, + { + "epoch": 4.357077625570776, + "grad_norm": 8.916505813598633, + "learning_rate": 6.270928462709285e-06, + "loss": 0.0824, + "step": 4771 + }, + { + "epoch": 4.357990867579908, + "grad_norm": 0.4082838296890259, + "learning_rate": 6.269913749365804e-06, + "loss": 0.0025, + "step": 4772 + }, + { + "epoch": 4.358904109589041, + "grad_norm": 8.22829532623291, + "learning_rate": 6.268899036022324e-06, + "loss": 0.0809, + "step": 4773 + }, + { + "epoch": 4.359817351598173, + "grad_norm": 6.548989295959473, + "learning_rate": 6.267884322678843e-06, + "loss": 0.0556, + "step": 4774 + }, + { + "epoch": 4.360730593607306, + "grad_norm": 17.488065719604492, + "learning_rate": 6.2668696093353635e-06, + "loss": 0.235, + "step": 4775 + }, + { + "epoch": 4.361643835616438, + "grad_norm": 28.340349197387695, + "learning_rate": 6.265854895991883e-06, + "loss": 0.2848, + "step": 4776 + }, + { + "epoch": 4.362557077625571, + "grad_norm": 2.629654884338379, + "learning_rate": 6.264840182648403e-06, + "loss": 0.0146, + "step": 4777 + }, + { + "epoch": 4.363470319634703, + "grad_norm": 0.13954634964466095, + "learning_rate": 6.263825469304922e-06, + "loss": 0.0013, + "step": 4778 + }, + { + "epoch": 4.364383561643836, + "grad_norm": 154.35064697265625, + "learning_rate": 6.262810755961441e-06, + "loss": 0.9538, + "step": 4779 + }, + { + "epoch": 4.365296803652968, + "grad_norm": 8.796686172485352, + "learning_rate": 6.261796042617961e-06, + "loss": 0.1076, + "step": 4780 + }, + { + "epoch": 4.3662100456621005, + "grad_norm": 14.796630859375, + "learning_rate": 6.26078132927448e-06, + "loss": 0.1688, + "step": 4781 + }, + { + "epoch": 4.367123287671233, + "grad_norm": 66.11138916015625, + "learning_rate": 6.259766615931e-06, + "loss": 0.689, + "step": 4782 + }, + { + "epoch": 4.368036529680365, + "grad_norm": 3.1680033206939697, + "learning_rate": 6.25875190258752e-06, + "loss": 0.0307, + "step": 4783 + }, + { + "epoch": 4.368949771689498, + "grad_norm": 0.5809063911437988, + "learning_rate": 6.257737189244039e-06, + "loss": 0.0066, + "step": 4784 + }, + { + "epoch": 4.36986301369863, + "grad_norm": 2.7968804836273193, + "learning_rate": 6.256722475900559e-06, + "loss": 0.0192, + "step": 4785 + }, + { + "epoch": 4.370776255707763, + "grad_norm": 2.0423476696014404, + "learning_rate": 6.255707762557078e-06, + "loss": 0.0145, + "step": 4786 + }, + { + "epoch": 4.371689497716895, + "grad_norm": 0.4327980875968933, + "learning_rate": 6.254693049213598e-06, + "loss": 0.0027, + "step": 4787 + }, + { + "epoch": 4.372602739726028, + "grad_norm": 11.107736587524414, + "learning_rate": 6.253678335870117e-06, + "loss": 0.0795, + "step": 4788 + }, + { + "epoch": 4.37351598173516, + "grad_norm": 68.69876098632812, + "learning_rate": 6.252663622526637e-06, + "loss": 1.7009, + "step": 4789 + }, + { + "epoch": 4.3744292237442925, + "grad_norm": 16.590307235717773, + "learning_rate": 6.251648909183156e-06, + "loss": 0.0729, + "step": 4790 + }, + { + "epoch": 4.375342465753425, + "grad_norm": 0.5059512257575989, + "learning_rate": 6.250634195839675e-06, + "loss": 0.0041, + "step": 4791 + }, + { + "epoch": 4.3762557077625575, + "grad_norm": 11.74620246887207, + "learning_rate": 6.249619482496196e-06, + "loss": 0.1542, + "step": 4792 + }, + { + "epoch": 4.37716894977169, + "grad_norm": 0.6318731307983398, + "learning_rate": 6.248604769152715e-06, + "loss": 0.0057, + "step": 4793 + }, + { + "epoch": 4.3780821917808215, + "grad_norm": 57.38898468017578, + "learning_rate": 6.247590055809234e-06, + "loss": 0.587, + "step": 4794 + }, + { + "epoch": 4.378995433789954, + "grad_norm": 20.035226821899414, + "learning_rate": 6.246575342465754e-06, + "loss": 0.1295, + "step": 4795 + }, + { + "epoch": 4.379908675799086, + "grad_norm": 11.615687370300293, + "learning_rate": 6.245560629122274e-06, + "loss": 0.1178, + "step": 4796 + }, + { + "epoch": 4.380821917808219, + "grad_norm": 80.20098876953125, + "learning_rate": 6.244545915778793e-06, + "loss": 2.2413, + "step": 4797 + }, + { + "epoch": 4.381735159817351, + "grad_norm": 4.665956020355225, + "learning_rate": 6.243531202435312e-06, + "loss": 0.0353, + "step": 4798 + }, + { + "epoch": 4.382648401826484, + "grad_norm": 5.585910320281982, + "learning_rate": 6.242516489091832e-06, + "loss": 0.0569, + "step": 4799 + }, + { + "epoch": 4.383561643835616, + "grad_norm": 28.583106994628906, + "learning_rate": 6.241501775748352e-06, + "loss": 0.3668, + "step": 4800 + }, + { + "epoch": 4.384474885844749, + "grad_norm": 17.339080810546875, + "learning_rate": 6.24048706240487e-06, + "loss": 0.1473, + "step": 4801 + }, + { + "epoch": 4.385388127853881, + "grad_norm": 1.1464970111846924, + "learning_rate": 6.239472349061391e-06, + "loss": 0.0084, + "step": 4802 + }, + { + "epoch": 4.3863013698630136, + "grad_norm": 4.016493797302246, + "learning_rate": 6.238457635717911e-06, + "loss": 0.0371, + "step": 4803 + }, + { + "epoch": 4.387214611872146, + "grad_norm": 11.850817680358887, + "learning_rate": 6.2374429223744295e-06, + "loss": 0.0746, + "step": 4804 + }, + { + "epoch": 4.3881278538812785, + "grad_norm": 0.11479367315769196, + "learning_rate": 6.236428209030949e-06, + "loss": 0.001, + "step": 4805 + }, + { + "epoch": 4.389041095890411, + "grad_norm": 0.8178774118423462, + "learning_rate": 6.235413495687469e-06, + "loss": 0.0048, + "step": 4806 + }, + { + "epoch": 4.389954337899543, + "grad_norm": 8.803354263305664, + "learning_rate": 6.2343987823439886e-06, + "loss": 0.0753, + "step": 4807 + }, + { + "epoch": 4.390867579908676, + "grad_norm": 12.343573570251465, + "learning_rate": 6.233384069000507e-06, + "loss": 0.0917, + "step": 4808 + }, + { + "epoch": 4.391780821917808, + "grad_norm": 0.13044971227645874, + "learning_rate": 6.232369355657027e-06, + "loss": 0.0011, + "step": 4809 + }, + { + "epoch": 4.392694063926941, + "grad_norm": 6.090965747833252, + "learning_rate": 6.231354642313548e-06, + "loss": 0.0788, + "step": 4810 + }, + { + "epoch": 4.393607305936073, + "grad_norm": 2.6946020126342773, + "learning_rate": 6.2303399289700665e-06, + "loss": 0.0292, + "step": 4811 + }, + { + "epoch": 4.394520547945206, + "grad_norm": 69.2060546875, + "learning_rate": 6.229325215626586e-06, + "loss": 0.6176, + "step": 4812 + }, + { + "epoch": 4.395433789954338, + "grad_norm": 7.384162425994873, + "learning_rate": 6.228310502283106e-06, + "loss": 0.0662, + "step": 4813 + }, + { + "epoch": 4.3963470319634705, + "grad_norm": 26.054433822631836, + "learning_rate": 6.227295788939625e-06, + "loss": 0.2769, + "step": 4814 + }, + { + "epoch": 4.397260273972603, + "grad_norm": 199.82093811035156, + "learning_rate": 6.226281075596144e-06, + "loss": 0.3845, + "step": 4815 + }, + { + "epoch": 4.3981735159817354, + "grad_norm": 1.689378023147583, + "learning_rate": 6.225266362252664e-06, + "loss": 0.0142, + "step": 4816 + }, + { + "epoch": 4.399086757990868, + "grad_norm": 1.3947052955627441, + "learning_rate": 6.224251648909185e-06, + "loss": 0.0166, + "step": 4817 + }, + { + "epoch": 4.4, + "grad_norm": 1.4608404636383057, + "learning_rate": 6.223236935565703e-06, + "loss": 0.0098, + "step": 4818 + }, + { + "epoch": 4.400913242009133, + "grad_norm": 21.276901245117188, + "learning_rate": 6.222222222222223e-06, + "loss": 0.0514, + "step": 4819 + }, + { + "epoch": 4.401826484018265, + "grad_norm": 4.9438300132751465, + "learning_rate": 6.221207508878743e-06, + "loss": 0.0517, + "step": 4820 + }, + { + "epoch": 4.402739726027397, + "grad_norm": 1.6236546039581299, + "learning_rate": 6.220192795535262e-06, + "loss": 0.0168, + "step": 4821 + }, + { + "epoch": 4.403652968036529, + "grad_norm": 2.18082857131958, + "learning_rate": 6.219178082191781e-06, + "loss": 0.0187, + "step": 4822 + }, + { + "epoch": 4.404566210045662, + "grad_norm": 1.6026337146759033, + "learning_rate": 6.218163368848301e-06, + "loss": 0.0138, + "step": 4823 + }, + { + "epoch": 4.405479452054794, + "grad_norm": 2.2471745014190674, + "learning_rate": 6.21714865550482e-06, + "loss": 0.0131, + "step": 4824 + }, + { + "epoch": 4.406392694063927, + "grad_norm": 13.301112174987793, + "learning_rate": 6.21613394216134e-06, + "loss": 0.2025, + "step": 4825 + }, + { + "epoch": 4.407305936073059, + "grad_norm": 62.2446174621582, + "learning_rate": 6.215119228817859e-06, + "loss": 2.3046, + "step": 4826 + }, + { + "epoch": 4.4082191780821915, + "grad_norm": 1.239205002784729, + "learning_rate": 6.21410451547438e-06, + "loss": 0.0102, + "step": 4827 + }, + { + "epoch": 4.409132420091324, + "grad_norm": 2.2417006492614746, + "learning_rate": 6.213089802130899e-06, + "loss": 0.0221, + "step": 4828 + }, + { + "epoch": 4.4100456621004565, + "grad_norm": 12.62962532043457, + "learning_rate": 6.212075088787418e-06, + "loss": 0.103, + "step": 4829 + }, + { + "epoch": 4.410958904109589, + "grad_norm": 32.40512466430664, + "learning_rate": 6.211060375443938e-06, + "loss": 0.4688, + "step": 4830 + }, + { + "epoch": 4.411872146118721, + "grad_norm": 1.73295259475708, + "learning_rate": 6.210045662100457e-06, + "loss": 0.0131, + "step": 4831 + }, + { + "epoch": 4.412785388127854, + "grad_norm": 43.58491516113281, + "learning_rate": 6.209030948756977e-06, + "loss": 0.5311, + "step": 4832 + }, + { + "epoch": 4.413698630136986, + "grad_norm": 0.3663579225540161, + "learning_rate": 6.208016235413496e-06, + "loss": 0.0026, + "step": 4833 + }, + { + "epoch": 4.414611872146119, + "grad_norm": 97.81590270996094, + "learning_rate": 6.207001522070015e-06, + "loss": 3.7278, + "step": 4834 + }, + { + "epoch": 4.415525114155251, + "grad_norm": 2.6155853271484375, + "learning_rate": 6.205986808726535e-06, + "loss": 0.0301, + "step": 4835 + }, + { + "epoch": 4.416438356164384, + "grad_norm": 11.30250072479248, + "learning_rate": 6.204972095383055e-06, + "loss": 0.1082, + "step": 4836 + }, + { + "epoch": 4.417351598173516, + "grad_norm": 15.017504692077637, + "learning_rate": 6.203957382039575e-06, + "loss": 0.1932, + "step": 4837 + }, + { + "epoch": 4.4182648401826485, + "grad_norm": 27.03785514831543, + "learning_rate": 6.202942668696094e-06, + "loss": 0.2076, + "step": 4838 + }, + { + "epoch": 4.419178082191781, + "grad_norm": 48.63595199584961, + "learning_rate": 6.201927955352614e-06, + "loss": 0.587, + "step": 4839 + }, + { + "epoch": 4.420091324200913, + "grad_norm": 5.634256362915039, + "learning_rate": 6.200913242009133e-06, + "loss": 0.0658, + "step": 4840 + }, + { + "epoch": 4.421004566210046, + "grad_norm": 0.7416161298751831, + "learning_rate": 6.199898528665652e-06, + "loss": 0.0053, + "step": 4841 + }, + { + "epoch": 4.421917808219178, + "grad_norm": 42.29817581176758, + "learning_rate": 6.198883815322172e-06, + "loss": 0.3368, + "step": 4842 + }, + { + "epoch": 4.422831050228311, + "grad_norm": 28.28836441040039, + "learning_rate": 6.1978691019786915e-06, + "loss": 0.351, + "step": 4843 + }, + { + "epoch": 4.423744292237443, + "grad_norm": 5.622524261474609, + "learning_rate": 6.19685438863521e-06, + "loss": 0.0482, + "step": 4844 + }, + { + "epoch": 4.424657534246576, + "grad_norm": 13.4956693649292, + "learning_rate": 6.19583967529173e-06, + "loss": 0.1686, + "step": 4845 + }, + { + "epoch": 4.425570776255708, + "grad_norm": 63.184226989746094, + "learning_rate": 6.194824961948251e-06, + "loss": 0.7111, + "step": 4846 + }, + { + "epoch": 4.426484018264841, + "grad_norm": 5.271518230438232, + "learning_rate": 6.19381024860477e-06, + "loss": 0.0367, + "step": 4847 + }, + { + "epoch": 4.427397260273972, + "grad_norm": 13.507400512695312, + "learning_rate": 6.192795535261289e-06, + "loss": 0.1114, + "step": 4848 + }, + { + "epoch": 4.428310502283105, + "grad_norm": 1.1916154623031616, + "learning_rate": 6.191780821917809e-06, + "loss": 0.009, + "step": 4849 + }, + { + "epoch": 4.429223744292237, + "grad_norm": 3.709444522857666, + "learning_rate": 6.1907661085743285e-06, + "loss": 0.0227, + "step": 4850 + }, + { + "epoch": 4.4301369863013695, + "grad_norm": 4.626984596252441, + "learning_rate": 6.189751395230847e-06, + "loss": 0.0384, + "step": 4851 + }, + { + "epoch": 4.431050228310502, + "grad_norm": 0.44060465693473816, + "learning_rate": 6.188736681887367e-06, + "loss": 0.0042, + "step": 4852 + }, + { + "epoch": 4.4319634703196344, + "grad_norm": 7.16013765335083, + "learning_rate": 6.187721968543887e-06, + "loss": 0.0775, + "step": 4853 + }, + { + "epoch": 4.432876712328767, + "grad_norm": 2.8040926456451416, + "learning_rate": 6.186707255200406e-06, + "loss": 0.021, + "step": 4854 + }, + { + "epoch": 4.433789954337899, + "grad_norm": 7.870980262756348, + "learning_rate": 6.185692541856926e-06, + "loss": 0.0684, + "step": 4855 + }, + { + "epoch": 4.434703196347032, + "grad_norm": 26.00895881652832, + "learning_rate": 6.184677828513446e-06, + "loss": 0.1829, + "step": 4856 + }, + { + "epoch": 4.435616438356164, + "grad_norm": 10.80495834350586, + "learning_rate": 6.1836631151699655e-06, + "loss": 0.0856, + "step": 4857 + }, + { + "epoch": 4.436529680365297, + "grad_norm": 33.20359420776367, + "learning_rate": 6.182648401826484e-06, + "loss": 0.2695, + "step": 4858 + }, + { + "epoch": 4.437442922374429, + "grad_norm": 69.5818862915039, + "learning_rate": 6.181633688483004e-06, + "loss": 0.8802, + "step": 4859 + }, + { + "epoch": 4.438356164383562, + "grad_norm": 0.18046438694000244, + "learning_rate": 6.180618975139524e-06, + "loss": 0.0018, + "step": 4860 + }, + { + "epoch": 4.439269406392694, + "grad_norm": 2.4806885719299316, + "learning_rate": 6.179604261796043e-06, + "loss": 0.0224, + "step": 4861 + }, + { + "epoch": 4.4401826484018265, + "grad_norm": 4.3167572021484375, + "learning_rate": 6.178589548452562e-06, + "loss": 0.0391, + "step": 4862 + }, + { + "epoch": 4.441095890410959, + "grad_norm": 8.844544410705566, + "learning_rate": 6.177574835109083e-06, + "loss": 0.0866, + "step": 4863 + }, + { + "epoch": 4.442009132420091, + "grad_norm": 1.7616888284683228, + "learning_rate": 6.176560121765602e-06, + "loss": 0.0146, + "step": 4864 + }, + { + "epoch": 4.442922374429224, + "grad_norm": 3.345799207687378, + "learning_rate": 6.175545408422121e-06, + "loss": 0.022, + "step": 4865 + }, + { + "epoch": 4.443835616438356, + "grad_norm": 63.03179168701172, + "learning_rate": 6.174530695078641e-06, + "loss": 1.2233, + "step": 4866 + }, + { + "epoch": 4.444748858447489, + "grad_norm": 5.637470722198486, + "learning_rate": 6.173515981735161e-06, + "loss": 0.0409, + "step": 4867 + }, + { + "epoch": 4.445662100456621, + "grad_norm": 1.140840768814087, + "learning_rate": 6.17250126839168e-06, + "loss": 0.0059, + "step": 4868 + }, + { + "epoch": 4.446575342465754, + "grad_norm": 11.460139274597168, + "learning_rate": 6.171486555048199e-06, + "loss": 0.0689, + "step": 4869 + }, + { + "epoch": 4.447488584474886, + "grad_norm": 2.8065435886383057, + "learning_rate": 6.170471841704719e-06, + "loss": 0.0263, + "step": 4870 + }, + { + "epoch": 4.448401826484019, + "grad_norm": 5.00742244720459, + "learning_rate": 6.169457128361238e-06, + "loss": 0.0257, + "step": 4871 + }, + { + "epoch": 4.449315068493151, + "grad_norm": 8.201238632202148, + "learning_rate": 6.168442415017758e-06, + "loss": 0.045, + "step": 4872 + }, + { + "epoch": 4.4502283105022835, + "grad_norm": 63.79883575439453, + "learning_rate": 6.167427701674278e-06, + "loss": 0.5221, + "step": 4873 + }, + { + "epoch": 4.451141552511416, + "grad_norm": 9.889117240905762, + "learning_rate": 6.166412988330797e-06, + "loss": 0.1111, + "step": 4874 + }, + { + "epoch": 4.4520547945205475, + "grad_norm": 3.969895362854004, + "learning_rate": 6.165398274987317e-06, + "loss": 0.0317, + "step": 4875 + }, + { + "epoch": 4.45296803652968, + "grad_norm": 51.39961624145508, + "learning_rate": 6.164383561643836e-06, + "loss": 0.5812, + "step": 4876 + }, + { + "epoch": 4.453881278538812, + "grad_norm": 4.3905134201049805, + "learning_rate": 6.163368848300356e-06, + "loss": 0.0391, + "step": 4877 + }, + { + "epoch": 4.454794520547945, + "grad_norm": 2.146899461746216, + "learning_rate": 6.162354134956875e-06, + "loss": 0.0121, + "step": 4878 + }, + { + "epoch": 4.455707762557077, + "grad_norm": 0.0988040342926979, + "learning_rate": 6.1613394216133945e-06, + "loss": 0.0011, + "step": 4879 + }, + { + "epoch": 4.45662100456621, + "grad_norm": 17.47664451599121, + "learning_rate": 6.160324708269915e-06, + "loss": 0.2198, + "step": 4880 + }, + { + "epoch": 4.457534246575342, + "grad_norm": 14.543764114379883, + "learning_rate": 6.159309994926433e-06, + "loss": 0.1026, + "step": 4881 + }, + { + "epoch": 4.458447488584475, + "grad_norm": 18.2650203704834, + "learning_rate": 6.158295281582954e-06, + "loss": 0.2251, + "step": 4882 + }, + { + "epoch": 4.459360730593607, + "grad_norm": 0.7360461354255676, + "learning_rate": 6.157280568239473e-06, + "loss": 0.0047, + "step": 4883 + }, + { + "epoch": 4.46027397260274, + "grad_norm": 43.72280502319336, + "learning_rate": 6.156265854895992e-06, + "loss": 0.4654, + "step": 4884 + }, + { + "epoch": 4.461187214611872, + "grad_norm": 0.1787296086549759, + "learning_rate": 6.155251141552512e-06, + "loss": 0.0016, + "step": 4885 + }, + { + "epoch": 4.4621004566210045, + "grad_norm": 0.4777890145778656, + "learning_rate": 6.1542364282090315e-06, + "loss": 0.0036, + "step": 4886 + }, + { + "epoch": 4.463013698630137, + "grad_norm": 23.59482765197754, + "learning_rate": 6.153221714865551e-06, + "loss": 0.0958, + "step": 4887 + }, + { + "epoch": 4.463926940639269, + "grad_norm": 1.4789557456970215, + "learning_rate": 6.15220700152207e-06, + "loss": 0.0113, + "step": 4888 + }, + { + "epoch": 4.464840182648402, + "grad_norm": 1.1777477264404297, + "learning_rate": 6.15119228817859e-06, + "loss": 0.0064, + "step": 4889 + }, + { + "epoch": 4.465753424657534, + "grad_norm": 25.7457332611084, + "learning_rate": 6.15017757483511e-06, + "loss": 0.2749, + "step": 4890 + }, + { + "epoch": 4.466666666666667, + "grad_norm": 3.1483259201049805, + "learning_rate": 6.149162861491629e-06, + "loss": 0.031, + "step": 4891 + }, + { + "epoch": 4.467579908675799, + "grad_norm": 1.1644207239151, + "learning_rate": 6.148148148148149e-06, + "loss": 0.009, + "step": 4892 + }, + { + "epoch": 4.468493150684932, + "grad_norm": 35.16437911987305, + "learning_rate": 6.1471334348046685e-06, + "loss": 0.5259, + "step": 4893 + }, + { + "epoch": 4.469406392694064, + "grad_norm": 32.45118713378906, + "learning_rate": 6.146118721461187e-06, + "loss": 0.2736, + "step": 4894 + }, + { + "epoch": 4.470319634703197, + "grad_norm": 34.94199752807617, + "learning_rate": 6.145104008117707e-06, + "loss": 0.2274, + "step": 4895 + }, + { + "epoch": 4.471232876712329, + "grad_norm": 10.794196128845215, + "learning_rate": 6.144089294774227e-06, + "loss": 0.0674, + "step": 4896 + }, + { + "epoch": 4.4721461187214615, + "grad_norm": 4.637907981872559, + "learning_rate": 6.143074581430747e-06, + "loss": 0.024, + "step": 4897 + }, + { + "epoch": 4.473059360730594, + "grad_norm": 54.7872428894043, + "learning_rate": 6.142059868087265e-06, + "loss": 1.0556, + "step": 4898 + }, + { + "epoch": 4.473972602739726, + "grad_norm": 1.3433198928833008, + "learning_rate": 6.141045154743786e-06, + "loss": 0.0102, + "step": 4899 + }, + { + "epoch": 4.474885844748858, + "grad_norm": 1.015788197517395, + "learning_rate": 6.1400304414003055e-06, + "loss": 0.0074, + "step": 4900 + }, + { + "epoch": 4.475799086757991, + "grad_norm": 4.112566947937012, + "learning_rate": 6.139015728056824e-06, + "loss": 0.042, + "step": 4901 + }, + { + "epoch": 4.476712328767123, + "grad_norm": 0.9103612899780273, + "learning_rate": 6.138001014713344e-06, + "loss": 0.0079, + "step": 4902 + }, + { + "epoch": 4.477625570776255, + "grad_norm": 6.794249057769775, + "learning_rate": 6.136986301369864e-06, + "loss": 0.0526, + "step": 4903 + }, + { + "epoch": 4.478538812785388, + "grad_norm": 1.9883391857147217, + "learning_rate": 6.135971588026383e-06, + "loss": 0.0203, + "step": 4904 + }, + { + "epoch": 4.47945205479452, + "grad_norm": 92.91966247558594, + "learning_rate": 6.134956874682902e-06, + "loss": 0.5686, + "step": 4905 + }, + { + "epoch": 4.480365296803653, + "grad_norm": 27.256311416625977, + "learning_rate": 6.133942161339422e-06, + "loss": 0.2563, + "step": 4906 + }, + { + "epoch": 4.481278538812785, + "grad_norm": 0.9850491881370544, + "learning_rate": 6.1329274479959425e-06, + "loss": 0.0087, + "step": 4907 + }, + { + "epoch": 4.482191780821918, + "grad_norm": 34.71474075317383, + "learning_rate": 6.131912734652461e-06, + "loss": 0.1201, + "step": 4908 + }, + { + "epoch": 4.48310502283105, + "grad_norm": 66.4706039428711, + "learning_rate": 6.130898021308981e-06, + "loss": 1.173, + "step": 4909 + }, + { + "epoch": 4.4840182648401825, + "grad_norm": 7.310168266296387, + "learning_rate": 6.129883307965501e-06, + "loss": 0.0422, + "step": 4910 + }, + { + "epoch": 4.484931506849315, + "grad_norm": 86.06778717041016, + "learning_rate": 6.12886859462202e-06, + "loss": 1.5264, + "step": 4911 + }, + { + "epoch": 4.485844748858447, + "grad_norm": 57.671016693115234, + "learning_rate": 6.127853881278539e-06, + "loss": 0.7863, + "step": 4912 + }, + { + "epoch": 4.48675799086758, + "grad_norm": 36.52067947387695, + "learning_rate": 6.126839167935059e-06, + "loss": 0.3236, + "step": 4913 + }, + { + "epoch": 4.487671232876712, + "grad_norm": 0.47161000967025757, + "learning_rate": 6.125824454591578e-06, + "loss": 0.004, + "step": 4914 + }, + { + "epoch": 4.488584474885845, + "grad_norm": 6.864529609680176, + "learning_rate": 6.1248097412480975e-06, + "loss": 0.0422, + "step": 4915 + }, + { + "epoch": 4.489497716894977, + "grad_norm": 0.7755139470100403, + "learning_rate": 6.123795027904618e-06, + "loss": 0.0055, + "step": 4916 + }, + { + "epoch": 4.49041095890411, + "grad_norm": 18.804540634155273, + "learning_rate": 6.122780314561138e-06, + "loss": 0.1635, + "step": 4917 + }, + { + "epoch": 4.491324200913242, + "grad_norm": 1.754526138305664, + "learning_rate": 6.121765601217657e-06, + "loss": 0.0148, + "step": 4918 + }, + { + "epoch": 4.492237442922375, + "grad_norm": 76.7637939453125, + "learning_rate": 6.120750887874176e-06, + "loss": 1.5588, + "step": 4919 + }, + { + "epoch": 4.493150684931507, + "grad_norm": 1.361520528793335, + "learning_rate": 6.119736174530696e-06, + "loss": 0.0151, + "step": 4920 + }, + { + "epoch": 4.4940639269406395, + "grad_norm": 0.11254852265119553, + "learning_rate": 6.118721461187215e-06, + "loss": 0.0009, + "step": 4921 + }, + { + "epoch": 4.494977168949772, + "grad_norm": 26.30062484741211, + "learning_rate": 6.1177067478437345e-06, + "loss": 0.1924, + "step": 4922 + }, + { + "epoch": 4.495890410958904, + "grad_norm": 0.19212475419044495, + "learning_rate": 6.116692034500254e-06, + "loss": 0.0015, + "step": 4923 + }, + { + "epoch": 4.496803652968037, + "grad_norm": 4.169552326202393, + "learning_rate": 6.115677321156773e-06, + "loss": 0.0363, + "step": 4924 + }, + { + "epoch": 4.497716894977169, + "grad_norm": 37.7362174987793, + "learning_rate": 6.114662607813293e-06, + "loss": 0.3473, + "step": 4925 + }, + { + "epoch": 4.498630136986302, + "grad_norm": 2.609520673751831, + "learning_rate": 6.113647894469813e-06, + "loss": 0.0198, + "step": 4926 + }, + { + "epoch": 4.499543378995433, + "grad_norm": 35.20524978637695, + "learning_rate": 6.112633181126333e-06, + "loss": 0.2129, + "step": 4927 + }, + { + "epoch": 4.500456621004567, + "grad_norm": 6.961897373199463, + "learning_rate": 6.111618467782852e-06, + "loss": 0.0427, + "step": 4928 + }, + { + "epoch": 4.501369863013698, + "grad_norm": 1.712124228477478, + "learning_rate": 6.1106037544393715e-06, + "loss": 0.0076, + "step": 4929 + }, + { + "epoch": 4.502283105022831, + "grad_norm": 8.047490119934082, + "learning_rate": 6.109589041095891e-06, + "loss": 0.0737, + "step": 4930 + }, + { + "epoch": 4.503196347031963, + "grad_norm": 0.956339955329895, + "learning_rate": 6.10857432775241e-06, + "loss": 0.0068, + "step": 4931 + }, + { + "epoch": 4.504109589041096, + "grad_norm": 8.238567352294922, + "learning_rate": 6.10755961440893e-06, + "loss": 0.0645, + "step": 4932 + }, + { + "epoch": 4.505022831050228, + "grad_norm": 0.19102199375629425, + "learning_rate": 6.1065449010654494e-06, + "loss": 0.0012, + "step": 4933 + }, + { + "epoch": 4.5059360730593605, + "grad_norm": 1.376244306564331, + "learning_rate": 6.105530187721968e-06, + "loss": 0.0088, + "step": 4934 + }, + { + "epoch": 4.506849315068493, + "grad_norm": 25.611133575439453, + "learning_rate": 6.104515474378489e-06, + "loss": 0.206, + "step": 4935 + }, + { + "epoch": 4.507762557077625, + "grad_norm": 4.0026021003723145, + "learning_rate": 6.1035007610350085e-06, + "loss": 0.0255, + "step": 4936 + }, + { + "epoch": 4.508675799086758, + "grad_norm": 0.9970349669456482, + "learning_rate": 6.102486047691528e-06, + "loss": 0.0097, + "step": 4937 + }, + { + "epoch": 4.50958904109589, + "grad_norm": 34.09914016723633, + "learning_rate": 6.101471334348047e-06, + "loss": 0.1689, + "step": 4938 + }, + { + "epoch": 4.510502283105023, + "grad_norm": 13.34179973602295, + "learning_rate": 6.100456621004567e-06, + "loss": 0.1558, + "step": 4939 + }, + { + "epoch": 4.511415525114155, + "grad_norm": 0.7548406720161438, + "learning_rate": 6.099441907661086e-06, + "loss": 0.0082, + "step": 4940 + }, + { + "epoch": 4.512328767123288, + "grad_norm": 0.4636147916316986, + "learning_rate": 6.098427194317605e-06, + "loss": 0.0028, + "step": 4941 + }, + { + "epoch": 4.51324200913242, + "grad_norm": 1.76813805103302, + "learning_rate": 6.097412480974125e-06, + "loss": 0.0129, + "step": 4942 + }, + { + "epoch": 4.514155251141553, + "grad_norm": 11.935117721557617, + "learning_rate": 6.0963977676306455e-06, + "loss": 0.0758, + "step": 4943 + }, + { + "epoch": 4.515068493150685, + "grad_norm": 3.3046932220458984, + "learning_rate": 6.095383054287164e-06, + "loss": 0.0148, + "step": 4944 + }, + { + "epoch": 4.5159817351598175, + "grad_norm": 17.11959457397461, + "learning_rate": 6.094368340943684e-06, + "loss": 0.0743, + "step": 4945 + }, + { + "epoch": 4.51689497716895, + "grad_norm": 35.50286102294922, + "learning_rate": 6.093353627600204e-06, + "loss": 0.3828, + "step": 4946 + }, + { + "epoch": 4.517808219178082, + "grad_norm": 37.36850357055664, + "learning_rate": 6.092338914256723e-06, + "loss": 0.4379, + "step": 4947 + }, + { + "epoch": 4.518721461187215, + "grad_norm": 79.39689636230469, + "learning_rate": 6.091324200913242e-06, + "loss": 0.6034, + "step": 4948 + }, + { + "epoch": 4.519634703196347, + "grad_norm": 0.326158732175827, + "learning_rate": 6.090309487569762e-06, + "loss": 0.0024, + "step": 4949 + }, + { + "epoch": 4.52054794520548, + "grad_norm": 1.4555734395980835, + "learning_rate": 6.089294774226282e-06, + "loss": 0.0119, + "step": 4950 + }, + { + "epoch": 4.521461187214612, + "grad_norm": 6.3045525550842285, + "learning_rate": 6.0882800608828005e-06, + "loss": 0.0544, + "step": 4951 + }, + { + "epoch": 4.522374429223745, + "grad_norm": 57.88554000854492, + "learning_rate": 6.087265347539321e-06, + "loss": 0.7651, + "step": 4952 + }, + { + "epoch": 4.523287671232877, + "grad_norm": 13.819686889648438, + "learning_rate": 6.086250634195841e-06, + "loss": 0.0901, + "step": 4953 + }, + { + "epoch": 4.524200913242009, + "grad_norm": 4.938825607299805, + "learning_rate": 6.0852359208523596e-06, + "loss": 0.0387, + "step": 4954 + }, + { + "epoch": 4.525114155251142, + "grad_norm": 30.48051643371582, + "learning_rate": 6.084221207508879e-06, + "loss": 0.1937, + "step": 4955 + }, + { + "epoch": 4.526027397260274, + "grad_norm": 5.513085842132568, + "learning_rate": 6.083206494165399e-06, + "loss": 0.0313, + "step": 4956 + }, + { + "epoch": 4.526940639269406, + "grad_norm": 5.443197250366211, + "learning_rate": 6.082191780821919e-06, + "loss": 0.0475, + "step": 4957 + }, + { + "epoch": 4.5278538812785385, + "grad_norm": 41.82814407348633, + "learning_rate": 6.0811770674784375e-06, + "loss": 0.2349, + "step": 4958 + }, + { + "epoch": 4.528767123287671, + "grad_norm": 0.4692452549934387, + "learning_rate": 6.080162354134957e-06, + "loss": 0.0039, + "step": 4959 + }, + { + "epoch": 4.529680365296803, + "grad_norm": 2.942383050918579, + "learning_rate": 6.079147640791478e-06, + "loss": 0.0167, + "step": 4960 + }, + { + "epoch": 4.530593607305936, + "grad_norm": 7.061032772064209, + "learning_rate": 6.078132927447996e-06, + "loss": 0.0477, + "step": 4961 + }, + { + "epoch": 4.531506849315068, + "grad_norm": 3.908989429473877, + "learning_rate": 6.077118214104516e-06, + "loss": 0.0258, + "step": 4962 + }, + { + "epoch": 4.532420091324201, + "grad_norm": 1.71480131149292, + "learning_rate": 6.076103500761036e-06, + "loss": 0.0123, + "step": 4963 + }, + { + "epoch": 4.533333333333333, + "grad_norm": 39.683292388916016, + "learning_rate": 6.075088787417555e-06, + "loss": 0.3813, + "step": 4964 + }, + { + "epoch": 4.534246575342466, + "grad_norm": 3.938717842102051, + "learning_rate": 6.0740740740740745e-06, + "loss": 0.0291, + "step": 4965 + }, + { + "epoch": 4.535159817351598, + "grad_norm": 0.049052439630031586, + "learning_rate": 6.073059360730594e-06, + "loss": 0.0004, + "step": 4966 + }, + { + "epoch": 4.536073059360731, + "grad_norm": 34.27362060546875, + "learning_rate": 6.072044647387114e-06, + "loss": 0.2562, + "step": 4967 + }, + { + "epoch": 4.536986301369863, + "grad_norm": 17.649690628051758, + "learning_rate": 6.071029934043633e-06, + "loss": 0.1212, + "step": 4968 + }, + { + "epoch": 4.5378995433789955, + "grad_norm": 0.7052974104881287, + "learning_rate": 6.070015220700152e-06, + "loss": 0.0058, + "step": 4969 + }, + { + "epoch": 4.538812785388128, + "grad_norm": 10.50788402557373, + "learning_rate": 6.069000507356673e-06, + "loss": 0.0853, + "step": 4970 + }, + { + "epoch": 4.53972602739726, + "grad_norm": 6.911248683929443, + "learning_rate": 6.067985794013192e-06, + "loss": 0.069, + "step": 4971 + }, + { + "epoch": 4.540639269406393, + "grad_norm": 2.685328483581543, + "learning_rate": 6.0669710806697115e-06, + "loss": 0.018, + "step": 4972 + }, + { + "epoch": 4.541552511415525, + "grad_norm": 27.870010375976562, + "learning_rate": 6.065956367326231e-06, + "loss": 0.158, + "step": 4973 + }, + { + "epoch": 4.542465753424658, + "grad_norm": 80.76605224609375, + "learning_rate": 6.06494165398275e-06, + "loss": 0.84, + "step": 4974 + }, + { + "epoch": 4.54337899543379, + "grad_norm": 0.4481177031993866, + "learning_rate": 6.06392694063927e-06, + "loss": 0.0032, + "step": 4975 + }, + { + "epoch": 4.544292237442923, + "grad_norm": 30.508636474609375, + "learning_rate": 6.062912227295789e-06, + "loss": 0.161, + "step": 4976 + }, + { + "epoch": 4.545205479452055, + "grad_norm": 4.389225006103516, + "learning_rate": 6.061897513952309e-06, + "loss": 0.0321, + "step": 4977 + }, + { + "epoch": 4.546118721461188, + "grad_norm": 37.19953918457031, + "learning_rate": 6.060882800608828e-06, + "loss": 0.211, + "step": 4978 + }, + { + "epoch": 4.54703196347032, + "grad_norm": 41.70857620239258, + "learning_rate": 6.0598680872653485e-06, + "loss": 0.4933, + "step": 4979 + }, + { + "epoch": 4.5479452054794525, + "grad_norm": 1.0834146738052368, + "learning_rate": 6.058853373921868e-06, + "loss": 0.0065, + "step": 4980 + }, + { + "epoch": 4.548858447488584, + "grad_norm": 0.23764824867248535, + "learning_rate": 6.057838660578387e-06, + "loss": 0.0013, + "step": 4981 + }, + { + "epoch": 4.549771689497717, + "grad_norm": 59.67295455932617, + "learning_rate": 6.056823947234907e-06, + "loss": 0.3433, + "step": 4982 + }, + { + "epoch": 4.550684931506849, + "grad_norm": 13.386343002319336, + "learning_rate": 6.055809233891426e-06, + "loss": 0.1413, + "step": 4983 + }, + { + "epoch": 4.551598173515981, + "grad_norm": 4.114813327789307, + "learning_rate": 6.054794520547945e-06, + "loss": 0.0129, + "step": 4984 + }, + { + "epoch": 4.552511415525114, + "grad_norm": 1.1069068908691406, + "learning_rate": 6.053779807204465e-06, + "loss": 0.0066, + "step": 4985 + }, + { + "epoch": 4.553424657534246, + "grad_norm": 5.0698933601379395, + "learning_rate": 6.052765093860985e-06, + "loss": 0.0322, + "step": 4986 + }, + { + "epoch": 4.554337899543379, + "grad_norm": 63.52521514892578, + "learning_rate": 6.051750380517505e-06, + "loss": 1.4068, + "step": 4987 + }, + { + "epoch": 4.555251141552511, + "grad_norm": 1.1979459524154663, + "learning_rate": 6.050735667174024e-06, + "loss": 0.0103, + "step": 4988 + }, + { + "epoch": 4.556164383561644, + "grad_norm": 4.262155532836914, + "learning_rate": 6.049720953830544e-06, + "loss": 0.0143, + "step": 4989 + }, + { + "epoch": 4.557077625570776, + "grad_norm": 6.781479358673096, + "learning_rate": 6.048706240487063e-06, + "loss": 0.055, + "step": 4990 + }, + { + "epoch": 4.557990867579909, + "grad_norm": 0.1359366923570633, + "learning_rate": 6.047691527143582e-06, + "loss": 0.0009, + "step": 4991 + }, + { + "epoch": 4.558904109589041, + "grad_norm": 0.27076610922813416, + "learning_rate": 6.046676813800102e-06, + "loss": 0.0017, + "step": 4992 + }, + { + "epoch": 4.5598173515981735, + "grad_norm": 5.296483516693115, + "learning_rate": 6.045662100456622e-06, + "loss": 0.0426, + "step": 4993 + }, + { + "epoch": 4.560730593607306, + "grad_norm": 24.81288719177246, + "learning_rate": 6.0446473871131405e-06, + "loss": 0.2497, + "step": 4994 + }, + { + "epoch": 4.561643835616438, + "grad_norm": 29.241657257080078, + "learning_rate": 6.04363267376966e-06, + "loss": 0.2848, + "step": 4995 + }, + { + "epoch": 4.562557077625571, + "grad_norm": 2.0587263107299805, + "learning_rate": 6.042617960426181e-06, + "loss": 0.0168, + "step": 4996 + }, + { + "epoch": 4.563470319634703, + "grad_norm": 0.37372374534606934, + "learning_rate": 6.0416032470827e-06, + "loss": 0.0031, + "step": 4997 + }, + { + "epoch": 4.564383561643836, + "grad_norm": 6.383928298950195, + "learning_rate": 6.040588533739219e-06, + "loss": 0.0195, + "step": 4998 + }, + { + "epoch": 4.565296803652968, + "grad_norm": 3.644653081893921, + "learning_rate": 6.039573820395739e-06, + "loss": 0.0351, + "step": 4999 + }, + { + "epoch": 4.566210045662101, + "grad_norm": 0.41059207916259766, + "learning_rate": 6.038559107052259e-06, + "loss": 0.003, + "step": 5000 + }, + { + "epoch": 4.567123287671233, + "grad_norm": 11.967215538024902, + "learning_rate": 6.0375443937087775e-06, + "loss": 0.1187, + "step": 5001 + }, + { + "epoch": 4.5680365296803656, + "grad_norm": 10.992149353027344, + "learning_rate": 6.036529680365297e-06, + "loss": 0.1074, + "step": 5002 + }, + { + "epoch": 4.568949771689498, + "grad_norm": 1.308815836906433, + "learning_rate": 6.035514967021817e-06, + "loss": 0.0073, + "step": 5003 + }, + { + "epoch": 4.5698630136986305, + "grad_norm": 0.8909569978713989, + "learning_rate": 6.034500253678336e-06, + "loss": 0.0072, + "step": 5004 + }, + { + "epoch": 4.570776255707763, + "grad_norm": 1.572127103805542, + "learning_rate": 6.033485540334855e-06, + "loss": 0.0083, + "step": 5005 + }, + { + "epoch": 4.5716894977168945, + "grad_norm": 10.892782211303711, + "learning_rate": 6.032470826991376e-06, + "loss": 0.0718, + "step": 5006 + }, + { + "epoch": 4.572602739726028, + "grad_norm": 114.73028564453125, + "learning_rate": 6.031456113647896e-06, + "loss": 5.1967, + "step": 5007 + }, + { + "epoch": 4.573515981735159, + "grad_norm": 1.3922463655471802, + "learning_rate": 6.0304414003044145e-06, + "loss": 0.0086, + "step": 5008 + }, + { + "epoch": 4.574429223744293, + "grad_norm": 2.4980807304382324, + "learning_rate": 6.029426686960934e-06, + "loss": 0.0197, + "step": 5009 + }, + { + "epoch": 4.575342465753424, + "grad_norm": 2.914177417755127, + "learning_rate": 6.028411973617454e-06, + "loss": 0.0259, + "step": 5010 + }, + { + "epoch": 4.576255707762557, + "grad_norm": 7.190791606903076, + "learning_rate": 6.027397260273973e-06, + "loss": 0.0411, + "step": 5011 + }, + { + "epoch": 4.577168949771689, + "grad_norm": 2.9373133182525635, + "learning_rate": 6.026382546930492e-06, + "loss": 0.0232, + "step": 5012 + }, + { + "epoch": 4.578082191780822, + "grad_norm": 0.22967834770679474, + "learning_rate": 6.025367833587012e-06, + "loss": 0.0013, + "step": 5013 + }, + { + "epoch": 4.578995433789954, + "grad_norm": 0.9546313285827637, + "learning_rate": 6.024353120243531e-06, + "loss": 0.0065, + "step": 5014 + }, + { + "epoch": 4.579908675799087, + "grad_norm": 1.6920503377914429, + "learning_rate": 6.0233384069000515e-06, + "loss": 0.0103, + "step": 5015 + }, + { + "epoch": 4.580821917808219, + "grad_norm": 8.338447570800781, + "learning_rate": 6.022323693556571e-06, + "loss": 0.0638, + "step": 5016 + }, + { + "epoch": 4.5817351598173515, + "grad_norm": 11.60546588897705, + "learning_rate": 6.021308980213091e-06, + "loss": 0.0839, + "step": 5017 + }, + { + "epoch": 4.582648401826484, + "grad_norm": 1.9531214237213135, + "learning_rate": 6.02029426686961e-06, + "loss": 0.0147, + "step": 5018 + }, + { + "epoch": 4.583561643835616, + "grad_norm": 33.19252014160156, + "learning_rate": 6.019279553526129e-06, + "loss": 0.3277, + "step": 5019 + }, + { + "epoch": 4.584474885844749, + "grad_norm": 19.979124069213867, + "learning_rate": 6.018264840182649e-06, + "loss": 0.1846, + "step": 5020 + }, + { + "epoch": 4.585388127853881, + "grad_norm": 4.384936809539795, + "learning_rate": 6.017250126839168e-06, + "loss": 0.0198, + "step": 5021 + }, + { + "epoch": 4.586301369863014, + "grad_norm": 0.13494259119033813, + "learning_rate": 6.016235413495688e-06, + "loss": 0.0011, + "step": 5022 + }, + { + "epoch": 4.587214611872146, + "grad_norm": 13.891939163208008, + "learning_rate": 6.015220700152208e-06, + "loss": 0.1276, + "step": 5023 + }, + { + "epoch": 4.588127853881279, + "grad_norm": 1.2272969484329224, + "learning_rate": 6.014205986808727e-06, + "loss": 0.0083, + "step": 5024 + }, + { + "epoch": 4.589041095890411, + "grad_norm": 4.41409158706665, + "learning_rate": 6.013191273465247e-06, + "loss": 0.0231, + "step": 5025 + }, + { + "epoch": 4.5899543378995435, + "grad_norm": 5.938418865203857, + "learning_rate": 6.012176560121766e-06, + "loss": 0.0686, + "step": 5026 + }, + { + "epoch": 4.590867579908676, + "grad_norm": 1.7382020950317383, + "learning_rate": 6.011161846778286e-06, + "loss": 0.016, + "step": 5027 + }, + { + "epoch": 4.5917808219178085, + "grad_norm": 7.298681259155273, + "learning_rate": 6.010147133434805e-06, + "loss": 0.0507, + "step": 5028 + }, + { + "epoch": 4.592694063926941, + "grad_norm": 5.1260809898376465, + "learning_rate": 6.009132420091325e-06, + "loss": 0.0375, + "step": 5029 + }, + { + "epoch": 4.593607305936073, + "grad_norm": 43.801700592041016, + "learning_rate": 6.008117706747844e-06, + "loss": 0.353, + "step": 5030 + }, + { + "epoch": 4.594520547945206, + "grad_norm": 0.7032362818717957, + "learning_rate": 6.007102993404363e-06, + "loss": 0.0046, + "step": 5031 + }, + { + "epoch": 4.595433789954338, + "grad_norm": 4.4822282791137695, + "learning_rate": 6.006088280060884e-06, + "loss": 0.0294, + "step": 5032 + }, + { + "epoch": 4.59634703196347, + "grad_norm": 26.322246551513672, + "learning_rate": 6.005073566717403e-06, + "loss": 0.143, + "step": 5033 + }, + { + "epoch": 4.597260273972603, + "grad_norm": 0.4281940758228302, + "learning_rate": 6.004058853373922e-06, + "loss": 0.003, + "step": 5034 + }, + { + "epoch": 4.598173515981735, + "grad_norm": 4.512266635894775, + "learning_rate": 6.003044140030442e-06, + "loss": 0.0262, + "step": 5035 + }, + { + "epoch": 4.599086757990867, + "grad_norm": 2.250082015991211, + "learning_rate": 6.002029426686962e-06, + "loss": 0.0153, + "step": 5036 + }, + { + "epoch": 4.6, + "grad_norm": 2.2504239082336426, + "learning_rate": 6.001014713343481e-06, + "loss": 0.0127, + "step": 5037 + }, + { + "epoch": 4.600913242009132, + "grad_norm": 25.443416595458984, + "learning_rate": 6e-06, + "loss": 0.1774, + "step": 5038 + }, + { + "epoch": 4.6018264840182646, + "grad_norm": 4.183566093444824, + "learning_rate": 5.99898528665652e-06, + "loss": 0.0375, + "step": 5039 + }, + { + "epoch": 4.602739726027397, + "grad_norm": 30.48594093322754, + "learning_rate": 5.99797057331304e-06, + "loss": 0.3492, + "step": 5040 + }, + { + "epoch": 4.6036529680365295, + "grad_norm": 0.778468668460846, + "learning_rate": 5.996955859969558e-06, + "loss": 0.0042, + "step": 5041 + }, + { + "epoch": 4.604566210045662, + "grad_norm": 1.6701767444610596, + "learning_rate": 5.995941146626079e-06, + "loss": 0.0145, + "step": 5042 + }, + { + "epoch": 4.605479452054794, + "grad_norm": 143.5858154296875, + "learning_rate": 5.994926433282599e-06, + "loss": 2.0512, + "step": 5043 + }, + { + "epoch": 4.606392694063927, + "grad_norm": 33.95526885986328, + "learning_rate": 5.9939117199391175e-06, + "loss": 0.2662, + "step": 5044 + }, + { + "epoch": 4.607305936073059, + "grad_norm": 8.529084205627441, + "learning_rate": 5.992897006595637e-06, + "loss": 0.0608, + "step": 5045 + }, + { + "epoch": 4.608219178082192, + "grad_norm": 35.2895622253418, + "learning_rate": 5.991882293252157e-06, + "loss": 0.3138, + "step": 5046 + }, + { + "epoch": 4.609132420091324, + "grad_norm": 0.8134130835533142, + "learning_rate": 5.990867579908676e-06, + "loss": 0.0055, + "step": 5047 + }, + { + "epoch": 4.610045662100457, + "grad_norm": 13.541447639465332, + "learning_rate": 5.989852866565195e-06, + "loss": 0.1296, + "step": 5048 + }, + { + "epoch": 4.610958904109589, + "grad_norm": 1.4799625873565674, + "learning_rate": 5.988838153221715e-06, + "loss": 0.009, + "step": 5049 + }, + { + "epoch": 4.6118721461187215, + "grad_norm": 1.480337142944336, + "learning_rate": 5.987823439878236e-06, + "loss": 0.0104, + "step": 5050 + }, + { + "epoch": 4.612785388127854, + "grad_norm": 2.389274835586548, + "learning_rate": 5.9868087265347545e-06, + "loss": 0.0181, + "step": 5051 + }, + { + "epoch": 4.6136986301369864, + "grad_norm": 0.4836570918560028, + "learning_rate": 5.985794013191274e-06, + "loss": 0.0027, + "step": 5052 + }, + { + "epoch": 4.614611872146119, + "grad_norm": 15.469552040100098, + "learning_rate": 5.984779299847794e-06, + "loss": 0.13, + "step": 5053 + }, + { + "epoch": 4.615525114155251, + "grad_norm": 4.30690860748291, + "learning_rate": 5.983764586504313e-06, + "loss": 0.0394, + "step": 5054 + }, + { + "epoch": 4.616438356164384, + "grad_norm": 0.5510467886924744, + "learning_rate": 5.982749873160832e-06, + "loss": 0.0032, + "step": 5055 + }, + { + "epoch": 4.617351598173516, + "grad_norm": 0.16564542055130005, + "learning_rate": 5.981735159817352e-06, + "loss": 0.0013, + "step": 5056 + }, + { + "epoch": 4.618264840182649, + "grad_norm": 0.6738660931587219, + "learning_rate": 5.980720446473871e-06, + "loss": 0.006, + "step": 5057 + }, + { + "epoch": 4.619178082191781, + "grad_norm": 0.5617876052856445, + "learning_rate": 5.979705733130391e-06, + "loss": 0.004, + "step": 5058 + }, + { + "epoch": 4.620091324200914, + "grad_norm": 2.7335448265075684, + "learning_rate": 5.978691019786911e-06, + "loss": 0.0215, + "step": 5059 + }, + { + "epoch": 4.621004566210045, + "grad_norm": 25.561317443847656, + "learning_rate": 5.977676306443431e-06, + "loss": 0.1883, + "step": 5060 + }, + { + "epoch": 4.6219178082191785, + "grad_norm": 25.834428787231445, + "learning_rate": 5.97666159309995e-06, + "loss": 0.2604, + "step": 5061 + }, + { + "epoch": 4.62283105022831, + "grad_norm": 21.11931610107422, + "learning_rate": 5.975646879756469e-06, + "loss": 0.0761, + "step": 5062 + }, + { + "epoch": 4.6237442922374425, + "grad_norm": 1.5435904264450073, + "learning_rate": 5.974632166412989e-06, + "loss": 0.013, + "step": 5063 + }, + { + "epoch": 4.624657534246575, + "grad_norm": 2.5628254413604736, + "learning_rate": 5.973617453069508e-06, + "loss": 0.0262, + "step": 5064 + }, + { + "epoch": 4.6255707762557075, + "grad_norm": 18.754541397094727, + "learning_rate": 5.972602739726028e-06, + "loss": 0.1337, + "step": 5065 + }, + { + "epoch": 4.62648401826484, + "grad_norm": 31.832738876342773, + "learning_rate": 5.971588026382547e-06, + "loss": 0.221, + "step": 5066 + }, + { + "epoch": 4.627397260273972, + "grad_norm": 3.36777925491333, + "learning_rate": 5.970573313039066e-06, + "loss": 0.0233, + "step": 5067 + }, + { + "epoch": 4.628310502283105, + "grad_norm": 5.993345737457275, + "learning_rate": 5.969558599695587e-06, + "loss": 0.049, + "step": 5068 + }, + { + "epoch": 4.629223744292237, + "grad_norm": 5.2384490966796875, + "learning_rate": 5.968543886352106e-06, + "loss": 0.0032, + "step": 5069 + }, + { + "epoch": 4.63013698630137, + "grad_norm": 8.678038597106934, + "learning_rate": 5.967529173008626e-06, + "loss": 0.0621, + "step": 5070 + }, + { + "epoch": 4.631050228310502, + "grad_norm": 0.8538850545883179, + "learning_rate": 5.966514459665145e-06, + "loss": 0.0045, + "step": 5071 + }, + { + "epoch": 4.631963470319635, + "grad_norm": 1.4319466352462769, + "learning_rate": 5.965499746321665e-06, + "loss": 0.0092, + "step": 5072 + }, + { + "epoch": 4.632876712328767, + "grad_norm": 2.541191816329956, + "learning_rate": 5.964485032978184e-06, + "loss": 0.0183, + "step": 5073 + }, + { + "epoch": 4.6337899543378995, + "grad_norm": 8.988454818725586, + "learning_rate": 5.963470319634703e-06, + "loss": 0.0561, + "step": 5074 + }, + { + "epoch": 4.634703196347032, + "grad_norm": 10.626016616821289, + "learning_rate": 5.962455606291223e-06, + "loss": 0.0737, + "step": 5075 + }, + { + "epoch": 4.635616438356164, + "grad_norm": 1.5747560262680054, + "learning_rate": 5.961440892947743e-06, + "loss": 0.0124, + "step": 5076 + }, + { + "epoch": 4.636529680365297, + "grad_norm": 0.6005256772041321, + "learning_rate": 5.960426179604261e-06, + "loss": 0.0046, + "step": 5077 + }, + { + "epoch": 4.637442922374429, + "grad_norm": 29.476730346679688, + "learning_rate": 5.959411466260782e-06, + "loss": 0.2429, + "step": 5078 + }, + { + "epoch": 4.638356164383562, + "grad_norm": 2.1090736389160156, + "learning_rate": 5.958396752917302e-06, + "loss": 0.0139, + "step": 5079 + }, + { + "epoch": 4.639269406392694, + "grad_norm": 1.0041561126708984, + "learning_rate": 5.957382039573821e-06, + "loss": 0.0058, + "step": 5080 + }, + { + "epoch": 4.640182648401827, + "grad_norm": 106.09864044189453, + "learning_rate": 5.95636732623034e-06, + "loss": 1.2479, + "step": 5081 + }, + { + "epoch": 4.641095890410959, + "grad_norm": 3.0740530490875244, + "learning_rate": 5.95535261288686e-06, + "loss": 0.025, + "step": 5082 + }, + { + "epoch": 4.642009132420092, + "grad_norm": 0.7308427691459656, + "learning_rate": 5.9543378995433795e-06, + "loss": 0.0061, + "step": 5083 + }, + { + "epoch": 4.642922374429224, + "grad_norm": 31.07021141052246, + "learning_rate": 5.953323186199898e-06, + "loss": 0.2618, + "step": 5084 + }, + { + "epoch": 4.6438356164383565, + "grad_norm": 2.8999648094177246, + "learning_rate": 5.952308472856418e-06, + "loss": 0.024, + "step": 5085 + }, + { + "epoch": 4.644748858447489, + "grad_norm": 35.11711120605469, + "learning_rate": 5.951293759512939e-06, + "loss": 0.2257, + "step": 5086 + }, + { + "epoch": 4.6456621004566205, + "grad_norm": 15.425747871398926, + "learning_rate": 5.9502790461694574e-06, + "loss": 0.0906, + "step": 5087 + }, + { + "epoch": 4.646575342465754, + "grad_norm": 0.4656763970851898, + "learning_rate": 5.949264332825977e-06, + "loss": 0.0045, + "step": 5088 + }, + { + "epoch": 4.647488584474885, + "grad_norm": 5.34515380859375, + "learning_rate": 5.948249619482497e-06, + "loss": 0.0294, + "step": 5089 + }, + { + "epoch": 4.648401826484018, + "grad_norm": 0.6302225589752197, + "learning_rate": 5.9472349061390165e-06, + "loss": 0.0054, + "step": 5090 + }, + { + "epoch": 4.64931506849315, + "grad_norm": 67.54969787597656, + "learning_rate": 5.946220192795535e-06, + "loss": 0.6448, + "step": 5091 + }, + { + "epoch": 4.650228310502283, + "grad_norm": 2.7829647064208984, + "learning_rate": 5.945205479452055e-06, + "loss": 0.0204, + "step": 5092 + }, + { + "epoch": 4.651141552511415, + "grad_norm": 1.2766972780227661, + "learning_rate": 5.944190766108575e-06, + "loss": 0.0111, + "step": 5093 + }, + { + "epoch": 4.652054794520548, + "grad_norm": 2.475771427154541, + "learning_rate": 5.943176052765094e-06, + "loss": 0.0134, + "step": 5094 + }, + { + "epoch": 4.65296803652968, + "grad_norm": 6.649659156799316, + "learning_rate": 5.942161339421614e-06, + "loss": 0.0297, + "step": 5095 + }, + { + "epoch": 4.653881278538813, + "grad_norm": 10.59902286529541, + "learning_rate": 5.941146626078134e-06, + "loss": 0.0996, + "step": 5096 + }, + { + "epoch": 4.654794520547945, + "grad_norm": 26.395160675048828, + "learning_rate": 5.940131912734653e-06, + "loss": 0.1739, + "step": 5097 + }, + { + "epoch": 4.6557077625570775, + "grad_norm": 24.51691436767578, + "learning_rate": 5.939117199391172e-06, + "loss": 0.1583, + "step": 5098 + }, + { + "epoch": 4.65662100456621, + "grad_norm": 0.6297678351402283, + "learning_rate": 5.938102486047692e-06, + "loss": 0.0055, + "step": 5099 + }, + { + "epoch": 4.657534246575342, + "grad_norm": 12.086685180664062, + "learning_rate": 5.937087772704212e-06, + "loss": 0.092, + "step": 5100 + }, + { + "epoch": 4.658447488584475, + "grad_norm": 1.026427984237671, + "learning_rate": 5.936073059360731e-06, + "loss": 0.0083, + "step": 5101 + }, + { + "epoch": 4.659360730593607, + "grad_norm": 81.24484252929688, + "learning_rate": 5.93505834601725e-06, + "loss": 0.7828, + "step": 5102 + }, + { + "epoch": 4.66027397260274, + "grad_norm": 11.704761505126953, + "learning_rate": 5.934043632673771e-06, + "loss": 0.0809, + "step": 5103 + }, + { + "epoch": 4.661187214611872, + "grad_norm": 39.28131103515625, + "learning_rate": 5.933028919330289e-06, + "loss": 0.3211, + "step": 5104 + }, + { + "epoch": 4.662100456621005, + "grad_norm": 121.53456115722656, + "learning_rate": 5.932014205986809e-06, + "loss": 4.0315, + "step": 5105 + }, + { + "epoch": 4.663013698630137, + "grad_norm": 1.7868082523345947, + "learning_rate": 5.930999492643329e-06, + "loss": 0.0056, + "step": 5106 + }, + { + "epoch": 4.66392694063927, + "grad_norm": 3.8454790115356445, + "learning_rate": 5.929984779299848e-06, + "loss": 0.0329, + "step": 5107 + }, + { + "epoch": 4.664840182648402, + "grad_norm": 32.82209014892578, + "learning_rate": 5.928970065956368e-06, + "loss": 0.2861, + "step": 5108 + }, + { + "epoch": 4.6657534246575345, + "grad_norm": 2.2014715671539307, + "learning_rate": 5.927955352612887e-06, + "loss": 0.0164, + "step": 5109 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 85.48332214355469, + "learning_rate": 5.926940639269407e-06, + "loss": 0.6225, + "step": 5110 + }, + { + "epoch": 4.667579908675799, + "grad_norm": 24.623332977294922, + "learning_rate": 5.925925925925926e-06, + "loss": 0.1409, + "step": 5111 + }, + { + "epoch": 4.668493150684932, + "grad_norm": 22.981279373168945, + "learning_rate": 5.924911212582446e-06, + "loss": 0.2109, + "step": 5112 + }, + { + "epoch": 4.669406392694064, + "grad_norm": 3.4746034145355225, + "learning_rate": 5.923896499238966e-06, + "loss": 0.0206, + "step": 5113 + }, + { + "epoch": 4.670319634703196, + "grad_norm": 2.1119225025177, + "learning_rate": 5.922881785895485e-06, + "loss": 0.0088, + "step": 5114 + }, + { + "epoch": 4.671232876712329, + "grad_norm": 44.129852294921875, + "learning_rate": 5.921867072552005e-06, + "loss": 0.6145, + "step": 5115 + }, + { + "epoch": 4.672146118721461, + "grad_norm": 27.356643676757812, + "learning_rate": 5.920852359208524e-06, + "loss": 0.2307, + "step": 5116 + }, + { + "epoch": 4.673059360730593, + "grad_norm": 4.234926700592041, + "learning_rate": 5.919837645865043e-06, + "loss": 0.0293, + "step": 5117 + }, + { + "epoch": 4.673972602739726, + "grad_norm": 4.005085468292236, + "learning_rate": 5.918822932521563e-06, + "loss": 0.033, + "step": 5118 + }, + { + "epoch": 4.674885844748858, + "grad_norm": 0.23507019877433777, + "learning_rate": 5.9178082191780825e-06, + "loss": 0.0018, + "step": 5119 + }, + { + "epoch": 4.675799086757991, + "grad_norm": 0.4941173791885376, + "learning_rate": 5.916793505834603e-06, + "loss": 0.0027, + "step": 5120 + }, + { + "epoch": 4.676712328767123, + "grad_norm": 1.4435031414031982, + "learning_rate": 5.915778792491121e-06, + "loss": 0.0116, + "step": 5121 + }, + { + "epoch": 4.6776255707762555, + "grad_norm": 3.564021587371826, + "learning_rate": 5.914764079147642e-06, + "loss": 0.0164, + "step": 5122 + }, + { + "epoch": 4.678538812785388, + "grad_norm": 5.824434280395508, + "learning_rate": 5.913749365804161e-06, + "loss": 0.0471, + "step": 5123 + }, + { + "epoch": 4.67945205479452, + "grad_norm": 9.574975967407227, + "learning_rate": 5.91273465246068e-06, + "loss": 0.1051, + "step": 5124 + }, + { + "epoch": 4.680365296803653, + "grad_norm": 7.2994914054870605, + "learning_rate": 5.9117199391172e-06, + "loss": 0.0614, + "step": 5125 + }, + { + "epoch": 4.681278538812785, + "grad_norm": 2.658771276473999, + "learning_rate": 5.9107052257737195e-06, + "loss": 0.0179, + "step": 5126 + }, + { + "epoch": 4.682191780821918, + "grad_norm": 8.704083442687988, + "learning_rate": 5.909690512430238e-06, + "loss": 0.05, + "step": 5127 + }, + { + "epoch": 4.68310502283105, + "grad_norm": 0.9925094246864319, + "learning_rate": 5.908675799086758e-06, + "loss": 0.0052, + "step": 5128 + }, + { + "epoch": 4.684018264840183, + "grad_norm": 0.10914366692304611, + "learning_rate": 5.907661085743278e-06, + "loss": 0.0006, + "step": 5129 + }, + { + "epoch": 4.684931506849315, + "grad_norm": 28.044214248657227, + "learning_rate": 5.906646372399798e-06, + "loss": 0.2721, + "step": 5130 + }, + { + "epoch": 4.685844748858448, + "grad_norm": 5.1799187660217285, + "learning_rate": 5.905631659056317e-06, + "loss": 0.0459, + "step": 5131 + }, + { + "epoch": 4.68675799086758, + "grad_norm": 100.89847564697266, + "learning_rate": 5.904616945712837e-06, + "loss": 2.4404, + "step": 5132 + }, + { + "epoch": 4.6876712328767125, + "grad_norm": 1.9082090854644775, + "learning_rate": 5.9036022323693565e-06, + "loss": 0.0124, + "step": 5133 + }, + { + "epoch": 4.688584474885845, + "grad_norm": 0.19476819038391113, + "learning_rate": 5.902587519025875e-06, + "loss": 0.0014, + "step": 5134 + }, + { + "epoch": 4.689497716894977, + "grad_norm": 9.608514785766602, + "learning_rate": 5.901572805682395e-06, + "loss": 0.0591, + "step": 5135 + }, + { + "epoch": 4.69041095890411, + "grad_norm": 23.480499267578125, + "learning_rate": 5.900558092338915e-06, + "loss": 0.1401, + "step": 5136 + }, + { + "epoch": 4.691324200913242, + "grad_norm": 3.0080533027648926, + "learning_rate": 5.8995433789954336e-06, + "loss": 0.0267, + "step": 5137 + }, + { + "epoch": 4.692237442922375, + "grad_norm": 113.10763549804688, + "learning_rate": 5.898528665651953e-06, + "loss": 2.0085, + "step": 5138 + }, + { + "epoch": 4.693150684931507, + "grad_norm": 4.919704914093018, + "learning_rate": 5.897513952308474e-06, + "loss": 0.025, + "step": 5139 + }, + { + "epoch": 4.69406392694064, + "grad_norm": 0.2899203598499298, + "learning_rate": 5.8964992389649935e-06, + "loss": 0.0022, + "step": 5140 + }, + { + "epoch": 4.694977168949771, + "grad_norm": 14.02645206451416, + "learning_rate": 5.895484525621512e-06, + "loss": 0.0742, + "step": 5141 + }, + { + "epoch": 4.695890410958905, + "grad_norm": 6.283430099487305, + "learning_rate": 5.894469812278032e-06, + "loss": 0.0281, + "step": 5142 + }, + { + "epoch": 4.696803652968036, + "grad_norm": 14.76905632019043, + "learning_rate": 5.893455098934552e-06, + "loss": 0.094, + "step": 5143 + }, + { + "epoch": 4.697716894977169, + "grad_norm": 0.04171667620539665, + "learning_rate": 5.8924403855910706e-06, + "loss": 0.0004, + "step": 5144 + }, + { + "epoch": 4.698630136986301, + "grad_norm": 24.75746726989746, + "learning_rate": 5.89142567224759e-06, + "loss": 0.2265, + "step": 5145 + }, + { + "epoch": 4.6995433789954335, + "grad_norm": 5.498997688293457, + "learning_rate": 5.89041095890411e-06, + "loss": 0.0352, + "step": 5146 + }, + { + "epoch": 4.700456621004566, + "grad_norm": 53.27203369140625, + "learning_rate": 5.889396245560629e-06, + "loss": 0.5529, + "step": 5147 + }, + { + "epoch": 4.701369863013698, + "grad_norm": 158.26158142089844, + "learning_rate": 5.888381532217149e-06, + "loss": 2.2253, + "step": 5148 + }, + { + "epoch": 4.702283105022831, + "grad_norm": 97.72554779052734, + "learning_rate": 5.887366818873669e-06, + "loss": 1.7485, + "step": 5149 + }, + { + "epoch": 4.703196347031963, + "grad_norm": 40.130767822265625, + "learning_rate": 5.886352105530189e-06, + "loss": 0.483, + "step": 5150 + }, + { + "epoch": 4.704109589041096, + "grad_norm": 1.6377596855163574, + "learning_rate": 5.8853373921867076e-06, + "loss": 0.0111, + "step": 5151 + }, + { + "epoch": 4.705022831050228, + "grad_norm": 1.8984726667404175, + "learning_rate": 5.884322678843227e-06, + "loss": 0.0141, + "step": 5152 + }, + { + "epoch": 4.705936073059361, + "grad_norm": 24.23497200012207, + "learning_rate": 5.883307965499747e-06, + "loss": 0.1559, + "step": 5153 + }, + { + "epoch": 4.706849315068493, + "grad_norm": 79.89871978759766, + "learning_rate": 5.882293252156266e-06, + "loss": 1.7358, + "step": 5154 + }, + { + "epoch": 4.707762557077626, + "grad_norm": 20.011741638183594, + "learning_rate": 5.8812785388127855e-06, + "loss": 0.1352, + "step": 5155 + }, + { + "epoch": 4.708675799086758, + "grad_norm": 52.545143127441406, + "learning_rate": 5.880263825469306e-06, + "loss": 0.4338, + "step": 5156 + }, + { + "epoch": 4.7095890410958905, + "grad_norm": 5.176661968231201, + "learning_rate": 5.879249112125824e-06, + "loss": 0.0323, + "step": 5157 + }, + { + "epoch": 4.710502283105023, + "grad_norm": 0.3147898316383362, + "learning_rate": 5.8782343987823446e-06, + "loss": 0.0019, + "step": 5158 + }, + { + "epoch": 4.711415525114155, + "grad_norm": 28.913021087646484, + "learning_rate": 5.877219685438864e-06, + "loss": 0.2206, + "step": 5159 + }, + { + "epoch": 4.712328767123288, + "grad_norm": 0.2514346241950989, + "learning_rate": 5.876204972095384e-06, + "loss": 0.0022, + "step": 5160 + }, + { + "epoch": 4.71324200913242, + "grad_norm": 0.11428891122341156, + "learning_rate": 5.875190258751903e-06, + "loss": 0.0008, + "step": 5161 + }, + { + "epoch": 4.714155251141553, + "grad_norm": 7.216229438781738, + "learning_rate": 5.8741755454084225e-06, + "loss": 0.0374, + "step": 5162 + }, + { + "epoch": 4.715068493150685, + "grad_norm": 68.50711059570312, + "learning_rate": 5.873160832064942e-06, + "loss": 0.9848, + "step": 5163 + }, + { + "epoch": 4.715981735159818, + "grad_norm": 1.5567997694015503, + "learning_rate": 5.872146118721461e-06, + "loss": 0.0084, + "step": 5164 + }, + { + "epoch": 4.71689497716895, + "grad_norm": 80.5141372680664, + "learning_rate": 5.871131405377981e-06, + "loss": 1.1131, + "step": 5165 + }, + { + "epoch": 4.717808219178083, + "grad_norm": 127.41883850097656, + "learning_rate": 5.870116692034501e-06, + "loss": 2.4799, + "step": 5166 + }, + { + "epoch": 4.718721461187215, + "grad_norm": 49.128780364990234, + "learning_rate": 5.86910197869102e-06, + "loss": 0.4481, + "step": 5167 + }, + { + "epoch": 4.719634703196347, + "grad_norm": 6.469141006469727, + "learning_rate": 5.86808726534754e-06, + "loss": 0.0471, + "step": 5168 + }, + { + "epoch": 4.72054794520548, + "grad_norm": 26.33849334716797, + "learning_rate": 5.8670725520040595e-06, + "loss": 0.1988, + "step": 5169 + }, + { + "epoch": 4.7214611872146115, + "grad_norm": 287.7178955078125, + "learning_rate": 5.866057838660579e-06, + "loss": 1.4022, + "step": 5170 + }, + { + "epoch": 4.722374429223744, + "grad_norm": 17.085615158081055, + "learning_rate": 5.865043125317098e-06, + "loss": 0.1388, + "step": 5171 + }, + { + "epoch": 4.723287671232876, + "grad_norm": 15.12143611907959, + "learning_rate": 5.864028411973618e-06, + "loss": 0.1176, + "step": 5172 + }, + { + "epoch": 4.724200913242009, + "grad_norm": 89.41740417480469, + "learning_rate": 5.863013698630137e-06, + "loss": 0.8718, + "step": 5173 + }, + { + "epoch": 4.725114155251141, + "grad_norm": 5.232202053070068, + "learning_rate": 5.861998985286656e-06, + "loss": 0.0273, + "step": 5174 + }, + { + "epoch": 4.726027397260274, + "grad_norm": 1.7975335121154785, + "learning_rate": 5.860984271943177e-06, + "loss": 0.013, + "step": 5175 + }, + { + "epoch": 4.726940639269406, + "grad_norm": 38.13688278198242, + "learning_rate": 5.8599695585996965e-06, + "loss": 0.3914, + "step": 5176 + }, + { + "epoch": 4.727853881278539, + "grad_norm": 50.04239273071289, + "learning_rate": 5.858954845256215e-06, + "loss": 0.5348, + "step": 5177 + }, + { + "epoch": 4.728767123287671, + "grad_norm": 6.9451212882995605, + "learning_rate": 5.857940131912735e-06, + "loss": 0.0422, + "step": 5178 + }, + { + "epoch": 4.729680365296804, + "grad_norm": 4.085601329803467, + "learning_rate": 5.856925418569255e-06, + "loss": 0.0369, + "step": 5179 + }, + { + "epoch": 4.730593607305936, + "grad_norm": 0.6776978373527527, + "learning_rate": 5.855910705225774e-06, + "loss": 0.0056, + "step": 5180 + }, + { + "epoch": 4.7315068493150685, + "grad_norm": 143.87582397460938, + "learning_rate": 5.854895991882293e-06, + "loss": 0.7062, + "step": 5181 + }, + { + "epoch": 4.732420091324201, + "grad_norm": 22.52387809753418, + "learning_rate": 5.853881278538813e-06, + "loss": 0.1926, + "step": 5182 + }, + { + "epoch": 4.733333333333333, + "grad_norm": 8.231650352478027, + "learning_rate": 5.8528665651953335e-06, + "loss": 0.0427, + "step": 5183 + }, + { + "epoch": 4.734246575342466, + "grad_norm": 3.31626033782959, + "learning_rate": 5.8518518518518515e-06, + "loss": 0.0314, + "step": 5184 + }, + { + "epoch": 4.735159817351598, + "grad_norm": 1.5320290327072144, + "learning_rate": 5.850837138508372e-06, + "loss": 0.0164, + "step": 5185 + }, + { + "epoch": 4.736073059360731, + "grad_norm": 40.28128433227539, + "learning_rate": 5.849822425164892e-06, + "loss": 0.58, + "step": 5186 + }, + { + "epoch": 4.736986301369863, + "grad_norm": 51.9484748840332, + "learning_rate": 5.8488077118214106e-06, + "loss": 0.2991, + "step": 5187 + }, + { + "epoch": 4.737899543378996, + "grad_norm": 31.8656063079834, + "learning_rate": 5.84779299847793e-06, + "loss": 0.3859, + "step": 5188 + }, + { + "epoch": 4.738812785388128, + "grad_norm": 2.1021270751953125, + "learning_rate": 5.84677828513445e-06, + "loss": 0.0156, + "step": 5189 + }, + { + "epoch": 4.739726027397261, + "grad_norm": 74.39668273925781, + "learning_rate": 5.84576357179097e-06, + "loss": 1.9022, + "step": 5190 + }, + { + "epoch": 4.740639269406393, + "grad_norm": 11.76511001586914, + "learning_rate": 5.8447488584474885e-06, + "loss": 0.1213, + "step": 5191 + }, + { + "epoch": 4.7415525114155255, + "grad_norm": 1.5832654237747192, + "learning_rate": 5.843734145104009e-06, + "loss": 0.0082, + "step": 5192 + }, + { + "epoch": 4.742465753424657, + "grad_norm": 7.318046569824219, + "learning_rate": 5.842719431760529e-06, + "loss": 0.0394, + "step": 5193 + }, + { + "epoch": 4.74337899543379, + "grad_norm": 48.16316604614258, + "learning_rate": 5.8417047184170476e-06, + "loss": 0.4455, + "step": 5194 + }, + { + "epoch": 4.744292237442922, + "grad_norm": 72.03402709960938, + "learning_rate": 5.840690005073567e-06, + "loss": 0.5826, + "step": 5195 + }, + { + "epoch": 4.745205479452055, + "grad_norm": 66.45557403564453, + "learning_rate": 5.839675291730087e-06, + "loss": 0.7824, + "step": 5196 + }, + { + "epoch": 4.746118721461187, + "grad_norm": 0.8376733064651489, + "learning_rate": 5.838660578386606e-06, + "loss": 0.003, + "step": 5197 + }, + { + "epoch": 4.747031963470319, + "grad_norm": 2.395630359649658, + "learning_rate": 5.8376458650431255e-06, + "loss": 0.0211, + "step": 5198 + }, + { + "epoch": 4.747945205479452, + "grad_norm": 6.1461052894592285, + "learning_rate": 5.836631151699645e-06, + "loss": 0.0501, + "step": 5199 + }, + { + "epoch": 4.748858447488584, + "grad_norm": 90.06536102294922, + "learning_rate": 5.835616438356166e-06, + "loss": 1.9962, + "step": 5200 + }, + { + "epoch": 4.749771689497717, + "grad_norm": 3.9494903087615967, + "learning_rate": 5.834601725012684e-06, + "loss": 0.0274, + "step": 5201 + }, + { + "epoch": 4.750684931506849, + "grad_norm": 0.15860456228256226, + "learning_rate": 5.833587011669204e-06, + "loss": 0.0014, + "step": 5202 + }, + { + "epoch": 4.751598173515982, + "grad_norm": 26.571760177612305, + "learning_rate": 5.832572298325724e-06, + "loss": 0.2692, + "step": 5203 + }, + { + "epoch": 4.752511415525114, + "grad_norm": 15.482834815979004, + "learning_rate": 5.831557584982243e-06, + "loss": 0.1337, + "step": 5204 + }, + { + "epoch": 4.7534246575342465, + "grad_norm": 14.088515281677246, + "learning_rate": 5.8305428716387625e-06, + "loss": 0.1021, + "step": 5205 + }, + { + "epoch": 4.754337899543379, + "grad_norm": 6.655299663543701, + "learning_rate": 5.829528158295282e-06, + "loss": 0.0571, + "step": 5206 + }, + { + "epoch": 4.755251141552511, + "grad_norm": 1.9738633632659912, + "learning_rate": 5.828513444951801e-06, + "loss": 0.0171, + "step": 5207 + }, + { + "epoch": 4.756164383561644, + "grad_norm": 17.681774139404297, + "learning_rate": 5.827498731608321e-06, + "loss": 0.1921, + "step": 5208 + }, + { + "epoch": 4.757077625570776, + "grad_norm": 4.765157222747803, + "learning_rate": 5.82648401826484e-06, + "loss": 0.0359, + "step": 5209 + }, + { + "epoch": 4.757990867579909, + "grad_norm": 0.619819164276123, + "learning_rate": 5.825469304921361e-06, + "loss": 0.0044, + "step": 5210 + }, + { + "epoch": 4.758904109589041, + "grad_norm": 1.7749236822128296, + "learning_rate": 5.82445459157788e-06, + "loss": 0.0141, + "step": 5211 + }, + { + "epoch": 4.759817351598174, + "grad_norm": 43.80927658081055, + "learning_rate": 5.8234398782343995e-06, + "loss": 0.5882, + "step": 5212 + }, + { + "epoch": 4.760730593607306, + "grad_norm": 109.08397674560547, + "learning_rate": 5.822425164890919e-06, + "loss": 2.1792, + "step": 5213 + }, + { + "epoch": 4.761643835616439, + "grad_norm": 4.786633491516113, + "learning_rate": 5.821410451547438e-06, + "loss": 0.0278, + "step": 5214 + }, + { + "epoch": 4.762557077625571, + "grad_norm": 7.364502906799316, + "learning_rate": 5.820395738203958e-06, + "loss": 0.0651, + "step": 5215 + }, + { + "epoch": 4.7634703196347035, + "grad_norm": 48.75895690917969, + "learning_rate": 5.819381024860477e-06, + "loss": 0.7634, + "step": 5216 + }, + { + "epoch": 4.764383561643836, + "grad_norm": 9.658821105957031, + "learning_rate": 5.818366311516996e-06, + "loss": 0.0616, + "step": 5217 + }, + { + "epoch": 4.765296803652968, + "grad_norm": 4.489524841308594, + "learning_rate": 5.817351598173516e-06, + "loss": 0.0377, + "step": 5218 + }, + { + "epoch": 4.766210045662101, + "grad_norm": 2.655758857727051, + "learning_rate": 5.8163368848300365e-06, + "loss": 0.0147, + "step": 5219 + }, + { + "epoch": 4.767123287671232, + "grad_norm": 0.30978137254714966, + "learning_rate": 5.815322171486556e-06, + "loss": 0.0018, + "step": 5220 + }, + { + "epoch": 4.768036529680366, + "grad_norm": 4.6308369636535645, + "learning_rate": 5.814307458143075e-06, + "loss": 0.0308, + "step": 5221 + }, + { + "epoch": 4.768949771689497, + "grad_norm": 10.975419998168945, + "learning_rate": 5.813292744799595e-06, + "loss": 0.0993, + "step": 5222 + }, + { + "epoch": 4.76986301369863, + "grad_norm": 10.820978164672852, + "learning_rate": 5.812278031456114e-06, + "loss": 0.0646, + "step": 5223 + }, + { + "epoch": 4.770776255707762, + "grad_norm": 0.38965293765068054, + "learning_rate": 5.811263318112633e-06, + "loss": 0.0028, + "step": 5224 + }, + { + "epoch": 4.771689497716895, + "grad_norm": 3.4783434867858887, + "learning_rate": 5.810248604769153e-06, + "loss": 0.0273, + "step": 5225 + }, + { + "epoch": 4.772602739726027, + "grad_norm": 3.9645345211029053, + "learning_rate": 5.809233891425673e-06, + "loss": 0.0335, + "step": 5226 + }, + { + "epoch": 4.77351598173516, + "grad_norm": 2.1741011142730713, + "learning_rate": 5.8082191780821915e-06, + "loss": 0.0199, + "step": 5227 + }, + { + "epoch": 4.774429223744292, + "grad_norm": 18.6918888092041, + "learning_rate": 5.807204464738712e-06, + "loss": 0.1388, + "step": 5228 + }, + { + "epoch": 4.7753424657534245, + "grad_norm": 91.04633331298828, + "learning_rate": 5.806189751395232e-06, + "loss": 2.1632, + "step": 5229 + }, + { + "epoch": 4.776255707762557, + "grad_norm": 2.4377288818359375, + "learning_rate": 5.805175038051751e-06, + "loss": 0.0153, + "step": 5230 + }, + { + "epoch": 4.777168949771689, + "grad_norm": 77.82317352294922, + "learning_rate": 5.80416032470827e-06, + "loss": 1.3056, + "step": 5231 + }, + { + "epoch": 4.778082191780822, + "grad_norm": 1.0181431770324707, + "learning_rate": 5.80314561136479e-06, + "loss": 0.0095, + "step": 5232 + }, + { + "epoch": 4.778995433789954, + "grad_norm": 2.5619287490844727, + "learning_rate": 5.80213089802131e-06, + "loss": 0.0204, + "step": 5233 + }, + { + "epoch": 4.779908675799087, + "grad_norm": 0.04852614179253578, + "learning_rate": 5.8011161846778285e-06, + "loss": 0.0005, + "step": 5234 + }, + { + "epoch": 4.780821917808219, + "grad_norm": 52.51780700683594, + "learning_rate": 5.800101471334348e-06, + "loss": 0.8341, + "step": 5235 + }, + { + "epoch": 4.781735159817352, + "grad_norm": 10.082982063293457, + "learning_rate": 5.799086757990869e-06, + "loss": 0.068, + "step": 5236 + }, + { + "epoch": 4.782648401826484, + "grad_norm": 1.2405025959014893, + "learning_rate": 5.798072044647387e-06, + "loss": 0.0101, + "step": 5237 + }, + { + "epoch": 4.7835616438356166, + "grad_norm": 27.851478576660156, + "learning_rate": 5.797057331303907e-06, + "loss": 0.6078, + "step": 5238 + }, + { + "epoch": 4.784474885844749, + "grad_norm": 22.839086532592773, + "learning_rate": 5.796042617960427e-06, + "loss": 0.132, + "step": 5239 + }, + { + "epoch": 4.7853881278538815, + "grad_norm": 21.184036254882812, + "learning_rate": 5.795027904616947e-06, + "loss": 0.1999, + "step": 5240 + }, + { + "epoch": 4.786301369863014, + "grad_norm": 18.8759708404541, + "learning_rate": 5.7940131912734655e-06, + "loss": 0.1397, + "step": 5241 + }, + { + "epoch": 4.787214611872146, + "grad_norm": 17.105236053466797, + "learning_rate": 5.792998477929985e-06, + "loss": 0.1068, + "step": 5242 + }, + { + "epoch": 4.788127853881279, + "grad_norm": 33.508453369140625, + "learning_rate": 5.791983764586505e-06, + "loss": 0.2376, + "step": 5243 + }, + { + "epoch": 4.789041095890411, + "grad_norm": 7.312587738037109, + "learning_rate": 5.790969051243024e-06, + "loss": 0.0497, + "step": 5244 + }, + { + "epoch": 4.789954337899544, + "grad_norm": 3.3358216285705566, + "learning_rate": 5.789954337899543e-06, + "loss": 0.0291, + "step": 5245 + }, + { + "epoch": 4.790867579908676, + "grad_norm": 38.56223678588867, + "learning_rate": 5.788939624556064e-06, + "loss": 0.3538, + "step": 5246 + }, + { + "epoch": 4.791780821917808, + "grad_norm": 76.6539306640625, + "learning_rate": 5.787924911212583e-06, + "loss": 0.9669, + "step": 5247 + }, + { + "epoch": 4.792694063926941, + "grad_norm": 16.10582160949707, + "learning_rate": 5.7869101978691025e-06, + "loss": 0.1539, + "step": 5248 + }, + { + "epoch": 4.793607305936073, + "grad_norm": 76.02971649169922, + "learning_rate": 5.785895484525622e-06, + "loss": 1.561, + "step": 5249 + }, + { + "epoch": 4.794520547945205, + "grad_norm": 0.941882312297821, + "learning_rate": 5.784880771182142e-06, + "loss": 0.0048, + "step": 5250 + }, + { + "epoch": 4.7954337899543376, + "grad_norm": 32.49429702758789, + "learning_rate": 5.783866057838661e-06, + "loss": 0.2551, + "step": 5251 + }, + { + "epoch": 4.79634703196347, + "grad_norm": 69.07408142089844, + "learning_rate": 5.78285134449518e-06, + "loss": 1.0342, + "step": 5252 + }, + { + "epoch": 4.7972602739726025, + "grad_norm": 3.104396104812622, + "learning_rate": 5.7818366311517e-06, + "loss": 0.0243, + "step": 5253 + }, + { + "epoch": 4.798173515981735, + "grad_norm": 56.91926956176758, + "learning_rate": 5.780821917808219e-06, + "loss": 0.423, + "step": 5254 + }, + { + "epoch": 4.799086757990867, + "grad_norm": 0.5321231484413147, + "learning_rate": 5.7798072044647394e-06, + "loss": 0.004, + "step": 5255 + }, + { + "epoch": 4.8, + "grad_norm": 6.269561290740967, + "learning_rate": 5.778792491121259e-06, + "loss": 0.038, + "step": 5256 + }, + { + "epoch": 4.800913242009132, + "grad_norm": 15.127166748046875, + "learning_rate": 5.777777777777778e-06, + "loss": 0.1102, + "step": 5257 + }, + { + "epoch": 4.801826484018265, + "grad_norm": 17.709749221801758, + "learning_rate": 5.776763064434298e-06, + "loss": 0.1833, + "step": 5258 + }, + { + "epoch": 4.802739726027397, + "grad_norm": 3.8718254566192627, + "learning_rate": 5.775748351090817e-06, + "loss": 0.0301, + "step": 5259 + }, + { + "epoch": 4.80365296803653, + "grad_norm": 10.766695976257324, + "learning_rate": 5.774733637747337e-06, + "loss": 0.0625, + "step": 5260 + }, + { + "epoch": 4.804566210045662, + "grad_norm": 2.2217907905578613, + "learning_rate": 5.773718924403856e-06, + "loss": 0.0181, + "step": 5261 + }, + { + "epoch": 4.8054794520547945, + "grad_norm": 31.0516357421875, + "learning_rate": 5.772704211060376e-06, + "loss": 0.1814, + "step": 5262 + }, + { + "epoch": 4.806392694063927, + "grad_norm": 32.477783203125, + "learning_rate": 5.771689497716896e-06, + "loss": 0.3111, + "step": 5263 + }, + { + "epoch": 4.8073059360730594, + "grad_norm": 26.54859733581543, + "learning_rate": 5.770674784373414e-06, + "loss": 0.2184, + "step": 5264 + }, + { + "epoch": 4.808219178082192, + "grad_norm": 0.19033066928386688, + "learning_rate": 5.769660071029935e-06, + "loss": 0.0016, + "step": 5265 + }, + { + "epoch": 4.809132420091324, + "grad_norm": 5.455990791320801, + "learning_rate": 5.768645357686454e-06, + "loss": 0.027, + "step": 5266 + }, + { + "epoch": 4.810045662100457, + "grad_norm": 57.33460998535156, + "learning_rate": 5.767630644342973e-06, + "loss": 0.9181, + "step": 5267 + }, + { + "epoch": 4.810958904109589, + "grad_norm": 10.590520858764648, + "learning_rate": 5.766615930999493e-06, + "loss": 0.0863, + "step": 5268 + }, + { + "epoch": 4.811872146118722, + "grad_norm": 0.4385227560997009, + "learning_rate": 5.765601217656013e-06, + "loss": 0.0028, + "step": 5269 + }, + { + "epoch": 4.812785388127854, + "grad_norm": 44.67427062988281, + "learning_rate": 5.764586504312532e-06, + "loss": 0.1514, + "step": 5270 + }, + { + "epoch": 4.813698630136987, + "grad_norm": 2.713715076446533, + "learning_rate": 5.763571790969051e-06, + "loss": 0.021, + "step": 5271 + }, + { + "epoch": 4.814611872146119, + "grad_norm": 24.6807861328125, + "learning_rate": 5.762557077625572e-06, + "loss": 0.3543, + "step": 5272 + }, + { + "epoch": 4.8155251141552515, + "grad_norm": 24.148855209350586, + "learning_rate": 5.761542364282091e-06, + "loss": 0.1453, + "step": 5273 + }, + { + "epoch": 4.816438356164383, + "grad_norm": 1.1320971250534058, + "learning_rate": 5.76052765093861e-06, + "loss": 0.0103, + "step": 5274 + }, + { + "epoch": 4.817351598173516, + "grad_norm": 28.481979370117188, + "learning_rate": 5.75951293759513e-06, + "loss": 0.2001, + "step": 5275 + }, + { + "epoch": 4.818264840182648, + "grad_norm": 14.554652214050293, + "learning_rate": 5.75849822425165e-06, + "loss": 0.1142, + "step": 5276 + }, + { + "epoch": 4.8191780821917805, + "grad_norm": 33.91655731201172, + "learning_rate": 5.7574835109081684e-06, + "loss": 0.3864, + "step": 5277 + }, + { + "epoch": 4.820091324200913, + "grad_norm": 11.271408081054688, + "learning_rate": 5.756468797564688e-06, + "loss": 0.0777, + "step": 5278 + }, + { + "epoch": 4.821004566210045, + "grad_norm": 3.9369072914123535, + "learning_rate": 5.755454084221208e-06, + "loss": 0.0317, + "step": 5279 + }, + { + "epoch": 4.821917808219178, + "grad_norm": 28.34369468688965, + "learning_rate": 5.754439370877728e-06, + "loss": 0.3506, + "step": 5280 + }, + { + "epoch": 4.82283105022831, + "grad_norm": 10.357736587524414, + "learning_rate": 5.753424657534246e-06, + "loss": 0.0669, + "step": 5281 + }, + { + "epoch": 4.823744292237443, + "grad_norm": 9.244132995605469, + "learning_rate": 5.752409944190767e-06, + "loss": 0.0684, + "step": 5282 + }, + { + "epoch": 4.824657534246575, + "grad_norm": 8.192710876464844, + "learning_rate": 5.751395230847287e-06, + "loss": 0.0493, + "step": 5283 + }, + { + "epoch": 4.825570776255708, + "grad_norm": 1.8044697046279907, + "learning_rate": 5.7503805175038054e-06, + "loss": 0.0165, + "step": 5284 + }, + { + "epoch": 4.82648401826484, + "grad_norm": 0.3837580680847168, + "learning_rate": 5.749365804160325e-06, + "loss": 0.0035, + "step": 5285 + }, + { + "epoch": 4.8273972602739725, + "grad_norm": 15.97191047668457, + "learning_rate": 5.748351090816845e-06, + "loss": 0.134, + "step": 5286 + }, + { + "epoch": 4.828310502283105, + "grad_norm": 3.3397412300109863, + "learning_rate": 5.747336377473364e-06, + "loss": 0.0287, + "step": 5287 + }, + { + "epoch": 4.829223744292237, + "grad_norm": 11.43582534790039, + "learning_rate": 5.746321664129883e-06, + "loss": 0.0901, + "step": 5288 + }, + { + "epoch": 4.83013698630137, + "grad_norm": 2.221824884414673, + "learning_rate": 5.745306950786403e-06, + "loss": 0.0143, + "step": 5289 + }, + { + "epoch": 4.831050228310502, + "grad_norm": 0.5961466431617737, + "learning_rate": 5.744292237442924e-06, + "loss": 0.0053, + "step": 5290 + }, + { + "epoch": 4.831963470319635, + "grad_norm": 5.012504577636719, + "learning_rate": 5.7432775240994424e-06, + "loss": 0.0509, + "step": 5291 + }, + { + "epoch": 4.832876712328767, + "grad_norm": 71.76164245605469, + "learning_rate": 5.742262810755962e-06, + "loss": 0.8471, + "step": 5292 + }, + { + "epoch": 4.8337899543379, + "grad_norm": 12.326372146606445, + "learning_rate": 5.741248097412482e-06, + "loss": 0.0512, + "step": 5293 + }, + { + "epoch": 4.834703196347032, + "grad_norm": 0.582581102848053, + "learning_rate": 5.740233384069001e-06, + "loss": 0.0053, + "step": 5294 + }, + { + "epoch": 4.835616438356165, + "grad_norm": 1.0030943155288696, + "learning_rate": 5.73921867072552e-06, + "loss": 0.0094, + "step": 5295 + }, + { + "epoch": 4.836529680365297, + "grad_norm": 27.1427001953125, + "learning_rate": 5.73820395738204e-06, + "loss": 0.1928, + "step": 5296 + }, + { + "epoch": 4.8374429223744295, + "grad_norm": 6.94536018371582, + "learning_rate": 5.737189244038559e-06, + "loss": 0.0475, + "step": 5297 + }, + { + "epoch": 4.838356164383562, + "grad_norm": 27.848875045776367, + "learning_rate": 5.736174530695079e-06, + "loss": 0.1763, + "step": 5298 + }, + { + "epoch": 4.839269406392694, + "grad_norm": 1.1138056516647339, + "learning_rate": 5.735159817351599e-06, + "loss": 0.0103, + "step": 5299 + }, + { + "epoch": 4.840182648401827, + "grad_norm": 7.479681968688965, + "learning_rate": 5.734145104008119e-06, + "loss": 0.0364, + "step": 5300 + }, + { + "epoch": 4.8410958904109584, + "grad_norm": 9.695963859558105, + "learning_rate": 5.733130390664638e-06, + "loss": 0.0687, + "step": 5301 + }, + { + "epoch": 4.842009132420092, + "grad_norm": 124.508056640625, + "learning_rate": 5.732115677321157e-06, + "loss": 2.1987, + "step": 5302 + }, + { + "epoch": 4.842922374429223, + "grad_norm": 49.11410140991211, + "learning_rate": 5.731100963977677e-06, + "loss": 0.5421, + "step": 5303 + }, + { + "epoch": 4.843835616438356, + "grad_norm": 28.617456436157227, + "learning_rate": 5.730086250634196e-06, + "loss": 0.1461, + "step": 5304 + }, + { + "epoch": 4.844748858447488, + "grad_norm": 0.02797684073448181, + "learning_rate": 5.729071537290716e-06, + "loss": 0.0002, + "step": 5305 + }, + { + "epoch": 4.845662100456621, + "grad_norm": 0.0703662559390068, + "learning_rate": 5.728056823947235e-06, + "loss": 0.0006, + "step": 5306 + }, + { + "epoch": 4.846575342465753, + "grad_norm": 1.6844549179077148, + "learning_rate": 5.727042110603754e-06, + "loss": 0.0079, + "step": 5307 + }, + { + "epoch": 4.847488584474886, + "grad_norm": 26.524995803833008, + "learning_rate": 5.726027397260274e-06, + "loss": 0.1507, + "step": 5308 + }, + { + "epoch": 4.848401826484018, + "grad_norm": 45.72352981567383, + "learning_rate": 5.725012683916794e-06, + "loss": 0.3111, + "step": 5309 + }, + { + "epoch": 4.8493150684931505, + "grad_norm": 96.11751556396484, + "learning_rate": 5.723997970573314e-06, + "loss": 0.806, + "step": 5310 + }, + { + "epoch": 4.850228310502283, + "grad_norm": 12.1402587890625, + "learning_rate": 5.722983257229833e-06, + "loss": 0.0864, + "step": 5311 + }, + { + "epoch": 4.851141552511415, + "grad_norm": 3.3703396320343018, + "learning_rate": 5.721968543886353e-06, + "loss": 0.0295, + "step": 5312 + }, + { + "epoch": 4.852054794520548, + "grad_norm": 0.3180323541164398, + "learning_rate": 5.720953830542872e-06, + "loss": 0.0029, + "step": 5313 + }, + { + "epoch": 4.85296803652968, + "grad_norm": 91.31227111816406, + "learning_rate": 5.719939117199391e-06, + "loss": 1.0085, + "step": 5314 + }, + { + "epoch": 4.853881278538813, + "grad_norm": 22.57598114013672, + "learning_rate": 5.718924403855911e-06, + "loss": 0.2208, + "step": 5315 + }, + { + "epoch": 4.854794520547945, + "grad_norm": 9.813343048095703, + "learning_rate": 5.717909690512431e-06, + "loss": 0.0643, + "step": 5316 + }, + { + "epoch": 4.855707762557078, + "grad_norm": 1.187015175819397, + "learning_rate": 5.716894977168949e-06, + "loss": 0.0065, + "step": 5317 + }, + { + "epoch": 4.85662100456621, + "grad_norm": 7.690515995025635, + "learning_rate": 5.71588026382547e-06, + "loss": 0.0423, + "step": 5318 + }, + { + "epoch": 4.857534246575343, + "grad_norm": 4.30479097366333, + "learning_rate": 5.71486555048199e-06, + "loss": 0.0299, + "step": 5319 + }, + { + "epoch": 4.858447488584475, + "grad_norm": 1.0442514419555664, + "learning_rate": 5.713850837138509e-06, + "loss": 0.0089, + "step": 5320 + }, + { + "epoch": 4.8593607305936075, + "grad_norm": 7.233133316040039, + "learning_rate": 5.712836123795028e-06, + "loss": 0.0547, + "step": 5321 + }, + { + "epoch": 4.86027397260274, + "grad_norm": 0.2186228632926941, + "learning_rate": 5.711821410451548e-06, + "loss": 0.0011, + "step": 5322 + }, + { + "epoch": 4.861187214611872, + "grad_norm": 0.4476790726184845, + "learning_rate": 5.7108066971080675e-06, + "loss": 0.0026, + "step": 5323 + }, + { + "epoch": 4.862100456621005, + "grad_norm": 0.8884304165840149, + "learning_rate": 5.709791983764586e-06, + "loss": 0.0072, + "step": 5324 + }, + { + "epoch": 4.863013698630137, + "grad_norm": 2.6830108165740967, + "learning_rate": 5.708777270421106e-06, + "loss": 0.0193, + "step": 5325 + }, + { + "epoch": 4.86392694063927, + "grad_norm": 28.368314743041992, + "learning_rate": 5.7077625570776266e-06, + "loss": 0.4887, + "step": 5326 + }, + { + "epoch": 4.864840182648402, + "grad_norm": 0.5129623413085938, + "learning_rate": 5.706747843734145e-06, + "loss": 0.0033, + "step": 5327 + }, + { + "epoch": 4.865753424657534, + "grad_norm": 5.213888645172119, + "learning_rate": 5.705733130390665e-06, + "loss": 0.0336, + "step": 5328 + }, + { + "epoch": 4.866666666666667, + "grad_norm": 10.43199634552002, + "learning_rate": 5.704718417047185e-06, + "loss": 0.1039, + "step": 5329 + }, + { + "epoch": 4.867579908675799, + "grad_norm": 5.464951992034912, + "learning_rate": 5.7037037037037045e-06, + "loss": 0.0353, + "step": 5330 + }, + { + "epoch": 4.868493150684931, + "grad_norm": 2.7363319396972656, + "learning_rate": 5.702688990360223e-06, + "loss": 0.0161, + "step": 5331 + }, + { + "epoch": 4.869406392694064, + "grad_norm": 24.18651008605957, + "learning_rate": 5.701674277016743e-06, + "loss": 0.1512, + "step": 5332 + }, + { + "epoch": 4.870319634703196, + "grad_norm": 50.22572708129883, + "learning_rate": 5.700659563673263e-06, + "loss": 0.5461, + "step": 5333 + }, + { + "epoch": 4.8712328767123285, + "grad_norm": 1.065071702003479, + "learning_rate": 5.6996448503297816e-06, + "loss": 0.0089, + "step": 5334 + }, + { + "epoch": 4.872146118721461, + "grad_norm": 1.8578253984451294, + "learning_rate": 5.698630136986302e-06, + "loss": 0.0131, + "step": 5335 + }, + { + "epoch": 4.873059360730593, + "grad_norm": 0.6478645205497742, + "learning_rate": 5.697615423642822e-06, + "loss": 0.0044, + "step": 5336 + }, + { + "epoch": 4.873972602739726, + "grad_norm": 29.59958839416504, + "learning_rate": 5.696600710299341e-06, + "loss": 0.3042, + "step": 5337 + }, + { + "epoch": 4.874885844748858, + "grad_norm": 9.710140228271484, + "learning_rate": 5.69558599695586e-06, + "loss": 0.0856, + "step": 5338 + }, + { + "epoch": 4.875799086757991, + "grad_norm": 35.6694221496582, + "learning_rate": 5.69457128361238e-06, + "loss": 0.3255, + "step": 5339 + }, + { + "epoch": 4.876712328767123, + "grad_norm": 2.462756633758545, + "learning_rate": 5.6935565702689e-06, + "loss": 0.0178, + "step": 5340 + }, + { + "epoch": 4.877625570776256, + "grad_norm": 0.14041349291801453, + "learning_rate": 5.6925418569254186e-06, + "loss": 0.0011, + "step": 5341 + }, + { + "epoch": 4.878538812785388, + "grad_norm": 1.5930497646331787, + "learning_rate": 5.691527143581938e-06, + "loss": 0.0147, + "step": 5342 + }, + { + "epoch": 4.879452054794521, + "grad_norm": 0.88856440782547, + "learning_rate": 5.690512430238459e-06, + "loss": 0.0074, + "step": 5343 + }, + { + "epoch": 4.880365296803653, + "grad_norm": 7.488541126251221, + "learning_rate": 5.689497716894977e-06, + "loss": 0.0669, + "step": 5344 + }, + { + "epoch": 4.8812785388127855, + "grad_norm": 51.25831985473633, + "learning_rate": 5.688483003551497e-06, + "loss": 0.7559, + "step": 5345 + }, + { + "epoch": 4.882191780821918, + "grad_norm": 130.68211364746094, + "learning_rate": 5.687468290208017e-06, + "loss": 2.8928, + "step": 5346 + }, + { + "epoch": 4.88310502283105, + "grad_norm": 0.4211225211620331, + "learning_rate": 5.686453576864536e-06, + "loss": 0.0037, + "step": 5347 + }, + { + "epoch": 4.884018264840183, + "grad_norm": 85.31111145019531, + "learning_rate": 5.6854388635210556e-06, + "loss": 1.441, + "step": 5348 + }, + { + "epoch": 4.884931506849315, + "grad_norm": 3.655787467956543, + "learning_rate": 5.684424150177575e-06, + "loss": 0.0328, + "step": 5349 + }, + { + "epoch": 4.885844748858448, + "grad_norm": 147.99002075195312, + "learning_rate": 5.683409436834095e-06, + "loss": 1.9429, + "step": 5350 + }, + { + "epoch": 4.88675799086758, + "grad_norm": 2.551248073577881, + "learning_rate": 5.682394723490614e-06, + "loss": 0.0152, + "step": 5351 + }, + { + "epoch": 4.887671232876713, + "grad_norm": 74.34707641601562, + "learning_rate": 5.681380010147134e-06, + "loss": 1.4699, + "step": 5352 + }, + { + "epoch": 4.888584474885845, + "grad_norm": 2.9885969161987305, + "learning_rate": 5.680365296803654e-06, + "loss": 0.0229, + "step": 5353 + }, + { + "epoch": 4.889497716894978, + "grad_norm": 1.631752848625183, + "learning_rate": 5.679350583460173e-06, + "loss": 0.0111, + "step": 5354 + }, + { + "epoch": 4.890410958904109, + "grad_norm": 2.698157548904419, + "learning_rate": 5.6783358701166926e-06, + "loss": 0.0206, + "step": 5355 + }, + { + "epoch": 4.8913242009132425, + "grad_norm": 36.290435791015625, + "learning_rate": 5.677321156773212e-06, + "loss": 0.3269, + "step": 5356 + }, + { + "epoch": 4.892237442922374, + "grad_norm": 11.268736839294434, + "learning_rate": 5.676306443429731e-06, + "loss": 0.0653, + "step": 5357 + }, + { + "epoch": 4.8931506849315065, + "grad_norm": 5.083221435546875, + "learning_rate": 5.675291730086251e-06, + "loss": 0.0215, + "step": 5358 + }, + { + "epoch": 4.894063926940639, + "grad_norm": 18.20076560974121, + "learning_rate": 5.6742770167427705e-06, + "loss": 0.0379, + "step": 5359 + }, + { + "epoch": 4.894977168949771, + "grad_norm": 3.1624338626861572, + "learning_rate": 5.673262303399291e-06, + "loss": 0.0174, + "step": 5360 + }, + { + "epoch": 4.895890410958904, + "grad_norm": 9.785016059875488, + "learning_rate": 5.672247590055809e-06, + "loss": 0.0707, + "step": 5361 + }, + { + "epoch": 4.896803652968036, + "grad_norm": 0.5997470021247864, + "learning_rate": 5.6712328767123296e-06, + "loss": 0.004, + "step": 5362 + }, + { + "epoch": 4.897716894977169, + "grad_norm": 84.10437774658203, + "learning_rate": 5.670218163368849e-06, + "loss": 2.8785, + "step": 5363 + }, + { + "epoch": 4.898630136986301, + "grad_norm": 8.625029563903809, + "learning_rate": 5.669203450025368e-06, + "loss": 0.0481, + "step": 5364 + }, + { + "epoch": 4.899543378995434, + "grad_norm": 26.610191345214844, + "learning_rate": 5.668188736681888e-06, + "loss": 0.085, + "step": 5365 + }, + { + "epoch": 4.900456621004566, + "grad_norm": 3.420710325241089, + "learning_rate": 5.6671740233384075e-06, + "loss": 0.0184, + "step": 5366 + }, + { + "epoch": 4.901369863013699, + "grad_norm": 53.62551498413086, + "learning_rate": 5.666159309994926e-06, + "loss": 0.4646, + "step": 5367 + }, + { + "epoch": 4.902283105022831, + "grad_norm": 85.57492065429688, + "learning_rate": 5.665144596651446e-06, + "loss": 1.6878, + "step": 5368 + }, + { + "epoch": 4.9031963470319635, + "grad_norm": 0.783019483089447, + "learning_rate": 5.664129883307966e-06, + "loss": 0.007, + "step": 5369 + }, + { + "epoch": 4.904109589041096, + "grad_norm": 4.22361421585083, + "learning_rate": 5.663115169964486e-06, + "loss": 0.0232, + "step": 5370 + }, + { + "epoch": 4.905022831050228, + "grad_norm": 0.5683445930480957, + "learning_rate": 5.662100456621005e-06, + "loss": 0.0053, + "step": 5371 + }, + { + "epoch": 4.905936073059361, + "grad_norm": 1.697799801826477, + "learning_rate": 5.661085743277525e-06, + "loss": 0.0128, + "step": 5372 + }, + { + "epoch": 4.906849315068493, + "grad_norm": 3.1953203678131104, + "learning_rate": 5.6600710299340445e-06, + "loss": 0.0235, + "step": 5373 + }, + { + "epoch": 4.907762557077626, + "grad_norm": 3.6580708026885986, + "learning_rate": 5.659056316590563e-06, + "loss": 0.0241, + "step": 5374 + }, + { + "epoch": 4.908675799086758, + "grad_norm": 7.516328811645508, + "learning_rate": 5.658041603247083e-06, + "loss": 0.0381, + "step": 5375 + }, + { + "epoch": 4.909589041095891, + "grad_norm": 11.18379020690918, + "learning_rate": 5.657026889903603e-06, + "loss": 0.0844, + "step": 5376 + }, + { + "epoch": 4.910502283105023, + "grad_norm": 11.668054580688477, + "learning_rate": 5.6560121765601216e-06, + "loss": 0.0892, + "step": 5377 + }, + { + "epoch": 4.911415525114156, + "grad_norm": 7.830521106719971, + "learning_rate": 5.654997463216641e-06, + "loss": 0.082, + "step": 5378 + }, + { + "epoch": 4.912328767123288, + "grad_norm": 31.968181610107422, + "learning_rate": 5.653982749873162e-06, + "loss": 0.4482, + "step": 5379 + }, + { + "epoch": 4.91324200913242, + "grad_norm": 31.58302879333496, + "learning_rate": 5.6529680365296815e-06, + "loss": 0.2601, + "step": 5380 + }, + { + "epoch": 4.914155251141553, + "grad_norm": 71.12445831298828, + "learning_rate": 5.6519533231862e-06, + "loss": 0.9896, + "step": 5381 + }, + { + "epoch": 4.9150684931506845, + "grad_norm": 14.975996017456055, + "learning_rate": 5.65093860984272e-06, + "loss": 0.1222, + "step": 5382 + }, + { + "epoch": 4.915981735159818, + "grad_norm": 5.164962291717529, + "learning_rate": 5.64992389649924e-06, + "loss": 0.0524, + "step": 5383 + }, + { + "epoch": 4.916894977168949, + "grad_norm": 61.349998474121094, + "learning_rate": 5.6489091831557586e-06, + "loss": 0.6748, + "step": 5384 + }, + { + "epoch": 4.917808219178082, + "grad_norm": 2.418428659439087, + "learning_rate": 5.647894469812278e-06, + "loss": 0.0172, + "step": 5385 + }, + { + "epoch": 4.918721461187214, + "grad_norm": 36.69374465942383, + "learning_rate": 5.646879756468798e-06, + "loss": 0.4085, + "step": 5386 + }, + { + "epoch": 4.919634703196347, + "grad_norm": 7.435288906097412, + "learning_rate": 5.645865043125317e-06, + "loss": 0.0795, + "step": 5387 + }, + { + "epoch": 4.920547945205479, + "grad_norm": 0.39005592465400696, + "learning_rate": 5.6448503297818365e-06, + "loss": 0.0031, + "step": 5388 + }, + { + "epoch": 4.921461187214612, + "grad_norm": 12.798779487609863, + "learning_rate": 5.643835616438357e-06, + "loss": 0.1342, + "step": 5389 + }, + { + "epoch": 4.922374429223744, + "grad_norm": 2.549394369125366, + "learning_rate": 5.642820903094877e-06, + "loss": 0.0156, + "step": 5390 + }, + { + "epoch": 4.923287671232877, + "grad_norm": 22.349613189697266, + "learning_rate": 5.6418061897513955e-06, + "loss": 0.2541, + "step": 5391 + }, + { + "epoch": 4.924200913242009, + "grad_norm": 7.546314716339111, + "learning_rate": 5.640791476407915e-06, + "loss": 0.019, + "step": 5392 + }, + { + "epoch": 4.9251141552511415, + "grad_norm": 21.24993133544922, + "learning_rate": 5.639776763064435e-06, + "loss": 0.1275, + "step": 5393 + }, + { + "epoch": 4.926027397260274, + "grad_norm": 10.778606414794922, + "learning_rate": 5.638762049720954e-06, + "loss": 0.0701, + "step": 5394 + }, + { + "epoch": 4.926940639269406, + "grad_norm": 60.94893264770508, + "learning_rate": 5.6377473363774735e-06, + "loss": 0.4081, + "step": 5395 + }, + { + "epoch": 4.927853881278539, + "grad_norm": 103.55965423583984, + "learning_rate": 5.636732623033994e-06, + "loss": 1.5395, + "step": 5396 + }, + { + "epoch": 4.928767123287671, + "grad_norm": 104.67887878417969, + "learning_rate": 5.635717909690512e-06, + "loss": 1.1489, + "step": 5397 + }, + { + "epoch": 4.929680365296804, + "grad_norm": 6.271685600280762, + "learning_rate": 5.6347031963470325e-06, + "loss": 0.0613, + "step": 5398 + }, + { + "epoch": 4.930593607305936, + "grad_norm": 1.0266790390014648, + "learning_rate": 5.633688483003552e-06, + "loss": 0.0104, + "step": 5399 + }, + { + "epoch": 4.931506849315069, + "grad_norm": 50.96086883544922, + "learning_rate": 5.632673769660072e-06, + "loss": 0.4726, + "step": 5400 + }, + { + "epoch": 4.932420091324201, + "grad_norm": 44.27599334716797, + "learning_rate": 5.631659056316591e-06, + "loss": 0.2624, + "step": 5401 + }, + { + "epoch": 4.933333333333334, + "grad_norm": 9.848569869995117, + "learning_rate": 5.6306443429731105e-06, + "loss": 0.1143, + "step": 5402 + }, + { + "epoch": 4.934246575342466, + "grad_norm": 20.91526222229004, + "learning_rate": 5.62962962962963e-06, + "loss": 0.2069, + "step": 5403 + }, + { + "epoch": 4.9351598173515985, + "grad_norm": 50.037296295166016, + "learning_rate": 5.628614916286149e-06, + "loss": 0.5963, + "step": 5404 + }, + { + "epoch": 4.936073059360731, + "grad_norm": 124.39530181884766, + "learning_rate": 5.627600202942669e-06, + "loss": 2.5504, + "step": 5405 + }, + { + "epoch": 4.936986301369863, + "grad_norm": 3.451709508895874, + "learning_rate": 5.626585489599189e-06, + "loss": 0.0207, + "step": 5406 + }, + { + "epoch": 4.937899543378995, + "grad_norm": 26.229421615600586, + "learning_rate": 5.625570776255708e-06, + "loss": 0.1657, + "step": 5407 + }, + { + "epoch": 4.938812785388128, + "grad_norm": 6.099640846252441, + "learning_rate": 5.624556062912228e-06, + "loss": 0.0559, + "step": 5408 + }, + { + "epoch": 4.93972602739726, + "grad_norm": 61.52592468261719, + "learning_rate": 5.6235413495687475e-06, + "loss": 0.8721, + "step": 5409 + }, + { + "epoch": 4.940639269406392, + "grad_norm": 9.765939712524414, + "learning_rate": 5.622526636225267e-06, + "loss": 0.0729, + "step": 5410 + }, + { + "epoch": 4.941552511415525, + "grad_norm": 21.068256378173828, + "learning_rate": 5.621511922881786e-06, + "loss": 0.1806, + "step": 5411 + }, + { + "epoch": 4.942465753424657, + "grad_norm": 12.19482421875, + "learning_rate": 5.620497209538306e-06, + "loss": 0.0945, + "step": 5412 + }, + { + "epoch": 4.94337899543379, + "grad_norm": 5.059861660003662, + "learning_rate": 5.619482496194825e-06, + "loss": 0.0411, + "step": 5413 + }, + { + "epoch": 4.944292237442922, + "grad_norm": 2.3247315883636475, + "learning_rate": 5.618467782851344e-06, + "loss": 0.0188, + "step": 5414 + }, + { + "epoch": 4.945205479452055, + "grad_norm": 8.88408088684082, + "learning_rate": 5.617453069507865e-06, + "loss": 0.0586, + "step": 5415 + }, + { + "epoch": 4.946118721461187, + "grad_norm": 4.3500142097473145, + "learning_rate": 5.6164383561643845e-06, + "loss": 0.0342, + "step": 5416 + }, + { + "epoch": 4.9470319634703195, + "grad_norm": 91.63671112060547, + "learning_rate": 5.615423642820903e-06, + "loss": 2.4962, + "step": 5417 + }, + { + "epoch": 4.947945205479452, + "grad_norm": 24.864452362060547, + "learning_rate": 5.614408929477423e-06, + "loss": 0.1656, + "step": 5418 + }, + { + "epoch": 4.948858447488584, + "grad_norm": 12.17675495147705, + "learning_rate": 5.613394216133943e-06, + "loss": 0.0863, + "step": 5419 + }, + { + "epoch": 4.949771689497717, + "grad_norm": 3.3334057331085205, + "learning_rate": 5.612379502790462e-06, + "loss": 0.0312, + "step": 5420 + }, + { + "epoch": 4.950684931506849, + "grad_norm": 4.684749126434326, + "learning_rate": 5.611364789446981e-06, + "loss": 0.0377, + "step": 5421 + }, + { + "epoch": 4.951598173515982, + "grad_norm": 11.76992130279541, + "learning_rate": 5.610350076103501e-06, + "loss": 0.1129, + "step": 5422 + }, + { + "epoch": 4.952511415525114, + "grad_norm": 7.5817646980285645, + "learning_rate": 5.6093353627600215e-06, + "loss": 0.0593, + "step": 5423 + }, + { + "epoch": 4.953424657534247, + "grad_norm": 1.661727786064148, + "learning_rate": 5.6083206494165395e-06, + "loss": 0.0141, + "step": 5424 + }, + { + "epoch": 4.954337899543379, + "grad_norm": 108.67108154296875, + "learning_rate": 5.60730593607306e-06, + "loss": 1.21, + "step": 5425 + }, + { + "epoch": 4.955251141552512, + "grad_norm": 13.749669075012207, + "learning_rate": 5.60629122272958e-06, + "loss": 0.0822, + "step": 5426 + }, + { + "epoch": 4.956164383561644, + "grad_norm": 6.3979010581970215, + "learning_rate": 5.6052765093860985e-06, + "loss": 0.0329, + "step": 5427 + }, + { + "epoch": 4.9570776255707765, + "grad_norm": 37.297454833984375, + "learning_rate": 5.604261796042618e-06, + "loss": 0.306, + "step": 5428 + }, + { + "epoch": 4.957990867579909, + "grad_norm": 68.28331756591797, + "learning_rate": 5.603247082699138e-06, + "loss": 1.1365, + "step": 5429 + }, + { + "epoch": 4.958904109589041, + "grad_norm": 2.9231226444244385, + "learning_rate": 5.602232369355658e-06, + "loss": 0.0203, + "step": 5430 + }, + { + "epoch": 4.959817351598174, + "grad_norm": 48.73593521118164, + "learning_rate": 5.6012176560121765e-06, + "loss": 0.289, + "step": 5431 + }, + { + "epoch": 4.960730593607306, + "grad_norm": 0.177085280418396, + "learning_rate": 5.600202942668697e-06, + "loss": 0.0015, + "step": 5432 + }, + { + "epoch": 4.961643835616439, + "grad_norm": 4.986574172973633, + "learning_rate": 5.599188229325217e-06, + "loss": 0.0457, + "step": 5433 + }, + { + "epoch": 4.96255707762557, + "grad_norm": 0.6518568992614746, + "learning_rate": 5.5981735159817355e-06, + "loss": 0.0045, + "step": 5434 + }, + { + "epoch": 4.963470319634704, + "grad_norm": 1.0296127796173096, + "learning_rate": 5.597158802638255e-06, + "loss": 0.0057, + "step": 5435 + }, + { + "epoch": 4.964383561643835, + "grad_norm": 4.952261924743652, + "learning_rate": 5.596144089294775e-06, + "loss": 0.0252, + "step": 5436 + }, + { + "epoch": 4.965296803652968, + "grad_norm": 1.6288061141967773, + "learning_rate": 5.595129375951294e-06, + "loss": 0.0124, + "step": 5437 + }, + { + "epoch": 4.9662100456621, + "grad_norm": 1.2803065776824951, + "learning_rate": 5.5941146626078135e-06, + "loss": 0.0124, + "step": 5438 + }, + { + "epoch": 4.967123287671233, + "grad_norm": 1.8752292394638062, + "learning_rate": 5.593099949264333e-06, + "loss": 0.0169, + "step": 5439 + }, + { + "epoch": 4.968036529680365, + "grad_norm": 0.9362497925758362, + "learning_rate": 5.592085235920854e-06, + "loss": 0.0065, + "step": 5440 + }, + { + "epoch": 4.9689497716894975, + "grad_norm": 37.19001007080078, + "learning_rate": 5.591070522577372e-06, + "loss": 0.4425, + "step": 5441 + }, + { + "epoch": 4.96986301369863, + "grad_norm": 4.153104782104492, + "learning_rate": 5.590055809233892e-06, + "loss": 0.0328, + "step": 5442 + }, + { + "epoch": 4.970776255707762, + "grad_norm": 143.3481903076172, + "learning_rate": 5.589041095890412e-06, + "loss": 0.9737, + "step": 5443 + }, + { + "epoch": 4.971689497716895, + "grad_norm": 40.49345779418945, + "learning_rate": 5.588026382546931e-06, + "loss": 0.5149, + "step": 5444 + }, + { + "epoch": 4.972602739726027, + "grad_norm": 28.70357322692871, + "learning_rate": 5.5870116692034504e-06, + "loss": 0.1976, + "step": 5445 + }, + { + "epoch": 4.97351598173516, + "grad_norm": 4.927487373352051, + "learning_rate": 5.58599695585997e-06, + "loss": 0.0309, + "step": 5446 + }, + { + "epoch": 4.974429223744292, + "grad_norm": 5.714946746826172, + "learning_rate": 5.584982242516489e-06, + "loss": 0.0482, + "step": 5447 + }, + { + "epoch": 4.975342465753425, + "grad_norm": 19.203062057495117, + "learning_rate": 5.583967529173009e-06, + "loss": 0.1614, + "step": 5448 + }, + { + "epoch": 4.976255707762557, + "grad_norm": 50.45720672607422, + "learning_rate": 5.582952815829528e-06, + "loss": 0.4514, + "step": 5449 + }, + { + "epoch": 4.9771689497716896, + "grad_norm": 12.216984748840332, + "learning_rate": 5.581938102486049e-06, + "loss": 0.0899, + "step": 5450 + }, + { + "epoch": 4.978082191780822, + "grad_norm": 1.7798480987548828, + "learning_rate": 5.580923389142568e-06, + "loss": 0.0123, + "step": 5451 + }, + { + "epoch": 4.9789954337899545, + "grad_norm": 12.675537109375, + "learning_rate": 5.5799086757990874e-06, + "loss": 0.0891, + "step": 5452 + }, + { + "epoch": 4.979908675799087, + "grad_norm": 2.535756826400757, + "learning_rate": 5.578893962455607e-06, + "loss": 0.017, + "step": 5453 + }, + { + "epoch": 4.980821917808219, + "grad_norm": 11.52964973449707, + "learning_rate": 5.577879249112126e-06, + "loss": 0.0811, + "step": 5454 + }, + { + "epoch": 4.981735159817352, + "grad_norm": 2.999673843383789, + "learning_rate": 5.576864535768646e-06, + "loss": 0.0185, + "step": 5455 + }, + { + "epoch": 4.982648401826484, + "grad_norm": 4.465473175048828, + "learning_rate": 5.575849822425165e-06, + "loss": 0.0334, + "step": 5456 + }, + { + "epoch": 4.983561643835617, + "grad_norm": 12.262029647827148, + "learning_rate": 5.574835109081684e-06, + "loss": 0.0605, + "step": 5457 + }, + { + "epoch": 4.984474885844749, + "grad_norm": 25.077138900756836, + "learning_rate": 5.573820395738204e-06, + "loss": 0.1961, + "step": 5458 + }, + { + "epoch": 4.985388127853882, + "grad_norm": 3.8026390075683594, + "learning_rate": 5.5728056823947244e-06, + "loss": 0.0397, + "step": 5459 + }, + { + "epoch": 4.986301369863014, + "grad_norm": 0.15716397762298584, + "learning_rate": 5.571790969051244e-06, + "loss": 0.0008, + "step": 5460 + }, + { + "epoch": 4.987214611872146, + "grad_norm": 0.27678534388542175, + "learning_rate": 5.570776255707763e-06, + "loss": 0.0024, + "step": 5461 + }, + { + "epoch": 4.988127853881279, + "grad_norm": 12.608105659484863, + "learning_rate": 5.569761542364283e-06, + "loss": 0.1024, + "step": 5462 + }, + { + "epoch": 4.989041095890411, + "grad_norm": 97.48377990722656, + "learning_rate": 5.568746829020802e-06, + "loss": 3.7675, + "step": 5463 + }, + { + "epoch": 4.989954337899543, + "grad_norm": 33.53026580810547, + "learning_rate": 5.567732115677321e-06, + "loss": 0.286, + "step": 5464 + }, + { + "epoch": 4.9908675799086755, + "grad_norm": 74.35877990722656, + "learning_rate": 5.566717402333841e-06, + "loss": 0.9866, + "step": 5465 + }, + { + "epoch": 4.991780821917808, + "grad_norm": 0.21251937747001648, + "learning_rate": 5.565702688990361e-06, + "loss": 0.0012, + "step": 5466 + }, + { + "epoch": 4.99269406392694, + "grad_norm": 73.17351531982422, + "learning_rate": 5.5646879756468794e-06, + "loss": 1.3448, + "step": 5467 + }, + { + "epoch": 4.993607305936073, + "grad_norm": 3.498723030090332, + "learning_rate": 5.563673262303399e-06, + "loss": 0.0324, + "step": 5468 + }, + { + "epoch": 4.994520547945205, + "grad_norm": 3.205713987350464, + "learning_rate": 5.56265854895992e-06, + "loss": 0.0253, + "step": 5469 + }, + { + "epoch": 4.995433789954338, + "grad_norm": 25.58266258239746, + "learning_rate": 5.561643835616439e-06, + "loss": 0.2357, + "step": 5470 + }, + { + "epoch": 4.99634703196347, + "grad_norm": 2.378260612487793, + "learning_rate": 5.560629122272958e-06, + "loss": 0.0175, + "step": 5471 + }, + { + "epoch": 4.997260273972603, + "grad_norm": 3.5927276611328125, + "learning_rate": 5.559614408929478e-06, + "loss": 0.0262, + "step": 5472 + }, + { + "epoch": 4.998173515981735, + "grad_norm": 0.06232835352420807, + "learning_rate": 5.558599695585998e-06, + "loss": 0.0004, + "step": 5473 + }, + { + "epoch": 4.9990867579908675, + "grad_norm": 56.20656204223633, + "learning_rate": 5.5575849822425164e-06, + "loss": 0.7292, + "step": 5474 + }, + { + "epoch": 5.0, + "grad_norm": 3.9541282653808594, + "learning_rate": 5.556570268899036e-06, + "loss": 0.0354, + "step": 5475 + }, + { + "epoch": 5.0009132420091325, + "grad_norm": 3.0716605186462402, + "learning_rate": 5.555555555555557e-06, + "loss": 0.0285, + "step": 5476 + }, + { + "epoch": 5.001826484018265, + "grad_norm": 1.4863992929458618, + "learning_rate": 5.554540842212075e-06, + "loss": 0.0122, + "step": 5477 + }, + { + "epoch": 5.002739726027397, + "grad_norm": 2.154258966445923, + "learning_rate": 5.553526128868595e-06, + "loss": 0.0187, + "step": 5478 + }, + { + "epoch": 5.00365296803653, + "grad_norm": 0.5417791604995728, + "learning_rate": 5.552511415525115e-06, + "loss": 0.0042, + "step": 5479 + }, + { + "epoch": 5.004566210045662, + "grad_norm": 32.77434539794922, + "learning_rate": 5.551496702181635e-06, + "loss": 0.4014, + "step": 5480 + }, + { + "epoch": 5.005479452054795, + "grad_norm": 4.997618675231934, + "learning_rate": 5.5504819888381534e-06, + "loss": 0.0535, + "step": 5481 + }, + { + "epoch": 5.006392694063927, + "grad_norm": 2.902785062789917, + "learning_rate": 5.549467275494673e-06, + "loss": 0.0217, + "step": 5482 + }, + { + "epoch": 5.00730593607306, + "grad_norm": 44.45767593383789, + "learning_rate": 5.548452562151193e-06, + "loss": 0.4441, + "step": 5483 + }, + { + "epoch": 5.008219178082192, + "grad_norm": 2.6400811672210693, + "learning_rate": 5.547437848807712e-06, + "loss": 0.0167, + "step": 5484 + }, + { + "epoch": 5.0091324200913245, + "grad_norm": 1.3582452535629272, + "learning_rate": 5.546423135464231e-06, + "loss": 0.0065, + "step": 5485 + }, + { + "epoch": 5.010045662100457, + "grad_norm": 17.82791519165039, + "learning_rate": 5.545408422120752e-06, + "loss": 0.1414, + "step": 5486 + }, + { + "epoch": 5.010958904109589, + "grad_norm": 14.363409042358398, + "learning_rate": 5.544393708777271e-06, + "loss": 0.1265, + "step": 5487 + }, + { + "epoch": 5.011872146118722, + "grad_norm": 4.058268070220947, + "learning_rate": 5.5433789954337904e-06, + "loss": 0.0297, + "step": 5488 + }, + { + "epoch": 5.0127853881278535, + "grad_norm": 59.6994743347168, + "learning_rate": 5.54236428209031e-06, + "loss": 0.6673, + "step": 5489 + }, + { + "epoch": 5.013698630136986, + "grad_norm": 15.00732135772705, + "learning_rate": 5.54134956874683e-06, + "loss": 0.138, + "step": 5490 + }, + { + "epoch": 5.014611872146118, + "grad_norm": 67.5901107788086, + "learning_rate": 5.540334855403349e-06, + "loss": 1.21, + "step": 5491 + }, + { + "epoch": 5.015525114155251, + "grad_norm": 3.734387159347534, + "learning_rate": 5.539320142059868e-06, + "loss": 0.0346, + "step": 5492 + }, + { + "epoch": 5.016438356164383, + "grad_norm": 0.23423056304454803, + "learning_rate": 5.538305428716388e-06, + "loss": 0.001, + "step": 5493 + }, + { + "epoch": 5.017351598173516, + "grad_norm": 9.547114372253418, + "learning_rate": 5.537290715372907e-06, + "loss": 0.0859, + "step": 5494 + }, + { + "epoch": 5.018264840182648, + "grad_norm": 26.380769729614258, + "learning_rate": 5.5362760020294274e-06, + "loss": 0.2024, + "step": 5495 + }, + { + "epoch": 5.019178082191781, + "grad_norm": 45.11640548706055, + "learning_rate": 5.535261288685947e-06, + "loss": 0.4593, + "step": 5496 + }, + { + "epoch": 5.020091324200913, + "grad_norm": 3.3863792419433594, + "learning_rate": 5.534246575342466e-06, + "loss": 0.0299, + "step": 5497 + }, + { + "epoch": 5.0210045662100455, + "grad_norm": 2.0795376300811768, + "learning_rate": 5.533231861998986e-06, + "loss": 0.0126, + "step": 5498 + }, + { + "epoch": 5.021917808219178, + "grad_norm": 31.906482696533203, + "learning_rate": 5.532217148655505e-06, + "loss": 0.3098, + "step": 5499 + }, + { + "epoch": 5.0228310502283104, + "grad_norm": 18.45003890991211, + "learning_rate": 5.531202435312025e-06, + "loss": 0.142, + "step": 5500 + }, + { + "epoch": 5.023744292237443, + "grad_norm": 12.45458698272705, + "learning_rate": 5.530187721968544e-06, + "loss": 0.117, + "step": 5501 + }, + { + "epoch": 5.024657534246575, + "grad_norm": 49.94281768798828, + "learning_rate": 5.529173008625064e-06, + "loss": 0.634, + "step": 5502 + }, + { + "epoch": 5.025570776255708, + "grad_norm": 2.420783519744873, + "learning_rate": 5.528158295281584e-06, + "loss": 0.0253, + "step": 5503 + }, + { + "epoch": 5.02648401826484, + "grad_norm": 70.59256744384766, + "learning_rate": 5.527143581938102e-06, + "loss": 0.8753, + "step": 5504 + }, + { + "epoch": 5.027397260273973, + "grad_norm": 2.9229750633239746, + "learning_rate": 5.526128868594623e-06, + "loss": 0.0202, + "step": 5505 + }, + { + "epoch": 5.028310502283105, + "grad_norm": 18.799135208129883, + "learning_rate": 5.525114155251142e-06, + "loss": 0.238, + "step": 5506 + }, + { + "epoch": 5.029223744292238, + "grad_norm": 55.88899230957031, + "learning_rate": 5.524099441907661e-06, + "loss": 0.5486, + "step": 5507 + }, + { + "epoch": 5.03013698630137, + "grad_norm": 9.927566528320312, + "learning_rate": 5.523084728564181e-06, + "loss": 0.086, + "step": 5508 + }, + { + "epoch": 5.0310502283105025, + "grad_norm": 15.334877014160156, + "learning_rate": 5.522070015220701e-06, + "loss": 0.1135, + "step": 5509 + }, + { + "epoch": 5.031963470319635, + "grad_norm": 2.7352306842803955, + "learning_rate": 5.52105530187722e-06, + "loss": 0.011, + "step": 5510 + }, + { + "epoch": 5.032876712328767, + "grad_norm": 30.791961669921875, + "learning_rate": 5.520040588533739e-06, + "loss": 0.143, + "step": 5511 + }, + { + "epoch": 5.0337899543379, + "grad_norm": 17.138856887817383, + "learning_rate": 5.519025875190259e-06, + "loss": 0.1143, + "step": 5512 + }, + { + "epoch": 5.034703196347032, + "grad_norm": 0.4037929177284241, + "learning_rate": 5.518011161846779e-06, + "loss": 0.0027, + "step": 5513 + }, + { + "epoch": 5.035616438356165, + "grad_norm": 5.256969928741455, + "learning_rate": 5.516996448503298e-06, + "loss": 0.0476, + "step": 5514 + }, + { + "epoch": 5.036529680365296, + "grad_norm": 5.554501056671143, + "learning_rate": 5.515981735159818e-06, + "loss": 0.0392, + "step": 5515 + }, + { + "epoch": 5.037442922374429, + "grad_norm": 66.57290649414062, + "learning_rate": 5.5149670218163376e-06, + "loss": 2.523, + "step": 5516 + }, + { + "epoch": 5.038356164383561, + "grad_norm": 30.466529846191406, + "learning_rate": 5.513952308472856e-06, + "loss": 0.3612, + "step": 5517 + }, + { + "epoch": 5.039269406392694, + "grad_norm": 1.2896054983139038, + "learning_rate": 5.512937595129376e-06, + "loss": 0.0086, + "step": 5518 + }, + { + "epoch": 5.040182648401826, + "grad_norm": 13.627866744995117, + "learning_rate": 5.511922881785896e-06, + "loss": 0.1287, + "step": 5519 + }, + { + "epoch": 5.041095890410959, + "grad_norm": 0.6470932364463806, + "learning_rate": 5.510908168442416e-06, + "loss": 0.0049, + "step": 5520 + }, + { + "epoch": 5.042009132420091, + "grad_norm": 3.770510196685791, + "learning_rate": 5.509893455098934e-06, + "loss": 0.0251, + "step": 5521 + }, + { + "epoch": 5.0429223744292235, + "grad_norm": 27.983863830566406, + "learning_rate": 5.508878741755455e-06, + "loss": 0.1811, + "step": 5522 + }, + { + "epoch": 5.043835616438356, + "grad_norm": 34.58634567260742, + "learning_rate": 5.5078640284119746e-06, + "loss": 0.1842, + "step": 5523 + }, + { + "epoch": 5.044748858447488, + "grad_norm": 3.5140292644500732, + "learning_rate": 5.506849315068493e-06, + "loss": 0.028, + "step": 5524 + }, + { + "epoch": 5.045662100456621, + "grad_norm": 2.154120683670044, + "learning_rate": 5.505834601725013e-06, + "loss": 0.0137, + "step": 5525 + }, + { + "epoch": 5.046575342465753, + "grad_norm": 17.964698791503906, + "learning_rate": 5.504819888381533e-06, + "loss": 0.1221, + "step": 5526 + }, + { + "epoch": 5.047488584474886, + "grad_norm": 92.69017028808594, + "learning_rate": 5.503805175038052e-06, + "loss": 2.5484, + "step": 5527 + }, + { + "epoch": 5.048401826484018, + "grad_norm": 0.15678593516349792, + "learning_rate": 5.502790461694571e-06, + "loss": 0.0014, + "step": 5528 + }, + { + "epoch": 5.049315068493151, + "grad_norm": 12.410589218139648, + "learning_rate": 5.501775748351091e-06, + "loss": 0.108, + "step": 5529 + }, + { + "epoch": 5.050228310502283, + "grad_norm": 5.756204128265381, + "learning_rate": 5.5007610350076116e-06, + "loss": 0.0347, + "step": 5530 + }, + { + "epoch": 5.051141552511416, + "grad_norm": 0.8051710724830627, + "learning_rate": 5.49974632166413e-06, + "loss": 0.0083, + "step": 5531 + }, + { + "epoch": 5.052054794520548, + "grad_norm": 0.013923225924372673, + "learning_rate": 5.49873160832065e-06, + "loss": 0.0001, + "step": 5532 + }, + { + "epoch": 5.0529680365296805, + "grad_norm": 15.664546012878418, + "learning_rate": 5.49771689497717e-06, + "loss": 0.1194, + "step": 5533 + }, + { + "epoch": 5.053881278538813, + "grad_norm": 77.58715057373047, + "learning_rate": 5.496702181633689e-06, + "loss": 1.1347, + "step": 5534 + }, + { + "epoch": 5.054794520547945, + "grad_norm": 0.9137105941772461, + "learning_rate": 5.495687468290208e-06, + "loss": 0.0071, + "step": 5535 + }, + { + "epoch": 5.055707762557078, + "grad_norm": 11.410512924194336, + "learning_rate": 5.494672754946728e-06, + "loss": 0.0832, + "step": 5536 + }, + { + "epoch": 5.05662100456621, + "grad_norm": 5.243501663208008, + "learning_rate": 5.493658041603247e-06, + "loss": 0.0332, + "step": 5537 + }, + { + "epoch": 5.057534246575343, + "grad_norm": 3.190835952758789, + "learning_rate": 5.4926433282597666e-06, + "loss": 0.0257, + "step": 5538 + }, + { + "epoch": 5.058447488584475, + "grad_norm": 11.417220115661621, + "learning_rate": 5.491628614916287e-06, + "loss": 0.086, + "step": 5539 + }, + { + "epoch": 5.059360730593608, + "grad_norm": 61.00743865966797, + "learning_rate": 5.490613901572807e-06, + "loss": 0.8283, + "step": 5540 + }, + { + "epoch": 5.06027397260274, + "grad_norm": 1.589325189590454, + "learning_rate": 5.489599188229326e-06, + "loss": 0.0122, + "step": 5541 + }, + { + "epoch": 5.061187214611872, + "grad_norm": 3.414792060852051, + "learning_rate": 5.488584474885845e-06, + "loss": 0.0251, + "step": 5542 + }, + { + "epoch": 5.062100456621004, + "grad_norm": 39.900970458984375, + "learning_rate": 5.487569761542365e-06, + "loss": 0.3296, + "step": 5543 + }, + { + "epoch": 5.063013698630137, + "grad_norm": 4.924422740936279, + "learning_rate": 5.486555048198884e-06, + "loss": 0.0385, + "step": 5544 + }, + { + "epoch": 5.063926940639269, + "grad_norm": 40.725399017333984, + "learning_rate": 5.4855403348554036e-06, + "loss": 0.341, + "step": 5545 + }, + { + "epoch": 5.0648401826484015, + "grad_norm": 8.892622947692871, + "learning_rate": 5.484525621511923e-06, + "loss": 0.0655, + "step": 5546 + }, + { + "epoch": 5.065753424657534, + "grad_norm": 2.553846597671509, + "learning_rate": 5.483510908168442e-06, + "loss": 0.0195, + "step": 5547 + }, + { + "epoch": 5.066666666666666, + "grad_norm": 5.341615676879883, + "learning_rate": 5.482496194824962e-06, + "loss": 0.0367, + "step": 5548 + }, + { + "epoch": 5.067579908675799, + "grad_norm": 8.430630683898926, + "learning_rate": 5.481481481481482e-06, + "loss": 0.0523, + "step": 5549 + }, + { + "epoch": 5.068493150684931, + "grad_norm": 8.790374755859375, + "learning_rate": 5.480466768138002e-06, + "loss": 0.0875, + "step": 5550 + }, + { + "epoch": 5.069406392694064, + "grad_norm": 1.4126853942871094, + "learning_rate": 5.479452054794521e-06, + "loss": 0.0142, + "step": 5551 + }, + { + "epoch": 5.070319634703196, + "grad_norm": 336.2002868652344, + "learning_rate": 5.4784373414510406e-06, + "loss": 0.1703, + "step": 5552 + }, + { + "epoch": 5.071232876712329, + "grad_norm": 4.616523742675781, + "learning_rate": 5.47742262810756e-06, + "loss": 0.0466, + "step": 5553 + }, + { + "epoch": 5.072146118721461, + "grad_norm": 19.057071685791016, + "learning_rate": 5.476407914764079e-06, + "loss": 0.1705, + "step": 5554 + }, + { + "epoch": 5.073059360730594, + "grad_norm": 7.1777024269104, + "learning_rate": 5.475393201420599e-06, + "loss": 0.0696, + "step": 5555 + }, + { + "epoch": 5.073972602739726, + "grad_norm": 1.545674443244934, + "learning_rate": 5.474378488077119e-06, + "loss": 0.0073, + "step": 5556 + }, + { + "epoch": 5.0748858447488585, + "grad_norm": 6.362856864929199, + "learning_rate": 5.473363774733637e-06, + "loss": 0.0367, + "step": 5557 + }, + { + "epoch": 5.075799086757991, + "grad_norm": 14.299063682556152, + "learning_rate": 5.472349061390158e-06, + "loss": 0.1289, + "step": 5558 + }, + { + "epoch": 5.076712328767123, + "grad_norm": 44.85059356689453, + "learning_rate": 5.4713343480466776e-06, + "loss": 0.5026, + "step": 5559 + }, + { + "epoch": 5.077625570776256, + "grad_norm": 1.7024039030075073, + "learning_rate": 5.470319634703197e-06, + "loss": 0.0139, + "step": 5560 + }, + { + "epoch": 5.078538812785388, + "grad_norm": 0.5580922961235046, + "learning_rate": 5.469304921359716e-06, + "loss": 0.0049, + "step": 5561 + }, + { + "epoch": 5.079452054794521, + "grad_norm": 0.7811982035636902, + "learning_rate": 5.468290208016236e-06, + "loss": 0.0049, + "step": 5562 + }, + { + "epoch": 5.080365296803653, + "grad_norm": 27.776525497436523, + "learning_rate": 5.4672754946727555e-06, + "loss": 0.113, + "step": 5563 + }, + { + "epoch": 5.081278538812786, + "grad_norm": 30.890214920043945, + "learning_rate": 5.466260781329274e-06, + "loss": 0.2676, + "step": 5564 + }, + { + "epoch": 5.082191780821918, + "grad_norm": 3.984637498855591, + "learning_rate": 5.465246067985794e-06, + "loss": 0.0268, + "step": 5565 + }, + { + "epoch": 5.083105022831051, + "grad_norm": 5.541572570800781, + "learning_rate": 5.4642313546423146e-06, + "loss": 0.0536, + "step": 5566 + }, + { + "epoch": 5.084018264840183, + "grad_norm": 48.933956146240234, + "learning_rate": 5.463216641298833e-06, + "loss": 0.5418, + "step": 5567 + }, + { + "epoch": 5.0849315068493155, + "grad_norm": 1.013611912727356, + "learning_rate": 5.462201927955353e-06, + "loss": 0.0091, + "step": 5568 + }, + { + "epoch": 5.085844748858447, + "grad_norm": 0.3980436623096466, + "learning_rate": 5.461187214611873e-06, + "loss": 0.0034, + "step": 5569 + }, + { + "epoch": 5.0867579908675795, + "grad_norm": 6.185565948486328, + "learning_rate": 5.4601725012683925e-06, + "loss": 0.0286, + "step": 5570 + }, + { + "epoch": 5.087671232876712, + "grad_norm": 0.2228877991437912, + "learning_rate": 5.459157787924911e-06, + "loss": 0.0019, + "step": 5571 + }, + { + "epoch": 5.088584474885844, + "grad_norm": 2.6306018829345703, + "learning_rate": 5.458143074581431e-06, + "loss": 0.0149, + "step": 5572 + }, + { + "epoch": 5.089497716894977, + "grad_norm": 0.13380931317806244, + "learning_rate": 5.457128361237951e-06, + "loss": 0.0011, + "step": 5573 + }, + { + "epoch": 5.090410958904109, + "grad_norm": 1.7035398483276367, + "learning_rate": 5.4561136478944696e-06, + "loss": 0.0137, + "step": 5574 + }, + { + "epoch": 5.091324200913242, + "grad_norm": 16.554624557495117, + "learning_rate": 5.45509893455099e-06, + "loss": 0.1619, + "step": 5575 + }, + { + "epoch": 5.092237442922374, + "grad_norm": 0.40900948643684387, + "learning_rate": 5.45408422120751e-06, + "loss": 0.0032, + "step": 5576 + }, + { + "epoch": 5.093150684931507, + "grad_norm": 1.2194170951843262, + "learning_rate": 5.453069507864029e-06, + "loss": 0.0101, + "step": 5577 + }, + { + "epoch": 5.094063926940639, + "grad_norm": 1.1192764043807983, + "learning_rate": 5.452054794520548e-06, + "loss": 0.0074, + "step": 5578 + }, + { + "epoch": 5.094977168949772, + "grad_norm": 2.6537771224975586, + "learning_rate": 5.451040081177068e-06, + "loss": 0.0249, + "step": 5579 + }, + { + "epoch": 5.095890410958904, + "grad_norm": 25.77701759338379, + "learning_rate": 5.450025367833588e-06, + "loss": 0.3099, + "step": 5580 + }, + { + "epoch": 5.0968036529680365, + "grad_norm": 5.725909233093262, + "learning_rate": 5.4490106544901065e-06, + "loss": 0.0387, + "step": 5581 + }, + { + "epoch": 5.097716894977169, + "grad_norm": 0.1906290501356125, + "learning_rate": 5.447995941146626e-06, + "loss": 0.0014, + "step": 5582 + }, + { + "epoch": 5.098630136986301, + "grad_norm": 4.931909561157227, + "learning_rate": 5.446981227803147e-06, + "loss": 0.0367, + "step": 5583 + }, + { + "epoch": 5.099543378995434, + "grad_norm": 19.15144920349121, + "learning_rate": 5.445966514459665e-06, + "loss": 0.1266, + "step": 5584 + }, + { + "epoch": 5.100456621004566, + "grad_norm": 21.37601089477539, + "learning_rate": 5.444951801116185e-06, + "loss": 0.1511, + "step": 5585 + }, + { + "epoch": 5.101369863013699, + "grad_norm": 61.19696044921875, + "learning_rate": 5.443937087772705e-06, + "loss": 0.6432, + "step": 5586 + }, + { + "epoch": 5.102283105022831, + "grad_norm": 24.37775993347168, + "learning_rate": 5.442922374429224e-06, + "loss": 0.2179, + "step": 5587 + }, + { + "epoch": 5.103196347031964, + "grad_norm": 69.94710540771484, + "learning_rate": 5.4419076610857435e-06, + "loss": 0.8774, + "step": 5588 + }, + { + "epoch": 5.104109589041096, + "grad_norm": 4.642393589019775, + "learning_rate": 5.440892947742263e-06, + "loss": 0.0374, + "step": 5589 + }, + { + "epoch": 5.105022831050229, + "grad_norm": 53.66767883300781, + "learning_rate": 5.439878234398783e-06, + "loss": 0.4261, + "step": 5590 + }, + { + "epoch": 5.105936073059361, + "grad_norm": 7.431753158569336, + "learning_rate": 5.438863521055302e-06, + "loss": 0.0569, + "step": 5591 + }, + { + "epoch": 5.1068493150684935, + "grad_norm": 6.659173965454102, + "learning_rate": 5.4378488077118215e-06, + "loss": 0.046, + "step": 5592 + }, + { + "epoch": 5.107762557077626, + "grad_norm": 0.4189554750919342, + "learning_rate": 5.436834094368342e-06, + "loss": 0.0032, + "step": 5593 + }, + { + "epoch": 5.108675799086758, + "grad_norm": 3.6807899475097656, + "learning_rate": 5.435819381024861e-06, + "loss": 0.0347, + "step": 5594 + }, + { + "epoch": 5.109589041095891, + "grad_norm": 21.75615692138672, + "learning_rate": 5.4348046676813805e-06, + "loss": 0.1911, + "step": 5595 + }, + { + "epoch": 5.110502283105022, + "grad_norm": 8.65053939819336, + "learning_rate": 5.4337899543379e-06, + "loss": 0.0775, + "step": 5596 + }, + { + "epoch": 5.111415525114155, + "grad_norm": 2.3656489849090576, + "learning_rate": 5.432775240994419e-06, + "loss": 0.0191, + "step": 5597 + }, + { + "epoch": 5.112328767123287, + "grad_norm": 16.86859703063965, + "learning_rate": 5.431760527650939e-06, + "loss": 0.111, + "step": 5598 + }, + { + "epoch": 5.11324200913242, + "grad_norm": 13.272735595703125, + "learning_rate": 5.4307458143074585e-06, + "loss": 0.1093, + "step": 5599 + }, + { + "epoch": 5.114155251141552, + "grad_norm": 8.020249366760254, + "learning_rate": 5.429731100963979e-06, + "loss": 0.0616, + "step": 5600 + }, + { + "epoch": 5.115068493150685, + "grad_norm": 16.79702377319336, + "learning_rate": 5.428716387620497e-06, + "loss": 0.092, + "step": 5601 + }, + { + "epoch": 5.115981735159817, + "grad_norm": 63.46223449707031, + "learning_rate": 5.4277016742770175e-06, + "loss": 1.5169, + "step": 5602 + }, + { + "epoch": 5.11689497716895, + "grad_norm": 102.7284164428711, + "learning_rate": 5.426686960933537e-06, + "loss": 1.4318, + "step": 5603 + }, + { + "epoch": 5.117808219178082, + "grad_norm": 49.03968048095703, + "learning_rate": 5.425672247590056e-06, + "loss": 0.387, + "step": 5604 + }, + { + "epoch": 5.1187214611872145, + "grad_norm": 1.0019359588623047, + "learning_rate": 5.424657534246576e-06, + "loss": 0.0068, + "step": 5605 + }, + { + "epoch": 5.119634703196347, + "grad_norm": 0.9884616732597351, + "learning_rate": 5.4236428209030955e-06, + "loss": 0.0063, + "step": 5606 + }, + { + "epoch": 5.120547945205479, + "grad_norm": 108.99983978271484, + "learning_rate": 5.422628107559614e-06, + "loss": 2.017, + "step": 5607 + }, + { + "epoch": 5.121461187214612, + "grad_norm": 12.848695755004883, + "learning_rate": 5.421613394216134e-06, + "loss": 0.1103, + "step": 5608 + }, + { + "epoch": 5.122374429223744, + "grad_norm": 0.734258770942688, + "learning_rate": 5.420598680872654e-06, + "loss": 0.0054, + "step": 5609 + }, + { + "epoch": 5.123287671232877, + "grad_norm": 2.829303741455078, + "learning_rate": 5.419583967529174e-06, + "loss": 0.0188, + "step": 5610 + }, + { + "epoch": 5.124200913242009, + "grad_norm": 1.1826918125152588, + "learning_rate": 5.418569254185693e-06, + "loss": 0.0077, + "step": 5611 + }, + { + "epoch": 5.125114155251142, + "grad_norm": 0.6618295311927795, + "learning_rate": 5.417554540842213e-06, + "loss": 0.0055, + "step": 5612 + }, + { + "epoch": 5.126027397260274, + "grad_norm": 1.9378912448883057, + "learning_rate": 5.4165398274987325e-06, + "loss": 0.01, + "step": 5613 + }, + { + "epoch": 5.126940639269407, + "grad_norm": 8.435102462768555, + "learning_rate": 5.415525114155251e-06, + "loss": 0.0711, + "step": 5614 + }, + { + "epoch": 5.127853881278539, + "grad_norm": 4.317385196685791, + "learning_rate": 5.414510400811771e-06, + "loss": 0.0274, + "step": 5615 + }, + { + "epoch": 5.1287671232876715, + "grad_norm": 4.56247615814209, + "learning_rate": 5.413495687468291e-06, + "loss": 0.0301, + "step": 5616 + }, + { + "epoch": 5.129680365296804, + "grad_norm": 0.10094714909791946, + "learning_rate": 5.4124809741248095e-06, + "loss": 0.0008, + "step": 5617 + }, + { + "epoch": 5.130593607305936, + "grad_norm": 67.62963104248047, + "learning_rate": 5.411466260781329e-06, + "loss": 1.0863, + "step": 5618 + }, + { + "epoch": 5.131506849315069, + "grad_norm": 1.5809181928634644, + "learning_rate": 5.41045154743785e-06, + "loss": 0.0099, + "step": 5619 + }, + { + "epoch": 5.132420091324201, + "grad_norm": 0.7775527238845825, + "learning_rate": 5.4094368340943695e-06, + "loss": 0.0032, + "step": 5620 + }, + { + "epoch": 5.133333333333334, + "grad_norm": 44.71820068359375, + "learning_rate": 5.408422120750888e-06, + "loss": 0.5125, + "step": 5621 + }, + { + "epoch": 5.134246575342466, + "grad_norm": 0.9821208119392395, + "learning_rate": 5.407407407407408e-06, + "loss": 0.0074, + "step": 5622 + }, + { + "epoch": 5.135159817351598, + "grad_norm": 35.739253997802734, + "learning_rate": 5.406392694063928e-06, + "loss": 0.2853, + "step": 5623 + }, + { + "epoch": 5.13607305936073, + "grad_norm": 9.0708646774292, + "learning_rate": 5.4053779807204465e-06, + "loss": 0.0611, + "step": 5624 + }, + { + "epoch": 5.136986301369863, + "grad_norm": 2.8072659969329834, + "learning_rate": 5.404363267376966e-06, + "loss": 0.0233, + "step": 5625 + }, + { + "epoch": 5.137899543378995, + "grad_norm": 4.412671089172363, + "learning_rate": 5.403348554033486e-06, + "loss": 0.0357, + "step": 5626 + }, + { + "epoch": 5.138812785388128, + "grad_norm": 1.8676000833511353, + "learning_rate": 5.402333840690005e-06, + "loss": 0.016, + "step": 5627 + }, + { + "epoch": 5.13972602739726, + "grad_norm": 12.978068351745605, + "learning_rate": 5.4013191273465245e-06, + "loss": 0.1573, + "step": 5628 + }, + { + "epoch": 5.1406392694063925, + "grad_norm": 12.839154243469238, + "learning_rate": 5.400304414003045e-06, + "loss": 0.1068, + "step": 5629 + }, + { + "epoch": 5.141552511415525, + "grad_norm": 2.147395372390747, + "learning_rate": 5.399289700659565e-06, + "loss": 0.0119, + "step": 5630 + }, + { + "epoch": 5.142465753424657, + "grad_norm": 0.07015986740589142, + "learning_rate": 5.3982749873160835e-06, + "loss": 0.0004, + "step": 5631 + }, + { + "epoch": 5.14337899543379, + "grad_norm": 2.035130262374878, + "learning_rate": 5.397260273972603e-06, + "loss": 0.0139, + "step": 5632 + }, + { + "epoch": 5.144292237442922, + "grad_norm": 0.5153517127037048, + "learning_rate": 5.396245560629123e-06, + "loss": 0.0039, + "step": 5633 + }, + { + "epoch": 5.145205479452055, + "grad_norm": 1.0954291820526123, + "learning_rate": 5.395230847285642e-06, + "loss": 0.0064, + "step": 5634 + }, + { + "epoch": 5.146118721461187, + "grad_norm": 60.76120376586914, + "learning_rate": 5.3942161339421614e-06, + "loss": 0.5535, + "step": 5635 + }, + { + "epoch": 5.14703196347032, + "grad_norm": 96.51669311523438, + "learning_rate": 5.393201420598682e-06, + "loss": 3.7023, + "step": 5636 + }, + { + "epoch": 5.147945205479452, + "grad_norm": 9.93305492401123, + "learning_rate": 5.3921867072552e-06, + "loss": 0.0574, + "step": 5637 + }, + { + "epoch": 5.148858447488585, + "grad_norm": 1.3260794878005981, + "learning_rate": 5.3911719939117205e-06, + "loss": 0.011, + "step": 5638 + }, + { + "epoch": 5.149771689497717, + "grad_norm": 0.10041358321905136, + "learning_rate": 5.39015728056824e-06, + "loss": 0.0007, + "step": 5639 + }, + { + "epoch": 5.1506849315068495, + "grad_norm": 50.38501739501953, + "learning_rate": 5.38914256722476e-06, + "loss": 0.3851, + "step": 5640 + }, + { + "epoch": 5.151598173515982, + "grad_norm": 0.33308088779449463, + "learning_rate": 5.388127853881279e-06, + "loss": 0.0022, + "step": 5641 + }, + { + "epoch": 5.152511415525114, + "grad_norm": 31.173315048217773, + "learning_rate": 5.3871131405377984e-06, + "loss": 0.3224, + "step": 5642 + }, + { + "epoch": 5.153424657534247, + "grad_norm": 14.323227882385254, + "learning_rate": 5.386098427194318e-06, + "loss": 0.0889, + "step": 5643 + }, + { + "epoch": 5.154337899543379, + "grad_norm": 0.3503572344779968, + "learning_rate": 5.385083713850837e-06, + "loss": 0.0021, + "step": 5644 + }, + { + "epoch": 5.155251141552512, + "grad_norm": 79.88006591796875, + "learning_rate": 5.384069000507357e-06, + "loss": 1.3715, + "step": 5645 + }, + { + "epoch": 5.156164383561644, + "grad_norm": 10.313355445861816, + "learning_rate": 5.383054287163877e-06, + "loss": 0.0566, + "step": 5646 + }, + { + "epoch": 5.157077625570777, + "grad_norm": 2.6192407608032227, + "learning_rate": 5.382039573820396e-06, + "loss": 0.0164, + "step": 5647 + }, + { + "epoch": 5.157990867579909, + "grad_norm": 1.6411659717559814, + "learning_rate": 5.381024860476916e-06, + "loss": 0.0158, + "step": 5648 + }, + { + "epoch": 5.1589041095890416, + "grad_norm": 3.068612575531006, + "learning_rate": 5.3800101471334354e-06, + "loss": 0.024, + "step": 5649 + }, + { + "epoch": 5.159817351598173, + "grad_norm": 81.98351287841797, + "learning_rate": 5.378995433789955e-06, + "loss": 1.1195, + "step": 5650 + }, + { + "epoch": 5.160730593607306, + "grad_norm": 37.122596740722656, + "learning_rate": 5.377980720446474e-06, + "loss": 0.5014, + "step": 5651 + }, + { + "epoch": 5.161643835616438, + "grad_norm": 88.95178985595703, + "learning_rate": 5.376966007102994e-06, + "loss": 0.1429, + "step": 5652 + }, + { + "epoch": 5.1625570776255705, + "grad_norm": 7.604737281799316, + "learning_rate": 5.375951293759513e-06, + "loss": 0.0652, + "step": 5653 + }, + { + "epoch": 5.163470319634703, + "grad_norm": 4.807616233825684, + "learning_rate": 5.374936580416032e-06, + "loss": 0.0359, + "step": 5654 + }, + { + "epoch": 5.164383561643835, + "grad_norm": 12.75497817993164, + "learning_rate": 5.373921867072553e-06, + "loss": 0.1109, + "step": 5655 + }, + { + "epoch": 5.165296803652968, + "grad_norm": 7.822116374969482, + "learning_rate": 5.3729071537290724e-06, + "loss": 0.0531, + "step": 5656 + }, + { + "epoch": 5.1662100456621, + "grad_norm": 32.559234619140625, + "learning_rate": 5.371892440385591e-06, + "loss": 0.3293, + "step": 5657 + }, + { + "epoch": 5.167123287671233, + "grad_norm": 65.49676513671875, + "learning_rate": 5.370877727042111e-06, + "loss": 0.6278, + "step": 5658 + }, + { + "epoch": 5.168036529680365, + "grad_norm": 22.29096031188965, + "learning_rate": 5.369863013698631e-06, + "loss": 0.1291, + "step": 5659 + }, + { + "epoch": 5.168949771689498, + "grad_norm": 7.4407057762146, + "learning_rate": 5.36884830035515e-06, + "loss": 0.0641, + "step": 5660 + }, + { + "epoch": 5.16986301369863, + "grad_norm": 7.707156181335449, + "learning_rate": 5.367833587011669e-06, + "loss": 0.0593, + "step": 5661 + }, + { + "epoch": 5.170776255707763, + "grad_norm": 37.213191986083984, + "learning_rate": 5.366818873668189e-06, + "loss": 0.3423, + "step": 5662 + }, + { + "epoch": 5.171689497716895, + "grad_norm": 0.5031901597976685, + "learning_rate": 5.3658041603247094e-06, + "loss": 0.0029, + "step": 5663 + }, + { + "epoch": 5.1726027397260275, + "grad_norm": 35.63014602661133, + "learning_rate": 5.3647894469812274e-06, + "loss": 0.4802, + "step": 5664 + }, + { + "epoch": 5.17351598173516, + "grad_norm": 38.80474090576172, + "learning_rate": 5.363774733637748e-06, + "loss": 0.4962, + "step": 5665 + }, + { + "epoch": 5.174429223744292, + "grad_norm": 9.639791488647461, + "learning_rate": 5.362760020294268e-06, + "loss": 0.0686, + "step": 5666 + }, + { + "epoch": 5.175342465753425, + "grad_norm": 6.218652725219727, + "learning_rate": 5.3617453069507865e-06, + "loss": 0.0445, + "step": 5667 + }, + { + "epoch": 5.176255707762557, + "grad_norm": 11.717172622680664, + "learning_rate": 5.360730593607306e-06, + "loss": 0.0935, + "step": 5668 + }, + { + "epoch": 5.17716894977169, + "grad_norm": 3.486917734146118, + "learning_rate": 5.359715880263826e-06, + "loss": 0.0179, + "step": 5669 + }, + { + "epoch": 5.178082191780822, + "grad_norm": 8.12496566772461, + "learning_rate": 5.358701166920346e-06, + "loss": 0.0093, + "step": 5670 + }, + { + "epoch": 5.178995433789955, + "grad_norm": 2.6033403873443604, + "learning_rate": 5.3576864535768644e-06, + "loss": 0.0195, + "step": 5671 + }, + { + "epoch": 5.179908675799087, + "grad_norm": 0.10777969658374786, + "learning_rate": 5.356671740233384e-06, + "loss": 0.0007, + "step": 5672 + }, + { + "epoch": 5.1808219178082195, + "grad_norm": 20.062881469726562, + "learning_rate": 5.355657026889905e-06, + "loss": 0.1327, + "step": 5673 + }, + { + "epoch": 5.181735159817352, + "grad_norm": 94.41646575927734, + "learning_rate": 5.3546423135464235e-06, + "loss": 2.7549, + "step": 5674 + }, + { + "epoch": 5.182648401826484, + "grad_norm": 19.39822769165039, + "learning_rate": 5.353627600202943e-06, + "loss": 0.1127, + "step": 5675 + }, + { + "epoch": 5.183561643835616, + "grad_norm": 1.6554512977600098, + "learning_rate": 5.352612886859463e-06, + "loss": 0.0097, + "step": 5676 + }, + { + "epoch": 5.1844748858447485, + "grad_norm": 6.728578090667725, + "learning_rate": 5.351598173515982e-06, + "loss": 0.0467, + "step": 5677 + }, + { + "epoch": 5.185388127853881, + "grad_norm": 16.01427459716797, + "learning_rate": 5.3505834601725014e-06, + "loss": 0.172, + "step": 5678 + }, + { + "epoch": 5.186301369863013, + "grad_norm": 1.183227300643921, + "learning_rate": 5.349568746829021e-06, + "loss": 0.0065, + "step": 5679 + }, + { + "epoch": 5.187214611872146, + "grad_norm": 1.6124701499938965, + "learning_rate": 5.348554033485542e-06, + "loss": 0.0098, + "step": 5680 + }, + { + "epoch": 5.188127853881278, + "grad_norm": 21.273361206054688, + "learning_rate": 5.34753932014206e-06, + "loss": 0.1397, + "step": 5681 + }, + { + "epoch": 5.189041095890411, + "grad_norm": 13.460317611694336, + "learning_rate": 5.34652460679858e-06, + "loss": 0.1296, + "step": 5682 + }, + { + "epoch": 5.189954337899543, + "grad_norm": 6.503454208374023, + "learning_rate": 5.3455098934551e-06, + "loss": 0.0625, + "step": 5683 + }, + { + "epoch": 5.190867579908676, + "grad_norm": 1.449252963066101, + "learning_rate": 5.344495180111619e-06, + "loss": 0.0107, + "step": 5684 + }, + { + "epoch": 5.191780821917808, + "grad_norm": 0.38035526871681213, + "learning_rate": 5.3434804667681384e-06, + "loss": 0.003, + "step": 5685 + }, + { + "epoch": 5.1926940639269406, + "grad_norm": 4.739443302154541, + "learning_rate": 5.342465753424658e-06, + "loss": 0.0345, + "step": 5686 + }, + { + "epoch": 5.193607305936073, + "grad_norm": 20.518043518066406, + "learning_rate": 5.341451040081177e-06, + "loss": 0.1909, + "step": 5687 + }, + { + "epoch": 5.1945205479452055, + "grad_norm": 6.527881622314453, + "learning_rate": 5.340436326737697e-06, + "loss": 0.0472, + "step": 5688 + }, + { + "epoch": 5.195433789954338, + "grad_norm": 12.901997566223145, + "learning_rate": 5.339421613394216e-06, + "loss": 0.1152, + "step": 5689 + }, + { + "epoch": 5.19634703196347, + "grad_norm": 1.0581281185150146, + "learning_rate": 5.338406900050737e-06, + "loss": 0.0067, + "step": 5690 + }, + { + "epoch": 5.197260273972603, + "grad_norm": 84.4952392578125, + "learning_rate": 5.337392186707256e-06, + "loss": 0.2451, + "step": 5691 + }, + { + "epoch": 5.198173515981735, + "grad_norm": 126.66506958007812, + "learning_rate": 5.3363774733637754e-06, + "loss": 0.2928, + "step": 5692 + }, + { + "epoch": 5.199086757990868, + "grad_norm": 2.8747518062591553, + "learning_rate": 5.335362760020295e-06, + "loss": 0.026, + "step": 5693 + }, + { + "epoch": 5.2, + "grad_norm": 2.6790239810943604, + "learning_rate": 5.334348046676814e-06, + "loss": 0.0177, + "step": 5694 + }, + { + "epoch": 5.200913242009133, + "grad_norm": 12.087896347045898, + "learning_rate": 5.333333333333334e-06, + "loss": 0.0921, + "step": 5695 + }, + { + "epoch": 5.201826484018265, + "grad_norm": 26.658843994140625, + "learning_rate": 5.332318619989853e-06, + "loss": 0.1572, + "step": 5696 + }, + { + "epoch": 5.2027397260273975, + "grad_norm": 6.316125392913818, + "learning_rate": 5.331303906646372e-06, + "loss": 0.0516, + "step": 5697 + }, + { + "epoch": 5.20365296803653, + "grad_norm": 0.10177106410264969, + "learning_rate": 5.330289193302892e-06, + "loss": 0.001, + "step": 5698 + }, + { + "epoch": 5.2045662100456624, + "grad_norm": 6.824997901916504, + "learning_rate": 5.3292744799594124e-06, + "loss": 0.0573, + "step": 5699 + }, + { + "epoch": 5.205479452054795, + "grad_norm": 1.2739794254302979, + "learning_rate": 5.328259766615932e-06, + "loss": 0.0123, + "step": 5700 + }, + { + "epoch": 5.206392694063927, + "grad_norm": 3.0960710048675537, + "learning_rate": 5.327245053272451e-06, + "loss": 0.0291, + "step": 5701 + }, + { + "epoch": 5.207305936073059, + "grad_norm": 0.9253113865852356, + "learning_rate": 5.326230339928971e-06, + "loss": 0.0078, + "step": 5702 + }, + { + "epoch": 5.208219178082191, + "grad_norm": 4.638556957244873, + "learning_rate": 5.32521562658549e-06, + "loss": 0.0372, + "step": 5703 + }, + { + "epoch": 5.209132420091324, + "grad_norm": 1.4326188564300537, + "learning_rate": 5.324200913242009e-06, + "loss": 0.0114, + "step": 5704 + }, + { + "epoch": 5.210045662100456, + "grad_norm": 37.40229415893555, + "learning_rate": 5.323186199898529e-06, + "loss": 0.2773, + "step": 5705 + }, + { + "epoch": 5.210958904109589, + "grad_norm": 0.2993702292442322, + "learning_rate": 5.3221714865550486e-06, + "loss": 0.0021, + "step": 5706 + }, + { + "epoch": 5.211872146118721, + "grad_norm": 71.32977294921875, + "learning_rate": 5.321156773211567e-06, + "loss": 0.6597, + "step": 5707 + }, + { + "epoch": 5.212785388127854, + "grad_norm": 13.633957862854004, + "learning_rate": 5.320142059868087e-06, + "loss": 0.1009, + "step": 5708 + }, + { + "epoch": 5.213698630136986, + "grad_norm": 25.140182495117188, + "learning_rate": 5.319127346524608e-06, + "loss": 0.1322, + "step": 5709 + }, + { + "epoch": 5.2146118721461185, + "grad_norm": 4.207064151763916, + "learning_rate": 5.318112633181127e-06, + "loss": 0.012, + "step": 5710 + }, + { + "epoch": 5.215525114155251, + "grad_norm": 8.006495475769043, + "learning_rate": 5.317097919837646e-06, + "loss": 0.0242, + "step": 5711 + }, + { + "epoch": 5.2164383561643834, + "grad_norm": 12.661420822143555, + "learning_rate": 5.316083206494166e-06, + "loss": 0.1018, + "step": 5712 + }, + { + "epoch": 5.217351598173516, + "grad_norm": 5.147617816925049, + "learning_rate": 5.3150684931506856e-06, + "loss": 0.0427, + "step": 5713 + }, + { + "epoch": 5.218264840182648, + "grad_norm": 9.645956993103027, + "learning_rate": 5.314053779807204e-06, + "loss": 0.0647, + "step": 5714 + }, + { + "epoch": 5.219178082191781, + "grad_norm": 0.4453723728656769, + "learning_rate": 5.313039066463724e-06, + "loss": 0.0035, + "step": 5715 + }, + { + "epoch": 5.220091324200913, + "grad_norm": 25.275102615356445, + "learning_rate": 5.312024353120245e-06, + "loss": 0.2943, + "step": 5716 + }, + { + "epoch": 5.221004566210046, + "grad_norm": 16.98212432861328, + "learning_rate": 5.311009639776763e-06, + "loss": 0.113, + "step": 5717 + }, + { + "epoch": 5.221917808219178, + "grad_norm": 27.174148559570312, + "learning_rate": 5.309994926433283e-06, + "loss": 0.1905, + "step": 5718 + }, + { + "epoch": 5.222831050228311, + "grad_norm": 0.5688508152961731, + "learning_rate": 5.308980213089803e-06, + "loss": 0.0042, + "step": 5719 + }, + { + "epoch": 5.223744292237443, + "grad_norm": 77.77110290527344, + "learning_rate": 5.3079654997463226e-06, + "loss": 0.5117, + "step": 5720 + }, + { + "epoch": 5.2246575342465755, + "grad_norm": 0.5959751009941101, + "learning_rate": 5.306950786402841e-06, + "loss": 0.0038, + "step": 5721 + }, + { + "epoch": 5.225570776255708, + "grad_norm": 8.720010757446289, + "learning_rate": 5.305936073059361e-06, + "loss": 0.0721, + "step": 5722 + }, + { + "epoch": 5.22648401826484, + "grad_norm": 20.92337989807129, + "learning_rate": 5.304921359715881e-06, + "loss": 0.1877, + "step": 5723 + }, + { + "epoch": 5.227397260273973, + "grad_norm": 6.095714569091797, + "learning_rate": 5.3039066463724e-06, + "loss": 0.0388, + "step": 5724 + }, + { + "epoch": 5.228310502283105, + "grad_norm": 0.5654784440994263, + "learning_rate": 5.302891933028919e-06, + "loss": 0.0045, + "step": 5725 + }, + { + "epoch": 5.229223744292238, + "grad_norm": 0.1083144024014473, + "learning_rate": 5.30187721968544e-06, + "loss": 0.0007, + "step": 5726 + }, + { + "epoch": 5.23013698630137, + "grad_norm": 8.405561447143555, + "learning_rate": 5.300862506341959e-06, + "loss": 0.0511, + "step": 5727 + }, + { + "epoch": 5.231050228310503, + "grad_norm": 0.6538875699043274, + "learning_rate": 5.299847792998478e-06, + "loss": 0.0055, + "step": 5728 + }, + { + "epoch": 5.231963470319634, + "grad_norm": 0.8391871452331543, + "learning_rate": 5.298833079654998e-06, + "loss": 0.0052, + "step": 5729 + }, + { + "epoch": 5.232876712328767, + "grad_norm": 13.64705753326416, + "learning_rate": 5.297818366311518e-06, + "loss": 0.0914, + "step": 5730 + }, + { + "epoch": 5.233789954337899, + "grad_norm": 0.30064666271209717, + "learning_rate": 5.296803652968037e-06, + "loss": 0.0019, + "step": 5731 + }, + { + "epoch": 5.234703196347032, + "grad_norm": 3.9629695415496826, + "learning_rate": 5.295788939624556e-06, + "loss": 0.0279, + "step": 5732 + }, + { + "epoch": 5.235616438356164, + "grad_norm": 1.974515676498413, + "learning_rate": 5.294774226281076e-06, + "loss": 0.0127, + "step": 5733 + }, + { + "epoch": 5.2365296803652965, + "grad_norm": 4.02461576461792, + "learning_rate": 5.293759512937595e-06, + "loss": 0.0338, + "step": 5734 + }, + { + "epoch": 5.237442922374429, + "grad_norm": 7.003183364868164, + "learning_rate": 5.292744799594115e-06, + "loss": 0.0496, + "step": 5735 + }, + { + "epoch": 5.238356164383561, + "grad_norm": 5.77480936050415, + "learning_rate": 5.291730086250635e-06, + "loss": 0.0487, + "step": 5736 + }, + { + "epoch": 5.239269406392694, + "grad_norm": 16.002870559692383, + "learning_rate": 5.290715372907154e-06, + "loss": 0.1244, + "step": 5737 + }, + { + "epoch": 5.240182648401826, + "grad_norm": 5.176574230194092, + "learning_rate": 5.289700659563674e-06, + "loss": 0.0172, + "step": 5738 + }, + { + "epoch": 5.241095890410959, + "grad_norm": 0.5806558728218079, + "learning_rate": 5.288685946220193e-06, + "loss": 0.0054, + "step": 5739 + }, + { + "epoch": 5.242009132420091, + "grad_norm": 2.0616893768310547, + "learning_rate": 5.287671232876713e-06, + "loss": 0.0168, + "step": 5740 + }, + { + "epoch": 5.242922374429224, + "grad_norm": 3.715217351913452, + "learning_rate": 5.286656519533232e-06, + "loss": 0.0414, + "step": 5741 + }, + { + "epoch": 5.243835616438356, + "grad_norm": 39.16128921508789, + "learning_rate": 5.2856418061897516e-06, + "loss": 0.236, + "step": 5742 + }, + { + "epoch": 5.244748858447489, + "grad_norm": 60.087589263916016, + "learning_rate": 5.284627092846272e-06, + "loss": 0.3385, + "step": 5743 + }, + { + "epoch": 5.245662100456621, + "grad_norm": 0.10939506441354752, + "learning_rate": 5.28361237950279e-06, + "loss": 0.0008, + "step": 5744 + }, + { + "epoch": 5.2465753424657535, + "grad_norm": 2.56221079826355, + "learning_rate": 5.282597666159311e-06, + "loss": 0.0194, + "step": 5745 + }, + { + "epoch": 5.247488584474886, + "grad_norm": 6.264577388763428, + "learning_rate": 5.28158295281583e-06, + "loss": 0.0484, + "step": 5746 + }, + { + "epoch": 5.248401826484018, + "grad_norm": 1.5934033393859863, + "learning_rate": 5.280568239472349e-06, + "loss": 0.0118, + "step": 5747 + }, + { + "epoch": 5.249315068493151, + "grad_norm": 62.93758773803711, + "learning_rate": 5.279553526128869e-06, + "loss": 0.5831, + "step": 5748 + }, + { + "epoch": 5.250228310502283, + "grad_norm": 0.07964770495891571, + "learning_rate": 5.2785388127853886e-06, + "loss": 0.0004, + "step": 5749 + }, + { + "epoch": 5.251141552511416, + "grad_norm": 35.04550552368164, + "learning_rate": 5.277524099441908e-06, + "loss": 0.281, + "step": 5750 + }, + { + "epoch": 5.252054794520548, + "grad_norm": 3.7256388664245605, + "learning_rate": 5.276509386098427e-06, + "loss": 0.0284, + "step": 5751 + }, + { + "epoch": 5.252968036529681, + "grad_norm": 12.715283393859863, + "learning_rate": 5.275494672754947e-06, + "loss": 0.1072, + "step": 5752 + }, + { + "epoch": 5.253881278538813, + "grad_norm": 32.27413558959961, + "learning_rate": 5.274479959411467e-06, + "loss": 0.1583, + "step": 5753 + }, + { + "epoch": 5.254794520547946, + "grad_norm": 5.408252716064453, + "learning_rate": 5.273465246067986e-06, + "loss": 0.0561, + "step": 5754 + }, + { + "epoch": 5.255707762557078, + "grad_norm": 83.35315704345703, + "learning_rate": 5.272450532724506e-06, + "loss": 1.1856, + "step": 5755 + }, + { + "epoch": 5.25662100456621, + "grad_norm": 0.5645703077316284, + "learning_rate": 5.2714358193810256e-06, + "loss": 0.0041, + "step": 5756 + }, + { + "epoch": 5.257534246575342, + "grad_norm": 5.87605094909668, + "learning_rate": 5.270421106037544e-06, + "loss": 0.0491, + "step": 5757 + }, + { + "epoch": 5.2584474885844745, + "grad_norm": 10.102302551269531, + "learning_rate": 5.269406392694064e-06, + "loss": 0.055, + "step": 5758 + }, + { + "epoch": 5.259360730593607, + "grad_norm": 26.613529205322266, + "learning_rate": 5.268391679350584e-06, + "loss": 0.1909, + "step": 5759 + }, + { + "epoch": 5.260273972602739, + "grad_norm": 73.41024017333984, + "learning_rate": 5.267376966007104e-06, + "loss": 0.9707, + "step": 5760 + }, + { + "epoch": 5.261187214611872, + "grad_norm": 2.671592950820923, + "learning_rate": 5.266362252663622e-06, + "loss": 0.0173, + "step": 5761 + }, + { + "epoch": 5.262100456621004, + "grad_norm": 4.9197516441345215, + "learning_rate": 5.265347539320143e-06, + "loss": 0.0332, + "step": 5762 + }, + { + "epoch": 5.263013698630137, + "grad_norm": 7.451664924621582, + "learning_rate": 5.2643328259766626e-06, + "loss": 0.0769, + "step": 5763 + }, + { + "epoch": 5.263926940639269, + "grad_norm": 7.567653179168701, + "learning_rate": 5.263318112633181e-06, + "loss": 0.0437, + "step": 5764 + }, + { + "epoch": 5.264840182648402, + "grad_norm": 84.69380187988281, + "learning_rate": 5.262303399289701e-06, + "loss": 2.4835, + "step": 5765 + }, + { + "epoch": 5.265753424657534, + "grad_norm": 2.1851155757904053, + "learning_rate": 5.261288685946221e-06, + "loss": 0.015, + "step": 5766 + }, + { + "epoch": 5.266666666666667, + "grad_norm": 0.3402155339717865, + "learning_rate": 5.26027397260274e-06, + "loss": 0.0027, + "step": 5767 + }, + { + "epoch": 5.267579908675799, + "grad_norm": 3.569012403488159, + "learning_rate": 5.259259259259259e-06, + "loss": 0.03, + "step": 5768 + }, + { + "epoch": 5.2684931506849315, + "grad_norm": 117.47514343261719, + "learning_rate": 5.258244545915779e-06, + "loss": 0.5616, + "step": 5769 + }, + { + "epoch": 5.269406392694064, + "grad_norm": 26.00856590270996, + "learning_rate": 5.2572298325722996e-06, + "loss": 0.1938, + "step": 5770 + }, + { + "epoch": 5.270319634703196, + "grad_norm": 3.7636802196502686, + "learning_rate": 5.256215119228818e-06, + "loss": 0.0278, + "step": 5771 + }, + { + "epoch": 5.271232876712329, + "grad_norm": 15.475831031799316, + "learning_rate": 5.255200405885338e-06, + "loss": 0.081, + "step": 5772 + }, + { + "epoch": 5.272146118721461, + "grad_norm": 83.12001037597656, + "learning_rate": 5.254185692541858e-06, + "loss": 1.2049, + "step": 5773 + }, + { + "epoch": 5.273059360730594, + "grad_norm": 57.47773361206055, + "learning_rate": 5.253170979198377e-06, + "loss": 0.6878, + "step": 5774 + }, + { + "epoch": 5.273972602739726, + "grad_norm": 20.052345275878906, + "learning_rate": 5.252156265854896e-06, + "loss": 0.2414, + "step": 5775 + }, + { + "epoch": 5.274885844748859, + "grad_norm": 72.3722152709961, + "learning_rate": 5.251141552511416e-06, + "loss": 0.9995, + "step": 5776 + }, + { + "epoch": 5.275799086757991, + "grad_norm": 10.405668258666992, + "learning_rate": 5.250126839167935e-06, + "loss": 0.0859, + "step": 5777 + }, + { + "epoch": 5.276712328767124, + "grad_norm": 17.119199752807617, + "learning_rate": 5.2491121258244545e-06, + "loss": 0.1086, + "step": 5778 + }, + { + "epoch": 5.277625570776256, + "grad_norm": 0.09773752838373184, + "learning_rate": 5.248097412480975e-06, + "loss": 0.0006, + "step": 5779 + }, + { + "epoch": 5.2785388127853885, + "grad_norm": 78.65313720703125, + "learning_rate": 5.247082699137495e-06, + "loss": 1.097, + "step": 5780 + }, + { + "epoch": 5.279452054794521, + "grad_norm": 24.26990509033203, + "learning_rate": 5.246067985794014e-06, + "loss": 0.1514, + "step": 5781 + }, + { + "epoch": 5.280365296803653, + "grad_norm": 4.172004699707031, + "learning_rate": 5.245053272450533e-06, + "loss": 0.0388, + "step": 5782 + }, + { + "epoch": 5.281278538812785, + "grad_norm": 47.5211181640625, + "learning_rate": 5.244038559107053e-06, + "loss": 0.1977, + "step": 5783 + }, + { + "epoch": 5.282191780821917, + "grad_norm": 6.840296745300293, + "learning_rate": 5.243023845763572e-06, + "loss": 0.0287, + "step": 5784 + }, + { + "epoch": 5.28310502283105, + "grad_norm": 25.165672302246094, + "learning_rate": 5.2420091324200915e-06, + "loss": 0.2251, + "step": 5785 + }, + { + "epoch": 5.284018264840182, + "grad_norm": 1.6749489307403564, + "learning_rate": 5.240994419076611e-06, + "loss": 0.0134, + "step": 5786 + }, + { + "epoch": 5.284931506849315, + "grad_norm": 5.869629859924316, + "learning_rate": 5.23997970573313e-06, + "loss": 0.0312, + "step": 5787 + }, + { + "epoch": 5.285844748858447, + "grad_norm": 37.88240432739258, + "learning_rate": 5.23896499238965e-06, + "loss": 0.328, + "step": 5788 + }, + { + "epoch": 5.28675799086758, + "grad_norm": 6.561133861541748, + "learning_rate": 5.23795027904617e-06, + "loss": 0.0559, + "step": 5789 + }, + { + "epoch": 5.287671232876712, + "grad_norm": 16.513580322265625, + "learning_rate": 5.23693556570269e-06, + "loss": 0.1243, + "step": 5790 + }, + { + "epoch": 5.288584474885845, + "grad_norm": 4.776734828948975, + "learning_rate": 5.235920852359209e-06, + "loss": 0.0302, + "step": 5791 + }, + { + "epoch": 5.289497716894977, + "grad_norm": 3.0749616622924805, + "learning_rate": 5.2349061390157285e-06, + "loss": 0.025, + "step": 5792 + }, + { + "epoch": 5.2904109589041095, + "grad_norm": 0.3986204266548157, + "learning_rate": 5.233891425672248e-06, + "loss": 0.0024, + "step": 5793 + }, + { + "epoch": 5.291324200913242, + "grad_norm": 16.508358001708984, + "learning_rate": 5.232876712328767e-06, + "loss": 0.0841, + "step": 5794 + }, + { + "epoch": 5.292237442922374, + "grad_norm": 19.871036529541016, + "learning_rate": 5.231861998985287e-06, + "loss": 0.1471, + "step": 5795 + }, + { + "epoch": 5.293150684931507, + "grad_norm": 6.517530918121338, + "learning_rate": 5.2308472856418065e-06, + "loss": 0.0491, + "step": 5796 + }, + { + "epoch": 5.294063926940639, + "grad_norm": 17.414030075073242, + "learning_rate": 5.229832572298325e-06, + "loss": 0.122, + "step": 5797 + }, + { + "epoch": 5.294977168949772, + "grad_norm": 7.732366561889648, + "learning_rate": 5.228817858954846e-06, + "loss": 0.052, + "step": 5798 + }, + { + "epoch": 5.295890410958904, + "grad_norm": 0.19831663370132446, + "learning_rate": 5.2278031456113655e-06, + "loss": 0.0016, + "step": 5799 + }, + { + "epoch": 5.296803652968037, + "grad_norm": 2.22233247756958, + "learning_rate": 5.226788432267885e-06, + "loss": 0.017, + "step": 5800 + }, + { + "epoch": 5.297716894977169, + "grad_norm": 57.77809524536133, + "learning_rate": 5.225773718924404e-06, + "loss": 0.5432, + "step": 5801 + }, + { + "epoch": 5.298630136986302, + "grad_norm": 7.128307819366455, + "learning_rate": 5.224759005580924e-06, + "loss": 0.0589, + "step": 5802 + }, + { + "epoch": 5.299543378995434, + "grad_norm": 0.6872591376304626, + "learning_rate": 5.2237442922374435e-06, + "loss": 0.0023, + "step": 5803 + }, + { + "epoch": 5.3004566210045665, + "grad_norm": 61.687042236328125, + "learning_rate": 5.222729578893962e-06, + "loss": 0.893, + "step": 5804 + }, + { + "epoch": 5.301369863013699, + "grad_norm": 6.305875301361084, + "learning_rate": 5.221714865550482e-06, + "loss": 0.0344, + "step": 5805 + }, + { + "epoch": 5.302283105022831, + "grad_norm": 1.331847906112671, + "learning_rate": 5.2207001522070025e-06, + "loss": 0.0092, + "step": 5806 + }, + { + "epoch": 5.303196347031964, + "grad_norm": 4.844211101531982, + "learning_rate": 5.219685438863521e-06, + "loss": 0.0271, + "step": 5807 + }, + { + "epoch": 5.304109589041096, + "grad_norm": 20.028274536132812, + "learning_rate": 5.218670725520041e-06, + "loss": 0.2134, + "step": 5808 + }, + { + "epoch": 5.305022831050229, + "grad_norm": 1.9864252805709839, + "learning_rate": 5.217656012176561e-06, + "loss": 0.0175, + "step": 5809 + }, + { + "epoch": 5.30593607305936, + "grad_norm": 31.675031661987305, + "learning_rate": 5.2166412988330805e-06, + "loss": 0.2607, + "step": 5810 + }, + { + "epoch": 5.306849315068493, + "grad_norm": 14.267372131347656, + "learning_rate": 5.215626585489599e-06, + "loss": 0.0736, + "step": 5811 + }, + { + "epoch": 5.307762557077625, + "grad_norm": 20.369213104248047, + "learning_rate": 5.214611872146119e-06, + "loss": 0.1701, + "step": 5812 + }, + { + "epoch": 5.308675799086758, + "grad_norm": 20.896657943725586, + "learning_rate": 5.213597158802639e-06, + "loss": 0.1431, + "step": 5813 + }, + { + "epoch": 5.30958904109589, + "grad_norm": 1.555895209312439, + "learning_rate": 5.2125824454591575e-06, + "loss": 0.0122, + "step": 5814 + }, + { + "epoch": 5.310502283105023, + "grad_norm": 0.04873627424240112, + "learning_rate": 5.211567732115678e-06, + "loss": 0.0004, + "step": 5815 + }, + { + "epoch": 5.311415525114155, + "grad_norm": 28.329578399658203, + "learning_rate": 5.210553018772198e-06, + "loss": 0.2959, + "step": 5816 + }, + { + "epoch": 5.3123287671232875, + "grad_norm": 3.30802321434021, + "learning_rate": 5.209538305428717e-06, + "loss": 0.0245, + "step": 5817 + }, + { + "epoch": 5.31324200913242, + "grad_norm": 1.8081899881362915, + "learning_rate": 5.208523592085236e-06, + "loss": 0.0165, + "step": 5818 + }, + { + "epoch": 5.314155251141552, + "grad_norm": 9.672830581665039, + "learning_rate": 5.207508878741756e-06, + "loss": 0.094, + "step": 5819 + }, + { + "epoch": 5.315068493150685, + "grad_norm": 11.65144157409668, + "learning_rate": 5.206494165398276e-06, + "loss": 0.0915, + "step": 5820 + }, + { + "epoch": 5.315981735159817, + "grad_norm": 7.896162033081055, + "learning_rate": 5.2054794520547945e-06, + "loss": 0.0727, + "step": 5821 + }, + { + "epoch": 5.31689497716895, + "grad_norm": 2.691967725753784, + "learning_rate": 5.204464738711314e-06, + "loss": 0.0186, + "step": 5822 + }, + { + "epoch": 5.317808219178082, + "grad_norm": 27.45469856262207, + "learning_rate": 5.203450025367835e-06, + "loss": 0.1643, + "step": 5823 + }, + { + "epoch": 5.318721461187215, + "grad_norm": 5.1416802406311035, + "learning_rate": 5.202435312024353e-06, + "loss": 0.0336, + "step": 5824 + }, + { + "epoch": 5.319634703196347, + "grad_norm": 21.335430145263672, + "learning_rate": 5.201420598680873e-06, + "loss": 0.0769, + "step": 5825 + }, + { + "epoch": 5.32054794520548, + "grad_norm": 2.773268222808838, + "learning_rate": 5.200405885337393e-06, + "loss": 0.0196, + "step": 5826 + }, + { + "epoch": 5.321461187214612, + "grad_norm": 30.57506561279297, + "learning_rate": 5.199391171993912e-06, + "loss": 0.2306, + "step": 5827 + }, + { + "epoch": 5.3223744292237445, + "grad_norm": 3.4534363746643066, + "learning_rate": 5.1983764586504315e-06, + "loss": 0.0232, + "step": 5828 + }, + { + "epoch": 5.323287671232877, + "grad_norm": 27.297483444213867, + "learning_rate": 5.197361745306951e-06, + "loss": 0.1916, + "step": 5829 + }, + { + "epoch": 5.324200913242009, + "grad_norm": 0.3132953345775604, + "learning_rate": 5.196347031963471e-06, + "loss": 0.002, + "step": 5830 + }, + { + "epoch": 5.325114155251142, + "grad_norm": 10.040693283081055, + "learning_rate": 5.19533231861999e-06, + "loss": 0.0811, + "step": 5831 + }, + { + "epoch": 5.326027397260274, + "grad_norm": 1.7519633769989014, + "learning_rate": 5.1943176052765094e-06, + "loss": 0.0112, + "step": 5832 + }, + { + "epoch": 5.326940639269407, + "grad_norm": 81.01386260986328, + "learning_rate": 5.19330289193303e-06, + "loss": 0.8949, + "step": 5833 + }, + { + "epoch": 5.327853881278539, + "grad_norm": 63.496116638183594, + "learning_rate": 5.192288178589549e-06, + "loss": 0.7307, + "step": 5834 + }, + { + "epoch": 5.328767123287671, + "grad_norm": 2.8409125804901123, + "learning_rate": 5.1912734652460685e-06, + "loss": 0.0241, + "step": 5835 + }, + { + "epoch": 5.329680365296804, + "grad_norm": 64.8731918334961, + "learning_rate": 5.190258751902588e-06, + "loss": 0.8437, + "step": 5836 + }, + { + "epoch": 5.330593607305936, + "grad_norm": 0.8032898306846619, + "learning_rate": 5.189244038559107e-06, + "loss": 0.0047, + "step": 5837 + }, + { + "epoch": 5.331506849315068, + "grad_norm": 12.163093566894531, + "learning_rate": 5.188229325215627e-06, + "loss": 0.1082, + "step": 5838 + }, + { + "epoch": 5.332420091324201, + "grad_norm": 13.785019874572754, + "learning_rate": 5.1872146118721464e-06, + "loss": 0.0939, + "step": 5839 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 88.19429779052734, + "learning_rate": 5.186199898528667e-06, + "loss": 1.6984, + "step": 5840 + }, + { + "epoch": 5.3342465753424655, + "grad_norm": 21.05609130859375, + "learning_rate": 5.185185185185185e-06, + "loss": 0.2166, + "step": 5841 + }, + { + "epoch": 5.335159817351598, + "grad_norm": 0.22464729845523834, + "learning_rate": 5.1841704718417055e-06, + "loss": 0.0019, + "step": 5842 + }, + { + "epoch": 5.33607305936073, + "grad_norm": 5.464113235473633, + "learning_rate": 5.183155758498225e-06, + "loss": 0.0493, + "step": 5843 + }, + { + "epoch": 5.336986301369863, + "grad_norm": 37.3677864074707, + "learning_rate": 5.182141045154744e-06, + "loss": 0.2533, + "step": 5844 + }, + { + "epoch": 5.337899543378995, + "grad_norm": 11.178701400756836, + "learning_rate": 5.181126331811264e-06, + "loss": 0.107, + "step": 5845 + }, + { + "epoch": 5.338812785388128, + "grad_norm": 4.295238971710205, + "learning_rate": 5.1801116184677834e-06, + "loss": 0.0294, + "step": 5846 + }, + { + "epoch": 5.33972602739726, + "grad_norm": 2.559962034225464, + "learning_rate": 5.179096905124302e-06, + "loss": 0.0198, + "step": 5847 + }, + { + "epoch": 5.340639269406393, + "grad_norm": 8.178559303283691, + "learning_rate": 5.178082191780822e-06, + "loss": 0.0548, + "step": 5848 + }, + { + "epoch": 5.341552511415525, + "grad_norm": 7.682490825653076, + "learning_rate": 5.177067478437342e-06, + "loss": 0.0517, + "step": 5849 + }, + { + "epoch": 5.342465753424658, + "grad_norm": 1.357388973236084, + "learning_rate": 5.176052765093862e-06, + "loss": 0.0105, + "step": 5850 + }, + { + "epoch": 5.34337899543379, + "grad_norm": 72.0926742553711, + "learning_rate": 5.175038051750381e-06, + "loss": 0.597, + "step": 5851 + }, + { + "epoch": 5.3442922374429225, + "grad_norm": 23.06375503540039, + "learning_rate": 5.174023338406901e-06, + "loss": 0.2164, + "step": 5852 + }, + { + "epoch": 5.345205479452055, + "grad_norm": 16.10531234741211, + "learning_rate": 5.1730086250634204e-06, + "loss": 0.1005, + "step": 5853 + }, + { + "epoch": 5.346118721461187, + "grad_norm": 5.4459381103515625, + "learning_rate": 5.171993911719939e-06, + "loss": 0.0429, + "step": 5854 + }, + { + "epoch": 5.34703196347032, + "grad_norm": 16.285144805908203, + "learning_rate": 5.170979198376459e-06, + "loss": 0.229, + "step": 5855 + }, + { + "epoch": 5.347945205479452, + "grad_norm": 0.19090360403060913, + "learning_rate": 5.169964485032979e-06, + "loss": 0.001, + "step": 5856 + }, + { + "epoch": 5.348858447488585, + "grad_norm": 7.281144142150879, + "learning_rate": 5.1689497716894975e-06, + "loss": 0.0588, + "step": 5857 + }, + { + "epoch": 5.349771689497717, + "grad_norm": 22.978731155395508, + "learning_rate": 5.167935058346017e-06, + "loss": 0.0741, + "step": 5858 + }, + { + "epoch": 5.35068493150685, + "grad_norm": 0.09037967771291733, + "learning_rate": 5.166920345002538e-06, + "loss": 0.0007, + "step": 5859 + }, + { + "epoch": 5.351598173515982, + "grad_norm": 1.760579228401184, + "learning_rate": 5.1659056316590574e-06, + "loss": 0.0131, + "step": 5860 + }, + { + "epoch": 5.352511415525115, + "grad_norm": 18.020103454589844, + "learning_rate": 5.164890918315576e-06, + "loss": 0.1389, + "step": 5861 + }, + { + "epoch": 5.353424657534246, + "grad_norm": 3.9396190643310547, + "learning_rate": 5.163876204972096e-06, + "loss": 0.028, + "step": 5862 + }, + { + "epoch": 5.3543378995433795, + "grad_norm": 4.370096683502197, + "learning_rate": 5.162861491628616e-06, + "loss": 0.0361, + "step": 5863 + }, + { + "epoch": 5.355251141552511, + "grad_norm": 7.833180904388428, + "learning_rate": 5.1618467782851345e-06, + "loss": 0.0567, + "step": 5864 + }, + { + "epoch": 5.3561643835616435, + "grad_norm": 6.507137298583984, + "learning_rate": 5.160832064941654e-06, + "loss": 0.0396, + "step": 5865 + }, + { + "epoch": 5.357077625570776, + "grad_norm": 20.867889404296875, + "learning_rate": 5.159817351598174e-06, + "loss": 0.3133, + "step": 5866 + }, + { + "epoch": 5.357990867579908, + "grad_norm": 0.3397907018661499, + "learning_rate": 5.158802638254693e-06, + "loss": 0.0034, + "step": 5867 + }, + { + "epoch": 5.358904109589041, + "grad_norm": 1.221357822418213, + "learning_rate": 5.1577879249112124e-06, + "loss": 0.0108, + "step": 5868 + }, + { + "epoch": 5.359817351598173, + "grad_norm": 62.89436721801758, + "learning_rate": 5.156773211567733e-06, + "loss": 1.3265, + "step": 5869 + }, + { + "epoch": 5.360730593607306, + "grad_norm": 118.56758117675781, + "learning_rate": 5.155758498224253e-06, + "loss": 4.5816, + "step": 5870 + }, + { + "epoch": 5.361643835616438, + "grad_norm": 91.17918395996094, + "learning_rate": 5.1547437848807715e-06, + "loss": 0.6436, + "step": 5871 + }, + { + "epoch": 5.362557077625571, + "grad_norm": 6.578936576843262, + "learning_rate": 5.153729071537291e-06, + "loss": 0.0549, + "step": 5872 + }, + { + "epoch": 5.363470319634703, + "grad_norm": 12.37380599975586, + "learning_rate": 5.152714358193811e-06, + "loss": 0.1148, + "step": 5873 + }, + { + "epoch": 5.364383561643836, + "grad_norm": 1.7468644380569458, + "learning_rate": 5.15169964485033e-06, + "loss": 0.0124, + "step": 5874 + }, + { + "epoch": 5.365296803652968, + "grad_norm": 10.446979522705078, + "learning_rate": 5.1506849315068494e-06, + "loss": 0.0729, + "step": 5875 + }, + { + "epoch": 5.3662100456621005, + "grad_norm": 10.1701078414917, + "learning_rate": 5.149670218163369e-06, + "loss": 0.0849, + "step": 5876 + }, + { + "epoch": 5.367123287671233, + "grad_norm": 8.006875038146973, + "learning_rate": 5.148655504819888e-06, + "loss": 0.0489, + "step": 5877 + }, + { + "epoch": 5.368036529680365, + "grad_norm": 43.984310150146484, + "learning_rate": 5.1476407914764085e-06, + "loss": 0.3978, + "step": 5878 + }, + { + "epoch": 5.368949771689498, + "grad_norm": 1.4150934219360352, + "learning_rate": 5.146626078132928e-06, + "loss": 0.0098, + "step": 5879 + }, + { + "epoch": 5.36986301369863, + "grad_norm": 54.544586181640625, + "learning_rate": 5.145611364789448e-06, + "loss": 0.8078, + "step": 5880 + }, + { + "epoch": 5.370776255707763, + "grad_norm": 11.954182624816895, + "learning_rate": 5.144596651445967e-06, + "loss": 0.0888, + "step": 5881 + }, + { + "epoch": 5.371689497716895, + "grad_norm": 69.14698028564453, + "learning_rate": 5.1435819381024864e-06, + "loss": 0.5679, + "step": 5882 + }, + { + "epoch": 5.372602739726028, + "grad_norm": 7.057109832763672, + "learning_rate": 5.142567224759006e-06, + "loss": 0.0599, + "step": 5883 + }, + { + "epoch": 5.37351598173516, + "grad_norm": 3.4207029342651367, + "learning_rate": 5.141552511415525e-06, + "loss": 0.027, + "step": 5884 + }, + { + "epoch": 5.3744292237442925, + "grad_norm": 46.9983024597168, + "learning_rate": 5.140537798072045e-06, + "loss": 0.4422, + "step": 5885 + }, + { + "epoch": 5.375342465753425, + "grad_norm": 1.3366843461990356, + "learning_rate": 5.139523084728565e-06, + "loss": 0.0125, + "step": 5886 + }, + { + "epoch": 5.3762557077625575, + "grad_norm": 9.488337516784668, + "learning_rate": 5.138508371385084e-06, + "loss": 0.0677, + "step": 5887 + }, + { + "epoch": 5.37716894977169, + "grad_norm": 17.9864501953125, + "learning_rate": 5.137493658041604e-06, + "loss": 0.17, + "step": 5888 + }, + { + "epoch": 5.3780821917808215, + "grad_norm": 1.2986209392547607, + "learning_rate": 5.1364789446981234e-06, + "loss": 0.0123, + "step": 5889 + }, + { + "epoch": 5.378995433789954, + "grad_norm": 8.121434211730957, + "learning_rate": 5.135464231354643e-06, + "loss": 0.0708, + "step": 5890 + }, + { + "epoch": 5.379908675799086, + "grad_norm": 1.1610867977142334, + "learning_rate": 5.134449518011162e-06, + "loss": 0.0107, + "step": 5891 + }, + { + "epoch": 5.380821917808219, + "grad_norm": 4.135074138641357, + "learning_rate": 5.133434804667682e-06, + "loss": 0.0262, + "step": 5892 + }, + { + "epoch": 5.381735159817351, + "grad_norm": 14.728768348693848, + "learning_rate": 5.132420091324201e-06, + "loss": 0.115, + "step": 5893 + }, + { + "epoch": 5.382648401826484, + "grad_norm": 5.032414436340332, + "learning_rate": 5.13140537798072e-06, + "loss": 0.0456, + "step": 5894 + }, + { + "epoch": 5.383561643835616, + "grad_norm": 2.1995325088500977, + "learning_rate": 5.130390664637241e-06, + "loss": 0.0167, + "step": 5895 + }, + { + "epoch": 5.384474885844749, + "grad_norm": 8.261009216308594, + "learning_rate": 5.12937595129376e-06, + "loss": 0.0752, + "step": 5896 + }, + { + "epoch": 5.385388127853881, + "grad_norm": 8.833806037902832, + "learning_rate": 5.128361237950279e-06, + "loss": 0.0477, + "step": 5897 + }, + { + "epoch": 5.3863013698630136, + "grad_norm": 0.12815351784229279, + "learning_rate": 5.127346524606799e-06, + "loss": 0.0007, + "step": 5898 + }, + { + "epoch": 5.387214611872146, + "grad_norm": 5.497215270996094, + "learning_rate": 5.126331811263319e-06, + "loss": 0.0445, + "step": 5899 + }, + { + "epoch": 5.3881278538812785, + "grad_norm": 48.07004165649414, + "learning_rate": 5.125317097919838e-06, + "loss": 0.5473, + "step": 5900 + }, + { + "epoch": 5.389041095890411, + "grad_norm": 0.9183828234672546, + "learning_rate": 5.124302384576357e-06, + "loss": 0.0032, + "step": 5901 + }, + { + "epoch": 5.389954337899543, + "grad_norm": 1.7162539958953857, + "learning_rate": 5.123287671232877e-06, + "loss": 0.0102, + "step": 5902 + }, + { + "epoch": 5.390867579908676, + "grad_norm": 46.5203857421875, + "learning_rate": 5.122272957889397e-06, + "loss": 0.4023, + "step": 5903 + }, + { + "epoch": 5.391780821917808, + "grad_norm": 0.7017456889152527, + "learning_rate": 5.121258244545915e-06, + "loss": 0.0059, + "step": 5904 + }, + { + "epoch": 5.392694063926941, + "grad_norm": 2.9552042484283447, + "learning_rate": 5.120243531202436e-06, + "loss": 0.0197, + "step": 5905 + }, + { + "epoch": 5.393607305936073, + "grad_norm": 4.2921295166015625, + "learning_rate": 5.119228817858956e-06, + "loss": 0.032, + "step": 5906 + }, + { + "epoch": 5.394520547945206, + "grad_norm": 1.774693489074707, + "learning_rate": 5.1182141045154745e-06, + "loss": 0.0148, + "step": 5907 + }, + { + "epoch": 5.395433789954338, + "grad_norm": 28.610071182250977, + "learning_rate": 5.117199391171994e-06, + "loss": 0.1869, + "step": 5908 + }, + { + "epoch": 5.3963470319634705, + "grad_norm": 48.29982376098633, + "learning_rate": 5.116184677828514e-06, + "loss": 0.4233, + "step": 5909 + }, + { + "epoch": 5.397260273972603, + "grad_norm": 0.6622174382209778, + "learning_rate": 5.1151699644850336e-06, + "loss": 0.005, + "step": 5910 + }, + { + "epoch": 5.3981735159817354, + "grad_norm": 77.94112396240234, + "learning_rate": 5.114155251141552e-06, + "loss": 1.5665, + "step": 5911 + }, + { + "epoch": 5.399086757990868, + "grad_norm": 7.198596954345703, + "learning_rate": 5.113140537798072e-06, + "loss": 0.0412, + "step": 5912 + }, + { + "epoch": 5.4, + "grad_norm": 0.9291622042655945, + "learning_rate": 5.112125824454593e-06, + "loss": 0.0053, + "step": 5913 + }, + { + "epoch": 5.400913242009133, + "grad_norm": 10.097634315490723, + "learning_rate": 5.1111111111111115e-06, + "loss": 0.0813, + "step": 5914 + }, + { + "epoch": 5.401826484018265, + "grad_norm": 41.14621353149414, + "learning_rate": 5.110096397767631e-06, + "loss": 0.5311, + "step": 5915 + }, + { + "epoch": 5.402739726027397, + "grad_norm": 7.847353458404541, + "learning_rate": 5.109081684424151e-06, + "loss": 0.0694, + "step": 5916 + }, + { + "epoch": 5.403652968036529, + "grad_norm": 3.795524835586548, + "learning_rate": 5.10806697108067e-06, + "loss": 0.0257, + "step": 5917 + }, + { + "epoch": 5.404566210045662, + "grad_norm": 110.69950103759766, + "learning_rate": 5.107052257737189e-06, + "loss": 1.8467, + "step": 5918 + }, + { + "epoch": 5.405479452054794, + "grad_norm": 82.85639953613281, + "learning_rate": 5.106037544393709e-06, + "loss": 1.4895, + "step": 5919 + }, + { + "epoch": 5.406392694063927, + "grad_norm": 2.972740411758423, + "learning_rate": 5.10502283105023e-06, + "loss": 0.0209, + "step": 5920 + }, + { + "epoch": 5.407305936073059, + "grad_norm": 39.022132873535156, + "learning_rate": 5.104008117706748e-06, + "loss": 0.2538, + "step": 5921 + }, + { + "epoch": 5.4082191780821915, + "grad_norm": 66.51889038085938, + "learning_rate": 5.102993404363268e-06, + "loss": 0.3635, + "step": 5922 + }, + { + "epoch": 5.409132420091324, + "grad_norm": 30.270530700683594, + "learning_rate": 5.101978691019788e-06, + "loss": 0.2383, + "step": 5923 + }, + { + "epoch": 5.4100456621004565, + "grad_norm": 26.724016189575195, + "learning_rate": 5.100963977676307e-06, + "loss": 0.1703, + "step": 5924 + }, + { + "epoch": 5.410958904109589, + "grad_norm": 107.16281127929688, + "learning_rate": 5.099949264332826e-06, + "loss": 0.8599, + "step": 5925 + }, + { + "epoch": 5.411872146118721, + "grad_norm": 1.6489795446395874, + "learning_rate": 5.098934550989346e-06, + "loss": 0.0131, + "step": 5926 + }, + { + "epoch": 5.412785388127854, + "grad_norm": 4.541843414306641, + "learning_rate": 5.097919837645865e-06, + "loss": 0.0266, + "step": 5927 + }, + { + "epoch": 5.413698630136986, + "grad_norm": 1.464401364326477, + "learning_rate": 5.096905124302385e-06, + "loss": 0.0099, + "step": 5928 + }, + { + "epoch": 5.414611872146119, + "grad_norm": 85.51927185058594, + "learning_rate": 5.095890410958904e-06, + "loss": 1.4401, + "step": 5929 + }, + { + "epoch": 5.415525114155251, + "grad_norm": 2.5539400577545166, + "learning_rate": 5.094875697615425e-06, + "loss": 0.0173, + "step": 5930 + }, + { + "epoch": 5.416438356164384, + "grad_norm": 13.448187828063965, + "learning_rate": 5.093860984271944e-06, + "loss": 0.0974, + "step": 5931 + }, + { + "epoch": 5.417351598173516, + "grad_norm": 51.22328186035156, + "learning_rate": 5.092846270928463e-06, + "loss": 0.5887, + "step": 5932 + }, + { + "epoch": 5.4182648401826485, + "grad_norm": 0.6962748765945435, + "learning_rate": 5.091831557584983e-06, + "loss": 0.0045, + "step": 5933 + }, + { + "epoch": 5.419178082191781, + "grad_norm": 0.8141483664512634, + "learning_rate": 5.090816844241502e-06, + "loss": 0.0046, + "step": 5934 + }, + { + "epoch": 5.420091324200913, + "grad_norm": 1.951393485069275, + "learning_rate": 5.089802130898022e-06, + "loss": 0.0131, + "step": 5935 + }, + { + "epoch": 5.421004566210046, + "grad_norm": 3.8451387882232666, + "learning_rate": 5.088787417554541e-06, + "loss": 0.0203, + "step": 5936 + }, + { + "epoch": 5.421917808219178, + "grad_norm": 21.77200698852539, + "learning_rate": 5.08777270421106e-06, + "loss": 0.1852, + "step": 5937 + }, + { + "epoch": 5.422831050228311, + "grad_norm": 80.35626220703125, + "learning_rate": 5.08675799086758e-06, + "loss": 2.9478, + "step": 5938 + }, + { + "epoch": 5.423744292237443, + "grad_norm": 0.36450034379959106, + "learning_rate": 5.0857432775241e-06, + "loss": 0.002, + "step": 5939 + }, + { + "epoch": 5.424657534246576, + "grad_norm": 1.508590579032898, + "learning_rate": 5.08472856418062e-06, + "loss": 0.0102, + "step": 5940 + }, + { + "epoch": 5.425570776255708, + "grad_norm": 0.9328710436820984, + "learning_rate": 5.083713850837139e-06, + "loss": 0.0076, + "step": 5941 + }, + { + "epoch": 5.426484018264841, + "grad_norm": 110.7034912109375, + "learning_rate": 5.082699137493659e-06, + "loss": 3.5561, + "step": 5942 + }, + { + "epoch": 5.427397260273972, + "grad_norm": 0.8362423181533813, + "learning_rate": 5.081684424150178e-06, + "loss": 0.0051, + "step": 5943 + }, + { + "epoch": 5.428310502283105, + "grad_norm": 1.2031800746917725, + "learning_rate": 5.080669710806697e-06, + "loss": 0.0063, + "step": 5944 + }, + { + "epoch": 5.429223744292237, + "grad_norm": 1.0873757600784302, + "learning_rate": 5.079654997463217e-06, + "loss": 0.0057, + "step": 5945 + }, + { + "epoch": 5.4301369863013695, + "grad_norm": 20.004844665527344, + "learning_rate": 5.0786402841197366e-06, + "loss": 0.1237, + "step": 5946 + }, + { + "epoch": 5.431050228310502, + "grad_norm": 0.517219066619873, + "learning_rate": 5.077625570776255e-06, + "loss": 0.0034, + "step": 5947 + }, + { + "epoch": 5.4319634703196344, + "grad_norm": 60.17050552368164, + "learning_rate": 5.076610857432775e-06, + "loss": 0.4689, + "step": 5948 + }, + { + "epoch": 5.432876712328767, + "grad_norm": 1.8747438192367554, + "learning_rate": 5.075596144089296e-06, + "loss": 0.0136, + "step": 5949 + }, + { + "epoch": 5.433789954337899, + "grad_norm": 0.758164644241333, + "learning_rate": 5.074581430745815e-06, + "loss": 0.0067, + "step": 5950 + }, + { + "epoch": 5.434703196347032, + "grad_norm": 86.82738494873047, + "learning_rate": 5.073566717402334e-06, + "loss": 2.3296, + "step": 5951 + }, + { + "epoch": 5.435616438356164, + "grad_norm": 91.30095672607422, + "learning_rate": 5.072552004058854e-06, + "loss": 1.1623, + "step": 5952 + }, + { + "epoch": 5.436529680365297, + "grad_norm": 8.408917427062988, + "learning_rate": 5.0715372907153736e-06, + "loss": 0.0784, + "step": 5953 + }, + { + "epoch": 5.437442922374429, + "grad_norm": 15.434286117553711, + "learning_rate": 5.070522577371892e-06, + "loss": 0.156, + "step": 5954 + }, + { + "epoch": 5.438356164383562, + "grad_norm": 22.380537033081055, + "learning_rate": 5.069507864028412e-06, + "loss": 0.1996, + "step": 5955 + }, + { + "epoch": 5.439269406392694, + "grad_norm": 0.45679184794425964, + "learning_rate": 5.068493150684932e-06, + "loss": 0.0028, + "step": 5956 + }, + { + "epoch": 5.4401826484018265, + "grad_norm": 0.2658367455005646, + "learning_rate": 5.067478437341451e-06, + "loss": 0.0016, + "step": 5957 + }, + { + "epoch": 5.441095890410959, + "grad_norm": 2.6697499752044678, + "learning_rate": 5.066463723997971e-06, + "loss": 0.0197, + "step": 5958 + }, + { + "epoch": 5.442009132420091, + "grad_norm": 53.8836555480957, + "learning_rate": 5.065449010654491e-06, + "loss": 0.6868, + "step": 5959 + }, + { + "epoch": 5.442922374429224, + "grad_norm": 0.5290911793708801, + "learning_rate": 5.0644342973110106e-06, + "loss": 0.004, + "step": 5960 + }, + { + "epoch": 5.443835616438356, + "grad_norm": 20.748857498168945, + "learning_rate": 5.063419583967529e-06, + "loss": 0.1417, + "step": 5961 + }, + { + "epoch": 5.444748858447489, + "grad_norm": 11.663347244262695, + "learning_rate": 5.062404870624049e-06, + "loss": 0.1018, + "step": 5962 + }, + { + "epoch": 5.445662100456621, + "grad_norm": 2.990483522415161, + "learning_rate": 5.061390157280569e-06, + "loss": 0.0162, + "step": 5963 + }, + { + "epoch": 5.446575342465754, + "grad_norm": 3.5089271068573, + "learning_rate": 5.060375443937088e-06, + "loss": 0.0313, + "step": 5964 + }, + { + "epoch": 5.447488584474886, + "grad_norm": 3.0047595500946045, + "learning_rate": 5.059360730593607e-06, + "loss": 0.0196, + "step": 5965 + }, + { + "epoch": 5.448401826484019, + "grad_norm": 6.367799758911133, + "learning_rate": 5.058346017250128e-06, + "loss": 0.0434, + "step": 5966 + }, + { + "epoch": 5.449315068493151, + "grad_norm": 4.146515846252441, + "learning_rate": 5.057331303906647e-06, + "loss": 0.0224, + "step": 5967 + }, + { + "epoch": 5.4502283105022835, + "grad_norm": 59.11982345581055, + "learning_rate": 5.056316590563166e-06, + "loss": 0.2703, + "step": 5968 + }, + { + "epoch": 5.451141552511416, + "grad_norm": 0.2023552805185318, + "learning_rate": 5.055301877219686e-06, + "loss": 0.0017, + "step": 5969 + }, + { + "epoch": 5.4520547945205475, + "grad_norm": 114.99974822998047, + "learning_rate": 5.054287163876206e-06, + "loss": 0.9646, + "step": 5970 + }, + { + "epoch": 5.45296803652968, + "grad_norm": 2.7608485221862793, + "learning_rate": 5.053272450532725e-06, + "loss": 0.0254, + "step": 5971 + }, + { + "epoch": 5.453881278538812, + "grad_norm": 4.14379358291626, + "learning_rate": 5.052257737189244e-06, + "loss": 0.0277, + "step": 5972 + }, + { + "epoch": 5.454794520547945, + "grad_norm": 14.252459526062012, + "learning_rate": 5.051243023845764e-06, + "loss": 0.1177, + "step": 5973 + }, + { + "epoch": 5.455707762557077, + "grad_norm": 8.666152000427246, + "learning_rate": 5.050228310502283e-06, + "loss": 0.0635, + "step": 5974 + }, + { + "epoch": 5.45662100456621, + "grad_norm": 19.016586303710938, + "learning_rate": 5.049213597158803e-06, + "loss": 0.1579, + "step": 5975 + }, + { + "epoch": 5.457534246575342, + "grad_norm": 7.026690483093262, + "learning_rate": 5.048198883815323e-06, + "loss": 0.0605, + "step": 5976 + }, + { + "epoch": 5.458447488584475, + "grad_norm": 9.627598762512207, + "learning_rate": 5.047184170471842e-06, + "loss": 0.0703, + "step": 5977 + }, + { + "epoch": 5.459360730593607, + "grad_norm": 19.292964935302734, + "learning_rate": 5.046169457128362e-06, + "loss": 0.1885, + "step": 5978 + }, + { + "epoch": 5.46027397260274, + "grad_norm": 3.2862353324890137, + "learning_rate": 5.045154743784881e-06, + "loss": 0.0295, + "step": 5979 + }, + { + "epoch": 5.461187214611872, + "grad_norm": 2.1448216438293457, + "learning_rate": 5.044140030441401e-06, + "loss": 0.0223, + "step": 5980 + }, + { + "epoch": 5.4621004566210045, + "grad_norm": 24.472963333129883, + "learning_rate": 5.04312531709792e-06, + "loss": 0.1443, + "step": 5981 + }, + { + "epoch": 5.463013698630137, + "grad_norm": 12.700884819030762, + "learning_rate": 5.0421106037544395e-06, + "loss": 0.0841, + "step": 5982 + }, + { + "epoch": 5.463926940639269, + "grad_norm": 6.250962734222412, + "learning_rate": 5.04109589041096e-06, + "loss": 0.0404, + "step": 5983 + }, + { + "epoch": 5.464840182648402, + "grad_norm": 30.17221450805664, + "learning_rate": 5.040081177067478e-06, + "loss": 0.1881, + "step": 5984 + }, + { + "epoch": 5.465753424657534, + "grad_norm": 9.614295959472656, + "learning_rate": 5.039066463723999e-06, + "loss": 0.0821, + "step": 5985 + }, + { + "epoch": 5.466666666666667, + "grad_norm": 2.46037220954895, + "learning_rate": 5.038051750380518e-06, + "loss": 0.0208, + "step": 5986 + }, + { + "epoch": 5.467579908675799, + "grad_norm": 75.8371353149414, + "learning_rate": 5.037037037037037e-06, + "loss": 1.2497, + "step": 5987 + }, + { + "epoch": 5.468493150684932, + "grad_norm": 1.6166146993637085, + "learning_rate": 5.036022323693557e-06, + "loss": 0.0131, + "step": 5988 + }, + { + "epoch": 5.469406392694064, + "grad_norm": 57.98878479003906, + "learning_rate": 5.0350076103500765e-06, + "loss": 0.4029, + "step": 5989 + }, + { + "epoch": 5.470319634703197, + "grad_norm": 10.550365447998047, + "learning_rate": 5.033992897006596e-06, + "loss": 0.0726, + "step": 5990 + }, + { + "epoch": 5.471232876712329, + "grad_norm": 68.17037963867188, + "learning_rate": 5.032978183663115e-06, + "loss": 0.5328, + "step": 5991 + }, + { + "epoch": 5.4721461187214615, + "grad_norm": 29.538679122924805, + "learning_rate": 5.031963470319635e-06, + "loss": 0.3673, + "step": 5992 + }, + { + "epoch": 5.473059360730594, + "grad_norm": 11.588109970092773, + "learning_rate": 5.030948756976155e-06, + "loss": 0.0815, + "step": 5993 + }, + { + "epoch": 5.473972602739726, + "grad_norm": 0.7635502815246582, + "learning_rate": 5.029934043632674e-06, + "loss": 0.0055, + "step": 5994 + }, + { + "epoch": 5.474885844748858, + "grad_norm": 8.466620445251465, + "learning_rate": 5.028919330289194e-06, + "loss": 0.0781, + "step": 5995 + }, + { + "epoch": 5.475799086757991, + "grad_norm": 0.5393479466438293, + "learning_rate": 5.0279046169457135e-06, + "loss": 0.0041, + "step": 5996 + }, + { + "epoch": 5.476712328767123, + "grad_norm": 13.565749168395996, + "learning_rate": 5.026889903602232e-06, + "loss": 0.104, + "step": 5997 + }, + { + "epoch": 5.477625570776255, + "grad_norm": 8.609330177307129, + "learning_rate": 5.025875190258752e-06, + "loss": 0.0726, + "step": 5998 + }, + { + "epoch": 5.478538812785388, + "grad_norm": 3.6305742263793945, + "learning_rate": 5.024860476915272e-06, + "loss": 0.0121, + "step": 5999 + }, + { + "epoch": 5.47945205479452, + "grad_norm": 1.4688007831573486, + "learning_rate": 5.0238457635717915e-06, + "loss": 0.0113, + "step": 6000 + }, + { + "epoch": 5.480365296803653, + "grad_norm": 1.271701455116272, + "learning_rate": 5.02283105022831e-06, + "loss": 0.0076, + "step": 6001 + }, + { + "epoch": 5.481278538812785, + "grad_norm": 1.557411551475525, + "learning_rate": 5.021816336884831e-06, + "loss": 0.0151, + "step": 6002 + }, + { + "epoch": 5.482191780821918, + "grad_norm": 2.621350049972534, + "learning_rate": 5.0208016235413505e-06, + "loss": 0.0188, + "step": 6003 + }, + { + "epoch": 5.48310502283105, + "grad_norm": 24.8333683013916, + "learning_rate": 5.019786910197869e-06, + "loss": 0.276, + "step": 6004 + }, + { + "epoch": 5.4840182648401825, + "grad_norm": 16.49659538269043, + "learning_rate": 5.018772196854389e-06, + "loss": 0.1324, + "step": 6005 + }, + { + "epoch": 5.484931506849315, + "grad_norm": 27.885662078857422, + "learning_rate": 5.017757483510909e-06, + "loss": 0.2772, + "step": 6006 + }, + { + "epoch": 5.485844748858447, + "grad_norm": 4.255710601806641, + "learning_rate": 5.016742770167428e-06, + "loss": 0.0327, + "step": 6007 + }, + { + "epoch": 5.48675799086758, + "grad_norm": 5.307809829711914, + "learning_rate": 5.015728056823947e-06, + "loss": 0.0457, + "step": 6008 + }, + { + "epoch": 5.487671232876712, + "grad_norm": 26.16577911376953, + "learning_rate": 5.014713343480467e-06, + "loss": 0.2247, + "step": 6009 + }, + { + "epoch": 5.488584474885845, + "grad_norm": 1.607830286026001, + "learning_rate": 5.0136986301369875e-06, + "loss": 0.0145, + "step": 6010 + }, + { + "epoch": 5.489497716894977, + "grad_norm": 0.23576274514198303, + "learning_rate": 5.012683916793506e-06, + "loss": 0.0017, + "step": 6011 + }, + { + "epoch": 5.49041095890411, + "grad_norm": 10.563131332397461, + "learning_rate": 5.011669203450026e-06, + "loss": 0.0673, + "step": 6012 + }, + { + "epoch": 5.491324200913242, + "grad_norm": 9.632599830627441, + "learning_rate": 5.010654490106546e-06, + "loss": 0.0664, + "step": 6013 + }, + { + "epoch": 5.492237442922375, + "grad_norm": 93.74455261230469, + "learning_rate": 5.009639776763065e-06, + "loss": 0.9796, + "step": 6014 + }, + { + "epoch": 5.493150684931507, + "grad_norm": 0.9360544085502625, + "learning_rate": 5.008625063419584e-06, + "loss": 0.0066, + "step": 6015 + }, + { + "epoch": 5.4940639269406395, + "grad_norm": 15.855327606201172, + "learning_rate": 5.007610350076104e-06, + "loss": 0.1794, + "step": 6016 + }, + { + "epoch": 5.494977168949772, + "grad_norm": 29.42024040222168, + "learning_rate": 5.006595636732623e-06, + "loss": 0.2246, + "step": 6017 + }, + { + "epoch": 5.495890410958904, + "grad_norm": 3.4493730068206787, + "learning_rate": 5.0055809233891425e-06, + "loss": 0.0326, + "step": 6018 + }, + { + "epoch": 5.496803652968037, + "grad_norm": 9.025847434997559, + "learning_rate": 5.004566210045663e-06, + "loss": 0.0714, + "step": 6019 + }, + { + "epoch": 5.497716894977169, + "grad_norm": 1.9629707336425781, + "learning_rate": 5.003551496702183e-06, + "loss": 0.018, + "step": 6020 + }, + { + "epoch": 5.498630136986302, + "grad_norm": 5.16473388671875, + "learning_rate": 5.002536783358702e-06, + "loss": 0.0307, + "step": 6021 + }, + { + "epoch": 5.499543378995433, + "grad_norm": 3.84019136428833, + "learning_rate": 5.001522070015221e-06, + "loss": 0.0259, + "step": 6022 + }, + { + "epoch": 5.500456621004567, + "grad_norm": 0.726408839225769, + "learning_rate": 5.000507356671741e-06, + "loss": 0.0039, + "step": 6023 + }, + { + "epoch": 5.501369863013698, + "grad_norm": 12.584319114685059, + "learning_rate": 4.999492643328261e-06, + "loss": 0.1294, + "step": 6024 + }, + { + "epoch": 5.502283105022831, + "grad_norm": 3.388960599899292, + "learning_rate": 4.9984779299847795e-06, + "loss": 0.0254, + "step": 6025 + }, + { + "epoch": 5.503196347031963, + "grad_norm": 7.785329818725586, + "learning_rate": 4.997463216641299e-06, + "loss": 0.0728, + "step": 6026 + }, + { + "epoch": 5.504109589041096, + "grad_norm": 6.670809745788574, + "learning_rate": 4.996448503297819e-06, + "loss": 0.0419, + "step": 6027 + }, + { + "epoch": 5.505022831050228, + "grad_norm": 36.80439758300781, + "learning_rate": 4.995433789954338e-06, + "loss": 0.2652, + "step": 6028 + }, + { + "epoch": 5.5059360730593605, + "grad_norm": 4.270063400268555, + "learning_rate": 4.994419076610858e-06, + "loss": 0.0211, + "step": 6029 + }, + { + "epoch": 5.506849315068493, + "grad_norm": 4.915149211883545, + "learning_rate": 4.993404363267377e-06, + "loss": 0.0328, + "step": 6030 + }, + { + "epoch": 5.507762557077625, + "grad_norm": 20.821653366088867, + "learning_rate": 4.992389649923897e-06, + "loss": 0.1465, + "step": 6031 + }, + { + "epoch": 5.508675799086758, + "grad_norm": 0.34285616874694824, + "learning_rate": 4.9913749365804165e-06, + "loss": 0.0014, + "step": 6032 + }, + { + "epoch": 5.50958904109589, + "grad_norm": 1.092483401298523, + "learning_rate": 4.990360223236936e-06, + "loss": 0.01, + "step": 6033 + }, + { + "epoch": 5.510502283105023, + "grad_norm": 1.1294043064117432, + "learning_rate": 4.989345509893456e-06, + "loss": 0.0091, + "step": 6034 + }, + { + "epoch": 5.511415525114155, + "grad_norm": 2.1373627185821533, + "learning_rate": 4.988330796549975e-06, + "loss": 0.0156, + "step": 6035 + }, + { + "epoch": 5.512328767123288, + "grad_norm": 25.270164489746094, + "learning_rate": 4.9873160832064944e-06, + "loss": 0.3364, + "step": 6036 + }, + { + "epoch": 5.51324200913242, + "grad_norm": 59.81551742553711, + "learning_rate": 4.986301369863014e-06, + "loss": 0.894, + "step": 6037 + }, + { + "epoch": 5.514155251141553, + "grad_norm": 3.044848680496216, + "learning_rate": 4.985286656519534e-06, + "loss": 0.0194, + "step": 6038 + }, + { + "epoch": 5.515068493150685, + "grad_norm": 3.9017834663391113, + "learning_rate": 4.9842719431760535e-06, + "loss": 0.0346, + "step": 6039 + }, + { + "epoch": 5.5159817351598175, + "grad_norm": 15.005702018737793, + "learning_rate": 4.983257229832572e-06, + "loss": 0.1524, + "step": 6040 + }, + { + "epoch": 5.51689497716895, + "grad_norm": 26.86388397216797, + "learning_rate": 4.982242516489092e-06, + "loss": 0.2648, + "step": 6041 + }, + { + "epoch": 5.517808219178082, + "grad_norm": 15.346484184265137, + "learning_rate": 4.981227803145612e-06, + "loss": 0.1032, + "step": 6042 + }, + { + "epoch": 5.518721461187215, + "grad_norm": 21.00322914123535, + "learning_rate": 4.9802130898021314e-06, + "loss": 0.1646, + "step": 6043 + }, + { + "epoch": 5.519634703196347, + "grad_norm": 0.6827477812767029, + "learning_rate": 4.979198376458651e-06, + "loss": 0.0046, + "step": 6044 + }, + { + "epoch": 5.52054794520548, + "grad_norm": 8.906719207763672, + "learning_rate": 4.97818366311517e-06, + "loss": 0.0649, + "step": 6045 + }, + { + "epoch": 5.521461187214612, + "grad_norm": 0.9891359210014343, + "learning_rate": 4.97716894977169e-06, + "loss": 0.0082, + "step": 6046 + }, + { + "epoch": 5.522374429223745, + "grad_norm": 1.1630502939224243, + "learning_rate": 4.976154236428209e-06, + "loss": 0.0083, + "step": 6047 + }, + { + "epoch": 5.523287671232877, + "grad_norm": 1.7255055904388428, + "learning_rate": 4.975139523084729e-06, + "loss": 0.0109, + "step": 6048 + }, + { + "epoch": 5.524200913242009, + "grad_norm": 24.820770263671875, + "learning_rate": 4.974124809741249e-06, + "loss": 0.3047, + "step": 6049 + }, + { + "epoch": 5.525114155251142, + "grad_norm": 34.021671295166016, + "learning_rate": 4.973110096397768e-06, + "loss": 0.3733, + "step": 6050 + }, + { + "epoch": 5.526027397260274, + "grad_norm": 0.6970183849334717, + "learning_rate": 4.972095383054287e-06, + "loss": 0.0051, + "step": 6051 + }, + { + "epoch": 5.526940639269406, + "grad_norm": 0.27286431193351746, + "learning_rate": 4.971080669710807e-06, + "loss": 0.0017, + "step": 6052 + }, + { + "epoch": 5.5278538812785385, + "grad_norm": 9.174155235290527, + "learning_rate": 4.970065956367327e-06, + "loss": 0.0582, + "step": 6053 + }, + { + "epoch": 5.528767123287671, + "grad_norm": 3.0344417095184326, + "learning_rate": 4.969051243023846e-06, + "loss": 0.0238, + "step": 6054 + }, + { + "epoch": 5.529680365296803, + "grad_norm": 3.945967435836792, + "learning_rate": 4.968036529680366e-06, + "loss": 0.0264, + "step": 6055 + }, + { + "epoch": 5.530593607305936, + "grad_norm": 45.57398986816406, + "learning_rate": 4.967021816336885e-06, + "loss": 0.2147, + "step": 6056 + }, + { + "epoch": 5.531506849315068, + "grad_norm": 26.44235610961914, + "learning_rate": 4.966007102993405e-06, + "loss": 0.2043, + "step": 6057 + }, + { + "epoch": 5.532420091324201, + "grad_norm": 12.065664291381836, + "learning_rate": 4.964992389649924e-06, + "loss": 0.0872, + "step": 6058 + }, + { + "epoch": 5.533333333333333, + "grad_norm": 11.060072898864746, + "learning_rate": 4.963977676306444e-06, + "loss": 0.1048, + "step": 6059 + }, + { + "epoch": 5.534246575342466, + "grad_norm": 1.4485132694244385, + "learning_rate": 4.962962962962964e-06, + "loss": 0.0088, + "step": 6060 + }, + { + "epoch": 5.535159817351598, + "grad_norm": 19.72361183166504, + "learning_rate": 4.9619482496194825e-06, + "loss": 0.1742, + "step": 6061 + }, + { + "epoch": 5.536073059360731, + "grad_norm": 0.4722713530063629, + "learning_rate": 4.960933536276002e-06, + "loss": 0.0036, + "step": 6062 + }, + { + "epoch": 5.536986301369863, + "grad_norm": 79.97166442871094, + "learning_rate": 4.959918822932522e-06, + "loss": 1.4727, + "step": 6063 + }, + { + "epoch": 5.5378995433789955, + "grad_norm": 13.989429473876953, + "learning_rate": 4.958904109589042e-06, + "loss": 0.1363, + "step": 6064 + }, + { + "epoch": 5.538812785388128, + "grad_norm": 0.04985082522034645, + "learning_rate": 4.957889396245561e-06, + "loss": 0.0003, + "step": 6065 + }, + { + "epoch": 5.53972602739726, + "grad_norm": 9.282301902770996, + "learning_rate": 4.95687468290208e-06, + "loss": 0.0811, + "step": 6066 + }, + { + "epoch": 5.540639269406393, + "grad_norm": 2.9482414722442627, + "learning_rate": 4.9558599695586e-06, + "loss": 0.0274, + "step": 6067 + }, + { + "epoch": 5.541552511415525, + "grad_norm": 28.8284969329834, + "learning_rate": 4.9548452562151195e-06, + "loss": 0.4109, + "step": 6068 + }, + { + "epoch": 5.542465753424658, + "grad_norm": 4.761038780212402, + "learning_rate": 4.953830542871639e-06, + "loss": 0.0333, + "step": 6069 + }, + { + "epoch": 5.54337899543379, + "grad_norm": 18.150468826293945, + "learning_rate": 4.952815829528159e-06, + "loss": 0.1463, + "step": 6070 + }, + { + "epoch": 5.544292237442923, + "grad_norm": 7.393531799316406, + "learning_rate": 4.951801116184678e-06, + "loss": 0.0574, + "step": 6071 + }, + { + "epoch": 5.545205479452055, + "grad_norm": 2.088578939437866, + "learning_rate": 4.9507864028411974e-06, + "loss": 0.0181, + "step": 6072 + }, + { + "epoch": 5.546118721461188, + "grad_norm": 0.6927598118782043, + "learning_rate": 4.949771689497717e-06, + "loss": 0.0036, + "step": 6073 + }, + { + "epoch": 5.54703196347032, + "grad_norm": 24.260215759277344, + "learning_rate": 4.948756976154237e-06, + "loss": 0.2845, + "step": 6074 + }, + { + "epoch": 5.5479452054794525, + "grad_norm": 32.01703643798828, + "learning_rate": 4.9477422628107565e-06, + "loss": 0.3517, + "step": 6075 + }, + { + "epoch": 5.548858447488584, + "grad_norm": 5.223072052001953, + "learning_rate": 4.946727549467275e-06, + "loss": 0.0361, + "step": 6076 + }, + { + "epoch": 5.549771689497717, + "grad_norm": 0.023917928338050842, + "learning_rate": 4.945712836123796e-06, + "loss": 0.0002, + "step": 6077 + }, + { + "epoch": 5.550684931506849, + "grad_norm": 9.745906829833984, + "learning_rate": 4.944698122780315e-06, + "loss": 0.0477, + "step": 6078 + }, + { + "epoch": 5.551598173515981, + "grad_norm": 13.510110855102539, + "learning_rate": 4.9436834094368344e-06, + "loss": 0.1596, + "step": 6079 + }, + { + "epoch": 5.552511415525114, + "grad_norm": 60.00447082519531, + "learning_rate": 4.942668696093354e-06, + "loss": 0.3758, + "step": 6080 + }, + { + "epoch": 5.553424657534246, + "grad_norm": 81.26018524169922, + "learning_rate": 4.941653982749873e-06, + "loss": 0.9188, + "step": 6081 + }, + { + "epoch": 5.554337899543379, + "grad_norm": 63.509300231933594, + "learning_rate": 4.9406392694063935e-06, + "loss": 0.4904, + "step": 6082 + }, + { + "epoch": 5.555251141552511, + "grad_norm": 0.20432963967323303, + "learning_rate": 4.939624556062912e-06, + "loss": 0.0015, + "step": 6083 + }, + { + "epoch": 5.556164383561644, + "grad_norm": 0.8502029776573181, + "learning_rate": 4.938609842719432e-06, + "loss": 0.0049, + "step": 6084 + }, + { + "epoch": 5.557077625570776, + "grad_norm": 4.322597503662109, + "learning_rate": 4.937595129375952e-06, + "loss": 0.0317, + "step": 6085 + }, + { + "epoch": 5.557990867579909, + "grad_norm": 0.5894355177879333, + "learning_rate": 4.9365804160324706e-06, + "loss": 0.0047, + "step": 6086 + }, + { + "epoch": 5.558904109589041, + "grad_norm": 12.579133987426758, + "learning_rate": 4.935565702688991e-06, + "loss": 0.0919, + "step": 6087 + }, + { + "epoch": 5.5598173515981735, + "grad_norm": 1.3774815797805786, + "learning_rate": 4.93455098934551e-06, + "loss": 0.0084, + "step": 6088 + }, + { + "epoch": 5.560730593607306, + "grad_norm": 8.177870750427246, + "learning_rate": 4.93353627600203e-06, + "loss": 0.0686, + "step": 6089 + }, + { + "epoch": 5.561643835616438, + "grad_norm": 31.09693145751953, + "learning_rate": 4.932521562658549e-06, + "loss": 0.3656, + "step": 6090 + }, + { + "epoch": 5.562557077625571, + "grad_norm": 0.7116105556488037, + "learning_rate": 4.931506849315069e-06, + "loss": 0.0038, + "step": 6091 + }, + { + "epoch": 5.563470319634703, + "grad_norm": 5.596746444702148, + "learning_rate": 4.930492135971589e-06, + "loss": 0.0551, + "step": 6092 + }, + { + "epoch": 5.564383561643836, + "grad_norm": 5.6244306564331055, + "learning_rate": 4.9294774226281076e-06, + "loss": 0.0394, + "step": 6093 + }, + { + "epoch": 5.565296803652968, + "grad_norm": 8.540278434753418, + "learning_rate": 4.928462709284627e-06, + "loss": 0.0265, + "step": 6094 + }, + { + "epoch": 5.566210045662101, + "grad_norm": 0.6983941197395325, + "learning_rate": 4.927447995941147e-06, + "loss": 0.003, + "step": 6095 + }, + { + "epoch": 5.567123287671233, + "grad_norm": 4.494696140289307, + "learning_rate": 4.926433282597667e-06, + "loss": 0.029, + "step": 6096 + }, + { + "epoch": 5.5680365296803656, + "grad_norm": 33.862857818603516, + "learning_rate": 4.925418569254186e-06, + "loss": 0.2859, + "step": 6097 + }, + { + "epoch": 5.568949771689498, + "grad_norm": 9.512170791625977, + "learning_rate": 4.924403855910705e-06, + "loss": 0.0629, + "step": 6098 + }, + { + "epoch": 5.5698630136986305, + "grad_norm": 43.25621032714844, + "learning_rate": 4.923389142567226e-06, + "loss": 0.4887, + "step": 6099 + }, + { + "epoch": 5.570776255707763, + "grad_norm": 5.7355852127075195, + "learning_rate": 4.9223744292237446e-06, + "loss": 0.0344, + "step": 6100 + }, + { + "epoch": 5.5716894977168945, + "grad_norm": 3.0707647800445557, + "learning_rate": 4.921359715880264e-06, + "loss": 0.0233, + "step": 6101 + }, + { + "epoch": 5.572602739726028, + "grad_norm": 12.346171379089355, + "learning_rate": 4.920345002536784e-06, + "loss": 0.0811, + "step": 6102 + }, + { + "epoch": 5.573515981735159, + "grad_norm": 47.95651626586914, + "learning_rate": 4.919330289193303e-06, + "loss": 0.451, + "step": 6103 + }, + { + "epoch": 5.574429223744293, + "grad_norm": 1.312968134880066, + "learning_rate": 4.918315575849823e-06, + "loss": 0.0086, + "step": 6104 + }, + { + "epoch": 5.575342465753424, + "grad_norm": 85.6370849609375, + "learning_rate": 4.917300862506342e-06, + "loss": 1.6828, + "step": 6105 + }, + { + "epoch": 5.576255707762557, + "grad_norm": 8.64775562286377, + "learning_rate": 4.916286149162862e-06, + "loss": 0.051, + "step": 6106 + }, + { + "epoch": 5.577168949771689, + "grad_norm": 14.983742713928223, + "learning_rate": 4.9152714358193816e-06, + "loss": 0.1185, + "step": 6107 + }, + { + "epoch": 5.578082191780822, + "grad_norm": 2.7254843711853027, + "learning_rate": 4.9142567224759e-06, + "loss": 0.0163, + "step": 6108 + }, + { + "epoch": 5.578995433789954, + "grad_norm": 30.4426326751709, + "learning_rate": 4.913242009132421e-06, + "loss": 0.2722, + "step": 6109 + }, + { + "epoch": 5.579908675799087, + "grad_norm": 3.889770746231079, + "learning_rate": 4.91222729578894e-06, + "loss": 0.0345, + "step": 6110 + }, + { + "epoch": 5.580821917808219, + "grad_norm": 13.56224536895752, + "learning_rate": 4.9112125824454595e-06, + "loss": 0.0965, + "step": 6111 + }, + { + "epoch": 5.5817351598173515, + "grad_norm": 0.13592997193336487, + "learning_rate": 4.910197869101979e-06, + "loss": 0.0006, + "step": 6112 + }, + { + "epoch": 5.582648401826484, + "grad_norm": 6.370697498321533, + "learning_rate": 4.909183155758499e-06, + "loss": 0.0324, + "step": 6113 + }, + { + "epoch": 5.583561643835616, + "grad_norm": 50.08890914916992, + "learning_rate": 4.9081684424150186e-06, + "loss": 0.4616, + "step": 6114 + }, + { + "epoch": 5.584474885844749, + "grad_norm": 43.102664947509766, + "learning_rate": 4.907153729071537e-06, + "loss": 0.3604, + "step": 6115 + }, + { + "epoch": 5.585388127853881, + "grad_norm": 0.8227907419204712, + "learning_rate": 4.906139015728057e-06, + "loss": 0.0045, + "step": 6116 + }, + { + "epoch": 5.586301369863014, + "grad_norm": 2.2843122482299805, + "learning_rate": 4.905124302384577e-06, + "loss": 0.0125, + "step": 6117 + }, + { + "epoch": 5.587214611872146, + "grad_norm": 0.5656725168228149, + "learning_rate": 4.9041095890410965e-06, + "loss": 0.0051, + "step": 6118 + }, + { + "epoch": 5.588127853881279, + "grad_norm": 11.920206069946289, + "learning_rate": 4.903094875697616e-06, + "loss": 0.1401, + "step": 6119 + }, + { + "epoch": 5.589041095890411, + "grad_norm": 2.5111560821533203, + "learning_rate": 4.902080162354135e-06, + "loss": 0.0152, + "step": 6120 + }, + { + "epoch": 5.5899543378995435, + "grad_norm": 31.83820343017578, + "learning_rate": 4.901065449010655e-06, + "loss": 0.1685, + "step": 6121 + }, + { + "epoch": 5.590867579908676, + "grad_norm": 32.363521575927734, + "learning_rate": 4.900050735667174e-06, + "loss": 0.4028, + "step": 6122 + }, + { + "epoch": 5.5917808219178085, + "grad_norm": 2.3298354148864746, + "learning_rate": 4.899036022323694e-06, + "loss": 0.013, + "step": 6123 + }, + { + "epoch": 5.592694063926941, + "grad_norm": 86.00255584716797, + "learning_rate": 4.898021308980214e-06, + "loss": 1.2727, + "step": 6124 + }, + { + "epoch": 5.593607305936073, + "grad_norm": 41.252197265625, + "learning_rate": 4.897006595636733e-06, + "loss": 0.58, + "step": 6125 + }, + { + "epoch": 5.594520547945206, + "grad_norm": 0.6636388301849365, + "learning_rate": 4.895991882293252e-06, + "loss": 0.0041, + "step": 6126 + }, + { + "epoch": 5.595433789954338, + "grad_norm": 2.0320498943328857, + "learning_rate": 4.894977168949772e-06, + "loss": 0.0153, + "step": 6127 + }, + { + "epoch": 5.59634703196347, + "grad_norm": 1.533373236656189, + "learning_rate": 4.893962455606292e-06, + "loss": 0.0127, + "step": 6128 + }, + { + "epoch": 5.597260273972603, + "grad_norm": 4.155474662780762, + "learning_rate": 4.892947742262811e-06, + "loss": 0.0308, + "step": 6129 + }, + { + "epoch": 5.598173515981735, + "grad_norm": 1.172961950302124, + "learning_rate": 4.89193302891933e-06, + "loss": 0.0112, + "step": 6130 + }, + { + "epoch": 5.599086757990867, + "grad_norm": 0.23334352672100067, + "learning_rate": 4.89091831557585e-06, + "loss": 0.0019, + "step": 6131 + }, + { + "epoch": 5.6, + "grad_norm": 20.489696502685547, + "learning_rate": 4.88990360223237e-06, + "loss": 0.1795, + "step": 6132 + }, + { + "epoch": 5.600913242009132, + "grad_norm": 2.336911916732788, + "learning_rate": 4.888888888888889e-06, + "loss": 0.0243, + "step": 6133 + }, + { + "epoch": 5.6018264840182646, + "grad_norm": 1.094377040863037, + "learning_rate": 4.887874175545409e-06, + "loss": 0.0072, + "step": 6134 + }, + { + "epoch": 5.602739726027397, + "grad_norm": 70.82136535644531, + "learning_rate": 4.886859462201929e-06, + "loss": 1.019, + "step": 6135 + }, + { + "epoch": 5.6036529680365295, + "grad_norm": 1.163124680519104, + "learning_rate": 4.8858447488584476e-06, + "loss": 0.0057, + "step": 6136 + }, + { + "epoch": 5.604566210045662, + "grad_norm": 0.569444477558136, + "learning_rate": 4.884830035514967e-06, + "loss": 0.0034, + "step": 6137 + }, + { + "epoch": 5.605479452054794, + "grad_norm": 3.5542871952056885, + "learning_rate": 4.883815322171487e-06, + "loss": 0.022, + "step": 6138 + }, + { + "epoch": 5.606392694063927, + "grad_norm": 0.10126509517431259, + "learning_rate": 4.882800608828007e-06, + "loss": 0.0007, + "step": 6139 + }, + { + "epoch": 5.607305936073059, + "grad_norm": 5.327071189880371, + "learning_rate": 4.881785895484526e-06, + "loss": 0.0357, + "step": 6140 + }, + { + "epoch": 5.608219178082192, + "grad_norm": 1.3116464614868164, + "learning_rate": 4.880771182141045e-06, + "loss": 0.0072, + "step": 6141 + }, + { + "epoch": 5.609132420091324, + "grad_norm": 2.5217814445495605, + "learning_rate": 4.879756468797565e-06, + "loss": 0.0243, + "step": 6142 + }, + { + "epoch": 5.610045662100457, + "grad_norm": 0.3501986861228943, + "learning_rate": 4.8787417554540846e-06, + "loss": 0.0022, + "step": 6143 + }, + { + "epoch": 5.610958904109589, + "grad_norm": 3.655754566192627, + "learning_rate": 4.877727042110604e-06, + "loss": 0.0259, + "step": 6144 + }, + { + "epoch": 5.6118721461187215, + "grad_norm": 8.687769889831543, + "learning_rate": 4.876712328767124e-06, + "loss": 0.0612, + "step": 6145 + }, + { + "epoch": 5.612785388127854, + "grad_norm": 1.5007432699203491, + "learning_rate": 4.875697615423643e-06, + "loss": 0.0109, + "step": 6146 + }, + { + "epoch": 5.6136986301369864, + "grad_norm": 40.383541107177734, + "learning_rate": 4.8746829020801625e-06, + "loss": 0.3215, + "step": 6147 + }, + { + "epoch": 5.614611872146119, + "grad_norm": 0.5856110453605652, + "learning_rate": 4.873668188736682e-06, + "loss": 0.0046, + "step": 6148 + }, + { + "epoch": 5.615525114155251, + "grad_norm": 69.12161254882812, + "learning_rate": 4.872653475393202e-06, + "loss": 0.745, + "step": 6149 + }, + { + "epoch": 5.616438356164384, + "grad_norm": 10.939131736755371, + "learning_rate": 4.8716387620497216e-06, + "loss": 0.0946, + "step": 6150 + }, + { + "epoch": 5.617351598173516, + "grad_norm": 55.565147399902344, + "learning_rate": 4.87062404870624e-06, + "loss": 0.5517, + "step": 6151 + }, + { + "epoch": 5.618264840182649, + "grad_norm": 0.42682400345802307, + "learning_rate": 4.86960933536276e-06, + "loss": 0.0031, + "step": 6152 + }, + { + "epoch": 5.619178082191781, + "grad_norm": 1.0295929908752441, + "learning_rate": 4.86859462201928e-06, + "loss": 0.0067, + "step": 6153 + }, + { + "epoch": 5.620091324200914, + "grad_norm": 2.155923366546631, + "learning_rate": 4.8675799086757995e-06, + "loss": 0.0173, + "step": 6154 + }, + { + "epoch": 5.621004566210045, + "grad_norm": 6.972536087036133, + "learning_rate": 4.866565195332319e-06, + "loss": 0.0488, + "step": 6155 + }, + { + "epoch": 5.6219178082191785, + "grad_norm": 5.254935264587402, + "learning_rate": 4.865550481988838e-06, + "loss": 0.0446, + "step": 6156 + }, + { + "epoch": 5.62283105022831, + "grad_norm": 0.215129092335701, + "learning_rate": 4.8645357686453585e-06, + "loss": 0.0019, + "step": 6157 + }, + { + "epoch": 5.6237442922374425, + "grad_norm": 13.56357479095459, + "learning_rate": 4.863521055301877e-06, + "loss": 0.0933, + "step": 6158 + }, + { + "epoch": 5.624657534246575, + "grad_norm": 26.503938674926758, + "learning_rate": 4.862506341958397e-06, + "loss": 0.2582, + "step": 6159 + }, + { + "epoch": 5.6255707762557075, + "grad_norm": 11.689404487609863, + "learning_rate": 4.861491628614917e-06, + "loss": 0.1197, + "step": 6160 + }, + { + "epoch": 5.62648401826484, + "grad_norm": 1.3222562074661255, + "learning_rate": 4.860476915271436e-06, + "loss": 0.0127, + "step": 6161 + }, + { + "epoch": 5.627397260273972, + "grad_norm": 26.653779983520508, + "learning_rate": 4.859462201927956e-06, + "loss": 0.1769, + "step": 6162 + }, + { + "epoch": 5.628310502283105, + "grad_norm": 16.747140884399414, + "learning_rate": 4.858447488584475e-06, + "loss": 0.0952, + "step": 6163 + }, + { + "epoch": 5.629223744292237, + "grad_norm": 20.14765739440918, + "learning_rate": 4.857432775240995e-06, + "loss": 0.2692, + "step": 6164 + }, + { + "epoch": 5.63013698630137, + "grad_norm": 8.103009223937988, + "learning_rate": 4.856418061897514e-06, + "loss": 0.0645, + "step": 6165 + }, + { + "epoch": 5.631050228310502, + "grad_norm": 0.24526718258857727, + "learning_rate": 4.855403348554033e-06, + "loss": 0.0015, + "step": 6166 + }, + { + "epoch": 5.631963470319635, + "grad_norm": 13.362220764160156, + "learning_rate": 4.854388635210554e-06, + "loss": 0.0999, + "step": 6167 + }, + { + "epoch": 5.632876712328767, + "grad_norm": 5.207032203674316, + "learning_rate": 4.853373921867073e-06, + "loss": 0.0393, + "step": 6168 + }, + { + "epoch": 5.6337899543378995, + "grad_norm": 13.043059349060059, + "learning_rate": 4.852359208523592e-06, + "loss": 0.1381, + "step": 6169 + }, + { + "epoch": 5.634703196347032, + "grad_norm": 1.3327536582946777, + "learning_rate": 4.851344495180112e-06, + "loss": 0.0081, + "step": 6170 + }, + { + "epoch": 5.635616438356164, + "grad_norm": 26.335372924804688, + "learning_rate": 4.850329781836632e-06, + "loss": 0.273, + "step": 6171 + }, + { + "epoch": 5.636529680365297, + "grad_norm": 37.80538558959961, + "learning_rate": 4.849315068493151e-06, + "loss": 0.2932, + "step": 6172 + }, + { + "epoch": 5.637442922374429, + "grad_norm": 23.213640213012695, + "learning_rate": 4.84830035514967e-06, + "loss": 0.1647, + "step": 6173 + }, + { + "epoch": 5.638356164383562, + "grad_norm": 31.477468490600586, + "learning_rate": 4.84728564180619e-06, + "loss": 0.373, + "step": 6174 + }, + { + "epoch": 5.639269406392694, + "grad_norm": 41.86611557006836, + "learning_rate": 4.84627092846271e-06, + "loss": 0.1808, + "step": 6175 + }, + { + "epoch": 5.640182648401827, + "grad_norm": 1.7818270921707153, + "learning_rate": 4.845256215119229e-06, + "loss": 0.0104, + "step": 6176 + }, + { + "epoch": 5.641095890410959, + "grad_norm": 4.275580406188965, + "learning_rate": 4.844241501775749e-06, + "loss": 0.022, + "step": 6177 + }, + { + "epoch": 5.642009132420092, + "grad_norm": 0.5177743434906006, + "learning_rate": 4.843226788432268e-06, + "loss": 0.0045, + "step": 6178 + }, + { + "epoch": 5.642922374429224, + "grad_norm": 8.551973342895508, + "learning_rate": 4.842212075088788e-06, + "loss": 0.0614, + "step": 6179 + }, + { + "epoch": 5.6438356164383565, + "grad_norm": 4.11021614074707, + "learning_rate": 4.841197361745307e-06, + "loss": 0.0325, + "step": 6180 + }, + { + "epoch": 5.644748858447489, + "grad_norm": 19.931798934936523, + "learning_rate": 4.840182648401827e-06, + "loss": 0.2056, + "step": 6181 + }, + { + "epoch": 5.6456621004566205, + "grad_norm": 14.048386573791504, + "learning_rate": 4.839167935058347e-06, + "loss": 0.0703, + "step": 6182 + }, + { + "epoch": 5.646575342465754, + "grad_norm": 0.8870598673820496, + "learning_rate": 4.8381532217148655e-06, + "loss": 0.005, + "step": 6183 + }, + { + "epoch": 5.647488584474885, + "grad_norm": 0.30193495750427246, + "learning_rate": 4.837138508371386e-06, + "loss": 0.0017, + "step": 6184 + }, + { + "epoch": 5.648401826484018, + "grad_norm": 0.6807970404624939, + "learning_rate": 4.836123795027905e-06, + "loss": 0.0048, + "step": 6185 + }, + { + "epoch": 5.64931506849315, + "grad_norm": 1.9753656387329102, + "learning_rate": 4.8351090816844245e-06, + "loss": 0.0111, + "step": 6186 + }, + { + "epoch": 5.650228310502283, + "grad_norm": 4.88196325302124, + "learning_rate": 4.834094368340944e-06, + "loss": 0.0329, + "step": 6187 + }, + { + "epoch": 5.651141552511415, + "grad_norm": 5.9741740226745605, + "learning_rate": 4.833079654997463e-06, + "loss": 0.035, + "step": 6188 + }, + { + "epoch": 5.652054794520548, + "grad_norm": 1.681877851486206, + "learning_rate": 4.832064941653984e-06, + "loss": 0.0106, + "step": 6189 + }, + { + "epoch": 5.65296803652968, + "grad_norm": 25.854618072509766, + "learning_rate": 4.8310502283105025e-06, + "loss": 0.1996, + "step": 6190 + }, + { + "epoch": 5.653881278538813, + "grad_norm": 0.015403260476887226, + "learning_rate": 4.830035514967022e-06, + "loss": 0.0001, + "step": 6191 + }, + { + "epoch": 5.654794520547945, + "grad_norm": 15.94944953918457, + "learning_rate": 4.829020801623542e-06, + "loss": 0.0927, + "step": 6192 + }, + { + "epoch": 5.6557077625570775, + "grad_norm": 7.874975681304932, + "learning_rate": 4.8280060882800615e-06, + "loss": 0.0703, + "step": 6193 + }, + { + "epoch": 5.65662100456621, + "grad_norm": 48.74298095703125, + "learning_rate": 4.826991374936581e-06, + "loss": 0.4148, + "step": 6194 + }, + { + "epoch": 5.657534246575342, + "grad_norm": 5.131871700286865, + "learning_rate": 4.8259766615931e-06, + "loss": 0.0445, + "step": 6195 + }, + { + "epoch": 5.658447488584475, + "grad_norm": 4.292543411254883, + "learning_rate": 4.82496194824962e-06, + "loss": 0.0402, + "step": 6196 + }, + { + "epoch": 5.659360730593607, + "grad_norm": 6.915599346160889, + "learning_rate": 4.8239472349061395e-06, + "loss": 0.0391, + "step": 6197 + }, + { + "epoch": 5.66027397260274, + "grad_norm": 73.57177734375, + "learning_rate": 4.822932521562659e-06, + "loss": 1.0901, + "step": 6198 + }, + { + "epoch": 5.661187214611872, + "grad_norm": 3.592679500579834, + "learning_rate": 4.821917808219179e-06, + "loss": 0.027, + "step": 6199 + }, + { + "epoch": 5.662100456621005, + "grad_norm": 15.03331470489502, + "learning_rate": 4.820903094875698e-06, + "loss": 0.1202, + "step": 6200 + }, + { + "epoch": 5.663013698630137, + "grad_norm": 0.09686741977930069, + "learning_rate": 4.819888381532217e-06, + "loss": 0.0008, + "step": 6201 + }, + { + "epoch": 5.66392694063927, + "grad_norm": 12.562921524047852, + "learning_rate": 4.818873668188737e-06, + "loss": 0.0809, + "step": 6202 + }, + { + "epoch": 5.664840182648402, + "grad_norm": 4.3868865966796875, + "learning_rate": 4.817858954845257e-06, + "loss": 0.0298, + "step": 6203 + }, + { + "epoch": 5.6657534246575345, + "grad_norm": 96.30401611328125, + "learning_rate": 4.8168442415017765e-06, + "loss": 1.2984, + "step": 6204 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 0.5326431393623352, + "learning_rate": 4.815829528158295e-06, + "loss": 0.0039, + "step": 6205 + }, + { + "epoch": 5.667579908675799, + "grad_norm": 2.357754945755005, + "learning_rate": 4.814814814814815e-06, + "loss": 0.0161, + "step": 6206 + }, + { + "epoch": 5.668493150684932, + "grad_norm": 9.063591003417969, + "learning_rate": 4.813800101471335e-06, + "loss": 0.0549, + "step": 6207 + }, + { + "epoch": 5.669406392694064, + "grad_norm": 21.310659408569336, + "learning_rate": 4.812785388127854e-06, + "loss": 0.1687, + "step": 6208 + }, + { + "epoch": 5.670319634703196, + "grad_norm": 2.9325766563415527, + "learning_rate": 4.811770674784374e-06, + "loss": 0.0284, + "step": 6209 + }, + { + "epoch": 5.671232876712329, + "grad_norm": 1.310929536819458, + "learning_rate": 4.810755961440893e-06, + "loss": 0.0097, + "step": 6210 + }, + { + "epoch": 5.672146118721461, + "grad_norm": 41.028778076171875, + "learning_rate": 4.809741248097413e-06, + "loss": 0.2962, + "step": 6211 + }, + { + "epoch": 5.673059360730593, + "grad_norm": 0.7539507150650024, + "learning_rate": 4.808726534753932e-06, + "loss": 0.006, + "step": 6212 + }, + { + "epoch": 5.673972602739726, + "grad_norm": 5.269160270690918, + "learning_rate": 4.807711821410452e-06, + "loss": 0.0355, + "step": 6213 + }, + { + "epoch": 5.674885844748858, + "grad_norm": 47.66608810424805, + "learning_rate": 4.806697108066972e-06, + "loss": 0.3378, + "step": 6214 + }, + { + "epoch": 5.675799086757991, + "grad_norm": 0.2036321610212326, + "learning_rate": 4.805682394723491e-06, + "loss": 0.0015, + "step": 6215 + }, + { + "epoch": 5.676712328767123, + "grad_norm": 7.247274398803711, + "learning_rate": 4.80466768138001e-06, + "loss": 0.0541, + "step": 6216 + }, + { + "epoch": 5.6776255707762555, + "grad_norm": 4.601781845092773, + "learning_rate": 4.80365296803653e-06, + "loss": 0.0248, + "step": 6217 + }, + { + "epoch": 5.678538812785388, + "grad_norm": 0.3532552123069763, + "learning_rate": 4.80263825469305e-06, + "loss": 0.0025, + "step": 6218 + }, + { + "epoch": 5.67945205479452, + "grad_norm": 10.304889678955078, + "learning_rate": 4.801623541349569e-06, + "loss": 0.0826, + "step": 6219 + }, + { + "epoch": 5.680365296803653, + "grad_norm": 7.839193820953369, + "learning_rate": 4.800608828006089e-06, + "loss": 0.0647, + "step": 6220 + }, + { + "epoch": 5.681278538812785, + "grad_norm": 3.0452473163604736, + "learning_rate": 4.799594114662608e-06, + "loss": 0.0206, + "step": 6221 + }, + { + "epoch": 5.682191780821918, + "grad_norm": 9.411534309387207, + "learning_rate": 4.7985794013191275e-06, + "loss": 0.048, + "step": 6222 + }, + { + "epoch": 5.68310502283105, + "grad_norm": 5.243720054626465, + "learning_rate": 4.797564687975647e-06, + "loss": 0.0499, + "step": 6223 + }, + { + "epoch": 5.684018264840183, + "grad_norm": 11.288482666015625, + "learning_rate": 4.796549974632167e-06, + "loss": 0.0554, + "step": 6224 + }, + { + "epoch": 5.684931506849315, + "grad_norm": 66.39952850341797, + "learning_rate": 4.795535261288687e-06, + "loss": 1.0419, + "step": 6225 + }, + { + "epoch": 5.685844748858448, + "grad_norm": 41.3634033203125, + "learning_rate": 4.7945205479452054e-06, + "loss": 0.4734, + "step": 6226 + }, + { + "epoch": 5.68675799086758, + "grad_norm": 15.263056755065918, + "learning_rate": 4.793505834601725e-06, + "loss": 0.1143, + "step": 6227 + }, + { + "epoch": 5.6876712328767125, + "grad_norm": 47.93193435668945, + "learning_rate": 4.792491121258245e-06, + "loss": 0.3648, + "step": 6228 + }, + { + "epoch": 5.688584474885845, + "grad_norm": 47.36946487426758, + "learning_rate": 4.7914764079147645e-06, + "loss": 0.2874, + "step": 6229 + }, + { + "epoch": 5.689497716894977, + "grad_norm": 3.390753984451294, + "learning_rate": 4.790461694571284e-06, + "loss": 0.03, + "step": 6230 + }, + { + "epoch": 5.69041095890411, + "grad_norm": 97.24192810058594, + "learning_rate": 4.789446981227803e-06, + "loss": 2.829, + "step": 6231 + }, + { + "epoch": 5.691324200913242, + "grad_norm": 1.8728729486465454, + "learning_rate": 4.788432267884323e-06, + "loss": 0.0103, + "step": 6232 + }, + { + "epoch": 5.692237442922375, + "grad_norm": 8.984526634216309, + "learning_rate": 4.7874175545408424e-06, + "loss": 0.0561, + "step": 6233 + }, + { + "epoch": 5.693150684931507, + "grad_norm": 2.8752241134643555, + "learning_rate": 4.786402841197362e-06, + "loss": 0.0109, + "step": 6234 + }, + { + "epoch": 5.69406392694064, + "grad_norm": 14.340533256530762, + "learning_rate": 4.785388127853882e-06, + "loss": 0.0896, + "step": 6235 + }, + { + "epoch": 5.694977168949771, + "grad_norm": 1.0547605752944946, + "learning_rate": 4.784373414510401e-06, + "loss": 0.0043, + "step": 6236 + }, + { + "epoch": 5.695890410958905, + "grad_norm": 0.46365028619766235, + "learning_rate": 4.783358701166921e-06, + "loss": 0.0029, + "step": 6237 + }, + { + "epoch": 5.696803652968036, + "grad_norm": 0.6564566493034363, + "learning_rate": 4.78234398782344e-06, + "loss": 0.0046, + "step": 6238 + }, + { + "epoch": 5.697716894977169, + "grad_norm": 3.4735538959503174, + "learning_rate": 4.78132927447996e-06, + "loss": 0.0271, + "step": 6239 + }, + { + "epoch": 5.698630136986301, + "grad_norm": 2.1133439540863037, + "learning_rate": 4.7803145611364794e-06, + "loss": 0.0163, + "step": 6240 + }, + { + "epoch": 5.6995433789954335, + "grad_norm": 6.861545562744141, + "learning_rate": 4.779299847792998e-06, + "loss": 0.0565, + "step": 6241 + }, + { + "epoch": 5.700456621004566, + "grad_norm": 1.336923599243164, + "learning_rate": 4.778285134449519e-06, + "loss": 0.0088, + "step": 6242 + }, + { + "epoch": 5.701369863013698, + "grad_norm": 0.20653989911079407, + "learning_rate": 4.777270421106038e-06, + "loss": 0.0012, + "step": 6243 + }, + { + "epoch": 5.702283105022831, + "grad_norm": 0.5571025609970093, + "learning_rate": 4.776255707762557e-06, + "loss": 0.0046, + "step": 6244 + }, + { + "epoch": 5.703196347031963, + "grad_norm": 0.37896305322647095, + "learning_rate": 4.775240994419077e-06, + "loss": 0.0034, + "step": 6245 + }, + { + "epoch": 5.704109589041096, + "grad_norm": 29.597963333129883, + "learning_rate": 4.774226281075596e-06, + "loss": 0.2822, + "step": 6246 + }, + { + "epoch": 5.705022831050228, + "grad_norm": 89.74457550048828, + "learning_rate": 4.7732115677321164e-06, + "loss": 2.87, + "step": 6247 + }, + { + "epoch": 5.705936073059361, + "grad_norm": 3.9842123985290527, + "learning_rate": 4.772196854388635e-06, + "loss": 0.024, + "step": 6248 + }, + { + "epoch": 5.706849315068493, + "grad_norm": 79.8553466796875, + "learning_rate": 4.771182141045155e-06, + "loss": 1.8197, + "step": 6249 + }, + { + "epoch": 5.707762557077626, + "grad_norm": 13.68539810180664, + "learning_rate": 4.770167427701675e-06, + "loss": 0.1095, + "step": 6250 + }, + { + "epoch": 5.708675799086758, + "grad_norm": 0.058219823986291885, + "learning_rate": 4.769152714358194e-06, + "loss": 0.0003, + "step": 6251 + }, + { + "epoch": 5.7095890410958905, + "grad_norm": 0.46411746740341187, + "learning_rate": 4.768138001014714e-06, + "loss": 0.0031, + "step": 6252 + }, + { + "epoch": 5.710502283105023, + "grad_norm": 9.373150825500488, + "learning_rate": 4.767123287671233e-06, + "loss": 0.0468, + "step": 6253 + }, + { + "epoch": 5.711415525114155, + "grad_norm": 59.289772033691406, + "learning_rate": 4.766108574327753e-06, + "loss": 0.8243, + "step": 6254 + }, + { + "epoch": 5.712328767123288, + "grad_norm": 28.08940315246582, + "learning_rate": 4.765093860984272e-06, + "loss": 0.2753, + "step": 6255 + }, + { + "epoch": 5.71324200913242, + "grad_norm": 39.898658752441406, + "learning_rate": 4.764079147640792e-06, + "loss": 0.8156, + "step": 6256 + }, + { + "epoch": 5.714155251141553, + "grad_norm": 3.649132013320923, + "learning_rate": 4.763064434297312e-06, + "loss": 0.0225, + "step": 6257 + }, + { + "epoch": 5.715068493150685, + "grad_norm": 159.52691650390625, + "learning_rate": 4.7620497209538305e-06, + "loss": 1.1908, + "step": 6258 + }, + { + "epoch": 5.715981735159818, + "grad_norm": 5.3911967277526855, + "learning_rate": 4.761035007610351e-06, + "loss": 0.03, + "step": 6259 + }, + { + "epoch": 5.71689497716895, + "grad_norm": 1.4110770225524902, + "learning_rate": 4.76002029426687e-06, + "loss": 0.0104, + "step": 6260 + }, + { + "epoch": 5.717808219178083, + "grad_norm": 1.884843349456787, + "learning_rate": 4.75900558092339e-06, + "loss": 0.0119, + "step": 6261 + }, + { + "epoch": 5.718721461187215, + "grad_norm": 17.487834930419922, + "learning_rate": 4.757990867579909e-06, + "loss": 0.1163, + "step": 6262 + }, + { + "epoch": 5.719634703196347, + "grad_norm": 0.23798131942749023, + "learning_rate": 4.756976154236428e-06, + "loss": 0.0016, + "step": 6263 + }, + { + "epoch": 5.72054794520548, + "grad_norm": 10.974420547485352, + "learning_rate": 4.755961440892949e-06, + "loss": 0.0304, + "step": 6264 + }, + { + "epoch": 5.7214611872146115, + "grad_norm": 69.4903793334961, + "learning_rate": 4.7549467275494675e-06, + "loss": 0.5969, + "step": 6265 + }, + { + "epoch": 5.722374429223744, + "grad_norm": 83.6186294555664, + "learning_rate": 4.753932014205987e-06, + "loss": 0.8885, + "step": 6266 + }, + { + "epoch": 5.723287671232876, + "grad_norm": 13.90544319152832, + "learning_rate": 4.752917300862507e-06, + "loss": 0.1044, + "step": 6267 + }, + { + "epoch": 5.724200913242009, + "grad_norm": 4.718982219696045, + "learning_rate": 4.751902587519026e-06, + "loss": 0.0319, + "step": 6268 + }, + { + "epoch": 5.725114155251141, + "grad_norm": 1.1351267099380493, + "learning_rate": 4.750887874175546e-06, + "loss": 0.0076, + "step": 6269 + }, + { + "epoch": 5.726027397260274, + "grad_norm": 62.36274337768555, + "learning_rate": 4.749873160832065e-06, + "loss": 0.4666, + "step": 6270 + }, + { + "epoch": 5.726940639269406, + "grad_norm": 7.496153354644775, + "learning_rate": 4.748858447488585e-06, + "loss": 0.0148, + "step": 6271 + }, + { + "epoch": 5.727853881278539, + "grad_norm": 68.45071411132812, + "learning_rate": 4.7478437341451045e-06, + "loss": 1.2613, + "step": 6272 + }, + { + "epoch": 5.728767123287671, + "grad_norm": 0.0796283707022667, + "learning_rate": 4.746829020801624e-06, + "loss": 0.0008, + "step": 6273 + }, + { + "epoch": 5.729680365296804, + "grad_norm": 1.1672964096069336, + "learning_rate": 4.745814307458144e-06, + "loss": 0.0072, + "step": 6274 + }, + { + "epoch": 5.730593607305936, + "grad_norm": 2.16306734085083, + "learning_rate": 4.744799594114663e-06, + "loss": 0.0193, + "step": 6275 + }, + { + "epoch": 5.7315068493150685, + "grad_norm": 1.277582049369812, + "learning_rate": 4.743784880771182e-06, + "loss": 0.0079, + "step": 6276 + }, + { + "epoch": 5.732420091324201, + "grad_norm": 26.970216751098633, + "learning_rate": 4.742770167427702e-06, + "loss": 0.2247, + "step": 6277 + }, + { + "epoch": 5.733333333333333, + "grad_norm": 0.5018754601478577, + "learning_rate": 4.741755454084222e-06, + "loss": 0.0045, + "step": 6278 + }, + { + "epoch": 5.734246575342466, + "grad_norm": 2.5454483032226562, + "learning_rate": 4.7407407407407415e-06, + "loss": 0.0152, + "step": 6279 + }, + { + "epoch": 5.735159817351598, + "grad_norm": 1.4474146366119385, + "learning_rate": 4.73972602739726e-06, + "loss": 0.0089, + "step": 6280 + }, + { + "epoch": 5.736073059360731, + "grad_norm": 7.587601184844971, + "learning_rate": 4.73871131405378e-06, + "loss": 0.0633, + "step": 6281 + }, + { + "epoch": 5.736986301369863, + "grad_norm": 0.8566759824752808, + "learning_rate": 4.7376966007103e-06, + "loss": 0.0023, + "step": 6282 + }, + { + "epoch": 5.737899543378996, + "grad_norm": 6.310000896453857, + "learning_rate": 4.736681887366819e-06, + "loss": 0.0485, + "step": 6283 + }, + { + "epoch": 5.738812785388128, + "grad_norm": 30.606704711914062, + "learning_rate": 4.735667174023339e-06, + "loss": 0.2705, + "step": 6284 + }, + { + "epoch": 5.739726027397261, + "grad_norm": 77.85704040527344, + "learning_rate": 4.734652460679858e-06, + "loss": 0.0401, + "step": 6285 + }, + { + "epoch": 5.740639269406393, + "grad_norm": 83.4261703491211, + "learning_rate": 4.733637747336378e-06, + "loss": 0.3033, + "step": 6286 + }, + { + "epoch": 5.7415525114155255, + "grad_norm": 13.600371360778809, + "learning_rate": 4.732623033992897e-06, + "loss": 0.0749, + "step": 6287 + }, + { + "epoch": 5.742465753424657, + "grad_norm": 2.6699180603027344, + "learning_rate": 4.731608320649417e-06, + "loss": 0.0155, + "step": 6288 + }, + { + "epoch": 5.74337899543379, + "grad_norm": 4.8108086585998535, + "learning_rate": 4.730593607305937e-06, + "loss": 0.0279, + "step": 6289 + }, + { + "epoch": 5.744292237442922, + "grad_norm": 0.14039888978004456, + "learning_rate": 4.7295788939624556e-06, + "loss": 0.0007, + "step": 6290 + }, + { + "epoch": 5.745205479452055, + "grad_norm": 7.16610860824585, + "learning_rate": 4.728564180618975e-06, + "loss": 0.0627, + "step": 6291 + }, + { + "epoch": 5.746118721461187, + "grad_norm": 1.005569338798523, + "learning_rate": 4.727549467275495e-06, + "loss": 0.0061, + "step": 6292 + }, + { + "epoch": 5.747031963470319, + "grad_norm": 7.403438568115234, + "learning_rate": 4.726534753932015e-06, + "loss": 0.0495, + "step": 6293 + }, + { + "epoch": 5.747945205479452, + "grad_norm": 30.61896514892578, + "learning_rate": 4.725520040588534e-06, + "loss": 0.1575, + "step": 6294 + }, + { + "epoch": 5.748858447488584, + "grad_norm": 1.4003325700759888, + "learning_rate": 4.724505327245054e-06, + "loss": 0.0125, + "step": 6295 + }, + { + "epoch": 5.749771689497717, + "grad_norm": 1.489829659461975, + "learning_rate": 4.723490613901573e-06, + "loss": 0.0097, + "step": 6296 + }, + { + "epoch": 5.750684931506849, + "grad_norm": 15.286921501159668, + "learning_rate": 4.7224759005580926e-06, + "loss": 0.0969, + "step": 6297 + }, + { + "epoch": 5.751598173515982, + "grad_norm": 1.850224494934082, + "learning_rate": 4.721461187214612e-06, + "loss": 0.0132, + "step": 6298 + }, + { + "epoch": 5.752511415525114, + "grad_norm": 2.982309341430664, + "learning_rate": 4.720446473871132e-06, + "loss": 0.0258, + "step": 6299 + }, + { + "epoch": 5.7534246575342465, + "grad_norm": 1.5499919652938843, + "learning_rate": 4.719431760527652e-06, + "loss": 0.0111, + "step": 6300 + }, + { + "epoch": 5.754337899543379, + "grad_norm": 11.811062812805176, + "learning_rate": 4.7184170471841705e-06, + "loss": 0.1352, + "step": 6301 + }, + { + "epoch": 5.755251141552511, + "grad_norm": 0.5679688453674316, + "learning_rate": 4.71740233384069e-06, + "loss": 0.0039, + "step": 6302 + }, + { + "epoch": 5.756164383561644, + "grad_norm": 1.1695523262023926, + "learning_rate": 4.71638762049721e-06, + "loss": 0.0068, + "step": 6303 + }, + { + "epoch": 5.757077625570776, + "grad_norm": 2.203482151031494, + "learning_rate": 4.7153729071537296e-06, + "loss": 0.0104, + "step": 6304 + }, + { + "epoch": 5.757990867579909, + "grad_norm": 0.3010949194431305, + "learning_rate": 4.714358193810249e-06, + "loss": 0.0021, + "step": 6305 + }, + { + "epoch": 5.758904109589041, + "grad_norm": 17.483064651489258, + "learning_rate": 4.713343480466768e-06, + "loss": 0.1019, + "step": 6306 + }, + { + "epoch": 5.759817351598174, + "grad_norm": 0.32501402497291565, + "learning_rate": 4.712328767123288e-06, + "loss": 0.0022, + "step": 6307 + }, + { + "epoch": 5.760730593607306, + "grad_norm": 45.183189392089844, + "learning_rate": 4.7113140537798075e-06, + "loss": 0.3123, + "step": 6308 + }, + { + "epoch": 5.761643835616439, + "grad_norm": 3.6501009464263916, + "learning_rate": 4.710299340436327e-06, + "loss": 0.0199, + "step": 6309 + }, + { + "epoch": 5.762557077625571, + "grad_norm": 3.219440460205078, + "learning_rate": 4.709284627092847e-06, + "loss": 0.0236, + "step": 6310 + }, + { + "epoch": 5.7634703196347035, + "grad_norm": 0.2460600584745407, + "learning_rate": 4.708269913749366e-06, + "loss": 0.0016, + "step": 6311 + }, + { + "epoch": 5.764383561643836, + "grad_norm": 2.96557879447937, + "learning_rate": 4.707255200405885e-06, + "loss": 0.0215, + "step": 6312 + }, + { + "epoch": 5.765296803652968, + "grad_norm": 0.38358449935913086, + "learning_rate": 4.706240487062405e-06, + "loss": 0.0022, + "step": 6313 + }, + { + "epoch": 5.766210045662101, + "grad_norm": 2.7226343154907227, + "learning_rate": 4.705225773718925e-06, + "loss": 0.0172, + "step": 6314 + }, + { + "epoch": 5.767123287671232, + "grad_norm": 0.40661075711250305, + "learning_rate": 4.7042110603754445e-06, + "loss": 0.0034, + "step": 6315 + }, + { + "epoch": 5.768036529680366, + "grad_norm": 72.62373352050781, + "learning_rate": 4.703196347031963e-06, + "loss": 1.8036, + "step": 6316 + }, + { + "epoch": 5.768949771689497, + "grad_norm": 75.79652404785156, + "learning_rate": 4.702181633688484e-06, + "loss": 1.0278, + "step": 6317 + }, + { + "epoch": 5.76986301369863, + "grad_norm": 2.7379679679870605, + "learning_rate": 4.701166920345003e-06, + "loss": 0.0177, + "step": 6318 + }, + { + "epoch": 5.770776255707762, + "grad_norm": 119.58436584472656, + "learning_rate": 4.700152207001522e-06, + "loss": 0.6434, + "step": 6319 + }, + { + "epoch": 5.771689497716895, + "grad_norm": 7.019618511199951, + "learning_rate": 4.699137493658042e-06, + "loss": 0.0457, + "step": 6320 + }, + { + "epoch": 5.772602739726027, + "grad_norm": 0.1052672415971756, + "learning_rate": 4.698122780314561e-06, + "loss": 0.0007, + "step": 6321 + }, + { + "epoch": 5.77351598173516, + "grad_norm": 3.1765389442443848, + "learning_rate": 4.6971080669710815e-06, + "loss": 0.0214, + "step": 6322 + }, + { + "epoch": 5.774429223744292, + "grad_norm": 3.4024696350097656, + "learning_rate": 4.6960933536276e-06, + "loss": 0.0166, + "step": 6323 + }, + { + "epoch": 5.7753424657534245, + "grad_norm": 36.180789947509766, + "learning_rate": 4.69507864028412e-06, + "loss": 0.1269, + "step": 6324 + }, + { + "epoch": 5.776255707762557, + "grad_norm": 75.8991470336914, + "learning_rate": 4.69406392694064e-06, + "loss": 1.1192, + "step": 6325 + }, + { + "epoch": 5.777168949771689, + "grad_norm": 0.9386076927185059, + "learning_rate": 4.6930492135971586e-06, + "loss": 0.0066, + "step": 6326 + }, + { + "epoch": 5.778082191780822, + "grad_norm": 1.8757257461547852, + "learning_rate": 4.692034500253679e-06, + "loss": 0.0131, + "step": 6327 + }, + { + "epoch": 5.778995433789954, + "grad_norm": 15.015893936157227, + "learning_rate": 4.691019786910198e-06, + "loss": 0.1171, + "step": 6328 + }, + { + "epoch": 5.779908675799087, + "grad_norm": 0.6816348433494568, + "learning_rate": 4.690005073566718e-06, + "loss": 0.0048, + "step": 6329 + }, + { + "epoch": 5.780821917808219, + "grad_norm": 3.4283385276794434, + "learning_rate": 4.688990360223237e-06, + "loss": 0.0215, + "step": 6330 + }, + { + "epoch": 5.781735159817352, + "grad_norm": 1.5965230464935303, + "learning_rate": 4.687975646879756e-06, + "loss": 0.013, + "step": 6331 + }, + { + "epoch": 5.782648401826484, + "grad_norm": 3.162971258163452, + "learning_rate": 4.686960933536277e-06, + "loss": 0.0259, + "step": 6332 + }, + { + "epoch": 5.7835616438356166, + "grad_norm": 25.332767486572266, + "learning_rate": 4.6859462201927956e-06, + "loss": 0.2308, + "step": 6333 + }, + { + "epoch": 5.784474885844749, + "grad_norm": 63.254329681396484, + "learning_rate": 4.684931506849315e-06, + "loss": 0.1855, + "step": 6334 + }, + { + "epoch": 5.7853881278538815, + "grad_norm": 0.6333000659942627, + "learning_rate": 4.683916793505835e-06, + "loss": 0.0048, + "step": 6335 + }, + { + "epoch": 5.786301369863014, + "grad_norm": 3.583315134048462, + "learning_rate": 4.682902080162355e-06, + "loss": 0.0272, + "step": 6336 + }, + { + "epoch": 5.787214611872146, + "grad_norm": 0.08942751586437225, + "learning_rate": 4.681887366818874e-06, + "loss": 0.0007, + "step": 6337 + }, + { + "epoch": 5.788127853881279, + "grad_norm": 3.2114107608795166, + "learning_rate": 4.680872653475393e-06, + "loss": 0.0135, + "step": 6338 + }, + { + "epoch": 5.789041095890411, + "grad_norm": 0.9336119294166565, + "learning_rate": 4.679857940131914e-06, + "loss": 0.0063, + "step": 6339 + }, + { + "epoch": 5.789954337899544, + "grad_norm": 3.386265516281128, + "learning_rate": 4.6788432267884326e-06, + "loss": 0.0227, + "step": 6340 + }, + { + "epoch": 5.790867579908676, + "grad_norm": 0.11029919236898422, + "learning_rate": 4.677828513444952e-06, + "loss": 0.0008, + "step": 6341 + }, + { + "epoch": 5.791780821917808, + "grad_norm": 2.182194232940674, + "learning_rate": 4.676813800101472e-06, + "loss": 0.0148, + "step": 6342 + }, + { + "epoch": 5.792694063926941, + "grad_norm": 8.725224494934082, + "learning_rate": 4.675799086757991e-06, + "loss": 0.0643, + "step": 6343 + }, + { + "epoch": 5.793607305936073, + "grad_norm": 1.5506550073623657, + "learning_rate": 4.674784373414511e-06, + "loss": 0.0122, + "step": 6344 + }, + { + "epoch": 5.794520547945205, + "grad_norm": 0.24114222824573517, + "learning_rate": 4.67376966007103e-06, + "loss": 0.0018, + "step": 6345 + }, + { + "epoch": 5.7954337899543376, + "grad_norm": 1.1052191257476807, + "learning_rate": 4.67275494672755e-06, + "loss": 0.0068, + "step": 6346 + }, + { + "epoch": 5.79634703196347, + "grad_norm": 5.463517189025879, + "learning_rate": 4.6717402333840695e-06, + "loss": 0.047, + "step": 6347 + }, + { + "epoch": 5.7972602739726025, + "grad_norm": 0.916716992855072, + "learning_rate": 4.670725520040588e-06, + "loss": 0.0064, + "step": 6348 + }, + { + "epoch": 5.798173515981735, + "grad_norm": 10.660006523132324, + "learning_rate": 4.669710806697109e-06, + "loss": 0.0842, + "step": 6349 + }, + { + "epoch": 5.799086757990867, + "grad_norm": 3.49184513092041, + "learning_rate": 4.668696093353628e-06, + "loss": 0.0255, + "step": 6350 + }, + { + "epoch": 5.8, + "grad_norm": 62.9752082824707, + "learning_rate": 4.6676813800101475e-06, + "loss": 0.508, + "step": 6351 + }, + { + "epoch": 5.800913242009132, + "grad_norm": 5.787251949310303, + "learning_rate": 4.666666666666667e-06, + "loss": 0.0402, + "step": 6352 + }, + { + "epoch": 5.801826484018265, + "grad_norm": 0.7905371189117432, + "learning_rate": 4.665651953323187e-06, + "loss": 0.0056, + "step": 6353 + }, + { + "epoch": 5.802739726027397, + "grad_norm": 15.060397148132324, + "learning_rate": 4.6646372399797065e-06, + "loss": 0.0936, + "step": 6354 + }, + { + "epoch": 5.80365296803653, + "grad_norm": 81.27558898925781, + "learning_rate": 4.663622526636225e-06, + "loss": 1.1008, + "step": 6355 + }, + { + "epoch": 5.804566210045662, + "grad_norm": 2.62900972366333, + "learning_rate": 4.662607813292745e-06, + "loss": 0.0179, + "step": 6356 + }, + { + "epoch": 5.8054794520547945, + "grad_norm": 1.4463907480239868, + "learning_rate": 4.661593099949265e-06, + "loss": 0.0094, + "step": 6357 + }, + { + "epoch": 5.806392694063927, + "grad_norm": 50.06877899169922, + "learning_rate": 4.6605783866057845e-06, + "loss": 0.6257, + "step": 6358 + }, + { + "epoch": 5.8073059360730594, + "grad_norm": 6.01922607421875, + "learning_rate": 4.659563673262304e-06, + "loss": 0.0254, + "step": 6359 + }, + { + "epoch": 5.808219178082192, + "grad_norm": 13.552370071411133, + "learning_rate": 4.658548959918823e-06, + "loss": 0.1196, + "step": 6360 + }, + { + "epoch": 5.809132420091324, + "grad_norm": 53.516361236572266, + "learning_rate": 4.657534246575343e-06, + "loss": 0.3432, + "step": 6361 + }, + { + "epoch": 5.810045662100457, + "grad_norm": 2.8487582206726074, + "learning_rate": 4.656519533231862e-06, + "loss": 0.0214, + "step": 6362 + }, + { + "epoch": 5.810958904109589, + "grad_norm": 19.505434036254883, + "learning_rate": 4.655504819888382e-06, + "loss": 0.1276, + "step": 6363 + }, + { + "epoch": 5.811872146118722, + "grad_norm": 36.25993728637695, + "learning_rate": 4.654490106544902e-06, + "loss": 0.3188, + "step": 6364 + }, + { + "epoch": 5.812785388127854, + "grad_norm": 0.06494947522878647, + "learning_rate": 4.653475393201421e-06, + "loss": 0.0005, + "step": 6365 + }, + { + "epoch": 5.813698630136987, + "grad_norm": 15.217336654663086, + "learning_rate": 4.65246067985794e-06, + "loss": 0.139, + "step": 6366 + }, + { + "epoch": 5.814611872146119, + "grad_norm": 3.173311948776245, + "learning_rate": 4.65144596651446e-06, + "loss": 0.0123, + "step": 6367 + }, + { + "epoch": 5.8155251141552515, + "grad_norm": 0.18858474493026733, + "learning_rate": 4.65043125317098e-06, + "loss": 0.0011, + "step": 6368 + }, + { + "epoch": 5.816438356164383, + "grad_norm": 53.71699905395508, + "learning_rate": 4.649416539827499e-06, + "loss": 0.5814, + "step": 6369 + }, + { + "epoch": 5.817351598173516, + "grad_norm": 6.314499378204346, + "learning_rate": 4.648401826484018e-06, + "loss": 0.0347, + "step": 6370 + }, + { + "epoch": 5.818264840182648, + "grad_norm": 11.267075538635254, + "learning_rate": 4.647387113140538e-06, + "loss": 0.0783, + "step": 6371 + }, + { + "epoch": 5.8191780821917805, + "grad_norm": 12.3480863571167, + "learning_rate": 4.646372399797058e-06, + "loss": 0.0764, + "step": 6372 + }, + { + "epoch": 5.820091324200913, + "grad_norm": 0.11911693960428238, + "learning_rate": 4.645357686453577e-06, + "loss": 0.0006, + "step": 6373 + }, + { + "epoch": 5.821004566210045, + "grad_norm": 8.740556716918945, + "learning_rate": 4.644342973110097e-06, + "loss": 0.0535, + "step": 6374 + }, + { + "epoch": 5.821917808219178, + "grad_norm": 3.9740703105926514, + "learning_rate": 4.643328259766617e-06, + "loss": 0.0223, + "step": 6375 + }, + { + "epoch": 5.82283105022831, + "grad_norm": 2.5824551582336426, + "learning_rate": 4.6423135464231355e-06, + "loss": 0.0183, + "step": 6376 + }, + { + "epoch": 5.823744292237443, + "grad_norm": 0.8550006747245789, + "learning_rate": 4.641298833079655e-06, + "loss": 0.0045, + "step": 6377 + }, + { + "epoch": 5.824657534246575, + "grad_norm": 23.57609748840332, + "learning_rate": 4.640284119736175e-06, + "loss": 0.1237, + "step": 6378 + }, + { + "epoch": 5.825570776255708, + "grad_norm": 1.0776759386062622, + "learning_rate": 4.639269406392695e-06, + "loss": 0.006, + "step": 6379 + }, + { + "epoch": 5.82648401826484, + "grad_norm": 0.5209550857543945, + "learning_rate": 4.638254693049214e-06, + "loss": 0.0027, + "step": 6380 + }, + { + "epoch": 5.8273972602739725, + "grad_norm": 87.09274291992188, + "learning_rate": 4.637239979705733e-06, + "loss": 1.7001, + "step": 6381 + }, + { + "epoch": 5.828310502283105, + "grad_norm": 1.178368091583252, + "learning_rate": 4.636225266362253e-06, + "loss": 0.005, + "step": 6382 + }, + { + "epoch": 5.829223744292237, + "grad_norm": 62.01606750488281, + "learning_rate": 4.6352105530187725e-06, + "loss": 0.4256, + "step": 6383 + }, + { + "epoch": 5.83013698630137, + "grad_norm": 8.144169807434082, + "learning_rate": 4.634195839675292e-06, + "loss": 0.0441, + "step": 6384 + }, + { + "epoch": 5.831050228310502, + "grad_norm": 16.427846908569336, + "learning_rate": 4.633181126331812e-06, + "loss": 0.1812, + "step": 6385 + }, + { + "epoch": 5.831963470319635, + "grad_norm": 1.0125596523284912, + "learning_rate": 4.632166412988331e-06, + "loss": 0.0084, + "step": 6386 + }, + { + "epoch": 5.832876712328767, + "grad_norm": 41.478885650634766, + "learning_rate": 4.6311516996448505e-06, + "loss": 0.6418, + "step": 6387 + }, + { + "epoch": 5.8337899543379, + "grad_norm": 48.570343017578125, + "learning_rate": 4.63013698630137e-06, + "loss": 0.2245, + "step": 6388 + }, + { + "epoch": 5.834703196347032, + "grad_norm": 4.431980609893799, + "learning_rate": 4.62912227295789e-06, + "loss": 0.03, + "step": 6389 + }, + { + "epoch": 5.835616438356165, + "grad_norm": 0.24689583480358124, + "learning_rate": 4.6281075596144095e-06, + "loss": 0.0014, + "step": 6390 + }, + { + "epoch": 5.836529680365297, + "grad_norm": 118.50316619873047, + "learning_rate": 4.627092846270928e-06, + "loss": 1.6909, + "step": 6391 + }, + { + "epoch": 5.8374429223744295, + "grad_norm": 0.6554612517356873, + "learning_rate": 4.626078132927448e-06, + "loss": 0.0058, + "step": 6392 + }, + { + "epoch": 5.838356164383562, + "grad_norm": 58.16248321533203, + "learning_rate": 4.625063419583968e-06, + "loss": 0.9593, + "step": 6393 + }, + { + "epoch": 5.839269406392694, + "grad_norm": 33.42606735229492, + "learning_rate": 4.6240487062404875e-06, + "loss": 0.1873, + "step": 6394 + }, + { + "epoch": 5.840182648401827, + "grad_norm": 1.2439571619033813, + "learning_rate": 4.623033992897007e-06, + "loss": 0.0084, + "step": 6395 + }, + { + "epoch": 5.8410958904109584, + "grad_norm": 1.1335729360580444, + "learning_rate": 4.622019279553526e-06, + "loss": 0.0084, + "step": 6396 + }, + { + "epoch": 5.842009132420092, + "grad_norm": 111.12181854248047, + "learning_rate": 4.6210045662100465e-06, + "loss": 0.5745, + "step": 6397 + }, + { + "epoch": 5.842922374429223, + "grad_norm": 2.1747050285339355, + "learning_rate": 4.619989852866565e-06, + "loss": 0.0203, + "step": 6398 + }, + { + "epoch": 5.843835616438356, + "grad_norm": 9.445022583007812, + "learning_rate": 4.618975139523085e-06, + "loss": 0.0557, + "step": 6399 + }, + { + "epoch": 5.844748858447488, + "grad_norm": 4.668354034423828, + "learning_rate": 4.617960426179605e-06, + "loss": 0.0351, + "step": 6400 + }, + { + "epoch": 5.845662100456621, + "grad_norm": 17.5120792388916, + "learning_rate": 4.616945712836124e-06, + "loss": 0.0934, + "step": 6401 + }, + { + "epoch": 5.846575342465753, + "grad_norm": 18.407535552978516, + "learning_rate": 4.615930999492644e-06, + "loss": 0.2316, + "step": 6402 + }, + { + "epoch": 5.847488584474886, + "grad_norm": 6.577057361602783, + "learning_rate": 4.614916286149163e-06, + "loss": 0.0345, + "step": 6403 + }, + { + "epoch": 5.848401826484018, + "grad_norm": 1.2459369897842407, + "learning_rate": 4.613901572805683e-06, + "loss": 0.0062, + "step": 6404 + }, + { + "epoch": 5.8493150684931505, + "grad_norm": 1.460628867149353, + "learning_rate": 4.612886859462202e-06, + "loss": 0.0118, + "step": 6405 + }, + { + "epoch": 5.850228310502283, + "grad_norm": 1.2734293937683105, + "learning_rate": 4.611872146118721e-06, + "loss": 0.0078, + "step": 6406 + }, + { + "epoch": 5.851141552511415, + "grad_norm": 0.5266693830490112, + "learning_rate": 4.610857432775242e-06, + "loss": 0.0032, + "step": 6407 + }, + { + "epoch": 5.852054794520548, + "grad_norm": 39.045745849609375, + "learning_rate": 4.609842719431761e-06, + "loss": 0.3133, + "step": 6408 + }, + { + "epoch": 5.85296803652968, + "grad_norm": 0.12589038908481598, + "learning_rate": 4.60882800608828e-06, + "loss": 0.0008, + "step": 6409 + }, + { + "epoch": 5.853881278538813, + "grad_norm": 87.10112762451172, + "learning_rate": 4.6078132927448e-06, + "loss": 3.2165, + "step": 6410 + }, + { + "epoch": 5.854794520547945, + "grad_norm": 12.078407287597656, + "learning_rate": 4.606798579401319e-06, + "loss": 0.0961, + "step": 6411 + }, + { + "epoch": 5.855707762557078, + "grad_norm": 0.3316490352153778, + "learning_rate": 4.605783866057839e-06, + "loss": 0.003, + "step": 6412 + }, + { + "epoch": 5.85662100456621, + "grad_norm": 61.43522262573242, + "learning_rate": 4.604769152714358e-06, + "loss": 0.5073, + "step": 6413 + }, + { + "epoch": 5.857534246575343, + "grad_norm": 34.94548034667969, + "learning_rate": 4.603754439370878e-06, + "loss": 0.3803, + "step": 6414 + }, + { + "epoch": 5.858447488584475, + "grad_norm": 20.68385124206543, + "learning_rate": 4.602739726027398e-06, + "loss": 0.1163, + "step": 6415 + }, + { + "epoch": 5.8593607305936075, + "grad_norm": 2.878553867340088, + "learning_rate": 4.601725012683917e-06, + "loss": 0.0204, + "step": 6416 + }, + { + "epoch": 5.86027397260274, + "grad_norm": 0.7495649456977844, + "learning_rate": 4.600710299340437e-06, + "loss": 0.0063, + "step": 6417 + }, + { + "epoch": 5.861187214611872, + "grad_norm": 2.0889604091644287, + "learning_rate": 4.599695585996956e-06, + "loss": 0.0158, + "step": 6418 + }, + { + "epoch": 5.862100456621005, + "grad_norm": 5.020209312438965, + "learning_rate": 4.598680872653476e-06, + "loss": 0.0324, + "step": 6419 + }, + { + "epoch": 5.863013698630137, + "grad_norm": 12.356657028198242, + "learning_rate": 4.597666159309995e-06, + "loss": 0.1174, + "step": 6420 + }, + { + "epoch": 5.86392694063927, + "grad_norm": 0.050912827253341675, + "learning_rate": 4.596651445966515e-06, + "loss": 0.0004, + "step": 6421 + }, + { + "epoch": 5.864840182648402, + "grad_norm": 9.477063179016113, + "learning_rate": 4.595636732623035e-06, + "loss": 0.0663, + "step": 6422 + }, + { + "epoch": 5.865753424657534, + "grad_norm": 11.750054359436035, + "learning_rate": 4.5946220192795534e-06, + "loss": 0.0677, + "step": 6423 + }, + { + "epoch": 5.866666666666667, + "grad_norm": 2.830549955368042, + "learning_rate": 4.593607305936074e-06, + "loss": 0.0187, + "step": 6424 + }, + { + "epoch": 5.867579908675799, + "grad_norm": 5.28857946395874, + "learning_rate": 4.592592592592593e-06, + "loss": 0.0426, + "step": 6425 + }, + { + "epoch": 5.868493150684931, + "grad_norm": 117.83121490478516, + "learning_rate": 4.5915778792491125e-06, + "loss": 2.1251, + "step": 6426 + }, + { + "epoch": 5.869406392694064, + "grad_norm": 5.3877973556518555, + "learning_rate": 4.590563165905632e-06, + "loss": 0.0293, + "step": 6427 + }, + { + "epoch": 5.870319634703196, + "grad_norm": 0.27929338812828064, + "learning_rate": 4.589548452562151e-06, + "loss": 0.0017, + "step": 6428 + }, + { + "epoch": 5.8712328767123285, + "grad_norm": 16.617345809936523, + "learning_rate": 4.588533739218672e-06, + "loss": 0.1147, + "step": 6429 + }, + { + "epoch": 5.872146118721461, + "grad_norm": 3.03617787361145, + "learning_rate": 4.5875190258751904e-06, + "loss": 0.0255, + "step": 6430 + }, + { + "epoch": 5.873059360730593, + "grad_norm": 2.990602970123291, + "learning_rate": 4.58650431253171e-06, + "loss": 0.0166, + "step": 6431 + }, + { + "epoch": 5.873972602739726, + "grad_norm": 69.69731903076172, + "learning_rate": 4.58548959918823e-06, + "loss": 0.6108, + "step": 6432 + }, + { + "epoch": 5.874885844748858, + "grad_norm": 0.08110474050045013, + "learning_rate": 4.5844748858447495e-06, + "loss": 0.0007, + "step": 6433 + }, + { + "epoch": 5.875799086757991, + "grad_norm": 13.055305480957031, + "learning_rate": 4.583460172501269e-06, + "loss": 0.0855, + "step": 6434 + }, + { + "epoch": 5.876712328767123, + "grad_norm": 0.40921732783317566, + "learning_rate": 4.582445459157788e-06, + "loss": 0.0031, + "step": 6435 + }, + { + "epoch": 5.877625570776256, + "grad_norm": 0.9120984077453613, + "learning_rate": 4.581430745814308e-06, + "loss": 0.0039, + "step": 6436 + }, + { + "epoch": 5.878538812785388, + "grad_norm": 1.0850497484207153, + "learning_rate": 4.5804160324708274e-06, + "loss": 0.0079, + "step": 6437 + }, + { + "epoch": 5.879452054794521, + "grad_norm": 6.170816898345947, + "learning_rate": 4.579401319127347e-06, + "loss": 0.0384, + "step": 6438 + }, + { + "epoch": 5.880365296803653, + "grad_norm": 2.0151350498199463, + "learning_rate": 4.578386605783867e-06, + "loss": 0.0173, + "step": 6439 + }, + { + "epoch": 5.8812785388127855, + "grad_norm": 23.81354522705078, + "learning_rate": 4.577371892440386e-06, + "loss": 0.1802, + "step": 6440 + }, + { + "epoch": 5.882191780821918, + "grad_norm": 1.3295880556106567, + "learning_rate": 4.576357179096905e-06, + "loss": 0.0103, + "step": 6441 + }, + { + "epoch": 5.88310502283105, + "grad_norm": 46.93523025512695, + "learning_rate": 4.575342465753425e-06, + "loss": 0.266, + "step": 6442 + }, + { + "epoch": 5.884018264840183, + "grad_norm": 1.2501901388168335, + "learning_rate": 4.574327752409945e-06, + "loss": 0.0084, + "step": 6443 + }, + { + "epoch": 5.884931506849315, + "grad_norm": 46.992027282714844, + "learning_rate": 4.5733130390664644e-06, + "loss": 0.5399, + "step": 6444 + }, + { + "epoch": 5.885844748858448, + "grad_norm": 36.643333435058594, + "learning_rate": 4.572298325722983e-06, + "loss": 0.2155, + "step": 6445 + }, + { + "epoch": 5.88675799086758, + "grad_norm": 4.216553211212158, + "learning_rate": 4.571283612379503e-06, + "loss": 0.0161, + "step": 6446 + }, + { + "epoch": 5.887671232876713, + "grad_norm": 0.8913848996162415, + "learning_rate": 4.570268899036023e-06, + "loss": 0.0054, + "step": 6447 + }, + { + "epoch": 5.888584474885845, + "grad_norm": 1.734753966331482, + "learning_rate": 4.569254185692542e-06, + "loss": 0.0106, + "step": 6448 + }, + { + "epoch": 5.889497716894978, + "grad_norm": 2.6250736713409424, + "learning_rate": 4.568239472349062e-06, + "loss": 0.0132, + "step": 6449 + }, + { + "epoch": 5.890410958904109, + "grad_norm": 12.773390769958496, + "learning_rate": 4.567224759005581e-06, + "loss": 0.0906, + "step": 6450 + }, + { + "epoch": 5.8913242009132425, + "grad_norm": 6.514023780822754, + "learning_rate": 4.566210045662101e-06, + "loss": 0.0373, + "step": 6451 + }, + { + "epoch": 5.892237442922374, + "grad_norm": 119.97955322265625, + "learning_rate": 4.56519533231862e-06, + "loss": 0.9276, + "step": 6452 + }, + { + "epoch": 5.8931506849315065, + "grad_norm": 0.7252671718597412, + "learning_rate": 4.56418061897514e-06, + "loss": 0.0066, + "step": 6453 + }, + { + "epoch": 5.894063926940639, + "grad_norm": 40.10837936401367, + "learning_rate": 4.56316590563166e-06, + "loss": 0.3299, + "step": 6454 + }, + { + "epoch": 5.894977168949771, + "grad_norm": 78.1488265991211, + "learning_rate": 4.562151192288179e-06, + "loss": 0.9742, + "step": 6455 + }, + { + "epoch": 5.895890410958904, + "grad_norm": 12.258306503295898, + "learning_rate": 4.561136478944698e-06, + "loss": 0.0431, + "step": 6456 + }, + { + "epoch": 5.896803652968036, + "grad_norm": 16.331995010375977, + "learning_rate": 4.560121765601218e-06, + "loss": 0.1098, + "step": 6457 + }, + { + "epoch": 5.897716894977169, + "grad_norm": 59.23135757446289, + "learning_rate": 4.559107052257738e-06, + "loss": 0.4569, + "step": 6458 + }, + { + "epoch": 5.898630136986301, + "grad_norm": 1.0348625183105469, + "learning_rate": 4.558092338914257e-06, + "loss": 0.0057, + "step": 6459 + }, + { + "epoch": 5.899543378995434, + "grad_norm": 0.5181549787521362, + "learning_rate": 4.557077625570777e-06, + "loss": 0.0029, + "step": 6460 + }, + { + "epoch": 5.900456621004566, + "grad_norm": 2.242737054824829, + "learning_rate": 4.556062912227296e-06, + "loss": 0.0128, + "step": 6461 + }, + { + "epoch": 5.901369863013699, + "grad_norm": 0.006632945034652948, + "learning_rate": 4.5550481988838155e-06, + "loss": 0.0001, + "step": 6462 + }, + { + "epoch": 5.902283105022831, + "grad_norm": 11.19295597076416, + "learning_rate": 4.554033485540335e-06, + "loss": 0.0722, + "step": 6463 + }, + { + "epoch": 5.9031963470319635, + "grad_norm": 0.521324872970581, + "learning_rate": 4.553018772196855e-06, + "loss": 0.0043, + "step": 6464 + }, + { + "epoch": 5.904109589041096, + "grad_norm": 34.53535842895508, + "learning_rate": 4.552004058853375e-06, + "loss": 0.2406, + "step": 6465 + }, + { + "epoch": 5.905022831050228, + "grad_norm": 7.559736251831055, + "learning_rate": 4.550989345509893e-06, + "loss": 0.0384, + "step": 6466 + }, + { + "epoch": 5.905936073059361, + "grad_norm": 21.259050369262695, + "learning_rate": 4.549974632166413e-06, + "loss": 0.1625, + "step": 6467 + }, + { + "epoch": 5.906849315068493, + "grad_norm": 9.707518577575684, + "learning_rate": 4.548959918822933e-06, + "loss": 0.0654, + "step": 6468 + }, + { + "epoch": 5.907762557077626, + "grad_norm": 3.139920711517334, + "learning_rate": 4.5479452054794525e-06, + "loss": 0.0197, + "step": 6469 + }, + { + "epoch": 5.908675799086758, + "grad_norm": 120.90736389160156, + "learning_rate": 4.546930492135972e-06, + "loss": 1.4698, + "step": 6470 + }, + { + "epoch": 5.909589041095891, + "grad_norm": 1.8215408325195312, + "learning_rate": 4.545915778792491e-06, + "loss": 0.0115, + "step": 6471 + }, + { + "epoch": 5.910502283105023, + "grad_norm": 20.166240692138672, + "learning_rate": 4.544901065449011e-06, + "loss": 0.1348, + "step": 6472 + }, + { + "epoch": 5.911415525114156, + "grad_norm": 1.2377066612243652, + "learning_rate": 4.54388635210553e-06, + "loss": 0.0068, + "step": 6473 + }, + { + "epoch": 5.912328767123288, + "grad_norm": 91.19499969482422, + "learning_rate": 4.54287163876205e-06, + "loss": 0.8039, + "step": 6474 + }, + { + "epoch": 5.91324200913242, + "grad_norm": 0.975026547908783, + "learning_rate": 4.54185692541857e-06, + "loss": 0.004, + "step": 6475 + }, + { + "epoch": 5.914155251141553, + "grad_norm": 3.444601535797119, + "learning_rate": 4.540842212075089e-06, + "loss": 0.0242, + "step": 6476 + }, + { + "epoch": 5.9150684931506845, + "grad_norm": 6.968243598937988, + "learning_rate": 4.539827498731609e-06, + "loss": 0.0447, + "step": 6477 + }, + { + "epoch": 5.915981735159818, + "grad_norm": 6.513703346252441, + "learning_rate": 4.538812785388128e-06, + "loss": 0.0423, + "step": 6478 + }, + { + "epoch": 5.916894977168949, + "grad_norm": 0.05852198228240013, + "learning_rate": 4.537798072044648e-06, + "loss": 0.0004, + "step": 6479 + }, + { + "epoch": 5.917808219178082, + "grad_norm": 2.291600465774536, + "learning_rate": 4.536783358701167e-06, + "loss": 0.019, + "step": 6480 + }, + { + "epoch": 5.918721461187214, + "grad_norm": 0.9365049600601196, + "learning_rate": 4.535768645357686e-06, + "loss": 0.005, + "step": 6481 + }, + { + "epoch": 5.919634703196347, + "grad_norm": 12.688680648803711, + "learning_rate": 4.534753932014207e-06, + "loss": 0.1177, + "step": 6482 + }, + { + "epoch": 5.920547945205479, + "grad_norm": 0.6818444728851318, + "learning_rate": 4.533739218670726e-06, + "loss": 0.0069, + "step": 6483 + }, + { + "epoch": 5.921461187214612, + "grad_norm": 44.73384475708008, + "learning_rate": 4.532724505327245e-06, + "loss": 0.3157, + "step": 6484 + }, + { + "epoch": 5.922374429223744, + "grad_norm": 5.325942516326904, + "learning_rate": 4.531709791983765e-06, + "loss": 0.0319, + "step": 6485 + }, + { + "epoch": 5.923287671232877, + "grad_norm": 0.7549267411231995, + "learning_rate": 4.530695078640284e-06, + "loss": 0.0057, + "step": 6486 + }, + { + "epoch": 5.924200913242009, + "grad_norm": 23.06356430053711, + "learning_rate": 4.529680365296804e-06, + "loss": 0.1587, + "step": 6487 + }, + { + "epoch": 5.9251141552511415, + "grad_norm": 3.147104024887085, + "learning_rate": 4.528665651953323e-06, + "loss": 0.0219, + "step": 6488 + }, + { + "epoch": 5.926027397260274, + "grad_norm": 5.962182998657227, + "learning_rate": 4.527650938609843e-06, + "loss": 0.0416, + "step": 6489 + }, + { + "epoch": 5.926940639269406, + "grad_norm": 84.33705139160156, + "learning_rate": 4.526636225266363e-06, + "loss": 1.1604, + "step": 6490 + }, + { + "epoch": 5.927853881278539, + "grad_norm": 47.431175231933594, + "learning_rate": 4.5256215119228815e-06, + "loss": 0.6466, + "step": 6491 + }, + { + "epoch": 5.928767123287671, + "grad_norm": 0.06763371080160141, + "learning_rate": 4.524606798579402e-06, + "loss": 0.0006, + "step": 6492 + }, + { + "epoch": 5.929680365296804, + "grad_norm": 77.11488342285156, + "learning_rate": 4.523592085235921e-06, + "loss": 1.0733, + "step": 6493 + }, + { + "epoch": 5.930593607305936, + "grad_norm": 2.2376575469970703, + "learning_rate": 4.5225773718924406e-06, + "loss": 0.009, + "step": 6494 + }, + { + "epoch": 5.931506849315069, + "grad_norm": 4.112747669219971, + "learning_rate": 4.52156265854896e-06, + "loss": 0.0302, + "step": 6495 + }, + { + "epoch": 5.932420091324201, + "grad_norm": 3.221682071685791, + "learning_rate": 4.52054794520548e-06, + "loss": 0.0202, + "step": 6496 + }, + { + "epoch": 5.933333333333334, + "grad_norm": 5.665585994720459, + "learning_rate": 4.519533231862e-06, + "loss": 0.032, + "step": 6497 + }, + { + "epoch": 5.934246575342466, + "grad_norm": 2.1778273582458496, + "learning_rate": 4.5185185185185185e-06, + "loss": 0.0187, + "step": 6498 + }, + { + "epoch": 5.9351598173515985, + "grad_norm": 28.31464385986328, + "learning_rate": 4.517503805175039e-06, + "loss": 0.2853, + "step": 6499 + }, + { + "epoch": 5.936073059360731, + "grad_norm": 20.263273239135742, + "learning_rate": 4.516489091831558e-06, + "loss": 0.2011, + "step": 6500 + }, + { + "epoch": 5.936986301369863, + "grad_norm": 0.0651245266199112, + "learning_rate": 4.5154743784880776e-06, + "loss": 0.0004, + "step": 6501 + }, + { + "epoch": 5.937899543378995, + "grad_norm": 101.8026351928711, + "learning_rate": 4.514459665144597e-06, + "loss": 3.3116, + "step": 6502 + }, + { + "epoch": 5.938812785388128, + "grad_norm": 68.24652862548828, + "learning_rate": 4.513444951801116e-06, + "loss": 0.9236, + "step": 6503 + }, + { + "epoch": 5.93972602739726, + "grad_norm": 15.394010543823242, + "learning_rate": 4.512430238457637e-06, + "loss": 0.0869, + "step": 6504 + }, + { + "epoch": 5.940639269406392, + "grad_norm": 17.295616149902344, + "learning_rate": 4.5114155251141555e-06, + "loss": 0.111, + "step": 6505 + }, + { + "epoch": 5.941552511415525, + "grad_norm": 1.8065096139907837, + "learning_rate": 4.510400811770675e-06, + "loss": 0.0173, + "step": 6506 + }, + { + "epoch": 5.942465753424657, + "grad_norm": 79.83340454101562, + "learning_rate": 4.509386098427195e-06, + "loss": 1.519, + "step": 6507 + }, + { + "epoch": 5.94337899543379, + "grad_norm": 3.1422901153564453, + "learning_rate": 4.508371385083714e-06, + "loss": 0.017, + "step": 6508 + }, + { + "epoch": 5.944292237442922, + "grad_norm": 5.366922378540039, + "learning_rate": 4.507356671740234e-06, + "loss": 0.0326, + "step": 6509 + }, + { + "epoch": 5.945205479452055, + "grad_norm": 3.703190803527832, + "learning_rate": 4.506341958396753e-06, + "loss": 0.0277, + "step": 6510 + }, + { + "epoch": 5.946118721461187, + "grad_norm": 19.936504364013672, + "learning_rate": 4.505327245053273e-06, + "loss": 0.1623, + "step": 6511 + }, + { + "epoch": 5.9470319634703195, + "grad_norm": 0.22677668929100037, + "learning_rate": 4.5043125317097925e-06, + "loss": 0.0012, + "step": 6512 + }, + { + "epoch": 5.947945205479452, + "grad_norm": 8.532551765441895, + "learning_rate": 4.503297818366311e-06, + "loss": 0.0647, + "step": 6513 + }, + { + "epoch": 5.948858447488584, + "grad_norm": 11.433242797851562, + "learning_rate": 4.502283105022832e-06, + "loss": 0.0976, + "step": 6514 + }, + { + "epoch": 5.949771689497717, + "grad_norm": 2.4035987854003906, + "learning_rate": 4.501268391679351e-06, + "loss": 0.0166, + "step": 6515 + }, + { + "epoch": 5.950684931506849, + "grad_norm": 0.2171848863363266, + "learning_rate": 4.50025367833587e-06, + "loss": 0.0017, + "step": 6516 + }, + { + "epoch": 5.951598173515982, + "grad_norm": 76.54119873046875, + "learning_rate": 4.49923896499239e-06, + "loss": 1.8656, + "step": 6517 + }, + { + "epoch": 5.952511415525114, + "grad_norm": 5.868743419647217, + "learning_rate": 4.49822425164891e-06, + "loss": 0.0614, + "step": 6518 + }, + { + "epoch": 5.953424657534247, + "grad_norm": 0.92151939868927, + "learning_rate": 4.4972095383054295e-06, + "loss": 0.008, + "step": 6519 + }, + { + "epoch": 5.954337899543379, + "grad_norm": 3.790525197982788, + "learning_rate": 4.496194824961948e-06, + "loss": 0.0317, + "step": 6520 + }, + { + "epoch": 5.955251141552512, + "grad_norm": 75.12450408935547, + "learning_rate": 4.495180111618468e-06, + "loss": 1.5304, + "step": 6521 + }, + { + "epoch": 5.956164383561644, + "grad_norm": 62.349578857421875, + "learning_rate": 4.494165398274988e-06, + "loss": 0.9274, + "step": 6522 + }, + { + "epoch": 5.9570776255707765, + "grad_norm": 3.392524003982544, + "learning_rate": 4.493150684931507e-06, + "loss": 0.0214, + "step": 6523 + }, + { + "epoch": 5.957990867579909, + "grad_norm": 6.030546188354492, + "learning_rate": 4.492135971588027e-06, + "loss": 0.0533, + "step": 6524 + }, + { + "epoch": 5.958904109589041, + "grad_norm": 1.0064963102340698, + "learning_rate": 4.491121258244546e-06, + "loss": 0.01, + "step": 6525 + }, + { + "epoch": 5.959817351598174, + "grad_norm": 11.324017524719238, + "learning_rate": 4.490106544901066e-06, + "loss": 0.0743, + "step": 6526 + }, + { + "epoch": 5.960730593607306, + "grad_norm": 4.299860954284668, + "learning_rate": 4.489091831557585e-06, + "loss": 0.0271, + "step": 6527 + }, + { + "epoch": 5.961643835616439, + "grad_norm": 0.10629075020551682, + "learning_rate": 4.488077118214105e-06, + "loss": 0.0005, + "step": 6528 + }, + { + "epoch": 5.96255707762557, + "grad_norm": 25.20558738708496, + "learning_rate": 4.487062404870625e-06, + "loss": 0.2142, + "step": 6529 + }, + { + "epoch": 5.963470319634704, + "grad_norm": 28.998367309570312, + "learning_rate": 4.4860476915271436e-06, + "loss": 0.327, + "step": 6530 + }, + { + "epoch": 5.964383561643835, + "grad_norm": 65.00112915039062, + "learning_rate": 4.485032978183663e-06, + "loss": 0.5998, + "step": 6531 + }, + { + "epoch": 5.965296803652968, + "grad_norm": 0.47264429926872253, + "learning_rate": 4.484018264840183e-06, + "loss": 0.0036, + "step": 6532 + }, + { + "epoch": 5.9662100456621, + "grad_norm": 0.3620924949645996, + "learning_rate": 4.483003551496703e-06, + "loss": 0.002, + "step": 6533 + }, + { + "epoch": 5.967123287671233, + "grad_norm": 5.177344799041748, + "learning_rate": 4.481988838153222e-06, + "loss": 0.0407, + "step": 6534 + }, + { + "epoch": 5.968036529680365, + "grad_norm": 40.92805099487305, + "learning_rate": 4.480974124809742e-06, + "loss": 0.1334, + "step": 6535 + }, + { + "epoch": 5.9689497716894975, + "grad_norm": 0.47250017523765564, + "learning_rate": 4.479959411466261e-06, + "loss": 0.0024, + "step": 6536 + }, + { + "epoch": 5.96986301369863, + "grad_norm": 0.10878687351942062, + "learning_rate": 4.4789446981227805e-06, + "loss": 0.0006, + "step": 6537 + }, + { + "epoch": 5.970776255707762, + "grad_norm": 3.026728868484497, + "learning_rate": 4.4779299847793e-06, + "loss": 0.0279, + "step": 6538 + }, + { + "epoch": 5.971689497716895, + "grad_norm": 2.4585421085357666, + "learning_rate": 4.47691527143582e-06, + "loss": 0.0161, + "step": 6539 + }, + { + "epoch": 5.972602739726027, + "grad_norm": 6.142393112182617, + "learning_rate": 4.47590055809234e-06, + "loss": 0.0307, + "step": 6540 + }, + { + "epoch": 5.97351598173516, + "grad_norm": 0.35396403074264526, + "learning_rate": 4.4748858447488585e-06, + "loss": 0.0024, + "step": 6541 + }, + { + "epoch": 5.974429223744292, + "grad_norm": 25.740440368652344, + "learning_rate": 4.473871131405378e-06, + "loss": 0.3225, + "step": 6542 + }, + { + "epoch": 5.975342465753425, + "grad_norm": 1.5711997747421265, + "learning_rate": 4.472856418061898e-06, + "loss": 0.0099, + "step": 6543 + }, + { + "epoch": 5.976255707762557, + "grad_norm": 3.717107057571411, + "learning_rate": 4.4718417047184175e-06, + "loss": 0.0196, + "step": 6544 + }, + { + "epoch": 5.9771689497716896, + "grad_norm": 0.11520597338676453, + "learning_rate": 4.470826991374937e-06, + "loss": 0.0013, + "step": 6545 + }, + { + "epoch": 5.978082191780822, + "grad_norm": 0.028161147609353065, + "learning_rate": 4.469812278031456e-06, + "loss": 0.0002, + "step": 6546 + }, + { + "epoch": 5.9789954337899545, + "grad_norm": 0.4465423822402954, + "learning_rate": 4.468797564687976e-06, + "loss": 0.0036, + "step": 6547 + }, + { + "epoch": 5.979908675799087, + "grad_norm": 2.947760581970215, + "learning_rate": 4.4677828513444955e-06, + "loss": 0.022, + "step": 6548 + }, + { + "epoch": 5.980821917808219, + "grad_norm": 46.68765640258789, + "learning_rate": 4.466768138001015e-06, + "loss": 0.4179, + "step": 6549 + }, + { + "epoch": 5.981735159817352, + "grad_norm": 19.085887908935547, + "learning_rate": 4.465753424657535e-06, + "loss": 0.2355, + "step": 6550 + }, + { + "epoch": 5.982648401826484, + "grad_norm": 6.383697509765625, + "learning_rate": 4.464738711314054e-06, + "loss": 0.0456, + "step": 6551 + }, + { + "epoch": 5.983561643835617, + "grad_norm": 0.902556300163269, + "learning_rate": 4.463723997970573e-06, + "loss": 0.0043, + "step": 6552 + }, + { + "epoch": 5.984474885844749, + "grad_norm": 8.811875343322754, + "learning_rate": 4.462709284627093e-06, + "loss": 0.058, + "step": 6553 + }, + { + "epoch": 5.985388127853882, + "grad_norm": 90.21971130371094, + "learning_rate": 4.461694571283613e-06, + "loss": 2.5751, + "step": 6554 + }, + { + "epoch": 5.986301369863014, + "grad_norm": 52.194557189941406, + "learning_rate": 4.4606798579401325e-06, + "loss": 0.3719, + "step": 6555 + }, + { + "epoch": 5.987214611872146, + "grad_norm": 14.576478958129883, + "learning_rate": 4.459665144596651e-06, + "loss": 0.0901, + "step": 6556 + }, + { + "epoch": 5.988127853881279, + "grad_norm": 74.97417449951172, + "learning_rate": 4.458650431253172e-06, + "loss": 0.8484, + "step": 6557 + }, + { + "epoch": 5.989041095890411, + "grad_norm": 26.654279708862305, + "learning_rate": 4.457635717909691e-06, + "loss": 0.162, + "step": 6558 + }, + { + "epoch": 5.989954337899543, + "grad_norm": 0.15857751667499542, + "learning_rate": 4.45662100456621e-06, + "loss": 0.0013, + "step": 6559 + }, + { + "epoch": 5.9908675799086755, + "grad_norm": 14.43640422821045, + "learning_rate": 4.45560629122273e-06, + "loss": 0.146, + "step": 6560 + }, + { + "epoch": 5.991780821917808, + "grad_norm": 14.748570442199707, + "learning_rate": 4.454591577879249e-06, + "loss": 0.1037, + "step": 6561 + }, + { + "epoch": 5.99269406392694, + "grad_norm": 0.799262285232544, + "learning_rate": 4.4535768645357695e-06, + "loss": 0.0062, + "step": 6562 + }, + { + "epoch": 5.993607305936073, + "grad_norm": 0.7677450776100159, + "learning_rate": 4.452562151192288e-06, + "loss": 0.0047, + "step": 6563 + }, + { + "epoch": 5.994520547945205, + "grad_norm": 1.1639456748962402, + "learning_rate": 4.451547437848808e-06, + "loss": 0.0084, + "step": 6564 + }, + { + "epoch": 5.995433789954338, + "grad_norm": 19.087717056274414, + "learning_rate": 4.450532724505328e-06, + "loss": 0.1464, + "step": 6565 + }, + { + "epoch": 5.99634703196347, + "grad_norm": 7.524383068084717, + "learning_rate": 4.4495180111618465e-06, + "loss": 0.0429, + "step": 6566 + }, + { + "epoch": 5.997260273972603, + "grad_norm": 0.24816685914993286, + "learning_rate": 4.448503297818367e-06, + "loss": 0.0021, + "step": 6567 + }, + { + "epoch": 5.998173515981735, + "grad_norm": 0.6818237900733948, + "learning_rate": 4.447488584474886e-06, + "loss": 0.005, + "step": 6568 + }, + { + "epoch": 5.9990867579908675, + "grad_norm": 0.6453270316123962, + "learning_rate": 4.446473871131406e-06, + "loss": 0.0049, + "step": 6569 + }, + { + "epoch": 6.0, + "grad_norm": 2.7576963901519775, + "learning_rate": 4.445459157787925e-06, + "loss": 0.024, + "step": 6570 + }, + { + "epoch": 6.0009132420091325, + "grad_norm": 0.6458755731582642, + "learning_rate": 4.444444444444444e-06, + "loss": 0.0052, + "step": 6571 + }, + { + "epoch": 6.001826484018265, + "grad_norm": 2.819990396499634, + "learning_rate": 4.443429731100965e-06, + "loss": 0.0217, + "step": 6572 + }, + { + "epoch": 6.002739726027397, + "grad_norm": 3.4346907138824463, + "learning_rate": 4.4424150177574835e-06, + "loss": 0.0204, + "step": 6573 + }, + { + "epoch": 6.00365296803653, + "grad_norm": 5.386760711669922, + "learning_rate": 4.441400304414003e-06, + "loss": 0.0308, + "step": 6574 + }, + { + "epoch": 6.004566210045662, + "grad_norm": 7.551497936248779, + "learning_rate": 4.440385591070523e-06, + "loss": 0.0502, + "step": 6575 + }, + { + "epoch": 6.005479452054795, + "grad_norm": 0.3999517261981964, + "learning_rate": 4.439370877727043e-06, + "loss": 0.003, + "step": 6576 + }, + { + "epoch": 6.006392694063927, + "grad_norm": 3.9378390312194824, + "learning_rate": 4.438356164383562e-06, + "loss": 0.0237, + "step": 6577 + }, + { + "epoch": 6.00730593607306, + "grad_norm": 5.160080432891846, + "learning_rate": 4.437341451040081e-06, + "loss": 0.0482, + "step": 6578 + }, + { + "epoch": 6.008219178082192, + "grad_norm": 1.6658786535263062, + "learning_rate": 4.436326737696602e-06, + "loss": 0.0104, + "step": 6579 + }, + { + "epoch": 6.0091324200913245, + "grad_norm": 11.284366607666016, + "learning_rate": 4.4353120243531205e-06, + "loss": 0.0784, + "step": 6580 + }, + { + "epoch": 6.010045662100457, + "grad_norm": 2.4496374130249023, + "learning_rate": 4.43429731100964e-06, + "loss": 0.0118, + "step": 6581 + }, + { + "epoch": 6.010958904109589, + "grad_norm": 6.432136058807373, + "learning_rate": 4.43328259766616e-06, + "loss": 0.0428, + "step": 6582 + }, + { + "epoch": 6.011872146118722, + "grad_norm": 0.3512638211250305, + "learning_rate": 4.432267884322679e-06, + "loss": 0.0026, + "step": 6583 + }, + { + "epoch": 6.0127853881278535, + "grad_norm": 0.24490070343017578, + "learning_rate": 4.431253170979199e-06, + "loss": 0.0011, + "step": 6584 + }, + { + "epoch": 6.013698630136986, + "grad_norm": 0.3673464357852936, + "learning_rate": 4.430238457635718e-06, + "loss": 0.0027, + "step": 6585 + }, + { + "epoch": 6.014611872146118, + "grad_norm": 0.3405728042125702, + "learning_rate": 4.429223744292238e-06, + "loss": 0.0021, + "step": 6586 + }, + { + "epoch": 6.015525114155251, + "grad_norm": 9.8167724609375, + "learning_rate": 4.4282090309487575e-06, + "loss": 0.1014, + "step": 6587 + }, + { + "epoch": 6.016438356164383, + "grad_norm": 2.2742607593536377, + "learning_rate": 4.427194317605276e-06, + "loss": 0.0131, + "step": 6588 + }, + { + "epoch": 6.017351598173516, + "grad_norm": 2.255802869796753, + "learning_rate": 4.426179604261797e-06, + "loss": 0.0136, + "step": 6589 + }, + { + "epoch": 6.018264840182648, + "grad_norm": 0.6155288815498352, + "learning_rate": 4.425164890918316e-06, + "loss": 0.005, + "step": 6590 + }, + { + "epoch": 6.019178082191781, + "grad_norm": 1.8816601037979126, + "learning_rate": 4.4241501775748354e-06, + "loss": 0.0111, + "step": 6591 + }, + { + "epoch": 6.020091324200913, + "grad_norm": 4.641568183898926, + "learning_rate": 4.423135464231355e-06, + "loss": 0.0374, + "step": 6592 + }, + { + "epoch": 6.0210045662100455, + "grad_norm": 4.787967681884766, + "learning_rate": 4.422120750887874e-06, + "loss": 0.0505, + "step": 6593 + }, + { + "epoch": 6.021917808219178, + "grad_norm": 9.99697494506836, + "learning_rate": 4.4211060375443945e-06, + "loss": 0.0658, + "step": 6594 + }, + { + "epoch": 6.0228310502283104, + "grad_norm": 16.958295822143555, + "learning_rate": 4.420091324200913e-06, + "loss": 0.1781, + "step": 6595 + }, + { + "epoch": 6.023744292237443, + "grad_norm": 62.48561477661133, + "learning_rate": 4.419076610857433e-06, + "loss": 0.9392, + "step": 6596 + }, + { + "epoch": 6.024657534246575, + "grad_norm": 5.114564895629883, + "learning_rate": 4.418061897513953e-06, + "loss": 0.0379, + "step": 6597 + }, + { + "epoch": 6.025570776255708, + "grad_norm": 8.59701919555664, + "learning_rate": 4.4170471841704724e-06, + "loss": 0.0721, + "step": 6598 + }, + { + "epoch": 6.02648401826484, + "grad_norm": 26.817468643188477, + "learning_rate": 4.416032470826992e-06, + "loss": 0.3599, + "step": 6599 + }, + { + "epoch": 6.027397260273973, + "grad_norm": 0.44702520966529846, + "learning_rate": 4.415017757483511e-06, + "loss": 0.003, + "step": 6600 + }, + { + "epoch": 6.028310502283105, + "grad_norm": 0.6123420596122742, + "learning_rate": 4.414003044140031e-06, + "loss": 0.0042, + "step": 6601 + }, + { + "epoch": 6.029223744292238, + "grad_norm": 54.31485366821289, + "learning_rate": 4.41298833079655e-06, + "loss": 0.6276, + "step": 6602 + }, + { + "epoch": 6.03013698630137, + "grad_norm": 0.0839325413107872, + "learning_rate": 4.41197361745307e-06, + "loss": 0.0006, + "step": 6603 + }, + { + "epoch": 6.0310502283105025, + "grad_norm": 3.4596564769744873, + "learning_rate": 4.41095890410959e-06, + "loss": 0.0291, + "step": 6604 + }, + { + "epoch": 6.031963470319635, + "grad_norm": 14.163607597351074, + "learning_rate": 4.409944190766109e-06, + "loss": 0.1049, + "step": 6605 + }, + { + "epoch": 6.032876712328767, + "grad_norm": 0.7183751463890076, + "learning_rate": 4.408929477422628e-06, + "loss": 0.0064, + "step": 6606 + }, + { + "epoch": 6.0337899543379, + "grad_norm": 0.7153368592262268, + "learning_rate": 4.407914764079148e-06, + "loss": 0.005, + "step": 6607 + }, + { + "epoch": 6.034703196347032, + "grad_norm": 28.141504287719727, + "learning_rate": 4.406900050735668e-06, + "loss": 0.1967, + "step": 6608 + }, + { + "epoch": 6.035616438356165, + "grad_norm": 0.9624531865119934, + "learning_rate": 4.405885337392187e-06, + "loss": 0.008, + "step": 6609 + }, + { + "epoch": 6.036529680365296, + "grad_norm": 2.4505114555358887, + "learning_rate": 4.404870624048706e-06, + "loss": 0.0219, + "step": 6610 + }, + { + "epoch": 6.037442922374429, + "grad_norm": 83.26984405517578, + "learning_rate": 4.403855910705226e-06, + "loss": 1.4277, + "step": 6611 + }, + { + "epoch": 6.038356164383561, + "grad_norm": 0.46227702498435974, + "learning_rate": 4.402841197361746e-06, + "loss": 0.0041, + "step": 6612 + }, + { + "epoch": 6.039269406392694, + "grad_norm": 26.995182037353516, + "learning_rate": 4.401826484018265e-06, + "loss": 0.2184, + "step": 6613 + }, + { + "epoch": 6.040182648401826, + "grad_norm": 7.704474449157715, + "learning_rate": 4.400811770674785e-06, + "loss": 0.0501, + "step": 6614 + }, + { + "epoch": 6.041095890410959, + "grad_norm": 5.063155174255371, + "learning_rate": 4.399797057331304e-06, + "loss": 0.0447, + "step": 6615 + }, + { + "epoch": 6.042009132420091, + "grad_norm": 26.75105094909668, + "learning_rate": 4.3987823439878235e-06, + "loss": 0.3027, + "step": 6616 + }, + { + "epoch": 6.0429223744292235, + "grad_norm": 23.046052932739258, + "learning_rate": 4.397767630644343e-06, + "loss": 0.1866, + "step": 6617 + }, + { + "epoch": 6.043835616438356, + "grad_norm": 0.20441171526908875, + "learning_rate": 4.396752917300863e-06, + "loss": 0.0015, + "step": 6618 + }, + { + "epoch": 6.044748858447488, + "grad_norm": 0.5208569765090942, + "learning_rate": 4.395738203957383e-06, + "loss": 0.004, + "step": 6619 + }, + { + "epoch": 6.045662100456621, + "grad_norm": 34.4825439453125, + "learning_rate": 4.394723490613902e-06, + "loss": 0.1881, + "step": 6620 + }, + { + "epoch": 6.046575342465753, + "grad_norm": 21.152158737182617, + "learning_rate": 4.393708777270421e-06, + "loss": 0.1419, + "step": 6621 + }, + { + "epoch": 6.047488584474886, + "grad_norm": 4.314539432525635, + "learning_rate": 4.392694063926941e-06, + "loss": 0.0393, + "step": 6622 + }, + { + "epoch": 6.048401826484018, + "grad_norm": 3.104836940765381, + "learning_rate": 4.3916793505834605e-06, + "loss": 0.0284, + "step": 6623 + }, + { + "epoch": 6.049315068493151, + "grad_norm": 7.7042646408081055, + "learning_rate": 4.39066463723998e-06, + "loss": 0.0604, + "step": 6624 + }, + { + "epoch": 6.050228310502283, + "grad_norm": 0.8773698806762695, + "learning_rate": 4.3896499238965e-06, + "loss": 0.0089, + "step": 6625 + }, + { + "epoch": 6.051141552511416, + "grad_norm": 11.748324394226074, + "learning_rate": 4.388635210553019e-06, + "loss": 0.0907, + "step": 6626 + }, + { + "epoch": 6.052054794520548, + "grad_norm": 0.44508135318756104, + "learning_rate": 4.3876204972095384e-06, + "loss": 0.0028, + "step": 6627 + }, + { + "epoch": 6.0529680365296805, + "grad_norm": 8.542681694030762, + "learning_rate": 4.386605783866058e-06, + "loss": 0.0515, + "step": 6628 + }, + { + "epoch": 6.053881278538813, + "grad_norm": 0.5552430152893066, + "learning_rate": 4.385591070522578e-06, + "loss": 0.0041, + "step": 6629 + }, + { + "epoch": 6.054794520547945, + "grad_norm": 31.28386878967285, + "learning_rate": 4.3845763571790975e-06, + "loss": 0.1694, + "step": 6630 + }, + { + "epoch": 6.055707762557078, + "grad_norm": 0.5558962821960449, + "learning_rate": 4.383561643835616e-06, + "loss": 0.0046, + "step": 6631 + }, + { + "epoch": 6.05662100456621, + "grad_norm": 0.09849616140127182, + "learning_rate": 4.382546930492136e-06, + "loss": 0.0005, + "step": 6632 + }, + { + "epoch": 6.057534246575343, + "grad_norm": 55.76874923706055, + "learning_rate": 4.381532217148656e-06, + "loss": 0.4164, + "step": 6633 + }, + { + "epoch": 6.058447488584475, + "grad_norm": 1.6715861558914185, + "learning_rate": 4.3805175038051754e-06, + "loss": 0.0086, + "step": 6634 + }, + { + "epoch": 6.059360730593608, + "grad_norm": 62.7872314453125, + "learning_rate": 4.379502790461695e-06, + "loss": 0.7631, + "step": 6635 + }, + { + "epoch": 6.06027397260274, + "grad_norm": 2.207599401473999, + "learning_rate": 4.378488077118214e-06, + "loss": 0.0154, + "step": 6636 + }, + { + "epoch": 6.061187214611872, + "grad_norm": 19.932720184326172, + "learning_rate": 4.3774733637747345e-06, + "loss": 0.1169, + "step": 6637 + }, + { + "epoch": 6.062100456621004, + "grad_norm": 1.41544771194458, + "learning_rate": 4.376458650431253e-06, + "loss": 0.0085, + "step": 6638 + }, + { + "epoch": 6.063013698630137, + "grad_norm": 5.439250946044922, + "learning_rate": 4.375443937087773e-06, + "loss": 0.0433, + "step": 6639 + }, + { + "epoch": 6.063926940639269, + "grad_norm": 36.193538665771484, + "learning_rate": 4.374429223744293e-06, + "loss": 0.4527, + "step": 6640 + }, + { + "epoch": 6.0648401826484015, + "grad_norm": 0.28960752487182617, + "learning_rate": 4.373414510400812e-06, + "loss": 0.0028, + "step": 6641 + }, + { + "epoch": 6.065753424657534, + "grad_norm": 50.62773132324219, + "learning_rate": 4.372399797057332e-06, + "loss": 0.6258, + "step": 6642 + }, + { + "epoch": 6.066666666666666, + "grad_norm": 9.756749153137207, + "learning_rate": 4.371385083713851e-06, + "loss": 0.1, + "step": 6643 + }, + { + "epoch": 6.067579908675799, + "grad_norm": 40.13080596923828, + "learning_rate": 4.370370370370371e-06, + "loss": 0.3847, + "step": 6644 + }, + { + "epoch": 6.068493150684931, + "grad_norm": 5.5731682777404785, + "learning_rate": 4.36935565702689e-06, + "loss": 0.0535, + "step": 6645 + }, + { + "epoch": 6.069406392694064, + "grad_norm": 5.311216831207275, + "learning_rate": 4.368340943683409e-06, + "loss": 0.0404, + "step": 6646 + }, + { + "epoch": 6.070319634703196, + "grad_norm": 0.6210635900497437, + "learning_rate": 4.36732623033993e-06, + "loss": 0.004, + "step": 6647 + }, + { + "epoch": 6.071232876712329, + "grad_norm": 14.554628372192383, + "learning_rate": 4.366311516996449e-06, + "loss": 0.0975, + "step": 6648 + }, + { + "epoch": 6.072146118721461, + "grad_norm": 31.446882247924805, + "learning_rate": 4.365296803652968e-06, + "loss": 0.2541, + "step": 6649 + }, + { + "epoch": 6.073059360730594, + "grad_norm": 27.981351852416992, + "learning_rate": 4.364282090309488e-06, + "loss": 0.1468, + "step": 6650 + }, + { + "epoch": 6.073972602739726, + "grad_norm": 0.4048837721347809, + "learning_rate": 4.363267376966007e-06, + "loss": 0.0029, + "step": 6651 + }, + { + "epoch": 6.0748858447488585, + "grad_norm": 26.65433120727539, + "learning_rate": 4.362252663622527e-06, + "loss": 0.1321, + "step": 6652 + }, + { + "epoch": 6.075799086757991, + "grad_norm": 2.719858407974243, + "learning_rate": 4.361237950279046e-06, + "loss": 0.0116, + "step": 6653 + }, + { + "epoch": 6.076712328767123, + "grad_norm": 19.05782127380371, + "learning_rate": 4.360223236935566e-06, + "loss": 0.1346, + "step": 6654 + }, + { + "epoch": 6.077625570776256, + "grad_norm": 27.580890655517578, + "learning_rate": 4.359208523592086e-06, + "loss": 0.3022, + "step": 6655 + }, + { + "epoch": 6.078538812785388, + "grad_norm": 24.68669891357422, + "learning_rate": 4.358193810248605e-06, + "loss": 0.2133, + "step": 6656 + }, + { + "epoch": 6.079452054794521, + "grad_norm": 14.27434253692627, + "learning_rate": 4.357179096905125e-06, + "loss": 0.0969, + "step": 6657 + }, + { + "epoch": 6.080365296803653, + "grad_norm": 19.642072677612305, + "learning_rate": 4.356164383561644e-06, + "loss": 0.1417, + "step": 6658 + }, + { + "epoch": 6.081278538812786, + "grad_norm": 53.297489166259766, + "learning_rate": 4.355149670218164e-06, + "loss": 0.9261, + "step": 6659 + }, + { + "epoch": 6.082191780821918, + "grad_norm": 118.53862762451172, + "learning_rate": 4.354134956874683e-06, + "loss": 0.6675, + "step": 6660 + }, + { + "epoch": 6.083105022831051, + "grad_norm": 8.534711837768555, + "learning_rate": 4.353120243531203e-06, + "loss": 0.0583, + "step": 6661 + }, + { + "epoch": 6.084018264840183, + "grad_norm": 24.36887550354004, + "learning_rate": 4.3521055301877226e-06, + "loss": 0.2401, + "step": 6662 + }, + { + "epoch": 6.0849315068493155, + "grad_norm": 26.137325286865234, + "learning_rate": 4.351090816844241e-06, + "loss": 0.1941, + "step": 6663 + }, + { + "epoch": 6.085844748858447, + "grad_norm": 3.9830708503723145, + "learning_rate": 4.350076103500762e-06, + "loss": 0.0244, + "step": 6664 + }, + { + "epoch": 6.0867579908675795, + "grad_norm": 4.614989280700684, + "learning_rate": 4.349061390157281e-06, + "loss": 0.0335, + "step": 6665 + }, + { + "epoch": 6.087671232876712, + "grad_norm": 0.5414465069770813, + "learning_rate": 4.3480466768138005e-06, + "loss": 0.0048, + "step": 6666 + }, + { + "epoch": 6.088584474885844, + "grad_norm": 3.098433256149292, + "learning_rate": 4.34703196347032e-06, + "loss": 0.0161, + "step": 6667 + }, + { + "epoch": 6.089497716894977, + "grad_norm": 0.10339552164077759, + "learning_rate": 4.346017250126839e-06, + "loss": 0.0005, + "step": 6668 + }, + { + "epoch": 6.090410958904109, + "grad_norm": 0.29048216342926025, + "learning_rate": 4.3450025367833596e-06, + "loss": 0.002, + "step": 6669 + }, + { + "epoch": 6.091324200913242, + "grad_norm": 4.7039642333984375, + "learning_rate": 4.343987823439878e-06, + "loss": 0.0382, + "step": 6670 + }, + { + "epoch": 6.092237442922374, + "grad_norm": 0.2822943329811096, + "learning_rate": 4.342973110096398e-06, + "loss": 0.0024, + "step": 6671 + }, + { + "epoch": 6.093150684931507, + "grad_norm": 4.123199939727783, + "learning_rate": 4.341958396752918e-06, + "loss": 0.0331, + "step": 6672 + }, + { + "epoch": 6.094063926940639, + "grad_norm": 6.658768653869629, + "learning_rate": 4.340943683409437e-06, + "loss": 0.0452, + "step": 6673 + }, + { + "epoch": 6.094977168949772, + "grad_norm": 1.542533040046692, + "learning_rate": 4.339928970065957e-06, + "loss": 0.0097, + "step": 6674 + }, + { + "epoch": 6.095890410958904, + "grad_norm": 10.000072479248047, + "learning_rate": 4.338914256722476e-06, + "loss": 0.0555, + "step": 6675 + }, + { + "epoch": 6.0968036529680365, + "grad_norm": 0.779171884059906, + "learning_rate": 4.337899543378996e-06, + "loss": 0.007, + "step": 6676 + }, + { + "epoch": 6.097716894977169, + "grad_norm": 0.47152000665664673, + "learning_rate": 4.336884830035515e-06, + "loss": 0.0027, + "step": 6677 + }, + { + "epoch": 6.098630136986301, + "grad_norm": 2.480659008026123, + "learning_rate": 4.335870116692035e-06, + "loss": 0.0226, + "step": 6678 + }, + { + "epoch": 6.099543378995434, + "grad_norm": 47.448211669921875, + "learning_rate": 4.334855403348555e-06, + "loss": 0.431, + "step": 6679 + }, + { + "epoch": 6.100456621004566, + "grad_norm": 26.917505264282227, + "learning_rate": 4.333840690005074e-06, + "loss": 0.2571, + "step": 6680 + }, + { + "epoch": 6.101369863013699, + "grad_norm": 34.462459564208984, + "learning_rate": 4.332825976661593e-06, + "loss": 0.2868, + "step": 6681 + }, + { + "epoch": 6.102283105022831, + "grad_norm": 0.6357265114784241, + "learning_rate": 4.331811263318113e-06, + "loss": 0.0064, + "step": 6682 + }, + { + "epoch": 6.103196347031964, + "grad_norm": 25.168638229370117, + "learning_rate": 4.330796549974633e-06, + "loss": 0.1817, + "step": 6683 + }, + { + "epoch": 6.104109589041096, + "grad_norm": 10.684536933898926, + "learning_rate": 4.329781836631152e-06, + "loss": 0.0504, + "step": 6684 + }, + { + "epoch": 6.105022831050229, + "grad_norm": 52.960140228271484, + "learning_rate": 4.328767123287671e-06, + "loss": 0.3532, + "step": 6685 + }, + { + "epoch": 6.105936073059361, + "grad_norm": 7.772068023681641, + "learning_rate": 4.327752409944191e-06, + "loss": 0.0583, + "step": 6686 + }, + { + "epoch": 6.1068493150684935, + "grad_norm": 21.841123580932617, + "learning_rate": 4.326737696600711e-06, + "loss": 0.1227, + "step": 6687 + }, + { + "epoch": 6.107762557077626, + "grad_norm": 6.429016590118408, + "learning_rate": 4.32572298325723e-06, + "loss": 0.0337, + "step": 6688 + }, + { + "epoch": 6.108675799086758, + "grad_norm": 110.38987731933594, + "learning_rate": 4.32470826991375e-06, + "loss": 3.788, + "step": 6689 + }, + { + "epoch": 6.109589041095891, + "grad_norm": 7.277464389801025, + "learning_rate": 4.323693556570269e-06, + "loss": 0.0467, + "step": 6690 + }, + { + "epoch": 6.110502283105022, + "grad_norm": 0.792913019657135, + "learning_rate": 4.3226788432267886e-06, + "loss": 0.0054, + "step": 6691 + }, + { + "epoch": 6.111415525114155, + "grad_norm": 3.6331894397735596, + "learning_rate": 4.321664129883308e-06, + "loss": 0.0308, + "step": 6692 + }, + { + "epoch": 6.112328767123287, + "grad_norm": 0.9122188091278076, + "learning_rate": 4.320649416539828e-06, + "loss": 0.0075, + "step": 6693 + }, + { + "epoch": 6.11324200913242, + "grad_norm": 9.068998336791992, + "learning_rate": 4.319634703196348e-06, + "loss": 0.0588, + "step": 6694 + }, + { + "epoch": 6.114155251141552, + "grad_norm": 0.2909929156303406, + "learning_rate": 4.3186199898528665e-06, + "loss": 0.0021, + "step": 6695 + }, + { + "epoch": 6.115068493150685, + "grad_norm": 4.8979902267456055, + "learning_rate": 4.317605276509386e-06, + "loss": 0.0367, + "step": 6696 + }, + { + "epoch": 6.115981735159817, + "grad_norm": 0.6052277684211731, + "learning_rate": 4.316590563165906e-06, + "loss": 0.0042, + "step": 6697 + }, + { + "epoch": 6.11689497716895, + "grad_norm": 16.673986434936523, + "learning_rate": 4.3155758498224256e-06, + "loss": 0.083, + "step": 6698 + }, + { + "epoch": 6.117808219178082, + "grad_norm": 9.100759506225586, + "learning_rate": 4.314561136478945e-06, + "loss": 0.043, + "step": 6699 + }, + { + "epoch": 6.1187214611872145, + "grad_norm": 12.007796287536621, + "learning_rate": 4.313546423135465e-06, + "loss": 0.0906, + "step": 6700 + }, + { + "epoch": 6.119634703196347, + "grad_norm": 8.339831352233887, + "learning_rate": 4.312531709791984e-06, + "loss": 0.0691, + "step": 6701 + }, + { + "epoch": 6.120547945205479, + "grad_norm": 14.09092903137207, + "learning_rate": 4.3115169964485035e-06, + "loss": 0.1066, + "step": 6702 + }, + { + "epoch": 6.121461187214612, + "grad_norm": 86.44506072998047, + "learning_rate": 4.310502283105023e-06, + "loss": 1.0152, + "step": 6703 + }, + { + "epoch": 6.122374429223744, + "grad_norm": 0.42419788241386414, + "learning_rate": 4.309487569761543e-06, + "loss": 0.0035, + "step": 6704 + }, + { + "epoch": 6.123287671232877, + "grad_norm": 27.20566749572754, + "learning_rate": 4.3084728564180626e-06, + "loss": 0.3799, + "step": 6705 + }, + { + "epoch": 6.124200913242009, + "grad_norm": 0.07316378504037857, + "learning_rate": 4.307458143074581e-06, + "loss": 0.0004, + "step": 6706 + }, + { + "epoch": 6.125114155251142, + "grad_norm": 4.598282814025879, + "learning_rate": 4.306443429731101e-06, + "loss": 0.0485, + "step": 6707 + }, + { + "epoch": 6.126027397260274, + "grad_norm": 0.12139663100242615, + "learning_rate": 4.305428716387621e-06, + "loss": 0.0008, + "step": 6708 + }, + { + "epoch": 6.126940639269407, + "grad_norm": 1.2395603656768799, + "learning_rate": 4.3044140030441405e-06, + "loss": 0.0075, + "step": 6709 + }, + { + "epoch": 6.127853881278539, + "grad_norm": 6.064350605010986, + "learning_rate": 4.30339928970066e-06, + "loss": 0.039, + "step": 6710 + }, + { + "epoch": 6.1287671232876715, + "grad_norm": 0.7258136868476868, + "learning_rate": 4.302384576357179e-06, + "loss": 0.0044, + "step": 6711 + }, + { + "epoch": 6.129680365296804, + "grad_norm": 6.838637351989746, + "learning_rate": 4.301369863013699e-06, + "loss": 0.0357, + "step": 6712 + }, + { + "epoch": 6.130593607305936, + "grad_norm": 55.48668670654297, + "learning_rate": 4.300355149670218e-06, + "loss": 0.6178, + "step": 6713 + }, + { + "epoch": 6.131506849315069, + "grad_norm": 0.874435305595398, + "learning_rate": 4.299340436326738e-06, + "loss": 0.0047, + "step": 6714 + }, + { + "epoch": 6.132420091324201, + "grad_norm": 5.881022930145264, + "learning_rate": 4.298325722983258e-06, + "loss": 0.0331, + "step": 6715 + }, + { + "epoch": 6.133333333333334, + "grad_norm": 0.32111433148384094, + "learning_rate": 4.297311009639777e-06, + "loss": 0.002, + "step": 6716 + }, + { + "epoch": 6.134246575342466, + "grad_norm": 2.039020538330078, + "learning_rate": 4.296296296296296e-06, + "loss": 0.0109, + "step": 6717 + }, + { + "epoch": 6.135159817351598, + "grad_norm": 0.04787341505289078, + "learning_rate": 4.295281582952816e-06, + "loss": 0.0004, + "step": 6718 + }, + { + "epoch": 6.13607305936073, + "grad_norm": 11.637811660766602, + "learning_rate": 4.294266869609336e-06, + "loss": 0.0673, + "step": 6719 + }, + { + "epoch": 6.136986301369863, + "grad_norm": 4.037213325500488, + "learning_rate": 4.293252156265855e-06, + "loss": 0.0254, + "step": 6720 + }, + { + "epoch": 6.137899543378995, + "grad_norm": 47.14942932128906, + "learning_rate": 4.292237442922374e-06, + "loss": 0.6604, + "step": 6721 + }, + { + "epoch": 6.138812785388128, + "grad_norm": 38.08292007446289, + "learning_rate": 4.291222729578895e-06, + "loss": 0.3083, + "step": 6722 + }, + { + "epoch": 6.13972602739726, + "grad_norm": 8.56736946105957, + "learning_rate": 4.290208016235414e-06, + "loss": 0.0636, + "step": 6723 + }, + { + "epoch": 6.1406392694063925, + "grad_norm": 0.13822951912879944, + "learning_rate": 4.289193302891933e-06, + "loss": 0.0007, + "step": 6724 + }, + { + "epoch": 6.141552511415525, + "grad_norm": 14.944713592529297, + "learning_rate": 4.288178589548453e-06, + "loss": 0.0718, + "step": 6725 + }, + { + "epoch": 6.142465753424657, + "grad_norm": 134.31736755371094, + "learning_rate": 4.287163876204972e-06, + "loss": 2.2665, + "step": 6726 + }, + { + "epoch": 6.14337899543379, + "grad_norm": 3.158740997314453, + "learning_rate": 4.286149162861492e-06, + "loss": 0.0211, + "step": 6727 + }, + { + "epoch": 6.144292237442922, + "grad_norm": 1.090279459953308, + "learning_rate": 4.285134449518011e-06, + "loss": 0.0082, + "step": 6728 + }, + { + "epoch": 6.145205479452055, + "grad_norm": 2.4955201148986816, + "learning_rate": 4.284119736174531e-06, + "loss": 0.0232, + "step": 6729 + }, + { + "epoch": 6.146118721461187, + "grad_norm": 21.980087280273438, + "learning_rate": 4.283105022831051e-06, + "loss": 0.1387, + "step": 6730 + }, + { + "epoch": 6.14703196347032, + "grad_norm": 0.030816320329904556, + "learning_rate": 4.2820903094875695e-06, + "loss": 0.0002, + "step": 6731 + }, + { + "epoch": 6.147945205479452, + "grad_norm": 11.231053352355957, + "learning_rate": 4.28107559614409e-06, + "loss": 0.0801, + "step": 6732 + }, + { + "epoch": 6.148858447488585, + "grad_norm": 21.847591400146484, + "learning_rate": 4.280060882800609e-06, + "loss": 0.1167, + "step": 6733 + }, + { + "epoch": 6.149771689497717, + "grad_norm": 37.93674087524414, + "learning_rate": 4.2790461694571285e-06, + "loss": 0.1133, + "step": 6734 + }, + { + "epoch": 6.1506849315068495, + "grad_norm": 5.278824329376221, + "learning_rate": 4.278031456113648e-06, + "loss": 0.0466, + "step": 6735 + }, + { + "epoch": 6.151598173515982, + "grad_norm": 50.96424102783203, + "learning_rate": 4.277016742770168e-06, + "loss": 0.6689, + "step": 6736 + }, + { + "epoch": 6.152511415525114, + "grad_norm": 2.464127540588379, + "learning_rate": 4.276002029426688e-06, + "loss": 0.0134, + "step": 6737 + }, + { + "epoch": 6.153424657534247, + "grad_norm": 0.7455998063087463, + "learning_rate": 4.2749873160832065e-06, + "loss": 0.0068, + "step": 6738 + }, + { + "epoch": 6.154337899543379, + "grad_norm": 0.23651817440986633, + "learning_rate": 4.273972602739727e-06, + "loss": 0.0016, + "step": 6739 + }, + { + "epoch": 6.155251141552512, + "grad_norm": 10.124393463134766, + "learning_rate": 4.272957889396246e-06, + "loss": 0.0571, + "step": 6740 + }, + { + "epoch": 6.156164383561644, + "grad_norm": 0.32814931869506836, + "learning_rate": 4.2719431760527655e-06, + "loss": 0.002, + "step": 6741 + }, + { + "epoch": 6.157077625570777, + "grad_norm": 0.27434083819389343, + "learning_rate": 4.270928462709285e-06, + "loss": 0.0023, + "step": 6742 + }, + { + "epoch": 6.157990867579909, + "grad_norm": 0.2224847376346588, + "learning_rate": 4.269913749365804e-06, + "loss": 0.0018, + "step": 6743 + }, + { + "epoch": 6.1589041095890416, + "grad_norm": 41.70210266113281, + "learning_rate": 4.268899036022325e-06, + "loss": 0.5226, + "step": 6744 + }, + { + "epoch": 6.159817351598173, + "grad_norm": 38.243865966796875, + "learning_rate": 4.2678843226788435e-06, + "loss": 0.3704, + "step": 6745 + }, + { + "epoch": 6.160730593607306, + "grad_norm": 30.348649978637695, + "learning_rate": 4.266869609335363e-06, + "loss": 0.2882, + "step": 6746 + }, + { + "epoch": 6.161643835616438, + "grad_norm": 0.34953680634498596, + "learning_rate": 4.265854895991883e-06, + "loss": 0.0035, + "step": 6747 + }, + { + "epoch": 6.1625570776255705, + "grad_norm": 36.508766174316406, + "learning_rate": 4.264840182648402e-06, + "loss": 0.1542, + "step": 6748 + }, + { + "epoch": 6.163470319634703, + "grad_norm": 2.1917800903320312, + "learning_rate": 4.263825469304922e-06, + "loss": 0.0161, + "step": 6749 + }, + { + "epoch": 6.164383561643835, + "grad_norm": 0.3423321545124054, + "learning_rate": 4.262810755961441e-06, + "loss": 0.0027, + "step": 6750 + }, + { + "epoch": 6.165296803652968, + "grad_norm": 67.32290649414062, + "learning_rate": 4.261796042617961e-06, + "loss": 0.5966, + "step": 6751 + }, + { + "epoch": 6.1662100456621, + "grad_norm": 3.0305216312408447, + "learning_rate": 4.2607813292744805e-06, + "loss": 0.0238, + "step": 6752 + }, + { + "epoch": 6.167123287671233, + "grad_norm": 0.7407518029212952, + "learning_rate": 4.259766615930999e-06, + "loss": 0.0066, + "step": 6753 + }, + { + "epoch": 6.168036529680365, + "grad_norm": 23.4346981048584, + "learning_rate": 4.25875190258752e-06, + "loss": 0.1847, + "step": 6754 + }, + { + "epoch": 6.168949771689498, + "grad_norm": 0.5035897493362427, + "learning_rate": 4.257737189244039e-06, + "loss": 0.0025, + "step": 6755 + }, + { + "epoch": 6.16986301369863, + "grad_norm": 2.547668695449829, + "learning_rate": 4.256722475900558e-06, + "loss": 0.0172, + "step": 6756 + }, + { + "epoch": 6.170776255707763, + "grad_norm": 0.3659752309322357, + "learning_rate": 4.255707762557078e-06, + "loss": 0.0023, + "step": 6757 + }, + { + "epoch": 6.171689497716895, + "grad_norm": 0.25381702184677124, + "learning_rate": 4.254693049213598e-06, + "loss": 0.0014, + "step": 6758 + }, + { + "epoch": 6.1726027397260275, + "grad_norm": 7.9652099609375, + "learning_rate": 4.2536783358701175e-06, + "loss": 0.0427, + "step": 6759 + }, + { + "epoch": 6.17351598173516, + "grad_norm": 4.849400997161865, + "learning_rate": 4.252663622526636e-06, + "loss": 0.0306, + "step": 6760 + }, + { + "epoch": 6.174429223744292, + "grad_norm": 1.0798081159591675, + "learning_rate": 4.251648909183156e-06, + "loss": 0.0099, + "step": 6761 + }, + { + "epoch": 6.175342465753425, + "grad_norm": 1.5861057043075562, + "learning_rate": 4.250634195839676e-06, + "loss": 0.01, + "step": 6762 + }, + { + "epoch": 6.176255707762557, + "grad_norm": 0.9497644305229187, + "learning_rate": 4.249619482496195e-06, + "loss": 0.0066, + "step": 6763 + }, + { + "epoch": 6.17716894977169, + "grad_norm": 51.560264587402344, + "learning_rate": 4.248604769152715e-06, + "loss": 0.4791, + "step": 6764 + }, + { + "epoch": 6.178082191780822, + "grad_norm": 0.7134747505187988, + "learning_rate": 4.247590055809234e-06, + "loss": 0.0057, + "step": 6765 + }, + { + "epoch": 6.178995433789955, + "grad_norm": 2.9880497455596924, + "learning_rate": 4.246575342465754e-06, + "loss": 0.0167, + "step": 6766 + }, + { + "epoch": 6.179908675799087, + "grad_norm": 0.3073572516441345, + "learning_rate": 4.245560629122273e-06, + "loss": 0.0026, + "step": 6767 + }, + { + "epoch": 6.1808219178082195, + "grad_norm": 1.9181276559829712, + "learning_rate": 4.244545915778793e-06, + "loss": 0.0158, + "step": 6768 + }, + { + "epoch": 6.181735159817352, + "grad_norm": 0.9030181169509888, + "learning_rate": 4.243531202435313e-06, + "loss": 0.0063, + "step": 6769 + }, + { + "epoch": 6.182648401826484, + "grad_norm": 0.762498140335083, + "learning_rate": 4.2425164890918315e-06, + "loss": 0.0065, + "step": 6770 + }, + { + "epoch": 6.183561643835616, + "grad_norm": 4.887725830078125, + "learning_rate": 4.241501775748351e-06, + "loss": 0.0325, + "step": 6771 + }, + { + "epoch": 6.1844748858447485, + "grad_norm": 0.8369694948196411, + "learning_rate": 4.240487062404871e-06, + "loss": 0.0063, + "step": 6772 + }, + { + "epoch": 6.185388127853881, + "grad_norm": 0.5551093816757202, + "learning_rate": 4.239472349061391e-06, + "loss": 0.0034, + "step": 6773 + }, + { + "epoch": 6.186301369863013, + "grad_norm": 51.49940872192383, + "learning_rate": 4.23845763571791e-06, + "loss": 0.6103, + "step": 6774 + }, + { + "epoch": 6.187214611872146, + "grad_norm": 0.2190358191728592, + "learning_rate": 4.237442922374429e-06, + "loss": 0.0013, + "step": 6775 + }, + { + "epoch": 6.188127853881278, + "grad_norm": 90.49063873291016, + "learning_rate": 4.236428209030949e-06, + "loss": 1.6332, + "step": 6776 + }, + { + "epoch": 6.189041095890411, + "grad_norm": 0.9787750840187073, + "learning_rate": 4.2354134956874685e-06, + "loss": 0.0081, + "step": 6777 + }, + { + "epoch": 6.189954337899543, + "grad_norm": 3.0555248260498047, + "learning_rate": 4.234398782343988e-06, + "loss": 0.0269, + "step": 6778 + }, + { + "epoch": 6.190867579908676, + "grad_norm": 0.11568213999271393, + "learning_rate": 4.233384069000508e-06, + "loss": 0.0008, + "step": 6779 + }, + { + "epoch": 6.191780821917808, + "grad_norm": 4.366349220275879, + "learning_rate": 4.232369355657028e-06, + "loss": 0.0348, + "step": 6780 + }, + { + "epoch": 6.1926940639269406, + "grad_norm": 0.9069395661354065, + "learning_rate": 4.2313546423135464e-06, + "loss": 0.0045, + "step": 6781 + }, + { + "epoch": 6.193607305936073, + "grad_norm": 3.863848924636841, + "learning_rate": 4.230339928970066e-06, + "loss": 0.0327, + "step": 6782 + }, + { + "epoch": 6.1945205479452055, + "grad_norm": 0.8613216280937195, + "learning_rate": 4.229325215626586e-06, + "loss": 0.0056, + "step": 6783 + }, + { + "epoch": 6.195433789954338, + "grad_norm": 155.9404296875, + "learning_rate": 4.2283105022831055e-06, + "loss": 1.4475, + "step": 6784 + }, + { + "epoch": 6.19634703196347, + "grad_norm": 1.158347725868225, + "learning_rate": 4.227295788939625e-06, + "loss": 0.0102, + "step": 6785 + }, + { + "epoch": 6.197260273972603, + "grad_norm": 5.03058385848999, + "learning_rate": 4.226281075596144e-06, + "loss": 0.0316, + "step": 6786 + }, + { + "epoch": 6.198173515981735, + "grad_norm": 14.873600959777832, + "learning_rate": 4.225266362252664e-06, + "loss": 0.1251, + "step": 6787 + }, + { + "epoch": 6.199086757990868, + "grad_norm": 34.4898681640625, + "learning_rate": 4.2242516489091834e-06, + "loss": 0.2754, + "step": 6788 + }, + { + "epoch": 6.2, + "grad_norm": 0.22055315971374512, + "learning_rate": 4.223236935565703e-06, + "loss": 0.0019, + "step": 6789 + }, + { + "epoch": 6.200913242009133, + "grad_norm": 6.29018497467041, + "learning_rate": 4.222222222222223e-06, + "loss": 0.0351, + "step": 6790 + }, + { + "epoch": 6.201826484018265, + "grad_norm": 4.663547515869141, + "learning_rate": 4.221207508878742e-06, + "loss": 0.0309, + "step": 6791 + }, + { + "epoch": 6.2027397260273975, + "grad_norm": 0.3967207670211792, + "learning_rate": 4.220192795535261e-06, + "loss": 0.0026, + "step": 6792 + }, + { + "epoch": 6.20365296803653, + "grad_norm": 0.06524629890918732, + "learning_rate": 4.219178082191781e-06, + "loss": 0.0005, + "step": 6793 + }, + { + "epoch": 6.2045662100456624, + "grad_norm": 49.77555465698242, + "learning_rate": 4.218163368848301e-06, + "loss": 0.4715, + "step": 6794 + }, + { + "epoch": 6.205479452054795, + "grad_norm": 0.16926135122776031, + "learning_rate": 4.2171486555048204e-06, + "loss": 0.001, + "step": 6795 + }, + { + "epoch": 6.206392694063927, + "grad_norm": 7.460196018218994, + "learning_rate": 4.216133942161339e-06, + "loss": 0.0719, + "step": 6796 + }, + { + "epoch": 6.207305936073059, + "grad_norm": 50.79012680053711, + "learning_rate": 4.215119228817859e-06, + "loss": 0.673, + "step": 6797 + }, + { + "epoch": 6.208219178082191, + "grad_norm": 3.013082504272461, + "learning_rate": 4.214104515474379e-06, + "loss": 0.0241, + "step": 6798 + }, + { + "epoch": 6.209132420091324, + "grad_norm": 4.221345901489258, + "learning_rate": 4.213089802130898e-06, + "loss": 0.0292, + "step": 6799 + }, + { + "epoch": 6.210045662100456, + "grad_norm": 0.26605477929115295, + "learning_rate": 4.212075088787418e-06, + "loss": 0.001, + "step": 6800 + }, + { + "epoch": 6.210958904109589, + "grad_norm": 3.518666982650757, + "learning_rate": 4.211060375443937e-06, + "loss": 0.03, + "step": 6801 + }, + { + "epoch": 6.211872146118721, + "grad_norm": 0.11578772217035294, + "learning_rate": 4.2100456621004574e-06, + "loss": 0.0007, + "step": 6802 + }, + { + "epoch": 6.212785388127854, + "grad_norm": 3.5380895137786865, + "learning_rate": 4.209030948756976e-06, + "loss": 0.0165, + "step": 6803 + }, + { + "epoch": 6.213698630136986, + "grad_norm": 1.0557034015655518, + "learning_rate": 4.208016235413496e-06, + "loss": 0.0073, + "step": 6804 + }, + { + "epoch": 6.2146118721461185, + "grad_norm": 0.9669680595397949, + "learning_rate": 4.207001522070016e-06, + "loss": 0.0092, + "step": 6805 + }, + { + "epoch": 6.215525114155251, + "grad_norm": 0.449761301279068, + "learning_rate": 4.2059868087265345e-06, + "loss": 0.0029, + "step": 6806 + }, + { + "epoch": 6.2164383561643834, + "grad_norm": 0.7613855600357056, + "learning_rate": 4.204972095383055e-06, + "loss": 0.0052, + "step": 6807 + }, + { + "epoch": 6.217351598173516, + "grad_norm": 1.319640874862671, + "learning_rate": 4.203957382039574e-06, + "loss": 0.0091, + "step": 6808 + }, + { + "epoch": 6.218264840182648, + "grad_norm": 1.0336737632751465, + "learning_rate": 4.202942668696094e-06, + "loss": 0.0076, + "step": 6809 + }, + { + "epoch": 6.219178082191781, + "grad_norm": 34.77851486206055, + "learning_rate": 4.201927955352613e-06, + "loss": 0.2701, + "step": 6810 + }, + { + "epoch": 6.220091324200913, + "grad_norm": 2.074542284011841, + "learning_rate": 4.200913242009132e-06, + "loss": 0.0115, + "step": 6811 + }, + { + "epoch": 6.221004566210046, + "grad_norm": 6.5493693351745605, + "learning_rate": 4.199898528665653e-06, + "loss": 0.0449, + "step": 6812 + }, + { + "epoch": 6.221917808219178, + "grad_norm": 1.8299260139465332, + "learning_rate": 4.1988838153221715e-06, + "loss": 0.0137, + "step": 6813 + }, + { + "epoch": 6.222831050228311, + "grad_norm": 0.010028359480202198, + "learning_rate": 4.197869101978691e-06, + "loss": 0.0001, + "step": 6814 + }, + { + "epoch": 6.223744292237443, + "grad_norm": 2.259125232696533, + "learning_rate": 4.196854388635211e-06, + "loss": 0.0084, + "step": 6815 + }, + { + "epoch": 6.2246575342465755, + "grad_norm": 0.5780997276306152, + "learning_rate": 4.195839675291731e-06, + "loss": 0.0044, + "step": 6816 + }, + { + "epoch": 6.225570776255708, + "grad_norm": 9.22189998626709, + "learning_rate": 4.19482496194825e-06, + "loss": 0.0629, + "step": 6817 + }, + { + "epoch": 6.22648401826484, + "grad_norm": 0.20073117315769196, + "learning_rate": 4.193810248604769e-06, + "loss": 0.001, + "step": 6818 + }, + { + "epoch": 6.227397260273973, + "grad_norm": 38.872623443603516, + "learning_rate": 4.192795535261289e-06, + "loss": 0.357, + "step": 6819 + }, + { + "epoch": 6.228310502283105, + "grad_norm": 0.08050902932882309, + "learning_rate": 4.1917808219178085e-06, + "loss": 0.0008, + "step": 6820 + }, + { + "epoch": 6.229223744292238, + "grad_norm": 0.14089365303516388, + "learning_rate": 4.190766108574328e-06, + "loss": 0.0012, + "step": 6821 + }, + { + "epoch": 6.23013698630137, + "grad_norm": 5.146953105926514, + "learning_rate": 4.189751395230848e-06, + "loss": 0.0452, + "step": 6822 + }, + { + "epoch": 6.231050228310503, + "grad_norm": 0.7448309063911438, + "learning_rate": 4.188736681887367e-06, + "loss": 0.0037, + "step": 6823 + }, + { + "epoch": 6.231963470319634, + "grad_norm": 4.391191005706787, + "learning_rate": 4.187721968543887e-06, + "loss": 0.0285, + "step": 6824 + }, + { + "epoch": 6.232876712328767, + "grad_norm": 0.8665407299995422, + "learning_rate": 4.186707255200406e-06, + "loss": 0.0067, + "step": 6825 + }, + { + "epoch": 6.233789954337899, + "grad_norm": 2.3906753063201904, + "learning_rate": 4.185692541856926e-06, + "loss": 0.0142, + "step": 6826 + }, + { + "epoch": 6.234703196347032, + "grad_norm": 14.465858459472656, + "learning_rate": 4.1846778285134455e-06, + "loss": 0.124, + "step": 6827 + }, + { + "epoch": 6.235616438356164, + "grad_norm": 4.396858215332031, + "learning_rate": 4.183663115169964e-06, + "loss": 0.0236, + "step": 6828 + }, + { + "epoch": 6.2365296803652965, + "grad_norm": 0.2993071377277374, + "learning_rate": 4.182648401826485e-06, + "loss": 0.0028, + "step": 6829 + }, + { + "epoch": 6.237442922374429, + "grad_norm": 3.5830514430999756, + "learning_rate": 4.181633688483004e-06, + "loss": 0.0319, + "step": 6830 + }, + { + "epoch": 6.238356164383561, + "grad_norm": 115.00516510009766, + "learning_rate": 4.1806189751395234e-06, + "loss": 1.0483, + "step": 6831 + }, + { + "epoch": 6.239269406392694, + "grad_norm": 1.6589921712875366, + "learning_rate": 4.179604261796043e-06, + "loss": 0.0119, + "step": 6832 + }, + { + "epoch": 6.240182648401826, + "grad_norm": 4.720387935638428, + "learning_rate": 4.178589548452562e-06, + "loss": 0.0247, + "step": 6833 + }, + { + "epoch": 6.241095890410959, + "grad_norm": 0.0532551109790802, + "learning_rate": 4.1775748351090825e-06, + "loss": 0.0003, + "step": 6834 + }, + { + "epoch": 6.242009132420091, + "grad_norm": 26.64932632446289, + "learning_rate": 4.176560121765601e-06, + "loss": 0.2614, + "step": 6835 + }, + { + "epoch": 6.242922374429224, + "grad_norm": 15.396989822387695, + "learning_rate": 4.175545408422121e-06, + "loss": 0.111, + "step": 6836 + }, + { + "epoch": 6.243835616438356, + "grad_norm": 0.05635911971330643, + "learning_rate": 4.174530695078641e-06, + "loss": 0.0003, + "step": 6837 + }, + { + "epoch": 6.244748858447489, + "grad_norm": 8.070173263549805, + "learning_rate": 4.1735159817351604e-06, + "loss": 0.0712, + "step": 6838 + }, + { + "epoch": 6.245662100456621, + "grad_norm": 70.60499572753906, + "learning_rate": 4.17250126839168e-06, + "loss": 0.9731, + "step": 6839 + }, + { + "epoch": 6.2465753424657535, + "grad_norm": 0.39983439445495605, + "learning_rate": 4.171486555048199e-06, + "loss": 0.0034, + "step": 6840 + }, + { + "epoch": 6.247488584474886, + "grad_norm": 0.4785703122615814, + "learning_rate": 4.170471841704719e-06, + "loss": 0.0028, + "step": 6841 + }, + { + "epoch": 6.248401826484018, + "grad_norm": 6.735509872436523, + "learning_rate": 4.169457128361238e-06, + "loss": 0.0518, + "step": 6842 + }, + { + "epoch": 6.249315068493151, + "grad_norm": 1.3615732192993164, + "learning_rate": 4.168442415017758e-06, + "loss": 0.0078, + "step": 6843 + }, + { + "epoch": 6.250228310502283, + "grad_norm": 0.15716016292572021, + "learning_rate": 4.167427701674278e-06, + "loss": 0.0011, + "step": 6844 + }, + { + "epoch": 6.251141552511416, + "grad_norm": 2.638049602508545, + "learning_rate": 4.166412988330797e-06, + "loss": 0.0165, + "step": 6845 + }, + { + "epoch": 6.252054794520548, + "grad_norm": 0.818301260471344, + "learning_rate": 4.165398274987316e-06, + "loss": 0.0047, + "step": 6846 + }, + { + "epoch": 6.252968036529681, + "grad_norm": 2.2019121646881104, + "learning_rate": 4.164383561643836e-06, + "loss": 0.0148, + "step": 6847 + }, + { + "epoch": 6.253881278538813, + "grad_norm": 1.995864987373352, + "learning_rate": 4.163368848300356e-06, + "loss": 0.0148, + "step": 6848 + }, + { + "epoch": 6.254794520547946, + "grad_norm": 1.3089330196380615, + "learning_rate": 4.162354134956875e-06, + "loss": 0.0074, + "step": 6849 + }, + { + "epoch": 6.255707762557078, + "grad_norm": 5.096553802490234, + "learning_rate": 4.161339421613394e-06, + "loss": 0.0419, + "step": 6850 + }, + { + "epoch": 6.25662100456621, + "grad_norm": 40.1860466003418, + "learning_rate": 4.160324708269914e-06, + "loss": 0.5305, + "step": 6851 + }, + { + "epoch": 6.257534246575342, + "grad_norm": 3.3563969135284424, + "learning_rate": 4.1593099949264336e-06, + "loss": 0.0293, + "step": 6852 + }, + { + "epoch": 6.2584474885844745, + "grad_norm": 7.152345657348633, + "learning_rate": 4.158295281582953e-06, + "loss": 0.0435, + "step": 6853 + }, + { + "epoch": 6.259360730593607, + "grad_norm": 9.772980690002441, + "learning_rate": 4.157280568239473e-06, + "loss": 0.0559, + "step": 6854 + }, + { + "epoch": 6.260273972602739, + "grad_norm": 0.2133939266204834, + "learning_rate": 4.156265854895992e-06, + "loss": 0.0011, + "step": 6855 + }, + { + "epoch": 6.261187214611872, + "grad_norm": 0.12054303288459778, + "learning_rate": 4.1552511415525115e-06, + "loss": 0.0007, + "step": 6856 + }, + { + "epoch": 6.262100456621004, + "grad_norm": 7.097994804382324, + "learning_rate": 4.154236428209031e-06, + "loss": 0.0743, + "step": 6857 + }, + { + "epoch": 6.263013698630137, + "grad_norm": 0.3231601417064667, + "learning_rate": 4.153221714865551e-06, + "loss": 0.0023, + "step": 6858 + }, + { + "epoch": 6.263926940639269, + "grad_norm": 32.179359436035156, + "learning_rate": 4.1522070015220706e-06, + "loss": 0.3623, + "step": 6859 + }, + { + "epoch": 6.264840182648402, + "grad_norm": 3.397921085357666, + "learning_rate": 4.15119228817859e-06, + "loss": 0.0274, + "step": 6860 + }, + { + "epoch": 6.265753424657534, + "grad_norm": 4.553414344787598, + "learning_rate": 4.150177574835109e-06, + "loss": 0.0303, + "step": 6861 + }, + { + "epoch": 6.266666666666667, + "grad_norm": 1.0808736085891724, + "learning_rate": 4.149162861491629e-06, + "loss": 0.0071, + "step": 6862 + }, + { + "epoch": 6.267579908675799, + "grad_norm": 2.1121110916137695, + "learning_rate": 4.1481481481481485e-06, + "loss": 0.0123, + "step": 6863 + }, + { + "epoch": 6.2684931506849315, + "grad_norm": 0.024282388389110565, + "learning_rate": 4.147133434804668e-06, + "loss": 0.0001, + "step": 6864 + }, + { + "epoch": 6.269406392694064, + "grad_norm": 19.409685134887695, + "learning_rate": 4.146118721461188e-06, + "loss": 0.1695, + "step": 6865 + }, + { + "epoch": 6.270319634703196, + "grad_norm": 5.97842264175415, + "learning_rate": 4.145104008117707e-06, + "loss": 0.0418, + "step": 6866 + }, + { + "epoch": 6.271232876712329, + "grad_norm": 0.521831214427948, + "learning_rate": 4.144089294774226e-06, + "loss": 0.0035, + "step": 6867 + }, + { + "epoch": 6.272146118721461, + "grad_norm": 40.27757263183594, + "learning_rate": 4.143074581430746e-06, + "loss": 0.3546, + "step": 6868 + }, + { + "epoch": 6.273059360730594, + "grad_norm": 0.4246051609516144, + "learning_rate": 4.142059868087266e-06, + "loss": 0.003, + "step": 6869 + }, + { + "epoch": 6.273972602739726, + "grad_norm": 5.0862202644348145, + "learning_rate": 4.1410451547437855e-06, + "loss": 0.0222, + "step": 6870 + }, + { + "epoch": 6.274885844748859, + "grad_norm": 11.864727973937988, + "learning_rate": 4.140030441400304e-06, + "loss": 0.08, + "step": 6871 + }, + { + "epoch": 6.275799086757991, + "grad_norm": 12.471089363098145, + "learning_rate": 4.139015728056824e-06, + "loss": 0.0933, + "step": 6872 + }, + { + "epoch": 6.276712328767124, + "grad_norm": 75.60482025146484, + "learning_rate": 4.138001014713344e-06, + "loss": 1.4411, + "step": 6873 + }, + { + "epoch": 6.277625570776256, + "grad_norm": 5.5647687911987305, + "learning_rate": 4.136986301369863e-06, + "loss": 0.0344, + "step": 6874 + }, + { + "epoch": 6.2785388127853885, + "grad_norm": 8.801070213317871, + "learning_rate": 4.135971588026383e-06, + "loss": 0.0518, + "step": 6875 + }, + { + "epoch": 6.279452054794521, + "grad_norm": 1.889267086982727, + "learning_rate": 4.134956874682902e-06, + "loss": 0.0113, + "step": 6876 + }, + { + "epoch": 6.280365296803653, + "grad_norm": 3.6078171730041504, + "learning_rate": 4.133942161339422e-06, + "loss": 0.0233, + "step": 6877 + }, + { + "epoch": 6.281278538812785, + "grad_norm": 4.8488545417785645, + "learning_rate": 4.132927447995941e-06, + "loss": 0.0399, + "step": 6878 + }, + { + "epoch": 6.282191780821917, + "grad_norm": 2.0833189487457275, + "learning_rate": 4.131912734652461e-06, + "loss": 0.0142, + "step": 6879 + }, + { + "epoch": 6.28310502283105, + "grad_norm": 8.964244842529297, + "learning_rate": 4.130898021308981e-06, + "loss": 0.0529, + "step": 6880 + }, + { + "epoch": 6.284018264840182, + "grad_norm": 0.24755166471004486, + "learning_rate": 4.1298833079654996e-06, + "loss": 0.0018, + "step": 6881 + }, + { + "epoch": 6.284931506849315, + "grad_norm": 4.726972579956055, + "learning_rate": 4.12886859462202e-06, + "loss": 0.0328, + "step": 6882 + }, + { + "epoch": 6.285844748858447, + "grad_norm": 2.437314510345459, + "learning_rate": 4.127853881278539e-06, + "loss": 0.0164, + "step": 6883 + }, + { + "epoch": 6.28675799086758, + "grad_norm": 2.446936845779419, + "learning_rate": 4.126839167935059e-06, + "loss": 0.0188, + "step": 6884 + }, + { + "epoch": 6.287671232876712, + "grad_norm": 64.4734115600586, + "learning_rate": 4.125824454591578e-06, + "loss": 0.9181, + "step": 6885 + }, + { + "epoch": 6.288584474885845, + "grad_norm": 1.7170573472976685, + "learning_rate": 4.124809741248097e-06, + "loss": 0.0112, + "step": 6886 + }, + { + "epoch": 6.289497716894977, + "grad_norm": 2.104701280593872, + "learning_rate": 4.123795027904618e-06, + "loss": 0.0163, + "step": 6887 + }, + { + "epoch": 6.2904109589041095, + "grad_norm": 6.87517786026001, + "learning_rate": 4.1227803145611366e-06, + "loss": 0.0318, + "step": 6888 + }, + { + "epoch": 6.291324200913242, + "grad_norm": 0.201304629445076, + "learning_rate": 4.121765601217656e-06, + "loss": 0.0017, + "step": 6889 + }, + { + "epoch": 6.292237442922374, + "grad_norm": 1.8029893636703491, + "learning_rate": 4.120750887874176e-06, + "loss": 0.0125, + "step": 6890 + }, + { + "epoch": 6.293150684931507, + "grad_norm": 10.818290710449219, + "learning_rate": 4.119736174530695e-06, + "loss": 0.0424, + "step": 6891 + }, + { + "epoch": 6.294063926940639, + "grad_norm": 9.234703063964844, + "learning_rate": 4.118721461187215e-06, + "loss": 0.0408, + "step": 6892 + }, + { + "epoch": 6.294977168949772, + "grad_norm": 3.2015206813812256, + "learning_rate": 4.117706747843734e-06, + "loss": 0.0206, + "step": 6893 + }, + { + "epoch": 6.295890410958904, + "grad_norm": 20.626699447631836, + "learning_rate": 4.116692034500254e-06, + "loss": 0.1862, + "step": 6894 + }, + { + "epoch": 6.296803652968037, + "grad_norm": 10.50836181640625, + "learning_rate": 4.1156773211567736e-06, + "loss": 0.0794, + "step": 6895 + }, + { + "epoch": 6.297716894977169, + "grad_norm": 11.515361785888672, + "learning_rate": 4.114662607813293e-06, + "loss": 0.0591, + "step": 6896 + }, + { + "epoch": 6.298630136986302, + "grad_norm": 4.9814887046813965, + "learning_rate": 4.113647894469813e-06, + "loss": 0.0419, + "step": 6897 + }, + { + "epoch": 6.299543378995434, + "grad_norm": 7.2587571144104, + "learning_rate": 4.112633181126332e-06, + "loss": 0.0444, + "step": 6898 + }, + { + "epoch": 6.3004566210045665, + "grad_norm": 10.203746795654297, + "learning_rate": 4.1116184677828515e-06, + "loss": 0.0925, + "step": 6899 + }, + { + "epoch": 6.301369863013699, + "grad_norm": 10.570732116699219, + "learning_rate": 4.110603754439371e-06, + "loss": 0.0534, + "step": 6900 + }, + { + "epoch": 6.302283105022831, + "grad_norm": 0.5685854554176331, + "learning_rate": 4.109589041095891e-06, + "loss": 0.0038, + "step": 6901 + }, + { + "epoch": 6.303196347031964, + "grad_norm": 0.21995890140533447, + "learning_rate": 4.1085743277524106e-06, + "loss": 0.0016, + "step": 6902 + }, + { + "epoch": 6.304109589041096, + "grad_norm": 0.15097714960575104, + "learning_rate": 4.107559614408929e-06, + "loss": 0.0009, + "step": 6903 + }, + { + "epoch": 6.305022831050229, + "grad_norm": 0.22437557578086853, + "learning_rate": 4.10654490106545e-06, + "loss": 0.0011, + "step": 6904 + }, + { + "epoch": 6.30593607305936, + "grad_norm": 7.371106147766113, + "learning_rate": 4.105530187721969e-06, + "loss": 0.0449, + "step": 6905 + }, + { + "epoch": 6.306849315068493, + "grad_norm": 2.3747427463531494, + "learning_rate": 4.1045154743784885e-06, + "loss": 0.0182, + "step": 6906 + }, + { + "epoch": 6.307762557077625, + "grad_norm": 4.229263782501221, + "learning_rate": 4.103500761035008e-06, + "loss": 0.0329, + "step": 6907 + }, + { + "epoch": 6.308675799086758, + "grad_norm": 38.65958786010742, + "learning_rate": 4.102486047691527e-06, + "loss": 0.417, + "step": 6908 + }, + { + "epoch": 6.30958904109589, + "grad_norm": 19.114906311035156, + "learning_rate": 4.1014713343480476e-06, + "loss": 0.0936, + "step": 6909 + }, + { + "epoch": 6.310502283105023, + "grad_norm": 1.5376769304275513, + "learning_rate": 4.100456621004566e-06, + "loss": 0.0108, + "step": 6910 + }, + { + "epoch": 6.311415525114155, + "grad_norm": 2.2674925327301025, + "learning_rate": 4.099441907661086e-06, + "loss": 0.0145, + "step": 6911 + }, + { + "epoch": 6.3123287671232875, + "grad_norm": 0.9741470813751221, + "learning_rate": 4.098427194317606e-06, + "loss": 0.0057, + "step": 6912 + }, + { + "epoch": 6.31324200913242, + "grad_norm": 3.6654231548309326, + "learning_rate": 4.097412480974125e-06, + "loss": 0.0191, + "step": 6913 + }, + { + "epoch": 6.314155251141552, + "grad_norm": 0.009269741363823414, + "learning_rate": 4.096397767630645e-06, + "loss": 0.0001, + "step": 6914 + }, + { + "epoch": 6.315068493150685, + "grad_norm": 2.4165802001953125, + "learning_rate": 4.095383054287164e-06, + "loss": 0.0127, + "step": 6915 + }, + { + "epoch": 6.315981735159817, + "grad_norm": 11.214003562927246, + "learning_rate": 4.094368340943684e-06, + "loss": 0.0769, + "step": 6916 + }, + { + "epoch": 6.31689497716895, + "grad_norm": 4.967249393463135, + "learning_rate": 4.093353627600203e-06, + "loss": 0.0376, + "step": 6917 + }, + { + "epoch": 6.317808219178082, + "grad_norm": 4.068699359893799, + "learning_rate": 4.092338914256723e-06, + "loss": 0.0326, + "step": 6918 + }, + { + "epoch": 6.318721461187215, + "grad_norm": 1.7617791891098022, + "learning_rate": 4.091324200913243e-06, + "loss": 0.0145, + "step": 6919 + }, + { + "epoch": 6.319634703196347, + "grad_norm": 0.8260309100151062, + "learning_rate": 4.090309487569762e-06, + "loss": 0.0064, + "step": 6920 + }, + { + "epoch": 6.32054794520548, + "grad_norm": 15.123785972595215, + "learning_rate": 4.089294774226281e-06, + "loss": 0.0929, + "step": 6921 + }, + { + "epoch": 6.321461187214612, + "grad_norm": 12.132643699645996, + "learning_rate": 4.088280060882801e-06, + "loss": 0.0858, + "step": 6922 + }, + { + "epoch": 6.3223744292237445, + "grad_norm": 26.550630569458008, + "learning_rate": 4.087265347539321e-06, + "loss": 0.3279, + "step": 6923 + }, + { + "epoch": 6.323287671232877, + "grad_norm": 0.27123183012008667, + "learning_rate": 4.08625063419584e-06, + "loss": 0.0016, + "step": 6924 + }, + { + "epoch": 6.324200913242009, + "grad_norm": 13.42725658416748, + "learning_rate": 4.085235920852359e-06, + "loss": 0.1125, + "step": 6925 + }, + { + "epoch": 6.325114155251142, + "grad_norm": 0.22129760682582855, + "learning_rate": 4.084221207508879e-06, + "loss": 0.001, + "step": 6926 + }, + { + "epoch": 6.326027397260274, + "grad_norm": 3.256683826446533, + "learning_rate": 4.083206494165399e-06, + "loss": 0.0241, + "step": 6927 + }, + { + "epoch": 6.326940639269407, + "grad_norm": 0.40151116251945496, + "learning_rate": 4.082191780821918e-06, + "loss": 0.0028, + "step": 6928 + }, + { + "epoch": 6.327853881278539, + "grad_norm": 29.385190963745117, + "learning_rate": 4.081177067478438e-06, + "loss": 0.1903, + "step": 6929 + }, + { + "epoch": 6.328767123287671, + "grad_norm": 6.08815336227417, + "learning_rate": 4.080162354134957e-06, + "loss": 0.0315, + "step": 6930 + }, + { + "epoch": 6.329680365296804, + "grad_norm": 0.344466894865036, + "learning_rate": 4.0791476407914765e-06, + "loss": 0.0017, + "step": 6931 + }, + { + "epoch": 6.330593607305936, + "grad_norm": 37.336673736572266, + "learning_rate": 4.078132927447996e-06, + "loss": 0.4255, + "step": 6932 + }, + { + "epoch": 6.331506849315068, + "grad_norm": 9.971691131591797, + "learning_rate": 4.077118214104516e-06, + "loss": 0.0834, + "step": 6933 + }, + { + "epoch": 6.332420091324201, + "grad_norm": 0.5800681114196777, + "learning_rate": 4.076103500761036e-06, + "loss": 0.0047, + "step": 6934 + }, + { + "epoch": 6.333333333333333, + "grad_norm": 63.4232177734375, + "learning_rate": 4.0750887874175545e-06, + "loss": 0.5134, + "step": 6935 + }, + { + "epoch": 6.3342465753424655, + "grad_norm": 7.561026573181152, + "learning_rate": 4.074074074074074e-06, + "loss": 0.0603, + "step": 6936 + }, + { + "epoch": 6.335159817351598, + "grad_norm": 3.4790165424346924, + "learning_rate": 4.073059360730594e-06, + "loss": 0.0294, + "step": 6937 + }, + { + "epoch": 6.33607305936073, + "grad_norm": 0.49294236302375793, + "learning_rate": 4.0720446473871135e-06, + "loss": 0.0044, + "step": 6938 + }, + { + "epoch": 6.336986301369863, + "grad_norm": 0.08391166478395462, + "learning_rate": 4.071029934043633e-06, + "loss": 0.0006, + "step": 6939 + }, + { + "epoch": 6.337899543378995, + "grad_norm": 11.453744888305664, + "learning_rate": 4.070015220700153e-06, + "loss": 0.0887, + "step": 6940 + }, + { + "epoch": 6.338812785388128, + "grad_norm": 3.2835161685943604, + "learning_rate": 4.069000507356672e-06, + "loss": 0.0214, + "step": 6941 + }, + { + "epoch": 6.33972602739726, + "grad_norm": 2.854081869125366, + "learning_rate": 4.0679857940131915e-06, + "loss": 0.0233, + "step": 6942 + }, + { + "epoch": 6.340639269406393, + "grad_norm": 30.33561897277832, + "learning_rate": 4.066971080669711e-06, + "loss": 0.1413, + "step": 6943 + }, + { + "epoch": 6.341552511415525, + "grad_norm": 0.8778665661811829, + "learning_rate": 4.065956367326231e-06, + "loss": 0.0071, + "step": 6944 + }, + { + "epoch": 6.342465753424658, + "grad_norm": 6.8836669921875, + "learning_rate": 4.0649416539827505e-06, + "loss": 0.0652, + "step": 6945 + }, + { + "epoch": 6.34337899543379, + "grad_norm": 7.386643409729004, + "learning_rate": 4.063926940639269e-06, + "loss": 0.0447, + "step": 6946 + }, + { + "epoch": 6.3442922374429225, + "grad_norm": 13.41864013671875, + "learning_rate": 4.062912227295789e-06, + "loss": 0.0848, + "step": 6947 + }, + { + "epoch": 6.345205479452055, + "grad_norm": 7.300187587738037, + "learning_rate": 4.061897513952309e-06, + "loss": 0.0479, + "step": 6948 + }, + { + "epoch": 6.346118721461187, + "grad_norm": 2.3980886936187744, + "learning_rate": 4.0608828006088285e-06, + "loss": 0.0124, + "step": 6949 + }, + { + "epoch": 6.34703196347032, + "grad_norm": 2.563575029373169, + "learning_rate": 4.059868087265348e-06, + "loss": 0.0157, + "step": 6950 + }, + { + "epoch": 6.347945205479452, + "grad_norm": 2.1603338718414307, + "learning_rate": 4.058853373921867e-06, + "loss": 0.015, + "step": 6951 + }, + { + "epoch": 6.348858447488585, + "grad_norm": 0.6081390380859375, + "learning_rate": 4.057838660578387e-06, + "loss": 0.0028, + "step": 6952 + }, + { + "epoch": 6.349771689497717, + "grad_norm": 19.129507064819336, + "learning_rate": 4.056823947234906e-06, + "loss": 0.1114, + "step": 6953 + }, + { + "epoch": 6.35068493150685, + "grad_norm": 0.13310618698596954, + "learning_rate": 4.055809233891426e-06, + "loss": 0.0009, + "step": 6954 + }, + { + "epoch": 6.351598173515982, + "grad_norm": 0.7677640318870544, + "learning_rate": 4.054794520547946e-06, + "loss": 0.0059, + "step": 6955 + }, + { + "epoch": 6.352511415525115, + "grad_norm": 2.445518732070923, + "learning_rate": 4.053779807204465e-06, + "loss": 0.0145, + "step": 6956 + }, + { + "epoch": 6.353424657534246, + "grad_norm": 27.853961944580078, + "learning_rate": 4.052765093860984e-06, + "loss": 0.2266, + "step": 6957 + }, + { + "epoch": 6.3543378995433795, + "grad_norm": 11.488131523132324, + "learning_rate": 4.051750380517504e-06, + "loss": 0.0539, + "step": 6958 + }, + { + "epoch": 6.355251141552511, + "grad_norm": 4.400113105773926, + "learning_rate": 4.050735667174024e-06, + "loss": 0.0337, + "step": 6959 + }, + { + "epoch": 6.3561643835616435, + "grad_norm": 52.53571701049805, + "learning_rate": 4.049720953830543e-06, + "loss": 0.652, + "step": 6960 + }, + { + "epoch": 6.357077625570776, + "grad_norm": 3.4206669330596924, + "learning_rate": 4.048706240487062e-06, + "loss": 0.021, + "step": 6961 + }, + { + "epoch": 6.357990867579908, + "grad_norm": 26.889202117919922, + "learning_rate": 4.047691527143583e-06, + "loss": 0.2307, + "step": 6962 + }, + { + "epoch": 6.358904109589041, + "grad_norm": 0.3270319402217865, + "learning_rate": 4.046676813800102e-06, + "loss": 0.0027, + "step": 6963 + }, + { + "epoch": 6.359817351598173, + "grad_norm": 1.577584147453308, + "learning_rate": 4.045662100456621e-06, + "loss": 0.0115, + "step": 6964 + }, + { + "epoch": 6.360730593607306, + "grad_norm": 27.82815170288086, + "learning_rate": 4.044647387113141e-06, + "loss": 0.1149, + "step": 6965 + }, + { + "epoch": 6.361643835616438, + "grad_norm": 0.6904962658882141, + "learning_rate": 4.04363267376966e-06, + "loss": 0.0065, + "step": 6966 + }, + { + "epoch": 6.362557077625571, + "grad_norm": 1.437649130821228, + "learning_rate": 4.04261796042618e-06, + "loss": 0.0081, + "step": 6967 + }, + { + "epoch": 6.363470319634703, + "grad_norm": 0.4166823923587799, + "learning_rate": 4.041603247082699e-06, + "loss": 0.0028, + "step": 6968 + }, + { + "epoch": 6.364383561643836, + "grad_norm": 2.151062250137329, + "learning_rate": 4.040588533739219e-06, + "loss": 0.017, + "step": 6969 + }, + { + "epoch": 6.365296803652968, + "grad_norm": 1.1393731832504272, + "learning_rate": 4.039573820395739e-06, + "loss": 0.0068, + "step": 6970 + }, + { + "epoch": 6.3662100456621005, + "grad_norm": 1.9409329891204834, + "learning_rate": 4.0385591070522574e-06, + "loss": 0.0103, + "step": 6971 + }, + { + "epoch": 6.367123287671233, + "grad_norm": 0.40187516808509827, + "learning_rate": 4.037544393708778e-06, + "loss": 0.0033, + "step": 6972 + }, + { + "epoch": 6.368036529680365, + "grad_norm": 0.5792803168296814, + "learning_rate": 4.036529680365297e-06, + "loss": 0.0038, + "step": 6973 + }, + { + "epoch": 6.368949771689498, + "grad_norm": 1.4192887544631958, + "learning_rate": 4.0355149670218165e-06, + "loss": 0.0103, + "step": 6974 + }, + { + "epoch": 6.36986301369863, + "grad_norm": 1.648301362991333, + "learning_rate": 4.034500253678336e-06, + "loss": 0.0055, + "step": 6975 + }, + { + "epoch": 6.370776255707763, + "grad_norm": 5.521856784820557, + "learning_rate": 4.033485540334856e-06, + "loss": 0.0442, + "step": 6976 + }, + { + "epoch": 6.371689497716895, + "grad_norm": 0.3497333228588104, + "learning_rate": 4.032470826991376e-06, + "loss": 0.0026, + "step": 6977 + }, + { + "epoch": 6.372602739726028, + "grad_norm": 1.958071231842041, + "learning_rate": 4.0314561136478944e-06, + "loss": 0.0147, + "step": 6978 + }, + { + "epoch": 6.37351598173516, + "grad_norm": 45.19145202636719, + "learning_rate": 4.030441400304414e-06, + "loss": 0.3084, + "step": 6979 + }, + { + "epoch": 6.3744292237442925, + "grad_norm": 1.5378632545471191, + "learning_rate": 4.029426686960934e-06, + "loss": 0.0117, + "step": 6980 + }, + { + "epoch": 6.375342465753425, + "grad_norm": 0.20422245562076569, + "learning_rate": 4.0284119736174535e-06, + "loss": 0.0012, + "step": 6981 + }, + { + "epoch": 6.3762557077625575, + "grad_norm": 0.124498650431633, + "learning_rate": 4.027397260273973e-06, + "loss": 0.0011, + "step": 6982 + }, + { + "epoch": 6.37716894977169, + "grad_norm": 1.1117284297943115, + "learning_rate": 4.026382546930492e-06, + "loss": 0.0059, + "step": 6983 + }, + { + "epoch": 6.3780821917808215, + "grad_norm": 0.5681750774383545, + "learning_rate": 4.025367833587013e-06, + "loss": 0.0041, + "step": 6984 + }, + { + "epoch": 6.378995433789954, + "grad_norm": 98.3225326538086, + "learning_rate": 4.0243531202435314e-06, + "loss": 2.2715, + "step": 6985 + }, + { + "epoch": 6.379908675799086, + "grad_norm": 6.1907877922058105, + "learning_rate": 4.023338406900051e-06, + "loss": 0.0414, + "step": 6986 + }, + { + "epoch": 6.380821917808219, + "grad_norm": 0.2435901165008545, + "learning_rate": 4.022323693556571e-06, + "loss": 0.0016, + "step": 6987 + }, + { + "epoch": 6.381735159817351, + "grad_norm": 1.8830407857894897, + "learning_rate": 4.02130898021309e-06, + "loss": 0.0159, + "step": 6988 + }, + { + "epoch": 6.382648401826484, + "grad_norm": 0.135617196559906, + "learning_rate": 4.02029426686961e-06, + "loss": 0.0012, + "step": 6989 + }, + { + "epoch": 6.383561643835616, + "grad_norm": 5.5370283126831055, + "learning_rate": 4.019279553526129e-06, + "loss": 0.038, + "step": 6990 + }, + { + "epoch": 6.384474885844749, + "grad_norm": 0.6008830666542053, + "learning_rate": 4.018264840182649e-06, + "loss": 0.0039, + "step": 6991 + }, + { + "epoch": 6.385388127853881, + "grad_norm": 2.304537296295166, + "learning_rate": 4.0172501268391684e-06, + "loss": 0.0149, + "step": 6992 + }, + { + "epoch": 6.3863013698630136, + "grad_norm": 2.2217180728912354, + "learning_rate": 4.016235413495687e-06, + "loss": 0.0102, + "step": 6993 + }, + { + "epoch": 6.387214611872146, + "grad_norm": 2.597374200820923, + "learning_rate": 4.015220700152208e-06, + "loss": 0.0123, + "step": 6994 + }, + { + "epoch": 6.3881278538812785, + "grad_norm": 6.934275150299072, + "learning_rate": 4.014205986808727e-06, + "loss": 0.0544, + "step": 6995 + }, + { + "epoch": 6.389041095890411, + "grad_norm": 1.3299574851989746, + "learning_rate": 4.013191273465246e-06, + "loss": 0.0073, + "step": 6996 + }, + { + "epoch": 6.389954337899543, + "grad_norm": 1.2946171760559082, + "learning_rate": 4.012176560121766e-06, + "loss": 0.0099, + "step": 6997 + }, + { + "epoch": 6.390867579908676, + "grad_norm": 1.6389614343643188, + "learning_rate": 4.011161846778286e-06, + "loss": 0.0144, + "step": 6998 + }, + { + "epoch": 6.391780821917808, + "grad_norm": 11.134117126464844, + "learning_rate": 4.0101471334348054e-06, + "loss": 0.0635, + "step": 6999 + }, + { + "epoch": 6.392694063926941, + "grad_norm": 11.471896171569824, + "learning_rate": 4.009132420091324e-06, + "loss": 0.0669, + "step": 7000 + }, + { + "epoch": 6.393607305936073, + "grad_norm": 0.0163827333599329, + "learning_rate": 4.008117706747844e-06, + "loss": 0.0001, + "step": 7001 + }, + { + "epoch": 6.394520547945206, + "grad_norm": 3.682574987411499, + "learning_rate": 4.007102993404364e-06, + "loss": 0.0206, + "step": 7002 + }, + { + "epoch": 6.395433789954338, + "grad_norm": 1.4777758121490479, + "learning_rate": 4.006088280060883e-06, + "loss": 0.0096, + "step": 7003 + }, + { + "epoch": 6.3963470319634705, + "grad_norm": 3.203167200088501, + "learning_rate": 4.005073566717403e-06, + "loss": 0.0261, + "step": 7004 + }, + { + "epoch": 6.397260273972603, + "grad_norm": 26.63433837890625, + "learning_rate": 4.004058853373922e-06, + "loss": 0.1844, + "step": 7005 + }, + { + "epoch": 6.3981735159817354, + "grad_norm": 1.3354159593582153, + "learning_rate": 4.003044140030442e-06, + "loss": 0.0094, + "step": 7006 + }, + { + "epoch": 6.399086757990868, + "grad_norm": 0.12577491998672485, + "learning_rate": 4.002029426686961e-06, + "loss": 0.0009, + "step": 7007 + }, + { + "epoch": 6.4, + "grad_norm": 0.5562472939491272, + "learning_rate": 4.001014713343481e-06, + "loss": 0.0046, + "step": 7008 + }, + { + "epoch": 6.400913242009133, + "grad_norm": 3.0024776458740234, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0206, + "step": 7009 + }, + { + "epoch": 6.401826484018265, + "grad_norm": 89.02360534667969, + "learning_rate": 3.9989852866565195e-06, + "loss": 1.9288, + "step": 7010 + }, + { + "epoch": 6.402739726027397, + "grad_norm": 13.964315414428711, + "learning_rate": 3.997970573313039e-06, + "loss": 0.1343, + "step": 7011 + }, + { + "epoch": 6.403652968036529, + "grad_norm": 113.65005493164062, + "learning_rate": 3.996955859969559e-06, + "loss": 2.8629, + "step": 7012 + }, + { + "epoch": 6.404566210045662, + "grad_norm": 2.126110315322876, + "learning_rate": 3.995941146626079e-06, + "loss": 0.0159, + "step": 7013 + }, + { + "epoch": 6.405479452054794, + "grad_norm": 0.7893421053886414, + "learning_rate": 3.994926433282598e-06, + "loss": 0.0044, + "step": 7014 + }, + { + "epoch": 6.406392694063927, + "grad_norm": 0.8101407289505005, + "learning_rate": 3.993911719939117e-06, + "loss": 0.0053, + "step": 7015 + }, + { + "epoch": 6.407305936073059, + "grad_norm": 0.14508871734142303, + "learning_rate": 3.992897006595637e-06, + "loss": 0.0007, + "step": 7016 + }, + { + "epoch": 6.4082191780821915, + "grad_norm": 25.545820236206055, + "learning_rate": 3.9918822932521565e-06, + "loss": 0.2601, + "step": 7017 + }, + { + "epoch": 6.409132420091324, + "grad_norm": 1.3941049575805664, + "learning_rate": 3.990867579908676e-06, + "loss": 0.0084, + "step": 7018 + }, + { + "epoch": 6.4100456621004565, + "grad_norm": 1.0699206590652466, + "learning_rate": 3.989852866565196e-06, + "loss": 0.008, + "step": 7019 + }, + { + "epoch": 6.410958904109589, + "grad_norm": 4.721137523651123, + "learning_rate": 3.988838153221716e-06, + "loss": 0.0307, + "step": 7020 + }, + { + "epoch": 6.411872146118721, + "grad_norm": 5.926520824432373, + "learning_rate": 3.9878234398782344e-06, + "loss": 0.0502, + "step": 7021 + }, + { + "epoch": 6.412785388127854, + "grad_norm": 1.6382085084915161, + "learning_rate": 3.986808726534754e-06, + "loss": 0.0092, + "step": 7022 + }, + { + "epoch": 6.413698630136986, + "grad_norm": 0.06295399367809296, + "learning_rate": 3.985794013191274e-06, + "loss": 0.0004, + "step": 7023 + }, + { + "epoch": 6.414611872146119, + "grad_norm": 16.673633575439453, + "learning_rate": 3.9847792998477935e-06, + "loss": 0.1367, + "step": 7024 + }, + { + "epoch": 6.415525114155251, + "grad_norm": 2.066765546798706, + "learning_rate": 3.983764586504313e-06, + "loss": 0.0136, + "step": 7025 + }, + { + "epoch": 6.416438356164384, + "grad_norm": 15.260835647583008, + "learning_rate": 3.982749873160832e-06, + "loss": 0.0952, + "step": 7026 + }, + { + "epoch": 6.417351598173516, + "grad_norm": 7.302321910858154, + "learning_rate": 3.981735159817352e-06, + "loss": 0.043, + "step": 7027 + }, + { + "epoch": 6.4182648401826485, + "grad_norm": 0.03780427947640419, + "learning_rate": 3.9807204464738714e-06, + "loss": 0.0003, + "step": 7028 + }, + { + "epoch": 6.419178082191781, + "grad_norm": 7.389807224273682, + "learning_rate": 3.979705733130391e-06, + "loss": 0.0467, + "step": 7029 + }, + { + "epoch": 6.420091324200913, + "grad_norm": 8.47191047668457, + "learning_rate": 3.978691019786911e-06, + "loss": 0.0566, + "step": 7030 + }, + { + "epoch": 6.421004566210046, + "grad_norm": 0.4507804811000824, + "learning_rate": 3.97767630644343e-06, + "loss": 0.0031, + "step": 7031 + }, + { + "epoch": 6.421917808219178, + "grad_norm": 0.24887515604496002, + "learning_rate": 3.976661593099949e-06, + "loss": 0.002, + "step": 7032 + }, + { + "epoch": 6.422831050228311, + "grad_norm": 0.12843231856822968, + "learning_rate": 3.975646879756469e-06, + "loss": 0.0008, + "step": 7033 + }, + { + "epoch": 6.423744292237443, + "grad_norm": 2.439542531967163, + "learning_rate": 3.974632166412989e-06, + "loss": 0.0209, + "step": 7034 + }, + { + "epoch": 6.424657534246576, + "grad_norm": 85.79859161376953, + "learning_rate": 3.9736174530695084e-06, + "loss": 1.5032, + "step": 7035 + }, + { + "epoch": 6.425570776255708, + "grad_norm": 98.2704849243164, + "learning_rate": 3.972602739726027e-06, + "loss": 0.7796, + "step": 7036 + }, + { + "epoch": 6.426484018264841, + "grad_norm": 5.539215087890625, + "learning_rate": 3.971588026382547e-06, + "loss": 0.03, + "step": 7037 + }, + { + "epoch": 6.427397260273972, + "grad_norm": 0.635082483291626, + "learning_rate": 3.970573313039067e-06, + "loss": 0.0037, + "step": 7038 + }, + { + "epoch": 6.428310502283105, + "grad_norm": 0.0204972792416811, + "learning_rate": 3.969558599695586e-06, + "loss": 0.0001, + "step": 7039 + }, + { + "epoch": 6.429223744292237, + "grad_norm": 2.0731935501098633, + "learning_rate": 3.968543886352106e-06, + "loss": 0.013, + "step": 7040 + }, + { + "epoch": 6.4301369863013695, + "grad_norm": 0.12603971362113953, + "learning_rate": 3.967529173008625e-06, + "loss": 0.0009, + "step": 7041 + }, + { + "epoch": 6.431050228310502, + "grad_norm": 0.2856568694114685, + "learning_rate": 3.966514459665145e-06, + "loss": 0.0007, + "step": 7042 + }, + { + "epoch": 6.4319634703196344, + "grad_norm": 3.0559542179107666, + "learning_rate": 3.965499746321664e-06, + "loss": 0.0187, + "step": 7043 + }, + { + "epoch": 6.432876712328767, + "grad_norm": 2.197442054748535, + "learning_rate": 3.964485032978184e-06, + "loss": 0.0164, + "step": 7044 + }, + { + "epoch": 6.433789954337899, + "grad_norm": 5.5933027267456055, + "learning_rate": 3.963470319634704e-06, + "loss": 0.0374, + "step": 7045 + }, + { + "epoch": 6.434703196347032, + "grad_norm": 6.012284755706787, + "learning_rate": 3.9624556062912225e-06, + "loss": 0.0426, + "step": 7046 + }, + { + "epoch": 6.435616438356164, + "grad_norm": 31.60468864440918, + "learning_rate": 3.961440892947743e-06, + "loss": 0.2362, + "step": 7047 + }, + { + "epoch": 6.436529680365297, + "grad_norm": 1.1010419130325317, + "learning_rate": 3.960426179604262e-06, + "loss": 0.0068, + "step": 7048 + }, + { + "epoch": 6.437442922374429, + "grad_norm": 0.7271740436553955, + "learning_rate": 3.9594114662607816e-06, + "loss": 0.005, + "step": 7049 + }, + { + "epoch": 6.438356164383562, + "grad_norm": 1.4127116203308105, + "learning_rate": 3.958396752917301e-06, + "loss": 0.0071, + "step": 7050 + }, + { + "epoch": 6.439269406392694, + "grad_norm": 56.8497428894043, + "learning_rate": 3.95738203957382e-06, + "loss": 0.3961, + "step": 7051 + }, + { + "epoch": 6.4401826484018265, + "grad_norm": 1.2127466201782227, + "learning_rate": 3.956367326230341e-06, + "loss": 0.0091, + "step": 7052 + }, + { + "epoch": 6.441095890410959, + "grad_norm": 48.93115234375, + "learning_rate": 3.9553526128868595e-06, + "loss": 0.3571, + "step": 7053 + }, + { + "epoch": 6.442009132420091, + "grad_norm": 77.9732894897461, + "learning_rate": 3.954337899543379e-06, + "loss": 2.1906, + "step": 7054 + }, + { + "epoch": 6.442922374429224, + "grad_norm": 0.32681959867477417, + "learning_rate": 3.953323186199899e-06, + "loss": 0.0026, + "step": 7055 + }, + { + "epoch": 6.443835616438356, + "grad_norm": 35.389007568359375, + "learning_rate": 3.9523084728564186e-06, + "loss": 0.4198, + "step": 7056 + }, + { + "epoch": 6.444748858447489, + "grad_norm": 20.216705322265625, + "learning_rate": 3.951293759512938e-06, + "loss": 0.1729, + "step": 7057 + }, + { + "epoch": 6.445662100456621, + "grad_norm": 9.618810653686523, + "learning_rate": 3.950279046169457e-06, + "loss": 0.0761, + "step": 7058 + }, + { + "epoch": 6.446575342465754, + "grad_norm": 38.101417541503906, + "learning_rate": 3.949264332825977e-06, + "loss": 0.5425, + "step": 7059 + }, + { + "epoch": 6.447488584474886, + "grad_norm": 1.6517860889434814, + "learning_rate": 3.9482496194824965e-06, + "loss": 0.0132, + "step": 7060 + }, + { + "epoch": 6.448401826484019, + "grad_norm": 3.171826124191284, + "learning_rate": 3.947234906139016e-06, + "loss": 0.0218, + "step": 7061 + }, + { + "epoch": 6.449315068493151, + "grad_norm": 0.238280788064003, + "learning_rate": 3.946220192795536e-06, + "loss": 0.0016, + "step": 7062 + }, + { + "epoch": 6.4502283105022835, + "grad_norm": 3.665581703186035, + "learning_rate": 3.945205479452055e-06, + "loss": 0.013, + "step": 7063 + }, + { + "epoch": 6.451141552511416, + "grad_norm": 101.55278778076172, + "learning_rate": 3.944190766108575e-06, + "loss": 3.273, + "step": 7064 + }, + { + "epoch": 6.4520547945205475, + "grad_norm": 1.491432547569275, + "learning_rate": 3.943176052765094e-06, + "loss": 0.0109, + "step": 7065 + }, + { + "epoch": 6.45296803652968, + "grad_norm": 63.019622802734375, + "learning_rate": 3.942161339421614e-06, + "loss": 0.9731, + "step": 7066 + }, + { + "epoch": 6.453881278538812, + "grad_norm": 16.773515701293945, + "learning_rate": 3.9411466260781335e-06, + "loss": 0.1431, + "step": 7067 + }, + { + "epoch": 6.454794520547945, + "grad_norm": 1.306717038154602, + "learning_rate": 3.940131912734652e-06, + "loss": 0.0107, + "step": 7068 + }, + { + "epoch": 6.455707762557077, + "grad_norm": 3.994396448135376, + "learning_rate": 3.939117199391173e-06, + "loss": 0.0235, + "step": 7069 + }, + { + "epoch": 6.45662100456621, + "grad_norm": 8.622099876403809, + "learning_rate": 3.938102486047692e-06, + "loss": 0.0397, + "step": 7070 + }, + { + "epoch": 6.457534246575342, + "grad_norm": 4.579776287078857, + "learning_rate": 3.937087772704211e-06, + "loss": 0.0304, + "step": 7071 + }, + { + "epoch": 6.458447488584475, + "grad_norm": 4.403387546539307, + "learning_rate": 3.936073059360731e-06, + "loss": 0.0279, + "step": 7072 + }, + { + "epoch": 6.459360730593607, + "grad_norm": 2.01849627494812, + "learning_rate": 3.93505834601725e-06, + "loss": 0.0146, + "step": 7073 + }, + { + "epoch": 6.46027397260274, + "grad_norm": 0.03525076434016228, + "learning_rate": 3.9340436326737705e-06, + "loss": 0.0002, + "step": 7074 + }, + { + "epoch": 6.461187214611872, + "grad_norm": 63.21949005126953, + "learning_rate": 3.933028919330289e-06, + "loss": 1.1273, + "step": 7075 + }, + { + "epoch": 6.4621004566210045, + "grad_norm": 0.25918063521385193, + "learning_rate": 3.932014205986809e-06, + "loss": 0.0019, + "step": 7076 + }, + { + "epoch": 6.463013698630137, + "grad_norm": 1.1152912378311157, + "learning_rate": 3.930999492643329e-06, + "loss": 0.0088, + "step": 7077 + }, + { + "epoch": 6.463926940639269, + "grad_norm": 3.6697349548339844, + "learning_rate": 3.929984779299848e-06, + "loss": 0.0259, + "step": 7078 + }, + { + "epoch": 6.464840182648402, + "grad_norm": 3.8284568786621094, + "learning_rate": 3.928970065956368e-06, + "loss": 0.025, + "step": 7079 + }, + { + "epoch": 6.465753424657534, + "grad_norm": 1.4462312459945679, + "learning_rate": 3.927955352612887e-06, + "loss": 0.0093, + "step": 7080 + }, + { + "epoch": 6.466666666666667, + "grad_norm": 14.668274879455566, + "learning_rate": 3.926940639269407e-06, + "loss": 0.1077, + "step": 7081 + }, + { + "epoch": 6.467579908675799, + "grad_norm": 0.3829931914806366, + "learning_rate": 3.925925925925926e-06, + "loss": 0.0027, + "step": 7082 + }, + { + "epoch": 6.468493150684932, + "grad_norm": 0.28633391857147217, + "learning_rate": 3.924911212582446e-06, + "loss": 0.0022, + "step": 7083 + }, + { + "epoch": 6.469406392694064, + "grad_norm": 11.177261352539062, + "learning_rate": 3.923896499238966e-06, + "loss": 0.0866, + "step": 7084 + }, + { + "epoch": 6.470319634703197, + "grad_norm": 0.23145407438278198, + "learning_rate": 3.9228817858954846e-06, + "loss": 0.0016, + "step": 7085 + }, + { + "epoch": 6.471232876712329, + "grad_norm": 0.8995561003684998, + "learning_rate": 3.921867072552004e-06, + "loss": 0.0072, + "step": 7086 + }, + { + "epoch": 6.4721461187214615, + "grad_norm": 0.4285564422607422, + "learning_rate": 3.920852359208524e-06, + "loss": 0.0028, + "step": 7087 + }, + { + "epoch": 6.473059360730594, + "grad_norm": 5.851683616638184, + "learning_rate": 3.919837645865044e-06, + "loss": 0.0322, + "step": 7088 + }, + { + "epoch": 6.473972602739726, + "grad_norm": 3.1170828342437744, + "learning_rate": 3.918822932521563e-06, + "loss": 0.0143, + "step": 7089 + }, + { + "epoch": 6.474885844748858, + "grad_norm": 3.8128764629364014, + "learning_rate": 3.917808219178082e-06, + "loss": 0.021, + "step": 7090 + }, + { + "epoch": 6.475799086757991, + "grad_norm": 1.7732033729553223, + "learning_rate": 3.916793505834602e-06, + "loss": 0.0129, + "step": 7091 + }, + { + "epoch": 6.476712328767123, + "grad_norm": 0.4360803961753845, + "learning_rate": 3.9157787924911216e-06, + "loss": 0.0043, + "step": 7092 + }, + { + "epoch": 6.477625570776255, + "grad_norm": 2.1411502361297607, + "learning_rate": 3.914764079147641e-06, + "loss": 0.0162, + "step": 7093 + }, + { + "epoch": 6.478538812785388, + "grad_norm": 77.91861724853516, + "learning_rate": 3.913749365804161e-06, + "loss": 0.3093, + "step": 7094 + }, + { + "epoch": 6.47945205479452, + "grad_norm": 17.242122650146484, + "learning_rate": 3.91273465246068e-06, + "loss": 0.1639, + "step": 7095 + }, + { + "epoch": 6.480365296803653, + "grad_norm": 0.061755187809467316, + "learning_rate": 3.9117199391171995e-06, + "loss": 0.0004, + "step": 7096 + }, + { + "epoch": 6.481278538812785, + "grad_norm": 0.5627462863922119, + "learning_rate": 3.910705225773719e-06, + "loss": 0.0036, + "step": 7097 + }, + { + "epoch": 6.482191780821918, + "grad_norm": 6.800261974334717, + "learning_rate": 3.909690512430239e-06, + "loss": 0.0403, + "step": 7098 + }, + { + "epoch": 6.48310502283105, + "grad_norm": 1.375739336013794, + "learning_rate": 3.9086757990867586e-06, + "loss": 0.0088, + "step": 7099 + }, + { + "epoch": 6.4840182648401825, + "grad_norm": 0.1714092344045639, + "learning_rate": 3.907661085743278e-06, + "loss": 0.0018, + "step": 7100 + }, + { + "epoch": 6.484931506849315, + "grad_norm": 1.8257527351379395, + "learning_rate": 3.906646372399797e-06, + "loss": 0.0133, + "step": 7101 + }, + { + "epoch": 6.485844748858447, + "grad_norm": 14.582878112792969, + "learning_rate": 3.905631659056317e-06, + "loss": 0.2159, + "step": 7102 + }, + { + "epoch": 6.48675799086758, + "grad_norm": 5.9608259201049805, + "learning_rate": 3.9046169457128365e-06, + "loss": 0.0524, + "step": 7103 + }, + { + "epoch": 6.487671232876712, + "grad_norm": 1.812774419784546, + "learning_rate": 3.903602232369356e-06, + "loss": 0.0111, + "step": 7104 + }, + { + "epoch": 6.488584474885845, + "grad_norm": 24.57931137084961, + "learning_rate": 3.902587519025876e-06, + "loss": 0.1764, + "step": 7105 + }, + { + "epoch": 6.489497716894977, + "grad_norm": 0.07520081847906113, + "learning_rate": 3.901572805682395e-06, + "loss": 0.0004, + "step": 7106 + }, + { + "epoch": 6.49041095890411, + "grad_norm": 59.6149787902832, + "learning_rate": 3.900558092338914e-06, + "loss": 0.9207, + "step": 7107 + }, + { + "epoch": 6.491324200913242, + "grad_norm": 0.7302426695823669, + "learning_rate": 3.899543378995434e-06, + "loss": 0.0058, + "step": 7108 + }, + { + "epoch": 6.492237442922375, + "grad_norm": 0.03519397974014282, + "learning_rate": 3.898528665651954e-06, + "loss": 0.0002, + "step": 7109 + }, + { + "epoch": 6.493150684931507, + "grad_norm": 11.498497009277344, + "learning_rate": 3.8975139523084735e-06, + "loss": 0.0669, + "step": 7110 + }, + { + "epoch": 6.4940639269406395, + "grad_norm": 85.79281616210938, + "learning_rate": 3.896499238964992e-06, + "loss": 0.8553, + "step": 7111 + }, + { + "epoch": 6.494977168949772, + "grad_norm": 0.022878026589751244, + "learning_rate": 3.895484525621512e-06, + "loss": 0.0002, + "step": 7112 + }, + { + "epoch": 6.495890410958904, + "grad_norm": 0.21993723511695862, + "learning_rate": 3.894469812278032e-06, + "loss": 0.0011, + "step": 7113 + }, + { + "epoch": 6.496803652968037, + "grad_norm": 0.09355924278497696, + "learning_rate": 3.893455098934551e-06, + "loss": 0.0008, + "step": 7114 + }, + { + "epoch": 6.497716894977169, + "grad_norm": 0.11743737012147903, + "learning_rate": 3.892440385591071e-06, + "loss": 0.0009, + "step": 7115 + }, + { + "epoch": 6.498630136986302, + "grad_norm": 8.232685089111328, + "learning_rate": 3.89142567224759e-06, + "loss": 0.0466, + "step": 7116 + }, + { + "epoch": 6.499543378995433, + "grad_norm": 0.5121130347251892, + "learning_rate": 3.89041095890411e-06, + "loss": 0.0036, + "step": 7117 + }, + { + "epoch": 6.500456621004567, + "grad_norm": 0.5074238181114197, + "learning_rate": 3.889396245560629e-06, + "loss": 0.0036, + "step": 7118 + }, + { + "epoch": 6.501369863013698, + "grad_norm": 7.517014980316162, + "learning_rate": 3.888381532217149e-06, + "loss": 0.0429, + "step": 7119 + }, + { + "epoch": 6.502283105022831, + "grad_norm": 0.5826424956321716, + "learning_rate": 3.887366818873669e-06, + "loss": 0.0038, + "step": 7120 + }, + { + "epoch": 6.503196347031963, + "grad_norm": 18.427282333374023, + "learning_rate": 3.8863521055301875e-06, + "loss": 0.1657, + "step": 7121 + }, + { + "epoch": 6.504109589041096, + "grad_norm": 14.47811508178711, + "learning_rate": 3.885337392186708e-06, + "loss": 0.0687, + "step": 7122 + }, + { + "epoch": 6.505022831050228, + "grad_norm": 1.1090106964111328, + "learning_rate": 3.884322678843227e-06, + "loss": 0.0035, + "step": 7123 + }, + { + "epoch": 6.5059360730593605, + "grad_norm": 1.7531803846359253, + "learning_rate": 3.883307965499747e-06, + "loss": 0.013, + "step": 7124 + }, + { + "epoch": 6.506849315068493, + "grad_norm": 0.7932420969009399, + "learning_rate": 3.882293252156266e-06, + "loss": 0.0049, + "step": 7125 + }, + { + "epoch": 6.507762557077625, + "grad_norm": 1.3145010471343994, + "learning_rate": 3.881278538812785e-06, + "loss": 0.0102, + "step": 7126 + }, + { + "epoch": 6.508675799086758, + "grad_norm": 24.28494644165039, + "learning_rate": 3.880263825469306e-06, + "loss": 0.1598, + "step": 7127 + }, + { + "epoch": 6.50958904109589, + "grad_norm": 0.9439361095428467, + "learning_rate": 3.8792491121258245e-06, + "loss": 0.0076, + "step": 7128 + }, + { + "epoch": 6.510502283105023, + "grad_norm": 0.5779951214790344, + "learning_rate": 3.878234398782344e-06, + "loss": 0.0026, + "step": 7129 + }, + { + "epoch": 6.511415525114155, + "grad_norm": 47.84369659423828, + "learning_rate": 3.877219685438864e-06, + "loss": 0.7734, + "step": 7130 + }, + { + "epoch": 6.512328767123288, + "grad_norm": 47.60697937011719, + "learning_rate": 3.876204972095383e-06, + "loss": 0.3921, + "step": 7131 + }, + { + "epoch": 6.51324200913242, + "grad_norm": 2.5957610607147217, + "learning_rate": 3.875190258751903e-06, + "loss": 0.0123, + "step": 7132 + }, + { + "epoch": 6.514155251141553, + "grad_norm": 0.1513276994228363, + "learning_rate": 3.874175545408422e-06, + "loss": 0.0011, + "step": 7133 + }, + { + "epoch": 6.515068493150685, + "grad_norm": 0.09759090840816498, + "learning_rate": 3.873160832064942e-06, + "loss": 0.0006, + "step": 7134 + }, + { + "epoch": 6.5159817351598175, + "grad_norm": 69.01766967773438, + "learning_rate": 3.8721461187214615e-06, + "loss": 0.8975, + "step": 7135 + }, + { + "epoch": 6.51689497716895, + "grad_norm": 1.634248971939087, + "learning_rate": 3.871131405377981e-06, + "loss": 0.0107, + "step": 7136 + }, + { + "epoch": 6.517808219178082, + "grad_norm": 4.855834007263184, + "learning_rate": 3.870116692034501e-06, + "loss": 0.0234, + "step": 7137 + }, + { + "epoch": 6.518721461187215, + "grad_norm": 0.6566715240478516, + "learning_rate": 3.86910197869102e-06, + "loss": 0.0053, + "step": 7138 + }, + { + "epoch": 6.519634703196347, + "grad_norm": 5.656027317047119, + "learning_rate": 3.8680872653475395e-06, + "loss": 0.0497, + "step": 7139 + }, + { + "epoch": 6.52054794520548, + "grad_norm": 90.09362030029297, + "learning_rate": 3.867072552004059e-06, + "loss": 0.6902, + "step": 7140 + }, + { + "epoch": 6.521461187214612, + "grad_norm": 6.982358932495117, + "learning_rate": 3.866057838660579e-06, + "loss": 0.0462, + "step": 7141 + }, + { + "epoch": 6.522374429223745, + "grad_norm": 2.8906424045562744, + "learning_rate": 3.8650431253170985e-06, + "loss": 0.014, + "step": 7142 + }, + { + "epoch": 6.523287671232877, + "grad_norm": 2.271347999572754, + "learning_rate": 3.864028411973617e-06, + "loss": 0.0164, + "step": 7143 + }, + { + "epoch": 6.524200913242009, + "grad_norm": 0.25707709789276123, + "learning_rate": 3.863013698630138e-06, + "loss": 0.0018, + "step": 7144 + }, + { + "epoch": 6.525114155251142, + "grad_norm": 35.26117706298828, + "learning_rate": 3.861998985286657e-06, + "loss": 0.4391, + "step": 7145 + }, + { + "epoch": 6.526027397260274, + "grad_norm": 0.5356554388999939, + "learning_rate": 3.8609842719431765e-06, + "loss": 0.0035, + "step": 7146 + }, + { + "epoch": 6.526940639269406, + "grad_norm": 1.131523609161377, + "learning_rate": 3.859969558599696e-06, + "loss": 0.0058, + "step": 7147 + }, + { + "epoch": 6.5278538812785385, + "grad_norm": 0.8076293468475342, + "learning_rate": 3.858954845256215e-06, + "loss": 0.0052, + "step": 7148 + }, + { + "epoch": 6.528767123287671, + "grad_norm": 9.055542945861816, + "learning_rate": 3.8579401319127355e-06, + "loss": 0.1019, + "step": 7149 + }, + { + "epoch": 6.529680365296803, + "grad_norm": 11.721227645874023, + "learning_rate": 3.856925418569254e-06, + "loss": 0.0657, + "step": 7150 + }, + { + "epoch": 6.530593607305936, + "grad_norm": 90.04353332519531, + "learning_rate": 3.855910705225774e-06, + "loss": 0.3238, + "step": 7151 + }, + { + "epoch": 6.531506849315068, + "grad_norm": 0.7629937529563904, + "learning_rate": 3.854895991882294e-06, + "loss": 0.0075, + "step": 7152 + }, + { + "epoch": 6.532420091324201, + "grad_norm": 1.6569148302078247, + "learning_rate": 3.853881278538813e-06, + "loss": 0.0103, + "step": 7153 + }, + { + "epoch": 6.533333333333333, + "grad_norm": 93.38300323486328, + "learning_rate": 3.852866565195333e-06, + "loss": 2.8158, + "step": 7154 + }, + { + "epoch": 6.534246575342466, + "grad_norm": 0.0238991416990757, + "learning_rate": 3.851851851851852e-06, + "loss": 0.0002, + "step": 7155 + }, + { + "epoch": 6.535159817351598, + "grad_norm": 7.823836326599121, + "learning_rate": 3.850837138508372e-06, + "loss": 0.0471, + "step": 7156 + }, + { + "epoch": 6.536073059360731, + "grad_norm": 0.10311391204595566, + "learning_rate": 3.849822425164891e-06, + "loss": 0.0008, + "step": 7157 + }, + { + "epoch": 6.536986301369863, + "grad_norm": 0.5261028409004211, + "learning_rate": 3.848807711821411e-06, + "loss": 0.0035, + "step": 7158 + }, + { + "epoch": 6.5378995433789955, + "grad_norm": 11.185664176940918, + "learning_rate": 3.847792998477931e-06, + "loss": 0.0791, + "step": 7159 + }, + { + "epoch": 6.538812785388128, + "grad_norm": 11.611106872558594, + "learning_rate": 3.84677828513445e-06, + "loss": 0.0671, + "step": 7160 + }, + { + "epoch": 6.53972602739726, + "grad_norm": 0.02490413561463356, + "learning_rate": 3.845763571790969e-06, + "loss": 0.0002, + "step": 7161 + }, + { + "epoch": 6.540639269406393, + "grad_norm": 1.2432000637054443, + "learning_rate": 3.844748858447489e-06, + "loss": 0.0064, + "step": 7162 + }, + { + "epoch": 6.541552511415525, + "grad_norm": 4.638020992279053, + "learning_rate": 3.843734145104009e-06, + "loss": 0.0411, + "step": 7163 + }, + { + "epoch": 6.542465753424658, + "grad_norm": 0.6264899969100952, + "learning_rate": 3.842719431760528e-06, + "loss": 0.0059, + "step": 7164 + }, + { + "epoch": 6.54337899543379, + "grad_norm": 0.6546725630760193, + "learning_rate": 3.841704718417047e-06, + "loss": 0.0042, + "step": 7165 + }, + { + "epoch": 6.544292237442923, + "grad_norm": 0.7264931201934814, + "learning_rate": 3.840690005073567e-06, + "loss": 0.0065, + "step": 7166 + }, + { + "epoch": 6.545205479452055, + "grad_norm": 0.34409022331237793, + "learning_rate": 3.839675291730087e-06, + "loss": 0.0022, + "step": 7167 + }, + { + "epoch": 6.546118721461188, + "grad_norm": 68.65106964111328, + "learning_rate": 3.838660578386606e-06, + "loss": 0.8765, + "step": 7168 + }, + { + "epoch": 6.54703196347032, + "grad_norm": 0.11103720217943192, + "learning_rate": 3.837645865043126e-06, + "loss": 0.001, + "step": 7169 + }, + { + "epoch": 6.5479452054794525, + "grad_norm": 2.869906425476074, + "learning_rate": 3.836631151699645e-06, + "loss": 0.0212, + "step": 7170 + }, + { + "epoch": 6.548858447488584, + "grad_norm": 17.812875747680664, + "learning_rate": 3.8356164383561645e-06, + "loss": 0.2543, + "step": 7171 + }, + { + "epoch": 6.549771689497717, + "grad_norm": 1.9745965003967285, + "learning_rate": 3.834601725012684e-06, + "loss": 0.0152, + "step": 7172 + }, + { + "epoch": 6.550684931506849, + "grad_norm": 29.663381576538086, + "learning_rate": 3.833587011669204e-06, + "loss": 0.2462, + "step": 7173 + }, + { + "epoch": 6.551598173515981, + "grad_norm": 132.2650604248047, + "learning_rate": 3.832572298325724e-06, + "loss": 1.415, + "step": 7174 + }, + { + "epoch": 6.552511415525114, + "grad_norm": 20.612361907958984, + "learning_rate": 3.8315575849822424e-06, + "loss": 0.1442, + "step": 7175 + }, + { + "epoch": 6.553424657534246, + "grad_norm": 0.2156650722026825, + "learning_rate": 3.830542871638762e-06, + "loss": 0.0017, + "step": 7176 + }, + { + "epoch": 6.554337899543379, + "grad_norm": 47.62466812133789, + "learning_rate": 3.829528158295282e-06, + "loss": 0.3447, + "step": 7177 + }, + { + "epoch": 6.555251141552511, + "grad_norm": 51.7250862121582, + "learning_rate": 3.8285134449518015e-06, + "loss": 0.3968, + "step": 7178 + }, + { + "epoch": 6.556164383561644, + "grad_norm": 32.438297271728516, + "learning_rate": 3.827498731608321e-06, + "loss": 0.1842, + "step": 7179 + }, + { + "epoch": 6.557077625570776, + "grad_norm": 11.380829811096191, + "learning_rate": 3.826484018264841e-06, + "loss": 0.0662, + "step": 7180 + }, + { + "epoch": 6.557990867579909, + "grad_norm": 27.61581802368164, + "learning_rate": 3.82546930492136e-06, + "loss": 0.2811, + "step": 7181 + }, + { + "epoch": 6.558904109589041, + "grad_norm": 4.9732890129089355, + "learning_rate": 3.8244545915778794e-06, + "loss": 0.0279, + "step": 7182 + }, + { + "epoch": 6.5598173515981735, + "grad_norm": 0.49796855449676514, + "learning_rate": 3.823439878234399e-06, + "loss": 0.0031, + "step": 7183 + }, + { + "epoch": 6.560730593607306, + "grad_norm": 35.55619430541992, + "learning_rate": 3.822425164890919e-06, + "loss": 0.4333, + "step": 7184 + }, + { + "epoch": 6.561643835616438, + "grad_norm": 29.4481258392334, + "learning_rate": 3.8214104515474385e-06, + "loss": 0.3083, + "step": 7185 + }, + { + "epoch": 6.562557077625571, + "grad_norm": 24.34479522705078, + "learning_rate": 3.820395738203957e-06, + "loss": 0.1893, + "step": 7186 + }, + { + "epoch": 6.563470319634703, + "grad_norm": 44.672508239746094, + "learning_rate": 3.819381024860477e-06, + "loss": 0.5052, + "step": 7187 + }, + { + "epoch": 6.564383561643836, + "grad_norm": 0.7438127398490906, + "learning_rate": 3.818366311516997e-06, + "loss": 0.0048, + "step": 7188 + }, + { + "epoch": 6.565296803652968, + "grad_norm": 25.656301498413086, + "learning_rate": 3.8173515981735164e-06, + "loss": 0.2325, + "step": 7189 + }, + { + "epoch": 6.566210045662101, + "grad_norm": 1.5432084798812866, + "learning_rate": 3.816336884830036e-06, + "loss": 0.0111, + "step": 7190 + }, + { + "epoch": 6.567123287671233, + "grad_norm": 3.5329129695892334, + "learning_rate": 3.815322171486555e-06, + "loss": 0.0216, + "step": 7191 + }, + { + "epoch": 6.5680365296803656, + "grad_norm": 32.14685821533203, + "learning_rate": 3.814307458143075e-06, + "loss": 0.1509, + "step": 7192 + }, + { + "epoch": 6.568949771689498, + "grad_norm": 0.8610326051712036, + "learning_rate": 3.8132927447995944e-06, + "loss": 0.0059, + "step": 7193 + }, + { + "epoch": 6.5698630136986305, + "grad_norm": 1.6015419960021973, + "learning_rate": 3.812278031456114e-06, + "loss": 0.0105, + "step": 7194 + }, + { + "epoch": 6.570776255707763, + "grad_norm": 0.46770963072776794, + "learning_rate": 3.8112633181126333e-06, + "loss": 0.0037, + "step": 7195 + }, + { + "epoch": 6.5716894977168945, + "grad_norm": 27.693431854248047, + "learning_rate": 3.810248604769153e-06, + "loss": 0.2475, + "step": 7196 + }, + { + "epoch": 6.572602739726028, + "grad_norm": 0.6417356133460999, + "learning_rate": 3.8092338914256727e-06, + "loss": 0.0038, + "step": 7197 + }, + { + "epoch": 6.573515981735159, + "grad_norm": 1.4927071332931519, + "learning_rate": 3.808219178082192e-06, + "loss": 0.0109, + "step": 7198 + }, + { + "epoch": 6.574429223744293, + "grad_norm": 0.9657244086265564, + "learning_rate": 3.8072044647387117e-06, + "loss": 0.0077, + "step": 7199 + }, + { + "epoch": 6.575342465753424, + "grad_norm": 3.1589255332946777, + "learning_rate": 3.8061897513952314e-06, + "loss": 0.0219, + "step": 7200 + }, + { + "epoch": 6.576255707762557, + "grad_norm": 5.62777853012085, + "learning_rate": 3.8051750380517506e-06, + "loss": 0.0308, + "step": 7201 + }, + { + "epoch": 6.577168949771689, + "grad_norm": 0.6305202841758728, + "learning_rate": 3.8041603247082703e-06, + "loss": 0.0046, + "step": 7202 + }, + { + "epoch": 6.578082191780822, + "grad_norm": 6.33172082901001, + "learning_rate": 3.8031456113647896e-06, + "loss": 0.0449, + "step": 7203 + }, + { + "epoch": 6.578995433789954, + "grad_norm": 0.599450409412384, + "learning_rate": 3.8021308980213097e-06, + "loss": 0.0047, + "step": 7204 + }, + { + "epoch": 6.579908675799087, + "grad_norm": 0.7477272152900696, + "learning_rate": 3.801116184677829e-06, + "loss": 0.0054, + "step": 7205 + }, + { + "epoch": 6.580821917808219, + "grad_norm": 123.88484954833984, + "learning_rate": 3.8001014713343482e-06, + "loss": 3.351, + "step": 7206 + }, + { + "epoch": 6.5817351598173515, + "grad_norm": 2.123669147491455, + "learning_rate": 3.799086757990868e-06, + "loss": 0.0066, + "step": 7207 + }, + { + "epoch": 6.582648401826484, + "grad_norm": 30.366024017333984, + "learning_rate": 3.798072044647387e-06, + "loss": 0.3231, + "step": 7208 + }, + { + "epoch": 6.583561643835616, + "grad_norm": 12.694185256958008, + "learning_rate": 3.7970573313039073e-06, + "loss": 0.0869, + "step": 7209 + }, + { + "epoch": 6.584474885844749, + "grad_norm": 0.2679733633995056, + "learning_rate": 3.7960426179604266e-06, + "loss": 0.0015, + "step": 7210 + }, + { + "epoch": 6.585388127853881, + "grad_norm": 0.11192528903484344, + "learning_rate": 3.795027904616946e-06, + "loss": 0.001, + "step": 7211 + }, + { + "epoch": 6.586301369863014, + "grad_norm": 9.340005874633789, + "learning_rate": 3.7940131912734655e-06, + "loss": 0.0919, + "step": 7212 + }, + { + "epoch": 6.587214611872146, + "grad_norm": 0.08138057589530945, + "learning_rate": 3.792998477929985e-06, + "loss": 0.0006, + "step": 7213 + }, + { + "epoch": 6.588127853881279, + "grad_norm": 0.2626964747905731, + "learning_rate": 3.791983764586505e-06, + "loss": 0.001, + "step": 7214 + }, + { + "epoch": 6.589041095890411, + "grad_norm": 2.3853468894958496, + "learning_rate": 3.790969051243024e-06, + "loss": 0.0112, + "step": 7215 + }, + { + "epoch": 6.5899543378995435, + "grad_norm": 2.4299144744873047, + "learning_rate": 3.7899543378995435e-06, + "loss": 0.0202, + "step": 7216 + }, + { + "epoch": 6.590867579908676, + "grad_norm": 9.240960121154785, + "learning_rate": 3.788939624556063e-06, + "loss": 0.0588, + "step": 7217 + }, + { + "epoch": 6.5917808219178085, + "grad_norm": 17.8145694732666, + "learning_rate": 3.787924911212583e-06, + "loss": 0.1305, + "step": 7218 + }, + { + "epoch": 6.592694063926941, + "grad_norm": 3.0616021156311035, + "learning_rate": 3.7869101978691025e-06, + "loss": 0.0128, + "step": 7219 + }, + { + "epoch": 6.593607305936073, + "grad_norm": 8.049419403076172, + "learning_rate": 3.785895484525622e-06, + "loss": 0.053, + "step": 7220 + }, + { + "epoch": 6.594520547945206, + "grad_norm": 0.07818853110074997, + "learning_rate": 3.784880771182141e-06, + "loss": 0.0003, + "step": 7221 + }, + { + "epoch": 6.595433789954338, + "grad_norm": 0.21569697558879852, + "learning_rate": 3.783866057838661e-06, + "loss": 0.0016, + "step": 7222 + }, + { + "epoch": 6.59634703196347, + "grad_norm": 0.4480828642845154, + "learning_rate": 3.7828513444951805e-06, + "loss": 0.0031, + "step": 7223 + }, + { + "epoch": 6.597260273972603, + "grad_norm": 1.1932278871536255, + "learning_rate": 3.7818366311517e-06, + "loss": 0.0081, + "step": 7224 + }, + { + "epoch": 6.598173515981735, + "grad_norm": 54.03636169433594, + "learning_rate": 3.7808219178082194e-06, + "loss": 0.4059, + "step": 7225 + }, + { + "epoch": 6.599086757990867, + "grad_norm": 1.4527117013931274, + "learning_rate": 3.7798072044647387e-06, + "loss": 0.0088, + "step": 7226 + }, + { + "epoch": 6.6, + "grad_norm": 40.68848419189453, + "learning_rate": 3.778792491121259e-06, + "loss": 0.2892, + "step": 7227 + }, + { + "epoch": 6.600913242009132, + "grad_norm": 2.3269333839416504, + "learning_rate": 3.777777777777778e-06, + "loss": 0.011, + "step": 7228 + }, + { + "epoch": 6.6018264840182646, + "grad_norm": 16.484657287597656, + "learning_rate": 3.7767630644342978e-06, + "loss": 0.0917, + "step": 7229 + }, + { + "epoch": 6.602739726027397, + "grad_norm": 5.320441246032715, + "learning_rate": 3.775748351090817e-06, + "loss": 0.0259, + "step": 7230 + }, + { + "epoch": 6.6036529680365295, + "grad_norm": 3.3679327964782715, + "learning_rate": 3.7747336377473363e-06, + "loss": 0.0164, + "step": 7231 + }, + { + "epoch": 6.604566210045662, + "grad_norm": 1.6511152982711792, + "learning_rate": 3.7737189244038564e-06, + "loss": 0.0162, + "step": 7232 + }, + { + "epoch": 6.605479452054794, + "grad_norm": 12.698073387145996, + "learning_rate": 3.7727042110603757e-06, + "loss": 0.064, + "step": 7233 + }, + { + "epoch": 6.606392694063927, + "grad_norm": 15.711335182189941, + "learning_rate": 3.7716894977168954e-06, + "loss": 0.0742, + "step": 7234 + }, + { + "epoch": 6.607305936073059, + "grad_norm": 6.773543834686279, + "learning_rate": 3.7706747843734147e-06, + "loss": 0.0296, + "step": 7235 + }, + { + "epoch": 6.608219178082192, + "grad_norm": 0.5040116906166077, + "learning_rate": 3.7696600710299343e-06, + "loss": 0.0033, + "step": 7236 + }, + { + "epoch": 6.609132420091324, + "grad_norm": 1.207558512687683, + "learning_rate": 3.768645357686454e-06, + "loss": 0.0084, + "step": 7237 + }, + { + "epoch": 6.610045662100457, + "grad_norm": 6.934304714202881, + "learning_rate": 3.7676306443429733e-06, + "loss": 0.0396, + "step": 7238 + }, + { + "epoch": 6.610958904109589, + "grad_norm": 0.43120890855789185, + "learning_rate": 3.766615930999493e-06, + "loss": 0.0037, + "step": 7239 + }, + { + "epoch": 6.6118721461187215, + "grad_norm": 12.483438491821289, + "learning_rate": 3.7656012176560127e-06, + "loss": 0.0813, + "step": 7240 + }, + { + "epoch": 6.612785388127854, + "grad_norm": 10.418181419372559, + "learning_rate": 3.764586504312532e-06, + "loss": 0.0741, + "step": 7241 + }, + { + "epoch": 6.6136986301369864, + "grad_norm": 12.645187377929688, + "learning_rate": 3.7635717909690516e-06, + "loss": 0.0952, + "step": 7242 + }, + { + "epoch": 6.614611872146119, + "grad_norm": 16.87702178955078, + "learning_rate": 3.762557077625571e-06, + "loss": 0.1185, + "step": 7243 + }, + { + "epoch": 6.615525114155251, + "grad_norm": 0.8337529301643372, + "learning_rate": 3.761542364282091e-06, + "loss": 0.0049, + "step": 7244 + }, + { + "epoch": 6.616438356164384, + "grad_norm": 1.2554292678833008, + "learning_rate": 3.7605276509386103e-06, + "loss": 0.0072, + "step": 7245 + }, + { + "epoch": 6.617351598173516, + "grad_norm": 0.5657912492752075, + "learning_rate": 3.7595129375951296e-06, + "loss": 0.0034, + "step": 7246 + }, + { + "epoch": 6.618264840182649, + "grad_norm": 12.038294792175293, + "learning_rate": 3.7584982242516493e-06, + "loss": 0.0821, + "step": 7247 + }, + { + "epoch": 6.619178082191781, + "grad_norm": 12.420132637023926, + "learning_rate": 3.7574835109081685e-06, + "loss": 0.0701, + "step": 7248 + }, + { + "epoch": 6.620091324200914, + "grad_norm": 7.513493537902832, + "learning_rate": 3.7564687975646886e-06, + "loss": 0.0478, + "step": 7249 + }, + { + "epoch": 6.621004566210045, + "grad_norm": 29.399227142333984, + "learning_rate": 3.755454084221208e-06, + "loss": 0.1303, + "step": 7250 + }, + { + "epoch": 6.6219178082191785, + "grad_norm": 4.411749362945557, + "learning_rate": 3.754439370877727e-06, + "loss": 0.0322, + "step": 7251 + }, + { + "epoch": 6.62283105022831, + "grad_norm": 0.3944140374660492, + "learning_rate": 3.753424657534247e-06, + "loss": 0.0024, + "step": 7252 + }, + { + "epoch": 6.6237442922374425, + "grad_norm": 0.002784779528155923, + "learning_rate": 3.752409944190766e-06, + "loss": 0.0, + "step": 7253 + }, + { + "epoch": 6.624657534246575, + "grad_norm": 18.52928352355957, + "learning_rate": 3.7513952308472863e-06, + "loss": 0.1488, + "step": 7254 + }, + { + "epoch": 6.6255707762557075, + "grad_norm": 62.826602935791016, + "learning_rate": 3.7503805175038055e-06, + "loss": 0.6647, + "step": 7255 + }, + { + "epoch": 6.62648401826484, + "grad_norm": 0.4288492202758789, + "learning_rate": 3.749365804160325e-06, + "loss": 0.0025, + "step": 7256 + }, + { + "epoch": 6.627397260273972, + "grad_norm": 0.5822691321372986, + "learning_rate": 3.7483510908168445e-06, + "loss": 0.0042, + "step": 7257 + }, + { + "epoch": 6.628310502283105, + "grad_norm": 0.27089935541152954, + "learning_rate": 3.747336377473364e-06, + "loss": 0.0014, + "step": 7258 + }, + { + "epoch": 6.629223744292237, + "grad_norm": 1.6182987689971924, + "learning_rate": 3.746321664129884e-06, + "loss": 0.0099, + "step": 7259 + }, + { + "epoch": 6.63013698630137, + "grad_norm": 1.0189653635025024, + "learning_rate": 3.745306950786403e-06, + "loss": 0.0064, + "step": 7260 + }, + { + "epoch": 6.631050228310502, + "grad_norm": 16.000938415527344, + "learning_rate": 3.7442922374429224e-06, + "loss": 0.1063, + "step": 7261 + }, + { + "epoch": 6.631963470319635, + "grad_norm": 2.5454256534576416, + "learning_rate": 3.7432775240994425e-06, + "loss": 0.0191, + "step": 7262 + }, + { + "epoch": 6.632876712328767, + "grad_norm": 4.634519100189209, + "learning_rate": 3.742262810755962e-06, + "loss": 0.0329, + "step": 7263 + }, + { + "epoch": 6.6337899543378995, + "grad_norm": 95.53146362304688, + "learning_rate": 3.7412480974124815e-06, + "loss": 0.4154, + "step": 7264 + }, + { + "epoch": 6.634703196347032, + "grad_norm": 0.3001129627227783, + "learning_rate": 3.7402333840690008e-06, + "loss": 0.0022, + "step": 7265 + }, + { + "epoch": 6.635616438356164, + "grad_norm": 0.03285665437579155, + "learning_rate": 3.73921867072552e-06, + "loss": 0.0002, + "step": 7266 + }, + { + "epoch": 6.636529680365297, + "grad_norm": 9.613590240478516, + "learning_rate": 3.73820395738204e-06, + "loss": 0.0482, + "step": 7267 + }, + { + "epoch": 6.637442922374429, + "grad_norm": 0.1875305026769638, + "learning_rate": 3.7371892440385594e-06, + "loss": 0.001, + "step": 7268 + }, + { + "epoch": 6.638356164383562, + "grad_norm": 52.965728759765625, + "learning_rate": 3.736174530695079e-06, + "loss": 0.5065, + "step": 7269 + }, + { + "epoch": 6.639269406392694, + "grad_norm": 1.253082513809204, + "learning_rate": 3.7351598173515984e-06, + "loss": 0.0085, + "step": 7270 + }, + { + "epoch": 6.640182648401827, + "grad_norm": 0.5495775938034058, + "learning_rate": 3.7341451040081176e-06, + "loss": 0.0038, + "step": 7271 + }, + { + "epoch": 6.641095890410959, + "grad_norm": 29.526695251464844, + "learning_rate": 3.7331303906646378e-06, + "loss": 0.2123, + "step": 7272 + }, + { + "epoch": 6.642009132420092, + "grad_norm": 115.89295959472656, + "learning_rate": 3.732115677321157e-06, + "loss": 2.8652, + "step": 7273 + }, + { + "epoch": 6.642922374429224, + "grad_norm": 2.2272117137908936, + "learning_rate": 3.7311009639776767e-06, + "loss": 0.008, + "step": 7274 + }, + { + "epoch": 6.6438356164383565, + "grad_norm": 0.5195983648300171, + "learning_rate": 3.730086250634196e-06, + "loss": 0.0035, + "step": 7275 + }, + { + "epoch": 6.644748858447489, + "grad_norm": 3.1596195697784424, + "learning_rate": 3.7290715372907157e-06, + "loss": 0.0199, + "step": 7276 + }, + { + "epoch": 6.6456621004566205, + "grad_norm": 0.09348420798778534, + "learning_rate": 3.7280568239472354e-06, + "loss": 0.0005, + "step": 7277 + }, + { + "epoch": 6.646575342465754, + "grad_norm": 2.1177501678466797, + "learning_rate": 3.7270421106037546e-06, + "loss": 0.0164, + "step": 7278 + }, + { + "epoch": 6.647488584474885, + "grad_norm": 11.086904525756836, + "learning_rate": 3.7260273972602743e-06, + "loss": 0.033, + "step": 7279 + }, + { + "epoch": 6.648401826484018, + "grad_norm": 0.2575398087501526, + "learning_rate": 3.725012683916794e-06, + "loss": 0.002, + "step": 7280 + }, + { + "epoch": 6.64931506849315, + "grad_norm": 9.448616981506348, + "learning_rate": 3.7239979705733133e-06, + "loss": 0.049, + "step": 7281 + }, + { + "epoch": 6.650228310502283, + "grad_norm": 1.158607006072998, + "learning_rate": 3.722983257229833e-06, + "loss": 0.0056, + "step": 7282 + }, + { + "epoch": 6.651141552511415, + "grad_norm": 0.12404650449752808, + "learning_rate": 3.7219685438863522e-06, + "loss": 0.0011, + "step": 7283 + }, + { + "epoch": 6.652054794520548, + "grad_norm": 0.708348274230957, + "learning_rate": 3.7209538305428724e-06, + "loss": 0.0048, + "step": 7284 + }, + { + "epoch": 6.65296803652968, + "grad_norm": 83.53624725341797, + "learning_rate": 3.7199391171993916e-06, + "loss": 2.2669, + "step": 7285 + }, + { + "epoch": 6.653881278538813, + "grad_norm": 5.88054895401001, + "learning_rate": 3.718924403855911e-06, + "loss": 0.0491, + "step": 7286 + }, + { + "epoch": 6.654794520547945, + "grad_norm": 0.42009031772613525, + "learning_rate": 3.7179096905124306e-06, + "loss": 0.003, + "step": 7287 + }, + { + "epoch": 6.6557077625570775, + "grad_norm": 26.739168167114258, + "learning_rate": 3.71689497716895e-06, + "loss": 0.1544, + "step": 7288 + }, + { + "epoch": 6.65662100456621, + "grad_norm": 0.6860663294792175, + "learning_rate": 3.71588026382547e-06, + "loss": 0.0066, + "step": 7289 + }, + { + "epoch": 6.657534246575342, + "grad_norm": 0.7041828632354736, + "learning_rate": 3.7148655504819892e-06, + "loss": 0.0041, + "step": 7290 + }, + { + "epoch": 6.658447488584475, + "grad_norm": 21.104475021362305, + "learning_rate": 3.7138508371385085e-06, + "loss": 0.1063, + "step": 7291 + }, + { + "epoch": 6.659360730593607, + "grad_norm": 4.912707328796387, + "learning_rate": 3.712836123795028e-06, + "loss": 0.0401, + "step": 7292 + }, + { + "epoch": 6.66027397260274, + "grad_norm": 62.408546447753906, + "learning_rate": 3.7118214104515475e-06, + "loss": 0.7062, + "step": 7293 + }, + { + "epoch": 6.661187214611872, + "grad_norm": 59.24049377441406, + "learning_rate": 3.7108066971080676e-06, + "loss": 0.7681, + "step": 7294 + }, + { + "epoch": 6.662100456621005, + "grad_norm": 2.378509759902954, + "learning_rate": 3.709791983764587e-06, + "loss": 0.0191, + "step": 7295 + }, + { + "epoch": 6.663013698630137, + "grad_norm": 17.895977020263672, + "learning_rate": 3.708777270421106e-06, + "loss": 0.1061, + "step": 7296 + }, + { + "epoch": 6.66392694063927, + "grad_norm": 0.14960701763629913, + "learning_rate": 3.707762557077626e-06, + "loss": 0.0009, + "step": 7297 + }, + { + "epoch": 6.664840182648402, + "grad_norm": 0.031418975442647934, + "learning_rate": 3.7067478437341455e-06, + "loss": 0.0002, + "step": 7298 + }, + { + "epoch": 6.6657534246575345, + "grad_norm": 1.02297043800354, + "learning_rate": 3.705733130390665e-06, + "loss": 0.0077, + "step": 7299 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.7788398265838623, + "learning_rate": 3.7047184170471845e-06, + "loss": 0.0096, + "step": 7300 + }, + { + "epoch": 6.667579908675799, + "grad_norm": 5.32322359085083, + "learning_rate": 3.7037037037037037e-06, + "loss": 0.0289, + "step": 7301 + }, + { + "epoch": 6.668493150684932, + "grad_norm": 3.170071601867676, + "learning_rate": 3.702688990360224e-06, + "loss": 0.0217, + "step": 7302 + }, + { + "epoch": 6.669406392694064, + "grad_norm": 6.159772872924805, + "learning_rate": 3.701674277016743e-06, + "loss": 0.0387, + "step": 7303 + }, + { + "epoch": 6.670319634703196, + "grad_norm": 0.7177438139915466, + "learning_rate": 3.700659563673263e-06, + "loss": 0.0044, + "step": 7304 + }, + { + "epoch": 6.671232876712329, + "grad_norm": 1.1756536960601807, + "learning_rate": 3.699644850329782e-06, + "loss": 0.0069, + "step": 7305 + }, + { + "epoch": 6.672146118721461, + "grad_norm": 0.31832242012023926, + "learning_rate": 3.6986301369863014e-06, + "loss": 0.0028, + "step": 7306 + }, + { + "epoch": 6.673059360730593, + "grad_norm": 3.969524621963501, + "learning_rate": 3.6976154236428215e-06, + "loss": 0.0259, + "step": 7307 + }, + { + "epoch": 6.673972602739726, + "grad_norm": 0.61771160364151, + "learning_rate": 3.6966007102993407e-06, + "loss": 0.0046, + "step": 7308 + }, + { + "epoch": 6.674885844748858, + "grad_norm": 17.222057342529297, + "learning_rate": 3.6955859969558604e-06, + "loss": 0.1031, + "step": 7309 + }, + { + "epoch": 6.675799086757991, + "grad_norm": 8.736165046691895, + "learning_rate": 3.6945712836123797e-06, + "loss": 0.0844, + "step": 7310 + }, + { + "epoch": 6.676712328767123, + "grad_norm": 0.5417636632919312, + "learning_rate": 3.693556570268899e-06, + "loss": 0.003, + "step": 7311 + }, + { + "epoch": 6.6776255707762555, + "grad_norm": 0.21571603417396545, + "learning_rate": 3.692541856925419e-06, + "loss": 0.0015, + "step": 7312 + }, + { + "epoch": 6.678538812785388, + "grad_norm": 0.10207348316907883, + "learning_rate": 3.6915271435819384e-06, + "loss": 0.0007, + "step": 7313 + }, + { + "epoch": 6.67945205479452, + "grad_norm": 4.211896896362305, + "learning_rate": 3.690512430238458e-06, + "loss": 0.0244, + "step": 7314 + }, + { + "epoch": 6.680365296803653, + "grad_norm": 7.307104110717773, + "learning_rate": 3.6894977168949773e-06, + "loss": 0.0113, + "step": 7315 + }, + { + "epoch": 6.681278538812785, + "grad_norm": 15.051007270812988, + "learning_rate": 3.688483003551497e-06, + "loss": 0.073, + "step": 7316 + }, + { + "epoch": 6.682191780821918, + "grad_norm": 18.408336639404297, + "learning_rate": 3.6874682902080167e-06, + "loss": 0.1038, + "step": 7317 + }, + { + "epoch": 6.68310502283105, + "grad_norm": 1.5021672248840332, + "learning_rate": 3.686453576864536e-06, + "loss": 0.0136, + "step": 7318 + }, + { + "epoch": 6.684018264840183, + "grad_norm": 3.813617706298828, + "learning_rate": 3.6854388635210557e-06, + "loss": 0.0307, + "step": 7319 + }, + { + "epoch": 6.684931506849315, + "grad_norm": 0.3600378930568695, + "learning_rate": 3.6844241501775753e-06, + "loss": 0.0026, + "step": 7320 + }, + { + "epoch": 6.685844748858448, + "grad_norm": 4.761172771453857, + "learning_rate": 3.6834094368340946e-06, + "loss": 0.0417, + "step": 7321 + }, + { + "epoch": 6.68675799086758, + "grad_norm": 0.3648829460144043, + "learning_rate": 3.6823947234906143e-06, + "loss": 0.0033, + "step": 7322 + }, + { + "epoch": 6.6876712328767125, + "grad_norm": 2.365412950515747, + "learning_rate": 3.6813800101471336e-06, + "loss": 0.014, + "step": 7323 + }, + { + "epoch": 6.688584474885845, + "grad_norm": 7.784708499908447, + "learning_rate": 3.6803652968036537e-06, + "loss": 0.0326, + "step": 7324 + }, + { + "epoch": 6.689497716894977, + "grad_norm": 17.77342414855957, + "learning_rate": 3.679350583460173e-06, + "loss": 0.1527, + "step": 7325 + }, + { + "epoch": 6.69041095890411, + "grad_norm": 1.2424474954605103, + "learning_rate": 3.6783358701166922e-06, + "loss": 0.0072, + "step": 7326 + }, + { + "epoch": 6.691324200913242, + "grad_norm": 93.44783782958984, + "learning_rate": 3.677321156773212e-06, + "loss": 1.4722, + "step": 7327 + }, + { + "epoch": 6.692237442922375, + "grad_norm": 13.678610801696777, + "learning_rate": 3.676306443429731e-06, + "loss": 0.0981, + "step": 7328 + }, + { + "epoch": 6.693150684931507, + "grad_norm": 1.2552807331085205, + "learning_rate": 3.6752917300862513e-06, + "loss": 0.01, + "step": 7329 + }, + { + "epoch": 6.69406392694064, + "grad_norm": 16.573158264160156, + "learning_rate": 3.6742770167427706e-06, + "loss": 0.1086, + "step": 7330 + }, + { + "epoch": 6.694977168949771, + "grad_norm": 71.18212127685547, + "learning_rate": 3.67326230339929e-06, + "loss": 1.3185, + "step": 7331 + }, + { + "epoch": 6.695890410958905, + "grad_norm": 50.19218444824219, + "learning_rate": 3.6722475900558095e-06, + "loss": 0.4616, + "step": 7332 + }, + { + "epoch": 6.696803652968036, + "grad_norm": 90.9581069946289, + "learning_rate": 3.671232876712329e-06, + "loss": 1.0512, + "step": 7333 + }, + { + "epoch": 6.697716894977169, + "grad_norm": 1.1088601350784302, + "learning_rate": 3.670218163368849e-06, + "loss": 0.0072, + "step": 7334 + }, + { + "epoch": 6.698630136986301, + "grad_norm": 7.996056079864502, + "learning_rate": 3.669203450025368e-06, + "loss": 0.0335, + "step": 7335 + }, + { + "epoch": 6.6995433789954335, + "grad_norm": 0.07729429751634598, + "learning_rate": 3.6681887366818875e-06, + "loss": 0.0005, + "step": 7336 + }, + { + "epoch": 6.700456621004566, + "grad_norm": 0.9313794374465942, + "learning_rate": 3.667174023338407e-06, + "loss": 0.0077, + "step": 7337 + }, + { + "epoch": 6.701369863013698, + "grad_norm": 2.2504723072052, + "learning_rate": 3.666159309994927e-06, + "loss": 0.0105, + "step": 7338 + }, + { + "epoch": 6.702283105022831, + "grad_norm": 12.574541091918945, + "learning_rate": 3.6651445966514465e-06, + "loss": 0.1322, + "step": 7339 + }, + { + "epoch": 6.703196347031963, + "grad_norm": 2.3538131713867188, + "learning_rate": 3.664129883307966e-06, + "loss": 0.0155, + "step": 7340 + }, + { + "epoch": 6.704109589041096, + "grad_norm": 2.0700387954711914, + "learning_rate": 3.663115169964485e-06, + "loss": 0.0175, + "step": 7341 + }, + { + "epoch": 6.705022831050228, + "grad_norm": 157.94837951660156, + "learning_rate": 3.662100456621005e-06, + "loss": 1.9377, + "step": 7342 + }, + { + "epoch": 6.705936073059361, + "grad_norm": 1.111524224281311, + "learning_rate": 3.6610857432775245e-06, + "loss": 0.0053, + "step": 7343 + }, + { + "epoch": 6.706849315068493, + "grad_norm": 0.7736087441444397, + "learning_rate": 3.660071029934044e-06, + "loss": 0.0045, + "step": 7344 + }, + { + "epoch": 6.707762557077626, + "grad_norm": 15.361994743347168, + "learning_rate": 3.6590563165905634e-06, + "loss": 0.148, + "step": 7345 + }, + { + "epoch": 6.708675799086758, + "grad_norm": 4.521752834320068, + "learning_rate": 3.6580416032470827e-06, + "loss": 0.038, + "step": 7346 + }, + { + "epoch": 6.7095890410958905, + "grad_norm": 0.8608750104904175, + "learning_rate": 3.657026889903603e-06, + "loss": 0.0046, + "step": 7347 + }, + { + "epoch": 6.710502283105023, + "grad_norm": 39.301063537597656, + "learning_rate": 3.656012176560122e-06, + "loss": 0.2415, + "step": 7348 + }, + { + "epoch": 6.711415525114155, + "grad_norm": 2.4748406410217285, + "learning_rate": 3.6549974632166418e-06, + "loss": 0.0151, + "step": 7349 + }, + { + "epoch": 6.712328767123288, + "grad_norm": 0.9207820892333984, + "learning_rate": 3.653982749873161e-06, + "loss": 0.0077, + "step": 7350 + }, + { + "epoch": 6.71324200913242, + "grad_norm": 1.2210978269577026, + "learning_rate": 3.6529680365296803e-06, + "loss": 0.0083, + "step": 7351 + }, + { + "epoch": 6.714155251141553, + "grad_norm": 2.1165294647216797, + "learning_rate": 3.6519533231862004e-06, + "loss": 0.0146, + "step": 7352 + }, + { + "epoch": 6.715068493150685, + "grad_norm": 40.96381378173828, + "learning_rate": 3.6509386098427197e-06, + "loss": 0.3505, + "step": 7353 + }, + { + "epoch": 6.715981735159818, + "grad_norm": 2.5005667209625244, + "learning_rate": 3.6499238964992394e-06, + "loss": 0.0214, + "step": 7354 + }, + { + "epoch": 6.71689497716895, + "grad_norm": 3.807114839553833, + "learning_rate": 3.6489091831557586e-06, + "loss": 0.0166, + "step": 7355 + }, + { + "epoch": 6.717808219178083, + "grad_norm": 4.987398147583008, + "learning_rate": 3.6478944698122783e-06, + "loss": 0.0344, + "step": 7356 + }, + { + "epoch": 6.718721461187215, + "grad_norm": 3.387132167816162, + "learning_rate": 3.646879756468798e-06, + "loss": 0.021, + "step": 7357 + }, + { + "epoch": 6.719634703196347, + "grad_norm": 12.67447566986084, + "learning_rate": 3.6458650431253173e-06, + "loss": 0.11, + "step": 7358 + }, + { + "epoch": 6.72054794520548, + "grad_norm": 0.17522984743118286, + "learning_rate": 3.644850329781837e-06, + "loss": 0.0013, + "step": 7359 + }, + { + "epoch": 6.7214611872146115, + "grad_norm": 24.48514747619629, + "learning_rate": 3.6438356164383567e-06, + "loss": 0.1361, + "step": 7360 + }, + { + "epoch": 6.722374429223744, + "grad_norm": 15.631121635437012, + "learning_rate": 3.642820903094876e-06, + "loss": 0.1195, + "step": 7361 + }, + { + "epoch": 6.723287671232876, + "grad_norm": 7.199901103973389, + "learning_rate": 3.6418061897513956e-06, + "loss": 0.0596, + "step": 7362 + }, + { + "epoch": 6.724200913242009, + "grad_norm": 13.652734756469727, + "learning_rate": 3.640791476407915e-06, + "loss": 0.0638, + "step": 7363 + }, + { + "epoch": 6.725114155251141, + "grad_norm": 0.4639397859573364, + "learning_rate": 3.639776763064435e-06, + "loss": 0.0033, + "step": 7364 + }, + { + "epoch": 6.726027397260274, + "grad_norm": 24.372150421142578, + "learning_rate": 3.6387620497209543e-06, + "loss": 0.2394, + "step": 7365 + }, + { + "epoch": 6.726940639269406, + "grad_norm": 82.04707336425781, + "learning_rate": 3.6377473363774736e-06, + "loss": 0.7985, + "step": 7366 + }, + { + "epoch": 6.727853881278539, + "grad_norm": 17.831405639648438, + "learning_rate": 3.6367326230339933e-06, + "loss": 0.0979, + "step": 7367 + }, + { + "epoch": 6.728767123287671, + "grad_norm": 4.519169330596924, + "learning_rate": 3.6357179096905125e-06, + "loss": 0.0288, + "step": 7368 + }, + { + "epoch": 6.729680365296804, + "grad_norm": 36.247379302978516, + "learning_rate": 3.6347031963470326e-06, + "loss": 0.234, + "step": 7369 + }, + { + "epoch": 6.730593607305936, + "grad_norm": 4.848525524139404, + "learning_rate": 3.633688483003552e-06, + "loss": 0.025, + "step": 7370 + }, + { + "epoch": 6.7315068493150685, + "grad_norm": 0.1669214814901352, + "learning_rate": 3.632673769660071e-06, + "loss": 0.0012, + "step": 7371 + }, + { + "epoch": 6.732420091324201, + "grad_norm": 2.3948769569396973, + "learning_rate": 3.631659056316591e-06, + "loss": 0.0143, + "step": 7372 + }, + { + "epoch": 6.733333333333333, + "grad_norm": 0.04063297063112259, + "learning_rate": 3.63064434297311e-06, + "loss": 0.0003, + "step": 7373 + }, + { + "epoch": 6.734246575342466, + "grad_norm": 6.687952995300293, + "learning_rate": 3.6296296296296302e-06, + "loss": 0.0456, + "step": 7374 + }, + { + "epoch": 6.735159817351598, + "grad_norm": 3.3822743892669678, + "learning_rate": 3.6286149162861495e-06, + "loss": 0.0185, + "step": 7375 + }, + { + "epoch": 6.736073059360731, + "grad_norm": 3.512389659881592, + "learning_rate": 3.6276002029426688e-06, + "loss": 0.017, + "step": 7376 + }, + { + "epoch": 6.736986301369863, + "grad_norm": 2.122880697250366, + "learning_rate": 3.6265854895991885e-06, + "loss": 0.0126, + "step": 7377 + }, + { + "epoch": 6.737899543378996, + "grad_norm": 25.550565719604492, + "learning_rate": 3.625570776255708e-06, + "loss": 0.2211, + "step": 7378 + }, + { + "epoch": 6.738812785388128, + "grad_norm": 0.5036630630493164, + "learning_rate": 3.624556062912228e-06, + "loss": 0.0035, + "step": 7379 + }, + { + "epoch": 6.739726027397261, + "grad_norm": 0.3415035307407379, + "learning_rate": 3.623541349568747e-06, + "loss": 0.0018, + "step": 7380 + }, + { + "epoch": 6.740639269406393, + "grad_norm": 9.139322280883789, + "learning_rate": 3.6225266362252664e-06, + "loss": 0.0351, + "step": 7381 + }, + { + "epoch": 6.7415525114155255, + "grad_norm": 0.07064466178417206, + "learning_rate": 3.6215119228817865e-06, + "loss": 0.0005, + "step": 7382 + }, + { + "epoch": 6.742465753424657, + "grad_norm": 3.8701748847961426, + "learning_rate": 3.6204972095383058e-06, + "loss": 0.0281, + "step": 7383 + }, + { + "epoch": 6.74337899543379, + "grad_norm": 0.955512285232544, + "learning_rate": 3.6194824961948255e-06, + "loss": 0.0066, + "step": 7384 + }, + { + "epoch": 6.744292237442922, + "grad_norm": 66.12566375732422, + "learning_rate": 3.6184677828513447e-06, + "loss": 0.745, + "step": 7385 + }, + { + "epoch": 6.745205479452055, + "grad_norm": 12.542951583862305, + "learning_rate": 3.617453069507864e-06, + "loss": 0.0817, + "step": 7386 + }, + { + "epoch": 6.746118721461187, + "grad_norm": 0.7241488099098206, + "learning_rate": 3.616438356164384e-06, + "loss": 0.0064, + "step": 7387 + }, + { + "epoch": 6.747031963470319, + "grad_norm": 0.06323787569999695, + "learning_rate": 3.6154236428209034e-06, + "loss": 0.0004, + "step": 7388 + }, + { + "epoch": 6.747945205479452, + "grad_norm": 0.023778527975082397, + "learning_rate": 3.614408929477423e-06, + "loss": 0.0001, + "step": 7389 + }, + { + "epoch": 6.748858447488584, + "grad_norm": 4.374607086181641, + "learning_rate": 3.6133942161339424e-06, + "loss": 0.0252, + "step": 7390 + }, + { + "epoch": 6.749771689497717, + "grad_norm": 37.55704879760742, + "learning_rate": 3.6123795027904616e-06, + "loss": 0.2365, + "step": 7391 + }, + { + "epoch": 6.750684931506849, + "grad_norm": 6.744319915771484, + "learning_rate": 3.6113647894469817e-06, + "loss": 0.0401, + "step": 7392 + }, + { + "epoch": 6.751598173515982, + "grad_norm": 0.7654634714126587, + "learning_rate": 3.610350076103501e-06, + "loss": 0.0044, + "step": 7393 + }, + { + "epoch": 6.752511415525114, + "grad_norm": 34.62944030761719, + "learning_rate": 3.6093353627600207e-06, + "loss": 0.254, + "step": 7394 + }, + { + "epoch": 6.7534246575342465, + "grad_norm": 49.882972717285156, + "learning_rate": 3.60832064941654e-06, + "loss": 0.3089, + "step": 7395 + }, + { + "epoch": 6.754337899543379, + "grad_norm": 8.289661407470703, + "learning_rate": 3.6073059360730597e-06, + "loss": 0.0596, + "step": 7396 + }, + { + "epoch": 6.755251141552511, + "grad_norm": 10.701382637023926, + "learning_rate": 3.6062912227295794e-06, + "loss": 0.0958, + "step": 7397 + }, + { + "epoch": 6.756164383561644, + "grad_norm": 1.8977946043014526, + "learning_rate": 3.6052765093860986e-06, + "loss": 0.0132, + "step": 7398 + }, + { + "epoch": 6.757077625570776, + "grad_norm": 3.2352559566497803, + "learning_rate": 3.6042617960426183e-06, + "loss": 0.0214, + "step": 7399 + }, + { + "epoch": 6.757990867579909, + "grad_norm": 23.53231430053711, + "learning_rate": 3.603247082699138e-06, + "loss": 0.2927, + "step": 7400 + }, + { + "epoch": 6.758904109589041, + "grad_norm": 0.4969744384288788, + "learning_rate": 3.6022323693556573e-06, + "loss": 0.0031, + "step": 7401 + }, + { + "epoch": 6.759817351598174, + "grad_norm": 0.35649573802948, + "learning_rate": 3.601217656012177e-06, + "loss": 0.0023, + "step": 7402 + }, + { + "epoch": 6.760730593607306, + "grad_norm": 116.18306732177734, + "learning_rate": 3.6002029426686962e-06, + "loss": 1.6628, + "step": 7403 + }, + { + "epoch": 6.761643835616439, + "grad_norm": 2.772900342941284, + "learning_rate": 3.5991882293252164e-06, + "loss": 0.0161, + "step": 7404 + }, + { + "epoch": 6.762557077625571, + "grad_norm": 0.08861187845468521, + "learning_rate": 3.5981735159817356e-06, + "loss": 0.0007, + "step": 7405 + }, + { + "epoch": 6.7634703196347035, + "grad_norm": 8.142931938171387, + "learning_rate": 3.597158802638255e-06, + "loss": 0.0443, + "step": 7406 + }, + { + "epoch": 6.764383561643836, + "grad_norm": 5.345365524291992, + "learning_rate": 3.5961440892947746e-06, + "loss": 0.0454, + "step": 7407 + }, + { + "epoch": 6.765296803652968, + "grad_norm": 0.09266132116317749, + "learning_rate": 3.595129375951294e-06, + "loss": 0.0006, + "step": 7408 + }, + { + "epoch": 6.766210045662101, + "grad_norm": 17.822917938232422, + "learning_rate": 3.594114662607814e-06, + "loss": 0.1284, + "step": 7409 + }, + { + "epoch": 6.767123287671232, + "grad_norm": 72.59253692626953, + "learning_rate": 3.5930999492643332e-06, + "loss": 0.4894, + "step": 7410 + }, + { + "epoch": 6.768036529680366, + "grad_norm": 0.23187939822673798, + "learning_rate": 3.5920852359208525e-06, + "loss": 0.0013, + "step": 7411 + }, + { + "epoch": 6.768949771689497, + "grad_norm": 0.6494069695472717, + "learning_rate": 3.591070522577372e-06, + "loss": 0.0042, + "step": 7412 + }, + { + "epoch": 6.76986301369863, + "grad_norm": 60.67249298095703, + "learning_rate": 3.5900558092338915e-06, + "loss": 0.8856, + "step": 7413 + }, + { + "epoch": 6.770776255707762, + "grad_norm": 0.544363796710968, + "learning_rate": 3.5890410958904116e-06, + "loss": 0.0028, + "step": 7414 + }, + { + "epoch": 6.771689497716895, + "grad_norm": 0.7896638512611389, + "learning_rate": 3.588026382546931e-06, + "loss": 0.0047, + "step": 7415 + }, + { + "epoch": 6.772602739726027, + "grad_norm": 1.1632403135299683, + "learning_rate": 3.58701166920345e-06, + "loss": 0.0086, + "step": 7416 + }, + { + "epoch": 6.77351598173516, + "grad_norm": 0.043275970965623856, + "learning_rate": 3.58599695585997e-06, + "loss": 0.0004, + "step": 7417 + }, + { + "epoch": 6.774429223744292, + "grad_norm": 0.373374342918396, + "learning_rate": 3.5849822425164895e-06, + "loss": 0.0026, + "step": 7418 + }, + { + "epoch": 6.7753424657534245, + "grad_norm": 32.06299591064453, + "learning_rate": 3.583967529173009e-06, + "loss": 0.2079, + "step": 7419 + }, + { + "epoch": 6.776255707762557, + "grad_norm": 39.03614044189453, + "learning_rate": 3.5829528158295285e-06, + "loss": 0.2809, + "step": 7420 + }, + { + "epoch": 6.777168949771689, + "grad_norm": 0.6556504368782043, + "learning_rate": 3.5819381024860477e-06, + "loss": 0.0046, + "step": 7421 + }, + { + "epoch": 6.778082191780822, + "grad_norm": 0.3690522015094757, + "learning_rate": 3.580923389142568e-06, + "loss": 0.0019, + "step": 7422 + }, + { + "epoch": 6.778995433789954, + "grad_norm": 84.18517303466797, + "learning_rate": 3.579908675799087e-06, + "loss": 2.4959, + "step": 7423 + }, + { + "epoch": 6.779908675799087, + "grad_norm": 2.1326253414154053, + "learning_rate": 3.578893962455607e-06, + "loss": 0.0122, + "step": 7424 + }, + { + "epoch": 6.780821917808219, + "grad_norm": 1.3797141313552856, + "learning_rate": 3.577879249112126e-06, + "loss": 0.0075, + "step": 7425 + }, + { + "epoch": 6.781735159817352, + "grad_norm": 0.9832749962806702, + "learning_rate": 3.5768645357686453e-06, + "loss": 0.0052, + "step": 7426 + }, + { + "epoch": 6.782648401826484, + "grad_norm": 2.4582886695861816, + "learning_rate": 3.5758498224251655e-06, + "loss": 0.0132, + "step": 7427 + }, + { + "epoch": 6.7835616438356166, + "grad_norm": 1.478623390197754, + "learning_rate": 3.5748351090816847e-06, + "loss": 0.0094, + "step": 7428 + }, + { + "epoch": 6.784474885844749, + "grad_norm": 12.98536491394043, + "learning_rate": 3.5738203957382044e-06, + "loss": 0.1032, + "step": 7429 + }, + { + "epoch": 6.7853881278538815, + "grad_norm": 6.4768877029418945, + "learning_rate": 3.5728056823947237e-06, + "loss": 0.042, + "step": 7430 + }, + { + "epoch": 6.786301369863014, + "grad_norm": 0.746381938457489, + "learning_rate": 3.571790969051243e-06, + "loss": 0.0044, + "step": 7431 + }, + { + "epoch": 6.787214611872146, + "grad_norm": 34.41130447387695, + "learning_rate": 3.570776255707763e-06, + "loss": 0.2357, + "step": 7432 + }, + { + "epoch": 6.788127853881279, + "grad_norm": 0.0504218153655529, + "learning_rate": 3.5697615423642823e-06, + "loss": 0.0004, + "step": 7433 + }, + { + "epoch": 6.789041095890411, + "grad_norm": 5.911918640136719, + "learning_rate": 3.568746829020802e-06, + "loss": 0.0471, + "step": 7434 + }, + { + "epoch": 6.789954337899544, + "grad_norm": 35.82204818725586, + "learning_rate": 3.5677321156773213e-06, + "loss": 0.2182, + "step": 7435 + }, + { + "epoch": 6.790867579908676, + "grad_norm": 1.7254160642623901, + "learning_rate": 3.566717402333841e-06, + "loss": 0.0129, + "step": 7436 + }, + { + "epoch": 6.791780821917808, + "grad_norm": 0.2602769732475281, + "learning_rate": 3.5657026889903607e-06, + "loss": 0.0016, + "step": 7437 + }, + { + "epoch": 6.792694063926941, + "grad_norm": 0.6854977011680603, + "learning_rate": 3.56468797564688e-06, + "loss": 0.0052, + "step": 7438 + }, + { + "epoch": 6.793607305936073, + "grad_norm": 18.21657371520996, + "learning_rate": 3.5636732623033996e-06, + "loss": 0.0805, + "step": 7439 + }, + { + "epoch": 6.794520547945205, + "grad_norm": 0.718605101108551, + "learning_rate": 3.5626585489599193e-06, + "loss": 0.0044, + "step": 7440 + }, + { + "epoch": 6.7954337899543376, + "grad_norm": 122.75605773925781, + "learning_rate": 3.5616438356164386e-06, + "loss": 2.8496, + "step": 7441 + }, + { + "epoch": 6.79634703196347, + "grad_norm": 0.4126568138599396, + "learning_rate": 3.5606291222729583e-06, + "loss": 0.0023, + "step": 7442 + }, + { + "epoch": 6.7972602739726025, + "grad_norm": 4.190203666687012, + "learning_rate": 3.5596144089294776e-06, + "loss": 0.0325, + "step": 7443 + }, + { + "epoch": 6.798173515981735, + "grad_norm": 120.65007781982422, + "learning_rate": 3.5585996955859977e-06, + "loss": 1.9443, + "step": 7444 + }, + { + "epoch": 6.799086757990867, + "grad_norm": 2.8639581203460693, + "learning_rate": 3.557584982242517e-06, + "loss": 0.0114, + "step": 7445 + }, + { + "epoch": 6.8, + "grad_norm": 0.46485015749931335, + "learning_rate": 3.5565702688990362e-06, + "loss": 0.0033, + "step": 7446 + }, + { + "epoch": 6.800913242009132, + "grad_norm": 27.362213134765625, + "learning_rate": 3.555555555555556e-06, + "loss": 0.1558, + "step": 7447 + }, + { + "epoch": 6.801826484018265, + "grad_norm": 26.606706619262695, + "learning_rate": 3.554540842212075e-06, + "loss": 0.1781, + "step": 7448 + }, + { + "epoch": 6.802739726027397, + "grad_norm": 0.04152502492070198, + "learning_rate": 3.5535261288685953e-06, + "loss": 0.0003, + "step": 7449 + }, + { + "epoch": 6.80365296803653, + "grad_norm": 15.337368965148926, + "learning_rate": 3.5525114155251146e-06, + "loss": 0.0831, + "step": 7450 + }, + { + "epoch": 6.804566210045662, + "grad_norm": 32.37555694580078, + "learning_rate": 3.551496702181634e-06, + "loss": 0.2893, + "step": 7451 + }, + { + "epoch": 6.8054794520547945, + "grad_norm": 29.91319465637207, + "learning_rate": 3.5504819888381535e-06, + "loss": 0.2192, + "step": 7452 + }, + { + "epoch": 6.806392694063927, + "grad_norm": 0.09921137243509293, + "learning_rate": 3.549467275494673e-06, + "loss": 0.0007, + "step": 7453 + }, + { + "epoch": 6.8073059360730594, + "grad_norm": 1.4794316291809082, + "learning_rate": 3.548452562151193e-06, + "loss": 0.0095, + "step": 7454 + }, + { + "epoch": 6.808219178082192, + "grad_norm": 0.6298635601997375, + "learning_rate": 3.547437848807712e-06, + "loss": 0.0052, + "step": 7455 + }, + { + "epoch": 6.809132420091324, + "grad_norm": 0.22812886536121368, + "learning_rate": 3.5464231354642314e-06, + "loss": 0.0014, + "step": 7456 + }, + { + "epoch": 6.810045662100457, + "grad_norm": 2.0411429405212402, + "learning_rate": 3.545408422120751e-06, + "loss": 0.0118, + "step": 7457 + }, + { + "epoch": 6.810958904109589, + "grad_norm": 83.27349090576172, + "learning_rate": 3.544393708777271e-06, + "loss": 0.7544, + "step": 7458 + }, + { + "epoch": 6.811872146118722, + "grad_norm": 62.38881301879883, + "learning_rate": 3.5433789954337905e-06, + "loss": 0.4595, + "step": 7459 + }, + { + "epoch": 6.812785388127854, + "grad_norm": 2.6325509548187256, + "learning_rate": 3.54236428209031e-06, + "loss": 0.0171, + "step": 7460 + }, + { + "epoch": 6.813698630136987, + "grad_norm": 17.351858139038086, + "learning_rate": 3.541349568746829e-06, + "loss": 0.1269, + "step": 7461 + }, + { + "epoch": 6.814611872146119, + "grad_norm": 4.495995044708252, + "learning_rate": 3.540334855403349e-06, + "loss": 0.0354, + "step": 7462 + }, + { + "epoch": 6.8155251141552515, + "grad_norm": 89.4161148071289, + "learning_rate": 3.5393201420598684e-06, + "loss": 3.0347, + "step": 7463 + }, + { + "epoch": 6.816438356164383, + "grad_norm": 1.939497470855713, + "learning_rate": 3.538305428716388e-06, + "loss": 0.0082, + "step": 7464 + }, + { + "epoch": 6.817351598173516, + "grad_norm": 15.587080001831055, + "learning_rate": 3.5372907153729074e-06, + "loss": 0.1138, + "step": 7465 + }, + { + "epoch": 6.818264840182648, + "grad_norm": 13.995903968811035, + "learning_rate": 3.5362760020294267e-06, + "loss": 0.0977, + "step": 7466 + }, + { + "epoch": 6.8191780821917805, + "grad_norm": 7.496971607208252, + "learning_rate": 3.535261288685947e-06, + "loss": 0.0655, + "step": 7467 + }, + { + "epoch": 6.820091324200913, + "grad_norm": 27.86009407043457, + "learning_rate": 3.534246575342466e-06, + "loss": 0.1422, + "step": 7468 + }, + { + "epoch": 6.821004566210045, + "grad_norm": 1.1725975275039673, + "learning_rate": 3.5332318619989857e-06, + "loss": 0.0069, + "step": 7469 + }, + { + "epoch": 6.821917808219178, + "grad_norm": 20.985130310058594, + "learning_rate": 3.532217148655505e-06, + "loss": 0.1349, + "step": 7470 + }, + { + "epoch": 6.82283105022831, + "grad_norm": 0.3249942660331726, + "learning_rate": 3.5312024353120243e-06, + "loss": 0.002, + "step": 7471 + }, + { + "epoch": 6.823744292237443, + "grad_norm": 2.6459648609161377, + "learning_rate": 3.5301877219685444e-06, + "loss": 0.0142, + "step": 7472 + }, + { + "epoch": 6.824657534246575, + "grad_norm": 91.2942886352539, + "learning_rate": 3.5291730086250637e-06, + "loss": 0.794, + "step": 7473 + }, + { + "epoch": 6.825570776255708, + "grad_norm": 5.343826770782471, + "learning_rate": 3.5281582952815834e-06, + "loss": 0.0122, + "step": 7474 + }, + { + "epoch": 6.82648401826484, + "grad_norm": 0.9353645443916321, + "learning_rate": 3.5271435819381026e-06, + "loss": 0.0063, + "step": 7475 + }, + { + "epoch": 6.8273972602739725, + "grad_norm": 8.8355131149292, + "learning_rate": 3.5261288685946223e-06, + "loss": 0.0426, + "step": 7476 + }, + { + "epoch": 6.828310502283105, + "grad_norm": 58.582000732421875, + "learning_rate": 3.525114155251142e-06, + "loss": 0.6283, + "step": 7477 + }, + { + "epoch": 6.829223744292237, + "grad_norm": 40.9715690612793, + "learning_rate": 3.5240994419076613e-06, + "loss": 0.3549, + "step": 7478 + }, + { + "epoch": 6.83013698630137, + "grad_norm": 10.347558975219727, + "learning_rate": 3.523084728564181e-06, + "loss": 0.0582, + "step": 7479 + }, + { + "epoch": 6.831050228310502, + "grad_norm": 0.9348245859146118, + "learning_rate": 3.5220700152207007e-06, + "loss": 0.0069, + "step": 7480 + }, + { + "epoch": 6.831963470319635, + "grad_norm": 4.919580936431885, + "learning_rate": 3.52105530187722e-06, + "loss": 0.0228, + "step": 7481 + }, + { + "epoch": 6.832876712328767, + "grad_norm": 59.92014694213867, + "learning_rate": 3.5200405885337396e-06, + "loss": 0.4355, + "step": 7482 + }, + { + "epoch": 6.8337899543379, + "grad_norm": 2.998999834060669, + "learning_rate": 3.519025875190259e-06, + "loss": 0.0139, + "step": 7483 + }, + { + "epoch": 6.834703196347032, + "grad_norm": 1.1875149011611938, + "learning_rate": 3.518011161846779e-06, + "loss": 0.007, + "step": 7484 + }, + { + "epoch": 6.835616438356165, + "grad_norm": 92.69986724853516, + "learning_rate": 3.5169964485032983e-06, + "loss": 1.7521, + "step": 7485 + }, + { + "epoch": 6.836529680365297, + "grad_norm": 3.053109645843506, + "learning_rate": 3.5159817351598176e-06, + "loss": 0.0183, + "step": 7486 + }, + { + "epoch": 6.8374429223744295, + "grad_norm": 0.5645354390144348, + "learning_rate": 3.5149670218163372e-06, + "loss": 0.0043, + "step": 7487 + }, + { + "epoch": 6.838356164383562, + "grad_norm": 0.04216758906841278, + "learning_rate": 3.5139523084728565e-06, + "loss": 0.0002, + "step": 7488 + }, + { + "epoch": 6.839269406392694, + "grad_norm": 7.221572399139404, + "learning_rate": 3.5129375951293766e-06, + "loss": 0.0543, + "step": 7489 + }, + { + "epoch": 6.840182648401827, + "grad_norm": 6.482089996337891, + "learning_rate": 3.511922881785896e-06, + "loss": 0.0427, + "step": 7490 + }, + { + "epoch": 6.8410958904109584, + "grad_norm": 149.99337768554688, + "learning_rate": 3.510908168442415e-06, + "loss": 2.268, + "step": 7491 + }, + { + "epoch": 6.842009132420092, + "grad_norm": 17.163820266723633, + "learning_rate": 3.509893455098935e-06, + "loss": 0.1119, + "step": 7492 + }, + { + "epoch": 6.842922374429223, + "grad_norm": 47.58928298950195, + "learning_rate": 3.508878741755454e-06, + "loss": 0.4561, + "step": 7493 + }, + { + "epoch": 6.843835616438356, + "grad_norm": 7.338235378265381, + "learning_rate": 3.5078640284119742e-06, + "loss": 0.0409, + "step": 7494 + }, + { + "epoch": 6.844748858447488, + "grad_norm": 13.392500877380371, + "learning_rate": 3.5068493150684935e-06, + "loss": 0.076, + "step": 7495 + }, + { + "epoch": 6.845662100456621, + "grad_norm": 2.7807810306549072, + "learning_rate": 3.5058346017250128e-06, + "loss": 0.0156, + "step": 7496 + }, + { + "epoch": 6.846575342465753, + "grad_norm": 0.3012140691280365, + "learning_rate": 3.5048198883815325e-06, + "loss": 0.0018, + "step": 7497 + }, + { + "epoch": 6.847488584474886, + "grad_norm": 1.8441460132598877, + "learning_rate": 3.503805175038052e-06, + "loss": 0.0105, + "step": 7498 + }, + { + "epoch": 6.848401826484018, + "grad_norm": 0.3329034745693207, + "learning_rate": 3.502790461694572e-06, + "loss": 0.0031, + "step": 7499 + }, + { + "epoch": 6.8493150684931505, + "grad_norm": 161.9816436767578, + "learning_rate": 3.501775748351091e-06, + "loss": 2.2979, + "step": 7500 + }, + { + "epoch": 6.850228310502283, + "grad_norm": 12.617585182189941, + "learning_rate": 3.5007610350076104e-06, + "loss": 0.0781, + "step": 7501 + }, + { + "epoch": 6.851141552511415, + "grad_norm": 2.8255815505981445, + "learning_rate": 3.4997463216641305e-06, + "loss": 0.014, + "step": 7502 + }, + { + "epoch": 6.852054794520548, + "grad_norm": 9.459508895874023, + "learning_rate": 3.4987316083206498e-06, + "loss": 0.0716, + "step": 7503 + }, + { + "epoch": 6.85296803652968, + "grad_norm": 0.6697777509689331, + "learning_rate": 3.4977168949771695e-06, + "loss": 0.0026, + "step": 7504 + }, + { + "epoch": 6.853881278538813, + "grad_norm": 5.288496494293213, + "learning_rate": 3.4967021816336887e-06, + "loss": 0.0361, + "step": 7505 + }, + { + "epoch": 6.854794520547945, + "grad_norm": 1.9105104207992554, + "learning_rate": 3.495687468290208e-06, + "loss": 0.011, + "step": 7506 + }, + { + "epoch": 6.855707762557078, + "grad_norm": 1.4632195234298706, + "learning_rate": 3.494672754946728e-06, + "loss": 0.0071, + "step": 7507 + }, + { + "epoch": 6.85662100456621, + "grad_norm": 27.527297973632812, + "learning_rate": 3.4936580416032474e-06, + "loss": 0.2097, + "step": 7508 + }, + { + "epoch": 6.857534246575343, + "grad_norm": 1.3113300800323486, + "learning_rate": 3.492643328259767e-06, + "loss": 0.0102, + "step": 7509 + }, + { + "epoch": 6.858447488584475, + "grad_norm": 4.049346923828125, + "learning_rate": 3.4916286149162863e-06, + "loss": 0.0347, + "step": 7510 + }, + { + "epoch": 6.8593607305936075, + "grad_norm": 15.979491233825684, + "learning_rate": 3.4906139015728056e-06, + "loss": 0.0862, + "step": 7511 + }, + { + "epoch": 6.86027397260274, + "grad_norm": 0.3373635411262512, + "learning_rate": 3.4895991882293257e-06, + "loss": 0.0021, + "step": 7512 + }, + { + "epoch": 6.861187214611872, + "grad_norm": 0.5668258666992188, + "learning_rate": 3.488584474885845e-06, + "loss": 0.0042, + "step": 7513 + }, + { + "epoch": 6.862100456621005, + "grad_norm": 1.0458801984786987, + "learning_rate": 3.4875697615423647e-06, + "loss": 0.0048, + "step": 7514 + }, + { + "epoch": 6.863013698630137, + "grad_norm": 0.4582161605358124, + "learning_rate": 3.486555048198884e-06, + "loss": 0.0025, + "step": 7515 + }, + { + "epoch": 6.86392694063927, + "grad_norm": 1.9998024702072144, + "learning_rate": 3.4855403348554032e-06, + "loss": 0.0107, + "step": 7516 + }, + { + "epoch": 6.864840182648402, + "grad_norm": 66.26168823242188, + "learning_rate": 3.4845256215119233e-06, + "loss": 0.7871, + "step": 7517 + }, + { + "epoch": 6.865753424657534, + "grad_norm": 24.59510040283203, + "learning_rate": 3.4835109081684426e-06, + "loss": 0.1681, + "step": 7518 + }, + { + "epoch": 6.866666666666667, + "grad_norm": 2.1264171600341797, + "learning_rate": 3.4824961948249623e-06, + "loss": 0.0143, + "step": 7519 + }, + { + "epoch": 6.867579908675799, + "grad_norm": 2.8515257835388184, + "learning_rate": 3.481481481481482e-06, + "loss": 0.0237, + "step": 7520 + }, + { + "epoch": 6.868493150684931, + "grad_norm": 0.19136479496955872, + "learning_rate": 3.4804667681380013e-06, + "loss": 0.0011, + "step": 7521 + }, + { + "epoch": 6.869406392694064, + "grad_norm": 2.0598137378692627, + "learning_rate": 3.479452054794521e-06, + "loss": 0.0139, + "step": 7522 + }, + { + "epoch": 6.870319634703196, + "grad_norm": 21.501632690429688, + "learning_rate": 3.4784373414510402e-06, + "loss": 0.1957, + "step": 7523 + }, + { + "epoch": 6.8712328767123285, + "grad_norm": 0.4902336597442627, + "learning_rate": 3.4774226281075603e-06, + "loss": 0.0031, + "step": 7524 + }, + { + "epoch": 6.872146118721461, + "grad_norm": 1.0300499200820923, + "learning_rate": 3.4764079147640796e-06, + "loss": 0.0069, + "step": 7525 + }, + { + "epoch": 6.873059360730593, + "grad_norm": 13.073698997497559, + "learning_rate": 3.475393201420599e-06, + "loss": 0.0992, + "step": 7526 + }, + { + "epoch": 6.873972602739726, + "grad_norm": 0.8284772038459778, + "learning_rate": 3.4743784880771186e-06, + "loss": 0.0053, + "step": 7527 + }, + { + "epoch": 6.874885844748858, + "grad_norm": 0.060891278088092804, + "learning_rate": 3.473363774733638e-06, + "loss": 0.0004, + "step": 7528 + }, + { + "epoch": 6.875799086757991, + "grad_norm": 2.270972967147827, + "learning_rate": 3.472349061390158e-06, + "loss": 0.0153, + "step": 7529 + }, + { + "epoch": 6.876712328767123, + "grad_norm": 14.01248550415039, + "learning_rate": 3.4713343480466772e-06, + "loss": 0.1039, + "step": 7530 + }, + { + "epoch": 6.877625570776256, + "grad_norm": 0.7699475288391113, + "learning_rate": 3.4703196347031965e-06, + "loss": 0.0064, + "step": 7531 + }, + { + "epoch": 6.878538812785388, + "grad_norm": 2.567697525024414, + "learning_rate": 3.469304921359716e-06, + "loss": 0.0189, + "step": 7532 + }, + { + "epoch": 6.879452054794521, + "grad_norm": 13.348701477050781, + "learning_rate": 3.4682902080162355e-06, + "loss": 0.0948, + "step": 7533 + }, + { + "epoch": 6.880365296803653, + "grad_norm": 1.842233657836914, + "learning_rate": 3.4672754946727556e-06, + "loss": 0.0128, + "step": 7534 + }, + { + "epoch": 6.8812785388127855, + "grad_norm": 8.45283031463623, + "learning_rate": 3.466260781329275e-06, + "loss": 0.0724, + "step": 7535 + }, + { + "epoch": 6.882191780821918, + "grad_norm": 39.09015655517578, + "learning_rate": 3.465246067985794e-06, + "loss": 0.3637, + "step": 7536 + }, + { + "epoch": 6.88310502283105, + "grad_norm": 0.3589327931404114, + "learning_rate": 3.464231354642314e-06, + "loss": 0.0034, + "step": 7537 + }, + { + "epoch": 6.884018264840183, + "grad_norm": 1.6777727603912354, + "learning_rate": 3.4632166412988335e-06, + "loss": 0.013, + "step": 7538 + }, + { + "epoch": 6.884931506849315, + "grad_norm": 18.886985778808594, + "learning_rate": 3.462201927955353e-06, + "loss": 0.1117, + "step": 7539 + }, + { + "epoch": 6.885844748858448, + "grad_norm": 0.3460524380207062, + "learning_rate": 3.4611872146118725e-06, + "loss": 0.0021, + "step": 7540 + }, + { + "epoch": 6.88675799086758, + "grad_norm": 1.2704315185546875, + "learning_rate": 3.4601725012683917e-06, + "loss": 0.0125, + "step": 7541 + }, + { + "epoch": 6.887671232876713, + "grad_norm": 0.20037095248699188, + "learning_rate": 3.459157787924912e-06, + "loss": 0.0015, + "step": 7542 + }, + { + "epoch": 6.888584474885845, + "grad_norm": 0.17368076741695404, + "learning_rate": 3.458143074581431e-06, + "loss": 0.001, + "step": 7543 + }, + { + "epoch": 6.889497716894978, + "grad_norm": 1.3299243450164795, + "learning_rate": 3.457128361237951e-06, + "loss": 0.0039, + "step": 7544 + }, + { + "epoch": 6.890410958904109, + "grad_norm": 4.160149097442627, + "learning_rate": 3.45611364789447e-06, + "loss": 0.0282, + "step": 7545 + }, + { + "epoch": 6.8913242009132425, + "grad_norm": 3.896698236465454, + "learning_rate": 3.4550989345509893e-06, + "loss": 0.0261, + "step": 7546 + }, + { + "epoch": 6.892237442922374, + "grad_norm": 3.852173328399658, + "learning_rate": 3.4540842212075094e-06, + "loss": 0.0225, + "step": 7547 + }, + { + "epoch": 6.8931506849315065, + "grad_norm": 5.765018939971924, + "learning_rate": 3.4530695078640287e-06, + "loss": 0.0424, + "step": 7548 + }, + { + "epoch": 6.894063926940639, + "grad_norm": 2.1421000957489014, + "learning_rate": 3.4520547945205484e-06, + "loss": 0.0192, + "step": 7549 + }, + { + "epoch": 6.894977168949771, + "grad_norm": 2.242246627807617, + "learning_rate": 3.4510400811770677e-06, + "loss": 0.0093, + "step": 7550 + }, + { + "epoch": 6.895890410958904, + "grad_norm": 4.227834701538086, + "learning_rate": 3.450025367833587e-06, + "loss": 0.0321, + "step": 7551 + }, + { + "epoch": 6.896803652968036, + "grad_norm": 1.2020431756973267, + "learning_rate": 3.449010654490107e-06, + "loss": 0.0068, + "step": 7552 + }, + { + "epoch": 6.897716894977169, + "grad_norm": 2.0217063426971436, + "learning_rate": 3.4479959411466263e-06, + "loss": 0.0146, + "step": 7553 + }, + { + "epoch": 6.898630136986301, + "grad_norm": 8.025837898254395, + "learning_rate": 3.446981227803146e-06, + "loss": 0.0637, + "step": 7554 + }, + { + "epoch": 6.899543378995434, + "grad_norm": 2.7106082439422607, + "learning_rate": 3.4459665144596653e-06, + "loss": 0.0181, + "step": 7555 + }, + { + "epoch": 6.900456621004566, + "grad_norm": 3.3025243282318115, + "learning_rate": 3.4449518011161846e-06, + "loss": 0.022, + "step": 7556 + }, + { + "epoch": 6.901369863013699, + "grad_norm": 0.4356660842895508, + "learning_rate": 3.4439370877727047e-06, + "loss": 0.0023, + "step": 7557 + }, + { + "epoch": 6.902283105022831, + "grad_norm": 25.458873748779297, + "learning_rate": 3.442922374429224e-06, + "loss": 0.1854, + "step": 7558 + }, + { + "epoch": 6.9031963470319635, + "grad_norm": 1.2641865015029907, + "learning_rate": 3.4419076610857436e-06, + "loss": 0.0061, + "step": 7559 + }, + { + "epoch": 6.904109589041096, + "grad_norm": 0.41113361716270447, + "learning_rate": 3.4408929477422633e-06, + "loss": 0.003, + "step": 7560 + }, + { + "epoch": 6.905022831050228, + "grad_norm": 2.3064093589782715, + "learning_rate": 3.4398782343987826e-06, + "loss": 0.0166, + "step": 7561 + }, + { + "epoch": 6.905936073059361, + "grad_norm": 0.1077285185456276, + "learning_rate": 3.4388635210553023e-06, + "loss": 0.0005, + "step": 7562 + }, + { + "epoch": 6.906849315068493, + "grad_norm": 27.679147720336914, + "learning_rate": 3.4378488077118216e-06, + "loss": 0.186, + "step": 7563 + }, + { + "epoch": 6.907762557077626, + "grad_norm": 0.09807410091161728, + "learning_rate": 3.4368340943683417e-06, + "loss": 0.0006, + "step": 7564 + }, + { + "epoch": 6.908675799086758, + "grad_norm": 0.6976600289344788, + "learning_rate": 3.435819381024861e-06, + "loss": 0.0055, + "step": 7565 + }, + { + "epoch": 6.909589041095891, + "grad_norm": 4.718173980712891, + "learning_rate": 3.43480466768138e-06, + "loss": 0.0357, + "step": 7566 + }, + { + "epoch": 6.910502283105023, + "grad_norm": 22.86434555053711, + "learning_rate": 3.4337899543379e-06, + "loss": 0.1817, + "step": 7567 + }, + { + "epoch": 6.911415525114156, + "grad_norm": 0.5255762338638306, + "learning_rate": 3.432775240994419e-06, + "loss": 0.0022, + "step": 7568 + }, + { + "epoch": 6.912328767123288, + "grad_norm": 3.4961631298065186, + "learning_rate": 3.4317605276509393e-06, + "loss": 0.0214, + "step": 7569 + }, + { + "epoch": 6.91324200913242, + "grad_norm": 0.3020193576812744, + "learning_rate": 3.4307458143074586e-06, + "loss": 0.0028, + "step": 7570 + }, + { + "epoch": 6.914155251141553, + "grad_norm": 4.69863224029541, + "learning_rate": 3.429731100963978e-06, + "loss": 0.0331, + "step": 7571 + }, + { + "epoch": 6.9150684931506845, + "grad_norm": 3.959425687789917, + "learning_rate": 3.4287163876204975e-06, + "loss": 0.0214, + "step": 7572 + }, + { + "epoch": 6.915981735159818, + "grad_norm": 0.24841606616973877, + "learning_rate": 3.4277016742770168e-06, + "loss": 0.002, + "step": 7573 + }, + { + "epoch": 6.916894977168949, + "grad_norm": 3.7694664001464844, + "learning_rate": 3.426686960933537e-06, + "loss": 0.021, + "step": 7574 + }, + { + "epoch": 6.917808219178082, + "grad_norm": 1.6479870080947876, + "learning_rate": 3.425672247590056e-06, + "loss": 0.0127, + "step": 7575 + }, + { + "epoch": 6.918721461187214, + "grad_norm": 65.97767639160156, + "learning_rate": 3.4246575342465754e-06, + "loss": 0.9529, + "step": 7576 + }, + { + "epoch": 6.919634703196347, + "grad_norm": 0.8034248948097229, + "learning_rate": 3.423642820903095e-06, + "loss": 0.0061, + "step": 7577 + }, + { + "epoch": 6.920547945205479, + "grad_norm": 0.952698826789856, + "learning_rate": 3.422628107559615e-06, + "loss": 0.0061, + "step": 7578 + }, + { + "epoch": 6.921461187214612, + "grad_norm": 0.05170467495918274, + "learning_rate": 3.4216133942161345e-06, + "loss": 0.0004, + "step": 7579 + }, + { + "epoch": 6.922374429223744, + "grad_norm": 100.2374496459961, + "learning_rate": 3.4205986808726538e-06, + "loss": 1.2894, + "step": 7580 + }, + { + "epoch": 6.923287671232877, + "grad_norm": 25.159454345703125, + "learning_rate": 3.419583967529173e-06, + "loss": 0.1368, + "step": 7581 + }, + { + "epoch": 6.924200913242009, + "grad_norm": 0.41745561361312866, + "learning_rate": 3.418569254185693e-06, + "loss": 0.0026, + "step": 7582 + }, + { + "epoch": 6.9251141552511415, + "grad_norm": 3.900851249694824, + "learning_rate": 3.4175545408422124e-06, + "loss": 0.0219, + "step": 7583 + }, + { + "epoch": 6.926027397260274, + "grad_norm": 74.1570816040039, + "learning_rate": 3.416539827498732e-06, + "loss": 0.9962, + "step": 7584 + }, + { + "epoch": 6.926940639269406, + "grad_norm": 8.82931900024414, + "learning_rate": 3.4155251141552514e-06, + "loss": 0.0673, + "step": 7585 + }, + { + "epoch": 6.927853881278539, + "grad_norm": 5.070070266723633, + "learning_rate": 3.4145104008117707e-06, + "loss": 0.0379, + "step": 7586 + }, + { + "epoch": 6.928767123287671, + "grad_norm": 2.486055612564087, + "learning_rate": 3.4134956874682908e-06, + "loss": 0.0099, + "step": 7587 + }, + { + "epoch": 6.929680365296804, + "grad_norm": 3.8425838947296143, + "learning_rate": 3.41248097412481e-06, + "loss": 0.0276, + "step": 7588 + }, + { + "epoch": 6.930593607305936, + "grad_norm": 17.13591957092285, + "learning_rate": 3.4114662607813297e-06, + "loss": 0.1111, + "step": 7589 + }, + { + "epoch": 6.931506849315069, + "grad_norm": 0.2717897593975067, + "learning_rate": 3.410451547437849e-06, + "loss": 0.001, + "step": 7590 + }, + { + "epoch": 6.932420091324201, + "grad_norm": 1.2072004079818726, + "learning_rate": 3.4094368340943683e-06, + "loss": 0.0098, + "step": 7591 + }, + { + "epoch": 6.933333333333334, + "grad_norm": 1.6838762760162354, + "learning_rate": 3.4084221207508884e-06, + "loss": 0.0145, + "step": 7592 + }, + { + "epoch": 6.934246575342466, + "grad_norm": 1.1852144002914429, + "learning_rate": 3.4074074074074077e-06, + "loss": 0.0089, + "step": 7593 + }, + { + "epoch": 6.9351598173515985, + "grad_norm": 3.154406785964966, + "learning_rate": 3.4063926940639274e-06, + "loss": 0.0202, + "step": 7594 + }, + { + "epoch": 6.936073059360731, + "grad_norm": 0.6762200593948364, + "learning_rate": 3.4053779807204466e-06, + "loss": 0.0038, + "step": 7595 + }, + { + "epoch": 6.936986301369863, + "grad_norm": 25.961931228637695, + "learning_rate": 3.404363267376966e-06, + "loss": 0.1937, + "step": 7596 + }, + { + "epoch": 6.937899543378995, + "grad_norm": 0.3063622713088989, + "learning_rate": 3.403348554033486e-06, + "loss": 0.0022, + "step": 7597 + }, + { + "epoch": 6.938812785388128, + "grad_norm": 49.19408416748047, + "learning_rate": 3.4023338406900053e-06, + "loss": 0.4106, + "step": 7598 + }, + { + "epoch": 6.93972602739726, + "grad_norm": 147.12283325195312, + "learning_rate": 3.401319127346525e-06, + "loss": 1.5165, + "step": 7599 + }, + { + "epoch": 6.940639269406392, + "grad_norm": 0.8287892937660217, + "learning_rate": 3.4003044140030447e-06, + "loss": 0.0037, + "step": 7600 + }, + { + "epoch": 6.941552511415525, + "grad_norm": 5.95847749710083, + "learning_rate": 3.399289700659564e-06, + "loss": 0.0381, + "step": 7601 + }, + { + "epoch": 6.942465753424657, + "grad_norm": 10.693875312805176, + "learning_rate": 3.3982749873160836e-06, + "loss": 0.0687, + "step": 7602 + }, + { + "epoch": 6.94337899543379, + "grad_norm": 2.845529794692993, + "learning_rate": 3.397260273972603e-06, + "loss": 0.0139, + "step": 7603 + }, + { + "epoch": 6.944292237442922, + "grad_norm": 13.935898780822754, + "learning_rate": 3.396245560629123e-06, + "loss": 0.0698, + "step": 7604 + }, + { + "epoch": 6.945205479452055, + "grad_norm": 7.719151020050049, + "learning_rate": 3.3952308472856423e-06, + "loss": 0.0608, + "step": 7605 + }, + { + "epoch": 6.946118721461187, + "grad_norm": 16.29834747314453, + "learning_rate": 3.3942161339421615e-06, + "loss": 0.0943, + "step": 7606 + }, + { + "epoch": 6.9470319634703195, + "grad_norm": 0.8449977040290833, + "learning_rate": 3.3932014205986812e-06, + "loss": 0.0059, + "step": 7607 + }, + { + "epoch": 6.947945205479452, + "grad_norm": 57.774349212646484, + "learning_rate": 3.3921867072552005e-06, + "loss": 0.399, + "step": 7608 + }, + { + "epoch": 6.948858447488584, + "grad_norm": 0.42915573716163635, + "learning_rate": 3.3911719939117206e-06, + "loss": 0.0031, + "step": 7609 + }, + { + "epoch": 6.949771689497717, + "grad_norm": 8.220477104187012, + "learning_rate": 3.39015728056824e-06, + "loss": 0.0679, + "step": 7610 + }, + { + "epoch": 6.950684931506849, + "grad_norm": 2.34906005859375, + "learning_rate": 3.389142567224759e-06, + "loss": 0.0159, + "step": 7611 + }, + { + "epoch": 6.951598173515982, + "grad_norm": 0.04099981114268303, + "learning_rate": 3.388127853881279e-06, + "loss": 0.0003, + "step": 7612 + }, + { + "epoch": 6.952511415525114, + "grad_norm": 2.3653125762939453, + "learning_rate": 3.387113140537798e-06, + "loss": 0.0131, + "step": 7613 + }, + { + "epoch": 6.953424657534247, + "grad_norm": 2.952146053314209, + "learning_rate": 3.3860984271943182e-06, + "loss": 0.0145, + "step": 7614 + }, + { + "epoch": 6.954337899543379, + "grad_norm": 0.07256808876991272, + "learning_rate": 3.3850837138508375e-06, + "loss": 0.0006, + "step": 7615 + }, + { + "epoch": 6.955251141552512, + "grad_norm": 41.15657424926758, + "learning_rate": 3.3840690005073568e-06, + "loss": 0.425, + "step": 7616 + }, + { + "epoch": 6.956164383561644, + "grad_norm": 2.4702253341674805, + "learning_rate": 3.3830542871638765e-06, + "loss": 0.0171, + "step": 7617 + }, + { + "epoch": 6.9570776255707765, + "grad_norm": 27.338979721069336, + "learning_rate": 3.3820395738203957e-06, + "loss": 0.1841, + "step": 7618 + }, + { + "epoch": 6.957990867579909, + "grad_norm": 15.722593307495117, + "learning_rate": 3.381024860476916e-06, + "loss": 0.1745, + "step": 7619 + }, + { + "epoch": 6.958904109589041, + "grad_norm": 22.488386154174805, + "learning_rate": 3.380010147133435e-06, + "loss": 0.0988, + "step": 7620 + }, + { + "epoch": 6.959817351598174, + "grad_norm": 0.14145486056804657, + "learning_rate": 3.3789954337899544e-06, + "loss": 0.0009, + "step": 7621 + }, + { + "epoch": 6.960730593607306, + "grad_norm": 1.180567979812622, + "learning_rate": 3.3779807204464745e-06, + "loss": 0.009, + "step": 7622 + }, + { + "epoch": 6.961643835616439, + "grad_norm": 0.5047200918197632, + "learning_rate": 3.3769660071029938e-06, + "loss": 0.0021, + "step": 7623 + }, + { + "epoch": 6.96255707762557, + "grad_norm": 2.12650728225708, + "learning_rate": 3.3759512937595135e-06, + "loss": 0.0145, + "step": 7624 + }, + { + "epoch": 6.963470319634704, + "grad_norm": 3.824669122695923, + "learning_rate": 3.3749365804160327e-06, + "loss": 0.0339, + "step": 7625 + }, + { + "epoch": 6.964383561643835, + "grad_norm": 0.6828582882881165, + "learning_rate": 3.373921867072552e-06, + "loss": 0.0059, + "step": 7626 + }, + { + "epoch": 6.965296803652968, + "grad_norm": 3.8627049922943115, + "learning_rate": 3.372907153729072e-06, + "loss": 0.0325, + "step": 7627 + }, + { + "epoch": 6.9662100456621, + "grad_norm": 0.6344544291496277, + "learning_rate": 3.3718924403855914e-06, + "loss": 0.0036, + "step": 7628 + }, + { + "epoch": 6.967123287671233, + "grad_norm": 82.64561462402344, + "learning_rate": 3.370877727042111e-06, + "loss": 1.9912, + "step": 7629 + }, + { + "epoch": 6.968036529680365, + "grad_norm": 52.32822036743164, + "learning_rate": 3.3698630136986303e-06, + "loss": 0.2717, + "step": 7630 + }, + { + "epoch": 6.9689497716894975, + "grad_norm": 8.52302074432373, + "learning_rate": 3.3688483003551496e-06, + "loss": 0.0471, + "step": 7631 + }, + { + "epoch": 6.96986301369863, + "grad_norm": 0.8902357816696167, + "learning_rate": 3.3678335870116697e-06, + "loss": 0.0053, + "step": 7632 + }, + { + "epoch": 6.970776255707762, + "grad_norm": 1.550581932067871, + "learning_rate": 3.366818873668189e-06, + "loss": 0.0048, + "step": 7633 + }, + { + "epoch": 6.971689497716895, + "grad_norm": 0.6219874024391174, + "learning_rate": 3.3658041603247087e-06, + "loss": 0.0027, + "step": 7634 + }, + { + "epoch": 6.972602739726027, + "grad_norm": 17.333425521850586, + "learning_rate": 3.364789446981228e-06, + "loss": 0.1017, + "step": 7635 + }, + { + "epoch": 6.97351598173516, + "grad_norm": 11.291817665100098, + "learning_rate": 3.3637747336377472e-06, + "loss": 0.0702, + "step": 7636 + }, + { + "epoch": 6.974429223744292, + "grad_norm": 0.3376178741455078, + "learning_rate": 3.3627600202942673e-06, + "loss": 0.0026, + "step": 7637 + }, + { + "epoch": 6.975342465753425, + "grad_norm": 36.59862518310547, + "learning_rate": 3.3617453069507866e-06, + "loss": 0.3591, + "step": 7638 + }, + { + "epoch": 6.976255707762557, + "grad_norm": 2.886707305908203, + "learning_rate": 3.3607305936073063e-06, + "loss": 0.0171, + "step": 7639 + }, + { + "epoch": 6.9771689497716896, + "grad_norm": 0.08796792477369308, + "learning_rate": 3.359715880263826e-06, + "loss": 0.0004, + "step": 7640 + }, + { + "epoch": 6.978082191780822, + "grad_norm": 9.537444114685059, + "learning_rate": 3.3587011669203453e-06, + "loss": 0.0433, + "step": 7641 + }, + { + "epoch": 6.9789954337899545, + "grad_norm": 0.3461964428424835, + "learning_rate": 3.357686453576865e-06, + "loss": 0.0013, + "step": 7642 + }, + { + "epoch": 6.979908675799087, + "grad_norm": 1.7867672443389893, + "learning_rate": 3.3566717402333842e-06, + "loss": 0.0134, + "step": 7643 + }, + { + "epoch": 6.980821917808219, + "grad_norm": 0.3672531843185425, + "learning_rate": 3.3556570268899043e-06, + "loss": 0.0026, + "step": 7644 + }, + { + "epoch": 6.981735159817352, + "grad_norm": 1.0799990892410278, + "learning_rate": 3.3546423135464236e-06, + "loss": 0.0082, + "step": 7645 + }, + { + "epoch": 6.982648401826484, + "grad_norm": 5.879373550415039, + "learning_rate": 3.353627600202943e-06, + "loss": 0.0326, + "step": 7646 + }, + { + "epoch": 6.983561643835617, + "grad_norm": 8.902728080749512, + "learning_rate": 3.3526128868594626e-06, + "loss": 0.0591, + "step": 7647 + }, + { + "epoch": 6.984474885844749, + "grad_norm": 5.311580181121826, + "learning_rate": 3.351598173515982e-06, + "loss": 0.0339, + "step": 7648 + }, + { + "epoch": 6.985388127853882, + "grad_norm": 2.7173995971679688, + "learning_rate": 3.350583460172502e-06, + "loss": 0.0193, + "step": 7649 + }, + { + "epoch": 6.986301369863014, + "grad_norm": 17.264314651489258, + "learning_rate": 3.3495687468290212e-06, + "loss": 0.1308, + "step": 7650 + }, + { + "epoch": 6.987214611872146, + "grad_norm": 27.145580291748047, + "learning_rate": 3.3485540334855405e-06, + "loss": 0.1186, + "step": 7651 + }, + { + "epoch": 6.988127853881279, + "grad_norm": 0.28018718957901, + "learning_rate": 3.34753932014206e-06, + "loss": 0.0019, + "step": 7652 + }, + { + "epoch": 6.989041095890411, + "grad_norm": 24.84144401550293, + "learning_rate": 3.3465246067985794e-06, + "loss": 0.1343, + "step": 7653 + }, + { + "epoch": 6.989954337899543, + "grad_norm": 2.1075615882873535, + "learning_rate": 3.3455098934550996e-06, + "loss": 0.0189, + "step": 7654 + }, + { + "epoch": 6.9908675799086755, + "grad_norm": 1.2839629650115967, + "learning_rate": 3.344495180111619e-06, + "loss": 0.0087, + "step": 7655 + }, + { + "epoch": 6.991780821917808, + "grad_norm": 1.9454716444015503, + "learning_rate": 3.343480466768138e-06, + "loss": 0.007, + "step": 7656 + }, + { + "epoch": 6.99269406392694, + "grad_norm": 0.11986494809389114, + "learning_rate": 3.342465753424658e-06, + "loss": 0.0006, + "step": 7657 + }, + { + "epoch": 6.993607305936073, + "grad_norm": 13.544310569763184, + "learning_rate": 3.341451040081177e-06, + "loss": 0.0704, + "step": 7658 + }, + { + "epoch": 6.994520547945205, + "grad_norm": 0.37797507643699646, + "learning_rate": 3.340436326737697e-06, + "loss": 0.003, + "step": 7659 + }, + { + "epoch": 6.995433789954338, + "grad_norm": 3.128558397293091, + "learning_rate": 3.3394216133942164e-06, + "loss": 0.0199, + "step": 7660 + }, + { + "epoch": 6.99634703196347, + "grad_norm": 0.7665093541145325, + "learning_rate": 3.3384069000507357e-06, + "loss": 0.0042, + "step": 7661 + }, + { + "epoch": 6.997260273972603, + "grad_norm": 0.658754289150238, + "learning_rate": 3.337392186707256e-06, + "loss": 0.0035, + "step": 7662 + }, + { + "epoch": 6.998173515981735, + "grad_norm": 1.186307430267334, + "learning_rate": 3.336377473363775e-06, + "loss": 0.0078, + "step": 7663 + }, + { + "epoch": 6.9990867579908675, + "grad_norm": 7.420338153839111, + "learning_rate": 3.3353627600202948e-06, + "loss": 0.0485, + "step": 7664 + }, + { + "epoch": 7.0, + "grad_norm": 26.94220733642578, + "learning_rate": 3.334348046676814e-06, + "loss": 0.3332, + "step": 7665 + }, + { + "epoch": 7.0009132420091325, + "grad_norm": 0.09083881229162216, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0007, + "step": 7666 + }, + { + "epoch": 7.001826484018265, + "grad_norm": 0.5814070701599121, + "learning_rate": 3.3323186199898534e-06, + "loss": 0.0046, + "step": 7667 + }, + { + "epoch": 7.002739726027397, + "grad_norm": 26.269155502319336, + "learning_rate": 3.3313039066463727e-06, + "loss": 0.2019, + "step": 7668 + }, + { + "epoch": 7.00365296803653, + "grad_norm": 2.7538862228393555, + "learning_rate": 3.3302891933028924e-06, + "loss": 0.0174, + "step": 7669 + }, + { + "epoch": 7.004566210045662, + "grad_norm": 0.6695045828819275, + "learning_rate": 3.3292744799594117e-06, + "loss": 0.0041, + "step": 7670 + }, + { + "epoch": 7.005479452054795, + "grad_norm": 100.08240509033203, + "learning_rate": 3.328259766615931e-06, + "loss": 0.3863, + "step": 7671 + }, + { + "epoch": 7.006392694063927, + "grad_norm": 0.8381305932998657, + "learning_rate": 3.327245053272451e-06, + "loss": 0.0044, + "step": 7672 + }, + { + "epoch": 7.00730593607306, + "grad_norm": 6.655390739440918, + "learning_rate": 3.3262303399289703e-06, + "loss": 0.0404, + "step": 7673 + }, + { + "epoch": 7.008219178082192, + "grad_norm": 0.8268344402313232, + "learning_rate": 3.32521562658549e-06, + "loss": 0.0044, + "step": 7674 + }, + { + "epoch": 7.0091324200913245, + "grad_norm": 0.6033569574356079, + "learning_rate": 3.3242009132420093e-06, + "loss": 0.0049, + "step": 7675 + }, + { + "epoch": 7.010045662100457, + "grad_norm": 0.16741342842578888, + "learning_rate": 3.3231861998985286e-06, + "loss": 0.0008, + "step": 7676 + }, + { + "epoch": 7.010958904109589, + "grad_norm": 7.205438613891602, + "learning_rate": 3.3221714865550487e-06, + "loss": 0.038, + "step": 7677 + }, + { + "epoch": 7.011872146118722, + "grad_norm": 5.037789821624756, + "learning_rate": 3.321156773211568e-06, + "loss": 0.0384, + "step": 7678 + }, + { + "epoch": 7.0127853881278535, + "grad_norm": 3.140747308731079, + "learning_rate": 3.3201420598680876e-06, + "loss": 0.017, + "step": 7679 + }, + { + "epoch": 7.013698630136986, + "grad_norm": 6.830609321594238, + "learning_rate": 3.3191273465246073e-06, + "loss": 0.0426, + "step": 7680 + }, + { + "epoch": 7.014611872146118, + "grad_norm": 0.9159179925918579, + "learning_rate": 3.3181126331811266e-06, + "loss": 0.0065, + "step": 7681 + }, + { + "epoch": 7.015525114155251, + "grad_norm": 1.4201459884643555, + "learning_rate": 3.3170979198376463e-06, + "loss": 0.0091, + "step": 7682 + }, + { + "epoch": 7.016438356164383, + "grad_norm": 3.984147071838379, + "learning_rate": 3.3160832064941655e-06, + "loss": 0.0227, + "step": 7683 + }, + { + "epoch": 7.017351598173516, + "grad_norm": 7.713351249694824, + "learning_rate": 3.3150684931506857e-06, + "loss": 0.0354, + "step": 7684 + }, + { + "epoch": 7.018264840182648, + "grad_norm": 3.7409307956695557, + "learning_rate": 3.314053779807205e-06, + "loss": 0.0244, + "step": 7685 + }, + { + "epoch": 7.019178082191781, + "grad_norm": 3.6147468090057373, + "learning_rate": 3.313039066463724e-06, + "loss": 0.0166, + "step": 7686 + }, + { + "epoch": 7.020091324200913, + "grad_norm": 7.3271026611328125, + "learning_rate": 3.312024353120244e-06, + "loss": 0.0453, + "step": 7687 + }, + { + "epoch": 7.0210045662100455, + "grad_norm": 0.7943800091743469, + "learning_rate": 3.311009639776763e-06, + "loss": 0.0059, + "step": 7688 + }, + { + "epoch": 7.021917808219178, + "grad_norm": 26.328535079956055, + "learning_rate": 3.3099949264332833e-06, + "loss": 0.1774, + "step": 7689 + }, + { + "epoch": 7.0228310502283104, + "grad_norm": 3.0879337787628174, + "learning_rate": 3.3089802130898025e-06, + "loss": 0.0327, + "step": 7690 + }, + { + "epoch": 7.023744292237443, + "grad_norm": 1.795493721961975, + "learning_rate": 3.307965499746322e-06, + "loss": 0.0105, + "step": 7691 + }, + { + "epoch": 7.024657534246575, + "grad_norm": 9.027324676513672, + "learning_rate": 3.3069507864028415e-06, + "loss": 0.0644, + "step": 7692 + }, + { + "epoch": 7.025570776255708, + "grad_norm": 4.601191520690918, + "learning_rate": 3.3059360730593608e-06, + "loss": 0.0271, + "step": 7693 + }, + { + "epoch": 7.02648401826484, + "grad_norm": 0.4370533525943756, + "learning_rate": 3.304921359715881e-06, + "loss": 0.0034, + "step": 7694 + }, + { + "epoch": 7.027397260273973, + "grad_norm": 1.5104990005493164, + "learning_rate": 3.3039066463724e-06, + "loss": 0.0124, + "step": 7695 + }, + { + "epoch": 7.028310502283105, + "grad_norm": 3.0430450439453125, + "learning_rate": 3.3028919330289194e-06, + "loss": 0.0164, + "step": 7696 + }, + { + "epoch": 7.029223744292238, + "grad_norm": 2.462808609008789, + "learning_rate": 3.301877219685439e-06, + "loss": 0.0132, + "step": 7697 + }, + { + "epoch": 7.03013698630137, + "grad_norm": 10.743644714355469, + "learning_rate": 3.3008625063419584e-06, + "loss": 0.0673, + "step": 7698 + }, + { + "epoch": 7.0310502283105025, + "grad_norm": 2.3978874683380127, + "learning_rate": 3.2998477929984785e-06, + "loss": 0.0132, + "step": 7699 + }, + { + "epoch": 7.031963470319635, + "grad_norm": 0.12876512110233307, + "learning_rate": 3.2988330796549978e-06, + "loss": 0.0005, + "step": 7700 + }, + { + "epoch": 7.032876712328767, + "grad_norm": 1.7305806875228882, + "learning_rate": 3.297818366311517e-06, + "loss": 0.0109, + "step": 7701 + }, + { + "epoch": 7.0337899543379, + "grad_norm": 5.281260013580322, + "learning_rate": 3.296803652968037e-06, + "loss": 0.0333, + "step": 7702 + }, + { + "epoch": 7.034703196347032, + "grad_norm": 107.02674102783203, + "learning_rate": 3.2957889396245564e-06, + "loss": 0.6592, + "step": 7703 + }, + { + "epoch": 7.035616438356165, + "grad_norm": 0.20409700274467468, + "learning_rate": 3.294774226281076e-06, + "loss": 0.0018, + "step": 7704 + }, + { + "epoch": 7.036529680365296, + "grad_norm": 14.103638648986816, + "learning_rate": 3.2937595129375954e-06, + "loss": 0.0791, + "step": 7705 + }, + { + "epoch": 7.037442922374429, + "grad_norm": 31.9835262298584, + "learning_rate": 3.2927447995941147e-06, + "loss": 0.1874, + "step": 7706 + }, + { + "epoch": 7.038356164383561, + "grad_norm": 72.91566467285156, + "learning_rate": 3.2917300862506348e-06, + "loss": 0.7741, + "step": 7707 + }, + { + "epoch": 7.039269406392694, + "grad_norm": 5.586527347564697, + "learning_rate": 3.290715372907154e-06, + "loss": 0.0367, + "step": 7708 + }, + { + "epoch": 7.040182648401826, + "grad_norm": 0.14652560651302338, + "learning_rate": 3.2897006595636737e-06, + "loss": 0.0009, + "step": 7709 + }, + { + "epoch": 7.041095890410959, + "grad_norm": 3.6119585037231445, + "learning_rate": 3.288685946220193e-06, + "loss": 0.0193, + "step": 7710 + }, + { + "epoch": 7.042009132420091, + "grad_norm": 1.4573798179626465, + "learning_rate": 3.2876712328767123e-06, + "loss": 0.007, + "step": 7711 + }, + { + "epoch": 7.0429223744292235, + "grad_norm": 3.251983404159546, + "learning_rate": 3.2866565195332324e-06, + "loss": 0.0164, + "step": 7712 + }, + { + "epoch": 7.043835616438356, + "grad_norm": 1.1807689666748047, + "learning_rate": 3.2856418061897517e-06, + "loss": 0.0041, + "step": 7713 + }, + { + "epoch": 7.044748858447488, + "grad_norm": 1.5539560317993164, + "learning_rate": 3.2846270928462713e-06, + "loss": 0.0111, + "step": 7714 + }, + { + "epoch": 7.045662100456621, + "grad_norm": 12.08154582977295, + "learning_rate": 3.2836123795027906e-06, + "loss": 0.0515, + "step": 7715 + }, + { + "epoch": 7.046575342465753, + "grad_norm": 0.05077378824353218, + "learning_rate": 3.28259766615931e-06, + "loss": 0.0003, + "step": 7716 + }, + { + "epoch": 7.047488584474886, + "grad_norm": 8.229166030883789, + "learning_rate": 3.28158295281583e-06, + "loss": 0.0266, + "step": 7717 + }, + { + "epoch": 7.048401826484018, + "grad_norm": 7.371649265289307, + "learning_rate": 3.2805682394723493e-06, + "loss": 0.025, + "step": 7718 + }, + { + "epoch": 7.049315068493151, + "grad_norm": 1.1528266668319702, + "learning_rate": 3.279553526128869e-06, + "loss": 0.0058, + "step": 7719 + }, + { + "epoch": 7.050228310502283, + "grad_norm": 0.23144228756427765, + "learning_rate": 3.2785388127853882e-06, + "loss": 0.0011, + "step": 7720 + }, + { + "epoch": 7.051141552511416, + "grad_norm": 1.0855391025543213, + "learning_rate": 3.277524099441908e-06, + "loss": 0.008, + "step": 7721 + }, + { + "epoch": 7.052054794520548, + "grad_norm": 0.9409242272377014, + "learning_rate": 3.2765093860984276e-06, + "loss": 0.004, + "step": 7722 + }, + { + "epoch": 7.0529680365296805, + "grad_norm": 5.634562969207764, + "learning_rate": 3.275494672754947e-06, + "loss": 0.0346, + "step": 7723 + }, + { + "epoch": 7.053881278538813, + "grad_norm": 35.92062759399414, + "learning_rate": 3.274479959411467e-06, + "loss": 0.3307, + "step": 7724 + }, + { + "epoch": 7.054794520547945, + "grad_norm": 2.401519536972046, + "learning_rate": 3.2734652460679863e-06, + "loss": 0.0028, + "step": 7725 + }, + { + "epoch": 7.055707762557078, + "grad_norm": 0.02288336679339409, + "learning_rate": 3.2724505327245055e-06, + "loss": 0.0001, + "step": 7726 + }, + { + "epoch": 7.05662100456621, + "grad_norm": 0.13366803526878357, + "learning_rate": 3.2714358193810252e-06, + "loss": 0.001, + "step": 7727 + }, + { + "epoch": 7.057534246575343, + "grad_norm": 95.03376007080078, + "learning_rate": 3.2704211060375445e-06, + "loss": 2.2168, + "step": 7728 + }, + { + "epoch": 7.058447488584475, + "grad_norm": 3.8028783798217773, + "learning_rate": 3.2694063926940646e-06, + "loss": 0.0105, + "step": 7729 + }, + { + "epoch": 7.059360730593608, + "grad_norm": 0.9146735072135925, + "learning_rate": 3.268391679350584e-06, + "loss": 0.0033, + "step": 7730 + }, + { + "epoch": 7.06027397260274, + "grad_norm": 42.53907012939453, + "learning_rate": 3.267376966007103e-06, + "loss": 0.1342, + "step": 7731 + }, + { + "epoch": 7.061187214611872, + "grad_norm": 0.3686397075653076, + "learning_rate": 3.266362252663623e-06, + "loss": 0.0022, + "step": 7732 + }, + { + "epoch": 7.062100456621004, + "grad_norm": 2.0980494022369385, + "learning_rate": 3.265347539320142e-06, + "loss": 0.0176, + "step": 7733 + }, + { + "epoch": 7.063013698630137, + "grad_norm": 0.06597699970006943, + "learning_rate": 3.2643328259766622e-06, + "loss": 0.0004, + "step": 7734 + }, + { + "epoch": 7.063926940639269, + "grad_norm": 16.261497497558594, + "learning_rate": 3.2633181126331815e-06, + "loss": 0.0409, + "step": 7735 + }, + { + "epoch": 7.0648401826484015, + "grad_norm": 43.78114700317383, + "learning_rate": 3.2623033992897008e-06, + "loss": 0.2692, + "step": 7736 + }, + { + "epoch": 7.065753424657534, + "grad_norm": 27.701196670532227, + "learning_rate": 3.2612886859462204e-06, + "loss": 0.1633, + "step": 7737 + }, + { + "epoch": 7.066666666666666, + "grad_norm": 0.04287626966834068, + "learning_rate": 3.2602739726027397e-06, + "loss": 0.0002, + "step": 7738 + }, + { + "epoch": 7.067579908675799, + "grad_norm": 15.728071212768555, + "learning_rate": 3.25925925925926e-06, + "loss": 0.0828, + "step": 7739 + }, + { + "epoch": 7.068493150684931, + "grad_norm": 0.8582038879394531, + "learning_rate": 3.258244545915779e-06, + "loss": 0.005, + "step": 7740 + }, + { + "epoch": 7.069406392694064, + "grad_norm": 3.473569869995117, + "learning_rate": 3.2572298325722984e-06, + "loss": 0.02, + "step": 7741 + }, + { + "epoch": 7.070319634703196, + "grad_norm": 22.072227478027344, + "learning_rate": 3.2562151192288185e-06, + "loss": 0.1478, + "step": 7742 + }, + { + "epoch": 7.071232876712329, + "grad_norm": 0.43877294659614563, + "learning_rate": 3.2552004058853378e-06, + "loss": 0.0026, + "step": 7743 + }, + { + "epoch": 7.072146118721461, + "grad_norm": 1.1382277011871338, + "learning_rate": 3.2541856925418574e-06, + "loss": 0.0081, + "step": 7744 + }, + { + "epoch": 7.073059360730594, + "grad_norm": 8.286348342895508, + "learning_rate": 3.2531709791983767e-06, + "loss": 0.0499, + "step": 7745 + }, + { + "epoch": 7.073972602739726, + "grad_norm": 0.10371505469083786, + "learning_rate": 3.252156265854896e-06, + "loss": 0.0007, + "step": 7746 + }, + { + "epoch": 7.0748858447488585, + "grad_norm": 31.20500946044922, + "learning_rate": 3.251141552511416e-06, + "loss": 0.1692, + "step": 7747 + }, + { + "epoch": 7.075799086757991, + "grad_norm": 0.9764239192008972, + "learning_rate": 3.2501268391679354e-06, + "loss": 0.0083, + "step": 7748 + }, + { + "epoch": 7.076712328767123, + "grad_norm": 7.0432562828063965, + "learning_rate": 3.249112125824455e-06, + "loss": 0.0462, + "step": 7749 + }, + { + "epoch": 7.077625570776256, + "grad_norm": 10.05502986907959, + "learning_rate": 3.2480974124809743e-06, + "loss": 0.0693, + "step": 7750 + }, + { + "epoch": 7.078538812785388, + "grad_norm": 4.63901424407959, + "learning_rate": 3.2470826991374936e-06, + "loss": 0.0438, + "step": 7751 + }, + { + "epoch": 7.079452054794521, + "grad_norm": 53.67639923095703, + "learning_rate": 3.2460679857940137e-06, + "loss": 0.5965, + "step": 7752 + }, + { + "epoch": 7.080365296803653, + "grad_norm": 0.2217695266008377, + "learning_rate": 3.245053272450533e-06, + "loss": 0.0015, + "step": 7753 + }, + { + "epoch": 7.081278538812786, + "grad_norm": 1.8124792575836182, + "learning_rate": 3.2440385591070527e-06, + "loss": 0.0135, + "step": 7754 + }, + { + "epoch": 7.082191780821918, + "grad_norm": 0.2063378542661667, + "learning_rate": 3.243023845763572e-06, + "loss": 0.0015, + "step": 7755 + }, + { + "epoch": 7.083105022831051, + "grad_norm": 0.08761663734912872, + "learning_rate": 3.242009132420091e-06, + "loss": 0.0005, + "step": 7756 + }, + { + "epoch": 7.084018264840183, + "grad_norm": 11.249950408935547, + "learning_rate": 3.2409944190766113e-06, + "loss": 0.0502, + "step": 7757 + }, + { + "epoch": 7.0849315068493155, + "grad_norm": 0.1102527379989624, + "learning_rate": 3.2399797057331306e-06, + "loss": 0.0008, + "step": 7758 + }, + { + "epoch": 7.085844748858447, + "grad_norm": 2.867198944091797, + "learning_rate": 3.2389649923896503e-06, + "loss": 0.0235, + "step": 7759 + }, + { + "epoch": 7.0867579908675795, + "grad_norm": 0.3956398069858551, + "learning_rate": 3.2379502790461696e-06, + "loss": 0.0022, + "step": 7760 + }, + { + "epoch": 7.087671232876712, + "grad_norm": 23.936227798461914, + "learning_rate": 3.2369355657026892e-06, + "loss": 0.2519, + "step": 7761 + }, + { + "epoch": 7.088584474885844, + "grad_norm": 0.19733260571956635, + "learning_rate": 3.235920852359209e-06, + "loss": 0.0012, + "step": 7762 + }, + { + "epoch": 7.089497716894977, + "grad_norm": 1.463883399963379, + "learning_rate": 3.234906139015728e-06, + "loss": 0.0095, + "step": 7763 + }, + { + "epoch": 7.090410958904109, + "grad_norm": 144.06350708007812, + "learning_rate": 3.2338914256722483e-06, + "loss": 2.833, + "step": 7764 + }, + { + "epoch": 7.091324200913242, + "grad_norm": 4.557416915893555, + "learning_rate": 3.2328767123287676e-06, + "loss": 0.0277, + "step": 7765 + }, + { + "epoch": 7.092237442922374, + "grad_norm": 10.363571166992188, + "learning_rate": 3.231861998985287e-06, + "loss": 0.0702, + "step": 7766 + }, + { + "epoch": 7.093150684931507, + "grad_norm": 0.4171682596206665, + "learning_rate": 3.2308472856418066e-06, + "loss": 0.0029, + "step": 7767 + }, + { + "epoch": 7.094063926940639, + "grad_norm": 4.271155834197998, + "learning_rate": 3.229832572298326e-06, + "loss": 0.0265, + "step": 7768 + }, + { + "epoch": 7.094977168949772, + "grad_norm": 0.3919726312160492, + "learning_rate": 3.228817858954846e-06, + "loss": 0.0021, + "step": 7769 + }, + { + "epoch": 7.095890410958904, + "grad_norm": 0.10151822119951248, + "learning_rate": 3.227803145611365e-06, + "loss": 0.0008, + "step": 7770 + }, + { + "epoch": 7.0968036529680365, + "grad_norm": 67.31822967529297, + "learning_rate": 3.2267884322678845e-06, + "loss": 0.375, + "step": 7771 + }, + { + "epoch": 7.097716894977169, + "grad_norm": 1.3217507600784302, + "learning_rate": 3.225773718924404e-06, + "loss": 0.008, + "step": 7772 + }, + { + "epoch": 7.098630136986301, + "grad_norm": 5.2987751960754395, + "learning_rate": 3.2247590055809234e-06, + "loss": 0.0242, + "step": 7773 + }, + { + "epoch": 7.099543378995434, + "grad_norm": 15.388245582580566, + "learning_rate": 3.2237442922374436e-06, + "loss": 0.0885, + "step": 7774 + }, + { + "epoch": 7.100456621004566, + "grad_norm": 0.26093602180480957, + "learning_rate": 3.222729578893963e-06, + "loss": 0.002, + "step": 7775 + }, + { + "epoch": 7.101369863013699, + "grad_norm": 4.855891227722168, + "learning_rate": 3.221714865550482e-06, + "loss": 0.0212, + "step": 7776 + }, + { + "epoch": 7.102283105022831, + "grad_norm": 7.425218105316162, + "learning_rate": 3.2207001522070018e-06, + "loss": 0.0457, + "step": 7777 + }, + { + "epoch": 7.103196347031964, + "grad_norm": 0.5409614443778992, + "learning_rate": 3.219685438863521e-06, + "loss": 0.0042, + "step": 7778 + }, + { + "epoch": 7.104109589041096, + "grad_norm": 11.502517700195312, + "learning_rate": 3.218670725520041e-06, + "loss": 0.0799, + "step": 7779 + }, + { + "epoch": 7.105022831050229, + "grad_norm": 6.145299911499023, + "learning_rate": 3.2176560121765604e-06, + "loss": 0.035, + "step": 7780 + }, + { + "epoch": 7.105936073059361, + "grad_norm": 0.289168119430542, + "learning_rate": 3.2166412988330797e-06, + "loss": 0.0018, + "step": 7781 + }, + { + "epoch": 7.1068493150684935, + "grad_norm": 0.3181706964969635, + "learning_rate": 3.2156265854896e-06, + "loss": 0.0015, + "step": 7782 + }, + { + "epoch": 7.107762557077626, + "grad_norm": 1.771791934967041, + "learning_rate": 3.214611872146119e-06, + "loss": 0.0122, + "step": 7783 + }, + { + "epoch": 7.108675799086758, + "grad_norm": 3.6115756034851074, + "learning_rate": 3.2135971588026388e-06, + "loss": 0.0291, + "step": 7784 + }, + { + "epoch": 7.109589041095891, + "grad_norm": 11.332926750183105, + "learning_rate": 3.212582445459158e-06, + "loss": 0.0574, + "step": 7785 + }, + { + "epoch": 7.110502283105022, + "grad_norm": 0.4176786243915558, + "learning_rate": 3.2115677321156773e-06, + "loss": 0.0033, + "step": 7786 + }, + { + "epoch": 7.111415525114155, + "grad_norm": 1.1248936653137207, + "learning_rate": 3.2105530187721974e-06, + "loss": 0.0066, + "step": 7787 + }, + { + "epoch": 7.112328767123287, + "grad_norm": 0.6912286281585693, + "learning_rate": 3.2095383054287167e-06, + "loss": 0.004, + "step": 7788 + }, + { + "epoch": 7.11324200913242, + "grad_norm": 0.06046930328011513, + "learning_rate": 3.2085235920852364e-06, + "loss": 0.0004, + "step": 7789 + }, + { + "epoch": 7.114155251141552, + "grad_norm": 0.6423749327659607, + "learning_rate": 3.2075088787417557e-06, + "loss": 0.0053, + "step": 7790 + }, + { + "epoch": 7.115068493150685, + "grad_norm": 6.127788066864014, + "learning_rate": 3.206494165398275e-06, + "loss": 0.026, + "step": 7791 + }, + { + "epoch": 7.115981735159817, + "grad_norm": 1.5819672346115112, + "learning_rate": 3.205479452054795e-06, + "loss": 0.0095, + "step": 7792 + }, + { + "epoch": 7.11689497716895, + "grad_norm": 0.22999313473701477, + "learning_rate": 3.2044647387113143e-06, + "loss": 0.0014, + "step": 7793 + }, + { + "epoch": 7.117808219178082, + "grad_norm": 2.7759664058685303, + "learning_rate": 3.203450025367834e-06, + "loss": 0.0214, + "step": 7794 + }, + { + "epoch": 7.1187214611872145, + "grad_norm": 7.01632022857666, + "learning_rate": 3.2024353120243533e-06, + "loss": 0.0363, + "step": 7795 + }, + { + "epoch": 7.119634703196347, + "grad_norm": 49.636924743652344, + "learning_rate": 3.2014205986808725e-06, + "loss": 0.3374, + "step": 7796 + }, + { + "epoch": 7.120547945205479, + "grad_norm": 3.0186731815338135, + "learning_rate": 3.2004058853373927e-06, + "loss": 0.026, + "step": 7797 + }, + { + "epoch": 7.121461187214612, + "grad_norm": 71.3681640625, + "learning_rate": 3.199391171993912e-06, + "loss": 1.3188, + "step": 7798 + }, + { + "epoch": 7.122374429223744, + "grad_norm": 1.1949180364608765, + "learning_rate": 3.1983764586504316e-06, + "loss": 0.0104, + "step": 7799 + }, + { + "epoch": 7.123287671232877, + "grad_norm": 0.7203550338745117, + "learning_rate": 3.197361745306951e-06, + "loss": 0.0043, + "step": 7800 + }, + { + "epoch": 7.124200913242009, + "grad_norm": 1.683113932609558, + "learning_rate": 3.1963470319634706e-06, + "loss": 0.0101, + "step": 7801 + }, + { + "epoch": 7.125114155251142, + "grad_norm": 47.967689514160156, + "learning_rate": 3.1953323186199903e-06, + "loss": 0.2392, + "step": 7802 + }, + { + "epoch": 7.126027397260274, + "grad_norm": 8.013522148132324, + "learning_rate": 3.1943176052765095e-06, + "loss": 0.0432, + "step": 7803 + }, + { + "epoch": 7.126940639269407, + "grad_norm": 19.951488494873047, + "learning_rate": 3.1933028919330297e-06, + "loss": 0.1357, + "step": 7804 + }, + { + "epoch": 7.127853881278539, + "grad_norm": 52.9118537902832, + "learning_rate": 3.192288178589549e-06, + "loss": 0.7785, + "step": 7805 + }, + { + "epoch": 7.1287671232876715, + "grad_norm": 0.2146795243024826, + "learning_rate": 3.191273465246068e-06, + "loss": 0.0015, + "step": 7806 + }, + { + "epoch": 7.129680365296804, + "grad_norm": 0.045642901211977005, + "learning_rate": 3.190258751902588e-06, + "loss": 0.0003, + "step": 7807 + }, + { + "epoch": 7.130593607305936, + "grad_norm": 3.653905153274536, + "learning_rate": 3.189244038559107e-06, + "loss": 0.0277, + "step": 7808 + }, + { + "epoch": 7.131506849315069, + "grad_norm": 25.751928329467773, + "learning_rate": 3.1882293252156273e-06, + "loss": 0.1181, + "step": 7809 + }, + { + "epoch": 7.132420091324201, + "grad_norm": 1.010493516921997, + "learning_rate": 3.1872146118721465e-06, + "loss": 0.0051, + "step": 7810 + }, + { + "epoch": 7.133333333333334, + "grad_norm": 7.338930606842041, + "learning_rate": 3.186199898528666e-06, + "loss": 0.0428, + "step": 7811 + }, + { + "epoch": 7.134246575342466, + "grad_norm": 2.5576353073120117, + "learning_rate": 3.1851851851851855e-06, + "loss": 0.0152, + "step": 7812 + }, + { + "epoch": 7.135159817351598, + "grad_norm": 32.32489776611328, + "learning_rate": 3.1841704718417048e-06, + "loss": 0.6478, + "step": 7813 + }, + { + "epoch": 7.13607305936073, + "grad_norm": 26.153371810913086, + "learning_rate": 3.183155758498225e-06, + "loss": 0.2183, + "step": 7814 + }, + { + "epoch": 7.136986301369863, + "grad_norm": 0.4741670489311218, + "learning_rate": 3.182141045154744e-06, + "loss": 0.0036, + "step": 7815 + }, + { + "epoch": 7.137899543378995, + "grad_norm": 4.792236804962158, + "learning_rate": 3.1811263318112634e-06, + "loss": 0.0167, + "step": 7816 + }, + { + "epoch": 7.138812785388128, + "grad_norm": 7.051824569702148, + "learning_rate": 3.180111618467783e-06, + "loss": 0.0479, + "step": 7817 + }, + { + "epoch": 7.13972602739726, + "grad_norm": 1.6686121225357056, + "learning_rate": 3.1790969051243024e-06, + "loss": 0.0085, + "step": 7818 + }, + { + "epoch": 7.1406392694063925, + "grad_norm": 3.535860300064087, + "learning_rate": 3.1780821917808225e-06, + "loss": 0.024, + "step": 7819 + }, + { + "epoch": 7.141552511415525, + "grad_norm": 5.709267616271973, + "learning_rate": 3.1770674784373418e-06, + "loss": 0.0277, + "step": 7820 + }, + { + "epoch": 7.142465753424657, + "grad_norm": 1.2846354246139526, + "learning_rate": 3.176052765093861e-06, + "loss": 0.0067, + "step": 7821 + }, + { + "epoch": 7.14337899543379, + "grad_norm": 7.8088507652282715, + "learning_rate": 3.1750380517503807e-06, + "loss": 0.0457, + "step": 7822 + }, + { + "epoch": 7.144292237442922, + "grad_norm": 43.60807418823242, + "learning_rate": 3.1740233384069004e-06, + "loss": 0.3301, + "step": 7823 + }, + { + "epoch": 7.145205479452055, + "grad_norm": 4.309041500091553, + "learning_rate": 3.17300862506342e-06, + "loss": 0.0224, + "step": 7824 + }, + { + "epoch": 7.146118721461187, + "grad_norm": 44.065589904785156, + "learning_rate": 3.1719939117199394e-06, + "loss": 1.0831, + "step": 7825 + }, + { + "epoch": 7.14703196347032, + "grad_norm": 0.041048988699913025, + "learning_rate": 3.1709791983764586e-06, + "loss": 0.0003, + "step": 7826 + }, + { + "epoch": 7.147945205479452, + "grad_norm": 4.985848903656006, + "learning_rate": 3.1699644850329788e-06, + "loss": 0.0322, + "step": 7827 + }, + { + "epoch": 7.148858447488585, + "grad_norm": 8.008514404296875, + "learning_rate": 3.168949771689498e-06, + "loss": 0.0481, + "step": 7828 + }, + { + "epoch": 7.149771689497717, + "grad_norm": 0.05511200428009033, + "learning_rate": 3.1679350583460177e-06, + "loss": 0.0004, + "step": 7829 + }, + { + "epoch": 7.1506849315068495, + "grad_norm": 0.08655966818332672, + "learning_rate": 3.166920345002537e-06, + "loss": 0.0005, + "step": 7830 + }, + { + "epoch": 7.151598173515982, + "grad_norm": 2.665851593017578, + "learning_rate": 3.1659056316590563e-06, + "loss": 0.0178, + "step": 7831 + }, + { + "epoch": 7.152511415525114, + "grad_norm": 0.4114997982978821, + "learning_rate": 3.1648909183155764e-06, + "loss": 0.0029, + "step": 7832 + }, + { + "epoch": 7.153424657534247, + "grad_norm": 1.1318260431289673, + "learning_rate": 3.1638762049720956e-06, + "loss": 0.0051, + "step": 7833 + }, + { + "epoch": 7.154337899543379, + "grad_norm": 2.1474714279174805, + "learning_rate": 3.1628614916286153e-06, + "loss": 0.0109, + "step": 7834 + }, + { + "epoch": 7.155251141552512, + "grad_norm": 1.0490219593048096, + "learning_rate": 3.1618467782851346e-06, + "loss": 0.0055, + "step": 7835 + }, + { + "epoch": 7.156164383561644, + "grad_norm": 0.7599635720252991, + "learning_rate": 3.160832064941654e-06, + "loss": 0.0047, + "step": 7836 + }, + { + "epoch": 7.157077625570777, + "grad_norm": 1.5757739543914795, + "learning_rate": 3.159817351598174e-06, + "loss": 0.0089, + "step": 7837 + }, + { + "epoch": 7.157990867579909, + "grad_norm": 0.44227126240730286, + "learning_rate": 3.1588026382546933e-06, + "loss": 0.0013, + "step": 7838 + }, + { + "epoch": 7.1589041095890416, + "grad_norm": 0.3411739468574524, + "learning_rate": 3.157787924911213e-06, + "loss": 0.0024, + "step": 7839 + }, + { + "epoch": 7.159817351598173, + "grad_norm": 4.300808906555176, + "learning_rate": 3.1567732115677322e-06, + "loss": 0.0314, + "step": 7840 + }, + { + "epoch": 7.160730593607306, + "grad_norm": 0.2291320264339447, + "learning_rate": 3.155758498224252e-06, + "loss": 0.0013, + "step": 7841 + }, + { + "epoch": 7.161643835616438, + "grad_norm": 62.94526290893555, + "learning_rate": 3.1547437848807716e-06, + "loss": 0.5714, + "step": 7842 + }, + { + "epoch": 7.1625570776255705, + "grad_norm": 0.895203173160553, + "learning_rate": 3.153729071537291e-06, + "loss": 0.0062, + "step": 7843 + }, + { + "epoch": 7.163470319634703, + "grad_norm": 0.4913378953933716, + "learning_rate": 3.152714358193811e-06, + "loss": 0.0033, + "step": 7844 + }, + { + "epoch": 7.164383561643835, + "grad_norm": 2.501835346221924, + "learning_rate": 3.1516996448503303e-06, + "loss": 0.0156, + "step": 7845 + }, + { + "epoch": 7.165296803652968, + "grad_norm": 1.9790635108947754, + "learning_rate": 3.1506849315068495e-06, + "loss": 0.0079, + "step": 7846 + }, + { + "epoch": 7.1662100456621, + "grad_norm": 1.9482940435409546, + "learning_rate": 3.1496702181633692e-06, + "loss": 0.0156, + "step": 7847 + }, + { + "epoch": 7.167123287671233, + "grad_norm": 3.4238083362579346, + "learning_rate": 3.1486555048198885e-06, + "loss": 0.0231, + "step": 7848 + }, + { + "epoch": 7.168036529680365, + "grad_norm": 0.23452706634998322, + "learning_rate": 3.1476407914764086e-06, + "loss": 0.0016, + "step": 7849 + }, + { + "epoch": 7.168949771689498, + "grad_norm": 1.6010687351226807, + "learning_rate": 3.146626078132928e-06, + "loss": 0.0119, + "step": 7850 + }, + { + "epoch": 7.16986301369863, + "grad_norm": 12.418828010559082, + "learning_rate": 3.145611364789447e-06, + "loss": 0.1129, + "step": 7851 + }, + { + "epoch": 7.170776255707763, + "grad_norm": 82.9460220336914, + "learning_rate": 3.144596651445967e-06, + "loss": 2.2056, + "step": 7852 + }, + { + "epoch": 7.171689497716895, + "grad_norm": 73.69223022460938, + "learning_rate": 3.143581938102486e-06, + "loss": 0.6711, + "step": 7853 + }, + { + "epoch": 7.1726027397260275, + "grad_norm": 3.111953020095825, + "learning_rate": 3.142567224759006e-06, + "loss": 0.021, + "step": 7854 + }, + { + "epoch": 7.17351598173516, + "grad_norm": 16.07465934753418, + "learning_rate": 3.1415525114155255e-06, + "loss": 0.1226, + "step": 7855 + }, + { + "epoch": 7.174429223744292, + "grad_norm": 0.5867805480957031, + "learning_rate": 3.1405377980720447e-06, + "loss": 0.0029, + "step": 7856 + }, + { + "epoch": 7.175342465753425, + "grad_norm": 0.32763952016830444, + "learning_rate": 3.1395230847285644e-06, + "loss": 0.0025, + "step": 7857 + }, + { + "epoch": 7.176255707762557, + "grad_norm": 0.34427058696746826, + "learning_rate": 3.1385083713850837e-06, + "loss": 0.0018, + "step": 7858 + }, + { + "epoch": 7.17716894977169, + "grad_norm": 22.136476516723633, + "learning_rate": 3.137493658041604e-06, + "loss": 0.1689, + "step": 7859 + }, + { + "epoch": 7.178082191780822, + "grad_norm": 6.5778608322143555, + "learning_rate": 3.136478944698123e-06, + "loss": 0.0342, + "step": 7860 + }, + { + "epoch": 7.178995433789955, + "grad_norm": 0.3730573058128357, + "learning_rate": 3.1354642313546424e-06, + "loss": 0.0018, + "step": 7861 + }, + { + "epoch": 7.179908675799087, + "grad_norm": 0.18875549733638763, + "learning_rate": 3.134449518011162e-06, + "loss": 0.0014, + "step": 7862 + }, + { + "epoch": 7.1808219178082195, + "grad_norm": 10.43045711517334, + "learning_rate": 3.1334348046676817e-06, + "loss": 0.0537, + "step": 7863 + }, + { + "epoch": 7.181735159817352, + "grad_norm": 5.9582929611206055, + "learning_rate": 3.1324200913242014e-06, + "loss": 0.0284, + "step": 7864 + }, + { + "epoch": 7.182648401826484, + "grad_norm": 0.9619400501251221, + "learning_rate": 3.1314053779807207e-06, + "loss": 0.007, + "step": 7865 + }, + { + "epoch": 7.183561643835616, + "grad_norm": 74.56137084960938, + "learning_rate": 3.13039066463724e-06, + "loss": 0.6486, + "step": 7866 + }, + { + "epoch": 7.1844748858447485, + "grad_norm": 18.342737197875977, + "learning_rate": 3.12937595129376e-06, + "loss": 0.0876, + "step": 7867 + }, + { + "epoch": 7.185388127853881, + "grad_norm": 1.0751501321792603, + "learning_rate": 3.1283612379502794e-06, + "loss": 0.0072, + "step": 7868 + }, + { + "epoch": 7.186301369863013, + "grad_norm": 0.5353661179542542, + "learning_rate": 3.127346524606799e-06, + "loss": 0.003, + "step": 7869 + }, + { + "epoch": 7.187214611872146, + "grad_norm": 1.2650082111358643, + "learning_rate": 3.1263318112633183e-06, + "loss": 0.0065, + "step": 7870 + }, + { + "epoch": 7.188127853881278, + "grad_norm": 2.5806212425231934, + "learning_rate": 3.1253170979198376e-06, + "loss": 0.0164, + "step": 7871 + }, + { + "epoch": 7.189041095890411, + "grad_norm": 19.507524490356445, + "learning_rate": 3.1243023845763577e-06, + "loss": 0.1509, + "step": 7872 + }, + { + "epoch": 7.189954337899543, + "grad_norm": 1.0461159944534302, + "learning_rate": 3.123287671232877e-06, + "loss": 0.0058, + "step": 7873 + }, + { + "epoch": 7.190867579908676, + "grad_norm": 3.027172803878784, + "learning_rate": 3.1222729578893967e-06, + "loss": 0.0133, + "step": 7874 + }, + { + "epoch": 7.191780821917808, + "grad_norm": 14.829670906066895, + "learning_rate": 3.121258244545916e-06, + "loss": 0.1259, + "step": 7875 + }, + { + "epoch": 7.1926940639269406, + "grad_norm": 0.109989233314991, + "learning_rate": 3.120243531202435e-06, + "loss": 0.0005, + "step": 7876 + }, + { + "epoch": 7.193607305936073, + "grad_norm": 0.6122707724571228, + "learning_rate": 3.1192288178589553e-06, + "loss": 0.0041, + "step": 7877 + }, + { + "epoch": 7.1945205479452055, + "grad_norm": 1.2425593137741089, + "learning_rate": 3.1182141045154746e-06, + "loss": 0.0082, + "step": 7878 + }, + { + "epoch": 7.195433789954338, + "grad_norm": 9.72891902923584, + "learning_rate": 3.1171993911719943e-06, + "loss": 0.0647, + "step": 7879 + }, + { + "epoch": 7.19634703196347, + "grad_norm": 7.688808917999268, + "learning_rate": 3.1161846778285135e-06, + "loss": 0.031, + "step": 7880 + }, + { + "epoch": 7.197260273972603, + "grad_norm": 0.30604830384254456, + "learning_rate": 3.1151699644850332e-06, + "loss": 0.0019, + "step": 7881 + }, + { + "epoch": 7.198173515981735, + "grad_norm": 2.5040578842163086, + "learning_rate": 3.114155251141553e-06, + "loss": 0.018, + "step": 7882 + }, + { + "epoch": 7.199086757990868, + "grad_norm": 0.0627308189868927, + "learning_rate": 3.113140537798072e-06, + "loss": 0.0004, + "step": 7883 + }, + { + "epoch": 7.2, + "grad_norm": 18.081430435180664, + "learning_rate": 3.1121258244545923e-06, + "loss": 0.1239, + "step": 7884 + }, + { + "epoch": 7.200913242009133, + "grad_norm": 1.0852100849151611, + "learning_rate": 3.1111111111111116e-06, + "loss": 0.0062, + "step": 7885 + }, + { + "epoch": 7.201826484018265, + "grad_norm": 0.5345044136047363, + "learning_rate": 3.110096397767631e-06, + "loss": 0.0046, + "step": 7886 + }, + { + "epoch": 7.2027397260273975, + "grad_norm": 0.113934725522995, + "learning_rate": 3.1090816844241505e-06, + "loss": 0.0009, + "step": 7887 + }, + { + "epoch": 7.20365296803653, + "grad_norm": 0.17550435662269592, + "learning_rate": 3.10806697108067e-06, + "loss": 0.0006, + "step": 7888 + }, + { + "epoch": 7.2045662100456624, + "grad_norm": 20.357608795166016, + "learning_rate": 3.10705225773719e-06, + "loss": 0.1166, + "step": 7889 + }, + { + "epoch": 7.205479452054795, + "grad_norm": 0.8749228119850159, + "learning_rate": 3.106037544393709e-06, + "loss": 0.0065, + "step": 7890 + }, + { + "epoch": 7.206392694063927, + "grad_norm": 4.884247779846191, + "learning_rate": 3.1050228310502285e-06, + "loss": 0.0399, + "step": 7891 + }, + { + "epoch": 7.207305936073059, + "grad_norm": 0.6332027912139893, + "learning_rate": 3.104008117706748e-06, + "loss": 0.0039, + "step": 7892 + }, + { + "epoch": 7.208219178082191, + "grad_norm": 50.14615249633789, + "learning_rate": 3.1029934043632674e-06, + "loss": 0.4213, + "step": 7893 + }, + { + "epoch": 7.209132420091324, + "grad_norm": 1.0096580982208252, + "learning_rate": 3.1019786910197875e-06, + "loss": 0.0062, + "step": 7894 + }, + { + "epoch": 7.210045662100456, + "grad_norm": 5.636942386627197, + "learning_rate": 3.100963977676307e-06, + "loss": 0.0578, + "step": 7895 + }, + { + "epoch": 7.210958904109589, + "grad_norm": 1.1169530153274536, + "learning_rate": 3.099949264332826e-06, + "loss": 0.0051, + "step": 7896 + }, + { + "epoch": 7.211872146118721, + "grad_norm": 5.283539772033691, + "learning_rate": 3.0989345509893458e-06, + "loss": 0.0324, + "step": 7897 + }, + { + "epoch": 7.212785388127854, + "grad_norm": 5.3945112228393555, + "learning_rate": 3.097919837645865e-06, + "loss": 0.0352, + "step": 7898 + }, + { + "epoch": 7.213698630136986, + "grad_norm": 0.7290663719177246, + "learning_rate": 3.096905124302385e-06, + "loss": 0.0038, + "step": 7899 + }, + { + "epoch": 7.2146118721461185, + "grad_norm": 28.358123779296875, + "learning_rate": 3.0958904109589044e-06, + "loss": 0.0231, + "step": 7900 + }, + { + "epoch": 7.215525114155251, + "grad_norm": 2.580404758453369, + "learning_rate": 3.0948756976154237e-06, + "loss": 0.0144, + "step": 7901 + }, + { + "epoch": 7.2164383561643834, + "grad_norm": 1.449034333229065, + "learning_rate": 3.0938609842719434e-06, + "loss": 0.01, + "step": 7902 + }, + { + "epoch": 7.217351598173516, + "grad_norm": 0.16948118805885315, + "learning_rate": 3.092846270928463e-06, + "loss": 0.0012, + "step": 7903 + }, + { + "epoch": 7.218264840182648, + "grad_norm": 1.0444657802581787, + "learning_rate": 3.0918315575849828e-06, + "loss": 0.0062, + "step": 7904 + }, + { + "epoch": 7.219178082191781, + "grad_norm": 1.0043877363204956, + "learning_rate": 3.090816844241502e-06, + "loss": 0.0062, + "step": 7905 + }, + { + "epoch": 7.220091324200913, + "grad_norm": 0.9517595767974854, + "learning_rate": 3.0898021308980213e-06, + "loss": 0.0059, + "step": 7906 + }, + { + "epoch": 7.221004566210046, + "grad_norm": 5.04098653793335, + "learning_rate": 3.0887874175545414e-06, + "loss": 0.0278, + "step": 7907 + }, + { + "epoch": 7.221917808219178, + "grad_norm": 1.8282796144485474, + "learning_rate": 3.0877727042110607e-06, + "loss": 0.0108, + "step": 7908 + }, + { + "epoch": 7.222831050228311, + "grad_norm": 0.1245703399181366, + "learning_rate": 3.0867579908675804e-06, + "loss": 0.0009, + "step": 7909 + }, + { + "epoch": 7.223744292237443, + "grad_norm": 26.598003387451172, + "learning_rate": 3.0857432775240996e-06, + "loss": 0.2719, + "step": 7910 + }, + { + "epoch": 7.2246575342465755, + "grad_norm": 2.8056230545043945, + "learning_rate": 3.084728564180619e-06, + "loss": 0.0177, + "step": 7911 + }, + { + "epoch": 7.225570776255708, + "grad_norm": 7.555580139160156, + "learning_rate": 3.083713850837139e-06, + "loss": 0.0395, + "step": 7912 + }, + { + "epoch": 7.22648401826484, + "grad_norm": 3.3623483180999756, + "learning_rate": 3.0826991374936583e-06, + "loss": 0.0207, + "step": 7913 + }, + { + "epoch": 7.227397260273973, + "grad_norm": 0.36925047636032104, + "learning_rate": 3.081684424150178e-06, + "loss": 0.0031, + "step": 7914 + }, + { + "epoch": 7.228310502283105, + "grad_norm": 11.339324951171875, + "learning_rate": 3.0806697108066973e-06, + "loss": 0.0975, + "step": 7915 + }, + { + "epoch": 7.229223744292238, + "grad_norm": 12.49152660369873, + "learning_rate": 3.0796549974632165e-06, + "loss": 0.0865, + "step": 7916 + }, + { + "epoch": 7.23013698630137, + "grad_norm": 0.9645519852638245, + "learning_rate": 3.0786402841197366e-06, + "loss": 0.0059, + "step": 7917 + }, + { + "epoch": 7.231050228310503, + "grad_norm": 0.6708939671516418, + "learning_rate": 3.077625570776256e-06, + "loss": 0.0059, + "step": 7918 + }, + { + "epoch": 7.231963470319634, + "grad_norm": 0.11495591700077057, + "learning_rate": 3.0766108574327756e-06, + "loss": 0.0009, + "step": 7919 + }, + { + "epoch": 7.232876712328767, + "grad_norm": 14.923958778381348, + "learning_rate": 3.075596144089295e-06, + "loss": 0.1172, + "step": 7920 + }, + { + "epoch": 7.233789954337899, + "grad_norm": 15.924825668334961, + "learning_rate": 3.0745814307458146e-06, + "loss": 0.0864, + "step": 7921 + }, + { + "epoch": 7.234703196347032, + "grad_norm": 3.762331008911133, + "learning_rate": 3.0735667174023343e-06, + "loss": 0.0176, + "step": 7922 + }, + { + "epoch": 7.235616438356164, + "grad_norm": 7.568788528442383, + "learning_rate": 3.0725520040588535e-06, + "loss": 0.0399, + "step": 7923 + }, + { + "epoch": 7.2365296803652965, + "grad_norm": 52.61973571777344, + "learning_rate": 3.0715372907153736e-06, + "loss": 0.361, + "step": 7924 + }, + { + "epoch": 7.237442922374429, + "grad_norm": 3.7253072261810303, + "learning_rate": 3.070522577371893e-06, + "loss": 0.0297, + "step": 7925 + }, + { + "epoch": 7.238356164383561, + "grad_norm": 0.3430056571960449, + "learning_rate": 3.069507864028412e-06, + "loss": 0.002, + "step": 7926 + }, + { + "epoch": 7.239269406392694, + "grad_norm": 18.784461975097656, + "learning_rate": 3.068493150684932e-06, + "loss": 0.1269, + "step": 7927 + }, + { + "epoch": 7.240182648401826, + "grad_norm": 38.97477340698242, + "learning_rate": 3.067478437341451e-06, + "loss": 0.2469, + "step": 7928 + }, + { + "epoch": 7.241095890410959, + "grad_norm": 0.05045762658119202, + "learning_rate": 3.0664637239979713e-06, + "loss": 0.0003, + "step": 7929 + }, + { + "epoch": 7.242009132420091, + "grad_norm": 0.08312273025512695, + "learning_rate": 3.0654490106544905e-06, + "loss": 0.0006, + "step": 7930 + }, + { + "epoch": 7.242922374429224, + "grad_norm": 4.389736175537109, + "learning_rate": 3.06443429731101e-06, + "loss": 0.0281, + "step": 7931 + }, + { + "epoch": 7.243835616438356, + "grad_norm": 23.605976104736328, + "learning_rate": 3.0634195839675295e-06, + "loss": 0.0868, + "step": 7932 + }, + { + "epoch": 7.244748858447489, + "grad_norm": 39.88422393798828, + "learning_rate": 3.0624048706240488e-06, + "loss": 0.3044, + "step": 7933 + }, + { + "epoch": 7.245662100456621, + "grad_norm": 101.88858032226562, + "learning_rate": 3.061390157280569e-06, + "loss": 0.9489, + "step": 7934 + }, + { + "epoch": 7.2465753424657535, + "grad_norm": 4.061405658721924, + "learning_rate": 3.060375443937088e-06, + "loss": 0.0277, + "step": 7935 + }, + { + "epoch": 7.247488584474886, + "grad_norm": 3.46486234664917, + "learning_rate": 3.0593607305936074e-06, + "loss": 0.0199, + "step": 7936 + }, + { + "epoch": 7.248401826484018, + "grad_norm": 7.579877853393555, + "learning_rate": 3.058346017250127e-06, + "loss": 0.0363, + "step": 7937 + }, + { + "epoch": 7.249315068493151, + "grad_norm": 6.107235431671143, + "learning_rate": 3.0573313039066464e-06, + "loss": 0.0468, + "step": 7938 + }, + { + "epoch": 7.250228310502283, + "grad_norm": 0.16349844634532928, + "learning_rate": 3.0563165905631665e-06, + "loss": 0.0008, + "step": 7939 + }, + { + "epoch": 7.251141552511416, + "grad_norm": 3.5936458110809326, + "learning_rate": 3.0553018772196858e-06, + "loss": 0.0171, + "step": 7940 + }, + { + "epoch": 7.252054794520548, + "grad_norm": 0.2764563262462616, + "learning_rate": 3.054287163876205e-06, + "loss": 0.0013, + "step": 7941 + }, + { + "epoch": 7.252968036529681, + "grad_norm": 1.9024620056152344, + "learning_rate": 3.0532724505327247e-06, + "loss": 0.0161, + "step": 7942 + }, + { + "epoch": 7.253881278538813, + "grad_norm": 8.80142593383789, + "learning_rate": 3.0522577371892444e-06, + "loss": 0.0481, + "step": 7943 + }, + { + "epoch": 7.254794520547946, + "grad_norm": 4.999517440795898, + "learning_rate": 3.051243023845764e-06, + "loss": 0.0473, + "step": 7944 + }, + { + "epoch": 7.255707762557078, + "grad_norm": 2.0413835048675537, + "learning_rate": 3.0502283105022834e-06, + "loss": 0.0089, + "step": 7945 + }, + { + "epoch": 7.25662100456621, + "grad_norm": 0.977706253528595, + "learning_rate": 3.0492135971588026e-06, + "loss": 0.0071, + "step": 7946 + }, + { + "epoch": 7.257534246575342, + "grad_norm": 3.2019248008728027, + "learning_rate": 3.0481988838153227e-06, + "loss": 0.0183, + "step": 7947 + }, + { + "epoch": 7.2584474885844745, + "grad_norm": 0.07787267118692398, + "learning_rate": 3.047184170471842e-06, + "loss": 0.0005, + "step": 7948 + }, + { + "epoch": 7.259360730593607, + "grad_norm": 20.349760055541992, + "learning_rate": 3.0461694571283617e-06, + "loss": 0.1545, + "step": 7949 + }, + { + "epoch": 7.260273972602739, + "grad_norm": 1.2605968713760376, + "learning_rate": 3.045154743784881e-06, + "loss": 0.0076, + "step": 7950 + }, + { + "epoch": 7.261187214611872, + "grad_norm": 8.847810745239258, + "learning_rate": 3.0441400304414002e-06, + "loss": 0.0653, + "step": 7951 + }, + { + "epoch": 7.262100456621004, + "grad_norm": 0.2809632122516632, + "learning_rate": 3.0431253170979204e-06, + "loss": 0.0015, + "step": 7952 + }, + { + "epoch": 7.263013698630137, + "grad_norm": 2.5904462337493896, + "learning_rate": 3.0421106037544396e-06, + "loss": 0.0165, + "step": 7953 + }, + { + "epoch": 7.263926940639269, + "grad_norm": 3.2761313915252686, + "learning_rate": 3.0410958904109593e-06, + "loss": 0.021, + "step": 7954 + }, + { + "epoch": 7.264840182648402, + "grad_norm": 0.3776184320449829, + "learning_rate": 3.0400811770674786e-06, + "loss": 0.0027, + "step": 7955 + }, + { + "epoch": 7.265753424657534, + "grad_norm": 0.44345325231552124, + "learning_rate": 3.039066463723998e-06, + "loss": 0.0029, + "step": 7956 + }, + { + "epoch": 7.266666666666667, + "grad_norm": 42.69056701660156, + "learning_rate": 3.038051750380518e-06, + "loss": 0.4702, + "step": 7957 + }, + { + "epoch": 7.267579908675799, + "grad_norm": 0.20121163129806519, + "learning_rate": 3.0370370370370372e-06, + "loss": 0.0012, + "step": 7958 + }, + { + "epoch": 7.2684931506849315, + "grad_norm": 1.3949155807495117, + "learning_rate": 3.036022323693557e-06, + "loss": 0.0069, + "step": 7959 + }, + { + "epoch": 7.269406392694064, + "grad_norm": 2.7183218002319336, + "learning_rate": 3.035007610350076e-06, + "loss": 0.0117, + "step": 7960 + }, + { + "epoch": 7.270319634703196, + "grad_norm": 300.36962890625, + "learning_rate": 3.033992897006596e-06, + "loss": 1.879, + "step": 7961 + }, + { + "epoch": 7.271232876712329, + "grad_norm": 3.199878215789795, + "learning_rate": 3.0329781836631156e-06, + "loss": 0.0159, + "step": 7962 + }, + { + "epoch": 7.272146118721461, + "grad_norm": 3.1802151203155518, + "learning_rate": 3.031963470319635e-06, + "loss": 0.0243, + "step": 7963 + }, + { + "epoch": 7.273059360730594, + "grad_norm": 5.740826606750488, + "learning_rate": 3.0309487569761546e-06, + "loss": 0.0331, + "step": 7964 + }, + { + "epoch": 7.273972602739726, + "grad_norm": 0.11808677017688751, + "learning_rate": 3.0299340436326742e-06, + "loss": 0.0005, + "step": 7965 + }, + { + "epoch": 7.274885844748859, + "grad_norm": 45.23946762084961, + "learning_rate": 3.0289193302891935e-06, + "loss": 0.2981, + "step": 7966 + }, + { + "epoch": 7.275799086757991, + "grad_norm": 0.6046728491783142, + "learning_rate": 3.027904616945713e-06, + "loss": 0.0037, + "step": 7967 + }, + { + "epoch": 7.276712328767124, + "grad_norm": 15.38680362701416, + "learning_rate": 3.0268899036022325e-06, + "loss": 0.0624, + "step": 7968 + }, + { + "epoch": 7.277625570776256, + "grad_norm": 15.135478019714355, + "learning_rate": 3.0258751902587526e-06, + "loss": 0.1236, + "step": 7969 + }, + { + "epoch": 7.2785388127853885, + "grad_norm": 1.8323876857757568, + "learning_rate": 3.024860476915272e-06, + "loss": 0.0089, + "step": 7970 + }, + { + "epoch": 7.279452054794521, + "grad_norm": 31.3782901763916, + "learning_rate": 3.023845763571791e-06, + "loss": 0.1746, + "step": 7971 + }, + { + "epoch": 7.280365296803653, + "grad_norm": 6.03181791305542, + "learning_rate": 3.022831050228311e-06, + "loss": 0.0294, + "step": 7972 + }, + { + "epoch": 7.281278538812785, + "grad_norm": 1.3320960998535156, + "learning_rate": 3.02181633688483e-06, + "loss": 0.0065, + "step": 7973 + }, + { + "epoch": 7.282191780821917, + "grad_norm": 5.9482622146606445, + "learning_rate": 3.02080162354135e-06, + "loss": 0.0398, + "step": 7974 + }, + { + "epoch": 7.28310502283105, + "grad_norm": 0.01104236301034689, + "learning_rate": 3.0197869101978695e-06, + "loss": 0.0001, + "step": 7975 + }, + { + "epoch": 7.284018264840182, + "grad_norm": 0.5105925798416138, + "learning_rate": 3.0187721968543887e-06, + "loss": 0.0038, + "step": 7976 + }, + { + "epoch": 7.284931506849315, + "grad_norm": 62.64516067504883, + "learning_rate": 3.0177574835109084e-06, + "loss": 0.713, + "step": 7977 + }, + { + "epoch": 7.285844748858447, + "grad_norm": 2.5482337474823, + "learning_rate": 3.0167427701674277e-06, + "loss": 0.0177, + "step": 7978 + }, + { + "epoch": 7.28675799086758, + "grad_norm": 38.08377456665039, + "learning_rate": 3.015728056823948e-06, + "loss": 0.4202, + "step": 7979 + }, + { + "epoch": 7.287671232876712, + "grad_norm": 0.32889965176582336, + "learning_rate": 3.014713343480467e-06, + "loss": 0.0021, + "step": 7980 + }, + { + "epoch": 7.288584474885845, + "grad_norm": 0.010518752969801426, + "learning_rate": 3.0136986301369864e-06, + "loss": 0.0001, + "step": 7981 + }, + { + "epoch": 7.289497716894977, + "grad_norm": 5.2798848152160645, + "learning_rate": 3.012683916793506e-06, + "loss": 0.0384, + "step": 7982 + }, + { + "epoch": 7.2904109589041095, + "grad_norm": 4.67840051651001, + "learning_rate": 3.0116692034500257e-06, + "loss": 0.0369, + "step": 7983 + }, + { + "epoch": 7.291324200913242, + "grad_norm": 0.5081013441085815, + "learning_rate": 3.0106544901065454e-06, + "loss": 0.0033, + "step": 7984 + }, + { + "epoch": 7.292237442922374, + "grad_norm": 1.0760258436203003, + "learning_rate": 3.0096397767630647e-06, + "loss": 0.0086, + "step": 7985 + }, + { + "epoch": 7.293150684931507, + "grad_norm": 0.009567003697156906, + "learning_rate": 3.008625063419584e-06, + "loss": 0.0001, + "step": 7986 + }, + { + "epoch": 7.294063926940639, + "grad_norm": 2.385701894760132, + "learning_rate": 3.007610350076104e-06, + "loss": 0.0201, + "step": 7987 + }, + { + "epoch": 7.294977168949772, + "grad_norm": 21.784080505371094, + "learning_rate": 3.0065956367326233e-06, + "loss": 0.1529, + "step": 7988 + }, + { + "epoch": 7.295890410958904, + "grad_norm": 74.45021057128906, + "learning_rate": 3.005580923389143e-06, + "loss": 0.6537, + "step": 7989 + }, + { + "epoch": 7.296803652968037, + "grad_norm": 2.2204174995422363, + "learning_rate": 3.0045662100456623e-06, + "loss": 0.013, + "step": 7990 + }, + { + "epoch": 7.297716894977169, + "grad_norm": 99.74090576171875, + "learning_rate": 3.0035514967021816e-06, + "loss": 0.8357, + "step": 7991 + }, + { + "epoch": 7.298630136986302, + "grad_norm": 3.4823451042175293, + "learning_rate": 3.0025367833587017e-06, + "loss": 0.0201, + "step": 7992 + }, + { + "epoch": 7.299543378995434, + "grad_norm": 34.453895568847656, + "learning_rate": 3.001522070015221e-06, + "loss": 0.2477, + "step": 7993 + }, + { + "epoch": 7.3004566210045665, + "grad_norm": 142.23228454589844, + "learning_rate": 3.0005073566717407e-06, + "loss": 2.2958, + "step": 7994 + }, + { + "epoch": 7.301369863013699, + "grad_norm": 8.868895530700684, + "learning_rate": 2.99949264332826e-06, + "loss": 0.0583, + "step": 7995 + }, + { + "epoch": 7.302283105022831, + "grad_norm": 2.1170811653137207, + "learning_rate": 2.998477929984779e-06, + "loss": 0.0146, + "step": 7996 + }, + { + "epoch": 7.303196347031964, + "grad_norm": 4.267459869384766, + "learning_rate": 2.9974632166412993e-06, + "loss": 0.0284, + "step": 7997 + }, + { + "epoch": 7.304109589041096, + "grad_norm": 5.535163402557373, + "learning_rate": 2.9964485032978186e-06, + "loss": 0.0401, + "step": 7998 + }, + { + "epoch": 7.305022831050229, + "grad_norm": 0.156532883644104, + "learning_rate": 2.995433789954338e-06, + "loss": 0.0012, + "step": 7999 + }, + { + "epoch": 7.30593607305936, + "grad_norm": 93.59367370605469, + "learning_rate": 2.9944190766108575e-06, + "loss": 1.0194, + "step": 8000 + }, + { + "epoch": 7.306849315068493, + "grad_norm": 43.01848220825195, + "learning_rate": 2.9934043632673772e-06, + "loss": 0.2203, + "step": 8001 + }, + { + "epoch": 7.307762557077625, + "grad_norm": 65.6192855834961, + "learning_rate": 2.992389649923897e-06, + "loss": 0.3375, + "step": 8002 + }, + { + "epoch": 7.308675799086758, + "grad_norm": 11.13132381439209, + "learning_rate": 2.991374936580416e-06, + "loss": 0.0673, + "step": 8003 + }, + { + "epoch": 7.30958904109589, + "grad_norm": 3.2234389781951904, + "learning_rate": 2.9903602232369355e-06, + "loss": 0.0154, + "step": 8004 + }, + { + "epoch": 7.310502283105023, + "grad_norm": 5.079448223114014, + "learning_rate": 2.9893455098934556e-06, + "loss": 0.0156, + "step": 8005 + }, + { + "epoch": 7.311415525114155, + "grad_norm": 0.13036245107650757, + "learning_rate": 2.988330796549975e-06, + "loss": 0.0009, + "step": 8006 + }, + { + "epoch": 7.3123287671232875, + "grad_norm": 0.06222504749894142, + "learning_rate": 2.9873160832064945e-06, + "loss": 0.0003, + "step": 8007 + }, + { + "epoch": 7.31324200913242, + "grad_norm": 0.11708035320043564, + "learning_rate": 2.986301369863014e-06, + "loss": 0.0004, + "step": 8008 + }, + { + "epoch": 7.314155251141552, + "grad_norm": 1.2468925714492798, + "learning_rate": 2.985286656519533e-06, + "loss": 0.0083, + "step": 8009 + }, + { + "epoch": 7.315068493150685, + "grad_norm": 2.506399393081665, + "learning_rate": 2.984271943176053e-06, + "loss": 0.0209, + "step": 8010 + }, + { + "epoch": 7.315981735159817, + "grad_norm": 15.826714515686035, + "learning_rate": 2.9832572298325725e-06, + "loss": 0.0806, + "step": 8011 + }, + { + "epoch": 7.31689497716895, + "grad_norm": 3.6405797004699707, + "learning_rate": 2.982242516489092e-06, + "loss": 0.027, + "step": 8012 + }, + { + "epoch": 7.317808219178082, + "grad_norm": 2.598932981491089, + "learning_rate": 2.9812278031456114e-06, + "loss": 0.0149, + "step": 8013 + }, + { + "epoch": 7.318721461187215, + "grad_norm": 3.654740810394287, + "learning_rate": 2.9802130898021307e-06, + "loss": 0.0203, + "step": 8014 + }, + { + "epoch": 7.319634703196347, + "grad_norm": 0.08017488569021225, + "learning_rate": 2.979198376458651e-06, + "loss": 0.0005, + "step": 8015 + }, + { + "epoch": 7.32054794520548, + "grad_norm": 0.30605918169021606, + "learning_rate": 2.97818366311517e-06, + "loss": 0.0019, + "step": 8016 + }, + { + "epoch": 7.321461187214612, + "grad_norm": 0.4309595227241516, + "learning_rate": 2.9771689497716898e-06, + "loss": 0.0036, + "step": 8017 + }, + { + "epoch": 7.3223744292237445, + "grad_norm": 1.447598934173584, + "learning_rate": 2.976154236428209e-06, + "loss": 0.0089, + "step": 8018 + }, + { + "epoch": 7.323287671232877, + "grad_norm": 0.16965633630752563, + "learning_rate": 2.9751395230847287e-06, + "loss": 0.0006, + "step": 8019 + }, + { + "epoch": 7.324200913242009, + "grad_norm": 5.817731857299805, + "learning_rate": 2.9741248097412484e-06, + "loss": 0.0406, + "step": 8020 + }, + { + "epoch": 7.325114155251142, + "grad_norm": 1.8088115453720093, + "learning_rate": 2.9731100963977677e-06, + "loss": 0.0144, + "step": 8021 + }, + { + "epoch": 7.326027397260274, + "grad_norm": 77.49221801757812, + "learning_rate": 2.9720953830542874e-06, + "loss": 0.7132, + "step": 8022 + }, + { + "epoch": 7.326940639269407, + "grad_norm": 0.1998235434293747, + "learning_rate": 2.971080669710807e-06, + "loss": 0.0009, + "step": 8023 + }, + { + "epoch": 7.327853881278539, + "grad_norm": 24.98392677307129, + "learning_rate": 2.9700659563673263e-06, + "loss": 0.1926, + "step": 8024 + }, + { + "epoch": 7.328767123287671, + "grad_norm": 0.42421039938926697, + "learning_rate": 2.969051243023846e-06, + "loss": 0.0025, + "step": 8025 + }, + { + "epoch": 7.329680365296804, + "grad_norm": 2.7955503463745117, + "learning_rate": 2.9680365296803653e-06, + "loss": 0.0214, + "step": 8026 + }, + { + "epoch": 7.330593607305936, + "grad_norm": 2.9052815437316895, + "learning_rate": 2.9670218163368854e-06, + "loss": 0.0178, + "step": 8027 + }, + { + "epoch": 7.331506849315068, + "grad_norm": 2.5158395767211914, + "learning_rate": 2.9660071029934047e-06, + "loss": 0.0118, + "step": 8028 + }, + { + "epoch": 7.332420091324201, + "grad_norm": 0.06638337671756744, + "learning_rate": 2.964992389649924e-06, + "loss": 0.0006, + "step": 8029 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 5.731706142425537, + "learning_rate": 2.9639776763064436e-06, + "loss": 0.0451, + "step": 8030 + }, + { + "epoch": 7.3342465753424655, + "grad_norm": 0.31242528557777405, + "learning_rate": 2.962962962962963e-06, + "loss": 0.0029, + "step": 8031 + }, + { + "epoch": 7.335159817351598, + "grad_norm": 2.25793719291687, + "learning_rate": 2.961948249619483e-06, + "loss": 0.0124, + "step": 8032 + }, + { + "epoch": 7.33607305936073, + "grad_norm": 0.16880464553833008, + "learning_rate": 2.9609335362760023e-06, + "loss": 0.0012, + "step": 8033 + }, + { + "epoch": 7.336986301369863, + "grad_norm": 3.7526354789733887, + "learning_rate": 2.9599188229325216e-06, + "loss": 0.0224, + "step": 8034 + }, + { + "epoch": 7.337899543378995, + "grad_norm": 20.589086532592773, + "learning_rate": 2.9589041095890413e-06, + "loss": 0.1299, + "step": 8035 + }, + { + "epoch": 7.338812785388128, + "grad_norm": 0.37351194024086, + "learning_rate": 2.9578893962455605e-06, + "loss": 0.0028, + "step": 8036 + }, + { + "epoch": 7.33972602739726, + "grad_norm": 1.7834358215332031, + "learning_rate": 2.9568746829020806e-06, + "loss": 0.0111, + "step": 8037 + }, + { + "epoch": 7.340639269406393, + "grad_norm": 0.2209925651550293, + "learning_rate": 2.9558599695586e-06, + "loss": 0.0014, + "step": 8038 + }, + { + "epoch": 7.341552511415525, + "grad_norm": 0.11271748691797256, + "learning_rate": 2.954845256215119e-06, + "loss": 0.0006, + "step": 8039 + }, + { + "epoch": 7.342465753424658, + "grad_norm": 35.42188262939453, + "learning_rate": 2.953830542871639e-06, + "loss": 0.3389, + "step": 8040 + }, + { + "epoch": 7.34337899543379, + "grad_norm": 26.62152862548828, + "learning_rate": 2.9528158295281586e-06, + "loss": 0.1007, + "step": 8041 + }, + { + "epoch": 7.3442922374429225, + "grad_norm": 2.3145809173583984, + "learning_rate": 2.9518011161846782e-06, + "loss": 0.0142, + "step": 8042 + }, + { + "epoch": 7.345205479452055, + "grad_norm": 82.01165008544922, + "learning_rate": 2.9507864028411975e-06, + "loss": 0.5098, + "step": 8043 + }, + { + "epoch": 7.346118721461187, + "grad_norm": 8.20546817779541, + "learning_rate": 2.9497716894977168e-06, + "loss": 0.0531, + "step": 8044 + }, + { + "epoch": 7.34703196347032, + "grad_norm": 17.37845230102539, + "learning_rate": 2.948756976154237e-06, + "loss": 0.094, + "step": 8045 + }, + { + "epoch": 7.347945205479452, + "grad_norm": 3.872361421585083, + "learning_rate": 2.947742262810756e-06, + "loss": 0.023, + "step": 8046 + }, + { + "epoch": 7.348858447488585, + "grad_norm": 0.4314737915992737, + "learning_rate": 2.946727549467276e-06, + "loss": 0.0027, + "step": 8047 + }, + { + "epoch": 7.349771689497717, + "grad_norm": 1.8787667751312256, + "learning_rate": 2.945712836123795e-06, + "loss": 0.0084, + "step": 8048 + }, + { + "epoch": 7.35068493150685, + "grad_norm": 8.057753562927246, + "learning_rate": 2.9446981227803144e-06, + "loss": 0.0538, + "step": 8049 + }, + { + "epoch": 7.351598173515982, + "grad_norm": 0.30349501967430115, + "learning_rate": 2.9436834094368345e-06, + "loss": 0.0016, + "step": 8050 + }, + { + "epoch": 7.352511415525115, + "grad_norm": 1.1236597299575806, + "learning_rate": 2.9426686960933538e-06, + "loss": 0.0067, + "step": 8051 + }, + { + "epoch": 7.353424657534246, + "grad_norm": 4.324310302734375, + "learning_rate": 2.9416539827498735e-06, + "loss": 0.0225, + "step": 8052 + }, + { + "epoch": 7.3543378995433795, + "grad_norm": 48.94182586669922, + "learning_rate": 2.9406392694063927e-06, + "loss": 0.372, + "step": 8053 + }, + { + "epoch": 7.355251141552511, + "grad_norm": 3.9673080444335938, + "learning_rate": 2.939624556062912e-06, + "loss": 0.0256, + "step": 8054 + }, + { + "epoch": 7.3561643835616435, + "grad_norm": 0.13169237971305847, + "learning_rate": 2.938609842719432e-06, + "loss": 0.001, + "step": 8055 + }, + { + "epoch": 7.357077625570776, + "grad_norm": 0.08296217024326324, + "learning_rate": 2.9375951293759514e-06, + "loss": 0.0007, + "step": 8056 + }, + { + "epoch": 7.357990867579908, + "grad_norm": 0.016485661268234253, + "learning_rate": 2.936580416032471e-06, + "loss": 0.0001, + "step": 8057 + }, + { + "epoch": 7.358904109589041, + "grad_norm": 0.34427711367607117, + "learning_rate": 2.9355657026889904e-06, + "loss": 0.0018, + "step": 8058 + }, + { + "epoch": 7.359817351598173, + "grad_norm": 21.037748336791992, + "learning_rate": 2.93455098934551e-06, + "loss": 0.1215, + "step": 8059 + }, + { + "epoch": 7.360730593607306, + "grad_norm": 2.4796595573425293, + "learning_rate": 2.9335362760020297e-06, + "loss": 0.0155, + "step": 8060 + }, + { + "epoch": 7.361643835616438, + "grad_norm": 1.8518143892288208, + "learning_rate": 2.932521562658549e-06, + "loss": 0.012, + "step": 8061 + }, + { + "epoch": 7.362557077625571, + "grad_norm": 2.041609287261963, + "learning_rate": 2.9315068493150687e-06, + "loss": 0.0152, + "step": 8062 + }, + { + "epoch": 7.363470319634703, + "grad_norm": 73.25422668457031, + "learning_rate": 2.9304921359715884e-06, + "loss": 0.384, + "step": 8063 + }, + { + "epoch": 7.364383561643836, + "grad_norm": 4.309953689575195, + "learning_rate": 2.9294774226281077e-06, + "loss": 0.0155, + "step": 8064 + }, + { + "epoch": 7.365296803652968, + "grad_norm": 33.195777893066406, + "learning_rate": 2.9284627092846274e-06, + "loss": 0.4274, + "step": 8065 + }, + { + "epoch": 7.3662100456621005, + "grad_norm": 0.21334104239940643, + "learning_rate": 2.9274479959411466e-06, + "loss": 0.0011, + "step": 8066 + }, + { + "epoch": 7.367123287671233, + "grad_norm": 0.3318624198436737, + "learning_rate": 2.9264332825976667e-06, + "loss": 0.0022, + "step": 8067 + }, + { + "epoch": 7.368036529680365, + "grad_norm": 0.11966521292924881, + "learning_rate": 2.925418569254186e-06, + "loss": 0.0006, + "step": 8068 + }, + { + "epoch": 7.368949771689498, + "grad_norm": 42.9818229675293, + "learning_rate": 2.9244038559107053e-06, + "loss": 0.1934, + "step": 8069 + }, + { + "epoch": 7.36986301369863, + "grad_norm": 5.314877510070801, + "learning_rate": 2.923389142567225e-06, + "loss": 0.0276, + "step": 8070 + }, + { + "epoch": 7.370776255707763, + "grad_norm": 0.3238295912742615, + "learning_rate": 2.9223744292237442e-06, + "loss": 0.0024, + "step": 8071 + }, + { + "epoch": 7.371689497716895, + "grad_norm": 1.7142388820648193, + "learning_rate": 2.9213597158802644e-06, + "loss": 0.0109, + "step": 8072 + }, + { + "epoch": 7.372602739726028, + "grad_norm": 12.59319019317627, + "learning_rate": 2.9203450025367836e-06, + "loss": 0.0561, + "step": 8073 + }, + { + "epoch": 7.37351598173516, + "grad_norm": 2.3003387451171875, + "learning_rate": 2.919330289193303e-06, + "loss": 0.0134, + "step": 8074 + }, + { + "epoch": 7.3744292237442925, + "grad_norm": 11.296998977661133, + "learning_rate": 2.9183155758498226e-06, + "loss": 0.054, + "step": 8075 + }, + { + "epoch": 7.375342465753425, + "grad_norm": 114.09709930419922, + "learning_rate": 2.917300862506342e-06, + "loss": 3.6406, + "step": 8076 + }, + { + "epoch": 7.3762557077625575, + "grad_norm": 0.44765275716781616, + "learning_rate": 2.916286149162862e-06, + "loss": 0.0024, + "step": 8077 + }, + { + "epoch": 7.37716894977169, + "grad_norm": 0.46156567335128784, + "learning_rate": 2.9152714358193812e-06, + "loss": 0.0032, + "step": 8078 + }, + { + "epoch": 7.3780821917808215, + "grad_norm": 0.2982487082481384, + "learning_rate": 2.9142567224759005e-06, + "loss": 0.0018, + "step": 8079 + }, + { + "epoch": 7.378995433789954, + "grad_norm": 10.540945053100586, + "learning_rate": 2.91324200913242e-06, + "loss": 0.0575, + "step": 8080 + }, + { + "epoch": 7.379908675799086, + "grad_norm": 1.3537741899490356, + "learning_rate": 2.91222729578894e-06, + "loss": 0.012, + "step": 8081 + }, + { + "epoch": 7.380821917808219, + "grad_norm": 0.14310388267040253, + "learning_rate": 2.9112125824454596e-06, + "loss": 0.0008, + "step": 8082 + }, + { + "epoch": 7.381735159817351, + "grad_norm": 3.878047466278076, + "learning_rate": 2.910197869101979e-06, + "loss": 0.0221, + "step": 8083 + }, + { + "epoch": 7.382648401826484, + "grad_norm": 0.48922428488731384, + "learning_rate": 2.909183155758498e-06, + "loss": 0.0035, + "step": 8084 + }, + { + "epoch": 7.383561643835616, + "grad_norm": 105.98066711425781, + "learning_rate": 2.9081684424150182e-06, + "loss": 0.9524, + "step": 8085 + }, + { + "epoch": 7.384474885844749, + "grad_norm": 8.323872566223145, + "learning_rate": 2.9071537290715375e-06, + "loss": 0.0645, + "step": 8086 + }, + { + "epoch": 7.385388127853881, + "grad_norm": 8.117050170898438, + "learning_rate": 2.906139015728057e-06, + "loss": 0.0467, + "step": 8087 + }, + { + "epoch": 7.3863013698630136, + "grad_norm": 0.8211556673049927, + "learning_rate": 2.9051243023845765e-06, + "loss": 0.0038, + "step": 8088 + }, + { + "epoch": 7.387214611872146, + "grad_norm": 0.26958930492401123, + "learning_rate": 2.9041095890410957e-06, + "loss": 0.0016, + "step": 8089 + }, + { + "epoch": 7.3881278538812785, + "grad_norm": 81.56175231933594, + "learning_rate": 2.903094875697616e-06, + "loss": 0.7343, + "step": 8090 + }, + { + "epoch": 7.389041095890411, + "grad_norm": 2.6616084575653076, + "learning_rate": 2.902080162354135e-06, + "loss": 0.0139, + "step": 8091 + }, + { + "epoch": 7.389954337899543, + "grad_norm": 9.749642372131348, + "learning_rate": 2.901065449010655e-06, + "loss": 0.0877, + "step": 8092 + }, + { + "epoch": 7.390867579908676, + "grad_norm": 21.422178268432617, + "learning_rate": 2.900050735667174e-06, + "loss": 0.1767, + "step": 8093 + }, + { + "epoch": 7.391780821917808, + "grad_norm": 0.5395153760910034, + "learning_rate": 2.8990360223236933e-06, + "loss": 0.0037, + "step": 8094 + }, + { + "epoch": 7.392694063926941, + "grad_norm": 0.9845094084739685, + "learning_rate": 2.8980213089802135e-06, + "loss": 0.0055, + "step": 8095 + }, + { + "epoch": 7.393607305936073, + "grad_norm": 5.674506664276123, + "learning_rate": 2.8970065956367327e-06, + "loss": 0.033, + "step": 8096 + }, + { + "epoch": 7.394520547945206, + "grad_norm": 2.2054545879364014, + "learning_rate": 2.8959918822932524e-06, + "loss": 0.0114, + "step": 8097 + }, + { + "epoch": 7.395433789954338, + "grad_norm": 1.5301231145858765, + "learning_rate": 2.8949771689497717e-06, + "loss": 0.0083, + "step": 8098 + }, + { + "epoch": 7.3963470319634705, + "grad_norm": 1.4530502557754517, + "learning_rate": 2.8939624556062914e-06, + "loss": 0.0094, + "step": 8099 + }, + { + "epoch": 7.397260273972603, + "grad_norm": 25.199087142944336, + "learning_rate": 2.892947742262811e-06, + "loss": 0.2657, + "step": 8100 + }, + { + "epoch": 7.3981735159817354, + "grad_norm": 63.674068450927734, + "learning_rate": 2.8919330289193303e-06, + "loss": 0.1581, + "step": 8101 + }, + { + "epoch": 7.399086757990868, + "grad_norm": 5.5552544593811035, + "learning_rate": 2.89091831557585e-06, + "loss": 0.0321, + "step": 8102 + }, + { + "epoch": 7.4, + "grad_norm": 1.3213351964950562, + "learning_rate": 2.8899036022323697e-06, + "loss": 0.0096, + "step": 8103 + }, + { + "epoch": 7.400913242009133, + "grad_norm": 0.3660881817340851, + "learning_rate": 2.888888888888889e-06, + "loss": 0.0025, + "step": 8104 + }, + { + "epoch": 7.401826484018265, + "grad_norm": 25.248931884765625, + "learning_rate": 2.8878741755454087e-06, + "loss": 0.1019, + "step": 8105 + }, + { + "epoch": 7.402739726027397, + "grad_norm": 19.053279876708984, + "learning_rate": 2.886859462201928e-06, + "loss": 0.1238, + "step": 8106 + }, + { + "epoch": 7.403652968036529, + "grad_norm": 9.339284896850586, + "learning_rate": 2.885844748858448e-06, + "loss": 0.0743, + "step": 8107 + }, + { + "epoch": 7.404566210045662, + "grad_norm": 107.61808013916016, + "learning_rate": 2.8848300355149673e-06, + "loss": 1.0458, + "step": 8108 + }, + { + "epoch": 7.405479452054794, + "grad_norm": 0.02493644878268242, + "learning_rate": 2.8838153221714866e-06, + "loss": 0.0002, + "step": 8109 + }, + { + "epoch": 7.406392694063927, + "grad_norm": 0.9213988184928894, + "learning_rate": 2.8828006088280063e-06, + "loss": 0.0043, + "step": 8110 + }, + { + "epoch": 7.407305936073059, + "grad_norm": 1.0045177936553955, + "learning_rate": 2.8817858954845256e-06, + "loss": 0.0064, + "step": 8111 + }, + { + "epoch": 7.4082191780821915, + "grad_norm": 6.838371753692627, + "learning_rate": 2.8807711821410457e-06, + "loss": 0.0387, + "step": 8112 + }, + { + "epoch": 7.409132420091324, + "grad_norm": 1.5589818954467773, + "learning_rate": 2.879756468797565e-06, + "loss": 0.011, + "step": 8113 + }, + { + "epoch": 7.4100456621004565, + "grad_norm": 1.77431058883667, + "learning_rate": 2.8787417554540842e-06, + "loss": 0.0126, + "step": 8114 + }, + { + "epoch": 7.410958904109589, + "grad_norm": 0.3030621111392975, + "learning_rate": 2.877727042110604e-06, + "loss": 0.002, + "step": 8115 + }, + { + "epoch": 7.411872146118721, + "grad_norm": 46.84543991088867, + "learning_rate": 2.876712328767123e-06, + "loss": 0.3005, + "step": 8116 + }, + { + "epoch": 7.412785388127854, + "grad_norm": 4.981894016265869, + "learning_rate": 2.8756976154236433e-06, + "loss": 0.029, + "step": 8117 + }, + { + "epoch": 7.413698630136986, + "grad_norm": 57.59669494628906, + "learning_rate": 2.8746829020801626e-06, + "loss": 0.6825, + "step": 8118 + }, + { + "epoch": 7.414611872146119, + "grad_norm": 0.19383284449577332, + "learning_rate": 2.873668188736682e-06, + "loss": 0.0012, + "step": 8119 + }, + { + "epoch": 7.415525114155251, + "grad_norm": 3.1638457775115967, + "learning_rate": 2.8726534753932015e-06, + "loss": 0.0209, + "step": 8120 + }, + { + "epoch": 7.416438356164384, + "grad_norm": 2.4661684036254883, + "learning_rate": 2.8716387620497212e-06, + "loss": 0.0161, + "step": 8121 + }, + { + "epoch": 7.417351598173516, + "grad_norm": 0.7120986580848694, + "learning_rate": 2.870624048706241e-06, + "loss": 0.0051, + "step": 8122 + }, + { + "epoch": 7.4182648401826485, + "grad_norm": 13.345760345458984, + "learning_rate": 2.86960933536276e-06, + "loss": 0.0581, + "step": 8123 + }, + { + "epoch": 7.419178082191781, + "grad_norm": 0.30203717947006226, + "learning_rate": 2.8685946220192794e-06, + "loss": 0.0021, + "step": 8124 + }, + { + "epoch": 7.420091324200913, + "grad_norm": 3.8951733112335205, + "learning_rate": 2.8675799086757996e-06, + "loss": 0.0161, + "step": 8125 + }, + { + "epoch": 7.421004566210046, + "grad_norm": 2.3489487171173096, + "learning_rate": 2.866565195332319e-06, + "loss": 0.0135, + "step": 8126 + }, + { + "epoch": 7.421917808219178, + "grad_norm": 0.23677611351013184, + "learning_rate": 2.8655504819888385e-06, + "loss": 0.0012, + "step": 8127 + }, + { + "epoch": 7.422831050228311, + "grad_norm": 0.18686237931251526, + "learning_rate": 2.864535768645358e-06, + "loss": 0.0012, + "step": 8128 + }, + { + "epoch": 7.423744292237443, + "grad_norm": 34.70219802856445, + "learning_rate": 2.863521055301877e-06, + "loss": 0.3365, + "step": 8129 + }, + { + "epoch": 7.424657534246576, + "grad_norm": 15.032793998718262, + "learning_rate": 2.862506341958397e-06, + "loss": 0.1156, + "step": 8130 + }, + { + "epoch": 7.425570776255708, + "grad_norm": 1.0703997611999512, + "learning_rate": 2.8614916286149164e-06, + "loss": 0.0061, + "step": 8131 + }, + { + "epoch": 7.426484018264841, + "grad_norm": 97.249755859375, + "learning_rate": 2.860476915271436e-06, + "loss": 2.7023, + "step": 8132 + }, + { + "epoch": 7.427397260273972, + "grad_norm": 0.024061091244220734, + "learning_rate": 2.8594622019279554e-06, + "loss": 0.0002, + "step": 8133 + }, + { + "epoch": 7.428310502283105, + "grad_norm": 1.4741421937942505, + "learning_rate": 2.8584474885844747e-06, + "loss": 0.0072, + "step": 8134 + }, + { + "epoch": 7.429223744292237, + "grad_norm": 0.30562251806259155, + "learning_rate": 2.857432775240995e-06, + "loss": 0.002, + "step": 8135 + }, + { + "epoch": 7.4301369863013695, + "grad_norm": 0.29007261991500854, + "learning_rate": 2.856418061897514e-06, + "loss": 0.0019, + "step": 8136 + }, + { + "epoch": 7.431050228310502, + "grad_norm": 9.064859390258789, + "learning_rate": 2.8554033485540337e-06, + "loss": 0.0443, + "step": 8137 + }, + { + "epoch": 7.4319634703196344, + "grad_norm": 4.8092851638793945, + "learning_rate": 2.854388635210553e-06, + "loss": 0.0394, + "step": 8138 + }, + { + "epoch": 7.432876712328767, + "grad_norm": 0.06374301761388779, + "learning_rate": 2.8533739218670727e-06, + "loss": 0.0005, + "step": 8139 + }, + { + "epoch": 7.433789954337899, + "grad_norm": 1.7927931547164917, + "learning_rate": 2.8523592085235924e-06, + "loss": 0.0121, + "step": 8140 + }, + { + "epoch": 7.434703196347032, + "grad_norm": 0.6174733638763428, + "learning_rate": 2.8513444951801117e-06, + "loss": 0.0034, + "step": 8141 + }, + { + "epoch": 7.435616438356164, + "grad_norm": 1.2563362121582031, + "learning_rate": 2.8503297818366314e-06, + "loss": 0.0091, + "step": 8142 + }, + { + "epoch": 7.436529680365297, + "grad_norm": 0.10591049492359161, + "learning_rate": 2.849315068493151e-06, + "loss": 0.0005, + "step": 8143 + }, + { + "epoch": 7.437442922374429, + "grad_norm": 2.003760576248169, + "learning_rate": 2.8483003551496703e-06, + "loss": 0.0083, + "step": 8144 + }, + { + "epoch": 7.438356164383562, + "grad_norm": 4.267139434814453, + "learning_rate": 2.84728564180619e-06, + "loss": 0.0281, + "step": 8145 + }, + { + "epoch": 7.439269406392694, + "grad_norm": 1.2892043590545654, + "learning_rate": 2.8462709284627093e-06, + "loss": 0.007, + "step": 8146 + }, + { + "epoch": 7.4401826484018265, + "grad_norm": 14.642317771911621, + "learning_rate": 2.8452562151192294e-06, + "loss": 0.077, + "step": 8147 + }, + { + "epoch": 7.441095890410959, + "grad_norm": 0.3589616119861603, + "learning_rate": 2.8442415017757487e-06, + "loss": 0.0018, + "step": 8148 + }, + { + "epoch": 7.442009132420091, + "grad_norm": 1.1425113677978516, + "learning_rate": 2.843226788432268e-06, + "loss": 0.0068, + "step": 8149 + }, + { + "epoch": 7.442922374429224, + "grad_norm": 0.21463611721992493, + "learning_rate": 2.8422120750887876e-06, + "loss": 0.0017, + "step": 8150 + }, + { + "epoch": 7.443835616438356, + "grad_norm": 59.87998962402344, + "learning_rate": 2.841197361745307e-06, + "loss": 0.9102, + "step": 8151 + }, + { + "epoch": 7.444748858447489, + "grad_norm": 0.2526308298110962, + "learning_rate": 2.840182648401827e-06, + "loss": 0.0005, + "step": 8152 + }, + { + "epoch": 7.445662100456621, + "grad_norm": 0.1652698665857315, + "learning_rate": 2.8391679350583463e-06, + "loss": 0.0009, + "step": 8153 + }, + { + "epoch": 7.446575342465754, + "grad_norm": 6.2674994468688965, + "learning_rate": 2.8381532217148656e-06, + "loss": 0.0354, + "step": 8154 + }, + { + "epoch": 7.447488584474886, + "grad_norm": 1.3272713422775269, + "learning_rate": 2.8371385083713852e-06, + "loss": 0.0083, + "step": 8155 + }, + { + "epoch": 7.448401826484019, + "grad_norm": 1.222427487373352, + "learning_rate": 2.8361237950279045e-06, + "loss": 0.0082, + "step": 8156 + }, + { + "epoch": 7.449315068493151, + "grad_norm": 0.07382141053676605, + "learning_rate": 2.8351090816844246e-06, + "loss": 0.0004, + "step": 8157 + }, + { + "epoch": 7.4502283105022835, + "grad_norm": 12.29259967803955, + "learning_rate": 2.834094368340944e-06, + "loss": 0.0763, + "step": 8158 + }, + { + "epoch": 7.451141552511416, + "grad_norm": 0.07717503607273102, + "learning_rate": 2.833079654997463e-06, + "loss": 0.0005, + "step": 8159 + }, + { + "epoch": 7.4520547945205475, + "grad_norm": 9.931085586547852, + "learning_rate": 2.832064941653983e-06, + "loss": 0.0451, + "step": 8160 + }, + { + "epoch": 7.45296803652968, + "grad_norm": 1.6415929794311523, + "learning_rate": 2.8310502283105025e-06, + "loss": 0.0067, + "step": 8161 + }, + { + "epoch": 7.453881278538812, + "grad_norm": 12.18235969543457, + "learning_rate": 2.8300355149670222e-06, + "loss": 0.0507, + "step": 8162 + }, + { + "epoch": 7.454794520547945, + "grad_norm": 29.788272857666016, + "learning_rate": 2.8290208016235415e-06, + "loss": 0.0688, + "step": 8163 + }, + { + "epoch": 7.455707762557077, + "grad_norm": 1.11492121219635, + "learning_rate": 2.8280060882800608e-06, + "loss": 0.0052, + "step": 8164 + }, + { + "epoch": 7.45662100456621, + "grad_norm": 4.641983509063721, + "learning_rate": 2.826991374936581e-06, + "loss": 0.0212, + "step": 8165 + }, + { + "epoch": 7.457534246575342, + "grad_norm": 2.9710211753845215, + "learning_rate": 2.8259766615931e-06, + "loss": 0.0163, + "step": 8166 + }, + { + "epoch": 7.458447488584475, + "grad_norm": 9.064157485961914, + "learning_rate": 2.82496194824962e-06, + "loss": 0.0547, + "step": 8167 + }, + { + "epoch": 7.459360730593607, + "grad_norm": 5.178205490112305, + "learning_rate": 2.823947234906139e-06, + "loss": 0.0308, + "step": 8168 + }, + { + "epoch": 7.46027397260274, + "grad_norm": 0.10833338648080826, + "learning_rate": 2.8229325215626584e-06, + "loss": 0.0008, + "step": 8169 + }, + { + "epoch": 7.461187214611872, + "grad_norm": 5.406217575073242, + "learning_rate": 2.8219178082191785e-06, + "loss": 0.0276, + "step": 8170 + }, + { + "epoch": 7.4621004566210045, + "grad_norm": 60.05986785888672, + "learning_rate": 2.8209030948756978e-06, + "loss": 0.4952, + "step": 8171 + }, + { + "epoch": 7.463013698630137, + "grad_norm": 26.692228317260742, + "learning_rate": 2.8198883815322175e-06, + "loss": 0.1489, + "step": 8172 + }, + { + "epoch": 7.463926940639269, + "grad_norm": 7.180351734161377, + "learning_rate": 2.8188736681887367e-06, + "loss": 0.0318, + "step": 8173 + }, + { + "epoch": 7.464840182648402, + "grad_norm": 0.012498850002884865, + "learning_rate": 2.817858954845256e-06, + "loss": 0.0001, + "step": 8174 + }, + { + "epoch": 7.465753424657534, + "grad_norm": 34.21120834350586, + "learning_rate": 2.816844241501776e-06, + "loss": 0.2209, + "step": 8175 + }, + { + "epoch": 7.466666666666667, + "grad_norm": 0.47162148356437683, + "learning_rate": 2.8158295281582954e-06, + "loss": 0.0037, + "step": 8176 + }, + { + "epoch": 7.467579908675799, + "grad_norm": 3.4179821014404297, + "learning_rate": 2.814814814814815e-06, + "loss": 0.0209, + "step": 8177 + }, + { + "epoch": 7.468493150684932, + "grad_norm": 5.614200592041016, + "learning_rate": 2.8138001014713343e-06, + "loss": 0.0426, + "step": 8178 + }, + { + "epoch": 7.469406392694064, + "grad_norm": 5.546093463897705, + "learning_rate": 2.812785388127854e-06, + "loss": 0.0409, + "step": 8179 + }, + { + "epoch": 7.470319634703197, + "grad_norm": 0.1816716492176056, + "learning_rate": 2.8117706747843737e-06, + "loss": 0.0012, + "step": 8180 + }, + { + "epoch": 7.471232876712329, + "grad_norm": 0.5191556215286255, + "learning_rate": 2.810755961440893e-06, + "loss": 0.0025, + "step": 8181 + }, + { + "epoch": 7.4721461187214615, + "grad_norm": 5.161081314086914, + "learning_rate": 2.8097412480974127e-06, + "loss": 0.0309, + "step": 8182 + }, + { + "epoch": 7.473059360730594, + "grad_norm": 2.835528612136841, + "learning_rate": 2.8087265347539324e-06, + "loss": 0.0176, + "step": 8183 + }, + { + "epoch": 7.473972602739726, + "grad_norm": 7.285641193389893, + "learning_rate": 2.8077118214104517e-06, + "loss": 0.0568, + "step": 8184 + }, + { + "epoch": 7.474885844748858, + "grad_norm": 0.22781670093536377, + "learning_rate": 2.8066971080669713e-06, + "loss": 0.0016, + "step": 8185 + }, + { + "epoch": 7.475799086757991, + "grad_norm": 1.0790326595306396, + "learning_rate": 2.8056823947234906e-06, + "loss": 0.0026, + "step": 8186 + }, + { + "epoch": 7.476712328767123, + "grad_norm": 30.735143661499023, + "learning_rate": 2.8046676813800107e-06, + "loss": 0.2926, + "step": 8187 + }, + { + "epoch": 7.477625570776255, + "grad_norm": 0.12135164439678192, + "learning_rate": 2.80365296803653e-06, + "loss": 0.0005, + "step": 8188 + }, + { + "epoch": 7.478538812785388, + "grad_norm": 7.29564094543457, + "learning_rate": 2.8026382546930493e-06, + "loss": 0.0525, + "step": 8189 + }, + { + "epoch": 7.47945205479452, + "grad_norm": 17.797019958496094, + "learning_rate": 2.801623541349569e-06, + "loss": 0.1554, + "step": 8190 + }, + { + "epoch": 7.480365296803653, + "grad_norm": 7.91173791885376, + "learning_rate": 2.8006088280060882e-06, + "loss": 0.0404, + "step": 8191 + }, + { + "epoch": 7.481278538812785, + "grad_norm": 2.045586347579956, + "learning_rate": 2.7995941146626083e-06, + "loss": 0.0099, + "step": 8192 + }, + { + "epoch": 7.482191780821918, + "grad_norm": 0.08817227184772491, + "learning_rate": 2.7985794013191276e-06, + "loss": 0.0006, + "step": 8193 + }, + { + "epoch": 7.48310502283105, + "grad_norm": 0.19673429429531097, + "learning_rate": 2.797564687975647e-06, + "loss": 0.001, + "step": 8194 + }, + { + "epoch": 7.4840182648401825, + "grad_norm": 0.39564719796180725, + "learning_rate": 2.7965499746321666e-06, + "loss": 0.0021, + "step": 8195 + }, + { + "epoch": 7.484931506849315, + "grad_norm": 0.2378932237625122, + "learning_rate": 2.795535261288686e-06, + "loss": 0.0012, + "step": 8196 + }, + { + "epoch": 7.485844748858447, + "grad_norm": 10.577837944030762, + "learning_rate": 2.794520547945206e-06, + "loss": 0.0652, + "step": 8197 + }, + { + "epoch": 7.48675799086758, + "grad_norm": 0.020718922838568687, + "learning_rate": 2.7935058346017252e-06, + "loss": 0.0002, + "step": 8198 + }, + { + "epoch": 7.487671232876712, + "grad_norm": 6.7902607917785645, + "learning_rate": 2.7924911212582445e-06, + "loss": 0.0383, + "step": 8199 + }, + { + "epoch": 7.488584474885845, + "grad_norm": 19.66273307800293, + "learning_rate": 2.791476407914764e-06, + "loss": 0.1402, + "step": 8200 + }, + { + "epoch": 7.489497716894977, + "grad_norm": 14.077263832092285, + "learning_rate": 2.790461694571284e-06, + "loss": 0.1218, + "step": 8201 + }, + { + "epoch": 7.49041095890411, + "grad_norm": 58.09649658203125, + "learning_rate": 2.7894469812278036e-06, + "loss": 0.9112, + "step": 8202 + }, + { + "epoch": 7.491324200913242, + "grad_norm": 63.873504638671875, + "learning_rate": 2.788432267884323e-06, + "loss": 0.1967, + "step": 8203 + }, + { + "epoch": 7.492237442922375, + "grad_norm": 0.08772023767232895, + "learning_rate": 2.787417554540842e-06, + "loss": 0.0006, + "step": 8204 + }, + { + "epoch": 7.493150684931507, + "grad_norm": 101.93949890136719, + "learning_rate": 2.7864028411973622e-06, + "loss": 0.8846, + "step": 8205 + }, + { + "epoch": 7.4940639269406395, + "grad_norm": 30.276411056518555, + "learning_rate": 2.7853881278538815e-06, + "loss": 0.2882, + "step": 8206 + }, + { + "epoch": 7.494977168949772, + "grad_norm": 0.12629097700119019, + "learning_rate": 2.784373414510401e-06, + "loss": 0.0008, + "step": 8207 + }, + { + "epoch": 7.495890410958904, + "grad_norm": 8.162943840026855, + "learning_rate": 2.7833587011669205e-06, + "loss": 0.0439, + "step": 8208 + }, + { + "epoch": 7.496803652968037, + "grad_norm": 1.2653419971466064, + "learning_rate": 2.7823439878234397e-06, + "loss": 0.0067, + "step": 8209 + }, + { + "epoch": 7.497716894977169, + "grad_norm": 3.856541633605957, + "learning_rate": 2.78132927447996e-06, + "loss": 0.0274, + "step": 8210 + }, + { + "epoch": 7.498630136986302, + "grad_norm": 1.7811739444732666, + "learning_rate": 2.780314561136479e-06, + "loss": 0.0145, + "step": 8211 + }, + { + "epoch": 7.499543378995433, + "grad_norm": 3.830172538757324, + "learning_rate": 2.779299847792999e-06, + "loss": 0.0376, + "step": 8212 + }, + { + "epoch": 7.500456621004567, + "grad_norm": 6.127881050109863, + "learning_rate": 2.778285134449518e-06, + "loss": 0.0267, + "step": 8213 + }, + { + "epoch": 7.501369863013698, + "grad_norm": 9.963836669921875, + "learning_rate": 2.7772704211060373e-06, + "loss": 0.0623, + "step": 8214 + }, + { + "epoch": 7.502283105022831, + "grad_norm": 16.697629928588867, + "learning_rate": 2.7762557077625574e-06, + "loss": 0.1322, + "step": 8215 + }, + { + "epoch": 7.503196347031963, + "grad_norm": 1.5992591381072998, + "learning_rate": 2.7752409944190767e-06, + "loss": 0.0099, + "step": 8216 + }, + { + "epoch": 7.504109589041096, + "grad_norm": 63.35062026977539, + "learning_rate": 2.7742262810755964e-06, + "loss": 0.6426, + "step": 8217 + }, + { + "epoch": 7.505022831050228, + "grad_norm": 0.6661657691001892, + "learning_rate": 2.7732115677321157e-06, + "loss": 0.0036, + "step": 8218 + }, + { + "epoch": 7.5059360730593605, + "grad_norm": 0.02783266454935074, + "learning_rate": 2.7721968543886354e-06, + "loss": 0.0002, + "step": 8219 + }, + { + "epoch": 7.506849315068493, + "grad_norm": 45.112979888916016, + "learning_rate": 2.771182141045155e-06, + "loss": 0.3247, + "step": 8220 + }, + { + "epoch": 7.507762557077625, + "grad_norm": 2.309166431427002, + "learning_rate": 2.7701674277016743e-06, + "loss": 0.0144, + "step": 8221 + }, + { + "epoch": 7.508675799086758, + "grad_norm": 47.82057189941406, + "learning_rate": 2.769152714358194e-06, + "loss": 0.5431, + "step": 8222 + }, + { + "epoch": 7.50958904109589, + "grad_norm": 2.279764175415039, + "learning_rate": 2.7681380010147137e-06, + "loss": 0.0211, + "step": 8223 + }, + { + "epoch": 7.510502283105023, + "grad_norm": 1.3424952030181885, + "learning_rate": 2.767123287671233e-06, + "loss": 0.0079, + "step": 8224 + }, + { + "epoch": 7.511415525114155, + "grad_norm": 32.1335334777832, + "learning_rate": 2.7661085743277527e-06, + "loss": 0.1801, + "step": 8225 + }, + { + "epoch": 7.512328767123288, + "grad_norm": 44.67936325073242, + "learning_rate": 2.765093860984272e-06, + "loss": 0.2074, + "step": 8226 + }, + { + "epoch": 7.51324200913242, + "grad_norm": 6.667366027832031, + "learning_rate": 2.764079147640792e-06, + "loss": 0.009, + "step": 8227 + }, + { + "epoch": 7.514155251141553, + "grad_norm": 7.205948829650879, + "learning_rate": 2.7630644342973113e-06, + "loss": 0.0567, + "step": 8228 + }, + { + "epoch": 7.515068493150685, + "grad_norm": 8.698446273803711, + "learning_rate": 2.7620497209538306e-06, + "loss": 0.0311, + "step": 8229 + }, + { + "epoch": 7.5159817351598175, + "grad_norm": 1.1869865655899048, + "learning_rate": 2.7610350076103503e-06, + "loss": 0.0082, + "step": 8230 + }, + { + "epoch": 7.51689497716895, + "grad_norm": 1.483609914779663, + "learning_rate": 2.7600202942668696e-06, + "loss": 0.0081, + "step": 8231 + }, + { + "epoch": 7.517808219178082, + "grad_norm": 51.75704574584961, + "learning_rate": 2.7590055809233897e-06, + "loss": 0.3809, + "step": 8232 + }, + { + "epoch": 7.518721461187215, + "grad_norm": 1.4445502758026123, + "learning_rate": 2.757990867579909e-06, + "loss": 0.0065, + "step": 8233 + }, + { + "epoch": 7.519634703196347, + "grad_norm": 18.62564468383789, + "learning_rate": 2.756976154236428e-06, + "loss": 0.1333, + "step": 8234 + }, + { + "epoch": 7.52054794520548, + "grad_norm": 18.51862907409668, + "learning_rate": 2.755961440892948e-06, + "loss": 0.1582, + "step": 8235 + }, + { + "epoch": 7.521461187214612, + "grad_norm": 4.173365116119385, + "learning_rate": 2.754946727549467e-06, + "loss": 0.0151, + "step": 8236 + }, + { + "epoch": 7.522374429223745, + "grad_norm": 1.1240737438201904, + "learning_rate": 2.7539320142059873e-06, + "loss": 0.008, + "step": 8237 + }, + { + "epoch": 7.523287671232877, + "grad_norm": 1.4217274188995361, + "learning_rate": 2.7529173008625066e-06, + "loss": 0.0089, + "step": 8238 + }, + { + "epoch": 7.524200913242009, + "grad_norm": 31.60453987121582, + "learning_rate": 2.751902587519026e-06, + "loss": 0.3263, + "step": 8239 + }, + { + "epoch": 7.525114155251142, + "grad_norm": 11.66923999786377, + "learning_rate": 2.7508878741755455e-06, + "loss": 0.0749, + "step": 8240 + }, + { + "epoch": 7.526027397260274, + "grad_norm": 6.535472393035889, + "learning_rate": 2.749873160832065e-06, + "loss": 0.0379, + "step": 8241 + }, + { + "epoch": 7.526940639269406, + "grad_norm": 1.44674813747406, + "learning_rate": 2.748858447488585e-06, + "loss": 0.0071, + "step": 8242 + }, + { + "epoch": 7.5278538812785385, + "grad_norm": 1.3866173028945923, + "learning_rate": 2.747843734145104e-06, + "loss": 0.0058, + "step": 8243 + }, + { + "epoch": 7.528767123287671, + "grad_norm": 8.844053268432617, + "learning_rate": 2.7468290208016234e-06, + "loss": 0.0833, + "step": 8244 + }, + { + "epoch": 7.529680365296803, + "grad_norm": 1.1023727655410767, + "learning_rate": 2.7458143074581436e-06, + "loss": 0.0038, + "step": 8245 + }, + { + "epoch": 7.530593607305936, + "grad_norm": 3.4156382083892822, + "learning_rate": 2.744799594114663e-06, + "loss": 0.0214, + "step": 8246 + }, + { + "epoch": 7.531506849315068, + "grad_norm": 0.0678803101181984, + "learning_rate": 2.7437848807711825e-06, + "loss": 0.0004, + "step": 8247 + }, + { + "epoch": 7.532420091324201, + "grad_norm": 8.485189437866211, + "learning_rate": 2.7427701674277018e-06, + "loss": 0.0257, + "step": 8248 + }, + { + "epoch": 7.533333333333333, + "grad_norm": 53.616703033447266, + "learning_rate": 2.741755454084221e-06, + "loss": 0.4182, + "step": 8249 + }, + { + "epoch": 7.534246575342466, + "grad_norm": 2.560091495513916, + "learning_rate": 2.740740740740741e-06, + "loss": 0.0223, + "step": 8250 + }, + { + "epoch": 7.535159817351598, + "grad_norm": 8.394368171691895, + "learning_rate": 2.7397260273972604e-06, + "loss": 0.0702, + "step": 8251 + }, + { + "epoch": 7.536073059360731, + "grad_norm": 18.953962326049805, + "learning_rate": 2.73871131405378e-06, + "loss": 0.1733, + "step": 8252 + }, + { + "epoch": 7.536986301369863, + "grad_norm": 15.797707557678223, + "learning_rate": 2.7376966007102994e-06, + "loss": 0.1131, + "step": 8253 + }, + { + "epoch": 7.5378995433789955, + "grad_norm": 58.48487091064453, + "learning_rate": 2.7366818873668187e-06, + "loss": 0.3127, + "step": 8254 + }, + { + "epoch": 7.538812785388128, + "grad_norm": 3.307178497314453, + "learning_rate": 2.7356671740233388e-06, + "loss": 0.018, + "step": 8255 + }, + { + "epoch": 7.53972602739726, + "grad_norm": 2.8654723167419434, + "learning_rate": 2.734652460679858e-06, + "loss": 0.0063, + "step": 8256 + }, + { + "epoch": 7.540639269406393, + "grad_norm": 0.10545282065868378, + "learning_rate": 2.7336377473363777e-06, + "loss": 0.0006, + "step": 8257 + }, + { + "epoch": 7.541552511415525, + "grad_norm": 2.7773423194885254, + "learning_rate": 2.732623033992897e-06, + "loss": 0.0192, + "step": 8258 + }, + { + "epoch": 7.542465753424658, + "grad_norm": 0.18238332867622375, + "learning_rate": 2.7316083206494167e-06, + "loss": 0.0011, + "step": 8259 + }, + { + "epoch": 7.54337899543379, + "grad_norm": 12.588890075683594, + "learning_rate": 2.7305936073059364e-06, + "loss": 0.1035, + "step": 8260 + }, + { + "epoch": 7.544292237442923, + "grad_norm": 10.342535018920898, + "learning_rate": 2.7295788939624557e-06, + "loss": 0.0688, + "step": 8261 + }, + { + "epoch": 7.545205479452055, + "grad_norm": 5.91970157623291, + "learning_rate": 2.7285641806189754e-06, + "loss": 0.05, + "step": 8262 + }, + { + "epoch": 7.546118721461188, + "grad_norm": 0.853215754032135, + "learning_rate": 2.727549467275495e-06, + "loss": 0.0041, + "step": 8263 + }, + { + "epoch": 7.54703196347032, + "grad_norm": 16.029804229736328, + "learning_rate": 2.7265347539320143e-06, + "loss": 0.0715, + "step": 8264 + }, + { + "epoch": 7.5479452054794525, + "grad_norm": 0.3634248971939087, + "learning_rate": 2.725520040588534e-06, + "loss": 0.0026, + "step": 8265 + }, + { + "epoch": 7.548858447488584, + "grad_norm": 6.739582061767578, + "learning_rate": 2.7245053272450533e-06, + "loss": 0.0577, + "step": 8266 + }, + { + "epoch": 7.549771689497717, + "grad_norm": 75.7167739868164, + "learning_rate": 2.7234906139015734e-06, + "loss": 0.5992, + "step": 8267 + }, + { + "epoch": 7.550684931506849, + "grad_norm": 27.316606521606445, + "learning_rate": 2.7224759005580927e-06, + "loss": 0.1436, + "step": 8268 + }, + { + "epoch": 7.551598173515981, + "grad_norm": 7.814277648925781, + "learning_rate": 2.721461187214612e-06, + "loss": 0.0589, + "step": 8269 + }, + { + "epoch": 7.552511415525114, + "grad_norm": 0.2122264951467514, + "learning_rate": 2.7204464738711316e-06, + "loss": 0.0019, + "step": 8270 + }, + { + "epoch": 7.553424657534246, + "grad_norm": 5.56856107711792, + "learning_rate": 2.719431760527651e-06, + "loss": 0.0354, + "step": 8271 + }, + { + "epoch": 7.554337899543379, + "grad_norm": 39.757568359375, + "learning_rate": 2.718417047184171e-06, + "loss": 0.371, + "step": 8272 + }, + { + "epoch": 7.555251141552511, + "grad_norm": 2.8613622188568115, + "learning_rate": 2.7174023338406903e-06, + "loss": 0.0103, + "step": 8273 + }, + { + "epoch": 7.556164383561644, + "grad_norm": 2.110265016555786, + "learning_rate": 2.7163876204972095e-06, + "loss": 0.0155, + "step": 8274 + }, + { + "epoch": 7.557077625570776, + "grad_norm": 0.3394511342048645, + "learning_rate": 2.7153729071537292e-06, + "loss": 0.0028, + "step": 8275 + }, + { + "epoch": 7.557990867579909, + "grad_norm": 0.32539355754852295, + "learning_rate": 2.7143581938102485e-06, + "loss": 0.0021, + "step": 8276 + }, + { + "epoch": 7.558904109589041, + "grad_norm": 5.84145975112915, + "learning_rate": 2.7133434804667686e-06, + "loss": 0.0378, + "step": 8277 + }, + { + "epoch": 7.5598173515981735, + "grad_norm": 0.7154954671859741, + "learning_rate": 2.712328767123288e-06, + "loss": 0.0046, + "step": 8278 + }, + { + "epoch": 7.560730593607306, + "grad_norm": 0.6931118965148926, + "learning_rate": 2.711314053779807e-06, + "loss": 0.0054, + "step": 8279 + }, + { + "epoch": 7.561643835616438, + "grad_norm": 1.3390730619430542, + "learning_rate": 2.710299340436327e-06, + "loss": 0.0066, + "step": 8280 + }, + { + "epoch": 7.562557077625571, + "grad_norm": 6.73822021484375, + "learning_rate": 2.7092846270928465e-06, + "loss": 0.0453, + "step": 8281 + }, + { + "epoch": 7.563470319634703, + "grad_norm": 0.18923993408679962, + "learning_rate": 2.7082699137493662e-06, + "loss": 0.001, + "step": 8282 + }, + { + "epoch": 7.564383561643836, + "grad_norm": 7.856287002563477, + "learning_rate": 2.7072552004058855e-06, + "loss": 0.0535, + "step": 8283 + }, + { + "epoch": 7.565296803652968, + "grad_norm": 15.73646068572998, + "learning_rate": 2.7062404870624048e-06, + "loss": 0.0809, + "step": 8284 + }, + { + "epoch": 7.566210045662101, + "grad_norm": 24.636871337890625, + "learning_rate": 2.705225773718925e-06, + "loss": 0.2216, + "step": 8285 + }, + { + "epoch": 7.567123287671233, + "grad_norm": 15.563560485839844, + "learning_rate": 2.704211060375444e-06, + "loss": 0.1453, + "step": 8286 + }, + { + "epoch": 7.5680365296803656, + "grad_norm": 42.96854019165039, + "learning_rate": 2.703196347031964e-06, + "loss": 0.2392, + "step": 8287 + }, + { + "epoch": 7.568949771689498, + "grad_norm": 2.0416979789733887, + "learning_rate": 2.702181633688483e-06, + "loss": 0.0164, + "step": 8288 + }, + { + "epoch": 7.5698630136986305, + "grad_norm": 0.6080272793769836, + "learning_rate": 2.7011669203450024e-06, + "loss": 0.0038, + "step": 8289 + }, + { + "epoch": 7.570776255707763, + "grad_norm": 3.1126530170440674, + "learning_rate": 2.7001522070015225e-06, + "loss": 0.021, + "step": 8290 + }, + { + "epoch": 7.5716894977168945, + "grad_norm": 5.953017234802246, + "learning_rate": 2.6991374936580418e-06, + "loss": 0.0449, + "step": 8291 + }, + { + "epoch": 7.572602739726028, + "grad_norm": 18.227691650390625, + "learning_rate": 2.6981227803145615e-06, + "loss": 0.1394, + "step": 8292 + }, + { + "epoch": 7.573515981735159, + "grad_norm": 1.515608787536621, + "learning_rate": 2.6971080669710807e-06, + "loss": 0.009, + "step": 8293 + }, + { + "epoch": 7.574429223744293, + "grad_norm": 3.527585506439209, + "learning_rate": 2.6960933536276e-06, + "loss": 0.0187, + "step": 8294 + }, + { + "epoch": 7.575342465753424, + "grad_norm": 0.4916287660598755, + "learning_rate": 2.69507864028412e-06, + "loss": 0.0036, + "step": 8295 + }, + { + "epoch": 7.576255707762557, + "grad_norm": 1.5730907917022705, + "learning_rate": 2.6940639269406394e-06, + "loss": 0.0083, + "step": 8296 + }, + { + "epoch": 7.577168949771689, + "grad_norm": 51.128883361816406, + "learning_rate": 2.693049213597159e-06, + "loss": 0.3745, + "step": 8297 + }, + { + "epoch": 7.578082191780822, + "grad_norm": 17.24150276184082, + "learning_rate": 2.6920345002536783e-06, + "loss": 0.0568, + "step": 8298 + }, + { + "epoch": 7.578995433789954, + "grad_norm": 17.27112579345703, + "learning_rate": 2.691019786910198e-06, + "loss": 0.1351, + "step": 8299 + }, + { + "epoch": 7.579908675799087, + "grad_norm": 4.464442729949951, + "learning_rate": 2.6900050735667177e-06, + "loss": 0.0231, + "step": 8300 + }, + { + "epoch": 7.580821917808219, + "grad_norm": 5.7456512451171875, + "learning_rate": 2.688990360223237e-06, + "loss": 0.0488, + "step": 8301 + }, + { + "epoch": 7.5817351598173515, + "grad_norm": 6.757369041442871, + "learning_rate": 2.6879756468797567e-06, + "loss": 0.0304, + "step": 8302 + }, + { + "epoch": 7.582648401826484, + "grad_norm": 3.4196150302886963, + "learning_rate": 2.6869609335362764e-06, + "loss": 0.028, + "step": 8303 + }, + { + "epoch": 7.583561643835616, + "grad_norm": 0.23334474861621857, + "learning_rate": 2.6859462201927956e-06, + "loss": 0.001, + "step": 8304 + }, + { + "epoch": 7.584474885844749, + "grad_norm": 1.5922892093658447, + "learning_rate": 2.6849315068493153e-06, + "loss": 0.0112, + "step": 8305 + }, + { + "epoch": 7.585388127853881, + "grad_norm": 3.1556766033172607, + "learning_rate": 2.6839167935058346e-06, + "loss": 0.0218, + "step": 8306 + }, + { + "epoch": 7.586301369863014, + "grad_norm": 3.3805835247039795, + "learning_rate": 2.6829020801623547e-06, + "loss": 0.0213, + "step": 8307 + }, + { + "epoch": 7.587214611872146, + "grad_norm": 1.9140230417251587, + "learning_rate": 2.681887366818874e-06, + "loss": 0.0082, + "step": 8308 + }, + { + "epoch": 7.588127853881279, + "grad_norm": 0.27387306094169617, + "learning_rate": 2.6808726534753933e-06, + "loss": 0.0022, + "step": 8309 + }, + { + "epoch": 7.589041095890411, + "grad_norm": 2.446577787399292, + "learning_rate": 2.679857940131913e-06, + "loss": 0.011, + "step": 8310 + }, + { + "epoch": 7.5899543378995435, + "grad_norm": 0.820435643196106, + "learning_rate": 2.6788432267884322e-06, + "loss": 0.0049, + "step": 8311 + }, + { + "epoch": 7.590867579908676, + "grad_norm": 6.087874412536621, + "learning_rate": 2.6778285134449523e-06, + "loss": 0.0516, + "step": 8312 + }, + { + "epoch": 7.5917808219178085, + "grad_norm": 0.30167779326438904, + "learning_rate": 2.6768138001014716e-06, + "loss": 0.0018, + "step": 8313 + }, + { + "epoch": 7.592694063926941, + "grad_norm": 0.19117262959480286, + "learning_rate": 2.675799086757991e-06, + "loss": 0.0014, + "step": 8314 + }, + { + "epoch": 7.593607305936073, + "grad_norm": 5.601530075073242, + "learning_rate": 2.6747843734145106e-06, + "loss": 0.0332, + "step": 8315 + }, + { + "epoch": 7.594520547945206, + "grad_norm": 0.5325958132743835, + "learning_rate": 2.67376966007103e-06, + "loss": 0.0036, + "step": 8316 + }, + { + "epoch": 7.595433789954338, + "grad_norm": 38.73601531982422, + "learning_rate": 2.67275494672755e-06, + "loss": 0.2701, + "step": 8317 + }, + { + "epoch": 7.59634703196347, + "grad_norm": 3.6033997535705566, + "learning_rate": 2.6717402333840692e-06, + "loss": 0.0255, + "step": 8318 + }, + { + "epoch": 7.597260273972603, + "grad_norm": 27.825725555419922, + "learning_rate": 2.6707255200405885e-06, + "loss": 0.201, + "step": 8319 + }, + { + "epoch": 7.598173515981735, + "grad_norm": 2.4036190509796143, + "learning_rate": 2.669710806697108e-06, + "loss": 0.0112, + "step": 8320 + }, + { + "epoch": 7.599086757990867, + "grad_norm": 0.3082070052623749, + "learning_rate": 2.668696093353628e-06, + "loss": 0.002, + "step": 8321 + }, + { + "epoch": 7.6, + "grad_norm": 2.2029359340667725, + "learning_rate": 2.6676813800101476e-06, + "loss": 0.0116, + "step": 8322 + }, + { + "epoch": 7.600913242009132, + "grad_norm": 1.23002028465271, + "learning_rate": 2.666666666666667e-06, + "loss": 0.0077, + "step": 8323 + }, + { + "epoch": 7.6018264840182646, + "grad_norm": 10.660752296447754, + "learning_rate": 2.665651953323186e-06, + "loss": 0.0645, + "step": 8324 + }, + { + "epoch": 7.602739726027397, + "grad_norm": 1.8575711250305176, + "learning_rate": 2.6646372399797062e-06, + "loss": 0.015, + "step": 8325 + }, + { + "epoch": 7.6036529680365295, + "grad_norm": 17.595190048217773, + "learning_rate": 2.6636225266362255e-06, + "loss": 0.101, + "step": 8326 + }, + { + "epoch": 7.604566210045662, + "grad_norm": 22.64761734008789, + "learning_rate": 2.662607813292745e-06, + "loss": 0.1477, + "step": 8327 + }, + { + "epoch": 7.605479452054794, + "grad_norm": 5.170454025268555, + "learning_rate": 2.6615930999492644e-06, + "loss": 0.0182, + "step": 8328 + }, + { + "epoch": 7.606392694063927, + "grad_norm": 0.02063395082950592, + "learning_rate": 2.6605783866057837e-06, + "loss": 0.0001, + "step": 8329 + }, + { + "epoch": 7.607305936073059, + "grad_norm": 7.358591079711914, + "learning_rate": 2.659563673262304e-06, + "loss": 0.0388, + "step": 8330 + }, + { + "epoch": 7.608219178082192, + "grad_norm": 0.9904012084007263, + "learning_rate": 2.658548959918823e-06, + "loss": 0.0074, + "step": 8331 + }, + { + "epoch": 7.609132420091324, + "grad_norm": 13.055115699768066, + "learning_rate": 2.6575342465753428e-06, + "loss": 0.106, + "step": 8332 + }, + { + "epoch": 7.610045662100457, + "grad_norm": 8.373809814453125, + "learning_rate": 2.656519533231862e-06, + "loss": 0.0717, + "step": 8333 + }, + { + "epoch": 7.610958904109589, + "grad_norm": 15.38681697845459, + "learning_rate": 2.6555048198883813e-06, + "loss": 0.1456, + "step": 8334 + }, + { + "epoch": 7.6118721461187215, + "grad_norm": 0.8566028475761414, + "learning_rate": 2.6544901065449014e-06, + "loss": 0.006, + "step": 8335 + }, + { + "epoch": 7.612785388127854, + "grad_norm": 76.01911163330078, + "learning_rate": 2.6534753932014207e-06, + "loss": 0.4635, + "step": 8336 + }, + { + "epoch": 7.6136986301369864, + "grad_norm": 42.892120361328125, + "learning_rate": 2.6524606798579404e-06, + "loss": 0.4641, + "step": 8337 + }, + { + "epoch": 7.614611872146119, + "grad_norm": 16.69777488708496, + "learning_rate": 2.6514459665144597e-06, + "loss": 0.0423, + "step": 8338 + }, + { + "epoch": 7.615525114155251, + "grad_norm": 3.7840259075164795, + "learning_rate": 2.6504312531709794e-06, + "loss": 0.0227, + "step": 8339 + }, + { + "epoch": 7.616438356164384, + "grad_norm": 4.5383453369140625, + "learning_rate": 2.649416539827499e-06, + "loss": 0.0081, + "step": 8340 + }, + { + "epoch": 7.617351598173516, + "grad_norm": 0.8912345170974731, + "learning_rate": 2.6484018264840183e-06, + "loss": 0.0056, + "step": 8341 + }, + { + "epoch": 7.618264840182649, + "grad_norm": 0.6196173429489136, + "learning_rate": 2.647387113140538e-06, + "loss": 0.0044, + "step": 8342 + }, + { + "epoch": 7.619178082191781, + "grad_norm": 0.4273727536201477, + "learning_rate": 2.6463723997970577e-06, + "loss": 0.0025, + "step": 8343 + }, + { + "epoch": 7.620091324200914, + "grad_norm": 0.4512869417667389, + "learning_rate": 2.645357686453577e-06, + "loss": 0.0029, + "step": 8344 + }, + { + "epoch": 7.621004566210045, + "grad_norm": 14.210027694702148, + "learning_rate": 2.6443429731100967e-06, + "loss": 0.0689, + "step": 8345 + }, + { + "epoch": 7.6219178082191785, + "grad_norm": 1.8341437578201294, + "learning_rate": 2.643328259766616e-06, + "loss": 0.0139, + "step": 8346 + }, + { + "epoch": 7.62283105022831, + "grad_norm": 79.96058654785156, + "learning_rate": 2.642313546423136e-06, + "loss": 0.4936, + "step": 8347 + }, + { + "epoch": 7.6237442922374425, + "grad_norm": 1.000989556312561, + "learning_rate": 2.6412988330796553e-06, + "loss": 0.0055, + "step": 8348 + }, + { + "epoch": 7.624657534246575, + "grad_norm": 16.97439956665039, + "learning_rate": 2.6402841197361746e-06, + "loss": 0.0785, + "step": 8349 + }, + { + "epoch": 7.6255707762557075, + "grad_norm": 10.608771324157715, + "learning_rate": 2.6392694063926943e-06, + "loss": 0.0763, + "step": 8350 + }, + { + "epoch": 7.62648401826484, + "grad_norm": 20.083763122558594, + "learning_rate": 2.6382546930492135e-06, + "loss": 0.148, + "step": 8351 + }, + { + "epoch": 7.627397260273972, + "grad_norm": 1.9977781772613525, + "learning_rate": 2.6372399797057337e-06, + "loss": 0.0039, + "step": 8352 + }, + { + "epoch": 7.628310502283105, + "grad_norm": 0.6260843873023987, + "learning_rate": 2.636225266362253e-06, + "loss": 0.0028, + "step": 8353 + }, + { + "epoch": 7.629223744292237, + "grad_norm": 5.227760314941406, + "learning_rate": 2.635210553018772e-06, + "loss": 0.0257, + "step": 8354 + }, + { + "epoch": 7.63013698630137, + "grad_norm": 0.18665635585784912, + "learning_rate": 2.634195839675292e-06, + "loss": 0.0013, + "step": 8355 + }, + { + "epoch": 7.631050228310502, + "grad_norm": 8.35835075378418, + "learning_rate": 2.633181126331811e-06, + "loss": 0.0441, + "step": 8356 + }, + { + "epoch": 7.631963470319635, + "grad_norm": 0.28283822536468506, + "learning_rate": 2.6321664129883313e-06, + "loss": 0.0019, + "step": 8357 + }, + { + "epoch": 7.632876712328767, + "grad_norm": 0.7366871237754822, + "learning_rate": 2.6311516996448505e-06, + "loss": 0.0033, + "step": 8358 + }, + { + "epoch": 7.6337899543378995, + "grad_norm": 5.170358657836914, + "learning_rate": 2.63013698630137e-06, + "loss": 0.0193, + "step": 8359 + }, + { + "epoch": 7.634703196347032, + "grad_norm": 6.1018171310424805, + "learning_rate": 2.6291222729578895e-06, + "loss": 0.044, + "step": 8360 + }, + { + "epoch": 7.635616438356164, + "grad_norm": 3.3477768898010254, + "learning_rate": 2.628107559614409e-06, + "loss": 0.019, + "step": 8361 + }, + { + "epoch": 7.636529680365297, + "grad_norm": 0.3977595567703247, + "learning_rate": 2.627092846270929e-06, + "loss": 0.0021, + "step": 8362 + }, + { + "epoch": 7.637442922374429, + "grad_norm": 2.4555063247680664, + "learning_rate": 2.626078132927448e-06, + "loss": 0.0222, + "step": 8363 + }, + { + "epoch": 7.638356164383562, + "grad_norm": 1.394718050956726, + "learning_rate": 2.6250634195839674e-06, + "loss": 0.0086, + "step": 8364 + }, + { + "epoch": 7.639269406392694, + "grad_norm": 0.14122264087200165, + "learning_rate": 2.6240487062404875e-06, + "loss": 0.0011, + "step": 8365 + }, + { + "epoch": 7.640182648401827, + "grad_norm": 3.7466657161712646, + "learning_rate": 2.623033992897007e-06, + "loss": 0.0176, + "step": 8366 + }, + { + "epoch": 7.641095890410959, + "grad_norm": 1.4753671884536743, + "learning_rate": 2.6220192795535265e-06, + "loss": 0.0074, + "step": 8367 + }, + { + "epoch": 7.642009132420092, + "grad_norm": 12.265588760375977, + "learning_rate": 2.6210045662100458e-06, + "loss": 0.067, + "step": 8368 + }, + { + "epoch": 7.642922374429224, + "grad_norm": 0.5578894019126892, + "learning_rate": 2.619989852866565e-06, + "loss": 0.0026, + "step": 8369 + }, + { + "epoch": 7.6438356164383565, + "grad_norm": 2.591472864151001, + "learning_rate": 2.618975139523085e-06, + "loss": 0.0144, + "step": 8370 + }, + { + "epoch": 7.644748858447489, + "grad_norm": 7.1167216300964355, + "learning_rate": 2.6179604261796044e-06, + "loss": 0.0349, + "step": 8371 + }, + { + "epoch": 7.6456621004566205, + "grad_norm": 2.2498443126678467, + "learning_rate": 2.616945712836124e-06, + "loss": 0.0122, + "step": 8372 + }, + { + "epoch": 7.646575342465754, + "grad_norm": 1.3417731523513794, + "learning_rate": 2.6159309994926434e-06, + "loss": 0.0083, + "step": 8373 + }, + { + "epoch": 7.647488584474885, + "grad_norm": 0.41838276386260986, + "learning_rate": 2.6149162861491627e-06, + "loss": 0.003, + "step": 8374 + }, + { + "epoch": 7.648401826484018, + "grad_norm": 3.7707574367523193, + "learning_rate": 2.6139015728056828e-06, + "loss": 0.0191, + "step": 8375 + }, + { + "epoch": 7.64931506849315, + "grad_norm": 0.2777758538722992, + "learning_rate": 2.612886859462202e-06, + "loss": 0.0022, + "step": 8376 + }, + { + "epoch": 7.650228310502283, + "grad_norm": 13.381359100341797, + "learning_rate": 2.6118721461187217e-06, + "loss": 0.0803, + "step": 8377 + }, + { + "epoch": 7.651141552511415, + "grad_norm": 70.65721893310547, + "learning_rate": 2.610857432775241e-06, + "loss": 0.7542, + "step": 8378 + }, + { + "epoch": 7.652054794520548, + "grad_norm": 1.661388635635376, + "learning_rate": 2.6098427194317607e-06, + "loss": 0.0083, + "step": 8379 + }, + { + "epoch": 7.65296803652968, + "grad_norm": 10.596964836120605, + "learning_rate": 2.6088280060882804e-06, + "loss": 0.0691, + "step": 8380 + }, + { + "epoch": 7.653881278538813, + "grad_norm": 54.79593276977539, + "learning_rate": 2.6078132927447997e-06, + "loss": 0.3318, + "step": 8381 + }, + { + "epoch": 7.654794520547945, + "grad_norm": 0.7798440456390381, + "learning_rate": 2.6067985794013193e-06, + "loss": 0.0045, + "step": 8382 + }, + { + "epoch": 7.6557077625570775, + "grad_norm": 113.2149429321289, + "learning_rate": 2.605783866057839e-06, + "loss": 1.2818, + "step": 8383 + }, + { + "epoch": 7.65662100456621, + "grad_norm": 3.9769041538238525, + "learning_rate": 2.6047691527143583e-06, + "loss": 0.0229, + "step": 8384 + }, + { + "epoch": 7.657534246575342, + "grad_norm": 0.760898768901825, + "learning_rate": 2.603754439370878e-06, + "loss": 0.006, + "step": 8385 + }, + { + "epoch": 7.658447488584475, + "grad_norm": 14.34677505493164, + "learning_rate": 2.6027397260273973e-06, + "loss": 0.053, + "step": 8386 + }, + { + "epoch": 7.659360730593607, + "grad_norm": 30.96936798095703, + "learning_rate": 2.6017250126839174e-06, + "loss": 0.2063, + "step": 8387 + }, + { + "epoch": 7.66027397260274, + "grad_norm": 45.06000518798828, + "learning_rate": 2.6007102993404366e-06, + "loss": 0.3996, + "step": 8388 + }, + { + "epoch": 7.661187214611872, + "grad_norm": 0.7449963688850403, + "learning_rate": 2.599695585996956e-06, + "loss": 0.0036, + "step": 8389 + }, + { + "epoch": 7.662100456621005, + "grad_norm": 3.895580291748047, + "learning_rate": 2.5986808726534756e-06, + "loss": 0.0292, + "step": 8390 + }, + { + "epoch": 7.663013698630137, + "grad_norm": 2.6746575832366943, + "learning_rate": 2.597666159309995e-06, + "loss": 0.0178, + "step": 8391 + }, + { + "epoch": 7.66392694063927, + "grad_norm": 3.9185738563537598, + "learning_rate": 2.596651445966515e-06, + "loss": 0.0319, + "step": 8392 + }, + { + "epoch": 7.664840182648402, + "grad_norm": 0.2538435161113739, + "learning_rate": 2.5956367326230343e-06, + "loss": 0.0015, + "step": 8393 + }, + { + "epoch": 7.6657534246575345, + "grad_norm": 0.8241569995880127, + "learning_rate": 2.5946220192795535e-06, + "loss": 0.0038, + "step": 8394 + }, + { + "epoch": 7.666666666666667, + "grad_norm": 13.155547142028809, + "learning_rate": 2.5936073059360732e-06, + "loss": 0.1173, + "step": 8395 + }, + { + "epoch": 7.667579908675799, + "grad_norm": 0.17376260459423065, + "learning_rate": 2.5925925925925925e-06, + "loss": 0.0014, + "step": 8396 + }, + { + "epoch": 7.668493150684932, + "grad_norm": 0.1765177845954895, + "learning_rate": 2.5915778792491126e-06, + "loss": 0.0012, + "step": 8397 + }, + { + "epoch": 7.669406392694064, + "grad_norm": 32.70857238769531, + "learning_rate": 2.590563165905632e-06, + "loss": 0.4037, + "step": 8398 + }, + { + "epoch": 7.670319634703196, + "grad_norm": 0.1316203773021698, + "learning_rate": 2.589548452562151e-06, + "loss": 0.0008, + "step": 8399 + }, + { + "epoch": 7.671232876712329, + "grad_norm": 0.6737135052680969, + "learning_rate": 2.588533739218671e-06, + "loss": 0.0042, + "step": 8400 + }, + { + "epoch": 7.672146118721461, + "grad_norm": 2.671971559524536, + "learning_rate": 2.5875190258751905e-06, + "loss": 0.0217, + "step": 8401 + }, + { + "epoch": 7.673059360730593, + "grad_norm": 0.14764174818992615, + "learning_rate": 2.5865043125317102e-06, + "loss": 0.0008, + "step": 8402 + }, + { + "epoch": 7.673972602739726, + "grad_norm": 0.33623749017715454, + "learning_rate": 2.5854895991882295e-06, + "loss": 0.0024, + "step": 8403 + }, + { + "epoch": 7.674885844748858, + "grad_norm": 93.49516296386719, + "learning_rate": 2.5844748858447488e-06, + "loss": 1.0765, + "step": 8404 + }, + { + "epoch": 7.675799086757991, + "grad_norm": 1.800399661064148, + "learning_rate": 2.583460172501269e-06, + "loss": 0.0094, + "step": 8405 + }, + { + "epoch": 7.676712328767123, + "grad_norm": 3.0328149795532227, + "learning_rate": 2.582445459157788e-06, + "loss": 0.023, + "step": 8406 + }, + { + "epoch": 7.6776255707762555, + "grad_norm": 1.1373575925827026, + "learning_rate": 2.581430745814308e-06, + "loss": 0.0064, + "step": 8407 + }, + { + "epoch": 7.678538812785388, + "grad_norm": 0.7699821591377258, + "learning_rate": 2.580416032470827e-06, + "loss": 0.0047, + "step": 8408 + }, + { + "epoch": 7.67945205479452, + "grad_norm": 12.465307235717773, + "learning_rate": 2.5794013191273464e-06, + "loss": 0.0732, + "step": 8409 + }, + { + "epoch": 7.680365296803653, + "grad_norm": 0.8582245111465454, + "learning_rate": 2.5783866057838665e-06, + "loss": 0.0058, + "step": 8410 + }, + { + "epoch": 7.681278538812785, + "grad_norm": 0.5110266804695129, + "learning_rate": 2.5773718924403858e-06, + "loss": 0.0028, + "step": 8411 + }, + { + "epoch": 7.682191780821918, + "grad_norm": 67.10941314697266, + "learning_rate": 2.5763571790969054e-06, + "loss": 0.3681, + "step": 8412 + }, + { + "epoch": 7.68310502283105, + "grad_norm": 53.205101013183594, + "learning_rate": 2.5753424657534247e-06, + "loss": 0.412, + "step": 8413 + }, + { + "epoch": 7.684018264840183, + "grad_norm": 1.001705527305603, + "learning_rate": 2.574327752409944e-06, + "loss": 0.0053, + "step": 8414 + }, + { + "epoch": 7.684931506849315, + "grad_norm": 69.08386993408203, + "learning_rate": 2.573313039066464e-06, + "loss": 0.3927, + "step": 8415 + }, + { + "epoch": 7.685844748858448, + "grad_norm": 1.9065802097320557, + "learning_rate": 2.5722983257229834e-06, + "loss": 0.0131, + "step": 8416 + }, + { + "epoch": 7.68675799086758, + "grad_norm": 0.4707699716091156, + "learning_rate": 2.571283612379503e-06, + "loss": 0.0022, + "step": 8417 + }, + { + "epoch": 7.6876712328767125, + "grad_norm": 1.2625939846038818, + "learning_rate": 2.5702688990360223e-06, + "loss": 0.0077, + "step": 8418 + }, + { + "epoch": 7.688584474885845, + "grad_norm": 0.7746290564537048, + "learning_rate": 2.569254185692542e-06, + "loss": 0.0055, + "step": 8419 + }, + { + "epoch": 7.689497716894977, + "grad_norm": 0.5488254427909851, + "learning_rate": 2.5682394723490617e-06, + "loss": 0.0033, + "step": 8420 + }, + { + "epoch": 7.69041095890411, + "grad_norm": 2.142714023590088, + "learning_rate": 2.567224759005581e-06, + "loss": 0.0152, + "step": 8421 + }, + { + "epoch": 7.691324200913242, + "grad_norm": 0.00591003242880106, + "learning_rate": 2.5662100456621007e-06, + "loss": 0.0, + "step": 8422 + }, + { + "epoch": 7.692237442922375, + "grad_norm": 1.5066555738449097, + "learning_rate": 2.5651953323186204e-06, + "loss": 0.007, + "step": 8423 + }, + { + "epoch": 7.693150684931507, + "grad_norm": 0.9758457541465759, + "learning_rate": 2.5641806189751396e-06, + "loss": 0.0067, + "step": 8424 + }, + { + "epoch": 7.69406392694064, + "grad_norm": 1.071018099784851, + "learning_rate": 2.5631659056316593e-06, + "loss": 0.0058, + "step": 8425 + }, + { + "epoch": 7.694977168949771, + "grad_norm": 1.4028877019882202, + "learning_rate": 2.5621511922881786e-06, + "loss": 0.0087, + "step": 8426 + }, + { + "epoch": 7.695890410958905, + "grad_norm": 1.2872388362884521, + "learning_rate": 2.5611364789446987e-06, + "loss": 0.009, + "step": 8427 + }, + { + "epoch": 7.696803652968036, + "grad_norm": 14.582695960998535, + "learning_rate": 2.560121765601218e-06, + "loss": 0.1054, + "step": 8428 + }, + { + "epoch": 7.697716894977169, + "grad_norm": 0.17442698776721954, + "learning_rate": 2.5591070522577372e-06, + "loss": 0.001, + "step": 8429 + }, + { + "epoch": 7.698630136986301, + "grad_norm": 1.0262271165847778, + "learning_rate": 2.558092338914257e-06, + "loss": 0.0067, + "step": 8430 + }, + { + "epoch": 7.6995433789954335, + "grad_norm": 73.29202270507812, + "learning_rate": 2.557077625570776e-06, + "loss": 0.8659, + "step": 8431 + }, + { + "epoch": 7.700456621004566, + "grad_norm": 1.6039804220199585, + "learning_rate": 2.5560629122272963e-06, + "loss": 0.0112, + "step": 8432 + }, + { + "epoch": 7.701369863013698, + "grad_norm": 11.981701850891113, + "learning_rate": 2.5550481988838156e-06, + "loss": 0.0625, + "step": 8433 + }, + { + "epoch": 7.702283105022831, + "grad_norm": 37.61684799194336, + "learning_rate": 2.554033485540335e-06, + "loss": 0.2227, + "step": 8434 + }, + { + "epoch": 7.703196347031963, + "grad_norm": 8.453336715698242, + "learning_rate": 2.5530187721968546e-06, + "loss": 0.0388, + "step": 8435 + }, + { + "epoch": 7.704109589041096, + "grad_norm": 103.01555633544922, + "learning_rate": 2.552004058853374e-06, + "loss": 2.5271, + "step": 8436 + }, + { + "epoch": 7.705022831050228, + "grad_norm": 7.3161940574646, + "learning_rate": 2.550989345509894e-06, + "loss": 0.0344, + "step": 8437 + }, + { + "epoch": 7.705936073059361, + "grad_norm": 12.900339126586914, + "learning_rate": 2.549974632166413e-06, + "loss": 0.0783, + "step": 8438 + }, + { + "epoch": 7.706849315068493, + "grad_norm": 1.845401644706726, + "learning_rate": 2.5489599188229325e-06, + "loss": 0.0102, + "step": 8439 + }, + { + "epoch": 7.707762557077626, + "grad_norm": 5.229362487792969, + "learning_rate": 2.547945205479452e-06, + "loss": 0.0417, + "step": 8440 + }, + { + "epoch": 7.708675799086758, + "grad_norm": 0.46715831756591797, + "learning_rate": 2.546930492135972e-06, + "loss": 0.0038, + "step": 8441 + }, + { + "epoch": 7.7095890410958905, + "grad_norm": 9.992793083190918, + "learning_rate": 2.5459157787924915e-06, + "loss": 0.0708, + "step": 8442 + }, + { + "epoch": 7.710502283105023, + "grad_norm": 0.9688999056816101, + "learning_rate": 2.544901065449011e-06, + "loss": 0.0059, + "step": 8443 + }, + { + "epoch": 7.711415525114155, + "grad_norm": 0.01494770497083664, + "learning_rate": 2.54388635210553e-06, + "loss": 0.0001, + "step": 8444 + }, + { + "epoch": 7.712328767123288, + "grad_norm": 7.001521587371826, + "learning_rate": 2.54287163876205e-06, + "loss": 0.0412, + "step": 8445 + }, + { + "epoch": 7.71324200913242, + "grad_norm": 5.757671356201172, + "learning_rate": 2.5418569254185695e-06, + "loss": 0.0292, + "step": 8446 + }, + { + "epoch": 7.714155251141553, + "grad_norm": 0.9617651700973511, + "learning_rate": 2.540842212075089e-06, + "loss": 0.0083, + "step": 8447 + }, + { + "epoch": 7.715068493150685, + "grad_norm": 1.664480447769165, + "learning_rate": 2.5398274987316084e-06, + "loss": 0.0094, + "step": 8448 + }, + { + "epoch": 7.715981735159818, + "grad_norm": 0.15311402082443237, + "learning_rate": 2.5388127853881277e-06, + "loss": 0.0008, + "step": 8449 + }, + { + "epoch": 7.71689497716895, + "grad_norm": 78.30779266357422, + "learning_rate": 2.537798072044648e-06, + "loss": 0.6207, + "step": 8450 + }, + { + "epoch": 7.717808219178083, + "grad_norm": 38.515464782714844, + "learning_rate": 2.536783358701167e-06, + "loss": 0.1659, + "step": 8451 + }, + { + "epoch": 7.718721461187215, + "grad_norm": 32.73860549926758, + "learning_rate": 2.5357686453576868e-06, + "loss": 0.2181, + "step": 8452 + }, + { + "epoch": 7.719634703196347, + "grad_norm": 3.8134777545928955, + "learning_rate": 2.534753932014206e-06, + "loss": 0.0174, + "step": 8453 + }, + { + "epoch": 7.72054794520548, + "grad_norm": 1.4492058753967285, + "learning_rate": 2.5337392186707253e-06, + "loss": 0.0074, + "step": 8454 + }, + { + "epoch": 7.7214611872146115, + "grad_norm": 45.14616394042969, + "learning_rate": 2.5327245053272454e-06, + "loss": 0.2887, + "step": 8455 + }, + { + "epoch": 7.722374429223744, + "grad_norm": 1.123939037322998, + "learning_rate": 2.5317097919837647e-06, + "loss": 0.0044, + "step": 8456 + }, + { + "epoch": 7.723287671232876, + "grad_norm": 0.8750854730606079, + "learning_rate": 2.5306950786402844e-06, + "loss": 0.0056, + "step": 8457 + }, + { + "epoch": 7.724200913242009, + "grad_norm": 1.1044842004776, + "learning_rate": 2.5296803652968037e-06, + "loss": 0.0056, + "step": 8458 + }, + { + "epoch": 7.725114155251141, + "grad_norm": 0.06770257651805878, + "learning_rate": 2.5286656519533234e-06, + "loss": 0.0004, + "step": 8459 + }, + { + "epoch": 7.726027397260274, + "grad_norm": 0.4605392515659332, + "learning_rate": 2.527650938609843e-06, + "loss": 0.0032, + "step": 8460 + }, + { + "epoch": 7.726940639269406, + "grad_norm": 11.951367378234863, + "learning_rate": 2.5266362252663623e-06, + "loss": 0.0639, + "step": 8461 + }, + { + "epoch": 7.727853881278539, + "grad_norm": 3.068690538406372, + "learning_rate": 2.525621511922882e-06, + "loss": 0.0117, + "step": 8462 + }, + { + "epoch": 7.728767123287671, + "grad_norm": 10.385239601135254, + "learning_rate": 2.5246067985794017e-06, + "loss": 0.0776, + "step": 8463 + }, + { + "epoch": 7.729680365296804, + "grad_norm": 1.4116380214691162, + "learning_rate": 2.523592085235921e-06, + "loss": 0.0076, + "step": 8464 + }, + { + "epoch": 7.730593607305936, + "grad_norm": 26.16836929321289, + "learning_rate": 2.5225773718924407e-06, + "loss": 0.1988, + "step": 8465 + }, + { + "epoch": 7.7315068493150685, + "grad_norm": 0.03990582376718521, + "learning_rate": 2.52156265854896e-06, + "loss": 0.0003, + "step": 8466 + }, + { + "epoch": 7.732420091324201, + "grad_norm": 0.9469547867774963, + "learning_rate": 2.52054794520548e-06, + "loss": 0.006, + "step": 8467 + }, + { + "epoch": 7.733333333333333, + "grad_norm": 6.0012030601501465, + "learning_rate": 2.5195332318619993e-06, + "loss": 0.0392, + "step": 8468 + }, + { + "epoch": 7.734246575342466, + "grad_norm": 7.456626892089844, + "learning_rate": 2.5185185185185186e-06, + "loss": 0.0429, + "step": 8469 + }, + { + "epoch": 7.735159817351598, + "grad_norm": 20.677669525146484, + "learning_rate": 2.5175038051750383e-06, + "loss": 0.1025, + "step": 8470 + }, + { + "epoch": 7.736073059360731, + "grad_norm": 6.8415141105651855, + "learning_rate": 2.5164890918315575e-06, + "loss": 0.0481, + "step": 8471 + }, + { + "epoch": 7.736986301369863, + "grad_norm": 2.4750173091888428, + "learning_rate": 2.5154743784880777e-06, + "loss": 0.0159, + "step": 8472 + }, + { + "epoch": 7.737899543378996, + "grad_norm": 1.7591865062713623, + "learning_rate": 2.514459665144597e-06, + "loss": 0.013, + "step": 8473 + }, + { + "epoch": 7.738812785388128, + "grad_norm": 5.4305315017700195, + "learning_rate": 2.513444951801116e-06, + "loss": 0.0306, + "step": 8474 + }, + { + "epoch": 7.739726027397261, + "grad_norm": 0.45035359263420105, + "learning_rate": 2.512430238457636e-06, + "loss": 0.0025, + "step": 8475 + }, + { + "epoch": 7.740639269406393, + "grad_norm": 126.3740005493164, + "learning_rate": 2.511415525114155e-06, + "loss": 1.4809, + "step": 8476 + }, + { + "epoch": 7.7415525114155255, + "grad_norm": 48.99962615966797, + "learning_rate": 2.5104008117706753e-06, + "loss": 0.2152, + "step": 8477 + }, + { + "epoch": 7.742465753424657, + "grad_norm": 37.88582992553711, + "learning_rate": 2.5093860984271945e-06, + "loss": 0.2298, + "step": 8478 + }, + { + "epoch": 7.74337899543379, + "grad_norm": 0.2854591906070709, + "learning_rate": 2.508371385083714e-06, + "loss": 0.0017, + "step": 8479 + }, + { + "epoch": 7.744292237442922, + "grad_norm": 0.1235850602388382, + "learning_rate": 2.5073566717402335e-06, + "loss": 0.0007, + "step": 8480 + }, + { + "epoch": 7.745205479452055, + "grad_norm": 91.90619659423828, + "learning_rate": 2.506341958396753e-06, + "loss": 1.0613, + "step": 8481 + }, + { + "epoch": 7.746118721461187, + "grad_norm": 0.00964993704110384, + "learning_rate": 2.505327245053273e-06, + "loss": 0.0001, + "step": 8482 + }, + { + "epoch": 7.747031963470319, + "grad_norm": 4.9356584548950195, + "learning_rate": 2.504312531709792e-06, + "loss": 0.0154, + "step": 8483 + }, + { + "epoch": 7.747945205479452, + "grad_norm": 1.0328634977340698, + "learning_rate": 2.5032978183663114e-06, + "loss": 0.0061, + "step": 8484 + }, + { + "epoch": 7.748858447488584, + "grad_norm": 6.639577388763428, + "learning_rate": 2.5022831050228315e-06, + "loss": 0.0403, + "step": 8485 + }, + { + "epoch": 7.749771689497717, + "grad_norm": 1.2517242431640625, + "learning_rate": 2.501268391679351e-06, + "loss": 0.0077, + "step": 8486 + }, + { + "epoch": 7.750684931506849, + "grad_norm": 66.22010803222656, + "learning_rate": 2.5002536783358705e-06, + "loss": 0.5719, + "step": 8487 + }, + { + "epoch": 7.751598173515982, + "grad_norm": 2.8864028453826904, + "learning_rate": 2.4992389649923898e-06, + "loss": 0.0184, + "step": 8488 + }, + { + "epoch": 7.752511415525114, + "grad_norm": 0.36013516783714294, + "learning_rate": 2.4982242516489095e-06, + "loss": 0.0017, + "step": 8489 + }, + { + "epoch": 7.7534246575342465, + "grad_norm": 9.105583190917969, + "learning_rate": 2.497209538305429e-06, + "loss": 0.069, + "step": 8490 + }, + { + "epoch": 7.754337899543379, + "grad_norm": 8.203500747680664, + "learning_rate": 2.4961948249619484e-06, + "loss": 0.0692, + "step": 8491 + }, + { + "epoch": 7.755251141552511, + "grad_norm": 21.122055053710938, + "learning_rate": 2.495180111618468e-06, + "loss": 0.1559, + "step": 8492 + }, + { + "epoch": 7.756164383561644, + "grad_norm": 62.24258804321289, + "learning_rate": 2.4941653982749874e-06, + "loss": 0.5709, + "step": 8493 + }, + { + "epoch": 7.757077625570776, + "grad_norm": 4.363428115844727, + "learning_rate": 2.493150684931507e-06, + "loss": 0.0216, + "step": 8494 + }, + { + "epoch": 7.757990867579909, + "grad_norm": 0.6792408227920532, + "learning_rate": 2.4921359715880268e-06, + "loss": 0.0031, + "step": 8495 + }, + { + "epoch": 7.758904109589041, + "grad_norm": 6.016949653625488, + "learning_rate": 2.491121258244546e-06, + "loss": 0.029, + "step": 8496 + }, + { + "epoch": 7.759817351598174, + "grad_norm": 0.27437806129455566, + "learning_rate": 2.4901065449010657e-06, + "loss": 0.0013, + "step": 8497 + }, + { + "epoch": 7.760730593607306, + "grad_norm": 35.06062316894531, + "learning_rate": 2.489091831557585e-06, + "loss": 0.2414, + "step": 8498 + }, + { + "epoch": 7.761643835616439, + "grad_norm": 19.662742614746094, + "learning_rate": 2.4880771182141047e-06, + "loss": 0.121, + "step": 8499 + }, + { + "epoch": 7.762557077625571, + "grad_norm": 3.0914418697357178, + "learning_rate": 2.4870624048706244e-06, + "loss": 0.0265, + "step": 8500 + }, + { + "epoch": 7.7634703196347035, + "grad_norm": 3.1344761848449707, + "learning_rate": 2.4860476915271436e-06, + "loss": 0.0212, + "step": 8501 + }, + { + "epoch": 7.764383561643836, + "grad_norm": 0.3691173195838928, + "learning_rate": 2.4850329781836633e-06, + "loss": 0.0025, + "step": 8502 + }, + { + "epoch": 7.765296803652968, + "grad_norm": 0.22853194177150726, + "learning_rate": 2.484018264840183e-06, + "loss": 0.0014, + "step": 8503 + }, + { + "epoch": 7.766210045662101, + "grad_norm": 68.81044006347656, + "learning_rate": 2.4830035514967023e-06, + "loss": 0.4656, + "step": 8504 + }, + { + "epoch": 7.767123287671232, + "grad_norm": 10.1591796875, + "learning_rate": 2.481988838153222e-06, + "loss": 0.0639, + "step": 8505 + }, + { + "epoch": 7.768036529680366, + "grad_norm": 3.302220106124878, + "learning_rate": 2.4809741248097413e-06, + "loss": 0.0158, + "step": 8506 + }, + { + "epoch": 7.768949771689497, + "grad_norm": 98.17849731445312, + "learning_rate": 2.479959411466261e-06, + "loss": 2.5225, + "step": 8507 + }, + { + "epoch": 7.76986301369863, + "grad_norm": 22.91567039489746, + "learning_rate": 2.4789446981227806e-06, + "loss": 0.1942, + "step": 8508 + }, + { + "epoch": 7.770776255707762, + "grad_norm": 1.6187180280685425, + "learning_rate": 2.4779299847793e-06, + "loss": 0.0115, + "step": 8509 + }, + { + "epoch": 7.771689497716895, + "grad_norm": 12.420680046081543, + "learning_rate": 2.4769152714358196e-06, + "loss": 0.0745, + "step": 8510 + }, + { + "epoch": 7.772602739726027, + "grad_norm": 11.681007385253906, + "learning_rate": 2.475900558092339e-06, + "loss": 0.0667, + "step": 8511 + }, + { + "epoch": 7.77351598173516, + "grad_norm": 2.679997682571411, + "learning_rate": 2.4748858447488586e-06, + "loss": 0.0116, + "step": 8512 + }, + { + "epoch": 7.774429223744292, + "grad_norm": 0.18342141807079315, + "learning_rate": 2.4738711314053783e-06, + "loss": 0.0013, + "step": 8513 + }, + { + "epoch": 7.7753424657534245, + "grad_norm": 1.3578945398330688, + "learning_rate": 2.472856418061898e-06, + "loss": 0.0072, + "step": 8514 + }, + { + "epoch": 7.776255707762557, + "grad_norm": 1.4437710046768188, + "learning_rate": 2.4718417047184172e-06, + "loss": 0.0083, + "step": 8515 + }, + { + "epoch": 7.777168949771689, + "grad_norm": 0.04197027534246445, + "learning_rate": 2.4708269913749365e-06, + "loss": 0.0003, + "step": 8516 + }, + { + "epoch": 7.778082191780822, + "grad_norm": 54.013343811035156, + "learning_rate": 2.469812278031456e-06, + "loss": 0.6118, + "step": 8517 + }, + { + "epoch": 7.778995433789954, + "grad_norm": 0.8440414071083069, + "learning_rate": 2.468797564687976e-06, + "loss": 0.0072, + "step": 8518 + }, + { + "epoch": 7.779908675799087, + "grad_norm": 96.48123168945312, + "learning_rate": 2.4677828513444956e-06, + "loss": 0.4646, + "step": 8519 + }, + { + "epoch": 7.780821917808219, + "grad_norm": 0.35791462659835815, + "learning_rate": 2.466768138001015e-06, + "loss": 0.0031, + "step": 8520 + }, + { + "epoch": 7.781735159817352, + "grad_norm": 0.3049001693725586, + "learning_rate": 2.4657534246575345e-06, + "loss": 0.0014, + "step": 8521 + }, + { + "epoch": 7.782648401826484, + "grad_norm": 5.882657051086426, + "learning_rate": 2.4647387113140538e-06, + "loss": 0.031, + "step": 8522 + }, + { + "epoch": 7.7835616438356166, + "grad_norm": 1.5417675971984863, + "learning_rate": 2.4637239979705735e-06, + "loss": 0.0079, + "step": 8523 + }, + { + "epoch": 7.784474885844749, + "grad_norm": 1.955430269241333, + "learning_rate": 2.462709284627093e-06, + "loss": 0.0076, + "step": 8524 + }, + { + "epoch": 7.7853881278538815, + "grad_norm": 0.05669167637825012, + "learning_rate": 2.461694571283613e-06, + "loss": 0.0004, + "step": 8525 + }, + { + "epoch": 7.786301369863014, + "grad_norm": 0.4610481560230255, + "learning_rate": 2.460679857940132e-06, + "loss": 0.0027, + "step": 8526 + }, + { + "epoch": 7.787214611872146, + "grad_norm": 11.913956642150879, + "learning_rate": 2.4596651445966514e-06, + "loss": 0.1097, + "step": 8527 + }, + { + "epoch": 7.788127853881279, + "grad_norm": 0.6714839339256287, + "learning_rate": 2.458650431253171e-06, + "loss": 0.0035, + "step": 8528 + }, + { + "epoch": 7.789041095890411, + "grad_norm": 0.9983475208282471, + "learning_rate": 2.4576357179096908e-06, + "loss": 0.006, + "step": 8529 + }, + { + "epoch": 7.789954337899544, + "grad_norm": 14.981825828552246, + "learning_rate": 2.4566210045662105e-06, + "loss": 0.0756, + "step": 8530 + }, + { + "epoch": 7.790867579908676, + "grad_norm": 51.40768051147461, + "learning_rate": 2.4556062912227297e-06, + "loss": 0.3654, + "step": 8531 + }, + { + "epoch": 7.791780821917808, + "grad_norm": 1.4580415487289429, + "learning_rate": 2.4545915778792494e-06, + "loss": 0.0099, + "step": 8532 + }, + { + "epoch": 7.792694063926941, + "grad_norm": 19.706193923950195, + "learning_rate": 2.4535768645357687e-06, + "loss": 0.134, + "step": 8533 + }, + { + "epoch": 7.793607305936073, + "grad_norm": 0.5328431725502014, + "learning_rate": 2.4525621511922884e-06, + "loss": 0.0025, + "step": 8534 + }, + { + "epoch": 7.794520547945205, + "grad_norm": 2.8111166954040527, + "learning_rate": 2.451547437848808e-06, + "loss": 0.0137, + "step": 8535 + }, + { + "epoch": 7.7954337899543376, + "grad_norm": 38.39690399169922, + "learning_rate": 2.4505327245053274e-06, + "loss": 0.2542, + "step": 8536 + }, + { + "epoch": 7.79634703196347, + "grad_norm": 1.6967099905014038, + "learning_rate": 2.449518011161847e-06, + "loss": 0.0094, + "step": 8537 + }, + { + "epoch": 7.7972602739726025, + "grad_norm": 0.6841561794281006, + "learning_rate": 2.4485032978183663e-06, + "loss": 0.0049, + "step": 8538 + }, + { + "epoch": 7.798173515981735, + "grad_norm": 0.8566179275512695, + "learning_rate": 2.447488584474886e-06, + "loss": 0.0043, + "step": 8539 + }, + { + "epoch": 7.799086757990867, + "grad_norm": 0.054459840059280396, + "learning_rate": 2.4464738711314057e-06, + "loss": 0.0002, + "step": 8540 + }, + { + "epoch": 7.8, + "grad_norm": 0.0466490276157856, + "learning_rate": 2.445459157787925e-06, + "loss": 0.0003, + "step": 8541 + }, + { + "epoch": 7.800913242009132, + "grad_norm": 0.43770232796669006, + "learning_rate": 2.4444444444444447e-06, + "loss": 0.0032, + "step": 8542 + }, + { + "epoch": 7.801826484018265, + "grad_norm": 0.3214038610458374, + "learning_rate": 2.4434297311009644e-06, + "loss": 0.0014, + "step": 8543 + }, + { + "epoch": 7.802739726027397, + "grad_norm": 1.9845082759857178, + "learning_rate": 2.4424150177574836e-06, + "loss": 0.0089, + "step": 8544 + }, + { + "epoch": 7.80365296803653, + "grad_norm": 2.6695010662078857, + "learning_rate": 2.4414003044140033e-06, + "loss": 0.0206, + "step": 8545 + }, + { + "epoch": 7.804566210045662, + "grad_norm": 0.019032800570130348, + "learning_rate": 2.4403855910705226e-06, + "loss": 0.0001, + "step": 8546 + }, + { + "epoch": 7.8054794520547945, + "grad_norm": 10.245553970336914, + "learning_rate": 2.4393708777270423e-06, + "loss": 0.0586, + "step": 8547 + }, + { + "epoch": 7.806392694063927, + "grad_norm": 5.177995681762695, + "learning_rate": 2.438356164383562e-06, + "loss": 0.0294, + "step": 8548 + }, + { + "epoch": 7.8073059360730594, + "grad_norm": 5.160867214202881, + "learning_rate": 2.4373414510400812e-06, + "loss": 0.0084, + "step": 8549 + }, + { + "epoch": 7.808219178082192, + "grad_norm": 31.4989070892334, + "learning_rate": 2.436326737696601e-06, + "loss": 0.1685, + "step": 8550 + }, + { + "epoch": 7.809132420091324, + "grad_norm": 1.9544227123260498, + "learning_rate": 2.43531202435312e-06, + "loss": 0.0082, + "step": 8551 + }, + { + "epoch": 7.810045662100457, + "grad_norm": 4.060977458953857, + "learning_rate": 2.43429731100964e-06, + "loss": 0.0275, + "step": 8552 + }, + { + "epoch": 7.810958904109589, + "grad_norm": 68.39547729492188, + "learning_rate": 2.4332825976661596e-06, + "loss": 0.5128, + "step": 8553 + }, + { + "epoch": 7.811872146118722, + "grad_norm": 0.1576310247182846, + "learning_rate": 2.4322678843226793e-06, + "loss": 0.0013, + "step": 8554 + }, + { + "epoch": 7.812785388127854, + "grad_norm": 8.531307220458984, + "learning_rate": 2.4312531709791985e-06, + "loss": 0.0707, + "step": 8555 + }, + { + "epoch": 7.813698630136987, + "grad_norm": 0.48249250650405884, + "learning_rate": 2.430238457635718e-06, + "loss": 0.0037, + "step": 8556 + }, + { + "epoch": 7.814611872146119, + "grad_norm": 6.759383678436279, + "learning_rate": 2.4292237442922375e-06, + "loss": 0.0444, + "step": 8557 + }, + { + "epoch": 7.8155251141552515, + "grad_norm": 1.8531054258346558, + "learning_rate": 2.428209030948757e-06, + "loss": 0.0087, + "step": 8558 + }, + { + "epoch": 7.816438356164383, + "grad_norm": 1.2998276948928833, + "learning_rate": 2.427194317605277e-06, + "loss": 0.0076, + "step": 8559 + }, + { + "epoch": 7.817351598173516, + "grad_norm": 22.971900939941406, + "learning_rate": 2.426179604261796e-06, + "loss": 0.173, + "step": 8560 + }, + { + "epoch": 7.818264840182648, + "grad_norm": 0.7957291603088379, + "learning_rate": 2.425164890918316e-06, + "loss": 0.002, + "step": 8561 + }, + { + "epoch": 7.8191780821917805, + "grad_norm": 5.856578826904297, + "learning_rate": 2.424150177574835e-06, + "loss": 0.0452, + "step": 8562 + }, + { + "epoch": 7.820091324200913, + "grad_norm": 13.970934867858887, + "learning_rate": 2.423135464231355e-06, + "loss": 0.1185, + "step": 8563 + }, + { + "epoch": 7.821004566210045, + "grad_norm": 8.686532020568848, + "learning_rate": 2.4221207508878745e-06, + "loss": 0.0549, + "step": 8564 + }, + { + "epoch": 7.821917808219178, + "grad_norm": 0.43307560682296753, + "learning_rate": 2.421106037544394e-06, + "loss": 0.0032, + "step": 8565 + }, + { + "epoch": 7.82283105022831, + "grad_norm": 5.664992332458496, + "learning_rate": 2.4200913242009135e-06, + "loss": 0.0305, + "step": 8566 + }, + { + "epoch": 7.823744292237443, + "grad_norm": 1.7540805339813232, + "learning_rate": 2.4190766108574327e-06, + "loss": 0.0109, + "step": 8567 + }, + { + "epoch": 7.824657534246575, + "grad_norm": 9.138778686523438, + "learning_rate": 2.4180618975139524e-06, + "loss": 0.0449, + "step": 8568 + }, + { + "epoch": 7.825570776255708, + "grad_norm": 18.973255157470703, + "learning_rate": 2.417047184170472e-06, + "loss": 0.0949, + "step": 8569 + }, + { + "epoch": 7.82648401826484, + "grad_norm": 4.260653972625732, + "learning_rate": 2.416032470826992e-06, + "loss": 0.035, + "step": 8570 + }, + { + "epoch": 7.8273972602739725, + "grad_norm": 0.26682451367378235, + "learning_rate": 2.415017757483511e-06, + "loss": 0.0018, + "step": 8571 + }, + { + "epoch": 7.828310502283105, + "grad_norm": 0.11532357335090637, + "learning_rate": 2.4140030441400308e-06, + "loss": 0.0007, + "step": 8572 + }, + { + "epoch": 7.829223744292237, + "grad_norm": 0.053494516760110855, + "learning_rate": 2.41298833079655e-06, + "loss": 0.0003, + "step": 8573 + }, + { + "epoch": 7.83013698630137, + "grad_norm": 40.5998649597168, + "learning_rate": 2.4119736174530697e-06, + "loss": 0.3383, + "step": 8574 + }, + { + "epoch": 7.831050228310502, + "grad_norm": 0.15600652992725372, + "learning_rate": 2.4109589041095894e-06, + "loss": 0.001, + "step": 8575 + }, + { + "epoch": 7.831963470319635, + "grad_norm": 53.31300735473633, + "learning_rate": 2.4099441907661087e-06, + "loss": 0.2626, + "step": 8576 + }, + { + "epoch": 7.832876712328767, + "grad_norm": 1.4365839958190918, + "learning_rate": 2.4089294774226284e-06, + "loss": 0.0095, + "step": 8577 + }, + { + "epoch": 7.8337899543379, + "grad_norm": 2.601792335510254, + "learning_rate": 2.4079147640791476e-06, + "loss": 0.0155, + "step": 8578 + }, + { + "epoch": 7.834703196347032, + "grad_norm": 12.257784843444824, + "learning_rate": 2.4069000507356673e-06, + "loss": 0.1079, + "step": 8579 + }, + { + "epoch": 7.835616438356165, + "grad_norm": 2.709296941757202, + "learning_rate": 2.405885337392187e-06, + "loss": 0.0207, + "step": 8580 + }, + { + "epoch": 7.836529680365297, + "grad_norm": 10.71678352355957, + "learning_rate": 2.4048706240487063e-06, + "loss": 0.0577, + "step": 8581 + }, + { + "epoch": 7.8374429223744295, + "grad_norm": 57.949405670166016, + "learning_rate": 2.403855910705226e-06, + "loss": 0.5765, + "step": 8582 + }, + { + "epoch": 7.838356164383562, + "grad_norm": 0.41351568698883057, + "learning_rate": 2.4028411973617457e-06, + "loss": 0.0029, + "step": 8583 + }, + { + "epoch": 7.839269406392694, + "grad_norm": 21.215097427368164, + "learning_rate": 2.401826484018265e-06, + "loss": 0.1096, + "step": 8584 + }, + { + "epoch": 7.840182648401827, + "grad_norm": 67.86446380615234, + "learning_rate": 2.4008117706747846e-06, + "loss": 0.7216, + "step": 8585 + }, + { + "epoch": 7.8410958904109584, + "grad_norm": 5.652812957763672, + "learning_rate": 2.399797057331304e-06, + "loss": 0.0462, + "step": 8586 + }, + { + "epoch": 7.842009132420092, + "grad_norm": 4.316204071044922, + "learning_rate": 2.3987823439878236e-06, + "loss": 0.0273, + "step": 8587 + }, + { + "epoch": 7.842922374429223, + "grad_norm": 1.7041171789169312, + "learning_rate": 2.3977676306443433e-06, + "loss": 0.007, + "step": 8588 + }, + { + "epoch": 7.843835616438356, + "grad_norm": 0.06759412586688995, + "learning_rate": 2.3967529173008626e-06, + "loss": 0.0003, + "step": 8589 + }, + { + "epoch": 7.844748858447488, + "grad_norm": 23.239341735839844, + "learning_rate": 2.3957382039573823e-06, + "loss": 0.1862, + "step": 8590 + }, + { + "epoch": 7.845662100456621, + "grad_norm": 2.4720630645751953, + "learning_rate": 2.3947234906139015e-06, + "loss": 0.0129, + "step": 8591 + }, + { + "epoch": 7.846575342465753, + "grad_norm": 2.6792783737182617, + "learning_rate": 2.3937087772704212e-06, + "loss": 0.0168, + "step": 8592 + }, + { + "epoch": 7.847488584474886, + "grad_norm": 10.234284400939941, + "learning_rate": 2.392694063926941e-06, + "loss": 0.0657, + "step": 8593 + }, + { + "epoch": 7.848401826484018, + "grad_norm": 2.9498562812805176, + "learning_rate": 2.3916793505834606e-06, + "loss": 0.0137, + "step": 8594 + }, + { + "epoch": 7.8493150684931505, + "grad_norm": 44.98371124267578, + "learning_rate": 2.39066463723998e-06, + "loss": 0.6187, + "step": 8595 + }, + { + "epoch": 7.850228310502283, + "grad_norm": 1.6974565982818604, + "learning_rate": 2.389649923896499e-06, + "loss": 0.0098, + "step": 8596 + }, + { + "epoch": 7.851141552511415, + "grad_norm": 2.233363389968872, + "learning_rate": 2.388635210553019e-06, + "loss": 0.0164, + "step": 8597 + }, + { + "epoch": 7.852054794520548, + "grad_norm": 0.7371277213096619, + "learning_rate": 2.3876204972095385e-06, + "loss": 0.0052, + "step": 8598 + }, + { + "epoch": 7.85296803652968, + "grad_norm": 3.7801623344421387, + "learning_rate": 2.3866057838660582e-06, + "loss": 0.0232, + "step": 8599 + }, + { + "epoch": 7.853881278538813, + "grad_norm": 1.1056534051895142, + "learning_rate": 2.3855910705225775e-06, + "loss": 0.0065, + "step": 8600 + }, + { + "epoch": 7.854794520547945, + "grad_norm": 6.400585174560547, + "learning_rate": 2.384576357179097e-06, + "loss": 0.0388, + "step": 8601 + }, + { + "epoch": 7.855707762557078, + "grad_norm": 47.54220962524414, + "learning_rate": 2.3835616438356164e-06, + "loss": 0.432, + "step": 8602 + }, + { + "epoch": 7.85662100456621, + "grad_norm": 3.105330467224121, + "learning_rate": 2.382546930492136e-06, + "loss": 0.017, + "step": 8603 + }, + { + "epoch": 7.857534246575343, + "grad_norm": 5.389704704284668, + "learning_rate": 2.381532217148656e-06, + "loss": 0.0129, + "step": 8604 + }, + { + "epoch": 7.858447488584475, + "grad_norm": 26.82937240600586, + "learning_rate": 2.3805175038051755e-06, + "loss": 0.237, + "step": 8605 + }, + { + "epoch": 7.8593607305936075, + "grad_norm": 4.264394760131836, + "learning_rate": 2.379502790461695e-06, + "loss": 0.0096, + "step": 8606 + }, + { + "epoch": 7.86027397260274, + "grad_norm": 0.5819056630134583, + "learning_rate": 2.378488077118214e-06, + "loss": 0.0024, + "step": 8607 + }, + { + "epoch": 7.861187214611872, + "grad_norm": 6.396565914154053, + "learning_rate": 2.3774733637747338e-06, + "loss": 0.0329, + "step": 8608 + }, + { + "epoch": 7.862100456621005, + "grad_norm": 124.16515350341797, + "learning_rate": 2.3764586504312534e-06, + "loss": 0.7268, + "step": 8609 + }, + { + "epoch": 7.863013698630137, + "grad_norm": 2.8313028812408447, + "learning_rate": 2.375443937087773e-06, + "loss": 0.021, + "step": 8610 + }, + { + "epoch": 7.86392694063927, + "grad_norm": 0.15920978784561157, + "learning_rate": 2.3744292237442924e-06, + "loss": 0.0006, + "step": 8611 + }, + { + "epoch": 7.864840182648402, + "grad_norm": 28.49139404296875, + "learning_rate": 2.373414510400812e-06, + "loss": 0.1779, + "step": 8612 + }, + { + "epoch": 7.865753424657534, + "grad_norm": 2.670351982116699, + "learning_rate": 2.3723997970573314e-06, + "loss": 0.0186, + "step": 8613 + }, + { + "epoch": 7.866666666666667, + "grad_norm": 0.6689489483833313, + "learning_rate": 2.371385083713851e-06, + "loss": 0.004, + "step": 8614 + }, + { + "epoch": 7.867579908675799, + "grad_norm": 15.496308326721191, + "learning_rate": 2.3703703703703707e-06, + "loss": 0.1179, + "step": 8615 + }, + { + "epoch": 7.868493150684931, + "grad_norm": 2.312279462814331, + "learning_rate": 2.36935565702689e-06, + "loss": 0.0161, + "step": 8616 + }, + { + "epoch": 7.869406392694064, + "grad_norm": 6.331683158874512, + "learning_rate": 2.3683409436834097e-06, + "loss": 0.0304, + "step": 8617 + }, + { + "epoch": 7.870319634703196, + "grad_norm": 9.88495922088623, + "learning_rate": 2.367326230339929e-06, + "loss": 0.0636, + "step": 8618 + }, + { + "epoch": 7.8712328767123285, + "grad_norm": 1.4289048910140991, + "learning_rate": 2.3663115169964487e-06, + "loss": 0.0113, + "step": 8619 + }, + { + "epoch": 7.872146118721461, + "grad_norm": 0.10356377065181732, + "learning_rate": 2.3652968036529684e-06, + "loss": 0.0005, + "step": 8620 + }, + { + "epoch": 7.873059360730593, + "grad_norm": 0.14100132882595062, + "learning_rate": 2.3642820903094876e-06, + "loss": 0.0009, + "step": 8621 + }, + { + "epoch": 7.873972602739726, + "grad_norm": 0.33162105083465576, + "learning_rate": 2.3632673769660073e-06, + "loss": 0.0019, + "step": 8622 + }, + { + "epoch": 7.874885844748858, + "grad_norm": 0.19213463366031647, + "learning_rate": 2.362252663622527e-06, + "loss": 0.0009, + "step": 8623 + }, + { + "epoch": 7.875799086757991, + "grad_norm": 68.26844787597656, + "learning_rate": 2.3612379502790463e-06, + "loss": 0.1543, + "step": 8624 + }, + { + "epoch": 7.876712328767123, + "grad_norm": 0.356885701417923, + "learning_rate": 2.360223236935566e-06, + "loss": 0.0018, + "step": 8625 + }, + { + "epoch": 7.877625570776256, + "grad_norm": 117.28203582763672, + "learning_rate": 2.3592085235920852e-06, + "loss": 1.9587, + "step": 8626 + }, + { + "epoch": 7.878538812785388, + "grad_norm": 0.20053058862686157, + "learning_rate": 2.358193810248605e-06, + "loss": 0.0011, + "step": 8627 + }, + { + "epoch": 7.879452054794521, + "grad_norm": 2.92706561088562, + "learning_rate": 2.3571790969051246e-06, + "loss": 0.018, + "step": 8628 + }, + { + "epoch": 7.880365296803653, + "grad_norm": 2.9314002990722656, + "learning_rate": 2.356164383561644e-06, + "loss": 0.0244, + "step": 8629 + }, + { + "epoch": 7.8812785388127855, + "grad_norm": 15.083223342895508, + "learning_rate": 2.3551496702181636e-06, + "loss": 0.1037, + "step": 8630 + }, + { + "epoch": 7.882191780821918, + "grad_norm": 3.0139758586883545, + "learning_rate": 2.354134956874683e-06, + "loss": 0.0177, + "step": 8631 + }, + { + "epoch": 7.88310502283105, + "grad_norm": 0.36445799469947815, + "learning_rate": 2.3531202435312025e-06, + "loss": 0.0025, + "step": 8632 + }, + { + "epoch": 7.884018264840183, + "grad_norm": 19.887357711791992, + "learning_rate": 2.3521055301877222e-06, + "loss": 0.1164, + "step": 8633 + }, + { + "epoch": 7.884931506849315, + "grad_norm": 0.42166668176651, + "learning_rate": 2.351090816844242e-06, + "loss": 0.0024, + "step": 8634 + }, + { + "epoch": 7.885844748858448, + "grad_norm": 2.9008193016052246, + "learning_rate": 2.350076103500761e-06, + "loss": 0.0164, + "step": 8635 + }, + { + "epoch": 7.88675799086758, + "grad_norm": 13.578120231628418, + "learning_rate": 2.3490613901572805e-06, + "loss": 0.1011, + "step": 8636 + }, + { + "epoch": 7.887671232876713, + "grad_norm": 0.13222849369049072, + "learning_rate": 2.3480466768138e-06, + "loss": 0.0009, + "step": 8637 + }, + { + "epoch": 7.888584474885845, + "grad_norm": 0.3387431800365448, + "learning_rate": 2.34703196347032e-06, + "loss": 0.0026, + "step": 8638 + }, + { + "epoch": 7.889497716894978, + "grad_norm": 0.06558472663164139, + "learning_rate": 2.3460172501268395e-06, + "loss": 0.0004, + "step": 8639 + }, + { + "epoch": 7.890410958904109, + "grad_norm": 0.2070460468530655, + "learning_rate": 2.345002536783359e-06, + "loss": 0.0012, + "step": 8640 + }, + { + "epoch": 7.8913242009132425, + "grad_norm": 0.4579417109489441, + "learning_rate": 2.343987823439878e-06, + "loss": 0.002, + "step": 8641 + }, + { + "epoch": 7.892237442922374, + "grad_norm": 2.606358289718628, + "learning_rate": 2.3429731100963978e-06, + "loss": 0.0151, + "step": 8642 + }, + { + "epoch": 7.8931506849315065, + "grad_norm": 30.488117218017578, + "learning_rate": 2.3419583967529175e-06, + "loss": 0.2086, + "step": 8643 + }, + { + "epoch": 7.894063926940639, + "grad_norm": 2.623433828353882, + "learning_rate": 2.340943683409437e-06, + "loss": 0.0141, + "step": 8644 + }, + { + "epoch": 7.894977168949771, + "grad_norm": 9.224105834960938, + "learning_rate": 2.339928970065957e-06, + "loss": 0.0526, + "step": 8645 + }, + { + "epoch": 7.895890410958904, + "grad_norm": 1.9424705505371094, + "learning_rate": 2.338914256722476e-06, + "loss": 0.0128, + "step": 8646 + }, + { + "epoch": 7.896803652968036, + "grad_norm": 9.370805740356445, + "learning_rate": 2.3378995433789954e-06, + "loss": 0.058, + "step": 8647 + }, + { + "epoch": 7.897716894977169, + "grad_norm": 55.340702056884766, + "learning_rate": 2.336884830035515e-06, + "loss": 0.2842, + "step": 8648 + }, + { + "epoch": 7.898630136986301, + "grad_norm": 18.195384979248047, + "learning_rate": 2.3358701166920348e-06, + "loss": 0.0975, + "step": 8649 + }, + { + "epoch": 7.899543378995434, + "grad_norm": 28.61464500427246, + "learning_rate": 2.3348554033485545e-06, + "loss": 0.2075, + "step": 8650 + }, + { + "epoch": 7.900456621004566, + "grad_norm": 16.92331886291504, + "learning_rate": 2.3338406900050737e-06, + "loss": 0.1461, + "step": 8651 + }, + { + "epoch": 7.901369863013699, + "grad_norm": 3.4903948307037354, + "learning_rate": 2.3328259766615934e-06, + "loss": 0.0182, + "step": 8652 + }, + { + "epoch": 7.902283105022831, + "grad_norm": 5.599267482757568, + "learning_rate": 2.3318112633181127e-06, + "loss": 0.0329, + "step": 8653 + }, + { + "epoch": 7.9031963470319635, + "grad_norm": 1.1763137578964233, + "learning_rate": 2.3307965499746324e-06, + "loss": 0.008, + "step": 8654 + }, + { + "epoch": 7.904109589041096, + "grad_norm": 64.12189483642578, + "learning_rate": 2.329781836631152e-06, + "loss": 0.5483, + "step": 8655 + }, + { + "epoch": 7.905022831050228, + "grad_norm": 23.996728897094727, + "learning_rate": 2.3287671232876713e-06, + "loss": 0.2385, + "step": 8656 + }, + { + "epoch": 7.905936073059361, + "grad_norm": 1.3314439058303833, + "learning_rate": 2.327752409944191e-06, + "loss": 0.0044, + "step": 8657 + }, + { + "epoch": 7.906849315068493, + "grad_norm": 2.1867153644561768, + "learning_rate": 2.3267376966007103e-06, + "loss": 0.0128, + "step": 8658 + }, + { + "epoch": 7.907762557077626, + "grad_norm": 1.658318042755127, + "learning_rate": 2.32572298325723e-06, + "loss": 0.0074, + "step": 8659 + }, + { + "epoch": 7.908675799086758, + "grad_norm": 0.17917300760746002, + "learning_rate": 2.3247082699137497e-06, + "loss": 0.0015, + "step": 8660 + }, + { + "epoch": 7.909589041095891, + "grad_norm": 3.5852675437927246, + "learning_rate": 2.323693556570269e-06, + "loss": 0.0168, + "step": 8661 + }, + { + "epoch": 7.910502283105023, + "grad_norm": 0.7072393894195557, + "learning_rate": 2.3226788432267887e-06, + "loss": 0.0059, + "step": 8662 + }, + { + "epoch": 7.911415525114156, + "grad_norm": 2.0782976150512695, + "learning_rate": 2.3216641298833083e-06, + "loss": 0.0142, + "step": 8663 + }, + { + "epoch": 7.912328767123288, + "grad_norm": 2.964940071105957, + "learning_rate": 2.3206494165398276e-06, + "loss": 0.0107, + "step": 8664 + }, + { + "epoch": 7.91324200913242, + "grad_norm": 1.2894295454025269, + "learning_rate": 2.3196347031963473e-06, + "loss": 0.0091, + "step": 8665 + }, + { + "epoch": 7.914155251141553, + "grad_norm": 26.33806037902832, + "learning_rate": 2.3186199898528666e-06, + "loss": 0.1701, + "step": 8666 + }, + { + "epoch": 7.9150684931506845, + "grad_norm": 0.9913585186004639, + "learning_rate": 2.3176052765093863e-06, + "loss": 0.0048, + "step": 8667 + }, + { + "epoch": 7.915981735159818, + "grad_norm": 2.9993903636932373, + "learning_rate": 2.316590563165906e-06, + "loss": 0.0196, + "step": 8668 + }, + { + "epoch": 7.916894977168949, + "grad_norm": 0.9921146631240845, + "learning_rate": 2.3155758498224252e-06, + "loss": 0.0052, + "step": 8669 + }, + { + "epoch": 7.917808219178082, + "grad_norm": 0.017288116738200188, + "learning_rate": 2.314561136478945e-06, + "loss": 0.0001, + "step": 8670 + }, + { + "epoch": 7.918721461187214, + "grad_norm": 0.6181174516677856, + "learning_rate": 2.313546423135464e-06, + "loss": 0.0038, + "step": 8671 + }, + { + "epoch": 7.919634703196347, + "grad_norm": 6.34761381149292, + "learning_rate": 2.312531709791984e-06, + "loss": 0.04, + "step": 8672 + }, + { + "epoch": 7.920547945205479, + "grad_norm": 1.8709851503372192, + "learning_rate": 2.3115169964485036e-06, + "loss": 0.0119, + "step": 8673 + }, + { + "epoch": 7.921461187214612, + "grad_norm": 9.218259811401367, + "learning_rate": 2.3105022831050233e-06, + "loss": 0.0399, + "step": 8674 + }, + { + "epoch": 7.922374429223744, + "grad_norm": 1.1360995769500732, + "learning_rate": 2.3094875697615425e-06, + "loss": 0.0071, + "step": 8675 + }, + { + "epoch": 7.923287671232877, + "grad_norm": 3.46197509765625, + "learning_rate": 2.308472856418062e-06, + "loss": 0.0155, + "step": 8676 + }, + { + "epoch": 7.924200913242009, + "grad_norm": 8.865250587463379, + "learning_rate": 2.3074581430745815e-06, + "loss": 0.052, + "step": 8677 + }, + { + "epoch": 7.9251141552511415, + "grad_norm": 3.3067398071289062, + "learning_rate": 2.306443429731101e-06, + "loss": 0.0184, + "step": 8678 + }, + { + "epoch": 7.926027397260274, + "grad_norm": 1.1365559101104736, + "learning_rate": 2.305428716387621e-06, + "loss": 0.0082, + "step": 8679 + }, + { + "epoch": 7.926940639269406, + "grad_norm": 0.7095019817352295, + "learning_rate": 2.30441400304414e-06, + "loss": 0.0051, + "step": 8680 + }, + { + "epoch": 7.927853881278539, + "grad_norm": 35.31437683105469, + "learning_rate": 2.3033992897006594e-06, + "loss": 0.1964, + "step": 8681 + }, + { + "epoch": 7.928767123287671, + "grad_norm": 27.667156219482422, + "learning_rate": 2.302384576357179e-06, + "loss": 0.1907, + "step": 8682 + }, + { + "epoch": 7.929680365296804, + "grad_norm": 0.08606815338134766, + "learning_rate": 2.301369863013699e-06, + "loss": 0.0006, + "step": 8683 + }, + { + "epoch": 7.930593607305936, + "grad_norm": 3.0352840423583984, + "learning_rate": 2.3003551496702185e-06, + "loss": 0.0107, + "step": 8684 + }, + { + "epoch": 7.931506849315069, + "grad_norm": 0.019652171060442924, + "learning_rate": 2.299340436326738e-06, + "loss": 0.0001, + "step": 8685 + }, + { + "epoch": 7.932420091324201, + "grad_norm": 65.29437255859375, + "learning_rate": 2.2983257229832575e-06, + "loss": 0.3543, + "step": 8686 + }, + { + "epoch": 7.933333333333334, + "grad_norm": 122.75720977783203, + "learning_rate": 2.2973110096397767e-06, + "loss": 1.1284, + "step": 8687 + }, + { + "epoch": 7.934246575342466, + "grad_norm": 1.2173842191696167, + "learning_rate": 2.2962962962962964e-06, + "loss": 0.0061, + "step": 8688 + }, + { + "epoch": 7.9351598173515985, + "grad_norm": 2.9750821590423584, + "learning_rate": 2.295281582952816e-06, + "loss": 0.0178, + "step": 8689 + }, + { + "epoch": 7.936073059360731, + "grad_norm": 21.13409423828125, + "learning_rate": 2.294266869609336e-06, + "loss": 0.1701, + "step": 8690 + }, + { + "epoch": 7.936986301369863, + "grad_norm": 10.043171882629395, + "learning_rate": 2.293252156265855e-06, + "loss": 0.0666, + "step": 8691 + }, + { + "epoch": 7.937899543378995, + "grad_norm": 2.845848560333252, + "learning_rate": 2.2922374429223748e-06, + "loss": 0.0211, + "step": 8692 + }, + { + "epoch": 7.938812785388128, + "grad_norm": 1.7918696403503418, + "learning_rate": 2.291222729578894e-06, + "loss": 0.0097, + "step": 8693 + }, + { + "epoch": 7.93972602739726, + "grad_norm": 0.6797829270362854, + "learning_rate": 2.2902080162354137e-06, + "loss": 0.0039, + "step": 8694 + }, + { + "epoch": 7.940639269406392, + "grad_norm": 0.3657839000225067, + "learning_rate": 2.2891933028919334e-06, + "loss": 0.0022, + "step": 8695 + }, + { + "epoch": 7.941552511415525, + "grad_norm": 2.1202123165130615, + "learning_rate": 2.2881785895484527e-06, + "loss": 0.0116, + "step": 8696 + }, + { + "epoch": 7.942465753424657, + "grad_norm": 1.4066376686096191, + "learning_rate": 2.2871638762049724e-06, + "loss": 0.007, + "step": 8697 + }, + { + "epoch": 7.94337899543379, + "grad_norm": 4.133856773376465, + "learning_rate": 2.2861491628614916e-06, + "loss": 0.0287, + "step": 8698 + }, + { + "epoch": 7.944292237442922, + "grad_norm": 28.898365020751953, + "learning_rate": 2.2851344495180113e-06, + "loss": 0.1425, + "step": 8699 + }, + { + "epoch": 7.945205479452055, + "grad_norm": 0.07729022204875946, + "learning_rate": 2.284119736174531e-06, + "loss": 0.0005, + "step": 8700 + }, + { + "epoch": 7.946118721461187, + "grad_norm": 2.1938393115997314, + "learning_rate": 2.2831050228310503e-06, + "loss": 0.0117, + "step": 8701 + }, + { + "epoch": 7.9470319634703195, + "grad_norm": 10.524508476257324, + "learning_rate": 2.28209030948757e-06, + "loss": 0.0768, + "step": 8702 + }, + { + "epoch": 7.947945205479452, + "grad_norm": 0.28629201650619507, + "learning_rate": 2.2810755961440897e-06, + "loss": 0.0015, + "step": 8703 + }, + { + "epoch": 7.948858447488584, + "grad_norm": 1.4148439168930054, + "learning_rate": 2.280060882800609e-06, + "loss": 0.0061, + "step": 8704 + }, + { + "epoch": 7.949771689497717, + "grad_norm": 0.04352482408285141, + "learning_rate": 2.2790461694571286e-06, + "loss": 0.0003, + "step": 8705 + }, + { + "epoch": 7.950684931506849, + "grad_norm": 2.0408051013946533, + "learning_rate": 2.278031456113648e-06, + "loss": 0.0145, + "step": 8706 + }, + { + "epoch": 7.951598173515982, + "grad_norm": 0.7092624306678772, + "learning_rate": 2.2770167427701676e-06, + "loss": 0.0047, + "step": 8707 + }, + { + "epoch": 7.952511415525114, + "grad_norm": 113.95391845703125, + "learning_rate": 2.2760020294266873e-06, + "loss": 0.4896, + "step": 8708 + }, + { + "epoch": 7.953424657534247, + "grad_norm": 0.5420204401016235, + "learning_rate": 2.2749873160832066e-06, + "loss": 0.0027, + "step": 8709 + }, + { + "epoch": 7.954337899543379, + "grad_norm": 106.0118408203125, + "learning_rate": 2.2739726027397262e-06, + "loss": 1.4858, + "step": 8710 + }, + { + "epoch": 7.955251141552512, + "grad_norm": 1.9869273900985718, + "learning_rate": 2.2729578893962455e-06, + "loss": 0.0123, + "step": 8711 + }, + { + "epoch": 7.956164383561644, + "grad_norm": 103.4651870727539, + "learning_rate": 2.271943176052765e-06, + "loss": 1.8915, + "step": 8712 + }, + { + "epoch": 7.9570776255707765, + "grad_norm": 52.75053405761719, + "learning_rate": 2.270928462709285e-06, + "loss": 0.4957, + "step": 8713 + }, + { + "epoch": 7.957990867579909, + "grad_norm": 15.925929069519043, + "learning_rate": 2.2699137493658046e-06, + "loss": 0.1491, + "step": 8714 + }, + { + "epoch": 7.958904109589041, + "grad_norm": 0.40849435329437256, + "learning_rate": 2.268899036022324e-06, + "loss": 0.0038, + "step": 8715 + }, + { + "epoch": 7.959817351598174, + "grad_norm": 2.465897560119629, + "learning_rate": 2.267884322678843e-06, + "loss": 0.0128, + "step": 8716 + }, + { + "epoch": 7.960730593607306, + "grad_norm": 8.233209609985352, + "learning_rate": 2.266869609335363e-06, + "loss": 0.0463, + "step": 8717 + }, + { + "epoch": 7.961643835616439, + "grad_norm": 1.3499081134796143, + "learning_rate": 2.2658548959918825e-06, + "loss": 0.0068, + "step": 8718 + }, + { + "epoch": 7.96255707762557, + "grad_norm": 54.06340408325195, + "learning_rate": 2.264840182648402e-06, + "loss": 0.4426, + "step": 8719 + }, + { + "epoch": 7.963470319634704, + "grad_norm": 6.684980392456055, + "learning_rate": 2.2638254693049215e-06, + "loss": 0.0354, + "step": 8720 + }, + { + "epoch": 7.964383561643835, + "grad_norm": 0.6370133757591248, + "learning_rate": 2.2628107559614407e-06, + "loss": 0.0029, + "step": 8721 + }, + { + "epoch": 7.965296803652968, + "grad_norm": 51.27949905395508, + "learning_rate": 2.2617960426179604e-06, + "loss": 0.4099, + "step": 8722 + }, + { + "epoch": 7.9662100456621, + "grad_norm": 33.542091369628906, + "learning_rate": 2.26078132927448e-06, + "loss": 0.3051, + "step": 8723 + }, + { + "epoch": 7.967123287671233, + "grad_norm": 3.4979050159454346, + "learning_rate": 2.259766615931e-06, + "loss": 0.0175, + "step": 8724 + }, + { + "epoch": 7.968036529680365, + "grad_norm": 43.72832489013672, + "learning_rate": 2.2587519025875195e-06, + "loss": 0.2039, + "step": 8725 + }, + { + "epoch": 7.9689497716894975, + "grad_norm": 6.656828880310059, + "learning_rate": 2.2577371892440388e-06, + "loss": 0.0406, + "step": 8726 + }, + { + "epoch": 7.96986301369863, + "grad_norm": 3.5312516689300537, + "learning_rate": 2.256722475900558e-06, + "loss": 0.022, + "step": 8727 + }, + { + "epoch": 7.970776255707762, + "grad_norm": 25.959949493408203, + "learning_rate": 2.2557077625570777e-06, + "loss": 0.2018, + "step": 8728 + }, + { + "epoch": 7.971689497716895, + "grad_norm": 53.563941955566406, + "learning_rate": 2.2546930492135974e-06, + "loss": 0.3175, + "step": 8729 + }, + { + "epoch": 7.972602739726027, + "grad_norm": 0.7654889822006226, + "learning_rate": 2.253678335870117e-06, + "loss": 0.0058, + "step": 8730 + }, + { + "epoch": 7.97351598173516, + "grad_norm": 0.689769446849823, + "learning_rate": 2.2526636225266364e-06, + "loss": 0.0053, + "step": 8731 + }, + { + "epoch": 7.974429223744292, + "grad_norm": 3.6202988624572754, + "learning_rate": 2.2516489091831557e-06, + "loss": 0.0219, + "step": 8732 + }, + { + "epoch": 7.975342465753425, + "grad_norm": 9.135167121887207, + "learning_rate": 2.2506341958396754e-06, + "loss": 0.017, + "step": 8733 + }, + { + "epoch": 7.976255707762557, + "grad_norm": 38.06856918334961, + "learning_rate": 2.249619482496195e-06, + "loss": 0.1285, + "step": 8734 + }, + { + "epoch": 7.9771689497716896, + "grad_norm": 1.9805649518966675, + "learning_rate": 2.2486047691527147e-06, + "loss": 0.0113, + "step": 8735 + }, + { + "epoch": 7.978082191780822, + "grad_norm": 0.018672166392207146, + "learning_rate": 2.247590055809234e-06, + "loss": 0.0001, + "step": 8736 + }, + { + "epoch": 7.9789954337899545, + "grad_norm": 0.24584978818893433, + "learning_rate": 2.2465753424657537e-06, + "loss": 0.0017, + "step": 8737 + }, + { + "epoch": 7.979908675799087, + "grad_norm": 0.20167799293994904, + "learning_rate": 2.245560629122273e-06, + "loss": 0.0012, + "step": 8738 + }, + { + "epoch": 7.980821917808219, + "grad_norm": 0.9824639558792114, + "learning_rate": 2.2445459157787927e-06, + "loss": 0.0065, + "step": 8739 + }, + { + "epoch": 7.981735159817352, + "grad_norm": 51.53583526611328, + "learning_rate": 2.2435312024353124e-06, + "loss": 0.4664, + "step": 8740 + }, + { + "epoch": 7.982648401826484, + "grad_norm": 4.145506381988525, + "learning_rate": 2.2425164890918316e-06, + "loss": 0.0191, + "step": 8741 + }, + { + "epoch": 7.983561643835617, + "grad_norm": 7.2626166343688965, + "learning_rate": 2.2415017757483513e-06, + "loss": 0.0244, + "step": 8742 + }, + { + "epoch": 7.984474885844749, + "grad_norm": 45.22275161743164, + "learning_rate": 2.240487062404871e-06, + "loss": 0.2982, + "step": 8743 + }, + { + "epoch": 7.985388127853882, + "grad_norm": 0.6439334154129028, + "learning_rate": 2.2394723490613903e-06, + "loss": 0.002, + "step": 8744 + }, + { + "epoch": 7.986301369863014, + "grad_norm": 37.68803405761719, + "learning_rate": 2.23845763571791e-06, + "loss": 0.1231, + "step": 8745 + }, + { + "epoch": 7.987214611872146, + "grad_norm": 0.10321711748838425, + "learning_rate": 2.2374429223744292e-06, + "loss": 0.0007, + "step": 8746 + }, + { + "epoch": 7.988127853881279, + "grad_norm": 13.27196216583252, + "learning_rate": 2.236428209030949e-06, + "loss": 0.1509, + "step": 8747 + }, + { + "epoch": 7.989041095890411, + "grad_norm": 0.5412625670433044, + "learning_rate": 2.2354134956874686e-06, + "loss": 0.0039, + "step": 8748 + }, + { + "epoch": 7.989954337899543, + "grad_norm": 5.54589319229126, + "learning_rate": 2.234398782343988e-06, + "loss": 0.0301, + "step": 8749 + }, + { + "epoch": 7.9908675799086755, + "grad_norm": 5.894397258758545, + "learning_rate": 2.2333840690005076e-06, + "loss": 0.0345, + "step": 8750 + }, + { + "epoch": 7.991780821917808, + "grad_norm": 0.49905475974082947, + "learning_rate": 2.232369355657027e-06, + "loss": 0.0033, + "step": 8751 + }, + { + "epoch": 7.99269406392694, + "grad_norm": 6.144713878631592, + "learning_rate": 2.2313546423135465e-06, + "loss": 0.0276, + "step": 8752 + }, + { + "epoch": 7.993607305936073, + "grad_norm": 0.06349267810583115, + "learning_rate": 2.2303399289700662e-06, + "loss": 0.0004, + "step": 8753 + }, + { + "epoch": 7.994520547945205, + "grad_norm": 0.7757874727249146, + "learning_rate": 2.229325215626586e-06, + "loss": 0.0065, + "step": 8754 + }, + { + "epoch": 7.995433789954338, + "grad_norm": 5.4968180656433105, + "learning_rate": 2.228310502283105e-06, + "loss": 0.0303, + "step": 8755 + }, + { + "epoch": 7.99634703196347, + "grad_norm": 0.11214578151702881, + "learning_rate": 2.2272957889396245e-06, + "loss": 0.0007, + "step": 8756 + }, + { + "epoch": 7.997260273972603, + "grad_norm": 6.553471565246582, + "learning_rate": 2.226281075596144e-06, + "loss": 0.0385, + "step": 8757 + }, + { + "epoch": 7.998173515981735, + "grad_norm": 0.0066371639259159565, + "learning_rate": 2.225266362252664e-06, + "loss": 0.0, + "step": 8758 + }, + { + "epoch": 7.9990867579908675, + "grad_norm": 17.553869247436523, + "learning_rate": 2.2242516489091835e-06, + "loss": 0.1283, + "step": 8759 + }, + { + "epoch": 8.0, + "grad_norm": 96.47925567626953, + "learning_rate": 2.223236935565703e-06, + "loss": 0.8238, + "step": 8760 + }, + { + "epoch": 8.000913242009132, + "grad_norm": 116.12772369384766, + "learning_rate": 2.222222222222222e-06, + "loss": 1.7867, + "step": 8761 + }, + { + "epoch": 8.001826484018265, + "grad_norm": 22.4669189453125, + "learning_rate": 2.2212075088787418e-06, + "loss": 0.2515, + "step": 8762 + }, + { + "epoch": 8.002739726027396, + "grad_norm": 0.5266666412353516, + "learning_rate": 2.2201927955352615e-06, + "loss": 0.0039, + "step": 8763 + }, + { + "epoch": 8.00365296803653, + "grad_norm": 4.785877227783203, + "learning_rate": 2.219178082191781e-06, + "loss": 0.0288, + "step": 8764 + }, + { + "epoch": 8.004566210045661, + "grad_norm": 15.754936218261719, + "learning_rate": 2.218163368848301e-06, + "loss": 0.1374, + "step": 8765 + }, + { + "epoch": 8.005479452054795, + "grad_norm": 1.49299955368042, + "learning_rate": 2.21714865550482e-06, + "loss": 0.008, + "step": 8766 + }, + { + "epoch": 8.006392694063926, + "grad_norm": 0.993485689163208, + "learning_rate": 2.2161339421613394e-06, + "loss": 0.0056, + "step": 8767 + }, + { + "epoch": 8.00730593607306, + "grad_norm": 0.2790305018424988, + "learning_rate": 2.215119228817859e-06, + "loss": 0.0012, + "step": 8768 + }, + { + "epoch": 8.008219178082191, + "grad_norm": 11.218036651611328, + "learning_rate": 2.2141045154743788e-06, + "loss": 0.0658, + "step": 8769 + }, + { + "epoch": 8.009132420091325, + "grad_norm": 0.6416463851928711, + "learning_rate": 2.2130898021308985e-06, + "loss": 0.0035, + "step": 8770 + }, + { + "epoch": 8.010045662100456, + "grad_norm": 1.307419776916504, + "learning_rate": 2.2120750887874177e-06, + "loss": 0.0065, + "step": 8771 + }, + { + "epoch": 8.01095890410959, + "grad_norm": 11.570515632629395, + "learning_rate": 2.211060375443937e-06, + "loss": 0.0738, + "step": 8772 + }, + { + "epoch": 8.011872146118721, + "grad_norm": 0.8977282047271729, + "learning_rate": 2.2100456621004567e-06, + "loss": 0.0062, + "step": 8773 + }, + { + "epoch": 8.012785388127854, + "grad_norm": 24.585214614868164, + "learning_rate": 2.2090309487569764e-06, + "loss": 0.1168, + "step": 8774 + }, + { + "epoch": 8.013698630136986, + "grad_norm": 0.5072063207626343, + "learning_rate": 2.208016235413496e-06, + "loss": 0.0038, + "step": 8775 + }, + { + "epoch": 8.01461187214612, + "grad_norm": 0.14383023977279663, + "learning_rate": 2.2070015220700153e-06, + "loss": 0.0008, + "step": 8776 + }, + { + "epoch": 8.01552511415525, + "grad_norm": 1.5758849382400513, + "learning_rate": 2.205986808726535e-06, + "loss": 0.0134, + "step": 8777 + }, + { + "epoch": 8.016438356164384, + "grad_norm": 5.965433597564697, + "learning_rate": 2.2049720953830543e-06, + "loss": 0.0281, + "step": 8778 + }, + { + "epoch": 8.017351598173516, + "grad_norm": 12.289565086364746, + "learning_rate": 2.203957382039574e-06, + "loss": 0.1108, + "step": 8779 + }, + { + "epoch": 8.018264840182649, + "grad_norm": 0.4246496260166168, + "learning_rate": 2.2029426686960937e-06, + "loss": 0.0025, + "step": 8780 + }, + { + "epoch": 8.01917808219178, + "grad_norm": 0.7009654641151428, + "learning_rate": 2.201927955352613e-06, + "loss": 0.0048, + "step": 8781 + }, + { + "epoch": 8.020091324200914, + "grad_norm": 0.25872233510017395, + "learning_rate": 2.2009132420091326e-06, + "loss": 0.002, + "step": 8782 + }, + { + "epoch": 8.021004566210046, + "grad_norm": 0.3547845482826233, + "learning_rate": 2.199898528665652e-06, + "loss": 0.0016, + "step": 8783 + }, + { + "epoch": 8.021917808219179, + "grad_norm": 10.481523513793945, + "learning_rate": 2.1988838153221716e-06, + "loss": 0.0478, + "step": 8784 + }, + { + "epoch": 8.02283105022831, + "grad_norm": 81.99079132080078, + "learning_rate": 2.1978691019786913e-06, + "loss": 0.7266, + "step": 8785 + }, + { + "epoch": 8.023744292237444, + "grad_norm": 1.785321593284607, + "learning_rate": 2.1968543886352106e-06, + "loss": 0.0124, + "step": 8786 + }, + { + "epoch": 8.024657534246575, + "grad_norm": 29.867752075195312, + "learning_rate": 2.1958396752917303e-06, + "loss": 0.2015, + "step": 8787 + }, + { + "epoch": 8.025570776255707, + "grad_norm": 0.44082435965538025, + "learning_rate": 2.19482496194825e-06, + "loss": 0.0029, + "step": 8788 + }, + { + "epoch": 8.02648401826484, + "grad_norm": 1.2272562980651855, + "learning_rate": 2.1938102486047692e-06, + "loss": 0.0082, + "step": 8789 + }, + { + "epoch": 8.027397260273972, + "grad_norm": 23.723278045654297, + "learning_rate": 2.192795535261289e-06, + "loss": 0.2482, + "step": 8790 + }, + { + "epoch": 8.028310502283105, + "grad_norm": 33.63526153564453, + "learning_rate": 2.191780821917808e-06, + "loss": 0.2039, + "step": 8791 + }, + { + "epoch": 8.029223744292237, + "grad_norm": 0.030027028173208237, + "learning_rate": 2.190766108574328e-06, + "loss": 0.0002, + "step": 8792 + }, + { + "epoch": 8.03013698630137, + "grad_norm": 74.27276611328125, + "learning_rate": 2.1897513952308476e-06, + "loss": 0.571, + "step": 8793 + }, + { + "epoch": 8.031050228310502, + "grad_norm": 0.11698788404464722, + "learning_rate": 2.1887366818873673e-06, + "loss": 0.0007, + "step": 8794 + }, + { + "epoch": 8.031963470319635, + "grad_norm": 2.509984254837036, + "learning_rate": 2.1877219685438865e-06, + "loss": 0.0166, + "step": 8795 + }, + { + "epoch": 8.032876712328767, + "grad_norm": 22.007917404174805, + "learning_rate": 2.186707255200406e-06, + "loss": 0.1111, + "step": 8796 + }, + { + "epoch": 8.0337899543379, + "grad_norm": 15.27442741394043, + "learning_rate": 2.1856925418569255e-06, + "loss": 0.0817, + "step": 8797 + }, + { + "epoch": 8.034703196347031, + "grad_norm": 2.9569005966186523, + "learning_rate": 2.184677828513445e-06, + "loss": 0.0176, + "step": 8798 + }, + { + "epoch": 8.035616438356165, + "grad_norm": 39.7000732421875, + "learning_rate": 2.183663115169965e-06, + "loss": 0.3138, + "step": 8799 + }, + { + "epoch": 8.036529680365296, + "grad_norm": 16.116918563842773, + "learning_rate": 2.182648401826484e-06, + "loss": 0.1225, + "step": 8800 + }, + { + "epoch": 8.03744292237443, + "grad_norm": 2.193979501724243, + "learning_rate": 2.1816336884830034e-06, + "loss": 0.0135, + "step": 8801 + }, + { + "epoch": 8.038356164383561, + "grad_norm": 0.93686842918396, + "learning_rate": 2.180618975139523e-06, + "loss": 0.0028, + "step": 8802 + }, + { + "epoch": 8.039269406392695, + "grad_norm": 15.6764497756958, + "learning_rate": 2.179604261796043e-06, + "loss": 0.1067, + "step": 8803 + }, + { + "epoch": 8.040182648401826, + "grad_norm": 2.3466031551361084, + "learning_rate": 2.1785895484525625e-06, + "loss": 0.0115, + "step": 8804 + }, + { + "epoch": 8.04109589041096, + "grad_norm": 7.084660053253174, + "learning_rate": 2.177574835109082e-06, + "loss": 0.0535, + "step": 8805 + }, + { + "epoch": 8.042009132420091, + "grad_norm": 5.5796990394592285, + "learning_rate": 2.1765601217656014e-06, + "loss": 0.0276, + "step": 8806 + }, + { + "epoch": 8.042922374429224, + "grad_norm": 3.646972894668579, + "learning_rate": 2.1755454084221207e-06, + "loss": 0.0196, + "step": 8807 + }, + { + "epoch": 8.043835616438356, + "grad_norm": 0.3332923352718353, + "learning_rate": 2.1745306950786404e-06, + "loss": 0.0016, + "step": 8808 + }, + { + "epoch": 8.04474885844749, + "grad_norm": 0.7902898192405701, + "learning_rate": 2.17351598173516e-06, + "loss": 0.0048, + "step": 8809 + }, + { + "epoch": 8.045662100456621, + "grad_norm": 7.913810729980469, + "learning_rate": 2.1725012683916798e-06, + "loss": 0.0505, + "step": 8810 + }, + { + "epoch": 8.046575342465754, + "grad_norm": 7.165537357330322, + "learning_rate": 2.171486555048199e-06, + "loss": 0.0617, + "step": 8811 + }, + { + "epoch": 8.047488584474886, + "grad_norm": 102.90968322753906, + "learning_rate": 2.1704718417047183e-06, + "loss": 0.5599, + "step": 8812 + }, + { + "epoch": 8.04840182648402, + "grad_norm": 1.2893348932266235, + "learning_rate": 2.169457128361238e-06, + "loss": 0.0049, + "step": 8813 + }, + { + "epoch": 8.04931506849315, + "grad_norm": 0.22204330563545227, + "learning_rate": 2.1684424150177577e-06, + "loss": 0.0014, + "step": 8814 + }, + { + "epoch": 8.050228310502282, + "grad_norm": 0.28318750858306885, + "learning_rate": 2.1674277016742774e-06, + "loss": 0.0013, + "step": 8815 + }, + { + "epoch": 8.051141552511416, + "grad_norm": 0.09048189222812653, + "learning_rate": 2.1664129883307967e-06, + "loss": 0.0006, + "step": 8816 + }, + { + "epoch": 8.052054794520547, + "grad_norm": 1.0173581838607788, + "learning_rate": 2.1653982749873164e-06, + "loss": 0.0094, + "step": 8817 + }, + { + "epoch": 8.05296803652968, + "grad_norm": 0.5073345303535461, + "learning_rate": 2.1643835616438356e-06, + "loss": 0.0024, + "step": 8818 + }, + { + "epoch": 8.053881278538812, + "grad_norm": 16.57391929626465, + "learning_rate": 2.1633688483003553e-06, + "loss": 0.119, + "step": 8819 + }, + { + "epoch": 8.054794520547945, + "grad_norm": 0.038163989782333374, + "learning_rate": 2.162354134956875e-06, + "loss": 0.0002, + "step": 8820 + }, + { + "epoch": 8.055707762557077, + "grad_norm": 3.533395528793335, + "learning_rate": 2.1613394216133943e-06, + "loss": 0.0158, + "step": 8821 + }, + { + "epoch": 8.05662100456621, + "grad_norm": 11.870747566223145, + "learning_rate": 2.160324708269914e-06, + "loss": 0.1034, + "step": 8822 + }, + { + "epoch": 8.057534246575342, + "grad_norm": 5.830184459686279, + "learning_rate": 2.1593099949264332e-06, + "loss": 0.0334, + "step": 8823 + }, + { + "epoch": 8.058447488584475, + "grad_norm": 0.26819026470184326, + "learning_rate": 2.158295281582953e-06, + "loss": 0.0013, + "step": 8824 + }, + { + "epoch": 8.059360730593607, + "grad_norm": 23.528194427490234, + "learning_rate": 2.1572805682394726e-06, + "loss": 0.2202, + "step": 8825 + }, + { + "epoch": 8.06027397260274, + "grad_norm": 2.263442277908325, + "learning_rate": 2.156265854895992e-06, + "loss": 0.012, + "step": 8826 + }, + { + "epoch": 8.061187214611872, + "grad_norm": 10.517757415771484, + "learning_rate": 2.1552511415525116e-06, + "loss": 0.0601, + "step": 8827 + }, + { + "epoch": 8.062100456621005, + "grad_norm": 7.554980754852295, + "learning_rate": 2.1542364282090313e-06, + "loss": 0.0267, + "step": 8828 + }, + { + "epoch": 8.063013698630137, + "grad_norm": 4.233997821807861, + "learning_rate": 2.1532217148655505e-06, + "loss": 0.018, + "step": 8829 + }, + { + "epoch": 8.06392694063927, + "grad_norm": 12.5092191696167, + "learning_rate": 2.1522070015220702e-06, + "loss": 0.0724, + "step": 8830 + }, + { + "epoch": 8.064840182648402, + "grad_norm": 18.351789474487305, + "learning_rate": 2.1511922881785895e-06, + "loss": 0.1115, + "step": 8831 + }, + { + "epoch": 8.065753424657535, + "grad_norm": 2.3441503047943115, + "learning_rate": 2.150177574835109e-06, + "loss": 0.0137, + "step": 8832 + }, + { + "epoch": 8.066666666666666, + "grad_norm": 0.33988773822784424, + "learning_rate": 2.149162861491629e-06, + "loss": 0.0025, + "step": 8833 + }, + { + "epoch": 8.0675799086758, + "grad_norm": 0.8787178993225098, + "learning_rate": 2.148148148148148e-06, + "loss": 0.0053, + "step": 8834 + }, + { + "epoch": 8.068493150684931, + "grad_norm": 207.1331329345703, + "learning_rate": 2.147133434804668e-06, + "loss": 0.5814, + "step": 8835 + }, + { + "epoch": 8.069406392694065, + "grad_norm": 17.80339813232422, + "learning_rate": 2.146118721461187e-06, + "loss": 0.136, + "step": 8836 + }, + { + "epoch": 8.070319634703196, + "grad_norm": 5.65878963470459, + "learning_rate": 2.145104008117707e-06, + "loss": 0.0248, + "step": 8837 + }, + { + "epoch": 8.07123287671233, + "grad_norm": 23.313758850097656, + "learning_rate": 2.1440892947742265e-06, + "loss": 0.2445, + "step": 8838 + }, + { + "epoch": 8.072146118721461, + "grad_norm": 96.5539779663086, + "learning_rate": 2.143074581430746e-06, + "loss": 1.3282, + "step": 8839 + }, + { + "epoch": 8.073059360730593, + "grad_norm": 0.99824059009552, + "learning_rate": 2.1420598680872655e-06, + "loss": 0.0064, + "step": 8840 + }, + { + "epoch": 8.073972602739726, + "grad_norm": 1.2346646785736084, + "learning_rate": 2.1410451547437847e-06, + "loss": 0.0035, + "step": 8841 + }, + { + "epoch": 8.074885844748858, + "grad_norm": 17.179277420043945, + "learning_rate": 2.1400304414003044e-06, + "loss": 0.125, + "step": 8842 + }, + { + "epoch": 8.075799086757991, + "grad_norm": 38.40990447998047, + "learning_rate": 2.139015728056824e-06, + "loss": 0.2832, + "step": 8843 + }, + { + "epoch": 8.076712328767123, + "grad_norm": 0.9270575642585754, + "learning_rate": 2.138001014713344e-06, + "loss": 0.0055, + "step": 8844 + }, + { + "epoch": 8.077625570776256, + "grad_norm": 0.28790268301963806, + "learning_rate": 2.1369863013698635e-06, + "loss": 0.0016, + "step": 8845 + }, + { + "epoch": 8.078538812785387, + "grad_norm": 0.06407959014177322, + "learning_rate": 2.1359715880263828e-06, + "loss": 0.0005, + "step": 8846 + }, + { + "epoch": 8.07945205479452, + "grad_norm": 0.5953637361526489, + "learning_rate": 2.134956874682902e-06, + "loss": 0.0047, + "step": 8847 + }, + { + "epoch": 8.080365296803652, + "grad_norm": 3.089890480041504, + "learning_rate": 2.1339421613394217e-06, + "loss": 0.0114, + "step": 8848 + }, + { + "epoch": 8.081278538812786, + "grad_norm": 0.08942537009716034, + "learning_rate": 2.1329274479959414e-06, + "loss": 0.0006, + "step": 8849 + }, + { + "epoch": 8.082191780821917, + "grad_norm": 0.5071535110473633, + "learning_rate": 2.131912734652461e-06, + "loss": 0.0038, + "step": 8850 + }, + { + "epoch": 8.08310502283105, + "grad_norm": 32.3765983581543, + "learning_rate": 2.1308980213089804e-06, + "loss": 0.1907, + "step": 8851 + }, + { + "epoch": 8.084018264840182, + "grad_norm": 4.933530330657959, + "learning_rate": 2.1298833079654997e-06, + "loss": 0.0232, + "step": 8852 + }, + { + "epoch": 8.084931506849315, + "grad_norm": 1.3473989963531494, + "learning_rate": 2.1288685946220193e-06, + "loss": 0.0101, + "step": 8853 + }, + { + "epoch": 8.085844748858447, + "grad_norm": 4.485136985778809, + "learning_rate": 2.127853881278539e-06, + "loss": 0.0279, + "step": 8854 + }, + { + "epoch": 8.08675799086758, + "grad_norm": 1.1129666566848755, + "learning_rate": 2.1268391679350587e-06, + "loss": 0.0053, + "step": 8855 + }, + { + "epoch": 8.087671232876712, + "grad_norm": 10.399182319641113, + "learning_rate": 2.125824454591578e-06, + "loss": 0.0711, + "step": 8856 + }, + { + "epoch": 8.088584474885845, + "grad_norm": 5.466529846191406, + "learning_rate": 2.1248097412480977e-06, + "loss": 0.0183, + "step": 8857 + }, + { + "epoch": 8.089497716894977, + "grad_norm": 0.41505059599876404, + "learning_rate": 2.123795027904617e-06, + "loss": 0.0025, + "step": 8858 + }, + { + "epoch": 8.09041095890411, + "grad_norm": 2.247438669204712, + "learning_rate": 2.1227803145611367e-06, + "loss": 0.0141, + "step": 8859 + }, + { + "epoch": 8.091324200913242, + "grad_norm": 40.016204833984375, + "learning_rate": 2.1217656012176563e-06, + "loss": 0.2844, + "step": 8860 + }, + { + "epoch": 8.092237442922375, + "grad_norm": 0.11211233586072922, + "learning_rate": 2.1207508878741756e-06, + "loss": 0.0005, + "step": 8861 + }, + { + "epoch": 8.093150684931507, + "grad_norm": 0.6125285625457764, + "learning_rate": 2.1197361745306953e-06, + "loss": 0.0033, + "step": 8862 + }, + { + "epoch": 8.09406392694064, + "grad_norm": 0.5339972972869873, + "learning_rate": 2.1187214611872146e-06, + "loss": 0.0028, + "step": 8863 + }, + { + "epoch": 8.094977168949772, + "grad_norm": 0.12598459422588348, + "learning_rate": 2.1177067478437343e-06, + "loss": 0.0007, + "step": 8864 + }, + { + "epoch": 8.095890410958905, + "grad_norm": 0.40364357829093933, + "learning_rate": 2.116692034500254e-06, + "loss": 0.0023, + "step": 8865 + }, + { + "epoch": 8.096803652968037, + "grad_norm": 4.920936107635498, + "learning_rate": 2.1156773211567732e-06, + "loss": 0.025, + "step": 8866 + }, + { + "epoch": 8.097716894977168, + "grad_norm": 0.9704781174659729, + "learning_rate": 2.114662607813293e-06, + "loss": 0.0052, + "step": 8867 + }, + { + "epoch": 8.098630136986301, + "grad_norm": 3.30547833442688, + "learning_rate": 2.1136478944698126e-06, + "loss": 0.017, + "step": 8868 + }, + { + "epoch": 8.099543378995433, + "grad_norm": 5.722532272338867, + "learning_rate": 2.112633181126332e-06, + "loss": 0.01, + "step": 8869 + }, + { + "epoch": 8.100456621004566, + "grad_norm": 0.2828029692173004, + "learning_rate": 2.1116184677828516e-06, + "loss": 0.0018, + "step": 8870 + }, + { + "epoch": 8.101369863013698, + "grad_norm": 23.935401916503906, + "learning_rate": 2.110603754439371e-06, + "loss": 0.1539, + "step": 8871 + }, + { + "epoch": 8.102283105022831, + "grad_norm": 100.06520080566406, + "learning_rate": 2.1095890410958905e-06, + "loss": 1.2116, + "step": 8872 + }, + { + "epoch": 8.103196347031963, + "grad_norm": 2.6080703735351562, + "learning_rate": 2.1085743277524102e-06, + "loss": 0.0197, + "step": 8873 + }, + { + "epoch": 8.104109589041096, + "grad_norm": 0.10459428280591965, + "learning_rate": 2.1075596144089295e-06, + "loss": 0.0008, + "step": 8874 + }, + { + "epoch": 8.105022831050228, + "grad_norm": 0.3032257556915283, + "learning_rate": 2.106544901065449e-06, + "loss": 0.0019, + "step": 8875 + }, + { + "epoch": 8.105936073059361, + "grad_norm": 6.532039165496826, + "learning_rate": 2.1055301877219685e-06, + "loss": 0.0365, + "step": 8876 + }, + { + "epoch": 8.106849315068493, + "grad_norm": 0.2401503622531891, + "learning_rate": 2.104515474378488e-06, + "loss": 0.0013, + "step": 8877 + }, + { + "epoch": 8.107762557077626, + "grad_norm": 21.247718811035156, + "learning_rate": 2.103500761035008e-06, + "loss": 0.0688, + "step": 8878 + }, + { + "epoch": 8.108675799086758, + "grad_norm": 36.924991607666016, + "learning_rate": 2.1024860476915275e-06, + "loss": 0.2953, + "step": 8879 + }, + { + "epoch": 8.10958904109589, + "grad_norm": 0.8937210440635681, + "learning_rate": 2.101471334348047e-06, + "loss": 0.0063, + "step": 8880 + }, + { + "epoch": 8.110502283105022, + "grad_norm": 18.89056396484375, + "learning_rate": 2.100456621004566e-06, + "loss": 0.0917, + "step": 8881 + }, + { + "epoch": 8.111415525114156, + "grad_norm": 0.015080679208040237, + "learning_rate": 2.0994419076610858e-06, + "loss": 0.0001, + "step": 8882 + }, + { + "epoch": 8.112328767123287, + "grad_norm": 3.212000608444214, + "learning_rate": 2.0984271943176054e-06, + "loss": 0.0203, + "step": 8883 + }, + { + "epoch": 8.11324200913242, + "grad_norm": 24.7502498626709, + "learning_rate": 2.097412480974125e-06, + "loss": 0.1391, + "step": 8884 + }, + { + "epoch": 8.114155251141552, + "grad_norm": 2.326789140701294, + "learning_rate": 2.0963977676306444e-06, + "loss": 0.0097, + "step": 8885 + }, + { + "epoch": 8.115068493150686, + "grad_norm": 0.12414665520191193, + "learning_rate": 2.095383054287164e-06, + "loss": 0.0008, + "step": 8886 + }, + { + "epoch": 8.115981735159817, + "grad_norm": 0.5623774528503418, + "learning_rate": 2.0943683409436834e-06, + "loss": 0.0035, + "step": 8887 + }, + { + "epoch": 8.11689497716895, + "grad_norm": 4.830626010894775, + "learning_rate": 2.093353627600203e-06, + "loss": 0.0275, + "step": 8888 + }, + { + "epoch": 8.117808219178082, + "grad_norm": 4.8048200607299805, + "learning_rate": 2.0923389142567228e-06, + "loss": 0.0423, + "step": 8889 + }, + { + "epoch": 8.118721461187215, + "grad_norm": 3.913370370864868, + "learning_rate": 2.0913242009132424e-06, + "loss": 0.0282, + "step": 8890 + }, + { + "epoch": 8.119634703196347, + "grad_norm": 1.929728388786316, + "learning_rate": 2.0903094875697617e-06, + "loss": 0.0101, + "step": 8891 + }, + { + "epoch": 8.12054794520548, + "grad_norm": 0.8613299131393433, + "learning_rate": 2.089294774226281e-06, + "loss": 0.0036, + "step": 8892 + }, + { + "epoch": 8.121461187214612, + "grad_norm": 25.31000518798828, + "learning_rate": 2.0882800608828007e-06, + "loss": 0.115, + "step": 8893 + }, + { + "epoch": 8.122374429223743, + "grad_norm": 1.9133318662643433, + "learning_rate": 2.0872653475393204e-06, + "loss": 0.0096, + "step": 8894 + }, + { + "epoch": 8.123287671232877, + "grad_norm": 2.826801061630249, + "learning_rate": 2.08625063419584e-06, + "loss": 0.0225, + "step": 8895 + }, + { + "epoch": 8.124200913242008, + "grad_norm": 7.424625396728516, + "learning_rate": 2.0852359208523593e-06, + "loss": 0.0386, + "step": 8896 + }, + { + "epoch": 8.125114155251142, + "grad_norm": 68.72169494628906, + "learning_rate": 2.084221207508879e-06, + "loss": 0.6674, + "step": 8897 + }, + { + "epoch": 8.126027397260273, + "grad_norm": 0.01422535814344883, + "learning_rate": 2.0832064941653983e-06, + "loss": 0.0001, + "step": 8898 + }, + { + "epoch": 8.126940639269407, + "grad_norm": 5.560054779052734, + "learning_rate": 2.082191780821918e-06, + "loss": 0.0264, + "step": 8899 + }, + { + "epoch": 8.127853881278538, + "grad_norm": 12.521990776062012, + "learning_rate": 2.0811770674784377e-06, + "loss": 0.0419, + "step": 8900 + }, + { + "epoch": 8.128767123287671, + "grad_norm": 19.324682235717773, + "learning_rate": 2.080162354134957e-06, + "loss": 0.1415, + "step": 8901 + }, + { + "epoch": 8.129680365296803, + "grad_norm": 0.4718891382217407, + "learning_rate": 2.0791476407914766e-06, + "loss": 0.0015, + "step": 8902 + }, + { + "epoch": 8.130593607305936, + "grad_norm": 2.9290711879730225, + "learning_rate": 2.078132927447996e-06, + "loss": 0.0224, + "step": 8903 + }, + { + "epoch": 8.131506849315068, + "grad_norm": 6.731039524078369, + "learning_rate": 2.0771182141045156e-06, + "loss": 0.0363, + "step": 8904 + }, + { + "epoch": 8.132420091324201, + "grad_norm": 0.0052512045949697495, + "learning_rate": 2.0761035007610353e-06, + "loss": 0.0, + "step": 8905 + }, + { + "epoch": 8.133333333333333, + "grad_norm": 13.034346580505371, + "learning_rate": 2.0750887874175546e-06, + "loss": 0.0899, + "step": 8906 + }, + { + "epoch": 8.134246575342466, + "grad_norm": 4.095391750335693, + "learning_rate": 2.0740740740740742e-06, + "loss": 0.0335, + "step": 8907 + }, + { + "epoch": 8.135159817351598, + "grad_norm": 0.02513493038713932, + "learning_rate": 2.073059360730594e-06, + "loss": 0.0001, + "step": 8908 + }, + { + "epoch": 8.136073059360731, + "grad_norm": 47.40768051147461, + "learning_rate": 2.072044647387113e-06, + "loss": 0.2818, + "step": 8909 + }, + { + "epoch": 8.136986301369863, + "grad_norm": 1.2370944023132324, + "learning_rate": 2.071029934043633e-06, + "loss": 0.0069, + "step": 8910 + }, + { + "epoch": 8.137899543378996, + "grad_norm": 9.896203994750977, + "learning_rate": 2.070015220700152e-06, + "loss": 0.0485, + "step": 8911 + }, + { + "epoch": 8.138812785388128, + "grad_norm": 0.07282542437314987, + "learning_rate": 2.069000507356672e-06, + "loss": 0.0005, + "step": 8912 + }, + { + "epoch": 8.139726027397261, + "grad_norm": 0.9747841954231262, + "learning_rate": 2.0679857940131916e-06, + "loss": 0.0068, + "step": 8913 + }, + { + "epoch": 8.140639269406392, + "grad_norm": 14.280805587768555, + "learning_rate": 2.066971080669711e-06, + "loss": 0.0759, + "step": 8914 + }, + { + "epoch": 8.141552511415526, + "grad_norm": 1.7903120517730713, + "learning_rate": 2.0659563673262305e-06, + "loss": 0.0077, + "step": 8915 + }, + { + "epoch": 8.142465753424657, + "grad_norm": 0.08985552191734314, + "learning_rate": 2.0649416539827498e-06, + "loss": 0.0005, + "step": 8916 + }, + { + "epoch": 8.14337899543379, + "grad_norm": 27.537445068359375, + "learning_rate": 2.0639269406392695e-06, + "loss": 0.2756, + "step": 8917 + }, + { + "epoch": 8.144292237442922, + "grad_norm": 3.7711191177368164, + "learning_rate": 2.062912227295789e-06, + "loss": 0.0252, + "step": 8918 + }, + { + "epoch": 8.145205479452056, + "grad_norm": 3.595304012298584, + "learning_rate": 2.061897513952309e-06, + "loss": 0.01, + "step": 8919 + }, + { + "epoch": 8.146118721461187, + "grad_norm": 50.98616409301758, + "learning_rate": 2.060882800608828e-06, + "loss": 0.3527, + "step": 8920 + }, + { + "epoch": 8.147031963470319, + "grad_norm": 7.480936050415039, + "learning_rate": 2.0598680872653474e-06, + "loss": 0.0246, + "step": 8921 + }, + { + "epoch": 8.147945205479452, + "grad_norm": 140.24569702148438, + "learning_rate": 2.058853373921867e-06, + "loss": 1.0084, + "step": 8922 + }, + { + "epoch": 8.148858447488584, + "grad_norm": 32.59166717529297, + "learning_rate": 2.0578386605783868e-06, + "loss": 0.1507, + "step": 8923 + }, + { + "epoch": 8.149771689497717, + "grad_norm": 128.32595825195312, + "learning_rate": 2.0568239472349065e-06, + "loss": 2.3464, + "step": 8924 + }, + { + "epoch": 8.150684931506849, + "grad_norm": 1.4567404985427856, + "learning_rate": 2.0558092338914257e-06, + "loss": 0.0067, + "step": 8925 + }, + { + "epoch": 8.151598173515982, + "grad_norm": 3.488251209259033, + "learning_rate": 2.0547945205479454e-06, + "loss": 0.0169, + "step": 8926 + }, + { + "epoch": 8.152511415525113, + "grad_norm": 0.01763715222477913, + "learning_rate": 2.0537798072044647e-06, + "loss": 0.0001, + "step": 8927 + }, + { + "epoch": 8.153424657534247, + "grad_norm": 43.487342834472656, + "learning_rate": 2.0527650938609844e-06, + "loss": 0.2449, + "step": 8928 + }, + { + "epoch": 8.154337899543378, + "grad_norm": 0.47127285599708557, + "learning_rate": 2.051750380517504e-06, + "loss": 0.0034, + "step": 8929 + }, + { + "epoch": 8.155251141552512, + "grad_norm": 67.93960571289062, + "learning_rate": 2.0507356671740238e-06, + "loss": 0.8094, + "step": 8930 + }, + { + "epoch": 8.156164383561643, + "grad_norm": 1.255710482597351, + "learning_rate": 2.049720953830543e-06, + "loss": 0.0071, + "step": 8931 + }, + { + "epoch": 8.157077625570777, + "grad_norm": 0.39842724800109863, + "learning_rate": 2.0487062404870623e-06, + "loss": 0.0024, + "step": 8932 + }, + { + "epoch": 8.157990867579908, + "grad_norm": 0.7029629349708557, + "learning_rate": 2.047691527143582e-06, + "loss": 0.0046, + "step": 8933 + }, + { + "epoch": 8.158904109589042, + "grad_norm": 41.96156311035156, + "learning_rate": 2.0466768138001017e-06, + "loss": 0.3269, + "step": 8934 + }, + { + "epoch": 8.159817351598173, + "grad_norm": 0.14321009814739227, + "learning_rate": 2.0456621004566214e-06, + "loss": 0.0006, + "step": 8935 + }, + { + "epoch": 8.160730593607306, + "grad_norm": 6.78011417388916, + "learning_rate": 2.0446473871131407e-06, + "loss": 0.0471, + "step": 8936 + }, + { + "epoch": 8.161643835616438, + "grad_norm": 5.515163898468018, + "learning_rate": 2.0436326737696604e-06, + "loss": 0.0295, + "step": 8937 + }, + { + "epoch": 8.162557077625571, + "grad_norm": 23.982099533081055, + "learning_rate": 2.0426179604261796e-06, + "loss": 0.1483, + "step": 8938 + }, + { + "epoch": 8.163470319634703, + "grad_norm": 27.411392211914062, + "learning_rate": 2.0416032470826993e-06, + "loss": 0.2413, + "step": 8939 + }, + { + "epoch": 8.164383561643836, + "grad_norm": 3.329468011856079, + "learning_rate": 2.040588533739219e-06, + "loss": 0.0244, + "step": 8940 + }, + { + "epoch": 8.165296803652968, + "grad_norm": 8.67014217376709, + "learning_rate": 2.0395738203957383e-06, + "loss": 0.0626, + "step": 8941 + }, + { + "epoch": 8.166210045662101, + "grad_norm": 3.535100221633911, + "learning_rate": 2.038559107052258e-06, + "loss": 0.023, + "step": 8942 + }, + { + "epoch": 8.167123287671233, + "grad_norm": 0.5930272340774536, + "learning_rate": 2.0375443937087772e-06, + "loss": 0.0045, + "step": 8943 + }, + { + "epoch": 8.168036529680366, + "grad_norm": 2.22825026512146, + "learning_rate": 2.036529680365297e-06, + "loss": 0.0127, + "step": 8944 + }, + { + "epoch": 8.168949771689498, + "grad_norm": 0.19498412311077118, + "learning_rate": 2.0355149670218166e-06, + "loss": 0.0006, + "step": 8945 + }, + { + "epoch": 8.169863013698631, + "grad_norm": 17.264482498168945, + "learning_rate": 2.034500253678336e-06, + "loss": 0.1191, + "step": 8946 + }, + { + "epoch": 8.170776255707763, + "grad_norm": 19.273815155029297, + "learning_rate": 2.0334855403348556e-06, + "loss": 0.1068, + "step": 8947 + }, + { + "epoch": 8.171689497716894, + "grad_norm": 0.15405942499637604, + "learning_rate": 2.0324708269913753e-06, + "loss": 0.001, + "step": 8948 + }, + { + "epoch": 8.172602739726027, + "grad_norm": 0.1943155825138092, + "learning_rate": 2.0314561136478945e-06, + "loss": 0.0014, + "step": 8949 + }, + { + "epoch": 8.173515981735159, + "grad_norm": 0.3688557744026184, + "learning_rate": 2.0304414003044142e-06, + "loss": 0.0019, + "step": 8950 + }, + { + "epoch": 8.174429223744292, + "grad_norm": 0.023058533668518066, + "learning_rate": 2.0294266869609335e-06, + "loss": 0.0002, + "step": 8951 + }, + { + "epoch": 8.175342465753424, + "grad_norm": 0.45826369524002075, + "learning_rate": 2.028411973617453e-06, + "loss": 0.0026, + "step": 8952 + }, + { + "epoch": 8.176255707762557, + "grad_norm": 2.2219467163085938, + "learning_rate": 2.027397260273973e-06, + "loss": 0.0197, + "step": 8953 + }, + { + "epoch": 8.177168949771689, + "grad_norm": 1.5661216974258423, + "learning_rate": 2.026382546930492e-06, + "loss": 0.0087, + "step": 8954 + }, + { + "epoch": 8.178082191780822, + "grad_norm": 0.5831947326660156, + "learning_rate": 2.025367833587012e-06, + "loss": 0.0029, + "step": 8955 + }, + { + "epoch": 8.178995433789954, + "grad_norm": 9.79152774810791, + "learning_rate": 2.024353120243531e-06, + "loss": 0.0541, + "step": 8956 + }, + { + "epoch": 8.179908675799087, + "grad_norm": 0.3646579086780548, + "learning_rate": 2.023338406900051e-06, + "loss": 0.0021, + "step": 8957 + }, + { + "epoch": 8.180821917808219, + "grad_norm": 0.6019629240036011, + "learning_rate": 2.0223236935565705e-06, + "loss": 0.0034, + "step": 8958 + }, + { + "epoch": 8.181735159817352, + "grad_norm": 0.7554134726524353, + "learning_rate": 2.02130898021309e-06, + "loss": 0.0035, + "step": 8959 + }, + { + "epoch": 8.182648401826484, + "grad_norm": 5.386643409729004, + "learning_rate": 2.0202942668696095e-06, + "loss": 0.0319, + "step": 8960 + }, + { + "epoch": 8.183561643835617, + "grad_norm": 0.10178887099027634, + "learning_rate": 2.0192795535261287e-06, + "loss": 0.0008, + "step": 8961 + }, + { + "epoch": 8.184474885844748, + "grad_norm": 0.6312891840934753, + "learning_rate": 2.0182648401826484e-06, + "loss": 0.0029, + "step": 8962 + }, + { + "epoch": 8.185388127853882, + "grad_norm": 9.514449119567871, + "learning_rate": 2.017250126839168e-06, + "loss": 0.0558, + "step": 8963 + }, + { + "epoch": 8.186301369863013, + "grad_norm": 0.23207490146160126, + "learning_rate": 2.016235413495688e-06, + "loss": 0.0012, + "step": 8964 + }, + { + "epoch": 8.187214611872147, + "grad_norm": 1.3232088088989258, + "learning_rate": 2.015220700152207e-06, + "loss": 0.0111, + "step": 8965 + }, + { + "epoch": 8.188127853881278, + "grad_norm": 0.6054887175559998, + "learning_rate": 2.0142059868087268e-06, + "loss": 0.0033, + "step": 8966 + }, + { + "epoch": 8.189041095890412, + "grad_norm": 68.08336639404297, + "learning_rate": 2.013191273465246e-06, + "loss": 0.4042, + "step": 8967 + }, + { + "epoch": 8.189954337899543, + "grad_norm": 0.06798097491264343, + "learning_rate": 2.0121765601217657e-06, + "loss": 0.0004, + "step": 8968 + }, + { + "epoch": 8.190867579908677, + "grad_norm": 0.7282757759094238, + "learning_rate": 2.0111618467782854e-06, + "loss": 0.0029, + "step": 8969 + }, + { + "epoch": 8.191780821917808, + "grad_norm": 0.6768425703048706, + "learning_rate": 2.010147133434805e-06, + "loss": 0.0055, + "step": 8970 + }, + { + "epoch": 8.192694063926941, + "grad_norm": 0.02031186781823635, + "learning_rate": 2.0091324200913244e-06, + "loss": 0.0002, + "step": 8971 + }, + { + "epoch": 8.193607305936073, + "grad_norm": 39.90622329711914, + "learning_rate": 2.0081177067478436e-06, + "loss": 0.2531, + "step": 8972 + }, + { + "epoch": 8.194520547945206, + "grad_norm": 18.474750518798828, + "learning_rate": 2.0071029934043633e-06, + "loss": 0.0691, + "step": 8973 + }, + { + "epoch": 8.195433789954338, + "grad_norm": 0.627252459526062, + "learning_rate": 2.006088280060883e-06, + "loss": 0.0032, + "step": 8974 + }, + { + "epoch": 8.19634703196347, + "grad_norm": 0.1914719045162201, + "learning_rate": 2.0050735667174027e-06, + "loss": 0.0013, + "step": 8975 + }, + { + "epoch": 8.197260273972603, + "grad_norm": 65.29568481445312, + "learning_rate": 2.004058853373922e-06, + "loss": 0.436, + "step": 8976 + }, + { + "epoch": 8.198173515981734, + "grad_norm": 8.126344680786133, + "learning_rate": 2.0030441400304417e-06, + "loss": 0.0318, + "step": 8977 + }, + { + "epoch": 8.199086757990868, + "grad_norm": 0.5087704062461853, + "learning_rate": 2.002029426686961e-06, + "loss": 0.0029, + "step": 8978 + }, + { + "epoch": 8.2, + "grad_norm": 2.5590567588806152, + "learning_rate": 2.0010147133434806e-06, + "loss": 0.0168, + "step": 8979 + }, + { + "epoch": 8.200913242009133, + "grad_norm": 2.780543088912964, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0124, + "step": 8980 + }, + { + "epoch": 8.201826484018264, + "grad_norm": 0.7421752214431763, + "learning_rate": 1.9989852866565196e-06, + "loss": 0.0046, + "step": 8981 + }, + { + "epoch": 8.202739726027398, + "grad_norm": 0.17012357711791992, + "learning_rate": 1.9979705733130393e-06, + "loss": 0.0009, + "step": 8982 + }, + { + "epoch": 8.203652968036529, + "grad_norm": 1.0315403938293457, + "learning_rate": 1.9969558599695586e-06, + "loss": 0.0075, + "step": 8983 + }, + { + "epoch": 8.204566210045662, + "grad_norm": 38.482688903808594, + "learning_rate": 1.9959411466260783e-06, + "loss": 0.2593, + "step": 8984 + }, + { + "epoch": 8.205479452054794, + "grad_norm": 1.6681146621704102, + "learning_rate": 1.994926433282598e-06, + "loss": 0.0107, + "step": 8985 + }, + { + "epoch": 8.206392694063927, + "grad_norm": 68.14253234863281, + "learning_rate": 1.9939117199391172e-06, + "loss": 0.5256, + "step": 8986 + }, + { + "epoch": 8.207305936073059, + "grad_norm": 2.7104313373565674, + "learning_rate": 1.992897006595637e-06, + "loss": 0.0123, + "step": 8987 + }, + { + "epoch": 8.208219178082192, + "grad_norm": 4.254960060119629, + "learning_rate": 1.9918822932521566e-06, + "loss": 0.0399, + "step": 8988 + }, + { + "epoch": 8.209132420091324, + "grad_norm": 1.0002021789550781, + "learning_rate": 1.990867579908676e-06, + "loss": 0.0038, + "step": 8989 + }, + { + "epoch": 8.210045662100457, + "grad_norm": 1.4030755758285522, + "learning_rate": 1.9898528665651956e-06, + "loss": 0.011, + "step": 8990 + }, + { + "epoch": 8.210958904109589, + "grad_norm": 8.307805061340332, + "learning_rate": 1.988838153221715e-06, + "loss": 0.0429, + "step": 8991 + }, + { + "epoch": 8.211872146118722, + "grad_norm": 0.31813308596611023, + "learning_rate": 1.9878234398782345e-06, + "loss": 0.0025, + "step": 8992 + }, + { + "epoch": 8.212785388127854, + "grad_norm": 106.22728729248047, + "learning_rate": 1.9868087265347542e-06, + "loss": 1.4628, + "step": 8993 + }, + { + "epoch": 8.213698630136987, + "grad_norm": 1.9646512269973755, + "learning_rate": 1.9857940131912735e-06, + "loss": 0.0115, + "step": 8994 + }, + { + "epoch": 8.214611872146119, + "grad_norm": 0.2463911920785904, + "learning_rate": 1.984779299847793e-06, + "loss": 0.0015, + "step": 8995 + }, + { + "epoch": 8.215525114155252, + "grad_norm": 0.023868681862950325, + "learning_rate": 1.9837645865043124e-06, + "loss": 0.0001, + "step": 8996 + }, + { + "epoch": 8.216438356164383, + "grad_norm": 0.021297810599207878, + "learning_rate": 1.982749873160832e-06, + "loss": 0.0001, + "step": 8997 + }, + { + "epoch": 8.217351598173517, + "grad_norm": 0.24236644804477692, + "learning_rate": 1.981735159817352e-06, + "loss": 0.0016, + "step": 8998 + }, + { + "epoch": 8.218264840182648, + "grad_norm": 0.10627233237028122, + "learning_rate": 1.9807204464738715e-06, + "loss": 0.0008, + "step": 8999 + }, + { + "epoch": 8.219178082191782, + "grad_norm": 6.872030258178711, + "learning_rate": 1.9797057331303908e-06, + "loss": 0.0446, + "step": 9000 + }, + { + "epoch": 8.220091324200913, + "grad_norm": 94.00840759277344, + "learning_rate": 1.97869101978691e-06, + "loss": 0.8247, + "step": 9001 + }, + { + "epoch": 8.221004566210045, + "grad_norm": 5.446061611175537, + "learning_rate": 1.9776763064434297e-06, + "loss": 0.0095, + "step": 9002 + }, + { + "epoch": 8.221917808219178, + "grad_norm": 0.7045621871948242, + "learning_rate": 1.9766615930999494e-06, + "loss": 0.0055, + "step": 9003 + }, + { + "epoch": 8.22283105022831, + "grad_norm": 1.2119853496551514, + "learning_rate": 1.975646879756469e-06, + "loss": 0.0091, + "step": 9004 + }, + { + "epoch": 8.223744292237443, + "grad_norm": 0.9966086149215698, + "learning_rate": 1.9746321664129884e-06, + "loss": 0.0046, + "step": 9005 + }, + { + "epoch": 8.224657534246575, + "grad_norm": 14.883482933044434, + "learning_rate": 1.973617453069508e-06, + "loss": 0.1982, + "step": 9006 + }, + { + "epoch": 8.225570776255708, + "grad_norm": 0.8051097989082336, + "learning_rate": 1.9726027397260274e-06, + "loss": 0.0036, + "step": 9007 + }, + { + "epoch": 8.22648401826484, + "grad_norm": 35.58965301513672, + "learning_rate": 1.971588026382547e-06, + "loss": 0.2497, + "step": 9008 + }, + { + "epoch": 8.227397260273973, + "grad_norm": 30.00749969482422, + "learning_rate": 1.9705733130390667e-06, + "loss": 0.2351, + "step": 9009 + }, + { + "epoch": 8.228310502283104, + "grad_norm": 1.7727490663528442, + "learning_rate": 1.9695585996955864e-06, + "loss": 0.0075, + "step": 9010 + }, + { + "epoch": 8.229223744292238, + "grad_norm": 7.566439151763916, + "learning_rate": 1.9685438863521057e-06, + "loss": 0.0444, + "step": 9011 + }, + { + "epoch": 8.23013698630137, + "grad_norm": 20.562820434570312, + "learning_rate": 1.967529173008625e-06, + "loss": 0.1269, + "step": 9012 + }, + { + "epoch": 8.231050228310503, + "grad_norm": 10.270395278930664, + "learning_rate": 1.9665144596651447e-06, + "loss": 0.0613, + "step": 9013 + }, + { + "epoch": 8.231963470319634, + "grad_norm": 18.897869110107422, + "learning_rate": 1.9654997463216644e-06, + "loss": 0.1073, + "step": 9014 + }, + { + "epoch": 8.232876712328768, + "grad_norm": 21.963178634643555, + "learning_rate": 1.964485032978184e-06, + "loss": 0.1531, + "step": 9015 + }, + { + "epoch": 8.2337899543379, + "grad_norm": 1.450792908668518, + "learning_rate": 1.9634703196347033e-06, + "loss": 0.0077, + "step": 9016 + }, + { + "epoch": 8.234703196347033, + "grad_norm": 0.2179163694381714, + "learning_rate": 1.962455606291223e-06, + "loss": 0.0009, + "step": 9017 + }, + { + "epoch": 8.235616438356164, + "grad_norm": 4.478318691253662, + "learning_rate": 1.9614408929477423e-06, + "loss": 0.0289, + "step": 9018 + }, + { + "epoch": 8.236529680365297, + "grad_norm": 21.143075942993164, + "learning_rate": 1.960426179604262e-06, + "loss": 0.1654, + "step": 9019 + }, + { + "epoch": 8.237442922374429, + "grad_norm": 15.900199890136719, + "learning_rate": 1.9594114662607817e-06, + "loss": 0.0794, + "step": 9020 + }, + { + "epoch": 8.238356164383562, + "grad_norm": 0.01745646446943283, + "learning_rate": 1.958396752917301e-06, + "loss": 0.0001, + "step": 9021 + }, + { + "epoch": 8.239269406392694, + "grad_norm": 0.0922788679599762, + "learning_rate": 1.9573820395738206e-06, + "loss": 0.0007, + "step": 9022 + }, + { + "epoch": 8.240182648401827, + "grad_norm": 1.3661335706710815, + "learning_rate": 1.95636732623034e-06, + "loss": 0.0109, + "step": 9023 + }, + { + "epoch": 8.241095890410959, + "grad_norm": 0.12553593516349792, + "learning_rate": 1.9553526128868596e-06, + "loss": 0.0008, + "step": 9024 + }, + { + "epoch": 8.242009132420092, + "grad_norm": 12.63357925415039, + "learning_rate": 1.9543378995433793e-06, + "loss": 0.0601, + "step": 9025 + }, + { + "epoch": 8.242922374429224, + "grad_norm": 2.8088502883911133, + "learning_rate": 1.9533231861998985e-06, + "loss": 0.0216, + "step": 9026 + }, + { + "epoch": 8.243835616438357, + "grad_norm": 28.647716522216797, + "learning_rate": 1.9523084728564182e-06, + "loss": 0.1397, + "step": 9027 + }, + { + "epoch": 8.244748858447489, + "grad_norm": 0.15716552734375, + "learning_rate": 1.951293759512938e-06, + "loss": 0.0006, + "step": 9028 + }, + { + "epoch": 8.24566210045662, + "grad_norm": 5.872570514678955, + "learning_rate": 1.950279046169457e-06, + "loss": 0.0343, + "step": 9029 + }, + { + "epoch": 8.246575342465754, + "grad_norm": 2.014399290084839, + "learning_rate": 1.949264332825977e-06, + "loss": 0.0089, + "step": 9030 + }, + { + "epoch": 8.247488584474885, + "grad_norm": 5.887149810791016, + "learning_rate": 1.948249619482496e-06, + "loss": 0.0458, + "step": 9031 + }, + { + "epoch": 8.248401826484018, + "grad_norm": 0.10935140401124954, + "learning_rate": 1.947234906139016e-06, + "loss": 0.0007, + "step": 9032 + }, + { + "epoch": 8.24931506849315, + "grad_norm": 0.14964932203292847, + "learning_rate": 1.9462201927955355e-06, + "loss": 0.0012, + "step": 9033 + }, + { + "epoch": 8.250228310502283, + "grad_norm": 8.065169334411621, + "learning_rate": 1.945205479452055e-06, + "loss": 0.051, + "step": 9034 + }, + { + "epoch": 8.251141552511415, + "grad_norm": 0.22322389483451843, + "learning_rate": 1.9441907661085745e-06, + "loss": 0.0009, + "step": 9035 + }, + { + "epoch": 8.252054794520548, + "grad_norm": 1.3298604488372803, + "learning_rate": 1.9431760527650938e-06, + "loss": 0.0102, + "step": 9036 + }, + { + "epoch": 8.25296803652968, + "grad_norm": 12.623518943786621, + "learning_rate": 1.9421613394216135e-06, + "loss": 0.0969, + "step": 9037 + }, + { + "epoch": 8.253881278538813, + "grad_norm": 8.943318367004395, + "learning_rate": 1.941146626078133e-06, + "loss": 0.0369, + "step": 9038 + }, + { + "epoch": 8.254794520547945, + "grad_norm": 2.545551061630249, + "learning_rate": 1.940131912734653e-06, + "loss": 0.0121, + "step": 9039 + }, + { + "epoch": 8.255707762557078, + "grad_norm": 0.9130836725234985, + "learning_rate": 1.939117199391172e-06, + "loss": 0.0042, + "step": 9040 + }, + { + "epoch": 8.25662100456621, + "grad_norm": 1.14435875415802, + "learning_rate": 1.9381024860476914e-06, + "loss": 0.008, + "step": 9041 + }, + { + "epoch": 8.257534246575343, + "grad_norm": 1.4312351942062378, + "learning_rate": 1.937087772704211e-06, + "loss": 0.0083, + "step": 9042 + }, + { + "epoch": 8.258447488584475, + "grad_norm": 4.56403112411499, + "learning_rate": 1.9360730593607308e-06, + "loss": 0.0322, + "step": 9043 + }, + { + "epoch": 8.259360730593608, + "grad_norm": 3.068326711654663, + "learning_rate": 1.9350583460172505e-06, + "loss": 0.0199, + "step": 9044 + }, + { + "epoch": 8.26027397260274, + "grad_norm": 0.39115768671035767, + "learning_rate": 1.9340436326737697e-06, + "loss": 0.0018, + "step": 9045 + }, + { + "epoch": 8.261187214611873, + "grad_norm": 2.742706537246704, + "learning_rate": 1.9330289193302894e-06, + "loss": 0.0137, + "step": 9046 + }, + { + "epoch": 8.262100456621004, + "grad_norm": 0.9696947336196899, + "learning_rate": 1.9320142059868087e-06, + "loss": 0.0057, + "step": 9047 + }, + { + "epoch": 8.263013698630138, + "grad_norm": 0.35553357005119324, + "learning_rate": 1.9309994926433284e-06, + "loss": 0.002, + "step": 9048 + }, + { + "epoch": 8.26392694063927, + "grad_norm": 1.5074247121810913, + "learning_rate": 1.929984779299848e-06, + "loss": 0.0076, + "step": 9049 + }, + { + "epoch": 8.264840182648403, + "grad_norm": 0.10910718142986298, + "learning_rate": 1.9289700659563678e-06, + "loss": 0.0008, + "step": 9050 + }, + { + "epoch": 8.265753424657534, + "grad_norm": 0.05812252685427666, + "learning_rate": 1.927955352612887e-06, + "loss": 0.0003, + "step": 9051 + }, + { + "epoch": 8.266666666666667, + "grad_norm": 2.6327788829803467, + "learning_rate": 1.9269406392694063e-06, + "loss": 0.0149, + "step": 9052 + }, + { + "epoch": 8.267579908675799, + "grad_norm": 1.0132648944854736, + "learning_rate": 1.925925925925926e-06, + "loss": 0.0038, + "step": 9053 + }, + { + "epoch": 8.268493150684932, + "grad_norm": 31.892553329467773, + "learning_rate": 1.9249112125824457e-06, + "loss": 0.1822, + "step": 9054 + }, + { + "epoch": 8.269406392694064, + "grad_norm": 3.5543556213378906, + "learning_rate": 1.9238964992389654e-06, + "loss": 0.0196, + "step": 9055 + }, + { + "epoch": 8.270319634703196, + "grad_norm": 2.1575021743774414, + "learning_rate": 1.9228817858954846e-06, + "loss": 0.0114, + "step": 9056 + }, + { + "epoch": 8.271232876712329, + "grad_norm": 111.21676635742188, + "learning_rate": 1.9218670725520043e-06, + "loss": 0.903, + "step": 9057 + }, + { + "epoch": 8.27214611872146, + "grad_norm": 2.267484664916992, + "learning_rate": 1.9208523592085236e-06, + "loss": 0.0129, + "step": 9058 + }, + { + "epoch": 8.273059360730594, + "grad_norm": 0.6316254734992981, + "learning_rate": 1.9198376458650433e-06, + "loss": 0.0037, + "step": 9059 + }, + { + "epoch": 8.273972602739725, + "grad_norm": 0.6479966044425964, + "learning_rate": 1.918822932521563e-06, + "loss": 0.0035, + "step": 9060 + }, + { + "epoch": 8.274885844748859, + "grad_norm": 0.03703915700316429, + "learning_rate": 1.9178082191780823e-06, + "loss": 0.0003, + "step": 9061 + }, + { + "epoch": 8.27579908675799, + "grad_norm": 9.749112129211426, + "learning_rate": 1.916793505834602e-06, + "loss": 0.0767, + "step": 9062 + }, + { + "epoch": 8.276712328767124, + "grad_norm": 0.17342419922351837, + "learning_rate": 1.9157787924911212e-06, + "loss": 0.0007, + "step": 9063 + }, + { + "epoch": 8.277625570776255, + "grad_norm": 0.07138998806476593, + "learning_rate": 1.914764079147641e-06, + "loss": 0.0004, + "step": 9064 + }, + { + "epoch": 8.278538812785389, + "grad_norm": 0.3797382712364197, + "learning_rate": 1.9137493658041606e-06, + "loss": 0.0019, + "step": 9065 + }, + { + "epoch": 8.27945205479452, + "grad_norm": 0.04136704280972481, + "learning_rate": 1.91273465246068e-06, + "loss": 0.0003, + "step": 9066 + }, + { + "epoch": 8.280365296803653, + "grad_norm": 13.020991325378418, + "learning_rate": 1.9117199391171996e-06, + "loss": 0.1035, + "step": 9067 + }, + { + "epoch": 8.281278538812785, + "grad_norm": 0.3236444592475891, + "learning_rate": 1.9107052257737193e-06, + "loss": 0.0026, + "step": 9068 + }, + { + "epoch": 8.282191780821918, + "grad_norm": 1.1140791177749634, + "learning_rate": 1.9096905124302385e-06, + "loss": 0.0084, + "step": 9069 + }, + { + "epoch": 8.28310502283105, + "grad_norm": 44.92032241821289, + "learning_rate": 1.9086757990867582e-06, + "loss": 0.4819, + "step": 9070 + }, + { + "epoch": 8.284018264840183, + "grad_norm": 2.848097324371338, + "learning_rate": 1.9076610857432775e-06, + "loss": 0.0219, + "step": 9071 + }, + { + "epoch": 8.284931506849315, + "grad_norm": 30.337575912475586, + "learning_rate": 1.9066463723997972e-06, + "loss": 0.2383, + "step": 9072 + }, + { + "epoch": 8.285844748858448, + "grad_norm": 0.0732942447066307, + "learning_rate": 1.9056316590563167e-06, + "loss": 0.0003, + "step": 9073 + }, + { + "epoch": 8.28675799086758, + "grad_norm": 1.8048911094665527, + "learning_rate": 1.9046169457128364e-06, + "loss": 0.0078, + "step": 9074 + }, + { + "epoch": 8.287671232876713, + "grad_norm": 1.6256842613220215, + "learning_rate": 1.9036022323693558e-06, + "loss": 0.0088, + "step": 9075 + }, + { + "epoch": 8.288584474885845, + "grad_norm": 80.11865997314453, + "learning_rate": 1.9025875190258753e-06, + "loss": 0.7699, + "step": 9076 + }, + { + "epoch": 8.289497716894978, + "grad_norm": 2.226161241531372, + "learning_rate": 1.9015728056823948e-06, + "loss": 0.0113, + "step": 9077 + }, + { + "epoch": 8.29041095890411, + "grad_norm": 3.5190250873565674, + "learning_rate": 1.9005580923389145e-06, + "loss": 0.0168, + "step": 9078 + }, + { + "epoch": 8.291324200913243, + "grad_norm": 3.4593186378479004, + "learning_rate": 1.899543378995434e-06, + "loss": 0.023, + "step": 9079 + }, + { + "epoch": 8.292237442922374, + "grad_norm": 1.3128280639648438, + "learning_rate": 1.8985286656519537e-06, + "loss": 0.0079, + "step": 9080 + }, + { + "epoch": 8.293150684931508, + "grad_norm": 0.23343096673488617, + "learning_rate": 1.897513952308473e-06, + "loss": 0.0011, + "step": 9081 + }, + { + "epoch": 8.29406392694064, + "grad_norm": 6.576775074005127, + "learning_rate": 1.8964992389649924e-06, + "loss": 0.029, + "step": 9082 + }, + { + "epoch": 8.29497716894977, + "grad_norm": 25.927568435668945, + "learning_rate": 1.895484525621512e-06, + "loss": 0.1951, + "step": 9083 + }, + { + "epoch": 8.295890410958904, + "grad_norm": 6.118603706359863, + "learning_rate": 1.8944698122780316e-06, + "loss": 0.0254, + "step": 9084 + }, + { + "epoch": 8.296803652968036, + "grad_norm": 48.94851303100586, + "learning_rate": 1.8934550989345513e-06, + "loss": 0.3066, + "step": 9085 + }, + { + "epoch": 8.29771689497717, + "grad_norm": 1.2871012687683105, + "learning_rate": 1.8924403855910705e-06, + "loss": 0.0082, + "step": 9086 + }, + { + "epoch": 8.2986301369863, + "grad_norm": 15.847304344177246, + "learning_rate": 1.8914256722475902e-06, + "loss": 0.0715, + "step": 9087 + }, + { + "epoch": 8.299543378995434, + "grad_norm": 0.7166712880134583, + "learning_rate": 1.8904109589041097e-06, + "loss": 0.0046, + "step": 9088 + }, + { + "epoch": 8.300456621004566, + "grad_norm": 1.6546088457107544, + "learning_rate": 1.8893962455606294e-06, + "loss": 0.0127, + "step": 9089 + }, + { + "epoch": 8.301369863013699, + "grad_norm": 0.191332146525383, + "learning_rate": 1.8883815322171489e-06, + "loss": 0.0012, + "step": 9090 + }, + { + "epoch": 8.30228310502283, + "grad_norm": 0.10630188882350922, + "learning_rate": 1.8873668188736682e-06, + "loss": 0.0006, + "step": 9091 + }, + { + "epoch": 8.303196347031964, + "grad_norm": 24.008054733276367, + "learning_rate": 1.8863521055301878e-06, + "loss": 0.0302, + "step": 9092 + }, + { + "epoch": 8.304109589041095, + "grad_norm": 0.8083822727203369, + "learning_rate": 1.8853373921867073e-06, + "loss": 0.0046, + "step": 9093 + }, + { + "epoch": 8.305022831050229, + "grad_norm": 1.5756399631500244, + "learning_rate": 1.884322678843227e-06, + "loss": 0.0068, + "step": 9094 + }, + { + "epoch": 8.30593607305936, + "grad_norm": 0.4719529151916504, + "learning_rate": 1.8833079654997465e-06, + "loss": 0.0022, + "step": 9095 + }, + { + "epoch": 8.306849315068494, + "grad_norm": 0.20426782965660095, + "learning_rate": 1.882293252156266e-06, + "loss": 0.0009, + "step": 9096 + }, + { + "epoch": 8.307762557077625, + "grad_norm": 0.24765905737876892, + "learning_rate": 1.8812785388127855e-06, + "loss": 0.0014, + "step": 9097 + }, + { + "epoch": 8.308675799086759, + "grad_norm": 3.6258692741394043, + "learning_rate": 1.8802638254693052e-06, + "loss": 0.0259, + "step": 9098 + }, + { + "epoch": 8.30958904109589, + "grad_norm": 2.2281768321990967, + "learning_rate": 1.8792491121258246e-06, + "loss": 0.0113, + "step": 9099 + }, + { + "epoch": 8.310502283105023, + "grad_norm": 13.39547061920166, + "learning_rate": 1.8782343987823443e-06, + "loss": 0.0584, + "step": 9100 + }, + { + "epoch": 8.311415525114155, + "grad_norm": 0.3528984785079956, + "learning_rate": 1.8772196854388636e-06, + "loss": 0.0023, + "step": 9101 + }, + { + "epoch": 8.312328767123288, + "grad_norm": 0.2980153262615204, + "learning_rate": 1.876204972095383e-06, + "loss": 0.0018, + "step": 9102 + }, + { + "epoch": 8.31324200913242, + "grad_norm": 2.186471462249756, + "learning_rate": 1.8751902587519028e-06, + "loss": 0.0099, + "step": 9103 + }, + { + "epoch": 8.314155251141553, + "grad_norm": 121.04320526123047, + "learning_rate": 1.8741755454084222e-06, + "loss": 7.0314, + "step": 9104 + }, + { + "epoch": 8.315068493150685, + "grad_norm": 2.8738996982574463, + "learning_rate": 1.873160832064942e-06, + "loss": 0.0168, + "step": 9105 + }, + { + "epoch": 8.315981735159818, + "grad_norm": 0.16065946221351624, + "learning_rate": 1.8721461187214612e-06, + "loss": 0.0008, + "step": 9106 + }, + { + "epoch": 8.31689497716895, + "grad_norm": 19.053874969482422, + "learning_rate": 1.871131405377981e-06, + "loss": 0.0828, + "step": 9107 + }, + { + "epoch": 8.317808219178083, + "grad_norm": 0.02417277731001377, + "learning_rate": 1.8701166920345004e-06, + "loss": 0.0002, + "step": 9108 + }, + { + "epoch": 8.318721461187215, + "grad_norm": 9.650516510009766, + "learning_rate": 1.86910197869102e-06, + "loss": 0.0495, + "step": 9109 + }, + { + "epoch": 8.319634703196346, + "grad_norm": 11.681127548217773, + "learning_rate": 1.8680872653475395e-06, + "loss": 0.082, + "step": 9110 + }, + { + "epoch": 8.32054794520548, + "grad_norm": 13.030863761901855, + "learning_rate": 1.8670725520040588e-06, + "loss": 0.1032, + "step": 9111 + }, + { + "epoch": 8.321461187214611, + "grad_norm": 11.416001319885254, + "learning_rate": 1.8660578386605785e-06, + "loss": 0.0197, + "step": 9112 + }, + { + "epoch": 8.322374429223744, + "grad_norm": 0.08972577750682831, + "learning_rate": 1.865043125317098e-06, + "loss": 0.0005, + "step": 9113 + }, + { + "epoch": 8.323287671232876, + "grad_norm": 0.4930965304374695, + "learning_rate": 1.8640284119736177e-06, + "loss": 0.002, + "step": 9114 + }, + { + "epoch": 8.32420091324201, + "grad_norm": 6.9758076667785645, + "learning_rate": 1.8630136986301372e-06, + "loss": 0.0181, + "step": 9115 + }, + { + "epoch": 8.325114155251141, + "grad_norm": 0.9381680488586426, + "learning_rate": 1.8619989852866566e-06, + "loss": 0.0054, + "step": 9116 + }, + { + "epoch": 8.326027397260274, + "grad_norm": 0.27699798345565796, + "learning_rate": 1.8609842719431761e-06, + "loss": 0.0015, + "step": 9117 + }, + { + "epoch": 8.326940639269406, + "grad_norm": 1.4001387357711792, + "learning_rate": 1.8599695585996958e-06, + "loss": 0.0087, + "step": 9118 + }, + { + "epoch": 8.32785388127854, + "grad_norm": 2.6383917331695557, + "learning_rate": 1.8589548452562153e-06, + "loss": 0.0166, + "step": 9119 + }, + { + "epoch": 8.32876712328767, + "grad_norm": 0.6809476613998413, + "learning_rate": 1.857940131912735e-06, + "loss": 0.005, + "step": 9120 + }, + { + "epoch": 8.329680365296804, + "grad_norm": 0.9492015838623047, + "learning_rate": 1.8569254185692543e-06, + "loss": 0.0061, + "step": 9121 + }, + { + "epoch": 8.330593607305936, + "grad_norm": 1.216044545173645, + "learning_rate": 1.8559107052257737e-06, + "loss": 0.0085, + "step": 9122 + }, + { + "epoch": 8.331506849315069, + "grad_norm": 3.4659934043884277, + "learning_rate": 1.8548959918822934e-06, + "loss": 0.021, + "step": 9123 + }, + { + "epoch": 8.3324200913242, + "grad_norm": 0.3231009840965271, + "learning_rate": 1.853881278538813e-06, + "loss": 0.0012, + "step": 9124 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 1.6878424882888794, + "learning_rate": 1.8528665651953326e-06, + "loss": 0.0108, + "step": 9125 + }, + { + "epoch": 8.334246575342465, + "grad_norm": 0.5553179383277893, + "learning_rate": 1.8518518518518519e-06, + "loss": 0.0035, + "step": 9126 + }, + { + "epoch": 8.335159817351599, + "grad_norm": 1.2393900156021118, + "learning_rate": 1.8508371385083716e-06, + "loss": 0.0069, + "step": 9127 + }, + { + "epoch": 8.33607305936073, + "grad_norm": 7.6058197021484375, + "learning_rate": 1.849822425164891e-06, + "loss": 0.0363, + "step": 9128 + }, + { + "epoch": 8.336986301369864, + "grad_norm": 16.127347946166992, + "learning_rate": 1.8488077118214107e-06, + "loss": 0.0836, + "step": 9129 + }, + { + "epoch": 8.337899543378995, + "grad_norm": 1.4368394613265991, + "learning_rate": 1.8477929984779302e-06, + "loss": 0.0088, + "step": 9130 + }, + { + "epoch": 8.338812785388129, + "grad_norm": 2.3116602897644043, + "learning_rate": 1.8467782851344495e-06, + "loss": 0.0212, + "step": 9131 + }, + { + "epoch": 8.33972602739726, + "grad_norm": 10.643640518188477, + "learning_rate": 1.8457635717909692e-06, + "loss": 0.1044, + "step": 9132 + }, + { + "epoch": 8.340639269406394, + "grad_norm": 4.671444892883301, + "learning_rate": 1.8447488584474887e-06, + "loss": 0.0273, + "step": 9133 + }, + { + "epoch": 8.341552511415525, + "grad_norm": 2.8819644451141357, + "learning_rate": 1.8437341451040083e-06, + "loss": 0.0189, + "step": 9134 + }, + { + "epoch": 8.342465753424657, + "grad_norm": 0.5147028565406799, + "learning_rate": 1.8427194317605278e-06, + "loss": 0.0024, + "step": 9135 + }, + { + "epoch": 8.34337899543379, + "grad_norm": 0.3473948538303375, + "learning_rate": 1.8417047184170473e-06, + "loss": 0.0022, + "step": 9136 + }, + { + "epoch": 8.344292237442922, + "grad_norm": 46.62960433959961, + "learning_rate": 1.8406900050735668e-06, + "loss": 0.2844, + "step": 9137 + }, + { + "epoch": 8.345205479452055, + "grad_norm": 2.6024858951568604, + "learning_rate": 1.8396752917300865e-06, + "loss": 0.0162, + "step": 9138 + }, + { + "epoch": 8.346118721461186, + "grad_norm": 0.22394807636737823, + "learning_rate": 1.838660578386606e-06, + "loss": 0.0012, + "step": 9139 + }, + { + "epoch": 8.34703196347032, + "grad_norm": 1.048186182975769, + "learning_rate": 1.8376458650431257e-06, + "loss": 0.0059, + "step": 9140 + }, + { + "epoch": 8.347945205479451, + "grad_norm": 2.6225242614746094, + "learning_rate": 1.836631151699645e-06, + "loss": 0.0099, + "step": 9141 + }, + { + "epoch": 8.348858447488585, + "grad_norm": 0.07455331832170486, + "learning_rate": 1.8356164383561644e-06, + "loss": 0.0005, + "step": 9142 + }, + { + "epoch": 8.349771689497716, + "grad_norm": 104.92935180664062, + "learning_rate": 1.834601725012684e-06, + "loss": 2.0565, + "step": 9143 + }, + { + "epoch": 8.35068493150685, + "grad_norm": 22.405818939208984, + "learning_rate": 1.8335870116692036e-06, + "loss": 0.1295, + "step": 9144 + }, + { + "epoch": 8.351598173515981, + "grad_norm": 10.137529373168945, + "learning_rate": 1.8325722983257233e-06, + "loss": 0.0675, + "step": 9145 + }, + { + "epoch": 8.352511415525115, + "grad_norm": 103.29873657226562, + "learning_rate": 1.8315575849822425e-06, + "loss": 1.979, + "step": 9146 + }, + { + "epoch": 8.353424657534246, + "grad_norm": 5.233254909515381, + "learning_rate": 1.8305428716387622e-06, + "loss": 0.0323, + "step": 9147 + }, + { + "epoch": 8.35433789954338, + "grad_norm": 0.0508195124566555, + "learning_rate": 1.8295281582952817e-06, + "loss": 0.0002, + "step": 9148 + }, + { + "epoch": 8.355251141552511, + "grad_norm": 1.4232345819473267, + "learning_rate": 1.8285134449518014e-06, + "loss": 0.0106, + "step": 9149 + }, + { + "epoch": 8.356164383561644, + "grad_norm": 0.3448099195957184, + "learning_rate": 1.8274987316083209e-06, + "loss": 0.0018, + "step": 9150 + }, + { + "epoch": 8.357077625570776, + "grad_norm": 0.028681157156825066, + "learning_rate": 1.8264840182648401e-06, + "loss": 0.0002, + "step": 9151 + }, + { + "epoch": 8.35799086757991, + "grad_norm": 4.960476875305176, + "learning_rate": 1.8254693049213598e-06, + "loss": 0.0264, + "step": 9152 + }, + { + "epoch": 8.35890410958904, + "grad_norm": 7.84684944152832, + "learning_rate": 1.8244545915778793e-06, + "loss": 0.0461, + "step": 9153 + }, + { + "epoch": 8.359817351598174, + "grad_norm": 0.34238845109939575, + "learning_rate": 1.823439878234399e-06, + "loss": 0.0029, + "step": 9154 + }, + { + "epoch": 8.360730593607306, + "grad_norm": 0.202021062374115, + "learning_rate": 1.8224251648909185e-06, + "loss": 0.0013, + "step": 9155 + }, + { + "epoch": 8.361643835616439, + "grad_norm": 3.9023044109344482, + "learning_rate": 1.821410451547438e-06, + "loss": 0.0255, + "step": 9156 + }, + { + "epoch": 8.36255707762557, + "grad_norm": 9.951224327087402, + "learning_rate": 1.8203957382039575e-06, + "loss": 0.0672, + "step": 9157 + }, + { + "epoch": 8.363470319634704, + "grad_norm": 0.1790570318698883, + "learning_rate": 1.8193810248604771e-06, + "loss": 0.0012, + "step": 9158 + }, + { + "epoch": 8.364383561643836, + "grad_norm": 48.98713302612305, + "learning_rate": 1.8183663115169966e-06, + "loss": 0.3727, + "step": 9159 + }, + { + "epoch": 8.365296803652967, + "grad_norm": 137.79530334472656, + "learning_rate": 1.8173515981735163e-06, + "loss": 1.2751, + "step": 9160 + }, + { + "epoch": 8.3662100456621, + "grad_norm": 41.031532287597656, + "learning_rate": 1.8163368848300356e-06, + "loss": 0.2474, + "step": 9161 + }, + { + "epoch": 8.367123287671232, + "grad_norm": 0.3839341700077057, + "learning_rate": 1.815322171486555e-06, + "loss": 0.0023, + "step": 9162 + }, + { + "epoch": 8.368036529680365, + "grad_norm": 10.51451587677002, + "learning_rate": 1.8143074581430748e-06, + "loss": 0.0813, + "step": 9163 + }, + { + "epoch": 8.368949771689497, + "grad_norm": 25.304473876953125, + "learning_rate": 1.8132927447995942e-06, + "loss": 0.285, + "step": 9164 + }, + { + "epoch": 8.36986301369863, + "grad_norm": 4.134999752044678, + "learning_rate": 1.812278031456114e-06, + "loss": 0.0221, + "step": 9165 + }, + { + "epoch": 8.370776255707762, + "grad_norm": 7.782009601593018, + "learning_rate": 1.8112633181126332e-06, + "loss": 0.0442, + "step": 9166 + }, + { + "epoch": 8.371689497716895, + "grad_norm": 0.2225833684206009, + "learning_rate": 1.8102486047691529e-06, + "loss": 0.0013, + "step": 9167 + }, + { + "epoch": 8.372602739726027, + "grad_norm": 8.3782377243042, + "learning_rate": 1.8092338914256724e-06, + "loss": 0.0414, + "step": 9168 + }, + { + "epoch": 8.37351598173516, + "grad_norm": 64.81509399414062, + "learning_rate": 1.808219178082192e-06, + "loss": 0.3806, + "step": 9169 + }, + { + "epoch": 8.374429223744292, + "grad_norm": 13.017881393432617, + "learning_rate": 1.8072044647387115e-06, + "loss": 0.104, + "step": 9170 + }, + { + "epoch": 8.375342465753425, + "grad_norm": 13.222221374511719, + "learning_rate": 1.8061897513952308e-06, + "loss": 0.0737, + "step": 9171 + }, + { + "epoch": 8.376255707762557, + "grad_norm": 6.744823932647705, + "learning_rate": 1.8051750380517505e-06, + "loss": 0.046, + "step": 9172 + }, + { + "epoch": 8.37716894977169, + "grad_norm": 0.3316834568977356, + "learning_rate": 1.80416032470827e-06, + "loss": 0.0022, + "step": 9173 + }, + { + "epoch": 8.378082191780821, + "grad_norm": 1.1589305400848389, + "learning_rate": 1.8031456113647897e-06, + "loss": 0.0056, + "step": 9174 + }, + { + "epoch": 8.378995433789955, + "grad_norm": 1.2391533851623535, + "learning_rate": 1.8021308980213092e-06, + "loss": 0.0059, + "step": 9175 + }, + { + "epoch": 8.379908675799086, + "grad_norm": 0.838100790977478, + "learning_rate": 1.8011161846778286e-06, + "loss": 0.0059, + "step": 9176 + }, + { + "epoch": 8.38082191780822, + "grad_norm": 1.681160807609558, + "learning_rate": 1.8001014713343481e-06, + "loss": 0.0112, + "step": 9177 + }, + { + "epoch": 8.381735159817351, + "grad_norm": 2.3117103576660156, + "learning_rate": 1.7990867579908678e-06, + "loss": 0.0123, + "step": 9178 + }, + { + "epoch": 8.382648401826485, + "grad_norm": 1.924469232559204, + "learning_rate": 1.7980720446473873e-06, + "loss": 0.0102, + "step": 9179 + }, + { + "epoch": 8.383561643835616, + "grad_norm": 0.10069722682237625, + "learning_rate": 1.797057331303907e-06, + "loss": 0.0007, + "step": 9180 + }, + { + "epoch": 8.38447488584475, + "grad_norm": 1.130242109298706, + "learning_rate": 1.7960426179604263e-06, + "loss": 0.0044, + "step": 9181 + }, + { + "epoch": 8.385388127853881, + "grad_norm": 0.17427858710289001, + "learning_rate": 1.7950279046169457e-06, + "loss": 0.0011, + "step": 9182 + }, + { + "epoch": 8.386301369863014, + "grad_norm": 1.2710552215576172, + "learning_rate": 1.7940131912734654e-06, + "loss": 0.0056, + "step": 9183 + }, + { + "epoch": 8.387214611872146, + "grad_norm": 61.880733489990234, + "learning_rate": 1.792998477929985e-06, + "loss": 0.7842, + "step": 9184 + }, + { + "epoch": 8.38812785388128, + "grad_norm": 4.161498069763184, + "learning_rate": 1.7919837645865046e-06, + "loss": 0.0247, + "step": 9185 + }, + { + "epoch": 8.389041095890411, + "grad_norm": 1.186829686164856, + "learning_rate": 1.7909690512430239e-06, + "loss": 0.0071, + "step": 9186 + }, + { + "epoch": 8.389954337899542, + "grad_norm": 11.710837364196777, + "learning_rate": 1.7899543378995436e-06, + "loss": 0.0856, + "step": 9187 + }, + { + "epoch": 8.390867579908676, + "grad_norm": 6.903137683868408, + "learning_rate": 1.788939624556063e-06, + "loss": 0.0518, + "step": 9188 + }, + { + "epoch": 8.391780821917807, + "grad_norm": 0.6240594983100891, + "learning_rate": 1.7879249112125827e-06, + "loss": 0.0045, + "step": 9189 + }, + { + "epoch": 8.39269406392694, + "grad_norm": 0.1743091344833374, + "learning_rate": 1.7869101978691022e-06, + "loss": 0.0011, + "step": 9190 + }, + { + "epoch": 8.393607305936072, + "grad_norm": 1.055748701095581, + "learning_rate": 1.7858954845256215e-06, + "loss": 0.0055, + "step": 9191 + }, + { + "epoch": 8.394520547945206, + "grad_norm": 3.016404628753662, + "learning_rate": 1.7848807711821412e-06, + "loss": 0.0213, + "step": 9192 + }, + { + "epoch": 8.395433789954337, + "grad_norm": 1.655043601989746, + "learning_rate": 1.7838660578386607e-06, + "loss": 0.0091, + "step": 9193 + }, + { + "epoch": 8.39634703196347, + "grad_norm": 4.62476921081543, + "learning_rate": 1.7828513444951803e-06, + "loss": 0.0274, + "step": 9194 + }, + { + "epoch": 8.397260273972602, + "grad_norm": 33.561126708984375, + "learning_rate": 1.7818366311516998e-06, + "loss": 0.1314, + "step": 9195 + }, + { + "epoch": 8.398173515981735, + "grad_norm": 0.022009897977113724, + "learning_rate": 1.7808219178082193e-06, + "loss": 0.0001, + "step": 9196 + }, + { + "epoch": 8.399086757990867, + "grad_norm": 6.403730869293213, + "learning_rate": 1.7798072044647388e-06, + "loss": 0.0343, + "step": 9197 + }, + { + "epoch": 8.4, + "grad_norm": 0.6251058578491211, + "learning_rate": 1.7787924911212585e-06, + "loss": 0.0048, + "step": 9198 + }, + { + "epoch": 8.400913242009132, + "grad_norm": 2.4376301765441895, + "learning_rate": 1.777777777777778e-06, + "loss": 0.0135, + "step": 9199 + }, + { + "epoch": 8.401826484018265, + "grad_norm": 5.213415145874023, + "learning_rate": 1.7767630644342976e-06, + "loss": 0.0259, + "step": 9200 + }, + { + "epoch": 8.402739726027397, + "grad_norm": 0.8624003529548645, + "learning_rate": 1.775748351090817e-06, + "loss": 0.0053, + "step": 9201 + }, + { + "epoch": 8.40365296803653, + "grad_norm": 6.840163707733154, + "learning_rate": 1.7747336377473364e-06, + "loss": 0.0476, + "step": 9202 + }, + { + "epoch": 8.404566210045662, + "grad_norm": 1.760538101196289, + "learning_rate": 1.773718924403856e-06, + "loss": 0.0114, + "step": 9203 + }, + { + "epoch": 8.405479452054795, + "grad_norm": 6.437352657318115, + "learning_rate": 1.7727042110603756e-06, + "loss": 0.0381, + "step": 9204 + }, + { + "epoch": 8.406392694063927, + "grad_norm": 1.277114987373352, + "learning_rate": 1.7716894977168953e-06, + "loss": 0.007, + "step": 9205 + }, + { + "epoch": 8.40730593607306, + "grad_norm": 2.822812795639038, + "learning_rate": 1.7706747843734145e-06, + "loss": 0.0192, + "step": 9206 + }, + { + "epoch": 8.408219178082192, + "grad_norm": 0.08616413176059723, + "learning_rate": 1.7696600710299342e-06, + "loss": 0.0003, + "step": 9207 + }, + { + "epoch": 8.409132420091325, + "grad_norm": 21.530471801757812, + "learning_rate": 1.7686453576864537e-06, + "loss": 0.1899, + "step": 9208 + }, + { + "epoch": 8.410045662100456, + "grad_norm": 0.9737540483474731, + "learning_rate": 1.7676306443429734e-06, + "loss": 0.0056, + "step": 9209 + }, + { + "epoch": 8.41095890410959, + "grad_norm": 0.09744902700185776, + "learning_rate": 1.7666159309994929e-06, + "loss": 0.0006, + "step": 9210 + }, + { + "epoch": 8.411872146118721, + "grad_norm": 35.50082778930664, + "learning_rate": 1.7656012176560121e-06, + "loss": 0.3347, + "step": 9211 + }, + { + "epoch": 8.412785388127855, + "grad_norm": 20.70328712463379, + "learning_rate": 1.7645865043125318e-06, + "loss": 0.078, + "step": 9212 + }, + { + "epoch": 8.413698630136986, + "grad_norm": 0.00453105429187417, + "learning_rate": 1.7635717909690513e-06, + "loss": 0.0, + "step": 9213 + }, + { + "epoch": 8.414611872146118, + "grad_norm": 0.14534568786621094, + "learning_rate": 1.762557077625571e-06, + "loss": 0.0008, + "step": 9214 + }, + { + "epoch": 8.415525114155251, + "grad_norm": 58.47908020019531, + "learning_rate": 1.7615423642820905e-06, + "loss": 0.8291, + "step": 9215 + }, + { + "epoch": 8.416438356164383, + "grad_norm": 47.627777099609375, + "learning_rate": 1.76052765093861e-06, + "loss": 0.903, + "step": 9216 + }, + { + "epoch": 8.417351598173516, + "grad_norm": 1.0162402391433716, + "learning_rate": 1.7595129375951294e-06, + "loss": 0.0071, + "step": 9217 + }, + { + "epoch": 8.418264840182648, + "grad_norm": 1.8225725889205933, + "learning_rate": 1.7584982242516491e-06, + "loss": 0.012, + "step": 9218 + }, + { + "epoch": 8.419178082191781, + "grad_norm": 2.460541009902954, + "learning_rate": 1.7574835109081686e-06, + "loss": 0.0158, + "step": 9219 + }, + { + "epoch": 8.420091324200913, + "grad_norm": 0.24343368411064148, + "learning_rate": 1.7564687975646883e-06, + "loss": 0.0018, + "step": 9220 + }, + { + "epoch": 8.421004566210046, + "grad_norm": 2.0003910064697266, + "learning_rate": 1.7554540842212076e-06, + "loss": 0.0149, + "step": 9221 + }, + { + "epoch": 8.421917808219177, + "grad_norm": 22.38874053955078, + "learning_rate": 1.754439370877727e-06, + "loss": 0.1251, + "step": 9222 + }, + { + "epoch": 8.42283105022831, + "grad_norm": 3.022188425064087, + "learning_rate": 1.7534246575342468e-06, + "loss": 0.0174, + "step": 9223 + }, + { + "epoch": 8.423744292237442, + "grad_norm": 11.908267974853516, + "learning_rate": 1.7524099441907662e-06, + "loss": 0.0788, + "step": 9224 + }, + { + "epoch": 8.424657534246576, + "grad_norm": 17.848649978637695, + "learning_rate": 1.751395230847286e-06, + "loss": 0.0325, + "step": 9225 + }, + { + "epoch": 8.425570776255707, + "grad_norm": 1.5605510473251343, + "learning_rate": 1.7503805175038052e-06, + "loss": 0.0097, + "step": 9226 + }, + { + "epoch": 8.42648401826484, + "grad_norm": 41.91118240356445, + "learning_rate": 1.7493658041603249e-06, + "loss": 0.3413, + "step": 9227 + }, + { + "epoch": 8.427397260273972, + "grad_norm": 1.0197335481643677, + "learning_rate": 1.7483510908168444e-06, + "loss": 0.0067, + "step": 9228 + }, + { + "epoch": 8.428310502283106, + "grad_norm": 64.8496322631836, + "learning_rate": 1.747336377473364e-06, + "loss": 0.4329, + "step": 9229 + }, + { + "epoch": 8.429223744292237, + "grad_norm": 11.418516159057617, + "learning_rate": 1.7463216641298835e-06, + "loss": 0.0778, + "step": 9230 + }, + { + "epoch": 8.43013698630137, + "grad_norm": 0.026333482936024666, + "learning_rate": 1.7453069507864028e-06, + "loss": 0.0001, + "step": 9231 + }, + { + "epoch": 8.431050228310502, + "grad_norm": 1.645406723022461, + "learning_rate": 1.7442922374429225e-06, + "loss": 0.008, + "step": 9232 + }, + { + "epoch": 8.431963470319635, + "grad_norm": 0.07505669444799423, + "learning_rate": 1.743277524099442e-06, + "loss": 0.0005, + "step": 9233 + }, + { + "epoch": 8.432876712328767, + "grad_norm": 33.50475311279297, + "learning_rate": 1.7422628107559617e-06, + "loss": 0.2778, + "step": 9234 + }, + { + "epoch": 8.4337899543379, + "grad_norm": 0.2512698471546173, + "learning_rate": 1.7412480974124812e-06, + "loss": 0.0016, + "step": 9235 + }, + { + "epoch": 8.434703196347032, + "grad_norm": 0.17691266536712646, + "learning_rate": 1.7402333840690006e-06, + "loss": 0.0013, + "step": 9236 + }, + { + "epoch": 8.435616438356165, + "grad_norm": 1.304550290107727, + "learning_rate": 1.7392186707255201e-06, + "loss": 0.0091, + "step": 9237 + }, + { + "epoch": 8.436529680365297, + "grad_norm": 35.663272857666016, + "learning_rate": 1.7382039573820398e-06, + "loss": 0.308, + "step": 9238 + }, + { + "epoch": 8.43744292237443, + "grad_norm": 0.30834928154945374, + "learning_rate": 1.7371892440385593e-06, + "loss": 0.0021, + "step": 9239 + }, + { + "epoch": 8.438356164383562, + "grad_norm": 1.273576021194458, + "learning_rate": 1.736174530695079e-06, + "loss": 0.009, + "step": 9240 + }, + { + "epoch": 8.439269406392693, + "grad_norm": 0.3464849293231964, + "learning_rate": 1.7351598173515982e-06, + "loss": 0.0017, + "step": 9241 + }, + { + "epoch": 8.440182648401827, + "grad_norm": 0.29269158840179443, + "learning_rate": 1.7341451040081177e-06, + "loss": 0.0014, + "step": 9242 + }, + { + "epoch": 8.441095890410958, + "grad_norm": 0.6697933077812195, + "learning_rate": 1.7331303906646374e-06, + "loss": 0.0036, + "step": 9243 + }, + { + "epoch": 8.442009132420091, + "grad_norm": 90.86997985839844, + "learning_rate": 1.732115677321157e-06, + "loss": 1.1671, + "step": 9244 + }, + { + "epoch": 8.442922374429223, + "grad_norm": 8.912300109863281, + "learning_rate": 1.7311009639776766e-06, + "loss": 0.0866, + "step": 9245 + }, + { + "epoch": 8.443835616438356, + "grad_norm": 4.155231952667236, + "learning_rate": 1.7300862506341959e-06, + "loss": 0.0235, + "step": 9246 + }, + { + "epoch": 8.444748858447488, + "grad_norm": 46.78068542480469, + "learning_rate": 1.7290715372907156e-06, + "loss": 0.2451, + "step": 9247 + }, + { + "epoch": 8.445662100456621, + "grad_norm": 3.557966709136963, + "learning_rate": 1.728056823947235e-06, + "loss": 0.0201, + "step": 9248 + }, + { + "epoch": 8.446575342465753, + "grad_norm": 3.9978694915771484, + "learning_rate": 1.7270421106037547e-06, + "loss": 0.0094, + "step": 9249 + }, + { + "epoch": 8.447488584474886, + "grad_norm": 1.1424089670181274, + "learning_rate": 1.7260273972602742e-06, + "loss": 0.0076, + "step": 9250 + }, + { + "epoch": 8.448401826484018, + "grad_norm": 5.1311869621276855, + "learning_rate": 1.7250126839167935e-06, + "loss": 0.0321, + "step": 9251 + }, + { + "epoch": 8.449315068493151, + "grad_norm": 4.870851993560791, + "learning_rate": 1.7239979705733132e-06, + "loss": 0.0246, + "step": 9252 + }, + { + "epoch": 8.450228310502283, + "grad_norm": 0.2751826047897339, + "learning_rate": 1.7229832572298326e-06, + "loss": 0.0012, + "step": 9253 + }, + { + "epoch": 8.451141552511416, + "grad_norm": 30.133169174194336, + "learning_rate": 1.7219685438863523e-06, + "loss": 0.1698, + "step": 9254 + }, + { + "epoch": 8.452054794520548, + "grad_norm": 1.7150871753692627, + "learning_rate": 1.7209538305428718e-06, + "loss": 0.0124, + "step": 9255 + }, + { + "epoch": 8.45296803652968, + "grad_norm": 0.24964149296283722, + "learning_rate": 1.7199391171993913e-06, + "loss": 0.0013, + "step": 9256 + }, + { + "epoch": 8.453881278538812, + "grad_norm": 1.8103888034820557, + "learning_rate": 1.7189244038559108e-06, + "loss": 0.0092, + "step": 9257 + }, + { + "epoch": 8.454794520547946, + "grad_norm": 124.15494537353516, + "learning_rate": 1.7179096905124305e-06, + "loss": 1.3596, + "step": 9258 + }, + { + "epoch": 8.455707762557077, + "grad_norm": 0.8560634851455688, + "learning_rate": 1.71689497716895e-06, + "loss": 0.0077, + "step": 9259 + }, + { + "epoch": 8.45662100456621, + "grad_norm": 1.905908465385437, + "learning_rate": 1.7158802638254696e-06, + "loss": 0.0118, + "step": 9260 + }, + { + "epoch": 8.457534246575342, + "grad_norm": 9.79293155670166, + "learning_rate": 1.714865550481989e-06, + "loss": 0.0698, + "step": 9261 + }, + { + "epoch": 8.458447488584476, + "grad_norm": 1.3436079025268555, + "learning_rate": 1.7138508371385084e-06, + "loss": 0.008, + "step": 9262 + }, + { + "epoch": 8.459360730593607, + "grad_norm": 72.51179504394531, + "learning_rate": 1.712836123795028e-06, + "loss": 0.6397, + "step": 9263 + }, + { + "epoch": 8.46027397260274, + "grad_norm": 9.364668846130371, + "learning_rate": 1.7118214104515476e-06, + "loss": 0.0562, + "step": 9264 + }, + { + "epoch": 8.461187214611872, + "grad_norm": 13.500975608825684, + "learning_rate": 1.7108066971080673e-06, + "loss": 0.1156, + "step": 9265 + }, + { + "epoch": 8.462100456621005, + "grad_norm": 4.263811111450195, + "learning_rate": 1.7097919837645865e-06, + "loss": 0.0291, + "step": 9266 + }, + { + "epoch": 8.463013698630137, + "grad_norm": 18.74907112121582, + "learning_rate": 1.7087772704211062e-06, + "loss": 0.0911, + "step": 9267 + }, + { + "epoch": 8.463926940639269, + "grad_norm": 0.6366510987281799, + "learning_rate": 1.7077625570776257e-06, + "loss": 0.0044, + "step": 9268 + }, + { + "epoch": 8.464840182648402, + "grad_norm": 0.15852504968643188, + "learning_rate": 1.7067478437341454e-06, + "loss": 0.0007, + "step": 9269 + }, + { + "epoch": 8.465753424657533, + "grad_norm": 18.35127067565918, + "learning_rate": 1.7057331303906649e-06, + "loss": 0.0917, + "step": 9270 + }, + { + "epoch": 8.466666666666667, + "grad_norm": 14.48599910736084, + "learning_rate": 1.7047184170471841e-06, + "loss": 0.1144, + "step": 9271 + }, + { + "epoch": 8.467579908675798, + "grad_norm": 6.547211647033691, + "learning_rate": 1.7037037037037038e-06, + "loss": 0.0418, + "step": 9272 + }, + { + "epoch": 8.468493150684932, + "grad_norm": 6.1474175453186035, + "learning_rate": 1.7026889903602233e-06, + "loss": 0.0447, + "step": 9273 + }, + { + "epoch": 8.469406392694063, + "grad_norm": 0.07602577656507492, + "learning_rate": 1.701674277016743e-06, + "loss": 0.0005, + "step": 9274 + }, + { + "epoch": 8.470319634703197, + "grad_norm": 0.2062741369009018, + "learning_rate": 1.7006595636732625e-06, + "loss": 0.0011, + "step": 9275 + }, + { + "epoch": 8.471232876712328, + "grad_norm": 0.8707839846611023, + "learning_rate": 1.699644850329782e-06, + "loss": 0.0066, + "step": 9276 + }, + { + "epoch": 8.472146118721462, + "grad_norm": 0.053556863218545914, + "learning_rate": 1.6986301369863014e-06, + "loss": 0.0003, + "step": 9277 + }, + { + "epoch": 8.473059360730593, + "grad_norm": 0.08184588700532913, + "learning_rate": 1.6976154236428211e-06, + "loss": 0.0004, + "step": 9278 + }, + { + "epoch": 8.473972602739726, + "grad_norm": 71.35020446777344, + "learning_rate": 1.6966007102993406e-06, + "loss": 1.0452, + "step": 9279 + }, + { + "epoch": 8.474885844748858, + "grad_norm": 0.02892189845442772, + "learning_rate": 1.6955859969558603e-06, + "loss": 0.0002, + "step": 9280 + }, + { + "epoch": 8.475799086757991, + "grad_norm": 4.782618522644043, + "learning_rate": 1.6945712836123796e-06, + "loss": 0.0344, + "step": 9281 + }, + { + "epoch": 8.476712328767123, + "grad_norm": 0.3527463376522064, + "learning_rate": 1.693556570268899e-06, + "loss": 0.0026, + "step": 9282 + }, + { + "epoch": 8.477625570776256, + "grad_norm": 0.6120038628578186, + "learning_rate": 1.6925418569254187e-06, + "loss": 0.0044, + "step": 9283 + }, + { + "epoch": 8.478538812785388, + "grad_norm": 0.5742766857147217, + "learning_rate": 1.6915271435819382e-06, + "loss": 0.0035, + "step": 9284 + }, + { + "epoch": 8.479452054794521, + "grad_norm": 0.19122439622879028, + "learning_rate": 1.690512430238458e-06, + "loss": 0.0011, + "step": 9285 + }, + { + "epoch": 8.480365296803653, + "grad_norm": 0.29071348905563354, + "learning_rate": 1.6894977168949772e-06, + "loss": 0.0013, + "step": 9286 + }, + { + "epoch": 8.481278538812786, + "grad_norm": 50.71426010131836, + "learning_rate": 1.6884830035514969e-06, + "loss": 0.7285, + "step": 9287 + }, + { + "epoch": 8.482191780821918, + "grad_norm": 111.74443054199219, + "learning_rate": 1.6874682902080164e-06, + "loss": 2.1894, + "step": 9288 + }, + { + "epoch": 8.483105022831051, + "grad_norm": 1.5377181768417358, + "learning_rate": 1.686453576864536e-06, + "loss": 0.0074, + "step": 9289 + }, + { + "epoch": 8.484018264840183, + "grad_norm": 3.646618604660034, + "learning_rate": 1.6854388635210555e-06, + "loss": 0.0166, + "step": 9290 + }, + { + "epoch": 8.484931506849316, + "grad_norm": 1.535065770149231, + "learning_rate": 1.6844241501775748e-06, + "loss": 0.0084, + "step": 9291 + }, + { + "epoch": 8.485844748858447, + "grad_norm": 4.197874546051025, + "learning_rate": 1.6834094368340945e-06, + "loss": 0.0169, + "step": 9292 + }, + { + "epoch": 8.48675799086758, + "grad_norm": 1.3805142641067505, + "learning_rate": 1.682394723490614e-06, + "loss": 0.0101, + "step": 9293 + }, + { + "epoch": 8.487671232876712, + "grad_norm": 2.4783596992492676, + "learning_rate": 1.6813800101471337e-06, + "loss": 0.0219, + "step": 9294 + }, + { + "epoch": 8.488584474885844, + "grad_norm": 2.246628999710083, + "learning_rate": 1.6803652968036531e-06, + "loss": 0.0174, + "step": 9295 + }, + { + "epoch": 8.489497716894977, + "grad_norm": 67.13044738769531, + "learning_rate": 1.6793505834601726e-06, + "loss": 0.3418, + "step": 9296 + }, + { + "epoch": 8.490410958904109, + "grad_norm": 30.472747802734375, + "learning_rate": 1.6783358701166921e-06, + "loss": 0.2576, + "step": 9297 + }, + { + "epoch": 8.491324200913242, + "grad_norm": 0.0071585956029593945, + "learning_rate": 1.6773211567732118e-06, + "loss": 0.0, + "step": 9298 + }, + { + "epoch": 8.492237442922374, + "grad_norm": 28.766862869262695, + "learning_rate": 1.6763064434297313e-06, + "loss": 0.1845, + "step": 9299 + }, + { + "epoch": 8.493150684931507, + "grad_norm": 3.710376262664795, + "learning_rate": 1.675291730086251e-06, + "loss": 0.0204, + "step": 9300 + }, + { + "epoch": 8.494063926940639, + "grad_norm": 1.1017005443572998, + "learning_rate": 1.6742770167427702e-06, + "loss": 0.0046, + "step": 9301 + }, + { + "epoch": 8.494977168949772, + "grad_norm": 0.044535133987665176, + "learning_rate": 1.6732623033992897e-06, + "loss": 0.0003, + "step": 9302 + }, + { + "epoch": 8.495890410958904, + "grad_norm": 1.4060139656066895, + "learning_rate": 1.6722475900558094e-06, + "loss": 0.0074, + "step": 9303 + }, + { + "epoch": 8.496803652968037, + "grad_norm": 0.5837509632110596, + "learning_rate": 1.671232876712329e-06, + "loss": 0.0037, + "step": 9304 + }, + { + "epoch": 8.497716894977168, + "grad_norm": 57.88008117675781, + "learning_rate": 1.6702181633688486e-06, + "loss": 0.3385, + "step": 9305 + }, + { + "epoch": 8.498630136986302, + "grad_norm": 13.772364616394043, + "learning_rate": 1.6692034500253679e-06, + "loss": 0.0689, + "step": 9306 + }, + { + "epoch": 8.499543378995433, + "grad_norm": 3.687711000442505, + "learning_rate": 1.6681887366818875e-06, + "loss": 0.0171, + "step": 9307 + }, + { + "epoch": 8.500456621004567, + "grad_norm": 3.4499804973602295, + "learning_rate": 1.667174023338407e-06, + "loss": 0.0186, + "step": 9308 + }, + { + "epoch": 8.501369863013698, + "grad_norm": 38.58668518066406, + "learning_rate": 1.6661593099949267e-06, + "loss": 0.4207, + "step": 9309 + }, + { + "epoch": 8.502283105022832, + "grad_norm": 0.5379658341407776, + "learning_rate": 1.6651445966514462e-06, + "loss": 0.0038, + "step": 9310 + }, + { + "epoch": 8.503196347031963, + "grad_norm": 0.6915196776390076, + "learning_rate": 1.6641298833079655e-06, + "loss": 0.0044, + "step": 9311 + }, + { + "epoch": 8.504109589041096, + "grad_norm": 2.111520290374756, + "learning_rate": 1.6631151699644852e-06, + "loss": 0.0179, + "step": 9312 + }, + { + "epoch": 8.505022831050228, + "grad_norm": 2.366881847381592, + "learning_rate": 1.6621004566210046e-06, + "loss": 0.0105, + "step": 9313 + }, + { + "epoch": 8.505936073059361, + "grad_norm": 3.865680456161499, + "learning_rate": 1.6610857432775243e-06, + "loss": 0.0172, + "step": 9314 + }, + { + "epoch": 8.506849315068493, + "grad_norm": 0.25896891951560974, + "learning_rate": 1.6600710299340438e-06, + "loss": 0.0018, + "step": 9315 + }, + { + "epoch": 8.507762557077626, + "grad_norm": 0.8525363206863403, + "learning_rate": 1.6590563165905633e-06, + "loss": 0.0066, + "step": 9316 + }, + { + "epoch": 8.508675799086758, + "grad_norm": 0.5530787110328674, + "learning_rate": 1.6580416032470828e-06, + "loss": 0.0037, + "step": 9317 + }, + { + "epoch": 8.509589041095891, + "grad_norm": 0.02649657242000103, + "learning_rate": 1.6570268899036025e-06, + "loss": 0.0002, + "step": 9318 + }, + { + "epoch": 8.510502283105023, + "grad_norm": 8.798196792602539, + "learning_rate": 1.656012176560122e-06, + "loss": 0.0575, + "step": 9319 + }, + { + "epoch": 8.511415525114156, + "grad_norm": 0.3637799918651581, + "learning_rate": 1.6549974632166416e-06, + "loss": 0.0016, + "step": 9320 + }, + { + "epoch": 8.512328767123288, + "grad_norm": 1.7873646020889282, + "learning_rate": 1.653982749873161e-06, + "loss": 0.0134, + "step": 9321 + }, + { + "epoch": 8.51324200913242, + "grad_norm": 0.5888168215751648, + "learning_rate": 1.6529680365296804e-06, + "loss": 0.0037, + "step": 9322 + }, + { + "epoch": 8.514155251141553, + "grad_norm": 0.30428242683410645, + "learning_rate": 1.6519533231862e-06, + "loss": 0.002, + "step": 9323 + }, + { + "epoch": 8.515068493150684, + "grad_norm": 0.4356941878795624, + "learning_rate": 1.6509386098427196e-06, + "loss": 0.0028, + "step": 9324 + }, + { + "epoch": 8.515981735159817, + "grad_norm": 0.11167487502098083, + "learning_rate": 1.6499238964992393e-06, + "loss": 0.0008, + "step": 9325 + }, + { + "epoch": 8.516894977168949, + "grad_norm": 0.7078837752342224, + "learning_rate": 1.6489091831557585e-06, + "loss": 0.0038, + "step": 9326 + }, + { + "epoch": 8.517808219178082, + "grad_norm": 1.1734813451766968, + "learning_rate": 1.6478944698122782e-06, + "loss": 0.0088, + "step": 9327 + }, + { + "epoch": 8.518721461187214, + "grad_norm": 0.6794598698616028, + "learning_rate": 1.6468797564687977e-06, + "loss": 0.0019, + "step": 9328 + }, + { + "epoch": 8.519634703196347, + "grad_norm": 0.6528857350349426, + "learning_rate": 1.6458650431253174e-06, + "loss": 0.003, + "step": 9329 + }, + { + "epoch": 8.520547945205479, + "grad_norm": 0.3298863172531128, + "learning_rate": 1.6448503297818369e-06, + "loss": 0.0022, + "step": 9330 + }, + { + "epoch": 8.521461187214612, + "grad_norm": 0.37355268001556396, + "learning_rate": 1.6438356164383561e-06, + "loss": 0.0021, + "step": 9331 + }, + { + "epoch": 8.522374429223744, + "grad_norm": 0.043115466833114624, + "learning_rate": 1.6428209030948758e-06, + "loss": 0.0003, + "step": 9332 + }, + { + "epoch": 8.523287671232877, + "grad_norm": 0.5828148126602173, + "learning_rate": 1.6418061897513953e-06, + "loss": 0.0045, + "step": 9333 + }, + { + "epoch": 8.524200913242009, + "grad_norm": 2.3563764095306396, + "learning_rate": 1.640791476407915e-06, + "loss": 0.0053, + "step": 9334 + }, + { + "epoch": 8.525114155251142, + "grad_norm": 0.7335487604141235, + "learning_rate": 1.6397767630644345e-06, + "loss": 0.0035, + "step": 9335 + }, + { + "epoch": 8.526027397260274, + "grad_norm": 0.20327214896678925, + "learning_rate": 1.638762049720954e-06, + "loss": 0.001, + "step": 9336 + }, + { + "epoch": 8.526940639269407, + "grad_norm": 32.19795227050781, + "learning_rate": 1.6377473363774734e-06, + "loss": 0.1676, + "step": 9337 + }, + { + "epoch": 8.527853881278538, + "grad_norm": 0.270251989364624, + "learning_rate": 1.6367326230339931e-06, + "loss": 0.0019, + "step": 9338 + }, + { + "epoch": 8.528767123287672, + "grad_norm": 0.21758055686950684, + "learning_rate": 1.6357179096905126e-06, + "loss": 0.0011, + "step": 9339 + }, + { + "epoch": 8.529680365296803, + "grad_norm": 0.49151474237442017, + "learning_rate": 1.6347031963470323e-06, + "loss": 0.0029, + "step": 9340 + }, + { + "epoch": 8.530593607305937, + "grad_norm": 0.2019994854927063, + "learning_rate": 1.6336884830035516e-06, + "loss": 0.0011, + "step": 9341 + }, + { + "epoch": 8.531506849315068, + "grad_norm": 21.41377067565918, + "learning_rate": 1.632673769660071e-06, + "loss": 0.1332, + "step": 9342 + }, + { + "epoch": 8.532420091324202, + "grad_norm": 0.09073149412870407, + "learning_rate": 1.6316590563165907e-06, + "loss": 0.0007, + "step": 9343 + }, + { + "epoch": 8.533333333333333, + "grad_norm": 5.076369762420654, + "learning_rate": 1.6306443429731102e-06, + "loss": 0.0253, + "step": 9344 + }, + { + "epoch": 8.534246575342467, + "grad_norm": 138.40928649902344, + "learning_rate": 1.62962962962963e-06, + "loss": 2.8388, + "step": 9345 + }, + { + "epoch": 8.535159817351598, + "grad_norm": 0.27056559920310974, + "learning_rate": 1.6286149162861492e-06, + "loss": 0.0016, + "step": 9346 + }, + { + "epoch": 8.536073059360731, + "grad_norm": 97.50865173339844, + "learning_rate": 1.6276002029426689e-06, + "loss": 1.7424, + "step": 9347 + }, + { + "epoch": 8.536986301369863, + "grad_norm": 10.849489212036133, + "learning_rate": 1.6265854895991884e-06, + "loss": 0.0558, + "step": 9348 + }, + { + "epoch": 8.537899543378995, + "grad_norm": 9.067298889160156, + "learning_rate": 1.625570776255708e-06, + "loss": 0.0279, + "step": 9349 + }, + { + "epoch": 8.538812785388128, + "grad_norm": 23.43269157409668, + "learning_rate": 1.6245560629122275e-06, + "loss": 0.1003, + "step": 9350 + }, + { + "epoch": 8.53972602739726, + "grad_norm": 0.3622977137565613, + "learning_rate": 1.6235413495687468e-06, + "loss": 0.0029, + "step": 9351 + }, + { + "epoch": 8.540639269406393, + "grad_norm": 0.3635936677455902, + "learning_rate": 1.6225266362252665e-06, + "loss": 0.0018, + "step": 9352 + }, + { + "epoch": 8.541552511415524, + "grad_norm": 0.7534795999526978, + "learning_rate": 1.621511922881786e-06, + "loss": 0.0052, + "step": 9353 + }, + { + "epoch": 8.542465753424658, + "grad_norm": 0.1378335803747177, + "learning_rate": 1.6204972095383057e-06, + "loss": 0.0007, + "step": 9354 + }, + { + "epoch": 8.54337899543379, + "grad_norm": 43.79753494262695, + "learning_rate": 1.6194824961948251e-06, + "loss": 0.4358, + "step": 9355 + }, + { + "epoch": 8.544292237442923, + "grad_norm": 0.45776861906051636, + "learning_rate": 1.6184677828513446e-06, + "loss": 0.0031, + "step": 9356 + }, + { + "epoch": 8.545205479452054, + "grad_norm": 8.392826080322266, + "learning_rate": 1.617453069507864e-06, + "loss": 0.0309, + "step": 9357 + }, + { + "epoch": 8.546118721461188, + "grad_norm": 1.646388053894043, + "learning_rate": 1.6164383561643838e-06, + "loss": 0.0074, + "step": 9358 + }, + { + "epoch": 8.54703196347032, + "grad_norm": 0.43731170892715454, + "learning_rate": 1.6154236428209033e-06, + "loss": 0.0033, + "step": 9359 + }, + { + "epoch": 8.547945205479452, + "grad_norm": 19.329177856445312, + "learning_rate": 1.614408929477423e-06, + "loss": 0.1547, + "step": 9360 + }, + { + "epoch": 8.548858447488584, + "grad_norm": 0.11555232852697372, + "learning_rate": 1.6133942161339422e-06, + "loss": 0.0008, + "step": 9361 + }, + { + "epoch": 8.549771689497717, + "grad_norm": 51.953765869140625, + "learning_rate": 1.6123795027904617e-06, + "loss": 0.1574, + "step": 9362 + }, + { + "epoch": 8.550684931506849, + "grad_norm": 19.088298797607422, + "learning_rate": 1.6113647894469814e-06, + "loss": 0.1352, + "step": 9363 + }, + { + "epoch": 8.551598173515982, + "grad_norm": 0.1353798806667328, + "learning_rate": 1.6103500761035009e-06, + "loss": 0.0009, + "step": 9364 + }, + { + "epoch": 8.552511415525114, + "grad_norm": 0.6993772387504578, + "learning_rate": 1.6093353627600206e-06, + "loss": 0.0047, + "step": 9365 + }, + { + "epoch": 8.553424657534247, + "grad_norm": 29.676025390625, + "learning_rate": 1.6083206494165399e-06, + "loss": 0.2062, + "step": 9366 + }, + { + "epoch": 8.554337899543379, + "grad_norm": 0.03770504891872406, + "learning_rate": 1.6073059360730595e-06, + "loss": 0.0002, + "step": 9367 + }, + { + "epoch": 8.555251141552512, + "grad_norm": 0.44643568992614746, + "learning_rate": 1.606291222729579e-06, + "loss": 0.0029, + "step": 9368 + }, + { + "epoch": 8.556164383561644, + "grad_norm": 0.001969007309526205, + "learning_rate": 1.6052765093860987e-06, + "loss": 0.0, + "step": 9369 + }, + { + "epoch": 8.557077625570777, + "grad_norm": 72.38839721679688, + "learning_rate": 1.6042617960426182e-06, + "loss": 0.4605, + "step": 9370 + }, + { + "epoch": 8.557990867579909, + "grad_norm": 2.011801242828369, + "learning_rate": 1.6032470826991375e-06, + "loss": 0.0162, + "step": 9371 + }, + { + "epoch": 8.558904109589042, + "grad_norm": 0.20467223227024078, + "learning_rate": 1.6022323693556572e-06, + "loss": 0.0013, + "step": 9372 + }, + { + "epoch": 8.559817351598173, + "grad_norm": 9.215907096862793, + "learning_rate": 1.6012176560121766e-06, + "loss": 0.0517, + "step": 9373 + }, + { + "epoch": 8.560730593607307, + "grad_norm": 2.5612242221832275, + "learning_rate": 1.6002029426686963e-06, + "loss": 0.0191, + "step": 9374 + }, + { + "epoch": 8.561643835616438, + "grad_norm": 0.1502179354429245, + "learning_rate": 1.5991882293252158e-06, + "loss": 0.0011, + "step": 9375 + }, + { + "epoch": 8.56255707762557, + "grad_norm": 3.1109206676483154, + "learning_rate": 1.5981735159817353e-06, + "loss": 0.0213, + "step": 9376 + }, + { + "epoch": 8.563470319634703, + "grad_norm": 33.91893005371094, + "learning_rate": 1.5971588026382548e-06, + "loss": 0.3789, + "step": 9377 + }, + { + "epoch": 8.564383561643835, + "grad_norm": 0.45132988691329956, + "learning_rate": 1.5961440892947745e-06, + "loss": 0.0031, + "step": 9378 + }, + { + "epoch": 8.565296803652968, + "grad_norm": 79.15133666992188, + "learning_rate": 1.595129375951294e-06, + "loss": 0.556, + "step": 9379 + }, + { + "epoch": 8.5662100456621, + "grad_norm": 0.9735737442970276, + "learning_rate": 1.5941146626078136e-06, + "loss": 0.0068, + "step": 9380 + }, + { + "epoch": 8.567123287671233, + "grad_norm": 5.979523181915283, + "learning_rate": 1.593099949264333e-06, + "loss": 0.0316, + "step": 9381 + }, + { + "epoch": 8.568036529680365, + "grad_norm": 0.30033573508262634, + "learning_rate": 1.5920852359208524e-06, + "loss": 0.0021, + "step": 9382 + }, + { + "epoch": 8.568949771689498, + "grad_norm": 1.1690633296966553, + "learning_rate": 1.591070522577372e-06, + "loss": 0.0067, + "step": 9383 + }, + { + "epoch": 8.56986301369863, + "grad_norm": 0.019573139026761055, + "learning_rate": 1.5900558092338916e-06, + "loss": 0.0001, + "step": 9384 + }, + { + "epoch": 8.570776255707763, + "grad_norm": 8.602537155151367, + "learning_rate": 1.5890410958904112e-06, + "loss": 0.0455, + "step": 9385 + }, + { + "epoch": 8.571689497716894, + "grad_norm": 0.35825198888778687, + "learning_rate": 1.5880263825469305e-06, + "loss": 0.0021, + "step": 9386 + }, + { + "epoch": 8.572602739726028, + "grad_norm": 1.1771000623703003, + "learning_rate": 1.5870116692034502e-06, + "loss": 0.0059, + "step": 9387 + }, + { + "epoch": 8.57351598173516, + "grad_norm": 0.6594258546829224, + "learning_rate": 1.5859969558599697e-06, + "loss": 0.0029, + "step": 9388 + }, + { + "epoch": 8.574429223744293, + "grad_norm": 19.045442581176758, + "learning_rate": 1.5849822425164894e-06, + "loss": 0.1389, + "step": 9389 + }, + { + "epoch": 8.575342465753424, + "grad_norm": 2.379793643951416, + "learning_rate": 1.5839675291730089e-06, + "loss": 0.0169, + "step": 9390 + }, + { + "epoch": 8.576255707762558, + "grad_norm": 0.4135130047798157, + "learning_rate": 1.5829528158295281e-06, + "loss": 0.0025, + "step": 9391 + }, + { + "epoch": 8.57716894977169, + "grad_norm": 3.0151853561401367, + "learning_rate": 1.5819381024860478e-06, + "loss": 0.0199, + "step": 9392 + }, + { + "epoch": 8.578082191780823, + "grad_norm": 0.09520109742879868, + "learning_rate": 1.5809233891425673e-06, + "loss": 0.0007, + "step": 9393 + }, + { + "epoch": 8.578995433789954, + "grad_norm": 0.019030921161174774, + "learning_rate": 1.579908675799087e-06, + "loss": 0.0001, + "step": 9394 + }, + { + "epoch": 8.579908675799087, + "grad_norm": 0.16797441244125366, + "learning_rate": 1.5788939624556065e-06, + "loss": 0.001, + "step": 9395 + }, + { + "epoch": 8.580821917808219, + "grad_norm": 3.7714529037475586, + "learning_rate": 1.577879249112126e-06, + "loss": 0.0284, + "step": 9396 + }, + { + "epoch": 8.581735159817352, + "grad_norm": 0.47633838653564453, + "learning_rate": 1.5768645357686454e-06, + "loss": 0.0029, + "step": 9397 + }, + { + "epoch": 8.582648401826484, + "grad_norm": 0.8251369595527649, + "learning_rate": 1.5758498224251651e-06, + "loss": 0.0041, + "step": 9398 + }, + { + "epoch": 8.583561643835617, + "grad_norm": 3.054675817489624, + "learning_rate": 1.5748351090816846e-06, + "loss": 0.0156, + "step": 9399 + }, + { + "epoch": 8.584474885844749, + "grad_norm": 6.626904010772705, + "learning_rate": 1.5738203957382043e-06, + "loss": 0.0372, + "step": 9400 + }, + { + "epoch": 8.585388127853882, + "grad_norm": 9.434012413024902, + "learning_rate": 1.5728056823947236e-06, + "loss": 0.05, + "step": 9401 + }, + { + "epoch": 8.586301369863014, + "grad_norm": 14.101387977600098, + "learning_rate": 1.571790969051243e-06, + "loss": 0.0717, + "step": 9402 + }, + { + "epoch": 8.587214611872145, + "grad_norm": 2.6820383071899414, + "learning_rate": 1.5707762557077627e-06, + "loss": 0.0123, + "step": 9403 + }, + { + "epoch": 8.588127853881279, + "grad_norm": 1.5827604532241821, + "learning_rate": 1.5697615423642822e-06, + "loss": 0.0098, + "step": 9404 + }, + { + "epoch": 8.58904109589041, + "grad_norm": 3.8202006816864014, + "learning_rate": 1.568746829020802e-06, + "loss": 0.0251, + "step": 9405 + }, + { + "epoch": 8.589954337899544, + "grad_norm": 0.34445372223854065, + "learning_rate": 1.5677321156773212e-06, + "loss": 0.0027, + "step": 9406 + }, + { + "epoch": 8.590867579908675, + "grad_norm": 0.1790219396352768, + "learning_rate": 1.5667174023338409e-06, + "loss": 0.0007, + "step": 9407 + }, + { + "epoch": 8.591780821917808, + "grad_norm": 2.9084584712982178, + "learning_rate": 1.5657026889903604e-06, + "loss": 0.0272, + "step": 9408 + }, + { + "epoch": 8.59269406392694, + "grad_norm": 3.5349392890930176, + "learning_rate": 1.56468797564688e-06, + "loss": 0.0221, + "step": 9409 + }, + { + "epoch": 8.593607305936073, + "grad_norm": 0.24066707491874695, + "learning_rate": 1.5636732623033995e-06, + "loss": 0.0017, + "step": 9410 + }, + { + "epoch": 8.594520547945205, + "grad_norm": 6.134392738342285, + "learning_rate": 1.5626585489599188e-06, + "loss": 0.0309, + "step": 9411 + }, + { + "epoch": 8.595433789954338, + "grad_norm": 9.710427284240723, + "learning_rate": 1.5616438356164385e-06, + "loss": 0.0781, + "step": 9412 + }, + { + "epoch": 8.59634703196347, + "grad_norm": 1.598426342010498, + "learning_rate": 1.560629122272958e-06, + "loss": 0.009, + "step": 9413 + }, + { + "epoch": 8.597260273972603, + "grad_norm": 7.244246959686279, + "learning_rate": 1.5596144089294777e-06, + "loss": 0.042, + "step": 9414 + }, + { + "epoch": 8.598173515981735, + "grad_norm": 1.1664835214614868, + "learning_rate": 1.5585996955859971e-06, + "loss": 0.0067, + "step": 9415 + }, + { + "epoch": 8.599086757990868, + "grad_norm": 0.9606152176856995, + "learning_rate": 1.5575849822425166e-06, + "loss": 0.0049, + "step": 9416 + }, + { + "epoch": 8.6, + "grad_norm": 124.0887680053711, + "learning_rate": 1.556570268899036e-06, + "loss": 3.3948, + "step": 9417 + }, + { + "epoch": 8.600913242009133, + "grad_norm": 0.08205513656139374, + "learning_rate": 1.5555555555555558e-06, + "loss": 0.0005, + "step": 9418 + }, + { + "epoch": 8.601826484018265, + "grad_norm": 3.882780075073242, + "learning_rate": 1.5545408422120753e-06, + "loss": 0.0252, + "step": 9419 + }, + { + "epoch": 8.602739726027398, + "grad_norm": 0.03850611671805382, + "learning_rate": 1.553526128868595e-06, + "loss": 0.0003, + "step": 9420 + }, + { + "epoch": 8.60365296803653, + "grad_norm": 1.267329454421997, + "learning_rate": 1.5525114155251142e-06, + "loss": 0.0064, + "step": 9421 + }, + { + "epoch": 8.604566210045663, + "grad_norm": 0.33376461267471313, + "learning_rate": 1.5514967021816337e-06, + "loss": 0.0019, + "step": 9422 + }, + { + "epoch": 8.605479452054794, + "grad_norm": 0.5386064052581787, + "learning_rate": 1.5504819888381534e-06, + "loss": 0.0032, + "step": 9423 + }, + { + "epoch": 8.606392694063928, + "grad_norm": 2.6867001056671143, + "learning_rate": 1.5494672754946729e-06, + "loss": 0.0147, + "step": 9424 + }, + { + "epoch": 8.60730593607306, + "grad_norm": 16.952138900756836, + "learning_rate": 1.5484525621511926e-06, + "loss": 0.1156, + "step": 9425 + }, + { + "epoch": 8.608219178082193, + "grad_norm": 1.7564300298690796, + "learning_rate": 1.5474378488077118e-06, + "loss": 0.0146, + "step": 9426 + }, + { + "epoch": 8.609132420091324, + "grad_norm": 0.901829183101654, + "learning_rate": 1.5464231354642315e-06, + "loss": 0.0031, + "step": 9427 + }, + { + "epoch": 8.610045662100458, + "grad_norm": 0.37303832173347473, + "learning_rate": 1.545408422120751e-06, + "loss": 0.0026, + "step": 9428 + }, + { + "epoch": 8.610958904109589, + "grad_norm": 2.981212854385376, + "learning_rate": 1.5443937087772707e-06, + "loss": 0.0184, + "step": 9429 + }, + { + "epoch": 8.61187214611872, + "grad_norm": 1.4941036701202393, + "learning_rate": 1.5433789954337902e-06, + "loss": 0.0068, + "step": 9430 + }, + { + "epoch": 8.612785388127854, + "grad_norm": 29.783031463623047, + "learning_rate": 1.5423642820903095e-06, + "loss": 0.2432, + "step": 9431 + }, + { + "epoch": 8.613698630136986, + "grad_norm": 0.029704434797167778, + "learning_rate": 1.5413495687468292e-06, + "loss": 0.0001, + "step": 9432 + }, + { + "epoch": 8.614611872146119, + "grad_norm": 0.2119290828704834, + "learning_rate": 1.5403348554033486e-06, + "loss": 0.0015, + "step": 9433 + }, + { + "epoch": 8.61552511415525, + "grad_norm": 1.999859094619751, + "learning_rate": 1.5393201420598683e-06, + "loss": 0.0129, + "step": 9434 + }, + { + "epoch": 8.616438356164384, + "grad_norm": 71.93518829345703, + "learning_rate": 1.5383054287163878e-06, + "loss": 1.0213, + "step": 9435 + }, + { + "epoch": 8.617351598173515, + "grad_norm": 0.6065266728401184, + "learning_rate": 1.5372907153729073e-06, + "loss": 0.0045, + "step": 9436 + }, + { + "epoch": 8.618264840182649, + "grad_norm": 0.23978523910045624, + "learning_rate": 1.5362760020294268e-06, + "loss": 0.0014, + "step": 9437 + }, + { + "epoch": 8.61917808219178, + "grad_norm": 41.72502899169922, + "learning_rate": 1.5352612886859465e-06, + "loss": 0.2703, + "step": 9438 + }, + { + "epoch": 8.620091324200914, + "grad_norm": 10.699243545532227, + "learning_rate": 1.534246575342466e-06, + "loss": 0.0435, + "step": 9439 + }, + { + "epoch": 8.621004566210045, + "grad_norm": 3.827364206314087, + "learning_rate": 1.5332318619989856e-06, + "loss": 0.0246, + "step": 9440 + }, + { + "epoch": 8.621917808219179, + "grad_norm": 0.15435673296451569, + "learning_rate": 1.532217148655505e-06, + "loss": 0.0013, + "step": 9441 + }, + { + "epoch": 8.62283105022831, + "grad_norm": 0.01624361239373684, + "learning_rate": 1.5312024353120244e-06, + "loss": 0.0001, + "step": 9442 + }, + { + "epoch": 8.623744292237443, + "grad_norm": 29.47149085998535, + "learning_rate": 1.530187721968544e-06, + "loss": 0.2662, + "step": 9443 + }, + { + "epoch": 8.624657534246575, + "grad_norm": 0.12197542935609818, + "learning_rate": 1.5291730086250635e-06, + "loss": 0.0008, + "step": 9444 + }, + { + "epoch": 8.625570776255708, + "grad_norm": 36.756099700927734, + "learning_rate": 1.5281582952815832e-06, + "loss": 0.4729, + "step": 9445 + }, + { + "epoch": 8.62648401826484, + "grad_norm": 0.8157781958580017, + "learning_rate": 1.5271435819381025e-06, + "loss": 0.005, + "step": 9446 + }, + { + "epoch": 8.627397260273973, + "grad_norm": 0.23728565871715546, + "learning_rate": 1.5261288685946222e-06, + "loss": 0.0013, + "step": 9447 + }, + { + "epoch": 8.628310502283105, + "grad_norm": 8.65129280090332, + "learning_rate": 1.5251141552511417e-06, + "loss": 0.0626, + "step": 9448 + }, + { + "epoch": 8.629223744292238, + "grad_norm": 0.6415495276451111, + "learning_rate": 1.5240994419076614e-06, + "loss": 0.0045, + "step": 9449 + }, + { + "epoch": 8.63013698630137, + "grad_norm": 0.26063787937164307, + "learning_rate": 1.5230847285641809e-06, + "loss": 0.0017, + "step": 9450 + }, + { + "epoch": 8.631050228310503, + "grad_norm": 30.411344528198242, + "learning_rate": 1.5220700152207001e-06, + "loss": 0.1775, + "step": 9451 + }, + { + "epoch": 8.631963470319635, + "grad_norm": 2.3339459896087646, + "learning_rate": 1.5210553018772198e-06, + "loss": 0.0168, + "step": 9452 + }, + { + "epoch": 8.632876712328766, + "grad_norm": 11.017351150512695, + "learning_rate": 1.5200405885337393e-06, + "loss": 0.0739, + "step": 9453 + }, + { + "epoch": 8.6337899543379, + "grad_norm": 61.485477447509766, + "learning_rate": 1.519025875190259e-06, + "loss": 0.5286, + "step": 9454 + }, + { + "epoch": 8.634703196347033, + "grad_norm": 0.032779768109321594, + "learning_rate": 1.5180111618467785e-06, + "loss": 0.0002, + "step": 9455 + }, + { + "epoch": 8.635616438356164, + "grad_norm": 0.09656410664319992, + "learning_rate": 1.516996448503298e-06, + "loss": 0.0007, + "step": 9456 + }, + { + "epoch": 8.636529680365296, + "grad_norm": 5.253391265869141, + "learning_rate": 1.5159817351598174e-06, + "loss": 0.0344, + "step": 9457 + }, + { + "epoch": 8.63744292237443, + "grad_norm": 5.564919948577881, + "learning_rate": 1.5149670218163371e-06, + "loss": 0.0439, + "step": 9458 + }, + { + "epoch": 8.638356164383561, + "grad_norm": 2.6775052547454834, + "learning_rate": 1.5139523084728566e-06, + "loss": 0.0162, + "step": 9459 + }, + { + "epoch": 8.639269406392694, + "grad_norm": 0.0039505502209067345, + "learning_rate": 1.5129375951293763e-06, + "loss": 0.0, + "step": 9460 + }, + { + "epoch": 8.640182648401826, + "grad_norm": 0.4485568404197693, + "learning_rate": 1.5119228817858956e-06, + "loss": 0.0021, + "step": 9461 + }, + { + "epoch": 8.64109589041096, + "grad_norm": 38.45359802246094, + "learning_rate": 1.510908168442415e-06, + "loss": 0.2724, + "step": 9462 + }, + { + "epoch": 8.64200913242009, + "grad_norm": 0.046040408313274384, + "learning_rate": 1.5098934550989347e-06, + "loss": 0.0004, + "step": 9463 + }, + { + "epoch": 8.642922374429224, + "grad_norm": 36.40475082397461, + "learning_rate": 1.5088787417554542e-06, + "loss": 0.3692, + "step": 9464 + }, + { + "epoch": 8.643835616438356, + "grad_norm": 31.29414176940918, + "learning_rate": 1.507864028411974e-06, + "loss": 0.1771, + "step": 9465 + }, + { + "epoch": 8.644748858447489, + "grad_norm": 29.731914520263672, + "learning_rate": 1.5068493150684932e-06, + "loss": 0.1477, + "step": 9466 + }, + { + "epoch": 8.64566210045662, + "grad_norm": 0.7704469561576843, + "learning_rate": 1.5058346017250129e-06, + "loss": 0.0049, + "step": 9467 + }, + { + "epoch": 8.646575342465754, + "grad_norm": 3.009204626083374, + "learning_rate": 1.5048198883815323e-06, + "loss": 0.0139, + "step": 9468 + }, + { + "epoch": 8.647488584474885, + "grad_norm": 0.12530773878097534, + "learning_rate": 1.503805175038052e-06, + "loss": 0.0011, + "step": 9469 + }, + { + "epoch": 8.648401826484019, + "grad_norm": 0.598636269569397, + "learning_rate": 1.5027904616945715e-06, + "loss": 0.003, + "step": 9470 + }, + { + "epoch": 8.64931506849315, + "grad_norm": 1.594498634338379, + "learning_rate": 1.5017757483510908e-06, + "loss": 0.0126, + "step": 9471 + }, + { + "epoch": 8.650228310502284, + "grad_norm": 12.454145431518555, + "learning_rate": 1.5007610350076105e-06, + "loss": 0.0812, + "step": 9472 + }, + { + "epoch": 8.651141552511415, + "grad_norm": 31.817596435546875, + "learning_rate": 1.49974632166413e-06, + "loss": 0.1351, + "step": 9473 + }, + { + "epoch": 8.652054794520549, + "grad_norm": 5.751229286193848, + "learning_rate": 1.4987316083206497e-06, + "loss": 0.034, + "step": 9474 + }, + { + "epoch": 8.65296803652968, + "grad_norm": 0.4274657070636749, + "learning_rate": 1.497716894977169e-06, + "loss": 0.0026, + "step": 9475 + }, + { + "epoch": 8.653881278538814, + "grad_norm": 56.852787017822266, + "learning_rate": 1.4967021816336886e-06, + "loss": 0.3838, + "step": 9476 + }, + { + "epoch": 8.654794520547945, + "grad_norm": 0.017384449020028114, + "learning_rate": 1.495687468290208e-06, + "loss": 0.0001, + "step": 9477 + }, + { + "epoch": 8.655707762557078, + "grad_norm": 0.22292284667491913, + "learning_rate": 1.4946727549467278e-06, + "loss": 0.0012, + "step": 9478 + }, + { + "epoch": 8.65662100456621, + "grad_norm": 0.42019006609916687, + "learning_rate": 1.4936580416032473e-06, + "loss": 0.0017, + "step": 9479 + }, + { + "epoch": 8.657534246575342, + "grad_norm": 0.026034468784928322, + "learning_rate": 1.4926433282597665e-06, + "loss": 0.0001, + "step": 9480 + }, + { + "epoch": 8.658447488584475, + "grad_norm": 2.7320735454559326, + "learning_rate": 1.4916286149162862e-06, + "loss": 0.0212, + "step": 9481 + }, + { + "epoch": 8.659360730593608, + "grad_norm": 2.659872531890869, + "learning_rate": 1.4906139015728057e-06, + "loss": 0.0115, + "step": 9482 + }, + { + "epoch": 8.66027397260274, + "grad_norm": 3.735652446746826, + "learning_rate": 1.4895991882293254e-06, + "loss": 0.0221, + "step": 9483 + }, + { + "epoch": 8.661187214611871, + "grad_norm": 5.774184226989746, + "learning_rate": 1.4885844748858449e-06, + "loss": 0.039, + "step": 9484 + }, + { + "epoch": 8.662100456621005, + "grad_norm": 9.72753620147705, + "learning_rate": 1.4875697615423644e-06, + "loss": 0.0542, + "step": 9485 + }, + { + "epoch": 8.663013698630136, + "grad_norm": 19.24060821533203, + "learning_rate": 1.4865550481988838e-06, + "loss": 0.105, + "step": 9486 + }, + { + "epoch": 8.66392694063927, + "grad_norm": 0.3712141215801239, + "learning_rate": 1.4855403348554035e-06, + "loss": 0.0026, + "step": 9487 + }, + { + "epoch": 8.664840182648401, + "grad_norm": 16.04030990600586, + "learning_rate": 1.484525621511923e-06, + "loss": 0.1579, + "step": 9488 + }, + { + "epoch": 8.665753424657535, + "grad_norm": 1.0948513746261597, + "learning_rate": 1.4835109081684427e-06, + "loss": 0.0082, + "step": 9489 + }, + { + "epoch": 8.666666666666666, + "grad_norm": 0.6790801286697388, + "learning_rate": 1.482496194824962e-06, + "loss": 0.0045, + "step": 9490 + }, + { + "epoch": 8.6675799086758, + "grad_norm": 1.1042026281356812, + "learning_rate": 1.4814814814814815e-06, + "loss": 0.0077, + "step": 9491 + }, + { + "epoch": 8.668493150684931, + "grad_norm": 2.709075450897217, + "learning_rate": 1.4804667681380011e-06, + "loss": 0.0099, + "step": 9492 + }, + { + "epoch": 8.669406392694064, + "grad_norm": 6.000307559967041, + "learning_rate": 1.4794520547945206e-06, + "loss": 0.0182, + "step": 9493 + }, + { + "epoch": 8.670319634703196, + "grad_norm": 4.972477436065674, + "learning_rate": 1.4784373414510403e-06, + "loss": 0.0235, + "step": 9494 + }, + { + "epoch": 8.67123287671233, + "grad_norm": 8.547374725341797, + "learning_rate": 1.4774226281075596e-06, + "loss": 0.0327, + "step": 9495 + }, + { + "epoch": 8.67214611872146, + "grad_norm": 0.4411901533603668, + "learning_rate": 1.4764079147640793e-06, + "loss": 0.0032, + "step": 9496 + }, + { + "epoch": 8.673059360730594, + "grad_norm": 0.2515563368797302, + "learning_rate": 1.4753932014205988e-06, + "loss": 0.0012, + "step": 9497 + }, + { + "epoch": 8.673972602739726, + "grad_norm": 0.45616620779037476, + "learning_rate": 1.4743784880771185e-06, + "loss": 0.0033, + "step": 9498 + }, + { + "epoch": 8.674885844748859, + "grad_norm": 0.2686190605163574, + "learning_rate": 1.473363774733638e-06, + "loss": 0.001, + "step": 9499 + }, + { + "epoch": 8.67579908675799, + "grad_norm": 32.04535675048828, + "learning_rate": 1.4723490613901572e-06, + "loss": 0.1592, + "step": 9500 + }, + { + "epoch": 8.676712328767124, + "grad_norm": 9.888651847839355, + "learning_rate": 1.4713343480466769e-06, + "loss": 0.061, + "step": 9501 + }, + { + "epoch": 8.677625570776256, + "grad_norm": 2.0452728271484375, + "learning_rate": 1.4703196347031964e-06, + "loss": 0.0082, + "step": 9502 + }, + { + "epoch": 8.678538812785389, + "grad_norm": 1.4648401737213135, + "learning_rate": 1.469304921359716e-06, + "loss": 0.0085, + "step": 9503 + }, + { + "epoch": 8.67945205479452, + "grad_norm": 0.13662214577198029, + "learning_rate": 1.4682902080162355e-06, + "loss": 0.0008, + "step": 9504 + }, + { + "epoch": 8.680365296803654, + "grad_norm": 3.5417492389678955, + "learning_rate": 1.467275494672755e-06, + "loss": 0.0211, + "step": 9505 + }, + { + "epoch": 8.681278538812785, + "grad_norm": 1.6725034713745117, + "learning_rate": 1.4662607813292745e-06, + "loss": 0.0116, + "step": 9506 + }, + { + "epoch": 8.682191780821917, + "grad_norm": 2.642827272415161, + "learning_rate": 1.4652460679857942e-06, + "loss": 0.0155, + "step": 9507 + }, + { + "epoch": 8.68310502283105, + "grad_norm": 0.10156802088022232, + "learning_rate": 1.4642313546423137e-06, + "loss": 0.0005, + "step": 9508 + }, + { + "epoch": 8.684018264840184, + "grad_norm": 0.27086400985717773, + "learning_rate": 1.4632166412988334e-06, + "loss": 0.0015, + "step": 9509 + }, + { + "epoch": 8.684931506849315, + "grad_norm": 0.6115013957023621, + "learning_rate": 1.4622019279553526e-06, + "loss": 0.0037, + "step": 9510 + }, + { + "epoch": 8.685844748858447, + "grad_norm": 0.29320597648620605, + "learning_rate": 1.4611872146118721e-06, + "loss": 0.002, + "step": 9511 + }, + { + "epoch": 8.68675799086758, + "grad_norm": 1.5210005044937134, + "learning_rate": 1.4601725012683918e-06, + "loss": 0.0093, + "step": 9512 + }, + { + "epoch": 8.687671232876712, + "grad_norm": 0.02691136859357357, + "learning_rate": 1.4591577879249113e-06, + "loss": 0.0001, + "step": 9513 + }, + { + "epoch": 8.688584474885845, + "grad_norm": 10.281847953796387, + "learning_rate": 1.458143074581431e-06, + "loss": 0.0616, + "step": 9514 + }, + { + "epoch": 8.689497716894977, + "grad_norm": 2.929680824279785, + "learning_rate": 1.4571283612379503e-06, + "loss": 0.0143, + "step": 9515 + }, + { + "epoch": 8.69041095890411, + "grad_norm": 0.3035878837108612, + "learning_rate": 1.45611364789447e-06, + "loss": 0.0019, + "step": 9516 + }, + { + "epoch": 8.691324200913241, + "grad_norm": 5.263148307800293, + "learning_rate": 1.4550989345509894e-06, + "loss": 0.0322, + "step": 9517 + }, + { + "epoch": 8.692237442922375, + "grad_norm": 0.5934036374092102, + "learning_rate": 1.4540842212075091e-06, + "loss": 0.0019, + "step": 9518 + }, + { + "epoch": 8.693150684931506, + "grad_norm": 0.47737887501716614, + "learning_rate": 1.4530695078640286e-06, + "loss": 0.0027, + "step": 9519 + }, + { + "epoch": 8.69406392694064, + "grad_norm": 2.463771104812622, + "learning_rate": 1.4520547945205479e-06, + "loss": 0.0137, + "step": 9520 + }, + { + "epoch": 8.694977168949771, + "grad_norm": 0.028428833931684494, + "learning_rate": 1.4510400811770676e-06, + "loss": 0.0001, + "step": 9521 + }, + { + "epoch": 8.695890410958905, + "grad_norm": 6.7577223777771, + "learning_rate": 1.450025367833587e-06, + "loss": 0.0607, + "step": 9522 + }, + { + "epoch": 8.696803652968036, + "grad_norm": 0.028268365189433098, + "learning_rate": 1.4490106544901067e-06, + "loss": 0.0002, + "step": 9523 + }, + { + "epoch": 8.69771689497717, + "grad_norm": 4.615580081939697, + "learning_rate": 1.4479959411466262e-06, + "loss": 0.0232, + "step": 9524 + }, + { + "epoch": 8.698630136986301, + "grad_norm": 0.34782204031944275, + "learning_rate": 1.4469812278031457e-06, + "loss": 0.0017, + "step": 9525 + }, + { + "epoch": 8.699543378995434, + "grad_norm": 0.908362865447998, + "learning_rate": 1.4459665144596652e-06, + "loss": 0.0045, + "step": 9526 + }, + { + "epoch": 8.700456621004566, + "grad_norm": 1.0475178956985474, + "learning_rate": 1.4449518011161849e-06, + "loss": 0.0058, + "step": 9527 + }, + { + "epoch": 8.7013698630137, + "grad_norm": 0.3742212653160095, + "learning_rate": 1.4439370877727043e-06, + "loss": 0.0022, + "step": 9528 + }, + { + "epoch": 8.70228310502283, + "grad_norm": 61.512168884277344, + "learning_rate": 1.442922374429224e-06, + "loss": 0.8859, + "step": 9529 + }, + { + "epoch": 8.703196347031964, + "grad_norm": 0.41842713952064514, + "learning_rate": 1.4419076610857433e-06, + "loss": 0.0026, + "step": 9530 + }, + { + "epoch": 8.704109589041096, + "grad_norm": 0.2870216369628906, + "learning_rate": 1.4408929477422628e-06, + "loss": 0.0016, + "step": 9531 + }, + { + "epoch": 8.70502283105023, + "grad_norm": 0.874517023563385, + "learning_rate": 1.4398782343987825e-06, + "loss": 0.0047, + "step": 9532 + }, + { + "epoch": 8.70593607305936, + "grad_norm": 54.082252502441406, + "learning_rate": 1.438863521055302e-06, + "loss": 0.3905, + "step": 9533 + }, + { + "epoch": 8.706849315068492, + "grad_norm": 0.16358551383018494, + "learning_rate": 1.4378488077118216e-06, + "loss": 0.0013, + "step": 9534 + }, + { + "epoch": 8.707762557077626, + "grad_norm": 0.894520103931427, + "learning_rate": 1.436834094368341e-06, + "loss": 0.0045, + "step": 9535 + }, + { + "epoch": 8.708675799086759, + "grad_norm": 15.36956787109375, + "learning_rate": 1.4358193810248606e-06, + "loss": 0.1196, + "step": 9536 + }, + { + "epoch": 8.70958904109589, + "grad_norm": 0.16555680334568024, + "learning_rate": 1.43480466768138e-06, + "loss": 0.001, + "step": 9537 + }, + { + "epoch": 8.710502283105022, + "grad_norm": 5.146872043609619, + "learning_rate": 1.4337899543378998e-06, + "loss": 0.0365, + "step": 9538 + }, + { + "epoch": 8.711415525114155, + "grad_norm": 38.986690521240234, + "learning_rate": 1.4327752409944193e-06, + "loss": 0.3311, + "step": 9539 + }, + { + "epoch": 8.712328767123287, + "grad_norm": 0.3242109715938568, + "learning_rate": 1.4317605276509385e-06, + "loss": 0.0022, + "step": 9540 + }, + { + "epoch": 8.71324200913242, + "grad_norm": 39.077125549316406, + "learning_rate": 1.4307458143074582e-06, + "loss": 0.2836, + "step": 9541 + }, + { + "epoch": 8.714155251141552, + "grad_norm": 129.15188598632812, + "learning_rate": 1.4297311009639777e-06, + "loss": 1.0139, + "step": 9542 + }, + { + "epoch": 8.715068493150685, + "grad_norm": 0.3837023973464966, + "learning_rate": 1.4287163876204974e-06, + "loss": 0.003, + "step": 9543 + }, + { + "epoch": 8.715981735159817, + "grad_norm": 0.10170982778072357, + "learning_rate": 1.4277016742770169e-06, + "loss": 0.0006, + "step": 9544 + }, + { + "epoch": 8.71689497716895, + "grad_norm": 2.839461326599121, + "learning_rate": 1.4266869609335364e-06, + "loss": 0.0226, + "step": 9545 + }, + { + "epoch": 8.717808219178082, + "grad_norm": 64.91011810302734, + "learning_rate": 1.4256722475900558e-06, + "loss": 0.0929, + "step": 9546 + }, + { + "epoch": 8.718721461187215, + "grad_norm": 4.272731781005859, + "learning_rate": 1.4246575342465755e-06, + "loss": 0.0334, + "step": 9547 + }, + { + "epoch": 8.719634703196347, + "grad_norm": 0.2625698745250702, + "learning_rate": 1.423642820903095e-06, + "loss": 0.0015, + "step": 9548 + }, + { + "epoch": 8.72054794520548, + "grad_norm": 25.22418975830078, + "learning_rate": 1.4226281075596147e-06, + "loss": 0.1582, + "step": 9549 + }, + { + "epoch": 8.721461187214611, + "grad_norm": 44.375892639160156, + "learning_rate": 1.421613394216134e-06, + "loss": 0.3547, + "step": 9550 + }, + { + "epoch": 8.722374429223745, + "grad_norm": 2.674849271774292, + "learning_rate": 1.4205986808726534e-06, + "loss": 0.0238, + "step": 9551 + }, + { + "epoch": 8.723287671232876, + "grad_norm": 0.06848989427089691, + "learning_rate": 1.4195839675291731e-06, + "loss": 0.0004, + "step": 9552 + }, + { + "epoch": 8.72420091324201, + "grad_norm": 0.09167347848415375, + "learning_rate": 1.4185692541856926e-06, + "loss": 0.0006, + "step": 9553 + }, + { + "epoch": 8.725114155251141, + "grad_norm": 0.010615888983011246, + "learning_rate": 1.4175545408422123e-06, + "loss": 0.0001, + "step": 9554 + }, + { + "epoch": 8.726027397260275, + "grad_norm": 6.105497360229492, + "learning_rate": 1.4165398274987316e-06, + "loss": 0.0368, + "step": 9555 + }, + { + "epoch": 8.726940639269406, + "grad_norm": 9.600424766540527, + "learning_rate": 1.4155251141552513e-06, + "loss": 0.0926, + "step": 9556 + }, + { + "epoch": 8.72785388127854, + "grad_norm": 5.207786560058594, + "learning_rate": 1.4145104008117708e-06, + "loss": 0.0422, + "step": 9557 + }, + { + "epoch": 8.728767123287671, + "grad_norm": 18.118209838867188, + "learning_rate": 1.4134956874682904e-06, + "loss": 0.1301, + "step": 9558 + }, + { + "epoch": 8.729680365296804, + "grad_norm": 32.61836624145508, + "learning_rate": 1.41248097412481e-06, + "loss": 0.2246, + "step": 9559 + }, + { + "epoch": 8.730593607305936, + "grad_norm": 47.536563873291016, + "learning_rate": 1.4114662607813292e-06, + "loss": 0.3665, + "step": 9560 + }, + { + "epoch": 8.731506849315068, + "grad_norm": 5.678260803222656, + "learning_rate": 1.4104515474378489e-06, + "loss": 0.0407, + "step": 9561 + }, + { + "epoch": 8.732420091324201, + "grad_norm": 16.99492835998535, + "learning_rate": 1.4094368340943684e-06, + "loss": 0.1051, + "step": 9562 + }, + { + "epoch": 8.733333333333333, + "grad_norm": 28.547992706298828, + "learning_rate": 1.408422120750888e-06, + "loss": 0.3491, + "step": 9563 + }, + { + "epoch": 8.734246575342466, + "grad_norm": 0.46231865882873535, + "learning_rate": 1.4074074074074075e-06, + "loss": 0.0031, + "step": 9564 + }, + { + "epoch": 8.735159817351597, + "grad_norm": 0.1147090420126915, + "learning_rate": 1.406392694063927e-06, + "loss": 0.0005, + "step": 9565 + }, + { + "epoch": 8.73607305936073, + "grad_norm": 7.550386905670166, + "learning_rate": 1.4053779807204465e-06, + "loss": 0.0399, + "step": 9566 + }, + { + "epoch": 8.736986301369862, + "grad_norm": 16.240476608276367, + "learning_rate": 1.4043632673769662e-06, + "loss": 0.0998, + "step": 9567 + }, + { + "epoch": 8.737899543378996, + "grad_norm": 0.20963068306446075, + "learning_rate": 1.4033485540334857e-06, + "loss": 0.0015, + "step": 9568 + }, + { + "epoch": 8.738812785388127, + "grad_norm": 1.0866693258285522, + "learning_rate": 1.4023338406900054e-06, + "loss": 0.0051, + "step": 9569 + }, + { + "epoch": 8.73972602739726, + "grad_norm": 25.664344787597656, + "learning_rate": 1.4013191273465246e-06, + "loss": 0.1411, + "step": 9570 + }, + { + "epoch": 8.740639269406392, + "grad_norm": 0.460555762052536, + "learning_rate": 1.4003044140030441e-06, + "loss": 0.0037, + "step": 9571 + }, + { + "epoch": 8.741552511415525, + "grad_norm": 4.661508560180664, + "learning_rate": 1.3992897006595638e-06, + "loss": 0.0076, + "step": 9572 + }, + { + "epoch": 8.742465753424657, + "grad_norm": 0.018507294356822968, + "learning_rate": 1.3982749873160833e-06, + "loss": 0.0001, + "step": 9573 + }, + { + "epoch": 8.74337899543379, + "grad_norm": 22.790307998657227, + "learning_rate": 1.397260273972603e-06, + "loss": 0.1478, + "step": 9574 + }, + { + "epoch": 8.744292237442922, + "grad_norm": 0.09661862999200821, + "learning_rate": 1.3962455606291222e-06, + "loss": 0.0007, + "step": 9575 + }, + { + "epoch": 8.745205479452055, + "grad_norm": 0.378047376871109, + "learning_rate": 1.395230847285642e-06, + "loss": 0.0023, + "step": 9576 + }, + { + "epoch": 8.746118721461187, + "grad_norm": 1.9686893224716187, + "learning_rate": 1.3942161339421614e-06, + "loss": 0.0133, + "step": 9577 + }, + { + "epoch": 8.74703196347032, + "grad_norm": 0.16944238543510437, + "learning_rate": 1.3932014205986811e-06, + "loss": 0.0007, + "step": 9578 + }, + { + "epoch": 8.747945205479452, + "grad_norm": 0.17829665541648865, + "learning_rate": 1.3921867072552006e-06, + "loss": 0.0011, + "step": 9579 + }, + { + "epoch": 8.748858447488585, + "grad_norm": 15.9302978515625, + "learning_rate": 1.3911719939117199e-06, + "loss": 0.0871, + "step": 9580 + }, + { + "epoch": 8.749771689497717, + "grad_norm": 1.6166762113571167, + "learning_rate": 1.3901572805682396e-06, + "loss": 0.009, + "step": 9581 + }, + { + "epoch": 8.75068493150685, + "grad_norm": 1.8912147283554077, + "learning_rate": 1.389142567224759e-06, + "loss": 0.0145, + "step": 9582 + }, + { + "epoch": 8.751598173515982, + "grad_norm": 0.019366202875971794, + "learning_rate": 1.3881278538812787e-06, + "loss": 0.0001, + "step": 9583 + }, + { + "epoch": 8.752511415525115, + "grad_norm": 37.05235290527344, + "learning_rate": 1.3871131405377982e-06, + "loss": 0.2326, + "step": 9584 + }, + { + "epoch": 8.753424657534246, + "grad_norm": 6.172194957733154, + "learning_rate": 1.3860984271943177e-06, + "loss": 0.0172, + "step": 9585 + }, + { + "epoch": 8.75433789954338, + "grad_norm": 2.0131309032440186, + "learning_rate": 1.3850837138508372e-06, + "loss": 0.008, + "step": 9586 + }, + { + "epoch": 8.755251141552511, + "grad_norm": 2.279618263244629, + "learning_rate": 1.3840690005073569e-06, + "loss": 0.0163, + "step": 9587 + }, + { + "epoch": 8.756164383561643, + "grad_norm": 0.21464166045188904, + "learning_rate": 1.3830542871638763e-06, + "loss": 0.0016, + "step": 9588 + }, + { + "epoch": 8.757077625570776, + "grad_norm": 7.062154293060303, + "learning_rate": 1.382039573820396e-06, + "loss": 0.0455, + "step": 9589 + }, + { + "epoch": 8.757990867579908, + "grad_norm": 0.04862551763653755, + "learning_rate": 1.3810248604769153e-06, + "loss": 0.0003, + "step": 9590 + }, + { + "epoch": 8.758904109589041, + "grad_norm": 8.45777702331543, + "learning_rate": 1.3800101471334348e-06, + "loss": 0.0501, + "step": 9591 + }, + { + "epoch": 8.759817351598173, + "grad_norm": 0.7666011452674866, + "learning_rate": 1.3789954337899545e-06, + "loss": 0.0056, + "step": 9592 + }, + { + "epoch": 8.760730593607306, + "grad_norm": 41.818363189697266, + "learning_rate": 1.377980720446474e-06, + "loss": 0.3075, + "step": 9593 + }, + { + "epoch": 8.761643835616438, + "grad_norm": 4.122413635253906, + "learning_rate": 1.3769660071029936e-06, + "loss": 0.024, + "step": 9594 + }, + { + "epoch": 8.762557077625571, + "grad_norm": 5.379820346832275, + "learning_rate": 1.375951293759513e-06, + "loss": 0.0226, + "step": 9595 + }, + { + "epoch": 8.763470319634703, + "grad_norm": 3.449350595474243, + "learning_rate": 1.3749365804160326e-06, + "loss": 0.0216, + "step": 9596 + }, + { + "epoch": 8.764383561643836, + "grad_norm": 4.746633052825928, + "learning_rate": 1.373921867072552e-06, + "loss": 0.0402, + "step": 9597 + }, + { + "epoch": 8.765296803652967, + "grad_norm": 0.10630057752132416, + "learning_rate": 1.3729071537290718e-06, + "loss": 0.0008, + "step": 9598 + }, + { + "epoch": 8.7662100456621, + "grad_norm": 0.46559908986091614, + "learning_rate": 1.3718924403855913e-06, + "loss": 0.0029, + "step": 9599 + }, + { + "epoch": 8.767123287671232, + "grad_norm": 0.09321696311235428, + "learning_rate": 1.3708777270421105e-06, + "loss": 0.0007, + "step": 9600 + }, + { + "epoch": 8.768036529680366, + "grad_norm": 0.009164039045572281, + "learning_rate": 1.3698630136986302e-06, + "loss": 0.0001, + "step": 9601 + }, + { + "epoch": 8.768949771689497, + "grad_norm": 1.5405687093734741, + "learning_rate": 1.3688483003551497e-06, + "loss": 0.006, + "step": 9602 + }, + { + "epoch": 8.76986301369863, + "grad_norm": 0.7536260485649109, + "learning_rate": 1.3678335870116694e-06, + "loss": 0.0063, + "step": 9603 + }, + { + "epoch": 8.770776255707762, + "grad_norm": 0.2763906419277191, + "learning_rate": 1.3668188736681889e-06, + "loss": 0.0015, + "step": 9604 + }, + { + "epoch": 8.771689497716896, + "grad_norm": 1.0761034488677979, + "learning_rate": 1.3658041603247084e-06, + "loss": 0.0055, + "step": 9605 + }, + { + "epoch": 8.772602739726027, + "grad_norm": 1.04498291015625, + "learning_rate": 1.3647894469812278e-06, + "loss": 0.0082, + "step": 9606 + }, + { + "epoch": 8.77351598173516, + "grad_norm": 0.9481047987937927, + "learning_rate": 1.3637747336377475e-06, + "loss": 0.0042, + "step": 9607 + }, + { + "epoch": 8.774429223744292, + "grad_norm": 0.06413771957159042, + "learning_rate": 1.362760020294267e-06, + "loss": 0.0004, + "step": 9608 + }, + { + "epoch": 8.775342465753425, + "grad_norm": 1.0862728357315063, + "learning_rate": 1.3617453069507867e-06, + "loss": 0.0093, + "step": 9609 + }, + { + "epoch": 8.776255707762557, + "grad_norm": 1.0771098136901855, + "learning_rate": 1.360730593607306e-06, + "loss": 0.0062, + "step": 9610 + }, + { + "epoch": 8.77716894977169, + "grad_norm": 0.09839016199111938, + "learning_rate": 1.3597158802638254e-06, + "loss": 0.0006, + "step": 9611 + }, + { + "epoch": 8.778082191780822, + "grad_norm": 6.718138694763184, + "learning_rate": 1.3587011669203451e-06, + "loss": 0.0355, + "step": 9612 + }, + { + "epoch": 8.778995433789955, + "grad_norm": 0.057602059096097946, + "learning_rate": 1.3576864535768646e-06, + "loss": 0.0004, + "step": 9613 + }, + { + "epoch": 8.779908675799087, + "grad_norm": 3.3341128826141357, + "learning_rate": 1.3566717402333843e-06, + "loss": 0.0223, + "step": 9614 + }, + { + "epoch": 8.780821917808218, + "grad_norm": 26.409896850585938, + "learning_rate": 1.3556570268899036e-06, + "loss": 0.2651, + "step": 9615 + }, + { + "epoch": 8.781735159817352, + "grad_norm": 0.16937772929668427, + "learning_rate": 1.3546423135464233e-06, + "loss": 0.0008, + "step": 9616 + }, + { + "epoch": 8.782648401826483, + "grad_norm": 1.798617959022522, + "learning_rate": 1.3536276002029427e-06, + "loss": 0.0119, + "step": 9617 + }, + { + "epoch": 8.783561643835617, + "grad_norm": 0.4294552505016327, + "learning_rate": 1.3526128868594624e-06, + "loss": 0.0021, + "step": 9618 + }, + { + "epoch": 8.784474885844748, + "grad_norm": 0.09407724440097809, + "learning_rate": 1.351598173515982e-06, + "loss": 0.0006, + "step": 9619 + }, + { + "epoch": 8.785388127853881, + "grad_norm": 1.406702995300293, + "learning_rate": 1.3505834601725012e-06, + "loss": 0.0097, + "step": 9620 + }, + { + "epoch": 8.786301369863013, + "grad_norm": 0.13140763342380524, + "learning_rate": 1.3495687468290209e-06, + "loss": 0.0009, + "step": 9621 + }, + { + "epoch": 8.787214611872146, + "grad_norm": 0.03850812092423439, + "learning_rate": 1.3485540334855404e-06, + "loss": 0.0002, + "step": 9622 + }, + { + "epoch": 8.788127853881278, + "grad_norm": 5.886261463165283, + "learning_rate": 1.34753932014206e-06, + "loss": 0.0167, + "step": 9623 + }, + { + "epoch": 8.789041095890411, + "grad_norm": 2.311796188354492, + "learning_rate": 1.3465246067985795e-06, + "loss": 0.0155, + "step": 9624 + }, + { + "epoch": 8.789954337899543, + "grad_norm": 0.014511064626276493, + "learning_rate": 1.345509893455099e-06, + "loss": 0.0001, + "step": 9625 + }, + { + "epoch": 8.790867579908676, + "grad_norm": 0.42117658257484436, + "learning_rate": 1.3444951801116185e-06, + "loss": 0.0024, + "step": 9626 + }, + { + "epoch": 8.791780821917808, + "grad_norm": 0.8941344618797302, + "learning_rate": 1.3434804667681382e-06, + "loss": 0.0063, + "step": 9627 + }, + { + "epoch": 8.792694063926941, + "grad_norm": 0.4548048973083496, + "learning_rate": 1.3424657534246577e-06, + "loss": 0.0028, + "step": 9628 + }, + { + "epoch": 8.793607305936073, + "grad_norm": 7.9252519607543945, + "learning_rate": 1.3414510400811774e-06, + "loss": 0.0508, + "step": 9629 + }, + { + "epoch": 8.794520547945206, + "grad_norm": 1.8392488956451416, + "learning_rate": 1.3404363267376966e-06, + "loss": 0.0088, + "step": 9630 + }, + { + "epoch": 8.795433789954338, + "grad_norm": 5.389313220977783, + "learning_rate": 1.3394216133942161e-06, + "loss": 0.0276, + "step": 9631 + }, + { + "epoch": 8.796347031963471, + "grad_norm": 0.3775031864643097, + "learning_rate": 1.3384069000507358e-06, + "loss": 0.0031, + "step": 9632 + }, + { + "epoch": 8.797260273972602, + "grad_norm": 0.771155059337616, + "learning_rate": 1.3373921867072553e-06, + "loss": 0.0033, + "step": 9633 + }, + { + "epoch": 8.798173515981736, + "grad_norm": 1.734588384628296, + "learning_rate": 1.336377473363775e-06, + "loss": 0.0094, + "step": 9634 + }, + { + "epoch": 8.799086757990867, + "grad_norm": 5.691331386566162, + "learning_rate": 1.3353627600202942e-06, + "loss": 0.0232, + "step": 9635 + }, + { + "epoch": 8.8, + "grad_norm": 0.3252505958080292, + "learning_rate": 1.334348046676814e-06, + "loss": 0.0022, + "step": 9636 + }, + { + "epoch": 8.800913242009132, + "grad_norm": 21.819339752197266, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.1525, + "step": 9637 + }, + { + "epoch": 8.801826484018266, + "grad_norm": 0.8852002024650574, + "learning_rate": 1.3323186199898531e-06, + "loss": 0.005, + "step": 9638 + }, + { + "epoch": 8.802739726027397, + "grad_norm": 37.443359375, + "learning_rate": 1.3313039066463726e-06, + "loss": 0.3572, + "step": 9639 + }, + { + "epoch": 8.80365296803653, + "grad_norm": 0.5835970640182495, + "learning_rate": 1.3302891933028919e-06, + "loss": 0.0049, + "step": 9640 + }, + { + "epoch": 8.804566210045662, + "grad_norm": 10.109201431274414, + "learning_rate": 1.3292744799594115e-06, + "loss": 0.0556, + "step": 9641 + }, + { + "epoch": 8.805479452054794, + "grad_norm": 11.970658302307129, + "learning_rate": 1.328259766615931e-06, + "loss": 0.1475, + "step": 9642 + }, + { + "epoch": 8.806392694063927, + "grad_norm": 4.470813751220703, + "learning_rate": 1.3272450532724507e-06, + "loss": 0.0388, + "step": 9643 + }, + { + "epoch": 8.807305936073059, + "grad_norm": 0.06456788629293442, + "learning_rate": 1.3262303399289702e-06, + "loss": 0.0005, + "step": 9644 + }, + { + "epoch": 8.808219178082192, + "grad_norm": 6.023353576660156, + "learning_rate": 1.3252156265854897e-06, + "loss": 0.0366, + "step": 9645 + }, + { + "epoch": 8.809132420091323, + "grad_norm": 0.2625364065170288, + "learning_rate": 1.3242009132420092e-06, + "loss": 0.0016, + "step": 9646 + }, + { + "epoch": 8.810045662100457, + "grad_norm": 7.101990699768066, + "learning_rate": 1.3231861998985289e-06, + "loss": 0.0603, + "step": 9647 + }, + { + "epoch": 8.810958904109588, + "grad_norm": 69.9875717163086, + "learning_rate": 1.3221714865550483e-06, + "loss": 0.9031, + "step": 9648 + }, + { + "epoch": 8.811872146118722, + "grad_norm": 1.4312570095062256, + "learning_rate": 1.321156773211568e-06, + "loss": 0.013, + "step": 9649 + }, + { + "epoch": 8.812785388127853, + "grad_norm": 1.4416638612747192, + "learning_rate": 1.3201420598680873e-06, + "loss": 0.0107, + "step": 9650 + }, + { + "epoch": 8.813698630136987, + "grad_norm": 0.394681841135025, + "learning_rate": 1.3191273465246068e-06, + "loss": 0.0017, + "step": 9651 + }, + { + "epoch": 8.814611872146118, + "grad_norm": 2.1500701904296875, + "learning_rate": 1.3181126331811265e-06, + "loss": 0.0095, + "step": 9652 + }, + { + "epoch": 8.815525114155252, + "grad_norm": 6.853540897369385, + "learning_rate": 1.317097919837646e-06, + "loss": 0.035, + "step": 9653 + }, + { + "epoch": 8.816438356164383, + "grad_norm": 108.15519714355469, + "learning_rate": 1.3160832064941656e-06, + "loss": 4.5189, + "step": 9654 + }, + { + "epoch": 8.817351598173516, + "grad_norm": 14.361184120178223, + "learning_rate": 1.315068493150685e-06, + "loss": 0.1151, + "step": 9655 + }, + { + "epoch": 8.818264840182648, + "grad_norm": 50.75281524658203, + "learning_rate": 1.3140537798072046e-06, + "loss": 0.4573, + "step": 9656 + }, + { + "epoch": 8.819178082191781, + "grad_norm": 45.683956146240234, + "learning_rate": 1.313039066463724e-06, + "loss": 0.4134, + "step": 9657 + }, + { + "epoch": 8.820091324200913, + "grad_norm": 0.16212143003940582, + "learning_rate": 1.3120243531202438e-06, + "loss": 0.0011, + "step": 9658 + }, + { + "epoch": 8.821004566210046, + "grad_norm": 44.52375030517578, + "learning_rate": 1.3110096397767633e-06, + "loss": 0.4791, + "step": 9659 + }, + { + "epoch": 8.821917808219178, + "grad_norm": 8.218385696411133, + "learning_rate": 1.3099949264332825e-06, + "loss": 0.0341, + "step": 9660 + }, + { + "epoch": 8.822831050228311, + "grad_norm": 0.15943053364753723, + "learning_rate": 1.3089802130898022e-06, + "loss": 0.0011, + "step": 9661 + }, + { + "epoch": 8.823744292237443, + "grad_norm": 83.9933090209961, + "learning_rate": 1.3079654997463217e-06, + "loss": 0.7603, + "step": 9662 + }, + { + "epoch": 8.824657534246576, + "grad_norm": 40.94196319580078, + "learning_rate": 1.3069507864028414e-06, + "loss": 0.2815, + "step": 9663 + }, + { + "epoch": 8.825570776255708, + "grad_norm": 0.10437360405921936, + "learning_rate": 1.3059360730593609e-06, + "loss": 0.0005, + "step": 9664 + }, + { + "epoch": 8.826484018264841, + "grad_norm": 15.189714431762695, + "learning_rate": 1.3049213597158803e-06, + "loss": 0.0769, + "step": 9665 + }, + { + "epoch": 8.827397260273973, + "grad_norm": 6.7800822257995605, + "learning_rate": 1.3039066463723998e-06, + "loss": 0.0461, + "step": 9666 + }, + { + "epoch": 8.828310502283106, + "grad_norm": 2.1182944774627686, + "learning_rate": 1.3028919330289195e-06, + "loss": 0.0154, + "step": 9667 + }, + { + "epoch": 8.829223744292237, + "grad_norm": 33.698280334472656, + "learning_rate": 1.301877219685439e-06, + "loss": 0.2161, + "step": 9668 + }, + { + "epoch": 8.830136986301369, + "grad_norm": 4.061429977416992, + "learning_rate": 1.3008625063419587e-06, + "loss": 0.0247, + "step": 9669 + }, + { + "epoch": 8.831050228310502, + "grad_norm": 1.7561562061309814, + "learning_rate": 1.299847792998478e-06, + "loss": 0.0096, + "step": 9670 + }, + { + "epoch": 8.831963470319634, + "grad_norm": 5.192687511444092, + "learning_rate": 1.2988330796549974e-06, + "loss": 0.0242, + "step": 9671 + }, + { + "epoch": 8.832876712328767, + "grad_norm": 0.042209692299366, + "learning_rate": 1.2978183663115171e-06, + "loss": 0.0003, + "step": 9672 + }, + { + "epoch": 8.833789954337899, + "grad_norm": 0.35866352915763855, + "learning_rate": 1.2968036529680366e-06, + "loss": 0.0024, + "step": 9673 + }, + { + "epoch": 8.834703196347032, + "grad_norm": 34.39210510253906, + "learning_rate": 1.2957889396245563e-06, + "loss": 0.2309, + "step": 9674 + }, + { + "epoch": 8.835616438356164, + "grad_norm": 0.23198002576828003, + "learning_rate": 1.2947742262810756e-06, + "loss": 0.0011, + "step": 9675 + }, + { + "epoch": 8.836529680365297, + "grad_norm": 0.13834162056446075, + "learning_rate": 1.2937595129375953e-06, + "loss": 0.0009, + "step": 9676 + }, + { + "epoch": 8.837442922374429, + "grad_norm": 0.3261282742023468, + "learning_rate": 1.2927447995941147e-06, + "loss": 0.0014, + "step": 9677 + }, + { + "epoch": 8.838356164383562, + "grad_norm": 3.9392287731170654, + "learning_rate": 1.2917300862506344e-06, + "loss": 0.0205, + "step": 9678 + }, + { + "epoch": 8.839269406392694, + "grad_norm": 27.443416595458984, + "learning_rate": 1.290715372907154e-06, + "loss": 0.0808, + "step": 9679 + }, + { + "epoch": 8.840182648401827, + "grad_norm": 0.20466217398643494, + "learning_rate": 1.2897006595636732e-06, + "loss": 0.001, + "step": 9680 + }, + { + "epoch": 8.841095890410958, + "grad_norm": 3.4133195877075195, + "learning_rate": 1.2886859462201929e-06, + "loss": 0.0245, + "step": 9681 + }, + { + "epoch": 8.842009132420092, + "grad_norm": 0.003875209717079997, + "learning_rate": 1.2876712328767124e-06, + "loss": 0.0, + "step": 9682 + }, + { + "epoch": 8.842922374429223, + "grad_norm": 31.01601219177246, + "learning_rate": 1.286656519533232e-06, + "loss": 0.1864, + "step": 9683 + }, + { + "epoch": 8.843835616438357, + "grad_norm": 0.8882082104682922, + "learning_rate": 1.2856418061897515e-06, + "loss": 0.0055, + "step": 9684 + }, + { + "epoch": 8.844748858447488, + "grad_norm": 3.6585633754730225, + "learning_rate": 1.284627092846271e-06, + "loss": 0.0298, + "step": 9685 + }, + { + "epoch": 8.845662100456622, + "grad_norm": 7.874732971191406, + "learning_rate": 1.2836123795027905e-06, + "loss": 0.0592, + "step": 9686 + }, + { + "epoch": 8.846575342465753, + "grad_norm": 31.06962776184082, + "learning_rate": 1.2825976661593102e-06, + "loss": 0.2523, + "step": 9687 + }, + { + "epoch": 8.847488584474887, + "grad_norm": 5.666117191314697, + "learning_rate": 1.2815829528158297e-06, + "loss": 0.0325, + "step": 9688 + }, + { + "epoch": 8.848401826484018, + "grad_norm": 0.04653865098953247, + "learning_rate": 1.2805682394723494e-06, + "loss": 0.0003, + "step": 9689 + }, + { + "epoch": 8.849315068493151, + "grad_norm": 0.45562559366226196, + "learning_rate": 1.2795535261288686e-06, + "loss": 0.0035, + "step": 9690 + }, + { + "epoch": 8.850228310502283, + "grad_norm": 12.1483154296875, + "learning_rate": 1.278538812785388e-06, + "loss": 0.1012, + "step": 9691 + }, + { + "epoch": 8.851141552511416, + "grad_norm": 0.6340387463569641, + "learning_rate": 1.2775240994419078e-06, + "loss": 0.004, + "step": 9692 + }, + { + "epoch": 8.852054794520548, + "grad_norm": 5.889936447143555, + "learning_rate": 1.2765093860984273e-06, + "loss": 0.0356, + "step": 9693 + }, + { + "epoch": 8.852968036529681, + "grad_norm": 10.251195907592773, + "learning_rate": 1.275494672754947e-06, + "loss": 0.072, + "step": 9694 + }, + { + "epoch": 8.853881278538813, + "grad_norm": 0.1695934683084488, + "learning_rate": 1.2744799594114662e-06, + "loss": 0.0008, + "step": 9695 + }, + { + "epoch": 8.854794520547944, + "grad_norm": 3.726986885070801, + "learning_rate": 1.273465246067986e-06, + "loss": 0.0217, + "step": 9696 + }, + { + "epoch": 8.855707762557078, + "grad_norm": 2.105914354324341, + "learning_rate": 1.2724505327245054e-06, + "loss": 0.0123, + "step": 9697 + }, + { + "epoch": 8.85662100456621, + "grad_norm": 0.35216835141181946, + "learning_rate": 1.271435819381025e-06, + "loss": 0.0015, + "step": 9698 + }, + { + "epoch": 8.857534246575343, + "grad_norm": 99.24079132080078, + "learning_rate": 1.2704211060375446e-06, + "loss": 1.6805, + "step": 9699 + }, + { + "epoch": 8.858447488584474, + "grad_norm": 0.8469319343566895, + "learning_rate": 1.2694063926940639e-06, + "loss": 0.0036, + "step": 9700 + }, + { + "epoch": 8.859360730593608, + "grad_norm": 0.8421669602394104, + "learning_rate": 1.2683916793505835e-06, + "loss": 0.007, + "step": 9701 + }, + { + "epoch": 8.860273972602739, + "grad_norm": 3.151927947998047, + "learning_rate": 1.267376966007103e-06, + "loss": 0.0154, + "step": 9702 + }, + { + "epoch": 8.861187214611872, + "grad_norm": 0.48564037680625916, + "learning_rate": 1.2663622526636227e-06, + "loss": 0.0027, + "step": 9703 + }, + { + "epoch": 8.862100456621004, + "grad_norm": 17.831403732299805, + "learning_rate": 1.2653475393201422e-06, + "loss": 0.1621, + "step": 9704 + }, + { + "epoch": 8.863013698630137, + "grad_norm": 5.6795973777771, + "learning_rate": 1.2643328259766617e-06, + "loss": 0.0226, + "step": 9705 + }, + { + "epoch": 8.863926940639269, + "grad_norm": 2.2440531253814697, + "learning_rate": 1.2633181126331812e-06, + "loss": 0.0137, + "step": 9706 + }, + { + "epoch": 8.864840182648402, + "grad_norm": 50.14794158935547, + "learning_rate": 1.2623033992897008e-06, + "loss": 0.5344, + "step": 9707 + }, + { + "epoch": 8.865753424657534, + "grad_norm": 2.3883347511291504, + "learning_rate": 1.2612886859462203e-06, + "loss": 0.0105, + "step": 9708 + }, + { + "epoch": 8.866666666666667, + "grad_norm": 0.5018916726112366, + "learning_rate": 1.26027397260274e-06, + "loss": 0.0037, + "step": 9709 + }, + { + "epoch": 8.867579908675799, + "grad_norm": 0.16730427742004395, + "learning_rate": 1.2592592592592593e-06, + "loss": 0.0009, + "step": 9710 + }, + { + "epoch": 8.868493150684932, + "grad_norm": 8.59871768951416, + "learning_rate": 1.2582445459157788e-06, + "loss": 0.0614, + "step": 9711 + }, + { + "epoch": 8.869406392694064, + "grad_norm": 38.18241500854492, + "learning_rate": 1.2572298325722985e-06, + "loss": 0.2214, + "step": 9712 + }, + { + "epoch": 8.870319634703197, + "grad_norm": 1.4219568967819214, + "learning_rate": 1.256215119228818e-06, + "loss": 0.0053, + "step": 9713 + }, + { + "epoch": 8.871232876712329, + "grad_norm": 1.4882301092147827, + "learning_rate": 1.2552004058853376e-06, + "loss": 0.0067, + "step": 9714 + }, + { + "epoch": 8.872146118721462, + "grad_norm": 4.961648464202881, + "learning_rate": 1.254185692541857e-06, + "loss": 0.0313, + "step": 9715 + }, + { + "epoch": 8.873059360730593, + "grad_norm": 1.7760322093963623, + "learning_rate": 1.2531709791983766e-06, + "loss": 0.0089, + "step": 9716 + }, + { + "epoch": 8.873972602739727, + "grad_norm": 6.733945369720459, + "learning_rate": 1.252156265854896e-06, + "loss": 0.0317, + "step": 9717 + }, + { + "epoch": 8.874885844748858, + "grad_norm": 3.3342862129211426, + "learning_rate": 1.2511415525114158e-06, + "loss": 0.0197, + "step": 9718 + }, + { + "epoch": 8.875799086757992, + "grad_norm": 0.11563169956207275, + "learning_rate": 1.2501268391679352e-06, + "loss": 0.0007, + "step": 9719 + }, + { + "epoch": 8.876712328767123, + "grad_norm": 18.739639282226562, + "learning_rate": 1.2491121258244547e-06, + "loss": 0.2229, + "step": 9720 + }, + { + "epoch": 8.877625570776257, + "grad_norm": 0.2055639624595642, + "learning_rate": 1.2480974124809742e-06, + "loss": 0.0012, + "step": 9721 + }, + { + "epoch": 8.878538812785388, + "grad_norm": 10.792848587036133, + "learning_rate": 1.2470826991374937e-06, + "loss": 0.0497, + "step": 9722 + }, + { + "epoch": 8.87945205479452, + "grad_norm": 13.73298168182373, + "learning_rate": 1.2460679857940134e-06, + "loss": 0.0886, + "step": 9723 + }, + { + "epoch": 8.880365296803653, + "grad_norm": 0.0151553088799119, + "learning_rate": 1.2450532724505329e-06, + "loss": 0.0001, + "step": 9724 + }, + { + "epoch": 8.881278538812785, + "grad_norm": 0.05356855317950249, + "learning_rate": 1.2440385591070523e-06, + "loss": 0.0003, + "step": 9725 + }, + { + "epoch": 8.882191780821918, + "grad_norm": 13.462475776672363, + "learning_rate": 1.2430238457635718e-06, + "loss": 0.0566, + "step": 9726 + }, + { + "epoch": 8.88310502283105, + "grad_norm": 1.7093535661697388, + "learning_rate": 1.2420091324200915e-06, + "loss": 0.0109, + "step": 9727 + }, + { + "epoch": 8.884018264840183, + "grad_norm": 0.9973182082176208, + "learning_rate": 1.240994419076611e-06, + "loss": 0.0067, + "step": 9728 + }, + { + "epoch": 8.884931506849314, + "grad_norm": 29.295122146606445, + "learning_rate": 1.2399797057331305e-06, + "loss": 0.2994, + "step": 9729 + }, + { + "epoch": 8.885844748858448, + "grad_norm": 9.575739860534668, + "learning_rate": 1.23896499238965e-06, + "loss": 0.0599, + "step": 9730 + }, + { + "epoch": 8.88675799086758, + "grad_norm": 2.9654359817504883, + "learning_rate": 1.2379502790461694e-06, + "loss": 0.0128, + "step": 9731 + }, + { + "epoch": 8.887671232876713, + "grad_norm": 19.820531845092773, + "learning_rate": 1.2369355657026891e-06, + "loss": 0.1259, + "step": 9732 + }, + { + "epoch": 8.888584474885844, + "grad_norm": 1.541243314743042, + "learning_rate": 1.2359208523592086e-06, + "loss": 0.0087, + "step": 9733 + }, + { + "epoch": 8.889497716894978, + "grad_norm": 6.5399861335754395, + "learning_rate": 1.234906139015728e-06, + "loss": 0.0344, + "step": 9734 + }, + { + "epoch": 8.89041095890411, + "grad_norm": 8.509038925170898, + "learning_rate": 1.2338914256722478e-06, + "loss": 0.0357, + "step": 9735 + }, + { + "epoch": 8.891324200913242, + "grad_norm": 14.549285888671875, + "learning_rate": 1.2328767123287673e-06, + "loss": 0.0861, + "step": 9736 + }, + { + "epoch": 8.892237442922374, + "grad_norm": 10.111226081848145, + "learning_rate": 1.2318619989852867e-06, + "loss": 0.0648, + "step": 9737 + }, + { + "epoch": 8.893150684931507, + "grad_norm": 12.647592544555664, + "learning_rate": 1.2308472856418064e-06, + "loss": 0.0596, + "step": 9738 + }, + { + "epoch": 8.894063926940639, + "grad_norm": 0.3576829433441162, + "learning_rate": 1.2298325722983257e-06, + "loss": 0.0021, + "step": 9739 + }, + { + "epoch": 8.894977168949772, + "grad_norm": 0.3903171122074127, + "learning_rate": 1.2288178589548454e-06, + "loss": 0.0019, + "step": 9740 + }, + { + "epoch": 8.895890410958904, + "grad_norm": 0.018676556646823883, + "learning_rate": 1.2278031456113649e-06, + "loss": 0.0001, + "step": 9741 + }, + { + "epoch": 8.896803652968037, + "grad_norm": 0.6682994961738586, + "learning_rate": 1.2267884322678844e-06, + "loss": 0.0041, + "step": 9742 + }, + { + "epoch": 8.897716894977169, + "grad_norm": 0.04348771646618843, + "learning_rate": 1.225773718924404e-06, + "loss": 0.0003, + "step": 9743 + }, + { + "epoch": 8.898630136986302, + "grad_norm": 0.6126535534858704, + "learning_rate": 1.2247590055809235e-06, + "loss": 0.0043, + "step": 9744 + }, + { + "epoch": 8.899543378995434, + "grad_norm": 21.745187759399414, + "learning_rate": 1.223744292237443e-06, + "loss": 0.1389, + "step": 9745 + }, + { + "epoch": 8.900456621004567, + "grad_norm": 2.5364394187927246, + "learning_rate": 1.2227295788939625e-06, + "loss": 0.0121, + "step": 9746 + }, + { + "epoch": 8.901369863013699, + "grad_norm": 0.030941003933548927, + "learning_rate": 1.2217148655504822e-06, + "loss": 0.0002, + "step": 9747 + }, + { + "epoch": 8.902283105022832, + "grad_norm": 26.996417999267578, + "learning_rate": 1.2207001522070017e-06, + "loss": 0.1231, + "step": 9748 + }, + { + "epoch": 8.903196347031963, + "grad_norm": 3.104785919189453, + "learning_rate": 1.2196854388635211e-06, + "loss": 0.0197, + "step": 9749 + }, + { + "epoch": 8.904109589041095, + "grad_norm": 2.6606040000915527, + "learning_rate": 1.2186707255200406e-06, + "loss": 0.025, + "step": 9750 + }, + { + "epoch": 8.905022831050228, + "grad_norm": 122.41053771972656, + "learning_rate": 1.21765601217656e-06, + "loss": 4.6519, + "step": 9751 + }, + { + "epoch": 8.90593607305936, + "grad_norm": 0.041413813829422, + "learning_rate": 1.2166412988330798e-06, + "loss": 0.0002, + "step": 9752 + }, + { + "epoch": 8.906849315068493, + "grad_norm": 20.416990280151367, + "learning_rate": 1.2156265854895993e-06, + "loss": 0.0363, + "step": 9753 + }, + { + "epoch": 8.907762557077625, + "grad_norm": 4.507416725158691, + "learning_rate": 1.2146118721461188e-06, + "loss": 0.0201, + "step": 9754 + }, + { + "epoch": 8.908675799086758, + "grad_norm": 47.91225051879883, + "learning_rate": 1.2135971588026384e-06, + "loss": 0.3971, + "step": 9755 + }, + { + "epoch": 8.90958904109589, + "grad_norm": 0.9053398966789246, + "learning_rate": 1.212582445459158e-06, + "loss": 0.0052, + "step": 9756 + }, + { + "epoch": 8.910502283105023, + "grad_norm": 0.021591292694211006, + "learning_rate": 1.2115677321156774e-06, + "loss": 0.0001, + "step": 9757 + }, + { + "epoch": 8.911415525114155, + "grad_norm": 37.16183853149414, + "learning_rate": 1.210553018772197e-06, + "loss": 0.4881, + "step": 9758 + }, + { + "epoch": 8.912328767123288, + "grad_norm": 77.78848266601562, + "learning_rate": 1.2095383054287164e-06, + "loss": 0.7658, + "step": 9759 + }, + { + "epoch": 8.91324200913242, + "grad_norm": 0.6069559454917908, + "learning_rate": 1.208523592085236e-06, + "loss": 0.0045, + "step": 9760 + }, + { + "epoch": 8.914155251141553, + "grad_norm": 0.4102071225643158, + "learning_rate": 1.2075088787417555e-06, + "loss": 0.0021, + "step": 9761 + }, + { + "epoch": 8.915068493150685, + "grad_norm": 4.971374034881592, + "learning_rate": 1.206494165398275e-06, + "loss": 0.0189, + "step": 9762 + }, + { + "epoch": 8.915981735159818, + "grad_norm": 0.4894484579563141, + "learning_rate": 1.2054794520547947e-06, + "loss": 0.0033, + "step": 9763 + }, + { + "epoch": 8.91689497716895, + "grad_norm": 0.2401156723499298, + "learning_rate": 1.2044647387113142e-06, + "loss": 0.0011, + "step": 9764 + }, + { + "epoch": 8.917808219178083, + "grad_norm": 0.49514326453208923, + "learning_rate": 1.2034500253678337e-06, + "loss": 0.0041, + "step": 9765 + }, + { + "epoch": 8.918721461187214, + "grad_norm": 0.6015564799308777, + "learning_rate": 1.2024353120243532e-06, + "loss": 0.0047, + "step": 9766 + }, + { + "epoch": 8.919634703196348, + "grad_norm": 0.012053899466991425, + "learning_rate": 1.2014205986808728e-06, + "loss": 0.0001, + "step": 9767 + }, + { + "epoch": 8.92054794520548, + "grad_norm": 4.531834602355957, + "learning_rate": 1.2004058853373923e-06, + "loss": 0.0273, + "step": 9768 + }, + { + "epoch": 8.921461187214613, + "grad_norm": 0.6271222829818726, + "learning_rate": 1.1993911719939118e-06, + "loss": 0.0054, + "step": 9769 + }, + { + "epoch": 8.922374429223744, + "grad_norm": 0.24447190761566162, + "learning_rate": 1.1983764586504313e-06, + "loss": 0.0018, + "step": 9770 + }, + { + "epoch": 8.923287671232877, + "grad_norm": 1.0152002573013306, + "learning_rate": 1.1973617453069508e-06, + "loss": 0.0034, + "step": 9771 + }, + { + "epoch": 8.924200913242009, + "grad_norm": 6.5715813636779785, + "learning_rate": 1.1963470319634705e-06, + "loss": 0.0317, + "step": 9772 + }, + { + "epoch": 8.925114155251142, + "grad_norm": 5.068697929382324, + "learning_rate": 1.19533231861999e-06, + "loss": 0.038, + "step": 9773 + }, + { + "epoch": 8.926027397260274, + "grad_norm": 0.3667300343513489, + "learning_rate": 1.1943176052765094e-06, + "loss": 0.0021, + "step": 9774 + }, + { + "epoch": 8.926940639269407, + "grad_norm": 12.686027526855469, + "learning_rate": 1.1933028919330291e-06, + "loss": 0.0881, + "step": 9775 + }, + { + "epoch": 8.927853881278539, + "grad_norm": 32.23162841796875, + "learning_rate": 1.1922881785895486e-06, + "loss": 0.3616, + "step": 9776 + }, + { + "epoch": 8.92876712328767, + "grad_norm": 1.165359616279602, + "learning_rate": 1.191273465246068e-06, + "loss": 0.0083, + "step": 9777 + }, + { + "epoch": 8.929680365296804, + "grad_norm": 19.033952713012695, + "learning_rate": 1.1902587519025878e-06, + "loss": 0.0738, + "step": 9778 + }, + { + "epoch": 8.930593607305935, + "grad_norm": 0.22463174164295197, + "learning_rate": 1.189244038559107e-06, + "loss": 0.0013, + "step": 9779 + }, + { + "epoch": 8.931506849315069, + "grad_norm": 8.596761703491211, + "learning_rate": 1.1882293252156267e-06, + "loss": 0.0649, + "step": 9780 + }, + { + "epoch": 8.9324200913242, + "grad_norm": 11.476851463317871, + "learning_rate": 1.1872146118721462e-06, + "loss": 0.0863, + "step": 9781 + }, + { + "epoch": 8.933333333333334, + "grad_norm": 5.3219499588012695, + "learning_rate": 1.1861998985286657e-06, + "loss": 0.0353, + "step": 9782 + }, + { + "epoch": 8.934246575342465, + "grad_norm": 15.008055686950684, + "learning_rate": 1.1851851851851854e-06, + "loss": 0.1026, + "step": 9783 + }, + { + "epoch": 8.935159817351598, + "grad_norm": 5.844153881072998, + "learning_rate": 1.1841704718417049e-06, + "loss": 0.0124, + "step": 9784 + }, + { + "epoch": 8.93607305936073, + "grad_norm": 12.052104949951172, + "learning_rate": 1.1831557584982243e-06, + "loss": 0.1416, + "step": 9785 + }, + { + "epoch": 8.936986301369863, + "grad_norm": 0.049013037234544754, + "learning_rate": 1.1821410451547438e-06, + "loss": 0.0004, + "step": 9786 + }, + { + "epoch": 8.937899543378995, + "grad_norm": 10.358060836791992, + "learning_rate": 1.1811263318112635e-06, + "loss": 0.0634, + "step": 9787 + }, + { + "epoch": 8.938812785388128, + "grad_norm": 0.11951693892478943, + "learning_rate": 1.180111618467783e-06, + "loss": 0.0005, + "step": 9788 + }, + { + "epoch": 8.93972602739726, + "grad_norm": 0.764905571937561, + "learning_rate": 1.1790969051243025e-06, + "loss": 0.0039, + "step": 9789 + }, + { + "epoch": 8.940639269406393, + "grad_norm": 3.540788412094116, + "learning_rate": 1.178082191780822e-06, + "loss": 0.0209, + "step": 9790 + }, + { + "epoch": 8.941552511415525, + "grad_norm": 0.33600106835365295, + "learning_rate": 1.1770674784373414e-06, + "loss": 0.002, + "step": 9791 + }, + { + "epoch": 8.942465753424658, + "grad_norm": 0.24867278337478638, + "learning_rate": 1.1760527650938611e-06, + "loss": 0.0014, + "step": 9792 + }, + { + "epoch": 8.94337899543379, + "grad_norm": 6.1189188957214355, + "learning_rate": 1.1750380517503806e-06, + "loss": 0.0288, + "step": 9793 + }, + { + "epoch": 8.944292237442923, + "grad_norm": 113.6630859375, + "learning_rate": 1.1740233384069e-06, + "loss": 1.6455, + "step": 9794 + }, + { + "epoch": 8.945205479452055, + "grad_norm": 1.8877054452896118, + "learning_rate": 1.1730086250634198e-06, + "loss": 0.0121, + "step": 9795 + }, + { + "epoch": 8.946118721461188, + "grad_norm": 0.2807902991771698, + "learning_rate": 1.171993911719939e-06, + "loss": 0.0023, + "step": 9796 + }, + { + "epoch": 8.94703196347032, + "grad_norm": 0.9268717169761658, + "learning_rate": 1.1709791983764587e-06, + "loss": 0.0068, + "step": 9797 + }, + { + "epoch": 8.947945205479453, + "grad_norm": 0.7625171542167664, + "learning_rate": 1.1699644850329784e-06, + "loss": 0.0042, + "step": 9798 + }, + { + "epoch": 8.948858447488584, + "grad_norm": 1.575685739517212, + "learning_rate": 1.1689497716894977e-06, + "loss": 0.0112, + "step": 9799 + }, + { + "epoch": 8.949771689497716, + "grad_norm": 12.554295539855957, + "learning_rate": 1.1679350583460174e-06, + "loss": 0.0766, + "step": 9800 + }, + { + "epoch": 8.95068493150685, + "grad_norm": 8.588421821594238, + "learning_rate": 1.1669203450025369e-06, + "loss": 0.0479, + "step": 9801 + }, + { + "epoch": 8.951598173515983, + "grad_norm": 4.262197017669678, + "learning_rate": 1.1659056316590563e-06, + "loss": 0.0251, + "step": 9802 + }, + { + "epoch": 8.952511415525114, + "grad_norm": 0.8330851793289185, + "learning_rate": 1.164890918315576e-06, + "loss": 0.0052, + "step": 9803 + }, + { + "epoch": 8.953424657534246, + "grad_norm": 4.199349403381348, + "learning_rate": 1.1638762049720955e-06, + "loss": 0.0174, + "step": 9804 + }, + { + "epoch": 8.954337899543379, + "grad_norm": 1.236488938331604, + "learning_rate": 1.162861491628615e-06, + "loss": 0.008, + "step": 9805 + }, + { + "epoch": 8.95525114155251, + "grad_norm": 4.3064188957214355, + "learning_rate": 1.1618467782851345e-06, + "loss": 0.0243, + "step": 9806 + }, + { + "epoch": 8.956164383561644, + "grad_norm": 7.978588581085205, + "learning_rate": 1.1608320649416542e-06, + "loss": 0.0484, + "step": 9807 + }, + { + "epoch": 8.957077625570776, + "grad_norm": 0.40948083996772766, + "learning_rate": 1.1598173515981737e-06, + "loss": 0.0027, + "step": 9808 + }, + { + "epoch": 8.957990867579909, + "grad_norm": 1.6099737882614136, + "learning_rate": 1.1588026382546931e-06, + "loss": 0.0073, + "step": 9809 + }, + { + "epoch": 8.95890410958904, + "grad_norm": 6.952436923980713, + "learning_rate": 1.1577879249112126e-06, + "loss": 0.0599, + "step": 9810 + }, + { + "epoch": 8.959817351598174, + "grad_norm": 6.972729682922363, + "learning_rate": 1.156773211567732e-06, + "loss": 0.0372, + "step": 9811 + }, + { + "epoch": 8.960730593607305, + "grad_norm": 0.16908586025238037, + "learning_rate": 1.1557584982242518e-06, + "loss": 0.0008, + "step": 9812 + }, + { + "epoch": 8.961643835616439, + "grad_norm": 2.2244343757629395, + "learning_rate": 1.1547437848807713e-06, + "loss": 0.0148, + "step": 9813 + }, + { + "epoch": 8.96255707762557, + "grad_norm": 3.2171475887298584, + "learning_rate": 1.1537290715372907e-06, + "loss": 0.0168, + "step": 9814 + }, + { + "epoch": 8.963470319634704, + "grad_norm": 3.1108007431030273, + "learning_rate": 1.1527143581938104e-06, + "loss": 0.024, + "step": 9815 + }, + { + "epoch": 8.964383561643835, + "grad_norm": 1.3490240573883057, + "learning_rate": 1.1516996448503297e-06, + "loss": 0.0065, + "step": 9816 + }, + { + "epoch": 8.965296803652969, + "grad_norm": 0.1501314789056778, + "learning_rate": 1.1506849315068494e-06, + "loss": 0.0009, + "step": 9817 + }, + { + "epoch": 8.9662100456621, + "grad_norm": 18.633941650390625, + "learning_rate": 1.149670218163369e-06, + "loss": 0.1078, + "step": 9818 + }, + { + "epoch": 8.967123287671233, + "grad_norm": 27.869985580444336, + "learning_rate": 1.1486555048198884e-06, + "loss": 0.0971, + "step": 9819 + }, + { + "epoch": 8.968036529680365, + "grad_norm": 45.581851959228516, + "learning_rate": 1.147640791476408e-06, + "loss": 0.3445, + "step": 9820 + }, + { + "epoch": 8.968949771689498, + "grad_norm": 1.3221274614334106, + "learning_rate": 1.1466260781329275e-06, + "loss": 0.0071, + "step": 9821 + }, + { + "epoch": 8.96986301369863, + "grad_norm": 1.9933167695999146, + "learning_rate": 1.145611364789447e-06, + "loss": 0.0118, + "step": 9822 + }, + { + "epoch": 8.970776255707763, + "grad_norm": 3.5796291828155518, + "learning_rate": 1.1445966514459667e-06, + "loss": 0.0157, + "step": 9823 + }, + { + "epoch": 8.971689497716895, + "grad_norm": 6.478082656860352, + "learning_rate": 1.1435819381024862e-06, + "loss": 0.0357, + "step": 9824 + }, + { + "epoch": 8.972602739726028, + "grad_norm": 7.158123970031738, + "learning_rate": 1.1425672247590057e-06, + "loss": 0.0317, + "step": 9825 + }, + { + "epoch": 8.97351598173516, + "grad_norm": 8.505228996276855, + "learning_rate": 1.1415525114155251e-06, + "loss": 0.0485, + "step": 9826 + }, + { + "epoch": 8.974429223744291, + "grad_norm": 5.39371919631958, + "learning_rate": 1.1405377980720448e-06, + "loss": 0.0252, + "step": 9827 + }, + { + "epoch": 8.975342465753425, + "grad_norm": 2.4850552082061768, + "learning_rate": 1.1395230847285643e-06, + "loss": 0.0185, + "step": 9828 + }, + { + "epoch": 8.976255707762558, + "grad_norm": 8.318456649780273, + "learning_rate": 1.1385083713850838e-06, + "loss": 0.0384, + "step": 9829 + }, + { + "epoch": 8.97716894977169, + "grad_norm": 3.2937676906585693, + "learning_rate": 1.1374936580416033e-06, + "loss": 0.0276, + "step": 9830 + }, + { + "epoch": 8.978082191780821, + "grad_norm": 234.5017852783203, + "learning_rate": 1.1364789446981228e-06, + "loss": 2.7844, + "step": 9831 + }, + { + "epoch": 8.978995433789954, + "grad_norm": 0.16160480678081512, + "learning_rate": 1.1354642313546425e-06, + "loss": 0.0011, + "step": 9832 + }, + { + "epoch": 8.979908675799086, + "grad_norm": 18.449501037597656, + "learning_rate": 1.134449518011162e-06, + "loss": 0.1382, + "step": 9833 + }, + { + "epoch": 8.98082191780822, + "grad_norm": 0.451354056596756, + "learning_rate": 1.1334348046676814e-06, + "loss": 0.0032, + "step": 9834 + }, + { + "epoch": 8.981735159817351, + "grad_norm": 7.773001194000244, + "learning_rate": 1.132420091324201e-06, + "loss": 0.0487, + "step": 9835 + }, + { + "epoch": 8.982648401826484, + "grad_norm": 1.702476143836975, + "learning_rate": 1.1314053779807204e-06, + "loss": 0.0084, + "step": 9836 + }, + { + "epoch": 8.983561643835616, + "grad_norm": 0.3326089084148407, + "learning_rate": 1.13039066463724e-06, + "loss": 0.0022, + "step": 9837 + }, + { + "epoch": 8.98447488584475, + "grad_norm": 18.10955810546875, + "learning_rate": 1.1293759512937598e-06, + "loss": 0.1044, + "step": 9838 + }, + { + "epoch": 8.98538812785388, + "grad_norm": 0.6200478672981262, + "learning_rate": 1.128361237950279e-06, + "loss": 0.0042, + "step": 9839 + }, + { + "epoch": 8.986301369863014, + "grad_norm": 5.232530117034912, + "learning_rate": 1.1273465246067987e-06, + "loss": 0.0242, + "step": 9840 + }, + { + "epoch": 8.987214611872146, + "grad_norm": 4.161727428436279, + "learning_rate": 1.1263318112633182e-06, + "loss": 0.0232, + "step": 9841 + }, + { + "epoch": 8.988127853881279, + "grad_norm": 2.2918713092803955, + "learning_rate": 1.1253170979198377e-06, + "loss": 0.0174, + "step": 9842 + }, + { + "epoch": 8.98904109589041, + "grad_norm": 1.1769523620605469, + "learning_rate": 1.1243023845763574e-06, + "loss": 0.0059, + "step": 9843 + }, + { + "epoch": 8.989954337899544, + "grad_norm": 7.042264938354492, + "learning_rate": 1.1232876712328769e-06, + "loss": 0.0496, + "step": 9844 + }, + { + "epoch": 8.990867579908675, + "grad_norm": 3.9912638664245605, + "learning_rate": 1.1222729578893963e-06, + "loss": 0.0212, + "step": 9845 + }, + { + "epoch": 8.991780821917809, + "grad_norm": 15.79181957244873, + "learning_rate": 1.1212582445459158e-06, + "loss": 0.0816, + "step": 9846 + }, + { + "epoch": 8.99269406392694, + "grad_norm": 4.549249649047852, + "learning_rate": 1.1202435312024355e-06, + "loss": 0.0286, + "step": 9847 + }, + { + "epoch": 8.993607305936074, + "grad_norm": 0.10151300579309464, + "learning_rate": 1.119228817858955e-06, + "loss": 0.0008, + "step": 9848 + }, + { + "epoch": 8.994520547945205, + "grad_norm": 1.6851966381072998, + "learning_rate": 1.1182141045154745e-06, + "loss": 0.0082, + "step": 9849 + }, + { + "epoch": 8.995433789954339, + "grad_norm": 106.9466781616211, + "learning_rate": 1.117199391171994e-06, + "loss": 1.8971, + "step": 9850 + }, + { + "epoch": 8.99634703196347, + "grad_norm": 0.8295902609825134, + "learning_rate": 1.1161846778285134e-06, + "loss": 0.0068, + "step": 9851 + }, + { + "epoch": 8.997260273972604, + "grad_norm": 0.34875521063804626, + "learning_rate": 1.1151699644850331e-06, + "loss": 0.0016, + "step": 9852 + }, + { + "epoch": 8.998173515981735, + "grad_norm": 0.9316357374191284, + "learning_rate": 1.1141552511415526e-06, + "loss": 0.0049, + "step": 9853 + }, + { + "epoch": 8.999086757990867, + "grad_norm": 2.6714086532592773, + "learning_rate": 1.113140537798072e-06, + "loss": 0.0217, + "step": 9854 + }, + { + "epoch": 9.0, + "grad_norm": 2.5005056858062744, + "learning_rate": 1.1121258244545918e-06, + "loss": 0.0181, + "step": 9855 + }, + { + "epoch": 9.000913242009132, + "grad_norm": 0.0972447320818901, + "learning_rate": 1.111111111111111e-06, + "loss": 0.0006, + "step": 9856 + }, + { + "epoch": 9.001826484018265, + "grad_norm": 1.034501552581787, + "learning_rate": 1.1100963977676307e-06, + "loss": 0.0061, + "step": 9857 + }, + { + "epoch": 9.002739726027396, + "grad_norm": 1.4740025997161865, + "learning_rate": 1.1090816844241504e-06, + "loss": 0.0088, + "step": 9858 + }, + { + "epoch": 9.00365296803653, + "grad_norm": 3.7948944568634033, + "learning_rate": 1.1080669710806697e-06, + "loss": 0.0253, + "step": 9859 + }, + { + "epoch": 9.004566210045661, + "grad_norm": 0.2852112650871277, + "learning_rate": 1.1070522577371894e-06, + "loss": 0.0024, + "step": 9860 + }, + { + "epoch": 9.005479452054795, + "grad_norm": 0.6388015151023865, + "learning_rate": 1.1060375443937089e-06, + "loss": 0.0044, + "step": 9861 + }, + { + "epoch": 9.006392694063926, + "grad_norm": 0.8835785388946533, + "learning_rate": 1.1050228310502283e-06, + "loss": 0.0052, + "step": 9862 + }, + { + "epoch": 9.00730593607306, + "grad_norm": 1.428151249885559, + "learning_rate": 1.104008117706748e-06, + "loss": 0.0077, + "step": 9863 + }, + { + "epoch": 9.008219178082191, + "grad_norm": 1.1886768341064453, + "learning_rate": 1.1029934043632675e-06, + "loss": 0.007, + "step": 9864 + }, + { + "epoch": 9.009132420091325, + "grad_norm": 38.66246032714844, + "learning_rate": 1.101978691019787e-06, + "loss": 0.1607, + "step": 9865 + }, + { + "epoch": 9.010045662100456, + "grad_norm": 18.109582901000977, + "learning_rate": 1.1009639776763065e-06, + "loss": 0.1191, + "step": 9866 + }, + { + "epoch": 9.01095890410959, + "grad_norm": 0.4233149290084839, + "learning_rate": 1.099949264332826e-06, + "loss": 0.004, + "step": 9867 + }, + { + "epoch": 9.011872146118721, + "grad_norm": 1.9385157823562622, + "learning_rate": 1.0989345509893456e-06, + "loss": 0.0196, + "step": 9868 + }, + { + "epoch": 9.012785388127854, + "grad_norm": 77.65215301513672, + "learning_rate": 1.0979198376458651e-06, + "loss": 0.4841, + "step": 9869 + }, + { + "epoch": 9.013698630136986, + "grad_norm": 34.454307556152344, + "learning_rate": 1.0969051243023846e-06, + "loss": 0.2284, + "step": 9870 + }, + { + "epoch": 9.01461187214612, + "grad_norm": 0.0043792021460831165, + "learning_rate": 1.095890410958904e-06, + "loss": 0.0, + "step": 9871 + }, + { + "epoch": 9.01552511415525, + "grad_norm": 33.669822692871094, + "learning_rate": 1.0948756976154238e-06, + "loss": 0.1695, + "step": 9872 + }, + { + "epoch": 9.016438356164384, + "grad_norm": 2.0732128620147705, + "learning_rate": 1.0938609842719433e-06, + "loss": 0.0084, + "step": 9873 + }, + { + "epoch": 9.017351598173516, + "grad_norm": 1.1821315288543701, + "learning_rate": 1.0928462709284627e-06, + "loss": 0.0088, + "step": 9874 + }, + { + "epoch": 9.018264840182649, + "grad_norm": 0.10977492481470108, + "learning_rate": 1.0918315575849824e-06, + "loss": 0.0008, + "step": 9875 + }, + { + "epoch": 9.01917808219178, + "grad_norm": 0.9388059973716736, + "learning_rate": 1.0908168442415017e-06, + "loss": 0.0076, + "step": 9876 + }, + { + "epoch": 9.020091324200914, + "grad_norm": 4.00321102142334, + "learning_rate": 1.0898021308980214e-06, + "loss": 0.0207, + "step": 9877 + }, + { + "epoch": 9.021004566210046, + "grad_norm": 0.6513509750366211, + "learning_rate": 1.088787417554541e-06, + "loss": 0.0025, + "step": 9878 + }, + { + "epoch": 9.021917808219179, + "grad_norm": 11.572553634643555, + "learning_rate": 1.0877727042110604e-06, + "loss": 0.0425, + "step": 9879 + }, + { + "epoch": 9.02283105022831, + "grad_norm": 1.176753044128418, + "learning_rate": 1.08675799086758e-06, + "loss": 0.0063, + "step": 9880 + }, + { + "epoch": 9.023744292237444, + "grad_norm": 11.13795280456543, + "learning_rate": 1.0857432775240995e-06, + "loss": 0.0659, + "step": 9881 + }, + { + "epoch": 9.024657534246575, + "grad_norm": 44.26383590698242, + "learning_rate": 1.084728564180619e-06, + "loss": 0.1957, + "step": 9882 + }, + { + "epoch": 9.025570776255707, + "grad_norm": 4.353766441345215, + "learning_rate": 1.0837138508371387e-06, + "loss": 0.0284, + "step": 9883 + }, + { + "epoch": 9.02648401826484, + "grad_norm": 4.348663330078125, + "learning_rate": 1.0826991374936582e-06, + "loss": 0.0211, + "step": 9884 + }, + { + "epoch": 9.027397260273972, + "grad_norm": 4.10806941986084, + "learning_rate": 1.0816844241501777e-06, + "loss": 0.0152, + "step": 9885 + }, + { + "epoch": 9.028310502283105, + "grad_norm": 14.09305191040039, + "learning_rate": 1.0806697108066971e-06, + "loss": 0.0898, + "step": 9886 + }, + { + "epoch": 9.029223744292237, + "grad_norm": 0.038460470736026764, + "learning_rate": 1.0796549974632166e-06, + "loss": 0.0002, + "step": 9887 + }, + { + "epoch": 9.03013698630137, + "grad_norm": 14.851967811584473, + "learning_rate": 1.0786402841197363e-06, + "loss": 0.0875, + "step": 9888 + }, + { + "epoch": 9.031050228310502, + "grad_norm": 1.7024425268173218, + "learning_rate": 1.0776255707762558e-06, + "loss": 0.0111, + "step": 9889 + }, + { + "epoch": 9.031963470319635, + "grad_norm": 0.503818929195404, + "learning_rate": 1.0766108574327753e-06, + "loss": 0.0028, + "step": 9890 + }, + { + "epoch": 9.032876712328767, + "grad_norm": 12.502306938171387, + "learning_rate": 1.0755961440892948e-06, + "loss": 0.0808, + "step": 9891 + }, + { + "epoch": 9.0337899543379, + "grad_norm": 6.203315258026123, + "learning_rate": 1.0745814307458144e-06, + "loss": 0.0318, + "step": 9892 + }, + { + "epoch": 9.034703196347031, + "grad_norm": 4.419556617736816, + "learning_rate": 1.073566717402334e-06, + "loss": 0.027, + "step": 9893 + }, + { + "epoch": 9.035616438356165, + "grad_norm": 0.07089780271053314, + "learning_rate": 1.0725520040588534e-06, + "loss": 0.0004, + "step": 9894 + }, + { + "epoch": 9.036529680365296, + "grad_norm": 1.3530248403549194, + "learning_rate": 1.071537290715373e-06, + "loss": 0.0059, + "step": 9895 + }, + { + "epoch": 9.03744292237443, + "grad_norm": 12.56069278717041, + "learning_rate": 1.0705225773718924e-06, + "loss": 0.0957, + "step": 9896 + }, + { + "epoch": 9.038356164383561, + "grad_norm": 0.07954565435647964, + "learning_rate": 1.069507864028412e-06, + "loss": 0.0005, + "step": 9897 + }, + { + "epoch": 9.039269406392695, + "grad_norm": 10.04957103729248, + "learning_rate": 1.0684931506849318e-06, + "loss": 0.0745, + "step": 9898 + }, + { + "epoch": 9.040182648401826, + "grad_norm": 0.7799043655395508, + "learning_rate": 1.067478437341451e-06, + "loss": 0.0053, + "step": 9899 + }, + { + "epoch": 9.04109589041096, + "grad_norm": 1.1127794981002808, + "learning_rate": 1.0664637239979707e-06, + "loss": 0.0071, + "step": 9900 + }, + { + "epoch": 9.042009132420091, + "grad_norm": 0.5818769335746765, + "learning_rate": 1.0654490106544902e-06, + "loss": 0.0044, + "step": 9901 + }, + { + "epoch": 9.042922374429224, + "grad_norm": 0.5949493646621704, + "learning_rate": 1.0644342973110097e-06, + "loss": 0.003, + "step": 9902 + }, + { + "epoch": 9.043835616438356, + "grad_norm": 0.31719788908958435, + "learning_rate": 1.0634195839675294e-06, + "loss": 0.0016, + "step": 9903 + }, + { + "epoch": 9.04474885844749, + "grad_norm": 13.553098678588867, + "learning_rate": 1.0624048706240488e-06, + "loss": 0.0829, + "step": 9904 + }, + { + "epoch": 9.045662100456621, + "grad_norm": 7.380324840545654, + "learning_rate": 1.0613901572805683e-06, + "loss": 0.0489, + "step": 9905 + }, + { + "epoch": 9.046575342465754, + "grad_norm": 0.3288176953792572, + "learning_rate": 1.0603754439370878e-06, + "loss": 0.0027, + "step": 9906 + }, + { + "epoch": 9.047488584474886, + "grad_norm": 0.6783069372177124, + "learning_rate": 1.0593607305936073e-06, + "loss": 0.0043, + "step": 9907 + }, + { + "epoch": 9.04840182648402, + "grad_norm": 0.05933120846748352, + "learning_rate": 1.058346017250127e-06, + "loss": 0.0004, + "step": 9908 + }, + { + "epoch": 9.04931506849315, + "grad_norm": 1.1970343589782715, + "learning_rate": 1.0573313039066465e-06, + "loss": 0.0091, + "step": 9909 + }, + { + "epoch": 9.050228310502282, + "grad_norm": 4.298831939697266, + "learning_rate": 1.056316590563166e-06, + "loss": 0.0344, + "step": 9910 + }, + { + "epoch": 9.051141552511416, + "grad_norm": 0.36665958166122437, + "learning_rate": 1.0553018772196854e-06, + "loss": 0.0023, + "step": 9911 + }, + { + "epoch": 9.052054794520547, + "grad_norm": 3.71653151512146, + "learning_rate": 1.0542871638762051e-06, + "loss": 0.0205, + "step": 9912 + }, + { + "epoch": 9.05296803652968, + "grad_norm": 26.325897216796875, + "learning_rate": 1.0532724505327246e-06, + "loss": 0.1419, + "step": 9913 + }, + { + "epoch": 9.053881278538812, + "grad_norm": 0.3974607586860657, + "learning_rate": 1.052257737189244e-06, + "loss": 0.0016, + "step": 9914 + }, + { + "epoch": 9.054794520547945, + "grad_norm": 55.3441162109375, + "learning_rate": 1.0512430238457638e-06, + "loss": 0.4765, + "step": 9915 + }, + { + "epoch": 9.055707762557077, + "grad_norm": 0.45071160793304443, + "learning_rate": 1.050228310502283e-06, + "loss": 0.0023, + "step": 9916 + }, + { + "epoch": 9.05662100456621, + "grad_norm": 1.7289984226226807, + "learning_rate": 1.0492135971588027e-06, + "loss": 0.011, + "step": 9917 + }, + { + "epoch": 9.057534246575342, + "grad_norm": 0.5397674441337585, + "learning_rate": 1.0481988838153222e-06, + "loss": 0.0032, + "step": 9918 + }, + { + "epoch": 9.058447488584475, + "grad_norm": 0.22614160180091858, + "learning_rate": 1.0471841704718417e-06, + "loss": 0.0013, + "step": 9919 + }, + { + "epoch": 9.059360730593607, + "grad_norm": 0.2926305830478668, + "learning_rate": 1.0461694571283614e-06, + "loss": 0.0013, + "step": 9920 + }, + { + "epoch": 9.06027397260274, + "grad_norm": 1.0760102272033691, + "learning_rate": 1.0451547437848809e-06, + "loss": 0.0069, + "step": 9921 + }, + { + "epoch": 9.061187214611872, + "grad_norm": 0.2039516270160675, + "learning_rate": 1.0441400304414003e-06, + "loss": 0.0011, + "step": 9922 + }, + { + "epoch": 9.062100456621005, + "grad_norm": 3.7340338230133057, + "learning_rate": 1.04312531709792e-06, + "loss": 0.0239, + "step": 9923 + }, + { + "epoch": 9.063013698630137, + "grad_norm": 2.4376590251922607, + "learning_rate": 1.0421106037544395e-06, + "loss": 0.0146, + "step": 9924 + }, + { + "epoch": 9.06392694063927, + "grad_norm": 4.3029327392578125, + "learning_rate": 1.041095890410959e-06, + "loss": 0.0274, + "step": 9925 + }, + { + "epoch": 9.064840182648402, + "grad_norm": 45.321537017822266, + "learning_rate": 1.0400811770674785e-06, + "loss": 0.1889, + "step": 9926 + }, + { + "epoch": 9.065753424657535, + "grad_norm": 0.36612266302108765, + "learning_rate": 1.039066463723998e-06, + "loss": 0.0023, + "step": 9927 + }, + { + "epoch": 9.066666666666666, + "grad_norm": 9.887238502502441, + "learning_rate": 1.0380517503805176e-06, + "loss": 0.0571, + "step": 9928 + }, + { + "epoch": 9.0675799086758, + "grad_norm": 1.1745522022247314, + "learning_rate": 1.0370370370370371e-06, + "loss": 0.0075, + "step": 9929 + }, + { + "epoch": 9.068493150684931, + "grad_norm": 28.416746139526367, + "learning_rate": 1.0360223236935566e-06, + "loss": 0.1248, + "step": 9930 + }, + { + "epoch": 9.069406392694065, + "grad_norm": 4.668123722076416, + "learning_rate": 1.035007610350076e-06, + "loss": 0.0349, + "step": 9931 + }, + { + "epoch": 9.070319634703196, + "grad_norm": 5.578891277313232, + "learning_rate": 1.0339928970065958e-06, + "loss": 0.0467, + "step": 9932 + }, + { + "epoch": 9.07123287671233, + "grad_norm": 23.461732864379883, + "learning_rate": 1.0329781836631153e-06, + "loss": 0.1342, + "step": 9933 + }, + { + "epoch": 9.072146118721461, + "grad_norm": 0.08787541836500168, + "learning_rate": 1.0319634703196347e-06, + "loss": 0.0005, + "step": 9934 + }, + { + "epoch": 9.073059360730593, + "grad_norm": 107.31277465820312, + "learning_rate": 1.0309487569761544e-06, + "loss": 2.4833, + "step": 9935 + }, + { + "epoch": 9.073972602739726, + "grad_norm": 0.43070435523986816, + "learning_rate": 1.0299340436326737e-06, + "loss": 0.003, + "step": 9936 + }, + { + "epoch": 9.074885844748858, + "grad_norm": 2.497499942779541, + "learning_rate": 1.0289193302891934e-06, + "loss": 0.0091, + "step": 9937 + }, + { + "epoch": 9.075799086757991, + "grad_norm": 2.2467026710510254, + "learning_rate": 1.0279046169457129e-06, + "loss": 0.0061, + "step": 9938 + }, + { + "epoch": 9.076712328767123, + "grad_norm": 3.831503391265869, + "learning_rate": 1.0268899036022324e-06, + "loss": 0.0206, + "step": 9939 + }, + { + "epoch": 9.077625570776256, + "grad_norm": 0.15365716814994812, + "learning_rate": 1.025875190258752e-06, + "loss": 0.0009, + "step": 9940 + }, + { + "epoch": 9.078538812785387, + "grad_norm": 0.33063995838165283, + "learning_rate": 1.0248604769152715e-06, + "loss": 0.0016, + "step": 9941 + }, + { + "epoch": 9.07945205479452, + "grad_norm": 4.826691150665283, + "learning_rate": 1.023845763571791e-06, + "loss": 0.0224, + "step": 9942 + }, + { + "epoch": 9.080365296803652, + "grad_norm": 0.5035514235496521, + "learning_rate": 1.0228310502283107e-06, + "loss": 0.0028, + "step": 9943 + }, + { + "epoch": 9.081278538812786, + "grad_norm": 2.8215677738189697, + "learning_rate": 1.0218163368848302e-06, + "loss": 0.0158, + "step": 9944 + }, + { + "epoch": 9.082191780821917, + "grad_norm": 2.5622506141662598, + "learning_rate": 1.0208016235413497e-06, + "loss": 0.0187, + "step": 9945 + }, + { + "epoch": 9.08310502283105, + "grad_norm": 33.60354995727539, + "learning_rate": 1.0197869101978691e-06, + "loss": 0.2507, + "step": 9946 + }, + { + "epoch": 9.084018264840182, + "grad_norm": 86.76915740966797, + "learning_rate": 1.0187721968543886e-06, + "loss": 0.8385, + "step": 9947 + }, + { + "epoch": 9.084931506849315, + "grad_norm": 0.12263408303260803, + "learning_rate": 1.0177574835109083e-06, + "loss": 0.0008, + "step": 9948 + }, + { + "epoch": 9.085844748858447, + "grad_norm": 0.12688769400119781, + "learning_rate": 1.0167427701674278e-06, + "loss": 0.0005, + "step": 9949 + }, + { + "epoch": 9.08675799086758, + "grad_norm": 0.48244890570640564, + "learning_rate": 1.0157280568239473e-06, + "loss": 0.0034, + "step": 9950 + }, + { + "epoch": 9.087671232876712, + "grad_norm": 3.6111907958984375, + "learning_rate": 1.0147133434804667e-06, + "loss": 0.0196, + "step": 9951 + }, + { + "epoch": 9.088584474885845, + "grad_norm": 0.01904284954071045, + "learning_rate": 1.0136986301369864e-06, + "loss": 0.0001, + "step": 9952 + }, + { + "epoch": 9.089497716894977, + "grad_norm": 6.974794387817383, + "learning_rate": 1.012683916793506e-06, + "loss": 0.0591, + "step": 9953 + }, + { + "epoch": 9.09041095890411, + "grad_norm": 3.4327192306518555, + "learning_rate": 1.0116692034500254e-06, + "loss": 0.0146, + "step": 9954 + }, + { + "epoch": 9.091324200913242, + "grad_norm": 9.330728530883789, + "learning_rate": 1.010654490106545e-06, + "loss": 0.0434, + "step": 9955 + }, + { + "epoch": 9.092237442922375, + "grad_norm": 50.44504928588867, + "learning_rate": 1.0096397767630644e-06, + "loss": 0.437, + "step": 9956 + }, + { + "epoch": 9.093150684931507, + "grad_norm": 4.0982513427734375, + "learning_rate": 1.008625063419584e-06, + "loss": 0.0158, + "step": 9957 + }, + { + "epoch": 9.09406392694064, + "grad_norm": 1.4017114639282227, + "learning_rate": 1.0076103500761035e-06, + "loss": 0.0062, + "step": 9958 + }, + { + "epoch": 9.094977168949772, + "grad_norm": 0.45815345644950867, + "learning_rate": 1.006595636732623e-06, + "loss": 0.0034, + "step": 9959 + }, + { + "epoch": 9.095890410958905, + "grad_norm": 0.02698717825114727, + "learning_rate": 1.0055809233891427e-06, + "loss": 0.0002, + "step": 9960 + }, + { + "epoch": 9.096803652968037, + "grad_norm": 4.696172714233398, + "learning_rate": 1.0045662100456622e-06, + "loss": 0.0052, + "step": 9961 + }, + { + "epoch": 9.097716894977168, + "grad_norm": 1.298513650894165, + "learning_rate": 1.0035514967021817e-06, + "loss": 0.009, + "step": 9962 + }, + { + "epoch": 9.098630136986301, + "grad_norm": 0.4575071334838867, + "learning_rate": 1.0025367833587014e-06, + "loss": 0.0041, + "step": 9963 + }, + { + "epoch": 9.099543378995433, + "grad_norm": 2.0969343185424805, + "learning_rate": 1.0015220700152208e-06, + "loss": 0.0027, + "step": 9964 + }, + { + "epoch": 9.100456621004566, + "grad_norm": 0.8078073263168335, + "learning_rate": 1.0005073566717403e-06, + "loss": 0.007, + "step": 9965 + }, + { + "epoch": 9.101369863013698, + "grad_norm": 1.3850849866867065, + "learning_rate": 9.994926433282598e-07, + "loss": 0.0089, + "step": 9966 + }, + { + "epoch": 9.102283105022831, + "grad_norm": 0.6430597305297852, + "learning_rate": 9.984779299847793e-07, + "loss": 0.0041, + "step": 9967 + }, + { + "epoch": 9.103196347031963, + "grad_norm": 6.63968563079834, + "learning_rate": 9.97463216641299e-07, + "loss": 0.0408, + "step": 9968 + }, + { + "epoch": 9.104109589041096, + "grad_norm": 0.12037342041730881, + "learning_rate": 9.964485032978185e-07, + "loss": 0.0009, + "step": 9969 + }, + { + "epoch": 9.105022831050228, + "grad_norm": 24.971357345581055, + "learning_rate": 9.95433789954338e-07, + "loss": 0.2018, + "step": 9970 + }, + { + "epoch": 9.105936073059361, + "grad_norm": 0.19191767275333405, + "learning_rate": 9.944190766108574e-07, + "loss": 0.0016, + "step": 9971 + }, + { + "epoch": 9.106849315068493, + "grad_norm": 2.710923194885254, + "learning_rate": 9.934043632673771e-07, + "loss": 0.0129, + "step": 9972 + }, + { + "epoch": 9.107762557077626, + "grad_norm": 0.30855104327201843, + "learning_rate": 9.923896499238966e-07, + "loss": 0.002, + "step": 9973 + }, + { + "epoch": 9.108675799086758, + "grad_norm": 10.379520416259766, + "learning_rate": 9.91374936580416e-07, + "loss": 0.0629, + "step": 9974 + }, + { + "epoch": 9.10958904109589, + "grad_norm": 0.1814744919538498, + "learning_rate": 9.903602232369358e-07, + "loss": 0.0008, + "step": 9975 + }, + { + "epoch": 9.110502283105022, + "grad_norm": 0.08920720964670181, + "learning_rate": 9.89345509893455e-07, + "loss": 0.0004, + "step": 9976 + }, + { + "epoch": 9.111415525114156, + "grad_norm": 0.400336891412735, + "learning_rate": 9.883307965499747e-07, + "loss": 0.002, + "step": 9977 + }, + { + "epoch": 9.112328767123287, + "grad_norm": 0.5278018116950989, + "learning_rate": 9.873160832064942e-07, + "loss": 0.0028, + "step": 9978 + }, + { + "epoch": 9.11324200913242, + "grad_norm": 6.459722995758057, + "learning_rate": 9.863013698630137e-07, + "loss": 0.0458, + "step": 9979 + }, + { + "epoch": 9.114155251141552, + "grad_norm": 3.25468111038208, + "learning_rate": 9.852866565195334e-07, + "loss": 0.0259, + "step": 9980 + }, + { + "epoch": 9.115068493150686, + "grad_norm": 0.5269016623497009, + "learning_rate": 9.842719431760529e-07, + "loss": 0.0015, + "step": 9981 + }, + { + "epoch": 9.115981735159817, + "grad_norm": 1.4938936233520508, + "learning_rate": 9.832572298325723e-07, + "loss": 0.0093, + "step": 9982 + }, + { + "epoch": 9.11689497716895, + "grad_norm": 23.304229736328125, + "learning_rate": 9.82242516489092e-07, + "loss": 0.106, + "step": 9983 + }, + { + "epoch": 9.117808219178082, + "grad_norm": 0.020131615921854973, + "learning_rate": 9.812278031456115e-07, + "loss": 0.0002, + "step": 9984 + }, + { + "epoch": 9.118721461187215, + "grad_norm": 42.28857421875, + "learning_rate": 9.80213089802131e-07, + "loss": 0.5399, + "step": 9985 + }, + { + "epoch": 9.119634703196347, + "grad_norm": 0.5784845352172852, + "learning_rate": 9.791983764586505e-07, + "loss": 0.0037, + "step": 9986 + }, + { + "epoch": 9.12054794520548, + "grad_norm": 0.5318914651870728, + "learning_rate": 9.7818366311517e-07, + "loss": 0.0037, + "step": 9987 + }, + { + "epoch": 9.121461187214612, + "grad_norm": 0.4506818652153015, + "learning_rate": 9.771689497716896e-07, + "loss": 0.0018, + "step": 9988 + }, + { + "epoch": 9.122374429223743, + "grad_norm": 0.08738785237073898, + "learning_rate": 9.761542364282091e-07, + "loss": 0.0006, + "step": 9989 + }, + { + "epoch": 9.123287671232877, + "grad_norm": 0.13590413331985474, + "learning_rate": 9.751395230847286e-07, + "loss": 0.0008, + "step": 9990 + }, + { + "epoch": 9.124200913242008, + "grad_norm": 0.11652134358882904, + "learning_rate": 9.74124809741248e-07, + "loss": 0.0008, + "step": 9991 + }, + { + "epoch": 9.125114155251142, + "grad_norm": 0.8932674527168274, + "learning_rate": 9.731100963977678e-07, + "loss": 0.0061, + "step": 9992 + }, + { + "epoch": 9.126027397260273, + "grad_norm": 0.10781610012054443, + "learning_rate": 9.720953830542873e-07, + "loss": 0.0005, + "step": 9993 + }, + { + "epoch": 9.126940639269407, + "grad_norm": 0.8821057677268982, + "learning_rate": 9.710806697108067e-07, + "loss": 0.0048, + "step": 9994 + }, + { + "epoch": 9.127853881278538, + "grad_norm": 67.32794189453125, + "learning_rate": 9.700659563673264e-07, + "loss": 0.5343, + "step": 9995 + }, + { + "epoch": 9.128767123287671, + "grad_norm": 3.3175251483917236, + "learning_rate": 9.690512430238457e-07, + "loss": 0.0143, + "step": 9996 + }, + { + "epoch": 9.129680365296803, + "grad_norm": 0.0901031643152237, + "learning_rate": 9.680365296803654e-07, + "loss": 0.0006, + "step": 9997 + }, + { + "epoch": 9.130593607305936, + "grad_norm": 0.18027593195438385, + "learning_rate": 9.670218163368849e-07, + "loss": 0.0011, + "step": 9998 + }, + { + "epoch": 9.131506849315068, + "grad_norm": 52.37831115722656, + "learning_rate": 9.660071029934043e-07, + "loss": 0.4222, + "step": 9999 + }, + { + "epoch": 9.132420091324201, + "grad_norm": 4.314111232757568, + "learning_rate": 9.64992389649924e-07, + "loss": 0.0219, + "step": 10000 + }, + { + "epoch": 9.133333333333333, + "grad_norm": 1.3675837516784668, + "learning_rate": 9.639776763064435e-07, + "loss": 0.0128, + "step": 10001 + }, + { + "epoch": 9.134246575342466, + "grad_norm": 13.860060691833496, + "learning_rate": 9.62962962962963e-07, + "loss": 0.0867, + "step": 10002 + }, + { + "epoch": 9.135159817351598, + "grad_norm": 0.12232168763875961, + "learning_rate": 9.619482496194827e-07, + "loss": 0.0009, + "step": 10003 + }, + { + "epoch": 9.136073059360731, + "grad_norm": 0.11271098256111145, + "learning_rate": 9.609335362760022e-07, + "loss": 0.0005, + "step": 10004 + }, + { + "epoch": 9.136986301369863, + "grad_norm": 8.949652671813965, + "learning_rate": 9.599188229325217e-07, + "loss": 0.032, + "step": 10005 + }, + { + "epoch": 9.137899543378996, + "grad_norm": 11.375618934631348, + "learning_rate": 9.589041095890411e-07, + "loss": 0.0693, + "step": 10006 + }, + { + "epoch": 9.138812785388128, + "grad_norm": 2.062607765197754, + "learning_rate": 9.578893962455606e-07, + "loss": 0.0139, + "step": 10007 + }, + { + "epoch": 9.139726027397261, + "grad_norm": 12.547645568847656, + "learning_rate": 9.568746829020803e-07, + "loss": 0.0669, + "step": 10008 + }, + { + "epoch": 9.140639269406392, + "grad_norm": 24.60028648376465, + "learning_rate": 9.558599695585998e-07, + "loss": 0.1458, + "step": 10009 + }, + { + "epoch": 9.141552511415526, + "grad_norm": 1.3075990676879883, + "learning_rate": 9.548452562151193e-07, + "loss": 0.0089, + "step": 10010 + }, + { + "epoch": 9.142465753424657, + "grad_norm": 0.19895578920841217, + "learning_rate": 9.538305428716387e-07, + "loss": 0.001, + "step": 10011 + }, + { + "epoch": 9.14337899543379, + "grad_norm": 1.1858463287353516, + "learning_rate": 9.528158295281583e-07, + "loss": 0.0067, + "step": 10012 + }, + { + "epoch": 9.144292237442922, + "grad_norm": 0.9061986207962036, + "learning_rate": 9.518011161846779e-07, + "loss": 0.0039, + "step": 10013 + }, + { + "epoch": 9.145205479452056, + "grad_norm": 0.16420714557170868, + "learning_rate": 9.507864028411974e-07, + "loss": 0.0012, + "step": 10014 + }, + { + "epoch": 9.146118721461187, + "grad_norm": 11.789826393127441, + "learning_rate": 9.49771689497717e-07, + "loss": 0.071, + "step": 10015 + }, + { + "epoch": 9.147031963470319, + "grad_norm": 35.21945571899414, + "learning_rate": 9.487569761542365e-07, + "loss": 0.2117, + "step": 10016 + }, + { + "epoch": 9.147945205479452, + "grad_norm": 2.749758243560791, + "learning_rate": 9.47742262810756e-07, + "loss": 0.0126, + "step": 10017 + }, + { + "epoch": 9.148858447488584, + "grad_norm": 0.26407134532928467, + "learning_rate": 9.467275494672756e-07, + "loss": 0.002, + "step": 10018 + }, + { + "epoch": 9.149771689497717, + "grad_norm": 6.513951301574707, + "learning_rate": 9.457128361237951e-07, + "loss": 0.0248, + "step": 10019 + }, + { + "epoch": 9.150684931506849, + "grad_norm": 19.160646438598633, + "learning_rate": 9.446981227803147e-07, + "loss": 0.1157, + "step": 10020 + }, + { + "epoch": 9.151598173515982, + "grad_norm": 5.568652629852295, + "learning_rate": 9.436834094368341e-07, + "loss": 0.0425, + "step": 10021 + }, + { + "epoch": 9.152511415525113, + "grad_norm": 27.18406867980957, + "learning_rate": 9.426686960933537e-07, + "loss": 0.2058, + "step": 10022 + }, + { + "epoch": 9.153424657534247, + "grad_norm": 0.2571437656879425, + "learning_rate": 9.416539827498732e-07, + "loss": 0.0015, + "step": 10023 + }, + { + "epoch": 9.154337899543378, + "grad_norm": 0.3877418637275696, + "learning_rate": 9.406392694063927e-07, + "loss": 0.003, + "step": 10024 + }, + { + "epoch": 9.155251141552512, + "grad_norm": 1.432450532913208, + "learning_rate": 9.396245560629123e-07, + "loss": 0.0088, + "step": 10025 + }, + { + "epoch": 9.156164383561643, + "grad_norm": 9.965012550354004, + "learning_rate": 9.386098427194318e-07, + "loss": 0.064, + "step": 10026 + }, + { + "epoch": 9.157077625570777, + "grad_norm": 3.684995174407959, + "learning_rate": 9.375951293759514e-07, + "loss": 0.0246, + "step": 10027 + }, + { + "epoch": 9.157990867579908, + "grad_norm": 1.613386631011963, + "learning_rate": 9.36580416032471e-07, + "loss": 0.0102, + "step": 10028 + }, + { + "epoch": 9.158904109589042, + "grad_norm": 11.285295486450195, + "learning_rate": 9.355657026889904e-07, + "loss": 0.0315, + "step": 10029 + }, + { + "epoch": 9.159817351598173, + "grad_norm": 1.6090245246887207, + "learning_rate": 9.3455098934551e-07, + "loss": 0.0075, + "step": 10030 + }, + { + "epoch": 9.160730593607306, + "grad_norm": 3.980478525161743, + "learning_rate": 9.335362760020294e-07, + "loss": 0.0281, + "step": 10031 + }, + { + "epoch": 9.161643835616438, + "grad_norm": 16.72954559326172, + "learning_rate": 9.32521562658549e-07, + "loss": 0.1671, + "step": 10032 + }, + { + "epoch": 9.162557077625571, + "grad_norm": 19.207006454467773, + "learning_rate": 9.315068493150686e-07, + "loss": 0.1358, + "step": 10033 + }, + { + "epoch": 9.163470319634703, + "grad_norm": 0.47363725304603577, + "learning_rate": 9.304921359715881e-07, + "loss": 0.0035, + "step": 10034 + }, + { + "epoch": 9.164383561643836, + "grad_norm": 0.027545783668756485, + "learning_rate": 9.294774226281076e-07, + "loss": 0.0001, + "step": 10035 + }, + { + "epoch": 9.165296803652968, + "grad_norm": 0.23594582080841064, + "learning_rate": 9.284627092846271e-07, + "loss": 0.0017, + "step": 10036 + }, + { + "epoch": 9.166210045662101, + "grad_norm": 1.6765680313110352, + "learning_rate": 9.274479959411467e-07, + "loss": 0.0093, + "step": 10037 + }, + { + "epoch": 9.167123287671233, + "grad_norm": 0.08727238327264786, + "learning_rate": 9.264332825976663e-07, + "loss": 0.0005, + "step": 10038 + }, + { + "epoch": 9.168036529680366, + "grad_norm": 0.7742429971694946, + "learning_rate": 9.254185692541858e-07, + "loss": 0.0047, + "step": 10039 + }, + { + "epoch": 9.168949771689498, + "grad_norm": 10.607063293457031, + "learning_rate": 9.244038559107054e-07, + "loss": 0.0537, + "step": 10040 + }, + { + "epoch": 9.169863013698631, + "grad_norm": 0.6391409635543823, + "learning_rate": 9.233891425672247e-07, + "loss": 0.0042, + "step": 10041 + }, + { + "epoch": 9.170776255707763, + "grad_norm": 14.850139617919922, + "learning_rate": 9.223744292237443e-07, + "loss": 0.058, + "step": 10042 + }, + { + "epoch": 9.171689497716894, + "grad_norm": 7.760124206542969, + "learning_rate": 9.213597158802639e-07, + "loss": 0.048, + "step": 10043 + }, + { + "epoch": 9.172602739726027, + "grad_norm": 0.05922950059175491, + "learning_rate": 9.203450025367834e-07, + "loss": 0.0004, + "step": 10044 + }, + { + "epoch": 9.173515981735159, + "grad_norm": 0.9093757271766663, + "learning_rate": 9.19330289193303e-07, + "loss": 0.0046, + "step": 10045 + }, + { + "epoch": 9.174429223744292, + "grad_norm": 29.802446365356445, + "learning_rate": 9.183155758498225e-07, + "loss": 0.1636, + "step": 10046 + }, + { + "epoch": 9.175342465753424, + "grad_norm": 1.1611344814300537, + "learning_rate": 9.17300862506342e-07, + "loss": 0.0077, + "step": 10047 + }, + { + "epoch": 9.176255707762557, + "grad_norm": 1.4882124662399292, + "learning_rate": 9.162861491628616e-07, + "loss": 0.007, + "step": 10048 + }, + { + "epoch": 9.177168949771689, + "grad_norm": 0.5391651391983032, + "learning_rate": 9.152714358193811e-07, + "loss": 0.0039, + "step": 10049 + }, + { + "epoch": 9.178082191780822, + "grad_norm": 15.644207954406738, + "learning_rate": 9.142567224759007e-07, + "loss": 0.083, + "step": 10050 + }, + { + "epoch": 9.178995433789954, + "grad_norm": 12.7400484085083, + "learning_rate": 9.132420091324201e-07, + "loss": 0.0849, + "step": 10051 + }, + { + "epoch": 9.179908675799087, + "grad_norm": 1.3587144613265991, + "learning_rate": 9.122272957889397e-07, + "loss": 0.0065, + "step": 10052 + }, + { + "epoch": 9.180821917808219, + "grad_norm": 0.1050049364566803, + "learning_rate": 9.112125824454592e-07, + "loss": 0.0006, + "step": 10053 + }, + { + "epoch": 9.181735159817352, + "grad_norm": 1.876326322555542, + "learning_rate": 9.101978691019787e-07, + "loss": 0.0134, + "step": 10054 + }, + { + "epoch": 9.182648401826484, + "grad_norm": 5.523950099945068, + "learning_rate": 9.091831557584983e-07, + "loss": 0.0255, + "step": 10055 + }, + { + "epoch": 9.183561643835617, + "grad_norm": 0.013389885425567627, + "learning_rate": 9.081684424150178e-07, + "loss": 0.0001, + "step": 10056 + }, + { + "epoch": 9.184474885844748, + "grad_norm": 1.2681201696395874, + "learning_rate": 9.071537290715374e-07, + "loss": 0.007, + "step": 10057 + }, + { + "epoch": 9.185388127853882, + "grad_norm": 36.398841857910156, + "learning_rate": 9.06139015728057e-07, + "loss": 0.2722, + "step": 10058 + }, + { + "epoch": 9.186301369863013, + "grad_norm": 0.4720153212547302, + "learning_rate": 9.051243023845764e-07, + "loss": 0.0035, + "step": 10059 + }, + { + "epoch": 9.187214611872147, + "grad_norm": 0.751547634601593, + "learning_rate": 9.04109589041096e-07, + "loss": 0.0046, + "step": 10060 + }, + { + "epoch": 9.188127853881278, + "grad_norm": 2.9652910232543945, + "learning_rate": 9.030948756976154e-07, + "loss": 0.019, + "step": 10061 + }, + { + "epoch": 9.189041095890412, + "grad_norm": 44.967384338378906, + "learning_rate": 9.02080162354135e-07, + "loss": 0.3418, + "step": 10062 + }, + { + "epoch": 9.189954337899543, + "grad_norm": 1.941185474395752, + "learning_rate": 9.010654490106546e-07, + "loss": 0.0077, + "step": 10063 + }, + { + "epoch": 9.190867579908677, + "grad_norm": 1.1089142560958862, + "learning_rate": 9.000507356671741e-07, + "loss": 0.0049, + "step": 10064 + }, + { + "epoch": 9.191780821917808, + "grad_norm": 3.5062122344970703, + "learning_rate": 8.990360223236936e-07, + "loss": 0.0226, + "step": 10065 + }, + { + "epoch": 9.192694063926941, + "grad_norm": 14.933874130249023, + "learning_rate": 8.980213089802131e-07, + "loss": 0.1347, + "step": 10066 + }, + { + "epoch": 9.193607305936073, + "grad_norm": 0.00724453991279006, + "learning_rate": 8.970065956367327e-07, + "loss": 0.0, + "step": 10067 + }, + { + "epoch": 9.194520547945206, + "grad_norm": 33.541290283203125, + "learning_rate": 8.959918822932523e-07, + "loss": 0.2194, + "step": 10068 + }, + { + "epoch": 9.195433789954338, + "grad_norm": 10.193276405334473, + "learning_rate": 8.949771689497718e-07, + "loss": 0.0624, + "step": 10069 + }, + { + "epoch": 9.19634703196347, + "grad_norm": 0.1837274432182312, + "learning_rate": 8.939624556062914e-07, + "loss": 0.001, + "step": 10070 + }, + { + "epoch": 9.197260273972603, + "grad_norm": 0.13744527101516724, + "learning_rate": 8.929477422628107e-07, + "loss": 0.0007, + "step": 10071 + }, + { + "epoch": 9.198173515981734, + "grad_norm": 0.5968984365463257, + "learning_rate": 8.919330289193303e-07, + "loss": 0.0033, + "step": 10072 + }, + { + "epoch": 9.199086757990868, + "grad_norm": 0.23295873403549194, + "learning_rate": 8.909183155758499e-07, + "loss": 0.0012, + "step": 10073 + }, + { + "epoch": 9.2, + "grad_norm": 0.8619002103805542, + "learning_rate": 8.899036022323694e-07, + "loss": 0.0064, + "step": 10074 + }, + { + "epoch": 9.200913242009133, + "grad_norm": 2.178135395050049, + "learning_rate": 8.88888888888889e-07, + "loss": 0.0089, + "step": 10075 + }, + { + "epoch": 9.201826484018264, + "grad_norm": 29.886987686157227, + "learning_rate": 8.878741755454085e-07, + "loss": 0.1632, + "step": 10076 + }, + { + "epoch": 9.202739726027398, + "grad_norm": 23.573284149169922, + "learning_rate": 8.86859462201928e-07, + "loss": 0.2608, + "step": 10077 + }, + { + "epoch": 9.203652968036529, + "grad_norm": 18.19723129272461, + "learning_rate": 8.858447488584476e-07, + "loss": 0.1162, + "step": 10078 + }, + { + "epoch": 9.204566210045662, + "grad_norm": 5.581526279449463, + "learning_rate": 8.848300355149671e-07, + "loss": 0.0288, + "step": 10079 + }, + { + "epoch": 9.205479452054794, + "grad_norm": 3.620969533920288, + "learning_rate": 8.838153221714867e-07, + "loss": 0.023, + "step": 10080 + }, + { + "epoch": 9.206392694063927, + "grad_norm": 0.031615760177373886, + "learning_rate": 8.828006088280061e-07, + "loss": 0.0002, + "step": 10081 + }, + { + "epoch": 9.207305936073059, + "grad_norm": 0.23590992391109467, + "learning_rate": 8.817858954845257e-07, + "loss": 0.0014, + "step": 10082 + }, + { + "epoch": 9.208219178082192, + "grad_norm": 2.7026753425598145, + "learning_rate": 8.807711821410452e-07, + "loss": 0.0169, + "step": 10083 + }, + { + "epoch": 9.209132420091324, + "grad_norm": 30.56671905517578, + "learning_rate": 8.797564687975647e-07, + "loss": 0.2536, + "step": 10084 + }, + { + "epoch": 9.210045662100457, + "grad_norm": 80.8609848022461, + "learning_rate": 8.787417554540843e-07, + "loss": 0.696, + "step": 10085 + }, + { + "epoch": 9.210958904109589, + "grad_norm": 0.7731278538703918, + "learning_rate": 8.777270421106038e-07, + "loss": 0.0059, + "step": 10086 + }, + { + "epoch": 9.211872146118722, + "grad_norm": 3.9215052127838135, + "learning_rate": 8.767123287671234e-07, + "loss": 0.0205, + "step": 10087 + }, + { + "epoch": 9.212785388127854, + "grad_norm": 21.528709411621094, + "learning_rate": 8.75697615423643e-07, + "loss": 0.1361, + "step": 10088 + }, + { + "epoch": 9.213698630136987, + "grad_norm": 3.665604591369629, + "learning_rate": 8.746829020801624e-07, + "loss": 0.0206, + "step": 10089 + }, + { + "epoch": 9.214611872146119, + "grad_norm": 0.5128761529922485, + "learning_rate": 8.73668188736682e-07, + "loss": 0.0028, + "step": 10090 + }, + { + "epoch": 9.215525114155252, + "grad_norm": 38.437232971191406, + "learning_rate": 8.726534753932014e-07, + "loss": 0.1669, + "step": 10091 + }, + { + "epoch": 9.216438356164383, + "grad_norm": 0.3408287465572357, + "learning_rate": 8.71638762049721e-07, + "loss": 0.0018, + "step": 10092 + }, + { + "epoch": 9.217351598173517, + "grad_norm": 0.5201738476753235, + "learning_rate": 8.706240487062406e-07, + "loss": 0.0036, + "step": 10093 + }, + { + "epoch": 9.218264840182648, + "grad_norm": 0.9152113199234009, + "learning_rate": 8.696093353627601e-07, + "loss": 0.0051, + "step": 10094 + }, + { + "epoch": 9.219178082191782, + "grad_norm": 0.4644927680492401, + "learning_rate": 8.685946220192796e-07, + "loss": 0.0029, + "step": 10095 + }, + { + "epoch": 9.220091324200913, + "grad_norm": 0.4064687192440033, + "learning_rate": 8.675799086757991e-07, + "loss": 0.001, + "step": 10096 + }, + { + "epoch": 9.221004566210045, + "grad_norm": 11.864093780517578, + "learning_rate": 8.665651953323187e-07, + "loss": 0.0498, + "step": 10097 + }, + { + "epoch": 9.221917808219178, + "grad_norm": 1.3872014284133911, + "learning_rate": 8.655504819888383e-07, + "loss": 0.0102, + "step": 10098 + }, + { + "epoch": 9.22283105022831, + "grad_norm": 0.22667816281318665, + "learning_rate": 8.645357686453578e-07, + "loss": 0.0021, + "step": 10099 + }, + { + "epoch": 9.223744292237443, + "grad_norm": 33.198543548583984, + "learning_rate": 8.635210553018774e-07, + "loss": 0.2582, + "step": 10100 + }, + { + "epoch": 9.224657534246575, + "grad_norm": 33.782432556152344, + "learning_rate": 8.625063419583967e-07, + "loss": 0.2721, + "step": 10101 + }, + { + "epoch": 9.225570776255708, + "grad_norm": 12.098878860473633, + "learning_rate": 8.614916286149163e-07, + "loss": 0.071, + "step": 10102 + }, + { + "epoch": 9.22648401826484, + "grad_norm": 47.58504104614258, + "learning_rate": 8.604769152714359e-07, + "loss": 0.2982, + "step": 10103 + }, + { + "epoch": 9.227397260273973, + "grad_norm": 15.140144348144531, + "learning_rate": 8.594622019279554e-07, + "loss": 0.059, + "step": 10104 + }, + { + "epoch": 9.228310502283104, + "grad_norm": 0.42407724261283875, + "learning_rate": 8.58447488584475e-07, + "loss": 0.0012, + "step": 10105 + }, + { + "epoch": 9.229223744292238, + "grad_norm": 4.5387468338012695, + "learning_rate": 8.574327752409945e-07, + "loss": 0.0191, + "step": 10106 + }, + { + "epoch": 9.23013698630137, + "grad_norm": 1.904900074005127, + "learning_rate": 8.56418061897514e-07, + "loss": 0.0115, + "step": 10107 + }, + { + "epoch": 9.231050228310503, + "grad_norm": 3.8386406898498535, + "learning_rate": 8.554033485540336e-07, + "loss": 0.0251, + "step": 10108 + }, + { + "epoch": 9.231963470319634, + "grad_norm": 0.1065196543931961, + "learning_rate": 8.543886352105531e-07, + "loss": 0.0007, + "step": 10109 + }, + { + "epoch": 9.232876712328768, + "grad_norm": 1.947831630706787, + "learning_rate": 8.533739218670727e-07, + "loss": 0.0049, + "step": 10110 + }, + { + "epoch": 9.2337899543379, + "grad_norm": 0.23197901248931885, + "learning_rate": 8.523592085235921e-07, + "loss": 0.0016, + "step": 10111 + }, + { + "epoch": 9.234703196347033, + "grad_norm": 9.744351387023926, + "learning_rate": 8.513444951801117e-07, + "loss": 0.0607, + "step": 10112 + }, + { + "epoch": 9.235616438356164, + "grad_norm": 7.520163059234619, + "learning_rate": 8.503297818366312e-07, + "loss": 0.054, + "step": 10113 + }, + { + "epoch": 9.236529680365297, + "grad_norm": 0.2227458953857422, + "learning_rate": 8.493150684931507e-07, + "loss": 0.0017, + "step": 10114 + }, + { + "epoch": 9.237442922374429, + "grad_norm": 57.85593032836914, + "learning_rate": 8.483003551496703e-07, + "loss": 0.3437, + "step": 10115 + }, + { + "epoch": 9.238356164383562, + "grad_norm": 2.101132392883301, + "learning_rate": 8.472856418061898e-07, + "loss": 0.0141, + "step": 10116 + }, + { + "epoch": 9.239269406392694, + "grad_norm": 25.032812118530273, + "learning_rate": 8.462709284627094e-07, + "loss": 0.0659, + "step": 10117 + }, + { + "epoch": 9.240182648401827, + "grad_norm": 0.08526314049959183, + "learning_rate": 8.45256215119229e-07, + "loss": 0.0006, + "step": 10118 + }, + { + "epoch": 9.241095890410959, + "grad_norm": 49.432640075683594, + "learning_rate": 8.442415017757484e-07, + "loss": 0.2235, + "step": 10119 + }, + { + "epoch": 9.242009132420092, + "grad_norm": 5.240802764892578, + "learning_rate": 8.43226788432268e-07, + "loss": 0.0236, + "step": 10120 + }, + { + "epoch": 9.242922374429224, + "grad_norm": 0.5464282035827637, + "learning_rate": 8.422120750887874e-07, + "loss": 0.0037, + "step": 10121 + }, + { + "epoch": 9.243835616438357, + "grad_norm": 0.015572863630950451, + "learning_rate": 8.41197361745307e-07, + "loss": 0.0001, + "step": 10122 + }, + { + "epoch": 9.244748858447489, + "grad_norm": 1.4367507696151733, + "learning_rate": 8.401826484018266e-07, + "loss": 0.0098, + "step": 10123 + }, + { + "epoch": 9.24566210045662, + "grad_norm": 0.4009878635406494, + "learning_rate": 8.391679350583461e-07, + "loss": 0.0029, + "step": 10124 + }, + { + "epoch": 9.246575342465754, + "grad_norm": 0.05375906080007553, + "learning_rate": 8.381532217148656e-07, + "loss": 0.0003, + "step": 10125 + }, + { + "epoch": 9.247488584474885, + "grad_norm": 3.4286372661590576, + "learning_rate": 8.371385083713851e-07, + "loss": 0.0241, + "step": 10126 + }, + { + "epoch": 9.248401826484018, + "grad_norm": 3.3756394386291504, + "learning_rate": 8.361237950279047e-07, + "loss": 0.0103, + "step": 10127 + }, + { + "epoch": 9.24931506849315, + "grad_norm": 6.8429484367370605, + "learning_rate": 8.351090816844243e-07, + "loss": 0.0332, + "step": 10128 + }, + { + "epoch": 9.250228310502283, + "grad_norm": 0.7467966675758362, + "learning_rate": 8.340943683409438e-07, + "loss": 0.0049, + "step": 10129 + }, + { + "epoch": 9.251141552511415, + "grad_norm": 1.2461342811584473, + "learning_rate": 8.330796549974634e-07, + "loss": 0.0099, + "step": 10130 + }, + { + "epoch": 9.252054794520548, + "grad_norm": 0.24215930700302124, + "learning_rate": 8.320649416539827e-07, + "loss": 0.0015, + "step": 10131 + }, + { + "epoch": 9.25296803652968, + "grad_norm": 5.288771152496338, + "learning_rate": 8.310502283105023e-07, + "loss": 0.0058, + "step": 10132 + }, + { + "epoch": 9.253881278538813, + "grad_norm": 21.676664352416992, + "learning_rate": 8.300355149670219e-07, + "loss": 0.1821, + "step": 10133 + }, + { + "epoch": 9.254794520547945, + "grad_norm": 6.436239242553711, + "learning_rate": 8.290208016235414e-07, + "loss": 0.0381, + "step": 10134 + }, + { + "epoch": 9.255707762557078, + "grad_norm": 8.0176420211792, + "learning_rate": 8.28006088280061e-07, + "loss": 0.0379, + "step": 10135 + }, + { + "epoch": 9.25662100456621, + "grad_norm": 0.13757435977458954, + "learning_rate": 8.269913749365805e-07, + "loss": 0.0008, + "step": 10136 + }, + { + "epoch": 9.257534246575343, + "grad_norm": 5.772387981414795, + "learning_rate": 8.259766615931e-07, + "loss": 0.0232, + "step": 10137 + }, + { + "epoch": 9.258447488584475, + "grad_norm": 0.06817848980426788, + "learning_rate": 8.249619482496196e-07, + "loss": 0.0005, + "step": 10138 + }, + { + "epoch": 9.259360730593608, + "grad_norm": 36.08654022216797, + "learning_rate": 8.239472349061391e-07, + "loss": 0.1911, + "step": 10139 + }, + { + "epoch": 9.26027397260274, + "grad_norm": 1.7647569179534912, + "learning_rate": 8.229325215626587e-07, + "loss": 0.0109, + "step": 10140 + }, + { + "epoch": 9.261187214611873, + "grad_norm": 0.16095508635044098, + "learning_rate": 8.219178082191781e-07, + "loss": 0.0005, + "step": 10141 + }, + { + "epoch": 9.262100456621004, + "grad_norm": 0.8297039866447449, + "learning_rate": 8.209030948756977e-07, + "loss": 0.0048, + "step": 10142 + }, + { + "epoch": 9.263013698630138, + "grad_norm": 0.08759438991546631, + "learning_rate": 8.198883815322172e-07, + "loss": 0.0004, + "step": 10143 + }, + { + "epoch": 9.26392694063927, + "grad_norm": 32.96742248535156, + "learning_rate": 8.188736681887367e-07, + "loss": 0.2047, + "step": 10144 + }, + { + "epoch": 9.264840182648403, + "grad_norm": 0.13092437386512756, + "learning_rate": 8.178589548452563e-07, + "loss": 0.0009, + "step": 10145 + }, + { + "epoch": 9.265753424657534, + "grad_norm": 0.2268526703119278, + "learning_rate": 8.168442415017758e-07, + "loss": 0.0012, + "step": 10146 + }, + { + "epoch": 9.266666666666667, + "grad_norm": 1.1279680728912354, + "learning_rate": 8.158295281582954e-07, + "loss": 0.0072, + "step": 10147 + }, + { + "epoch": 9.267579908675799, + "grad_norm": 0.8035292029380798, + "learning_rate": 8.14814814814815e-07, + "loss": 0.0059, + "step": 10148 + }, + { + "epoch": 9.268493150684932, + "grad_norm": 5.069411277770996, + "learning_rate": 8.138001014713344e-07, + "loss": 0.0361, + "step": 10149 + }, + { + "epoch": 9.269406392694064, + "grad_norm": 1.831070065498352, + "learning_rate": 8.12785388127854e-07, + "loss": 0.0085, + "step": 10150 + }, + { + "epoch": 9.270319634703196, + "grad_norm": 0.17923226952552795, + "learning_rate": 8.117706747843734e-07, + "loss": 0.0006, + "step": 10151 + }, + { + "epoch": 9.271232876712329, + "grad_norm": 0.4481487572193146, + "learning_rate": 8.10755961440893e-07, + "loss": 0.0034, + "step": 10152 + }, + { + "epoch": 9.27214611872146, + "grad_norm": 5.614796161651611, + "learning_rate": 8.097412480974126e-07, + "loss": 0.0271, + "step": 10153 + }, + { + "epoch": 9.273059360730594, + "grad_norm": 11.003302574157715, + "learning_rate": 8.08726534753932e-07, + "loss": 0.0549, + "step": 10154 + }, + { + "epoch": 9.273972602739725, + "grad_norm": 0.06157452613115311, + "learning_rate": 8.077118214104516e-07, + "loss": 0.0004, + "step": 10155 + }, + { + "epoch": 9.274885844748859, + "grad_norm": 1.6995688676834106, + "learning_rate": 8.066971080669711e-07, + "loss": 0.0124, + "step": 10156 + }, + { + "epoch": 9.27579908675799, + "grad_norm": 100.1946029663086, + "learning_rate": 8.056823947234907e-07, + "loss": 2.2253, + "step": 10157 + }, + { + "epoch": 9.276712328767124, + "grad_norm": 1.1324034929275513, + "learning_rate": 8.046676813800103e-07, + "loss": 0.007, + "step": 10158 + }, + { + "epoch": 9.277625570776255, + "grad_norm": 5.380889892578125, + "learning_rate": 8.036529680365298e-07, + "loss": 0.0325, + "step": 10159 + }, + { + "epoch": 9.278538812785389, + "grad_norm": 0.6878947615623474, + "learning_rate": 8.026382546930494e-07, + "loss": 0.0038, + "step": 10160 + }, + { + "epoch": 9.27945205479452, + "grad_norm": 1.5957121849060059, + "learning_rate": 8.016235413495687e-07, + "loss": 0.01, + "step": 10161 + }, + { + "epoch": 9.280365296803653, + "grad_norm": 3.6984546184539795, + "learning_rate": 8.006088280060883e-07, + "loss": 0.0103, + "step": 10162 + }, + { + "epoch": 9.281278538812785, + "grad_norm": 0.3219124674797058, + "learning_rate": 7.995941146626079e-07, + "loss": 0.0018, + "step": 10163 + }, + { + "epoch": 9.282191780821918, + "grad_norm": 16.996950149536133, + "learning_rate": 7.985794013191274e-07, + "loss": 0.092, + "step": 10164 + }, + { + "epoch": 9.28310502283105, + "grad_norm": 0.7147420048713684, + "learning_rate": 7.97564687975647e-07, + "loss": 0.0057, + "step": 10165 + }, + { + "epoch": 9.284018264840183, + "grad_norm": 8.781486511230469, + "learning_rate": 7.965499746321665e-07, + "loss": 0.0592, + "step": 10166 + }, + { + "epoch": 9.284931506849315, + "grad_norm": 1.0338881015777588, + "learning_rate": 7.95535261288686e-07, + "loss": 0.0069, + "step": 10167 + }, + { + "epoch": 9.285844748858448, + "grad_norm": 21.249385833740234, + "learning_rate": 7.945205479452056e-07, + "loss": 0.1559, + "step": 10168 + }, + { + "epoch": 9.28675799086758, + "grad_norm": 0.6614393591880798, + "learning_rate": 7.935058346017251e-07, + "loss": 0.0033, + "step": 10169 + }, + { + "epoch": 9.287671232876713, + "grad_norm": 0.022261038422584534, + "learning_rate": 7.924911212582447e-07, + "loss": 0.0001, + "step": 10170 + }, + { + "epoch": 9.288584474885845, + "grad_norm": 0.7498911023139954, + "learning_rate": 7.914764079147641e-07, + "loss": 0.0051, + "step": 10171 + }, + { + "epoch": 9.289497716894978, + "grad_norm": 0.7070425152778625, + "learning_rate": 7.904616945712837e-07, + "loss": 0.0058, + "step": 10172 + }, + { + "epoch": 9.29041095890411, + "grad_norm": 0.29877835512161255, + "learning_rate": 7.894469812278032e-07, + "loss": 0.0019, + "step": 10173 + }, + { + "epoch": 9.291324200913243, + "grad_norm": 1.2986315488815308, + "learning_rate": 7.884322678843227e-07, + "loss": 0.0047, + "step": 10174 + }, + { + "epoch": 9.292237442922374, + "grad_norm": 1.0387145280838013, + "learning_rate": 7.874175545408423e-07, + "loss": 0.0065, + "step": 10175 + }, + { + "epoch": 9.293150684931508, + "grad_norm": 87.12120056152344, + "learning_rate": 7.864028411973618e-07, + "loss": 2.2279, + "step": 10176 + }, + { + "epoch": 9.29406392694064, + "grad_norm": 42.35429763793945, + "learning_rate": 7.853881278538814e-07, + "loss": 0.2726, + "step": 10177 + }, + { + "epoch": 9.29497716894977, + "grad_norm": 12.654717445373535, + "learning_rate": 7.84373414510401e-07, + "loss": 0.0808, + "step": 10178 + }, + { + "epoch": 9.295890410958904, + "grad_norm": 35.50700378417969, + "learning_rate": 7.833587011669204e-07, + "loss": 0.2665, + "step": 10179 + }, + { + "epoch": 9.296803652968036, + "grad_norm": 42.461612701416016, + "learning_rate": 7.8234398782344e-07, + "loss": 0.1105, + "step": 10180 + }, + { + "epoch": 9.29771689497717, + "grad_norm": 1.9313734769821167, + "learning_rate": 7.813292744799594e-07, + "loss": 0.0111, + "step": 10181 + }, + { + "epoch": 9.2986301369863, + "grad_norm": 27.280330657958984, + "learning_rate": 7.80314561136479e-07, + "loss": 0.1888, + "step": 10182 + }, + { + "epoch": 9.299543378995434, + "grad_norm": 0.15578164160251617, + "learning_rate": 7.792998477929986e-07, + "loss": 0.0011, + "step": 10183 + }, + { + "epoch": 9.300456621004566, + "grad_norm": 0.7750376462936401, + "learning_rate": 7.78285134449518e-07, + "loss": 0.0031, + "step": 10184 + }, + { + "epoch": 9.301369863013699, + "grad_norm": 1.0837862491607666, + "learning_rate": 7.772704211060376e-07, + "loss": 0.0052, + "step": 10185 + }, + { + "epoch": 9.30228310502283, + "grad_norm": 0.061089418828487396, + "learning_rate": 7.762557077625571e-07, + "loss": 0.0004, + "step": 10186 + }, + { + "epoch": 9.303196347031964, + "grad_norm": 0.1835973709821701, + "learning_rate": 7.752409944190767e-07, + "loss": 0.0012, + "step": 10187 + }, + { + "epoch": 9.304109589041095, + "grad_norm": 8.217074394226074, + "learning_rate": 7.742262810755963e-07, + "loss": 0.039, + "step": 10188 + }, + { + "epoch": 9.305022831050229, + "grad_norm": 16.052339553833008, + "learning_rate": 7.732115677321158e-07, + "loss": 0.088, + "step": 10189 + }, + { + "epoch": 9.30593607305936, + "grad_norm": 4.225114345550537, + "learning_rate": 7.721968543886354e-07, + "loss": 0.027, + "step": 10190 + }, + { + "epoch": 9.306849315068494, + "grad_norm": 0.05783597752451897, + "learning_rate": 7.711821410451547e-07, + "loss": 0.0004, + "step": 10191 + }, + { + "epoch": 9.307762557077625, + "grad_norm": 1.5389958620071411, + "learning_rate": 7.701674277016743e-07, + "loss": 0.0099, + "step": 10192 + }, + { + "epoch": 9.308675799086759, + "grad_norm": 1.1769533157348633, + "learning_rate": 7.691527143581939e-07, + "loss": 0.0064, + "step": 10193 + }, + { + "epoch": 9.30958904109589, + "grad_norm": 2.0527100563049316, + "learning_rate": 7.681380010147134e-07, + "loss": 0.0149, + "step": 10194 + }, + { + "epoch": 9.310502283105023, + "grad_norm": 4.574986934661865, + "learning_rate": 7.67123287671233e-07, + "loss": 0.0245, + "step": 10195 + }, + { + "epoch": 9.311415525114155, + "grad_norm": 0.2297273427248001, + "learning_rate": 7.661085743277524e-07, + "loss": 0.0013, + "step": 10196 + }, + { + "epoch": 9.312328767123288, + "grad_norm": 124.16891479492188, + "learning_rate": 7.65093860984272e-07, + "loss": 6.1174, + "step": 10197 + }, + { + "epoch": 9.31324200913242, + "grad_norm": 10.985279083251953, + "learning_rate": 7.640791476407916e-07, + "loss": 0.0637, + "step": 10198 + }, + { + "epoch": 9.314155251141553, + "grad_norm": 1.9043501615524292, + "learning_rate": 7.630644342973111e-07, + "loss": 0.012, + "step": 10199 + }, + { + "epoch": 9.315068493150685, + "grad_norm": 0.43073657155036926, + "learning_rate": 7.620497209538307e-07, + "loss": 0.0025, + "step": 10200 + }, + { + "epoch": 9.315981735159818, + "grad_norm": 2.3084352016448975, + "learning_rate": 7.610350076103501e-07, + "loss": 0.0125, + "step": 10201 + }, + { + "epoch": 9.31689497716895, + "grad_norm": 11.496755599975586, + "learning_rate": 7.600202942668696e-07, + "loss": 0.1059, + "step": 10202 + }, + { + "epoch": 9.317808219178083, + "grad_norm": 0.21864478290081024, + "learning_rate": 7.590055809233892e-07, + "loss": 0.0013, + "step": 10203 + }, + { + "epoch": 9.318721461187215, + "grad_norm": 6.278544902801514, + "learning_rate": 7.579908675799087e-07, + "loss": 0.0393, + "step": 10204 + }, + { + "epoch": 9.319634703196346, + "grad_norm": 0.30257463455200195, + "learning_rate": 7.569761542364283e-07, + "loss": 0.0016, + "step": 10205 + }, + { + "epoch": 9.32054794520548, + "grad_norm": 2.2739460468292236, + "learning_rate": 7.559614408929478e-07, + "loss": 0.006, + "step": 10206 + }, + { + "epoch": 9.321461187214611, + "grad_norm": 29.20458221435547, + "learning_rate": 7.549467275494674e-07, + "loss": 0.1755, + "step": 10207 + }, + { + "epoch": 9.322374429223744, + "grad_norm": 22.93973159790039, + "learning_rate": 7.53932014205987e-07, + "loss": 0.1054, + "step": 10208 + }, + { + "epoch": 9.323287671232876, + "grad_norm": 11.245747566223145, + "learning_rate": 7.529173008625064e-07, + "loss": 0.0494, + "step": 10209 + }, + { + "epoch": 9.32420091324201, + "grad_norm": 0.9681994915008545, + "learning_rate": 7.51902587519026e-07, + "loss": 0.0058, + "step": 10210 + }, + { + "epoch": 9.325114155251141, + "grad_norm": 5.921933174133301, + "learning_rate": 7.508878741755454e-07, + "loss": 0.0399, + "step": 10211 + }, + { + "epoch": 9.326027397260274, + "grad_norm": 15.041393280029297, + "learning_rate": 7.49873160832065e-07, + "loss": 0.1099, + "step": 10212 + }, + { + "epoch": 9.326940639269406, + "grad_norm": 1.4398236274719238, + "learning_rate": 7.488584474885845e-07, + "loss": 0.0096, + "step": 10213 + }, + { + "epoch": 9.32785388127854, + "grad_norm": 0.3827212154865265, + "learning_rate": 7.47843734145104e-07, + "loss": 0.0023, + "step": 10214 + }, + { + "epoch": 9.32876712328767, + "grad_norm": 1.4490984678268433, + "learning_rate": 7.468290208016236e-07, + "loss": 0.0085, + "step": 10215 + }, + { + "epoch": 9.329680365296804, + "grad_norm": 74.493408203125, + "learning_rate": 7.458143074581431e-07, + "loss": 1.1157, + "step": 10216 + }, + { + "epoch": 9.330593607305936, + "grad_norm": 101.32250213623047, + "learning_rate": 7.447995941146627e-07, + "loss": 1.7322, + "step": 10217 + }, + { + "epoch": 9.331506849315069, + "grad_norm": 0.2421054244041443, + "learning_rate": 7.437848807711822e-07, + "loss": 0.0016, + "step": 10218 + }, + { + "epoch": 9.3324200913242, + "grad_norm": 3.4160289764404297, + "learning_rate": 7.427701674277018e-07, + "loss": 0.0163, + "step": 10219 + }, + { + "epoch": 9.333333333333334, + "grad_norm": 0.03981028124690056, + "learning_rate": 7.417554540842214e-07, + "loss": 0.0002, + "step": 10220 + }, + { + "epoch": 9.334246575342465, + "grad_norm": 0.12421702593564987, + "learning_rate": 7.407407407407407e-07, + "loss": 0.0007, + "step": 10221 + }, + { + "epoch": 9.335159817351599, + "grad_norm": 5.519114971160889, + "learning_rate": 7.397260273972603e-07, + "loss": 0.0284, + "step": 10222 + }, + { + "epoch": 9.33607305936073, + "grad_norm": 1.6384536027908325, + "learning_rate": 7.387113140537798e-07, + "loss": 0.0125, + "step": 10223 + }, + { + "epoch": 9.336986301369864, + "grad_norm": 2.9575376510620117, + "learning_rate": 7.376966007102994e-07, + "loss": 0.0179, + "step": 10224 + }, + { + "epoch": 9.337899543378995, + "grad_norm": 3.6130475997924805, + "learning_rate": 7.36681887366819e-07, + "loss": 0.0267, + "step": 10225 + }, + { + "epoch": 9.338812785388129, + "grad_norm": 0.08840052038431168, + "learning_rate": 7.356671740233384e-07, + "loss": 0.0005, + "step": 10226 + }, + { + "epoch": 9.33972602739726, + "grad_norm": 10.779854774475098, + "learning_rate": 7.34652460679858e-07, + "loss": 0.036, + "step": 10227 + }, + { + "epoch": 9.340639269406394, + "grad_norm": 0.8618690967559814, + "learning_rate": 7.336377473363775e-07, + "loss": 0.0059, + "step": 10228 + }, + { + "epoch": 9.341552511415525, + "grad_norm": 0.27104219794273376, + "learning_rate": 7.326230339928971e-07, + "loss": 0.0014, + "step": 10229 + }, + { + "epoch": 9.342465753424657, + "grad_norm": 36.23059844970703, + "learning_rate": 7.316083206494167e-07, + "loss": 0.2745, + "step": 10230 + }, + { + "epoch": 9.34337899543379, + "grad_norm": 0.5787906646728516, + "learning_rate": 7.305936073059361e-07, + "loss": 0.0026, + "step": 10231 + }, + { + "epoch": 9.344292237442922, + "grad_norm": 10.057059288024902, + "learning_rate": 7.295788939624556e-07, + "loss": 0.0646, + "step": 10232 + }, + { + "epoch": 9.345205479452055, + "grad_norm": 94.78102111816406, + "learning_rate": 7.285641806189751e-07, + "loss": 1.8126, + "step": 10233 + }, + { + "epoch": 9.346118721461186, + "grad_norm": 0.27399083971977234, + "learning_rate": 7.275494672754947e-07, + "loss": 0.0018, + "step": 10234 + }, + { + "epoch": 9.34703196347032, + "grad_norm": 4.27701997756958, + "learning_rate": 7.265347539320143e-07, + "loss": 0.0222, + "step": 10235 + }, + { + "epoch": 9.347945205479451, + "grad_norm": 6.989017963409424, + "learning_rate": 7.255200405885338e-07, + "loss": 0.0531, + "step": 10236 + }, + { + "epoch": 9.348858447488585, + "grad_norm": 0.2785792946815491, + "learning_rate": 7.245053272450534e-07, + "loss": 0.0021, + "step": 10237 + }, + { + "epoch": 9.349771689497716, + "grad_norm": 22.798542022705078, + "learning_rate": 7.234906139015728e-07, + "loss": 0.0968, + "step": 10238 + }, + { + "epoch": 9.35068493150685, + "grad_norm": 21.992374420166016, + "learning_rate": 7.224759005580924e-07, + "loss": 0.1953, + "step": 10239 + }, + { + "epoch": 9.351598173515981, + "grad_norm": 0.767890214920044, + "learning_rate": 7.21461187214612e-07, + "loss": 0.005, + "step": 10240 + }, + { + "epoch": 9.352511415525115, + "grad_norm": 6.439982891082764, + "learning_rate": 7.204464738711314e-07, + "loss": 0.047, + "step": 10241 + }, + { + "epoch": 9.353424657534246, + "grad_norm": 0.5119748711585999, + "learning_rate": 7.19431760527651e-07, + "loss": 0.003, + "step": 10242 + }, + { + "epoch": 9.35433789954338, + "grad_norm": 0.8424152731895447, + "learning_rate": 7.184170471841705e-07, + "loss": 0.0069, + "step": 10243 + }, + { + "epoch": 9.355251141552511, + "grad_norm": 0.8276972770690918, + "learning_rate": 7.1740233384069e-07, + "loss": 0.0058, + "step": 10244 + }, + { + "epoch": 9.356164383561644, + "grad_norm": 1.2861334085464478, + "learning_rate": 7.163876204972096e-07, + "loss": 0.0062, + "step": 10245 + }, + { + "epoch": 9.357077625570776, + "grad_norm": 0.019020158797502518, + "learning_rate": 7.153729071537291e-07, + "loss": 0.0001, + "step": 10246 + }, + { + "epoch": 9.35799086757991, + "grad_norm": 36.289485931396484, + "learning_rate": 7.143581938102487e-07, + "loss": 0.2148, + "step": 10247 + }, + { + "epoch": 9.35890410958904, + "grad_norm": 1.4175280332565308, + "learning_rate": 7.133434804667682e-07, + "loss": 0.009, + "step": 10248 + }, + { + "epoch": 9.359817351598174, + "grad_norm": 18.043434143066406, + "learning_rate": 7.123287671232878e-07, + "loss": 0.0779, + "step": 10249 + }, + { + "epoch": 9.360730593607306, + "grad_norm": 27.71659278869629, + "learning_rate": 7.113140537798073e-07, + "loss": 0.1938, + "step": 10250 + }, + { + "epoch": 9.361643835616439, + "grad_norm": 1.3663426637649536, + "learning_rate": 7.102993404363267e-07, + "loss": 0.009, + "step": 10251 + }, + { + "epoch": 9.36255707762557, + "grad_norm": 0.11248617619276047, + "learning_rate": 7.092846270928463e-07, + "loss": 0.0008, + "step": 10252 + }, + { + "epoch": 9.363470319634704, + "grad_norm": 19.814228057861328, + "learning_rate": 7.082699137493658e-07, + "loss": 0.1791, + "step": 10253 + }, + { + "epoch": 9.364383561643836, + "grad_norm": 0.47731688618659973, + "learning_rate": 7.072552004058854e-07, + "loss": 0.0013, + "step": 10254 + }, + { + "epoch": 9.365296803652967, + "grad_norm": 2.4987199306488037, + "learning_rate": 7.06240487062405e-07, + "loss": 0.0107, + "step": 10255 + }, + { + "epoch": 9.3662100456621, + "grad_norm": 0.6923184990882874, + "learning_rate": 7.052257737189244e-07, + "loss": 0.0054, + "step": 10256 + }, + { + "epoch": 9.367123287671232, + "grad_norm": 3.6777453422546387, + "learning_rate": 7.04211060375444e-07, + "loss": 0.0212, + "step": 10257 + }, + { + "epoch": 9.368036529680365, + "grad_norm": 81.48926544189453, + "learning_rate": 7.031963470319635e-07, + "loss": 1.0832, + "step": 10258 + }, + { + "epoch": 9.368949771689497, + "grad_norm": 0.048838768154382706, + "learning_rate": 7.021816336884831e-07, + "loss": 0.0004, + "step": 10259 + }, + { + "epoch": 9.36986301369863, + "grad_norm": 19.80209732055664, + "learning_rate": 7.011669203450027e-07, + "loss": 0.0884, + "step": 10260 + }, + { + "epoch": 9.370776255707762, + "grad_norm": 0.43523356318473816, + "learning_rate": 7.001522070015221e-07, + "loss": 0.0024, + "step": 10261 + }, + { + "epoch": 9.371689497716895, + "grad_norm": 3.4356002807617188, + "learning_rate": 6.991374936580416e-07, + "loss": 0.0223, + "step": 10262 + }, + { + "epoch": 9.372602739726027, + "grad_norm": 0.06584371626377106, + "learning_rate": 6.981227803145611e-07, + "loss": 0.0003, + "step": 10263 + }, + { + "epoch": 9.37351598173516, + "grad_norm": 0.02848367765545845, + "learning_rate": 6.971080669710807e-07, + "loss": 0.0001, + "step": 10264 + }, + { + "epoch": 9.374429223744292, + "grad_norm": 3.2935500144958496, + "learning_rate": 6.960933536276003e-07, + "loss": 0.0217, + "step": 10265 + }, + { + "epoch": 9.375342465753425, + "grad_norm": 0.056618113070726395, + "learning_rate": 6.950786402841198e-07, + "loss": 0.0003, + "step": 10266 + }, + { + "epoch": 9.376255707762557, + "grad_norm": 1.214269995689392, + "learning_rate": 6.940639269406394e-07, + "loss": 0.008, + "step": 10267 + }, + { + "epoch": 9.37716894977169, + "grad_norm": 1.3423508405685425, + "learning_rate": 6.930492135971588e-07, + "loss": 0.0046, + "step": 10268 + }, + { + "epoch": 9.378082191780821, + "grad_norm": 2.5121524333953857, + "learning_rate": 6.920345002536784e-07, + "loss": 0.0143, + "step": 10269 + }, + { + "epoch": 9.378995433789955, + "grad_norm": 1.3426175117492676, + "learning_rate": 6.91019786910198e-07, + "loss": 0.009, + "step": 10270 + }, + { + "epoch": 9.379908675799086, + "grad_norm": 0.09614154696464539, + "learning_rate": 6.900050735667174e-07, + "loss": 0.0005, + "step": 10271 + }, + { + "epoch": 9.38082191780822, + "grad_norm": 8.424325942993164, + "learning_rate": 6.88990360223237e-07, + "loss": 0.0534, + "step": 10272 + }, + { + "epoch": 9.381735159817351, + "grad_norm": 6.618882656097412, + "learning_rate": 6.879756468797565e-07, + "loss": 0.0444, + "step": 10273 + }, + { + "epoch": 9.382648401826485, + "grad_norm": 2.6432995796203613, + "learning_rate": 6.86960933536276e-07, + "loss": 0.01, + "step": 10274 + }, + { + "epoch": 9.383561643835616, + "grad_norm": 22.75399398803711, + "learning_rate": 6.859462201927956e-07, + "loss": 0.1811, + "step": 10275 + }, + { + "epoch": 9.38447488584475, + "grad_norm": 0.0907229483127594, + "learning_rate": 6.849315068493151e-07, + "loss": 0.0004, + "step": 10276 + }, + { + "epoch": 9.385388127853881, + "grad_norm": 5.009456157684326, + "learning_rate": 6.839167935058347e-07, + "loss": 0.0229, + "step": 10277 + }, + { + "epoch": 9.386301369863014, + "grad_norm": 52.731502532958984, + "learning_rate": 6.829020801623542e-07, + "loss": 0.4804, + "step": 10278 + }, + { + "epoch": 9.387214611872146, + "grad_norm": 1.136089563369751, + "learning_rate": 6.818873668188738e-07, + "loss": 0.0065, + "step": 10279 + }, + { + "epoch": 9.38812785388128, + "grad_norm": 3.8160340785980225, + "learning_rate": 6.808726534753933e-07, + "loss": 0.0262, + "step": 10280 + }, + { + "epoch": 9.389041095890411, + "grad_norm": 0.6121772527694702, + "learning_rate": 6.798579401319127e-07, + "loss": 0.0029, + "step": 10281 + }, + { + "epoch": 9.389954337899542, + "grad_norm": 13.628019332885742, + "learning_rate": 6.788432267884323e-07, + "loss": 0.0743, + "step": 10282 + }, + { + "epoch": 9.390867579908676, + "grad_norm": 0.9615313410758972, + "learning_rate": 6.778285134449518e-07, + "loss": 0.0051, + "step": 10283 + }, + { + "epoch": 9.391780821917807, + "grad_norm": 0.4184541702270508, + "learning_rate": 6.768138001014714e-07, + "loss": 0.0021, + "step": 10284 + }, + { + "epoch": 9.39269406392694, + "grad_norm": 17.32297134399414, + "learning_rate": 6.75799086757991e-07, + "loss": 0.0832, + "step": 10285 + }, + { + "epoch": 9.393607305936072, + "grad_norm": 33.418087005615234, + "learning_rate": 6.747843734145104e-07, + "loss": 0.2333, + "step": 10286 + }, + { + "epoch": 9.394520547945206, + "grad_norm": 5.492107391357422, + "learning_rate": 6.7376966007103e-07, + "loss": 0.0296, + "step": 10287 + }, + { + "epoch": 9.395433789954337, + "grad_norm": 10.041707038879395, + "learning_rate": 6.727549467275495e-07, + "loss": 0.0783, + "step": 10288 + }, + { + "epoch": 9.39634703196347, + "grad_norm": 3.2702484130859375, + "learning_rate": 6.717402333840691e-07, + "loss": 0.0141, + "step": 10289 + }, + { + "epoch": 9.397260273972602, + "grad_norm": 7.060543060302734, + "learning_rate": 6.707255200405887e-07, + "loss": 0.0323, + "step": 10290 + }, + { + "epoch": 9.398173515981735, + "grad_norm": 0.5648098587989807, + "learning_rate": 6.697108066971081e-07, + "loss": 0.0027, + "step": 10291 + }, + { + "epoch": 9.399086757990867, + "grad_norm": 83.0674819946289, + "learning_rate": 6.686960933536276e-07, + "loss": 0.9945, + "step": 10292 + }, + { + "epoch": 9.4, + "grad_norm": 4.196168422698975, + "learning_rate": 6.676813800101471e-07, + "loss": 0.0315, + "step": 10293 + }, + { + "epoch": 9.400913242009132, + "grad_norm": 0.6188735961914062, + "learning_rate": 6.666666666666667e-07, + "loss": 0.0039, + "step": 10294 + }, + { + "epoch": 9.401826484018265, + "grad_norm": 210.55397033691406, + "learning_rate": 6.656519533231863e-07, + "loss": 2.3517, + "step": 10295 + }, + { + "epoch": 9.402739726027397, + "grad_norm": 1.0820205211639404, + "learning_rate": 6.646372399797058e-07, + "loss": 0.0045, + "step": 10296 + }, + { + "epoch": 9.40365296803653, + "grad_norm": 1.3743959665298462, + "learning_rate": 6.636225266362254e-07, + "loss": 0.0074, + "step": 10297 + }, + { + "epoch": 9.404566210045662, + "grad_norm": 71.09619140625, + "learning_rate": 6.626078132927448e-07, + "loss": 0.7436, + "step": 10298 + }, + { + "epoch": 9.405479452054795, + "grad_norm": 52.414859771728516, + "learning_rate": 6.615930999492644e-07, + "loss": 0.7443, + "step": 10299 + }, + { + "epoch": 9.406392694063927, + "grad_norm": 2.136364459991455, + "learning_rate": 6.60578386605784e-07, + "loss": 0.0156, + "step": 10300 + }, + { + "epoch": 9.40730593607306, + "grad_norm": 1.808579444885254, + "learning_rate": 6.595636732623034e-07, + "loss": 0.0075, + "step": 10301 + }, + { + "epoch": 9.408219178082192, + "grad_norm": 3.4458744525909424, + "learning_rate": 6.58548959918823e-07, + "loss": 0.0234, + "step": 10302 + }, + { + "epoch": 9.409132420091325, + "grad_norm": 0.09457962960004807, + "learning_rate": 6.575342465753425e-07, + "loss": 0.0005, + "step": 10303 + }, + { + "epoch": 9.410045662100456, + "grad_norm": 2.5755069255828857, + "learning_rate": 6.56519533231862e-07, + "loss": 0.0131, + "step": 10304 + }, + { + "epoch": 9.41095890410959, + "grad_norm": 0.571942150592804, + "learning_rate": 6.555048198883816e-07, + "loss": 0.0041, + "step": 10305 + }, + { + "epoch": 9.411872146118721, + "grad_norm": 3.1752209663391113, + "learning_rate": 6.544901065449011e-07, + "loss": 0.0197, + "step": 10306 + }, + { + "epoch": 9.412785388127855, + "grad_norm": 0.1674412041902542, + "learning_rate": 6.534753932014207e-07, + "loss": 0.0007, + "step": 10307 + }, + { + "epoch": 9.413698630136986, + "grad_norm": 0.6606001257896423, + "learning_rate": 6.524606798579402e-07, + "loss": 0.003, + "step": 10308 + }, + { + "epoch": 9.414611872146118, + "grad_norm": 11.78971004486084, + "learning_rate": 6.514459665144598e-07, + "loss": 0.0265, + "step": 10309 + }, + { + "epoch": 9.415525114155251, + "grad_norm": 0.11610882729291916, + "learning_rate": 6.504312531709793e-07, + "loss": 0.001, + "step": 10310 + }, + { + "epoch": 9.416438356164383, + "grad_norm": 1.1322689056396484, + "learning_rate": 6.494165398274987e-07, + "loss": 0.0089, + "step": 10311 + }, + { + "epoch": 9.417351598173516, + "grad_norm": 0.16114938259124756, + "learning_rate": 6.484018264840183e-07, + "loss": 0.0009, + "step": 10312 + }, + { + "epoch": 9.418264840182648, + "grad_norm": 71.23600769042969, + "learning_rate": 6.473871131405378e-07, + "loss": 0.3036, + "step": 10313 + }, + { + "epoch": 9.419178082191781, + "grad_norm": 0.37428489327430725, + "learning_rate": 6.463723997970574e-07, + "loss": 0.0019, + "step": 10314 + }, + { + "epoch": 9.420091324200913, + "grad_norm": 0.5586318969726562, + "learning_rate": 6.45357686453577e-07, + "loss": 0.0033, + "step": 10315 + }, + { + "epoch": 9.421004566210046, + "grad_norm": 1.0208959579467773, + "learning_rate": 6.443429731100964e-07, + "loss": 0.006, + "step": 10316 + }, + { + "epoch": 9.421917808219177, + "grad_norm": 0.2021511346101761, + "learning_rate": 6.43328259766616e-07, + "loss": 0.0016, + "step": 10317 + }, + { + "epoch": 9.42283105022831, + "grad_norm": 0.2458174079656601, + "learning_rate": 6.423135464231355e-07, + "loss": 0.0013, + "step": 10318 + }, + { + "epoch": 9.423744292237442, + "grad_norm": 1.2281148433685303, + "learning_rate": 6.412988330796551e-07, + "loss": 0.0074, + "step": 10319 + }, + { + "epoch": 9.424657534246576, + "grad_norm": 31.476865768432617, + "learning_rate": 6.402841197361747e-07, + "loss": 0.3794, + "step": 10320 + }, + { + "epoch": 9.425570776255707, + "grad_norm": 5.281273365020752, + "learning_rate": 6.39269406392694e-07, + "loss": 0.0268, + "step": 10321 + }, + { + "epoch": 9.42648401826484, + "grad_norm": 4.344176292419434, + "learning_rate": 6.382546930492136e-07, + "loss": 0.0194, + "step": 10322 + }, + { + "epoch": 9.427397260273972, + "grad_norm": 8.569454193115234, + "learning_rate": 6.372399797057331e-07, + "loss": 0.07, + "step": 10323 + }, + { + "epoch": 9.428310502283106, + "grad_norm": 0.22381338477134705, + "learning_rate": 6.362252663622527e-07, + "loss": 0.0014, + "step": 10324 + }, + { + "epoch": 9.429223744292237, + "grad_norm": 18.59149742126465, + "learning_rate": 6.352105530187723e-07, + "loss": 0.1138, + "step": 10325 + }, + { + "epoch": 9.43013698630137, + "grad_norm": 0.13342152535915375, + "learning_rate": 6.341958396752918e-07, + "loss": 0.0008, + "step": 10326 + }, + { + "epoch": 9.431050228310502, + "grad_norm": 0.2510927617549896, + "learning_rate": 6.331811263318114e-07, + "loss": 0.0015, + "step": 10327 + }, + { + "epoch": 9.431963470319635, + "grad_norm": 1.7759124040603638, + "learning_rate": 6.321664129883308e-07, + "loss": 0.0151, + "step": 10328 + }, + { + "epoch": 9.432876712328767, + "grad_norm": 1.4608718156814575, + "learning_rate": 6.311516996448504e-07, + "loss": 0.0087, + "step": 10329 + }, + { + "epoch": 9.4337899543379, + "grad_norm": 29.686079025268555, + "learning_rate": 6.3013698630137e-07, + "loss": 0.1047, + "step": 10330 + }, + { + "epoch": 9.434703196347032, + "grad_norm": 1.0883127450942993, + "learning_rate": 6.291222729578894e-07, + "loss": 0.0041, + "step": 10331 + }, + { + "epoch": 9.435616438356165, + "grad_norm": 71.3584976196289, + "learning_rate": 6.28107559614409e-07, + "loss": 1.0736, + "step": 10332 + }, + { + "epoch": 9.436529680365297, + "grad_norm": 4.324583053588867, + "learning_rate": 6.270928462709285e-07, + "loss": 0.0281, + "step": 10333 + }, + { + "epoch": 9.43744292237443, + "grad_norm": 16.97123146057129, + "learning_rate": 6.26078132927448e-07, + "loss": 0.0822, + "step": 10334 + }, + { + "epoch": 9.438356164383562, + "grad_norm": 20.290454864501953, + "learning_rate": 6.250634195839676e-07, + "loss": 0.0888, + "step": 10335 + }, + { + "epoch": 9.439269406392693, + "grad_norm": 0.9876218438148499, + "learning_rate": 6.240487062404871e-07, + "loss": 0.0068, + "step": 10336 + }, + { + "epoch": 9.440182648401827, + "grad_norm": 0.4852045774459839, + "learning_rate": 6.230339928970067e-07, + "loss": 0.003, + "step": 10337 + }, + { + "epoch": 9.441095890410958, + "grad_norm": 5.915914535522461, + "learning_rate": 6.220192795535262e-07, + "loss": 0.0464, + "step": 10338 + }, + { + "epoch": 9.442009132420091, + "grad_norm": 0.1487431526184082, + "learning_rate": 6.210045662100458e-07, + "loss": 0.0011, + "step": 10339 + }, + { + "epoch": 9.442922374429223, + "grad_norm": 12.339932441711426, + "learning_rate": 6.199898528665652e-07, + "loss": 0.0463, + "step": 10340 + }, + { + "epoch": 9.443835616438356, + "grad_norm": 0.34873586893081665, + "learning_rate": 6.189751395230847e-07, + "loss": 0.0021, + "step": 10341 + }, + { + "epoch": 9.444748858447488, + "grad_norm": 0.04876101762056351, + "learning_rate": 6.179604261796043e-07, + "loss": 0.0004, + "step": 10342 + }, + { + "epoch": 9.445662100456621, + "grad_norm": 2.9514944553375244, + "learning_rate": 6.169457128361239e-07, + "loss": 0.0197, + "step": 10343 + }, + { + "epoch": 9.446575342465753, + "grad_norm": 1.076826572418213, + "learning_rate": 6.159309994926434e-07, + "loss": 0.0046, + "step": 10344 + }, + { + "epoch": 9.447488584474886, + "grad_norm": 3.1791298389434814, + "learning_rate": 6.149162861491628e-07, + "loss": 0.0269, + "step": 10345 + }, + { + "epoch": 9.448401826484018, + "grad_norm": 0.07517550140619278, + "learning_rate": 6.139015728056824e-07, + "loss": 0.0003, + "step": 10346 + }, + { + "epoch": 9.449315068493151, + "grad_norm": 0.5120327472686768, + "learning_rate": 6.12886859462202e-07, + "loss": 0.0031, + "step": 10347 + }, + { + "epoch": 9.450228310502283, + "grad_norm": 1.410696268081665, + "learning_rate": 6.118721461187215e-07, + "loss": 0.0069, + "step": 10348 + }, + { + "epoch": 9.451141552511416, + "grad_norm": 0.5924072861671448, + "learning_rate": 6.108574327752411e-07, + "loss": 0.0029, + "step": 10349 + }, + { + "epoch": 9.452054794520548, + "grad_norm": 0.11241094768047333, + "learning_rate": 6.098427194317606e-07, + "loss": 0.0006, + "step": 10350 + }, + { + "epoch": 9.45296803652968, + "grad_norm": 0.14373722672462463, + "learning_rate": 6.0882800608828e-07, + "loss": 0.0007, + "step": 10351 + }, + { + "epoch": 9.453881278538812, + "grad_norm": 0.35078078508377075, + "learning_rate": 6.078132927447996e-07, + "loss": 0.0019, + "step": 10352 + }, + { + "epoch": 9.454794520547946, + "grad_norm": 0.2170308232307434, + "learning_rate": 6.067985794013192e-07, + "loss": 0.0014, + "step": 10353 + }, + { + "epoch": 9.455707762557077, + "grad_norm": 20.256563186645508, + "learning_rate": 6.057838660578387e-07, + "loss": 0.1284, + "step": 10354 + }, + { + "epoch": 9.45662100456621, + "grad_norm": 6.295645236968994, + "learning_rate": 6.047691527143582e-07, + "loss": 0.036, + "step": 10355 + }, + { + "epoch": 9.457534246575342, + "grad_norm": 28.26291847229004, + "learning_rate": 6.037544393708778e-07, + "loss": 0.1689, + "step": 10356 + }, + { + "epoch": 9.458447488584476, + "grad_norm": 30.404678344726562, + "learning_rate": 6.027397260273974e-07, + "loss": 0.2162, + "step": 10357 + }, + { + "epoch": 9.459360730593607, + "grad_norm": 6.3203935623168945, + "learning_rate": 6.017250126839168e-07, + "loss": 0.0522, + "step": 10358 + }, + { + "epoch": 9.46027397260274, + "grad_norm": 0.6931574940681458, + "learning_rate": 6.007102993404364e-07, + "loss": 0.0037, + "step": 10359 + }, + { + "epoch": 9.461187214611872, + "grad_norm": 65.76372528076172, + "learning_rate": 5.996955859969559e-07, + "loss": 0.6725, + "step": 10360 + }, + { + "epoch": 9.462100456621005, + "grad_norm": 0.5029572248458862, + "learning_rate": 5.986808726534754e-07, + "loss": 0.0031, + "step": 10361 + }, + { + "epoch": 9.463013698630137, + "grad_norm": 3.877229690551758, + "learning_rate": 5.97666159309995e-07, + "loss": 0.0323, + "step": 10362 + }, + { + "epoch": 9.463926940639269, + "grad_norm": 66.02428436279297, + "learning_rate": 5.966514459665146e-07, + "loss": 0.411, + "step": 10363 + }, + { + "epoch": 9.464840182648402, + "grad_norm": 0.45217958092689514, + "learning_rate": 5.95636732623034e-07, + "loss": 0.0034, + "step": 10364 + }, + { + "epoch": 9.465753424657533, + "grad_norm": 5.673483848571777, + "learning_rate": 5.946220192795535e-07, + "loss": 0.0508, + "step": 10365 + }, + { + "epoch": 9.466666666666667, + "grad_norm": 0.28351080417633057, + "learning_rate": 5.936073059360731e-07, + "loss": 0.0015, + "step": 10366 + }, + { + "epoch": 9.467579908675798, + "grad_norm": 0.9632049798965454, + "learning_rate": 5.925925925925927e-07, + "loss": 0.0058, + "step": 10367 + }, + { + "epoch": 9.468493150684932, + "grad_norm": 13.109999656677246, + "learning_rate": 5.915778792491122e-07, + "loss": 0.0877, + "step": 10368 + }, + { + "epoch": 9.469406392694063, + "grad_norm": 13.086474418640137, + "learning_rate": 5.905631659056318e-07, + "loss": 0.0741, + "step": 10369 + }, + { + "epoch": 9.470319634703197, + "grad_norm": 1.8512850999832153, + "learning_rate": 5.895484525621512e-07, + "loss": 0.0111, + "step": 10370 + }, + { + "epoch": 9.471232876712328, + "grad_norm": 0.7374162673950195, + "learning_rate": 5.885337392186707e-07, + "loss": 0.0036, + "step": 10371 + }, + { + "epoch": 9.472146118721462, + "grad_norm": 9.946258544921875, + "learning_rate": 5.875190258751903e-07, + "loss": 0.0731, + "step": 10372 + }, + { + "epoch": 9.473059360730593, + "grad_norm": 1.6949926614761353, + "learning_rate": 5.865043125317099e-07, + "loss": 0.0081, + "step": 10373 + }, + { + "epoch": 9.473972602739726, + "grad_norm": 1.8954274654388428, + "learning_rate": 5.854895991882294e-07, + "loss": 0.0093, + "step": 10374 + }, + { + "epoch": 9.474885844748858, + "grad_norm": 27.8588809967041, + "learning_rate": 5.844748858447488e-07, + "loss": 0.2655, + "step": 10375 + }, + { + "epoch": 9.475799086757991, + "grad_norm": 2.9439456462860107, + "learning_rate": 5.834601725012684e-07, + "loss": 0.0129, + "step": 10376 + }, + { + "epoch": 9.476712328767123, + "grad_norm": 2.441547155380249, + "learning_rate": 5.82445459157788e-07, + "loss": 0.0121, + "step": 10377 + }, + { + "epoch": 9.477625570776256, + "grad_norm": 3.7288801670074463, + "learning_rate": 5.814307458143075e-07, + "loss": 0.0256, + "step": 10378 + }, + { + "epoch": 9.478538812785388, + "grad_norm": 1.2618329524993896, + "learning_rate": 5.804160324708271e-07, + "loss": 0.0065, + "step": 10379 + }, + { + "epoch": 9.479452054794521, + "grad_norm": 1.9954500198364258, + "learning_rate": 5.794013191273466e-07, + "loss": 0.0099, + "step": 10380 + }, + { + "epoch": 9.480365296803653, + "grad_norm": 0.07799458503723145, + "learning_rate": 5.78386605783866e-07, + "loss": 0.0006, + "step": 10381 + }, + { + "epoch": 9.481278538812786, + "grad_norm": 6.285923480987549, + "learning_rate": 5.773718924403856e-07, + "loss": 0.0444, + "step": 10382 + }, + { + "epoch": 9.482191780821918, + "grad_norm": 0.6129355430603027, + "learning_rate": 5.763571790969052e-07, + "loss": 0.0037, + "step": 10383 + }, + { + "epoch": 9.483105022831051, + "grad_norm": 1.834916353225708, + "learning_rate": 5.753424657534247e-07, + "loss": 0.0133, + "step": 10384 + }, + { + "epoch": 9.484018264840183, + "grad_norm": 49.5770378112793, + "learning_rate": 5.743277524099442e-07, + "loss": 0.4248, + "step": 10385 + }, + { + "epoch": 9.484931506849316, + "grad_norm": 4.254714012145996, + "learning_rate": 5.733130390664638e-07, + "loss": 0.0222, + "step": 10386 + }, + { + "epoch": 9.485844748858447, + "grad_norm": 18.21733283996582, + "learning_rate": 5.722983257229834e-07, + "loss": 0.1314, + "step": 10387 + }, + { + "epoch": 9.48675799086758, + "grad_norm": 0.017430780455470085, + "learning_rate": 5.712836123795028e-07, + "loss": 0.0001, + "step": 10388 + }, + { + "epoch": 9.487671232876712, + "grad_norm": 0.2683887779712677, + "learning_rate": 5.702688990360224e-07, + "loss": 0.0016, + "step": 10389 + }, + { + "epoch": 9.488584474885844, + "grad_norm": 6.401536464691162, + "learning_rate": 5.692541856925419e-07, + "loss": 0.0467, + "step": 10390 + }, + { + "epoch": 9.489497716894977, + "grad_norm": 0.2798135280609131, + "learning_rate": 5.682394723490614e-07, + "loss": 0.0017, + "step": 10391 + }, + { + "epoch": 9.490410958904109, + "grad_norm": 0.2578390836715698, + "learning_rate": 5.67224759005581e-07, + "loss": 0.0017, + "step": 10392 + }, + { + "epoch": 9.491324200913242, + "grad_norm": 1.146847128868103, + "learning_rate": 5.662100456621006e-07, + "loss": 0.0062, + "step": 10393 + }, + { + "epoch": 9.492237442922374, + "grad_norm": 22.10458755493164, + "learning_rate": 5.6519533231862e-07, + "loss": 0.2126, + "step": 10394 + }, + { + "epoch": 9.493150684931507, + "grad_norm": 0.1556284874677658, + "learning_rate": 5.641806189751395e-07, + "loss": 0.001, + "step": 10395 + }, + { + "epoch": 9.494063926940639, + "grad_norm": 1.7013109922409058, + "learning_rate": 5.631659056316591e-07, + "loss": 0.0082, + "step": 10396 + }, + { + "epoch": 9.494977168949772, + "grad_norm": 6.273348808288574, + "learning_rate": 5.621511922881787e-07, + "loss": 0.0276, + "step": 10397 + }, + { + "epoch": 9.495890410958904, + "grad_norm": 0.6384163498878479, + "learning_rate": 5.611364789446982e-07, + "loss": 0.0045, + "step": 10398 + }, + { + "epoch": 9.496803652968037, + "grad_norm": 0.370443731546402, + "learning_rate": 5.601217656012178e-07, + "loss": 0.0025, + "step": 10399 + }, + { + "epoch": 9.497716894977168, + "grad_norm": 3.2914230823516846, + "learning_rate": 5.591070522577372e-07, + "loss": 0.0206, + "step": 10400 + }, + { + "epoch": 9.498630136986302, + "grad_norm": 11.27112865447998, + "learning_rate": 5.580923389142567e-07, + "loss": 0.0897, + "step": 10401 + }, + { + "epoch": 9.499543378995433, + "grad_norm": 4.623160362243652, + "learning_rate": 5.570776255707763e-07, + "loss": 0.0353, + "step": 10402 + }, + { + "epoch": 9.500456621004567, + "grad_norm": 1.3738821744918823, + "learning_rate": 5.560629122272959e-07, + "loss": 0.006, + "step": 10403 + }, + { + "epoch": 9.501369863013698, + "grad_norm": 0.02678702212870121, + "learning_rate": 5.550481988838154e-07, + "loss": 0.0002, + "step": 10404 + }, + { + "epoch": 9.502283105022832, + "grad_norm": 3.3374617099761963, + "learning_rate": 5.540334855403348e-07, + "loss": 0.0183, + "step": 10405 + }, + { + "epoch": 9.503196347031963, + "grad_norm": 0.37382179498672485, + "learning_rate": 5.530187721968544e-07, + "loss": 0.0021, + "step": 10406 + }, + { + "epoch": 9.504109589041096, + "grad_norm": 0.008122644387185574, + "learning_rate": 5.52004058853374e-07, + "loss": 0.0, + "step": 10407 + }, + { + "epoch": 9.505022831050228, + "grad_norm": 0.3968046009540558, + "learning_rate": 5.509893455098935e-07, + "loss": 0.002, + "step": 10408 + }, + { + "epoch": 9.505936073059361, + "grad_norm": 0.037917058914899826, + "learning_rate": 5.49974632166413e-07, + "loss": 0.0002, + "step": 10409 + }, + { + "epoch": 9.506849315068493, + "grad_norm": 0.013580485247075558, + "learning_rate": 5.489599188229326e-07, + "loss": 0.0001, + "step": 10410 + }, + { + "epoch": 9.507762557077626, + "grad_norm": 2.8562376499176025, + "learning_rate": 5.47945205479452e-07, + "loss": 0.0136, + "step": 10411 + }, + { + "epoch": 9.508675799086758, + "grad_norm": 6.881867408752441, + "learning_rate": 5.469304921359716e-07, + "loss": 0.0365, + "step": 10412 + }, + { + "epoch": 9.509589041095891, + "grad_norm": 41.78014373779297, + "learning_rate": 5.459157787924912e-07, + "loss": 0.3167, + "step": 10413 + }, + { + "epoch": 9.510502283105023, + "grad_norm": 0.16827517747879028, + "learning_rate": 5.449010654490107e-07, + "loss": 0.0011, + "step": 10414 + }, + { + "epoch": 9.511415525114156, + "grad_norm": 7.276819705963135, + "learning_rate": 5.438863521055302e-07, + "loss": 0.0326, + "step": 10415 + }, + { + "epoch": 9.512328767123288, + "grad_norm": 1.4096417427062988, + "learning_rate": 5.428716387620498e-07, + "loss": 0.009, + "step": 10416 + }, + { + "epoch": 9.51324200913242, + "grad_norm": 0.7711318135261536, + "learning_rate": 5.418569254185693e-07, + "loss": 0.0052, + "step": 10417 + }, + { + "epoch": 9.514155251141553, + "grad_norm": 2.8709654808044434, + "learning_rate": 5.408422120750888e-07, + "loss": 0.0151, + "step": 10418 + }, + { + "epoch": 9.515068493150684, + "grad_norm": 1.7089051008224487, + "learning_rate": 5.398274987316083e-07, + "loss": 0.0089, + "step": 10419 + }, + { + "epoch": 9.515981735159817, + "grad_norm": 0.45258456468582153, + "learning_rate": 5.388127853881279e-07, + "loss": 0.0036, + "step": 10420 + }, + { + "epoch": 9.516894977168949, + "grad_norm": 55.76749038696289, + "learning_rate": 5.377980720446474e-07, + "loss": 0.3548, + "step": 10421 + }, + { + "epoch": 9.517808219178082, + "grad_norm": 0.0505867637693882, + "learning_rate": 5.36783358701167e-07, + "loss": 0.0002, + "step": 10422 + }, + { + "epoch": 9.518721461187214, + "grad_norm": 2.8262903690338135, + "learning_rate": 5.357686453576865e-07, + "loss": 0.0135, + "step": 10423 + }, + { + "epoch": 9.519634703196347, + "grad_norm": 0.3187035620212555, + "learning_rate": 5.34753932014206e-07, + "loss": 0.0012, + "step": 10424 + }, + { + "epoch": 9.520547945205479, + "grad_norm": 5.417100429534912, + "learning_rate": 5.337392186707255e-07, + "loss": 0.0434, + "step": 10425 + }, + { + "epoch": 9.521461187214612, + "grad_norm": 0.20236888527870178, + "learning_rate": 5.327245053272451e-07, + "loss": 0.0013, + "step": 10426 + }, + { + "epoch": 9.522374429223744, + "grad_norm": 1.405677318572998, + "learning_rate": 5.317097919837647e-07, + "loss": 0.0118, + "step": 10427 + }, + { + "epoch": 9.523287671232877, + "grad_norm": 2.0930161476135254, + "learning_rate": 5.306950786402842e-07, + "loss": 0.0129, + "step": 10428 + }, + { + "epoch": 9.524200913242009, + "grad_norm": 20.00035858154297, + "learning_rate": 5.296803652968036e-07, + "loss": 0.1253, + "step": 10429 + }, + { + "epoch": 9.525114155251142, + "grad_norm": 19.421920776367188, + "learning_rate": 5.286656519533232e-07, + "loss": 0.2728, + "step": 10430 + }, + { + "epoch": 9.526027397260274, + "grad_norm": 0.1550893932580948, + "learning_rate": 5.276509386098427e-07, + "loss": 0.0004, + "step": 10431 + }, + { + "epoch": 9.526940639269407, + "grad_norm": 1.1809444427490234, + "learning_rate": 5.266362252663623e-07, + "loss": 0.0062, + "step": 10432 + }, + { + "epoch": 9.527853881278538, + "grad_norm": 1.8697506189346313, + "learning_rate": 5.256215119228819e-07, + "loss": 0.0053, + "step": 10433 + }, + { + "epoch": 9.528767123287672, + "grad_norm": 0.4086272418498993, + "learning_rate": 5.246067985794014e-07, + "loss": 0.0021, + "step": 10434 + }, + { + "epoch": 9.529680365296803, + "grad_norm": 102.74285888671875, + "learning_rate": 5.235920852359208e-07, + "loss": 2.3514, + "step": 10435 + }, + { + "epoch": 9.530593607305937, + "grad_norm": 0.3892247676849365, + "learning_rate": 5.225773718924404e-07, + "loss": 0.0018, + "step": 10436 + }, + { + "epoch": 9.531506849315068, + "grad_norm": 3.141991376876831, + "learning_rate": 5.2156265854896e-07, + "loss": 0.031, + "step": 10437 + }, + { + "epoch": 9.532420091324202, + "grad_norm": 18.855485916137695, + "learning_rate": 5.205479452054795e-07, + "loss": 0.1018, + "step": 10438 + }, + { + "epoch": 9.533333333333333, + "grad_norm": 3.1088345050811768, + "learning_rate": 5.19533231861999e-07, + "loss": 0.0232, + "step": 10439 + }, + { + "epoch": 9.534246575342467, + "grad_norm": 69.70235443115234, + "learning_rate": 5.185185185185186e-07, + "loss": 0.6292, + "step": 10440 + }, + { + "epoch": 9.535159817351598, + "grad_norm": 0.09727265685796738, + "learning_rate": 5.17503805175038e-07, + "loss": 0.0006, + "step": 10441 + }, + { + "epoch": 9.536073059360731, + "grad_norm": 0.41962337493896484, + "learning_rate": 5.164890918315576e-07, + "loss": 0.0027, + "step": 10442 + }, + { + "epoch": 9.536986301369863, + "grad_norm": 0.14063818752765656, + "learning_rate": 5.154743784880772e-07, + "loss": 0.0008, + "step": 10443 + }, + { + "epoch": 9.537899543378995, + "grad_norm": 0.8792809844017029, + "learning_rate": 5.144596651445967e-07, + "loss": 0.0067, + "step": 10444 + }, + { + "epoch": 9.538812785388128, + "grad_norm": 0.14145556092262268, + "learning_rate": 5.134449518011162e-07, + "loss": 0.0009, + "step": 10445 + }, + { + "epoch": 9.53972602739726, + "grad_norm": 90.242431640625, + "learning_rate": 5.124302384576358e-07, + "loss": 1.0111, + "step": 10446 + }, + { + "epoch": 9.540639269406393, + "grad_norm": 3.368166208267212, + "learning_rate": 5.114155251141553e-07, + "loss": 0.0203, + "step": 10447 + }, + { + "epoch": 9.541552511415524, + "grad_norm": 14.59603214263916, + "learning_rate": 5.104008117706748e-07, + "loss": 0.0995, + "step": 10448 + }, + { + "epoch": 9.542465753424658, + "grad_norm": 42.26048278808594, + "learning_rate": 5.093860984271943e-07, + "loss": 0.3665, + "step": 10449 + }, + { + "epoch": 9.54337899543379, + "grad_norm": 2.4067091941833496, + "learning_rate": 5.083713850837139e-07, + "loss": 0.0169, + "step": 10450 + }, + { + "epoch": 9.544292237442923, + "grad_norm": 0.27178168296813965, + "learning_rate": 5.073566717402334e-07, + "loss": 0.0005, + "step": 10451 + }, + { + "epoch": 9.545205479452054, + "grad_norm": 6.645107269287109, + "learning_rate": 5.06341958396753e-07, + "loss": 0.0478, + "step": 10452 + }, + { + "epoch": 9.546118721461188, + "grad_norm": 0.49010732769966125, + "learning_rate": 5.053272450532725e-07, + "loss": 0.0026, + "step": 10453 + }, + { + "epoch": 9.54703196347032, + "grad_norm": 0.15964804589748383, + "learning_rate": 5.04312531709792e-07, + "loss": 0.0007, + "step": 10454 + }, + { + "epoch": 9.547945205479452, + "grad_norm": 0.5446297526359558, + "learning_rate": 5.032978183663115e-07, + "loss": 0.0028, + "step": 10455 + }, + { + "epoch": 9.548858447488584, + "grad_norm": 4.41827392578125, + "learning_rate": 5.022831050228311e-07, + "loss": 0.0272, + "step": 10456 + }, + { + "epoch": 9.549771689497717, + "grad_norm": 1.1012983322143555, + "learning_rate": 5.012683916793507e-07, + "loss": 0.0043, + "step": 10457 + }, + { + "epoch": 9.550684931506849, + "grad_norm": 2.1246962547302246, + "learning_rate": 5.002536783358702e-07, + "loss": 0.0106, + "step": 10458 + }, + { + "epoch": 9.551598173515982, + "grad_norm": 2.4250283241271973, + "learning_rate": 4.992389649923896e-07, + "loss": 0.0185, + "step": 10459 + }, + { + "epoch": 9.552511415525114, + "grad_norm": 0.9527918100357056, + "learning_rate": 4.982242516489092e-07, + "loss": 0.0062, + "step": 10460 + }, + { + "epoch": 9.553424657534247, + "grad_norm": 1.1382392644882202, + "learning_rate": 4.972095383054287e-07, + "loss": 0.0082, + "step": 10461 + }, + { + "epoch": 9.554337899543379, + "grad_norm": 3.688588857650757, + "learning_rate": 4.961948249619483e-07, + "loss": 0.0271, + "step": 10462 + }, + { + "epoch": 9.555251141552512, + "grad_norm": 0.04121604561805725, + "learning_rate": 4.951801116184679e-07, + "loss": 0.0003, + "step": 10463 + }, + { + "epoch": 9.556164383561644, + "grad_norm": 0.027156509459018707, + "learning_rate": 4.941653982749874e-07, + "loss": 0.0001, + "step": 10464 + }, + { + "epoch": 9.557077625570777, + "grad_norm": 0.8726006150245667, + "learning_rate": 4.931506849315068e-07, + "loss": 0.004, + "step": 10465 + }, + { + "epoch": 9.557990867579909, + "grad_norm": 1.2040596008300781, + "learning_rate": 4.921359715880264e-07, + "loss": 0.0075, + "step": 10466 + }, + { + "epoch": 9.558904109589042, + "grad_norm": 3.826957941055298, + "learning_rate": 4.91121258244546e-07, + "loss": 0.0341, + "step": 10467 + }, + { + "epoch": 9.559817351598173, + "grad_norm": 0.19355282187461853, + "learning_rate": 4.901065449010655e-07, + "loss": 0.0014, + "step": 10468 + }, + { + "epoch": 9.560730593607307, + "grad_norm": 0.25371891260147095, + "learning_rate": 4.89091831557585e-07, + "loss": 0.0022, + "step": 10469 + }, + { + "epoch": 9.561643835616438, + "grad_norm": 6.671116352081299, + "learning_rate": 4.880771182141046e-07, + "loss": 0.0404, + "step": 10470 + }, + { + "epoch": 9.56255707762557, + "grad_norm": 0.9797327518463135, + "learning_rate": 4.87062404870624e-07, + "loss": 0.0067, + "step": 10471 + }, + { + "epoch": 9.563470319634703, + "grad_norm": 0.11062172800302505, + "learning_rate": 4.860476915271436e-07, + "loss": 0.0008, + "step": 10472 + }, + { + "epoch": 9.564383561643835, + "grad_norm": 15.363935470581055, + "learning_rate": 4.850329781836632e-07, + "loss": 0.0769, + "step": 10473 + }, + { + "epoch": 9.565296803652968, + "grad_norm": 47.73090362548828, + "learning_rate": 4.840182648401827e-07, + "loss": 0.3701, + "step": 10474 + }, + { + "epoch": 9.5662100456621, + "grad_norm": 0.5184289813041687, + "learning_rate": 4.830035514967022e-07, + "loss": 0.004, + "step": 10475 + }, + { + "epoch": 9.567123287671233, + "grad_norm": 1.0010623931884766, + "learning_rate": 4.819888381532218e-07, + "loss": 0.0052, + "step": 10476 + }, + { + "epoch": 9.568036529680365, + "grad_norm": 23.429855346679688, + "learning_rate": 4.809741248097413e-07, + "loss": 0.1615, + "step": 10477 + }, + { + "epoch": 9.568949771689498, + "grad_norm": 0.411163866519928, + "learning_rate": 4.799594114662608e-07, + "loss": 0.0029, + "step": 10478 + }, + { + "epoch": 9.56986301369863, + "grad_norm": 71.72360229492188, + "learning_rate": 4.789446981227803e-07, + "loss": 0.5222, + "step": 10479 + }, + { + "epoch": 9.570776255707763, + "grad_norm": 7.0529913902282715, + "learning_rate": 4.779299847792999e-07, + "loss": 0.0417, + "step": 10480 + }, + { + "epoch": 9.571689497716894, + "grad_norm": 1.8898751735687256, + "learning_rate": 4.769152714358194e-07, + "loss": 0.008, + "step": 10481 + }, + { + "epoch": 9.572602739726028, + "grad_norm": 0.3759688138961792, + "learning_rate": 4.7590055809233896e-07, + "loss": 0.0019, + "step": 10482 + }, + { + "epoch": 9.57351598173516, + "grad_norm": 3.2631053924560547, + "learning_rate": 4.748858447488585e-07, + "loss": 0.0189, + "step": 10483 + }, + { + "epoch": 9.574429223744293, + "grad_norm": 0.20058797299861908, + "learning_rate": 4.73871131405378e-07, + "loss": 0.0015, + "step": 10484 + }, + { + "epoch": 9.575342465753424, + "grad_norm": 16.035598754882812, + "learning_rate": 4.7285641806189756e-07, + "loss": 0.083, + "step": 10485 + }, + { + "epoch": 9.576255707762558, + "grad_norm": 0.060878198593854904, + "learning_rate": 4.7184170471841704e-07, + "loss": 0.0002, + "step": 10486 + }, + { + "epoch": 9.57716894977169, + "grad_norm": 46.01873016357422, + "learning_rate": 4.708269913749366e-07, + "loss": 0.2539, + "step": 10487 + }, + { + "epoch": 9.578082191780823, + "grad_norm": 0.10393835604190826, + "learning_rate": 4.6981227803145616e-07, + "loss": 0.0004, + "step": 10488 + }, + { + "epoch": 9.578995433789954, + "grad_norm": 2.585632562637329, + "learning_rate": 4.687975646879757e-07, + "loss": 0.0137, + "step": 10489 + }, + { + "epoch": 9.579908675799087, + "grad_norm": 9.88095474243164, + "learning_rate": 4.677828513444952e-07, + "loss": 0.0762, + "step": 10490 + }, + { + "epoch": 9.580821917808219, + "grad_norm": 14.465617179870605, + "learning_rate": 4.667681380010147e-07, + "loss": 0.1029, + "step": 10491 + }, + { + "epoch": 9.581735159817352, + "grad_norm": 0.11143777519464493, + "learning_rate": 4.657534246575343e-07, + "loss": 0.0008, + "step": 10492 + }, + { + "epoch": 9.582648401826484, + "grad_norm": 1.8339141607284546, + "learning_rate": 4.647387113140538e-07, + "loss": 0.0097, + "step": 10493 + }, + { + "epoch": 9.583561643835617, + "grad_norm": 0.015303868800401688, + "learning_rate": 4.6372399797057336e-07, + "loss": 0.0001, + "step": 10494 + }, + { + "epoch": 9.584474885844749, + "grad_norm": 0.41831469535827637, + "learning_rate": 4.627092846270929e-07, + "loss": 0.0031, + "step": 10495 + }, + { + "epoch": 9.585388127853882, + "grad_norm": 0.5085836052894592, + "learning_rate": 4.6169457128361237e-07, + "loss": 0.0029, + "step": 10496 + }, + { + "epoch": 9.586301369863014, + "grad_norm": 0.1370776742696762, + "learning_rate": 4.6067985794013196e-07, + "loss": 0.0008, + "step": 10497 + }, + { + "epoch": 9.587214611872145, + "grad_norm": 1.4523792266845703, + "learning_rate": 4.596651445966515e-07, + "loss": 0.0082, + "step": 10498 + }, + { + "epoch": 9.588127853881279, + "grad_norm": 6.997694492340088, + "learning_rate": 4.58650431253171e-07, + "loss": 0.0281, + "step": 10499 + }, + { + "epoch": 9.58904109589041, + "grad_norm": 0.01642502285540104, + "learning_rate": 4.5763571790969056e-07, + "loss": 0.0001, + "step": 10500 + }, + { + "epoch": 9.589954337899544, + "grad_norm": 0.948765754699707, + "learning_rate": 4.5662100456621004e-07, + "loss": 0.0066, + "step": 10501 + }, + { + "epoch": 9.590867579908675, + "grad_norm": 4.0890936851501465, + "learning_rate": 4.556062912227296e-07, + "loss": 0.0302, + "step": 10502 + }, + { + "epoch": 9.591780821917808, + "grad_norm": 4.422569274902344, + "learning_rate": 4.5459157787924916e-07, + "loss": 0.0381, + "step": 10503 + }, + { + "epoch": 9.59269406392694, + "grad_norm": 31.16069221496582, + "learning_rate": 4.535768645357687e-07, + "loss": 0.2203, + "step": 10504 + }, + { + "epoch": 9.593607305936073, + "grad_norm": 30.236099243164062, + "learning_rate": 4.525621511922882e-07, + "loss": 0.1314, + "step": 10505 + }, + { + "epoch": 9.594520547945205, + "grad_norm": 5.05405330657959, + "learning_rate": 4.515474378488077e-07, + "loss": 0.0325, + "step": 10506 + }, + { + "epoch": 9.595433789954338, + "grad_norm": 3.2796711921691895, + "learning_rate": 4.505327245053273e-07, + "loss": 0.0291, + "step": 10507 + }, + { + "epoch": 9.59634703196347, + "grad_norm": 0.49874719977378845, + "learning_rate": 4.495180111618468e-07, + "loss": 0.0036, + "step": 10508 + }, + { + "epoch": 9.597260273972603, + "grad_norm": 1.1135185956954956, + "learning_rate": 4.4850329781836636e-07, + "loss": 0.0042, + "step": 10509 + }, + { + "epoch": 9.598173515981735, + "grad_norm": 21.756250381469727, + "learning_rate": 4.474885844748859e-07, + "loss": 0.1729, + "step": 10510 + }, + { + "epoch": 9.599086757990868, + "grad_norm": 0.46454620361328125, + "learning_rate": 4.4647387113140537e-07, + "loss": 0.0039, + "step": 10511 + }, + { + "epoch": 9.6, + "grad_norm": 7.4957051277160645, + "learning_rate": 4.4545915778792496e-07, + "loss": 0.0425, + "step": 10512 + }, + { + "epoch": 9.600913242009133, + "grad_norm": 83.11312103271484, + "learning_rate": 4.444444444444445e-07, + "loss": 0.6492, + "step": 10513 + }, + { + "epoch": 9.601826484018265, + "grad_norm": 9.942048072814941, + "learning_rate": 4.43429731100964e-07, + "loss": 0.0717, + "step": 10514 + }, + { + "epoch": 9.602739726027398, + "grad_norm": 0.3836768865585327, + "learning_rate": 4.4241501775748356e-07, + "loss": 0.0019, + "step": 10515 + }, + { + "epoch": 9.60365296803653, + "grad_norm": 39.60089874267578, + "learning_rate": 4.4140030441400304e-07, + "loss": 0.131, + "step": 10516 + }, + { + "epoch": 9.604566210045663, + "grad_norm": 2.423973321914673, + "learning_rate": 4.403855910705226e-07, + "loss": 0.0061, + "step": 10517 + }, + { + "epoch": 9.605479452054794, + "grad_norm": 0.24097692966461182, + "learning_rate": 4.3937087772704216e-07, + "loss": 0.002, + "step": 10518 + }, + { + "epoch": 9.606392694063928, + "grad_norm": 1.2475467920303345, + "learning_rate": 4.383561643835617e-07, + "loss": 0.0068, + "step": 10519 + }, + { + "epoch": 9.60730593607306, + "grad_norm": 0.9089560508728027, + "learning_rate": 4.373414510400812e-07, + "loss": 0.0075, + "step": 10520 + }, + { + "epoch": 9.608219178082193, + "grad_norm": 0.9004718065261841, + "learning_rate": 4.363267376966007e-07, + "loss": 0.0041, + "step": 10521 + }, + { + "epoch": 9.609132420091324, + "grad_norm": 3.7839550971984863, + "learning_rate": 4.353120243531203e-07, + "loss": 0.0231, + "step": 10522 + }, + { + "epoch": 9.610045662100458, + "grad_norm": 16.823762893676758, + "learning_rate": 4.342973110096398e-07, + "loss": 0.1054, + "step": 10523 + }, + { + "epoch": 9.610958904109589, + "grad_norm": 2.797545909881592, + "learning_rate": 4.3328259766615935e-07, + "loss": 0.0242, + "step": 10524 + }, + { + "epoch": 9.61187214611872, + "grad_norm": 7.477056980133057, + "learning_rate": 4.322678843226789e-07, + "loss": 0.0473, + "step": 10525 + }, + { + "epoch": 9.612785388127854, + "grad_norm": 0.20822934806346893, + "learning_rate": 4.3125317097919837e-07, + "loss": 0.0012, + "step": 10526 + }, + { + "epoch": 9.613698630136986, + "grad_norm": 5.009194850921631, + "learning_rate": 4.3023845763571795e-07, + "loss": 0.0397, + "step": 10527 + }, + { + "epoch": 9.614611872146119, + "grad_norm": 0.997412919998169, + "learning_rate": 4.292237442922375e-07, + "loss": 0.0064, + "step": 10528 + }, + { + "epoch": 9.61552511415525, + "grad_norm": 32.17522430419922, + "learning_rate": 4.28209030948757e-07, + "loss": 0.266, + "step": 10529 + }, + { + "epoch": 9.616438356164384, + "grad_norm": 0.059668608009815216, + "learning_rate": 4.2719431760527655e-07, + "loss": 0.0004, + "step": 10530 + }, + { + "epoch": 9.617351598173515, + "grad_norm": 0.04074742645025253, + "learning_rate": 4.2617960426179603e-07, + "loss": 0.0003, + "step": 10531 + }, + { + "epoch": 9.618264840182649, + "grad_norm": 2.885017156600952, + "learning_rate": 4.251648909183156e-07, + "loss": 0.0207, + "step": 10532 + }, + { + "epoch": 9.61917808219178, + "grad_norm": 0.3096867799758911, + "learning_rate": 4.2415017757483515e-07, + "loss": 0.0017, + "step": 10533 + }, + { + "epoch": 9.620091324200914, + "grad_norm": 2.8006436824798584, + "learning_rate": 4.231354642313547e-07, + "loss": 0.0111, + "step": 10534 + }, + { + "epoch": 9.621004566210045, + "grad_norm": 4.505757808685303, + "learning_rate": 4.221207508878742e-07, + "loss": 0.0324, + "step": 10535 + }, + { + "epoch": 9.621917808219179, + "grad_norm": 65.68348693847656, + "learning_rate": 4.211060375443937e-07, + "loss": 0.6519, + "step": 10536 + }, + { + "epoch": 9.62283105022831, + "grad_norm": 0.741200864315033, + "learning_rate": 4.200913242009133e-07, + "loss": 0.0034, + "step": 10537 + }, + { + "epoch": 9.623744292237443, + "grad_norm": 7.385648727416992, + "learning_rate": 4.190766108574328e-07, + "loss": 0.0338, + "step": 10538 + }, + { + "epoch": 9.624657534246575, + "grad_norm": 1.2383983135223389, + "learning_rate": 4.1806189751395235e-07, + "loss": 0.0089, + "step": 10539 + }, + { + "epoch": 9.625570776255708, + "grad_norm": 0.24283644556999207, + "learning_rate": 4.170471841704719e-07, + "loss": 0.0013, + "step": 10540 + }, + { + "epoch": 9.62648401826484, + "grad_norm": 30.73184585571289, + "learning_rate": 4.1603247082699137e-07, + "loss": 0.31, + "step": 10541 + }, + { + "epoch": 9.627397260273973, + "grad_norm": 0.3832559287548065, + "learning_rate": 4.1501775748351095e-07, + "loss": 0.0024, + "step": 10542 + }, + { + "epoch": 9.628310502283105, + "grad_norm": 0.5004831552505493, + "learning_rate": 4.140030441400305e-07, + "loss": 0.003, + "step": 10543 + }, + { + "epoch": 9.629223744292238, + "grad_norm": 0.11107321828603745, + "learning_rate": 4.1298833079655e-07, + "loss": 0.0006, + "step": 10544 + }, + { + "epoch": 9.63013698630137, + "grad_norm": 4.304032802581787, + "learning_rate": 4.1197361745306955e-07, + "loss": 0.0231, + "step": 10545 + }, + { + "epoch": 9.631050228310503, + "grad_norm": 0.31651490926742554, + "learning_rate": 4.1095890410958903e-07, + "loss": 0.0016, + "step": 10546 + }, + { + "epoch": 9.631963470319635, + "grad_norm": 2.1282856464385986, + "learning_rate": 4.099441907661086e-07, + "loss": 0.0116, + "step": 10547 + }, + { + "epoch": 9.632876712328766, + "grad_norm": 18.47085952758789, + "learning_rate": 4.0892947742262815e-07, + "loss": 0.1195, + "step": 10548 + }, + { + "epoch": 9.6337899543379, + "grad_norm": 0.7254868745803833, + "learning_rate": 4.079147640791477e-07, + "loss": 0.0045, + "step": 10549 + }, + { + "epoch": 9.634703196347033, + "grad_norm": 1.2200783491134644, + "learning_rate": 4.069000507356672e-07, + "loss": 0.008, + "step": 10550 + }, + { + "epoch": 9.635616438356164, + "grad_norm": 0.3083069324493408, + "learning_rate": 4.058853373921867e-07, + "loss": 0.0014, + "step": 10551 + }, + { + "epoch": 9.636529680365296, + "grad_norm": 0.6351290345191956, + "learning_rate": 4.048706240487063e-07, + "loss": 0.004, + "step": 10552 + }, + { + "epoch": 9.63744292237443, + "grad_norm": 95.79081726074219, + "learning_rate": 4.038559107052258e-07, + "loss": 0.7935, + "step": 10553 + }, + { + "epoch": 9.638356164383561, + "grad_norm": 15.416584014892578, + "learning_rate": 4.0284119736174535e-07, + "loss": 0.1142, + "step": 10554 + }, + { + "epoch": 9.639269406392694, + "grad_norm": 0.4315749704837799, + "learning_rate": 4.018264840182649e-07, + "loss": 0.0024, + "step": 10555 + }, + { + "epoch": 9.640182648401826, + "grad_norm": 0.0927370935678482, + "learning_rate": 4.0081177067478437e-07, + "loss": 0.0007, + "step": 10556 + }, + { + "epoch": 9.64109589041096, + "grad_norm": 0.047568537294864655, + "learning_rate": 3.9979705733130395e-07, + "loss": 0.0003, + "step": 10557 + }, + { + "epoch": 9.64200913242009, + "grad_norm": 0.5098182559013367, + "learning_rate": 3.987823439878235e-07, + "loss": 0.0027, + "step": 10558 + }, + { + "epoch": 9.642922374429224, + "grad_norm": 0.9493091702461243, + "learning_rate": 3.97767630644343e-07, + "loss": 0.0054, + "step": 10559 + }, + { + "epoch": 9.643835616438356, + "grad_norm": 0.17608514428138733, + "learning_rate": 3.9675291730086255e-07, + "loss": 0.0012, + "step": 10560 + }, + { + "epoch": 9.644748858447489, + "grad_norm": 4.050876617431641, + "learning_rate": 3.9573820395738203e-07, + "loss": 0.0261, + "step": 10561 + }, + { + "epoch": 9.64566210045662, + "grad_norm": 1.1488590240478516, + "learning_rate": 3.947234906139016e-07, + "loss": 0.0065, + "step": 10562 + }, + { + "epoch": 9.646575342465754, + "grad_norm": 3.9268996715545654, + "learning_rate": 3.9370877727042115e-07, + "loss": 0.0309, + "step": 10563 + }, + { + "epoch": 9.647488584474885, + "grad_norm": 5.794347286224365, + "learning_rate": 3.926940639269407e-07, + "loss": 0.0314, + "step": 10564 + }, + { + "epoch": 9.648401826484019, + "grad_norm": 0.44423481822013855, + "learning_rate": 3.916793505834602e-07, + "loss": 0.0023, + "step": 10565 + }, + { + "epoch": 9.64931506849315, + "grad_norm": 6.513840675354004, + "learning_rate": 3.906646372399797e-07, + "loss": 0.035, + "step": 10566 + }, + { + "epoch": 9.650228310502284, + "grad_norm": 2.305870532989502, + "learning_rate": 3.896499238964993e-07, + "loss": 0.0205, + "step": 10567 + }, + { + "epoch": 9.651141552511415, + "grad_norm": 70.24288177490234, + "learning_rate": 3.886352105530188e-07, + "loss": 0.1471, + "step": 10568 + }, + { + "epoch": 9.652054794520549, + "grad_norm": 0.5198224782943726, + "learning_rate": 3.8762049720953835e-07, + "loss": 0.0031, + "step": 10569 + }, + { + "epoch": 9.65296803652968, + "grad_norm": 0.06589091569185257, + "learning_rate": 3.866057838660579e-07, + "loss": 0.0003, + "step": 10570 + }, + { + "epoch": 9.653881278538814, + "grad_norm": 0.007095145061612129, + "learning_rate": 3.8559107052257736e-07, + "loss": 0.0, + "step": 10571 + }, + { + "epoch": 9.654794520547945, + "grad_norm": 0.11403211951255798, + "learning_rate": 3.8457635717909695e-07, + "loss": 0.0007, + "step": 10572 + }, + { + "epoch": 9.655707762557078, + "grad_norm": 1.0841636657714844, + "learning_rate": 3.835616438356165e-07, + "loss": 0.0071, + "step": 10573 + }, + { + "epoch": 9.65662100456621, + "grad_norm": 0.9128362536430359, + "learning_rate": 3.82546930492136e-07, + "loss": 0.0043, + "step": 10574 + }, + { + "epoch": 9.657534246575342, + "grad_norm": 1.9912550449371338, + "learning_rate": 3.8153221714865555e-07, + "loss": 0.0152, + "step": 10575 + }, + { + "epoch": 9.658447488584475, + "grad_norm": 1.3287070989608765, + "learning_rate": 3.8051750380517503e-07, + "loss": 0.0101, + "step": 10576 + }, + { + "epoch": 9.659360730593608, + "grad_norm": 0.017162593081593513, + "learning_rate": 3.795027904616946e-07, + "loss": 0.0001, + "step": 10577 + }, + { + "epoch": 9.66027397260274, + "grad_norm": 1.4991891384124756, + "learning_rate": 3.7848807711821415e-07, + "loss": 0.0075, + "step": 10578 + }, + { + "epoch": 9.661187214611871, + "grad_norm": 1.2470273971557617, + "learning_rate": 3.774733637747337e-07, + "loss": 0.0073, + "step": 10579 + }, + { + "epoch": 9.662100456621005, + "grad_norm": 0.46651047468185425, + "learning_rate": 3.764586504312532e-07, + "loss": 0.0032, + "step": 10580 + }, + { + "epoch": 9.663013698630136, + "grad_norm": 3.780241012573242, + "learning_rate": 3.754439370877727e-07, + "loss": 0.0161, + "step": 10581 + }, + { + "epoch": 9.66392694063927, + "grad_norm": 0.1255808025598526, + "learning_rate": 3.7442922374429223e-07, + "loss": 0.0006, + "step": 10582 + }, + { + "epoch": 9.664840182648401, + "grad_norm": 9.153802871704102, + "learning_rate": 3.734145104008118e-07, + "loss": 0.0469, + "step": 10583 + }, + { + "epoch": 9.665753424657535, + "grad_norm": 3.9103803634643555, + "learning_rate": 3.7239979705733135e-07, + "loss": 0.0185, + "step": 10584 + }, + { + "epoch": 9.666666666666666, + "grad_norm": 4.905357837677002, + "learning_rate": 3.713850837138509e-07, + "loss": 0.0297, + "step": 10585 + }, + { + "epoch": 9.6675799086758, + "grad_norm": 5.390342712402344, + "learning_rate": 3.7037037037037036e-07, + "loss": 0.0382, + "step": 10586 + }, + { + "epoch": 9.668493150684931, + "grad_norm": 0.009818952530622482, + "learning_rate": 3.693556570268899e-07, + "loss": 0.0001, + "step": 10587 + }, + { + "epoch": 9.669406392694064, + "grad_norm": 1.157482624053955, + "learning_rate": 3.683409436834095e-07, + "loss": 0.0043, + "step": 10588 + }, + { + "epoch": 9.670319634703196, + "grad_norm": 7.0984015464782715, + "learning_rate": 3.67326230339929e-07, + "loss": 0.0406, + "step": 10589 + }, + { + "epoch": 9.67123287671233, + "grad_norm": 4.529462814331055, + "learning_rate": 3.6631151699644855e-07, + "loss": 0.0085, + "step": 10590 + }, + { + "epoch": 9.67214611872146, + "grad_norm": 0.9831460118293762, + "learning_rate": 3.6529680365296803e-07, + "loss": 0.0088, + "step": 10591 + }, + { + "epoch": 9.673059360730594, + "grad_norm": 0.7134902477264404, + "learning_rate": 3.6428209030948756e-07, + "loss": 0.0052, + "step": 10592 + }, + { + "epoch": 9.673972602739726, + "grad_norm": 0.4478130340576172, + "learning_rate": 3.6326737696600715e-07, + "loss": 0.0026, + "step": 10593 + }, + { + "epoch": 9.674885844748859, + "grad_norm": 0.09098229557275772, + "learning_rate": 3.622526636225267e-07, + "loss": 0.0005, + "step": 10594 + }, + { + "epoch": 9.67579908675799, + "grad_norm": 10.024299621582031, + "learning_rate": 3.612379502790462e-07, + "loss": 0.0525, + "step": 10595 + }, + { + "epoch": 9.676712328767124, + "grad_norm": 0.10665162652730942, + "learning_rate": 3.602232369355657e-07, + "loss": 0.0007, + "step": 10596 + }, + { + "epoch": 9.677625570776256, + "grad_norm": 0.7205829620361328, + "learning_rate": 3.5920852359208523e-07, + "loss": 0.0049, + "step": 10597 + }, + { + "epoch": 9.678538812785389, + "grad_norm": 2.499223232269287, + "learning_rate": 3.581938102486048e-07, + "loss": 0.0067, + "step": 10598 + }, + { + "epoch": 9.67945205479452, + "grad_norm": 0.398258775472641, + "learning_rate": 3.5717909690512435e-07, + "loss": 0.0031, + "step": 10599 + }, + { + "epoch": 9.680365296803654, + "grad_norm": 34.30715560913086, + "learning_rate": 3.561643835616439e-07, + "loss": 0.2057, + "step": 10600 + }, + { + "epoch": 9.681278538812785, + "grad_norm": 6.36525821685791, + "learning_rate": 3.5514967021816336e-07, + "loss": 0.0422, + "step": 10601 + }, + { + "epoch": 9.682191780821917, + "grad_norm": 8.04183292388916, + "learning_rate": 3.541349568746829e-07, + "loss": 0.0589, + "step": 10602 + }, + { + "epoch": 9.68310502283105, + "grad_norm": 0.33496490120887756, + "learning_rate": 3.531202435312025e-07, + "loss": 0.002, + "step": 10603 + }, + { + "epoch": 9.684018264840184, + "grad_norm": 1.5906935930252075, + "learning_rate": 3.52105530187722e-07, + "loss": 0.0093, + "step": 10604 + }, + { + "epoch": 9.684931506849315, + "grad_norm": 0.08431080728769302, + "learning_rate": 3.5109081684424155e-07, + "loss": 0.0006, + "step": 10605 + }, + { + "epoch": 9.685844748858447, + "grad_norm": 29.21817398071289, + "learning_rate": 3.5007610350076103e-07, + "loss": 0.185, + "step": 10606 + }, + { + "epoch": 9.68675799086758, + "grad_norm": 1.3126068115234375, + "learning_rate": 3.4906139015728056e-07, + "loss": 0.0091, + "step": 10607 + }, + { + "epoch": 9.687671232876712, + "grad_norm": 54.565765380859375, + "learning_rate": 3.4804667681380015e-07, + "loss": 0.4293, + "step": 10608 + }, + { + "epoch": 9.688584474885845, + "grad_norm": 0.894505500793457, + "learning_rate": 3.470319634703197e-07, + "loss": 0.0049, + "step": 10609 + }, + { + "epoch": 9.689497716894977, + "grad_norm": 0.10586263984441757, + "learning_rate": 3.460172501268392e-07, + "loss": 0.0005, + "step": 10610 + }, + { + "epoch": 9.69041095890411, + "grad_norm": 0.03291722759604454, + "learning_rate": 3.450025367833587e-07, + "loss": 0.0001, + "step": 10611 + }, + { + "epoch": 9.691324200913241, + "grad_norm": 45.38041305541992, + "learning_rate": 3.4398782343987823e-07, + "loss": 0.3789, + "step": 10612 + }, + { + "epoch": 9.692237442922375, + "grad_norm": 220.22430419921875, + "learning_rate": 3.429731100963978e-07, + "loss": 0.2354, + "step": 10613 + }, + { + "epoch": 9.693150684931506, + "grad_norm": 7.7309370040893555, + "learning_rate": 3.4195839675291735e-07, + "loss": 0.0378, + "step": 10614 + }, + { + "epoch": 9.69406392694064, + "grad_norm": 5.331541538238525, + "learning_rate": 3.409436834094369e-07, + "loss": 0.0347, + "step": 10615 + }, + { + "epoch": 9.694977168949771, + "grad_norm": 0.38971617817878723, + "learning_rate": 3.3992897006595636e-07, + "loss": 0.002, + "step": 10616 + }, + { + "epoch": 9.695890410958905, + "grad_norm": 0.7812002301216125, + "learning_rate": 3.389142567224759e-07, + "loss": 0.0051, + "step": 10617 + }, + { + "epoch": 9.696803652968036, + "grad_norm": 0.02833203226327896, + "learning_rate": 3.378995433789955e-07, + "loss": 0.0002, + "step": 10618 + }, + { + "epoch": 9.69771689497717, + "grad_norm": 0.09640388190746307, + "learning_rate": 3.36884830035515e-07, + "loss": 0.0006, + "step": 10619 + }, + { + "epoch": 9.698630136986301, + "grad_norm": 0.8614482879638672, + "learning_rate": 3.3587011669203455e-07, + "loss": 0.0033, + "step": 10620 + }, + { + "epoch": 9.699543378995434, + "grad_norm": 34.498775482177734, + "learning_rate": 3.3485540334855403e-07, + "loss": 0.2532, + "step": 10621 + }, + { + "epoch": 9.700456621004566, + "grad_norm": 1.5029059648513794, + "learning_rate": 3.3384069000507356e-07, + "loss": 0.0075, + "step": 10622 + }, + { + "epoch": 9.7013698630137, + "grad_norm": 23.96766471862793, + "learning_rate": 3.3282597666159315e-07, + "loss": 0.0764, + "step": 10623 + }, + { + "epoch": 9.70228310502283, + "grad_norm": 43.132381439208984, + "learning_rate": 3.318112633181127e-07, + "loss": 0.2171, + "step": 10624 + }, + { + "epoch": 9.703196347031964, + "grad_norm": 2.9574713706970215, + "learning_rate": 3.307965499746322e-07, + "loss": 0.0158, + "step": 10625 + }, + { + "epoch": 9.704109589041096, + "grad_norm": 13.38511848449707, + "learning_rate": 3.297818366311517e-07, + "loss": 0.0534, + "step": 10626 + }, + { + "epoch": 9.70502283105023, + "grad_norm": 0.0696919709444046, + "learning_rate": 3.2876712328767123e-07, + "loss": 0.0004, + "step": 10627 + }, + { + "epoch": 9.70593607305936, + "grad_norm": 0.41540855169296265, + "learning_rate": 3.277524099441908e-07, + "loss": 0.0025, + "step": 10628 + }, + { + "epoch": 9.706849315068492, + "grad_norm": 5.239280700683594, + "learning_rate": 3.2673769660071035e-07, + "loss": 0.0304, + "step": 10629 + }, + { + "epoch": 9.707762557077626, + "grad_norm": 0.07020479440689087, + "learning_rate": 3.257229832572299e-07, + "loss": 0.0004, + "step": 10630 + }, + { + "epoch": 9.708675799086759, + "grad_norm": 0.5772596597671509, + "learning_rate": 3.2470826991374936e-07, + "loss": 0.0028, + "step": 10631 + }, + { + "epoch": 9.70958904109589, + "grad_norm": 1.585639238357544, + "learning_rate": 3.236935565702689e-07, + "loss": 0.0084, + "step": 10632 + }, + { + "epoch": 9.710502283105022, + "grad_norm": 0.1158430278301239, + "learning_rate": 3.226788432267885e-07, + "loss": 0.0006, + "step": 10633 + }, + { + "epoch": 9.711415525114155, + "grad_norm": 0.4735453426837921, + "learning_rate": 3.21664129883308e-07, + "loss": 0.0027, + "step": 10634 + }, + { + "epoch": 9.712328767123287, + "grad_norm": 25.841753005981445, + "learning_rate": 3.2064941653982755e-07, + "loss": 0.041, + "step": 10635 + }, + { + "epoch": 9.71324200913242, + "grad_norm": 11.876742362976074, + "learning_rate": 3.19634703196347e-07, + "loss": 0.0733, + "step": 10636 + }, + { + "epoch": 9.714155251141552, + "grad_norm": 0.03739383444190025, + "learning_rate": 3.1861998985286656e-07, + "loss": 0.0003, + "step": 10637 + }, + { + "epoch": 9.715068493150685, + "grad_norm": 0.16443951427936554, + "learning_rate": 3.1760527650938615e-07, + "loss": 0.0011, + "step": 10638 + }, + { + "epoch": 9.715981735159817, + "grad_norm": 0.5092663168907166, + "learning_rate": 3.165905631659057e-07, + "loss": 0.0015, + "step": 10639 + }, + { + "epoch": 9.71689497716895, + "grad_norm": 2.0164718627929688, + "learning_rate": 3.155758498224252e-07, + "loss": 0.0105, + "step": 10640 + }, + { + "epoch": 9.717808219178082, + "grad_norm": 5.044966220855713, + "learning_rate": 3.145611364789447e-07, + "loss": 0.0327, + "step": 10641 + }, + { + "epoch": 9.718721461187215, + "grad_norm": 23.992515563964844, + "learning_rate": 3.135464231354642e-07, + "loss": 0.0325, + "step": 10642 + }, + { + "epoch": 9.719634703196347, + "grad_norm": 0.06163003295660019, + "learning_rate": 3.125317097919838e-07, + "loss": 0.0003, + "step": 10643 + }, + { + "epoch": 9.72054794520548, + "grad_norm": 0.07534616440534592, + "learning_rate": 3.1151699644850334e-07, + "loss": 0.0004, + "step": 10644 + }, + { + "epoch": 9.721461187214611, + "grad_norm": 1.2913509607315063, + "learning_rate": 3.105022831050229e-07, + "loss": 0.0075, + "step": 10645 + }, + { + "epoch": 9.722374429223745, + "grad_norm": 17.061424255371094, + "learning_rate": 3.0948756976154236e-07, + "loss": 0.0842, + "step": 10646 + }, + { + "epoch": 9.723287671232876, + "grad_norm": 1.8405227661132812, + "learning_rate": 3.0847285641806194e-07, + "loss": 0.0079, + "step": 10647 + }, + { + "epoch": 9.72420091324201, + "grad_norm": 0.734591007232666, + "learning_rate": 3.074581430745814e-07, + "loss": 0.004, + "step": 10648 + }, + { + "epoch": 9.725114155251141, + "grad_norm": 27.15161895751953, + "learning_rate": 3.06443429731101e-07, + "loss": 0.1441, + "step": 10649 + }, + { + "epoch": 9.726027397260275, + "grad_norm": 4.45664119720459, + "learning_rate": 3.0542871638762054e-07, + "loss": 0.0242, + "step": 10650 + }, + { + "epoch": 9.726940639269406, + "grad_norm": 0.06134519726037979, + "learning_rate": 3.0441400304414e-07, + "loss": 0.0004, + "step": 10651 + }, + { + "epoch": 9.72785388127854, + "grad_norm": 34.878868103027344, + "learning_rate": 3.033992897006596e-07, + "loss": 0.2706, + "step": 10652 + }, + { + "epoch": 9.728767123287671, + "grad_norm": 0.5309545993804932, + "learning_rate": 3.023845763571791e-07, + "loss": 0.0021, + "step": 10653 + }, + { + "epoch": 9.729680365296804, + "grad_norm": 19.411069869995117, + "learning_rate": 3.013698630136987e-07, + "loss": 0.0869, + "step": 10654 + }, + { + "epoch": 9.730593607305936, + "grad_norm": 21.203338623046875, + "learning_rate": 3.003551496702182e-07, + "loss": 0.0737, + "step": 10655 + }, + { + "epoch": 9.731506849315068, + "grad_norm": 0.06760015338659286, + "learning_rate": 2.993404363267377e-07, + "loss": 0.0005, + "step": 10656 + }, + { + "epoch": 9.732420091324201, + "grad_norm": 0.7426692843437195, + "learning_rate": 2.983257229832573e-07, + "loss": 0.0045, + "step": 10657 + }, + { + "epoch": 9.733333333333333, + "grad_norm": 1.1696093082427979, + "learning_rate": 2.9731100963977676e-07, + "loss": 0.0065, + "step": 10658 + }, + { + "epoch": 9.734246575342466, + "grad_norm": 1.3783609867095947, + "learning_rate": 2.9629629629629634e-07, + "loss": 0.0111, + "step": 10659 + }, + { + "epoch": 9.735159817351597, + "grad_norm": 3.731229543685913, + "learning_rate": 2.952815829528159e-07, + "loss": 0.0187, + "step": 10660 + }, + { + "epoch": 9.73607305936073, + "grad_norm": 1.6429774761199951, + "learning_rate": 2.9426686960933536e-07, + "loss": 0.0072, + "step": 10661 + }, + { + "epoch": 9.736986301369862, + "grad_norm": 6.143342018127441, + "learning_rate": 2.9325215626585494e-07, + "loss": 0.0413, + "step": 10662 + }, + { + "epoch": 9.737899543378996, + "grad_norm": 0.20899303257465363, + "learning_rate": 2.922374429223744e-07, + "loss": 0.0014, + "step": 10663 + }, + { + "epoch": 9.738812785388127, + "grad_norm": 0.24340173602104187, + "learning_rate": 2.91222729578894e-07, + "loss": 0.0017, + "step": 10664 + }, + { + "epoch": 9.73972602739726, + "grad_norm": 0.0859467163681984, + "learning_rate": 2.9020801623541354e-07, + "loss": 0.0005, + "step": 10665 + }, + { + "epoch": 9.740639269406392, + "grad_norm": 1.8534852266311646, + "learning_rate": 2.89193302891933e-07, + "loss": 0.0097, + "step": 10666 + }, + { + "epoch": 9.741552511415525, + "grad_norm": 1.3327553272247314, + "learning_rate": 2.881785895484526e-07, + "loss": 0.0091, + "step": 10667 + }, + { + "epoch": 9.742465753424657, + "grad_norm": 0.7053646445274353, + "learning_rate": 2.871638762049721e-07, + "loss": 0.0024, + "step": 10668 + }, + { + "epoch": 9.74337899543379, + "grad_norm": 42.74580764770508, + "learning_rate": 2.861491628614917e-07, + "loss": 0.3067, + "step": 10669 + }, + { + "epoch": 9.744292237442922, + "grad_norm": 1.0670114755630493, + "learning_rate": 2.851344495180112e-07, + "loss": 0.0045, + "step": 10670 + }, + { + "epoch": 9.745205479452055, + "grad_norm": 3.423119068145752, + "learning_rate": 2.841197361745307e-07, + "loss": 0.0173, + "step": 10671 + }, + { + "epoch": 9.746118721461187, + "grad_norm": 2.4198296070098877, + "learning_rate": 2.831050228310503e-07, + "loss": 0.0141, + "step": 10672 + }, + { + "epoch": 9.74703196347032, + "grad_norm": 31.839759826660156, + "learning_rate": 2.8209030948756976e-07, + "loss": 0.3742, + "step": 10673 + }, + { + "epoch": 9.747945205479452, + "grad_norm": 0.41942596435546875, + "learning_rate": 2.8107559614408934e-07, + "loss": 0.0018, + "step": 10674 + }, + { + "epoch": 9.748858447488585, + "grad_norm": 1.089460849761963, + "learning_rate": 2.800608828006089e-07, + "loss": 0.0077, + "step": 10675 + }, + { + "epoch": 9.749771689497717, + "grad_norm": 0.517432451248169, + "learning_rate": 2.7904616945712836e-07, + "loss": 0.0042, + "step": 10676 + }, + { + "epoch": 9.75068493150685, + "grad_norm": 12.77377700805664, + "learning_rate": 2.7803145611364794e-07, + "loss": 0.0814, + "step": 10677 + }, + { + "epoch": 9.751598173515982, + "grad_norm": 0.0628482922911644, + "learning_rate": 2.770167427701674e-07, + "loss": 0.0004, + "step": 10678 + }, + { + "epoch": 9.752511415525115, + "grad_norm": 0.0778474360704422, + "learning_rate": 2.76002029426687e-07, + "loss": 0.0005, + "step": 10679 + }, + { + "epoch": 9.753424657534246, + "grad_norm": 0.5445970296859741, + "learning_rate": 2.749873160832065e-07, + "loss": 0.0033, + "step": 10680 + }, + { + "epoch": 9.75433789954338, + "grad_norm": 0.8905161023139954, + "learning_rate": 2.73972602739726e-07, + "loss": 0.0054, + "step": 10681 + }, + { + "epoch": 9.755251141552511, + "grad_norm": 0.28148844838142395, + "learning_rate": 2.729578893962456e-07, + "loss": 0.0014, + "step": 10682 + }, + { + "epoch": 9.756164383561643, + "grad_norm": 0.04835079237818718, + "learning_rate": 2.719431760527651e-07, + "loss": 0.0002, + "step": 10683 + }, + { + "epoch": 9.757077625570776, + "grad_norm": 0.2998092770576477, + "learning_rate": 2.709284627092847e-07, + "loss": 0.0026, + "step": 10684 + }, + { + "epoch": 9.757990867579908, + "grad_norm": 0.15707282721996307, + "learning_rate": 2.6991374936580416e-07, + "loss": 0.0003, + "step": 10685 + }, + { + "epoch": 9.758904109589041, + "grad_norm": 2.5180716514587402, + "learning_rate": 2.688990360223237e-07, + "loss": 0.011, + "step": 10686 + }, + { + "epoch": 9.759817351598173, + "grad_norm": 78.92591857910156, + "learning_rate": 2.678843226788433e-07, + "loss": 0.7594, + "step": 10687 + }, + { + "epoch": 9.760730593607306, + "grad_norm": 9.890813827514648, + "learning_rate": 2.6686960933536276e-07, + "loss": 0.0564, + "step": 10688 + }, + { + "epoch": 9.761643835616438, + "grad_norm": 3.4534666538238525, + "learning_rate": 2.6585489599188234e-07, + "loss": 0.0201, + "step": 10689 + }, + { + "epoch": 9.762557077625571, + "grad_norm": 0.2822260856628418, + "learning_rate": 2.648401826484018e-07, + "loss": 0.0019, + "step": 10690 + }, + { + "epoch": 9.763470319634703, + "grad_norm": 4.341162204742432, + "learning_rate": 2.6382546930492135e-07, + "loss": 0.0287, + "step": 10691 + }, + { + "epoch": 9.764383561643836, + "grad_norm": 37.93056106567383, + "learning_rate": 2.6281075596144094e-07, + "loss": 0.3747, + "step": 10692 + }, + { + "epoch": 9.765296803652967, + "grad_norm": 0.46273380517959595, + "learning_rate": 2.617960426179604e-07, + "loss": 0.0024, + "step": 10693 + }, + { + "epoch": 9.7662100456621, + "grad_norm": 0.09658174961805344, + "learning_rate": 2.6078132927448e-07, + "loss": 0.0005, + "step": 10694 + }, + { + "epoch": 9.767123287671232, + "grad_norm": 46.91368103027344, + "learning_rate": 2.597666159309995e-07, + "loss": 0.2894, + "step": 10695 + }, + { + "epoch": 9.768036529680366, + "grad_norm": 10.210230827331543, + "learning_rate": 2.58751902587519e-07, + "loss": 0.0593, + "step": 10696 + }, + { + "epoch": 9.768949771689497, + "grad_norm": 47.912784576416016, + "learning_rate": 2.577371892440386e-07, + "loss": 0.3216, + "step": 10697 + }, + { + "epoch": 9.76986301369863, + "grad_norm": 1.0039633512496948, + "learning_rate": 2.567224759005581e-07, + "loss": 0.004, + "step": 10698 + }, + { + "epoch": 9.770776255707762, + "grad_norm": 7.973285675048828, + "learning_rate": 2.557077625570777e-07, + "loss": 0.0733, + "step": 10699 + }, + { + "epoch": 9.771689497716896, + "grad_norm": 0.5823355913162231, + "learning_rate": 2.5469304921359715e-07, + "loss": 0.004, + "step": 10700 + }, + { + "epoch": 9.772602739726027, + "grad_norm": 1.6584185361862183, + "learning_rate": 2.536783358701167e-07, + "loss": 0.0153, + "step": 10701 + }, + { + "epoch": 9.77351598173516, + "grad_norm": 31.120532989501953, + "learning_rate": 2.5266362252663627e-07, + "loss": 0.2549, + "step": 10702 + }, + { + "epoch": 9.774429223744292, + "grad_norm": 12.733890533447266, + "learning_rate": 2.5164890918315575e-07, + "loss": 0.0807, + "step": 10703 + }, + { + "epoch": 9.775342465753425, + "grad_norm": 1.8708654642105103, + "learning_rate": 2.5063419583967534e-07, + "loss": 0.0117, + "step": 10704 + }, + { + "epoch": 9.776255707762557, + "grad_norm": 15.965044975280762, + "learning_rate": 2.496194824961948e-07, + "loss": 0.1406, + "step": 10705 + }, + { + "epoch": 9.77716894977169, + "grad_norm": 0.39707377552986145, + "learning_rate": 2.4860476915271435e-07, + "loss": 0.0026, + "step": 10706 + }, + { + "epoch": 9.778082191780822, + "grad_norm": 0.3141477108001709, + "learning_rate": 2.4759005580923394e-07, + "loss": 0.0015, + "step": 10707 + }, + { + "epoch": 9.778995433789955, + "grad_norm": 0.16226261854171753, + "learning_rate": 2.465753424657534e-07, + "loss": 0.0008, + "step": 10708 + }, + { + "epoch": 9.779908675799087, + "grad_norm": 29.276588439941406, + "learning_rate": 2.45560629122273e-07, + "loss": 0.1459, + "step": 10709 + }, + { + "epoch": 9.780821917808218, + "grad_norm": 0.07568266987800598, + "learning_rate": 2.445459157787925e-07, + "loss": 0.0004, + "step": 10710 + }, + { + "epoch": 9.781735159817352, + "grad_norm": 0.7055368423461914, + "learning_rate": 2.43531202435312e-07, + "loss": 0.0044, + "step": 10711 + }, + { + "epoch": 9.782648401826483, + "grad_norm": 0.355287104845047, + "learning_rate": 2.425164890918316e-07, + "loss": 0.0015, + "step": 10712 + }, + { + "epoch": 9.783561643835617, + "grad_norm": 50.21063232421875, + "learning_rate": 2.415017757483511e-07, + "loss": 0.2614, + "step": 10713 + }, + { + "epoch": 9.784474885844748, + "grad_norm": 0.6812466382980347, + "learning_rate": 2.4048706240487067e-07, + "loss": 0.0039, + "step": 10714 + }, + { + "epoch": 9.785388127853881, + "grad_norm": 0.8444032669067383, + "learning_rate": 2.3947234906139015e-07, + "loss": 0.0055, + "step": 10715 + }, + { + "epoch": 9.786301369863013, + "grad_norm": 2.415692090988159, + "learning_rate": 2.384576357179097e-07, + "loss": 0.0054, + "step": 10716 + }, + { + "epoch": 9.787214611872146, + "grad_norm": 65.8238296508789, + "learning_rate": 2.3744292237442925e-07, + "loss": 0.2389, + "step": 10717 + }, + { + "epoch": 9.788127853881278, + "grad_norm": 23.98836326599121, + "learning_rate": 2.3642820903094878e-07, + "loss": 0.278, + "step": 10718 + }, + { + "epoch": 9.789041095890411, + "grad_norm": 6.957454204559326, + "learning_rate": 2.354134956874683e-07, + "loss": 0.0192, + "step": 10719 + }, + { + "epoch": 9.789954337899543, + "grad_norm": 34.54537582397461, + "learning_rate": 2.3439878234398785e-07, + "loss": 0.2077, + "step": 10720 + }, + { + "epoch": 9.790867579908676, + "grad_norm": 8.595416069030762, + "learning_rate": 2.3338406900050735e-07, + "loss": 0.0396, + "step": 10721 + }, + { + "epoch": 9.791780821917808, + "grad_norm": 0.0775180384516716, + "learning_rate": 2.323693556570269e-07, + "loss": 0.0005, + "step": 10722 + }, + { + "epoch": 9.792694063926941, + "grad_norm": 0.0029776848386973143, + "learning_rate": 2.3135464231354645e-07, + "loss": 0.0, + "step": 10723 + }, + { + "epoch": 9.793607305936073, + "grad_norm": 6.7286696434021, + "learning_rate": 2.3033992897006598e-07, + "loss": 0.0296, + "step": 10724 + }, + { + "epoch": 9.794520547945206, + "grad_norm": 0.2741451561450958, + "learning_rate": 2.293252156265855e-07, + "loss": 0.002, + "step": 10725 + }, + { + "epoch": 9.795433789954338, + "grad_norm": 5.627778053283691, + "learning_rate": 2.2831050228310502e-07, + "loss": 0.0309, + "step": 10726 + }, + { + "epoch": 9.796347031963471, + "grad_norm": 11.578512191772461, + "learning_rate": 2.2729578893962458e-07, + "loss": 0.0648, + "step": 10727 + }, + { + "epoch": 9.797260273972602, + "grad_norm": 23.423080444335938, + "learning_rate": 2.262810755961441e-07, + "loss": 0.1591, + "step": 10728 + }, + { + "epoch": 9.798173515981736, + "grad_norm": 1.5389338731765747, + "learning_rate": 2.2526636225266364e-07, + "loss": 0.008, + "step": 10729 + }, + { + "epoch": 9.799086757990867, + "grad_norm": 104.63399505615234, + "learning_rate": 2.2425164890918318e-07, + "loss": 1.3872, + "step": 10730 + }, + { + "epoch": 9.8, + "grad_norm": 24.447925567626953, + "learning_rate": 2.2323693556570268e-07, + "loss": 0.1375, + "step": 10731 + }, + { + "epoch": 9.800913242009132, + "grad_norm": 0.490585058927536, + "learning_rate": 2.2222222222222224e-07, + "loss": 0.0035, + "step": 10732 + }, + { + "epoch": 9.801826484018266, + "grad_norm": 2.274513006210327, + "learning_rate": 2.2120750887874178e-07, + "loss": 0.0143, + "step": 10733 + }, + { + "epoch": 9.802739726027397, + "grad_norm": 5.053993225097656, + "learning_rate": 2.201927955352613e-07, + "loss": 0.0202, + "step": 10734 + }, + { + "epoch": 9.80365296803653, + "grad_norm": 2.36989426612854, + "learning_rate": 2.1917808219178084e-07, + "loss": 0.0122, + "step": 10735 + }, + { + "epoch": 9.804566210045662, + "grad_norm": 2.415203332901001, + "learning_rate": 2.1816336884830035e-07, + "loss": 0.0141, + "step": 10736 + }, + { + "epoch": 9.805479452054794, + "grad_norm": 0.049755215644836426, + "learning_rate": 2.171486555048199e-07, + "loss": 0.0003, + "step": 10737 + }, + { + "epoch": 9.806392694063927, + "grad_norm": 34.181915283203125, + "learning_rate": 2.1613394216133944e-07, + "loss": 0.3594, + "step": 10738 + }, + { + "epoch": 9.807305936073059, + "grad_norm": 2.1292479038238525, + "learning_rate": 2.1511922881785898e-07, + "loss": 0.0105, + "step": 10739 + }, + { + "epoch": 9.808219178082192, + "grad_norm": 13.206794738769531, + "learning_rate": 2.141045154743785e-07, + "loss": 0.0969, + "step": 10740 + }, + { + "epoch": 9.809132420091323, + "grad_norm": 0.7813107371330261, + "learning_rate": 2.1308980213089802e-07, + "loss": 0.003, + "step": 10741 + }, + { + "epoch": 9.810045662100457, + "grad_norm": 54.3937873840332, + "learning_rate": 2.1207508878741758e-07, + "loss": 0.3806, + "step": 10742 + }, + { + "epoch": 9.810958904109588, + "grad_norm": 15.128925323486328, + "learning_rate": 2.110603754439371e-07, + "loss": 0.0676, + "step": 10743 + }, + { + "epoch": 9.811872146118722, + "grad_norm": 5.362679481506348, + "learning_rate": 2.1004566210045664e-07, + "loss": 0.0262, + "step": 10744 + }, + { + "epoch": 9.812785388127853, + "grad_norm": 1.5418082475662231, + "learning_rate": 2.0903094875697618e-07, + "loss": 0.0069, + "step": 10745 + }, + { + "epoch": 9.813698630136987, + "grad_norm": 0.23665454983711243, + "learning_rate": 2.0801623541349568e-07, + "loss": 0.0013, + "step": 10746 + }, + { + "epoch": 9.814611872146118, + "grad_norm": 0.16908839344978333, + "learning_rate": 2.0700152207001524e-07, + "loss": 0.0008, + "step": 10747 + }, + { + "epoch": 9.815525114155252, + "grad_norm": 2.964594841003418, + "learning_rate": 2.0598680872653478e-07, + "loss": 0.0137, + "step": 10748 + }, + { + "epoch": 9.816438356164383, + "grad_norm": 0.8523082137107849, + "learning_rate": 2.049720953830543e-07, + "loss": 0.0049, + "step": 10749 + }, + { + "epoch": 9.817351598173516, + "grad_norm": 0.04448289796710014, + "learning_rate": 2.0395738203957384e-07, + "loss": 0.0003, + "step": 10750 + }, + { + "epoch": 9.818264840182648, + "grad_norm": 4.375729560852051, + "learning_rate": 2.0294266869609335e-07, + "loss": 0.0265, + "step": 10751 + }, + { + "epoch": 9.819178082191781, + "grad_norm": 8.771864891052246, + "learning_rate": 2.019279553526129e-07, + "loss": 0.0419, + "step": 10752 + }, + { + "epoch": 9.820091324200913, + "grad_norm": 1.1906418800354004, + "learning_rate": 2.0091324200913244e-07, + "loss": 0.0045, + "step": 10753 + }, + { + "epoch": 9.821004566210046, + "grad_norm": 0.5733381509780884, + "learning_rate": 1.9989852866565198e-07, + "loss": 0.0031, + "step": 10754 + }, + { + "epoch": 9.821917808219178, + "grad_norm": 33.4375, + "learning_rate": 1.988838153221715e-07, + "loss": 0.2812, + "step": 10755 + }, + { + "epoch": 9.822831050228311, + "grad_norm": 96.47332763671875, + "learning_rate": 1.9786910197869102e-07, + "loss": 2.6394, + "step": 10756 + }, + { + "epoch": 9.823744292237443, + "grad_norm": 3.1896259784698486, + "learning_rate": 1.9685438863521058e-07, + "loss": 0.0205, + "step": 10757 + }, + { + "epoch": 9.824657534246576, + "grad_norm": 9.487210273742676, + "learning_rate": 1.958396752917301e-07, + "loss": 0.0401, + "step": 10758 + }, + { + "epoch": 9.825570776255708, + "grad_norm": 2.7383127212524414, + "learning_rate": 1.9482496194824964e-07, + "loss": 0.0073, + "step": 10759 + }, + { + "epoch": 9.826484018264841, + "grad_norm": 0.15967556834220886, + "learning_rate": 1.9381024860476918e-07, + "loss": 0.001, + "step": 10760 + }, + { + "epoch": 9.827397260273973, + "grad_norm": 12.021398544311523, + "learning_rate": 1.9279553526128868e-07, + "loss": 0.0798, + "step": 10761 + }, + { + "epoch": 9.828310502283106, + "grad_norm": 4.27122688293457, + "learning_rate": 1.9178082191780824e-07, + "loss": 0.0199, + "step": 10762 + }, + { + "epoch": 9.829223744292237, + "grad_norm": 3.77594256401062, + "learning_rate": 1.9076610857432778e-07, + "loss": 0.0221, + "step": 10763 + }, + { + "epoch": 9.830136986301369, + "grad_norm": 0.6479183435440063, + "learning_rate": 1.897513952308473e-07, + "loss": 0.0028, + "step": 10764 + }, + { + "epoch": 9.831050228310502, + "grad_norm": 0.11161081492900848, + "learning_rate": 1.8873668188736684e-07, + "loss": 0.0005, + "step": 10765 + }, + { + "epoch": 9.831963470319634, + "grad_norm": 0.02663734368979931, + "learning_rate": 1.8772196854388635e-07, + "loss": 0.0001, + "step": 10766 + }, + { + "epoch": 9.832876712328767, + "grad_norm": 12.20815372467041, + "learning_rate": 1.867072552004059e-07, + "loss": 0.0569, + "step": 10767 + }, + { + "epoch": 9.833789954337899, + "grad_norm": 0.08624356240034103, + "learning_rate": 1.8569254185692544e-07, + "loss": 0.0005, + "step": 10768 + }, + { + "epoch": 9.834703196347032, + "grad_norm": 22.893260955810547, + "learning_rate": 1.8467782851344495e-07, + "loss": 0.1576, + "step": 10769 + }, + { + "epoch": 9.835616438356164, + "grad_norm": 0.3639664053916931, + "learning_rate": 1.836631151699645e-07, + "loss": 0.0019, + "step": 10770 + }, + { + "epoch": 9.836529680365297, + "grad_norm": 4.837922096252441, + "learning_rate": 1.8264840182648401e-07, + "loss": 0.0193, + "step": 10771 + }, + { + "epoch": 9.837442922374429, + "grad_norm": 0.10901658236980438, + "learning_rate": 1.8163368848300357e-07, + "loss": 0.0007, + "step": 10772 + }, + { + "epoch": 9.838356164383562, + "grad_norm": 23.43638038635254, + "learning_rate": 1.806189751395231e-07, + "loss": 0.1067, + "step": 10773 + }, + { + "epoch": 9.839269406392694, + "grad_norm": 48.757850646972656, + "learning_rate": 1.7960426179604261e-07, + "loss": 0.3006, + "step": 10774 + }, + { + "epoch": 9.840182648401827, + "grad_norm": 2.3703298568725586, + "learning_rate": 1.7858954845256217e-07, + "loss": 0.0066, + "step": 10775 + }, + { + "epoch": 9.841095890410958, + "grad_norm": 2.9270312786102295, + "learning_rate": 1.7757483510908168e-07, + "loss": 0.0162, + "step": 10776 + }, + { + "epoch": 9.842009132420092, + "grad_norm": 0.3230913281440735, + "learning_rate": 1.7656012176560124e-07, + "loss": 0.0029, + "step": 10777 + }, + { + "epoch": 9.842922374429223, + "grad_norm": 0.9996331930160522, + "learning_rate": 1.7554540842212077e-07, + "loss": 0.0062, + "step": 10778 + }, + { + "epoch": 9.843835616438357, + "grad_norm": 0.059583764523267746, + "learning_rate": 1.7453069507864028e-07, + "loss": 0.0003, + "step": 10779 + }, + { + "epoch": 9.844748858447488, + "grad_norm": 0.13136650621891022, + "learning_rate": 1.7351598173515984e-07, + "loss": 0.0006, + "step": 10780 + }, + { + "epoch": 9.845662100456622, + "grad_norm": 29.147220611572266, + "learning_rate": 1.7250126839167935e-07, + "loss": 0.1254, + "step": 10781 + }, + { + "epoch": 9.846575342465753, + "grad_norm": 0.31286442279815674, + "learning_rate": 1.714865550481989e-07, + "loss": 0.0018, + "step": 10782 + }, + { + "epoch": 9.847488584474887, + "grad_norm": 0.03459823504090309, + "learning_rate": 1.7047184170471844e-07, + "loss": 0.0001, + "step": 10783 + }, + { + "epoch": 9.848401826484018, + "grad_norm": 3.451012372970581, + "learning_rate": 1.6945712836123795e-07, + "loss": 0.0236, + "step": 10784 + }, + { + "epoch": 9.849315068493151, + "grad_norm": 6.418190956115723, + "learning_rate": 1.684424150177575e-07, + "loss": 0.0278, + "step": 10785 + }, + { + "epoch": 9.850228310502283, + "grad_norm": 100.58836364746094, + "learning_rate": 1.6742770167427701e-07, + "loss": 0.9482, + "step": 10786 + }, + { + "epoch": 9.851141552511416, + "grad_norm": 0.8484893441200256, + "learning_rate": 1.6641298833079657e-07, + "loss": 0.0053, + "step": 10787 + }, + { + "epoch": 9.852054794520548, + "grad_norm": 4.427528381347656, + "learning_rate": 1.653982749873161e-07, + "loss": 0.0289, + "step": 10788 + }, + { + "epoch": 9.852968036529681, + "grad_norm": 6.01402473449707, + "learning_rate": 1.6438356164383561e-07, + "loss": 0.045, + "step": 10789 + }, + { + "epoch": 9.853881278538813, + "grad_norm": 12.531757354736328, + "learning_rate": 1.6336884830035517e-07, + "loss": 0.0898, + "step": 10790 + }, + { + "epoch": 9.854794520547944, + "grad_norm": 9.880745887756348, + "learning_rate": 1.6235413495687468e-07, + "loss": 0.0557, + "step": 10791 + }, + { + "epoch": 9.855707762557078, + "grad_norm": 6.210879802703857, + "learning_rate": 1.6133942161339424e-07, + "loss": 0.0381, + "step": 10792 + }, + { + "epoch": 9.85662100456621, + "grad_norm": 3.8671088218688965, + "learning_rate": 1.6032470826991377e-07, + "loss": 0.0138, + "step": 10793 + }, + { + "epoch": 9.857534246575343, + "grad_norm": 28.742530822753906, + "learning_rate": 1.5930999492643328e-07, + "loss": 0.1985, + "step": 10794 + }, + { + "epoch": 9.858447488584474, + "grad_norm": 0.05834566801786423, + "learning_rate": 1.5829528158295284e-07, + "loss": 0.0004, + "step": 10795 + }, + { + "epoch": 9.859360730593608, + "grad_norm": 3.8639094829559326, + "learning_rate": 1.5728056823947235e-07, + "loss": 0.0187, + "step": 10796 + }, + { + "epoch": 9.860273972602739, + "grad_norm": 0.3022826015949249, + "learning_rate": 1.562658548959919e-07, + "loss": 0.002, + "step": 10797 + }, + { + "epoch": 9.861187214611872, + "grad_norm": 9.871163368225098, + "learning_rate": 1.5525114155251144e-07, + "loss": 0.0528, + "step": 10798 + }, + { + "epoch": 9.862100456621004, + "grad_norm": 39.08793640136719, + "learning_rate": 1.5423642820903097e-07, + "loss": 0.0466, + "step": 10799 + }, + { + "epoch": 9.863013698630137, + "grad_norm": 3.4323551654815674, + "learning_rate": 1.532217148655505e-07, + "loss": 0.022, + "step": 10800 + }, + { + "epoch": 9.863926940639269, + "grad_norm": 67.01792907714844, + "learning_rate": 1.5220700152207e-07, + "loss": 0.5391, + "step": 10801 + }, + { + "epoch": 9.864840182648402, + "grad_norm": 1.7265360355377197, + "learning_rate": 1.5119228817858955e-07, + "loss": 0.0124, + "step": 10802 + }, + { + "epoch": 9.865753424657534, + "grad_norm": 47.79594802856445, + "learning_rate": 1.501775748351091e-07, + "loss": 0.4643, + "step": 10803 + }, + { + "epoch": 9.866666666666667, + "grad_norm": 0.3202807605266571, + "learning_rate": 1.4916286149162864e-07, + "loss": 0.0014, + "step": 10804 + }, + { + "epoch": 9.867579908675799, + "grad_norm": 30.95623207092285, + "learning_rate": 1.4814814814814817e-07, + "loss": 0.1689, + "step": 10805 + }, + { + "epoch": 9.868493150684932, + "grad_norm": 8.871271133422852, + "learning_rate": 1.4713343480466768e-07, + "loss": 0.0233, + "step": 10806 + }, + { + "epoch": 9.869406392694064, + "grad_norm": 0.11199638247489929, + "learning_rate": 1.461187214611872e-07, + "loss": 0.0006, + "step": 10807 + }, + { + "epoch": 9.870319634703197, + "grad_norm": 98.76514434814453, + "learning_rate": 1.4510400811770677e-07, + "loss": 2.1209, + "step": 10808 + }, + { + "epoch": 9.871232876712329, + "grad_norm": 0.18546518683433533, + "learning_rate": 1.440892947742263e-07, + "loss": 0.0008, + "step": 10809 + }, + { + "epoch": 9.872146118721462, + "grad_norm": 5.127761363983154, + "learning_rate": 1.4307458143074584e-07, + "loss": 0.0321, + "step": 10810 + }, + { + "epoch": 9.873059360730593, + "grad_norm": 4.942172527313232, + "learning_rate": 1.4205986808726534e-07, + "loss": 0.0352, + "step": 10811 + }, + { + "epoch": 9.873972602739727, + "grad_norm": 3.9763505458831787, + "learning_rate": 1.4104515474378488e-07, + "loss": 0.0261, + "step": 10812 + }, + { + "epoch": 9.874885844748858, + "grad_norm": 0.2228347659111023, + "learning_rate": 1.4003044140030444e-07, + "loss": 0.0012, + "step": 10813 + }, + { + "epoch": 9.875799086757992, + "grad_norm": 0.07009941339492798, + "learning_rate": 1.3901572805682397e-07, + "loss": 0.0004, + "step": 10814 + }, + { + "epoch": 9.876712328767123, + "grad_norm": 0.12936845421791077, + "learning_rate": 1.380010147133435e-07, + "loss": 0.0009, + "step": 10815 + }, + { + "epoch": 9.877625570776257, + "grad_norm": 2.643165111541748, + "learning_rate": 1.36986301369863e-07, + "loss": 0.0149, + "step": 10816 + }, + { + "epoch": 9.878538812785388, + "grad_norm": 0.2836993634700775, + "learning_rate": 1.3597158802638254e-07, + "loss": 0.0013, + "step": 10817 + }, + { + "epoch": 9.87945205479452, + "grad_norm": 0.5967729687690735, + "learning_rate": 1.3495687468290208e-07, + "loss": 0.0033, + "step": 10818 + }, + { + "epoch": 9.880365296803653, + "grad_norm": 38.54998016357422, + "learning_rate": 1.3394216133942164e-07, + "loss": 0.3666, + "step": 10819 + }, + { + "epoch": 9.881278538812785, + "grad_norm": 1.632049322128296, + "learning_rate": 1.3292744799594117e-07, + "loss": 0.0066, + "step": 10820 + }, + { + "epoch": 9.882191780821918, + "grad_norm": 1.1632083654403687, + "learning_rate": 1.3191273465246068e-07, + "loss": 0.0055, + "step": 10821 + }, + { + "epoch": 9.88310502283105, + "grad_norm": 0.6995784044265747, + "learning_rate": 1.308980213089802e-07, + "loss": 0.0039, + "step": 10822 + }, + { + "epoch": 9.884018264840183, + "grad_norm": 0.22158150374889374, + "learning_rate": 1.2988330796549974e-07, + "loss": 0.0016, + "step": 10823 + }, + { + "epoch": 9.884931506849314, + "grad_norm": 7.695059776306152, + "learning_rate": 1.288685946220193e-07, + "loss": 0.0518, + "step": 10824 + }, + { + "epoch": 9.885844748858448, + "grad_norm": 0.8056147694587708, + "learning_rate": 1.2785388127853884e-07, + "loss": 0.005, + "step": 10825 + }, + { + "epoch": 9.88675799086758, + "grad_norm": 6.254096984863281, + "learning_rate": 1.2683916793505834e-07, + "loss": 0.0371, + "step": 10826 + }, + { + "epoch": 9.887671232876713, + "grad_norm": 0.015082832425832748, + "learning_rate": 1.2582445459157788e-07, + "loss": 0.0001, + "step": 10827 + }, + { + "epoch": 9.888584474885844, + "grad_norm": 0.15423116087913513, + "learning_rate": 1.248097412480974e-07, + "loss": 0.0008, + "step": 10828 + }, + { + "epoch": 9.889497716894978, + "grad_norm": 0.417183518409729, + "learning_rate": 1.2379502790461697e-07, + "loss": 0.0022, + "step": 10829 + }, + { + "epoch": 9.89041095890411, + "grad_norm": 0.1330004185438156, + "learning_rate": 1.227803145611365e-07, + "loss": 0.001, + "step": 10830 + }, + { + "epoch": 9.891324200913242, + "grad_norm": 10.927547454833984, + "learning_rate": 1.21765601217656e-07, + "loss": 0.0963, + "step": 10831 + }, + { + "epoch": 9.892237442922374, + "grad_norm": 0.11763051897287369, + "learning_rate": 1.2075088787417554e-07, + "loss": 0.0006, + "step": 10832 + }, + { + "epoch": 9.893150684931507, + "grad_norm": 62.8052978515625, + "learning_rate": 1.1973617453069508e-07, + "loss": 0.7928, + "step": 10833 + }, + { + "epoch": 9.894063926940639, + "grad_norm": 0.045423123985528946, + "learning_rate": 1.1872146118721462e-07, + "loss": 0.0003, + "step": 10834 + }, + { + "epoch": 9.894977168949772, + "grad_norm": 0.8357120156288147, + "learning_rate": 1.1770674784373416e-07, + "loss": 0.0055, + "step": 10835 + }, + { + "epoch": 9.895890410958904, + "grad_norm": 0.10183274745941162, + "learning_rate": 1.1669203450025368e-07, + "loss": 0.0007, + "step": 10836 + }, + { + "epoch": 9.896803652968037, + "grad_norm": 2.0300772190093994, + "learning_rate": 1.1567732115677322e-07, + "loss": 0.0096, + "step": 10837 + }, + { + "epoch": 9.897716894977169, + "grad_norm": 5.742561340332031, + "learning_rate": 1.1466260781329276e-07, + "loss": 0.027, + "step": 10838 + }, + { + "epoch": 9.898630136986302, + "grad_norm": 0.6149722337722778, + "learning_rate": 1.1364789446981229e-07, + "loss": 0.0032, + "step": 10839 + }, + { + "epoch": 9.899543378995434, + "grad_norm": 3.9962072372436523, + "learning_rate": 1.1263318112633182e-07, + "loss": 0.0309, + "step": 10840 + }, + { + "epoch": 9.900456621004567, + "grad_norm": 36.862300872802734, + "learning_rate": 1.1161846778285134e-07, + "loss": 0.295, + "step": 10841 + }, + { + "epoch": 9.901369863013699, + "grad_norm": 0.07957632094621658, + "learning_rate": 1.1060375443937089e-07, + "loss": 0.0005, + "step": 10842 + }, + { + "epoch": 9.902283105022832, + "grad_norm": 0.5444740653038025, + "learning_rate": 1.0958904109589042e-07, + "loss": 0.004, + "step": 10843 + }, + { + "epoch": 9.903196347031963, + "grad_norm": 1.0311304330825806, + "learning_rate": 1.0857432775240996e-07, + "loss": 0.0064, + "step": 10844 + }, + { + "epoch": 9.904109589041095, + "grad_norm": 60.3370475769043, + "learning_rate": 1.0755961440892949e-07, + "loss": 0.4353, + "step": 10845 + }, + { + "epoch": 9.905022831050228, + "grad_norm": 1.7110326290130615, + "learning_rate": 1.0654490106544901e-07, + "loss": 0.0096, + "step": 10846 + }, + { + "epoch": 9.90593607305936, + "grad_norm": 5.9434428215026855, + "learning_rate": 1.0553018772196856e-07, + "loss": 0.033, + "step": 10847 + }, + { + "epoch": 9.906849315068493, + "grad_norm": 0.15176701545715332, + "learning_rate": 1.0451547437848809e-07, + "loss": 0.0008, + "step": 10848 + }, + { + "epoch": 9.907762557077625, + "grad_norm": 1.203794002532959, + "learning_rate": 1.0350076103500762e-07, + "loss": 0.0063, + "step": 10849 + }, + { + "epoch": 9.908675799086758, + "grad_norm": 3.2084949016571045, + "learning_rate": 1.0248604769152715e-07, + "loss": 0.0153, + "step": 10850 + }, + { + "epoch": 9.90958904109589, + "grad_norm": 4.674961090087891, + "learning_rate": 1.0147133434804667e-07, + "loss": 0.0309, + "step": 10851 + }, + { + "epoch": 9.910502283105023, + "grad_norm": 0.44873419404029846, + "learning_rate": 1.0045662100456622e-07, + "loss": 0.0027, + "step": 10852 + }, + { + "epoch": 9.911415525114155, + "grad_norm": 7.229349613189697, + "learning_rate": 9.944190766108575e-08, + "loss": 0.055, + "step": 10853 + }, + { + "epoch": 9.912328767123288, + "grad_norm": 94.42486572265625, + "learning_rate": 9.842719431760529e-08, + "loss": 0.9052, + "step": 10854 + }, + { + "epoch": 9.91324200913242, + "grad_norm": 2.5300748348236084, + "learning_rate": 9.741248097412482e-08, + "loss": 0.0175, + "step": 10855 + }, + { + "epoch": 9.914155251141553, + "grad_norm": 3.560786008834839, + "learning_rate": 9.639776763064434e-08, + "loss": 0.0233, + "step": 10856 + }, + { + "epoch": 9.915068493150685, + "grad_norm": 0.5911676287651062, + "learning_rate": 9.538305428716389e-08, + "loss": 0.0035, + "step": 10857 + }, + { + "epoch": 9.915981735159818, + "grad_norm": 1.2663365602493286, + "learning_rate": 9.436834094368342e-08, + "loss": 0.0091, + "step": 10858 + }, + { + "epoch": 9.91689497716895, + "grad_norm": 20.18129539489746, + "learning_rate": 9.335362760020295e-08, + "loss": 0.1916, + "step": 10859 + }, + { + "epoch": 9.917808219178083, + "grad_norm": 3.37673020362854, + "learning_rate": 9.233891425672247e-08, + "loss": 0.0107, + "step": 10860 + }, + { + "epoch": 9.918721461187214, + "grad_norm": 176.1060791015625, + "learning_rate": 9.132420091324201e-08, + "loss": 0.2288, + "step": 10861 + }, + { + "epoch": 9.919634703196348, + "grad_norm": 39.66014099121094, + "learning_rate": 9.030948756976155e-08, + "loss": 0.226, + "step": 10862 + }, + { + "epoch": 9.92054794520548, + "grad_norm": 0.2796497941017151, + "learning_rate": 8.929477422628109e-08, + "loss": 0.0021, + "step": 10863 + }, + { + "epoch": 9.921461187214613, + "grad_norm": 3.6514155864715576, + "learning_rate": 8.828006088280062e-08, + "loss": 0.0252, + "step": 10864 + }, + { + "epoch": 9.922374429223744, + "grad_norm": 0.08898042887449265, + "learning_rate": 8.726534753932014e-08, + "loss": 0.0005, + "step": 10865 + }, + { + "epoch": 9.923287671232877, + "grad_norm": 23.356060028076172, + "learning_rate": 8.625063419583967e-08, + "loss": 0.1057, + "step": 10866 + }, + { + "epoch": 9.924200913242009, + "grad_norm": 21.468669891357422, + "learning_rate": 8.523592085235922e-08, + "loss": 0.094, + "step": 10867 + }, + { + "epoch": 9.925114155251142, + "grad_norm": 0.8777492046356201, + "learning_rate": 8.422120750887875e-08, + "loss": 0.0042, + "step": 10868 + }, + { + "epoch": 9.926027397260274, + "grad_norm": 2.431199550628662, + "learning_rate": 8.320649416539829e-08, + "loss": 0.0105, + "step": 10869 + }, + { + "epoch": 9.926940639269407, + "grad_norm": 0.8925474286079407, + "learning_rate": 8.219178082191781e-08, + "loss": 0.0061, + "step": 10870 + }, + { + "epoch": 9.927853881278539, + "grad_norm": 2.3739190101623535, + "learning_rate": 8.117706747843734e-08, + "loss": 0.0113, + "step": 10871 + }, + { + "epoch": 9.92876712328767, + "grad_norm": 98.28907775878906, + "learning_rate": 8.016235413495689e-08, + "loss": 2.2967, + "step": 10872 + }, + { + "epoch": 9.929680365296804, + "grad_norm": 0.2391815334558487, + "learning_rate": 7.914764079147642e-08, + "loss": 0.0014, + "step": 10873 + }, + { + "epoch": 9.930593607305935, + "grad_norm": 0.04936845228075981, + "learning_rate": 7.813292744799595e-08, + "loss": 0.0002, + "step": 10874 + }, + { + "epoch": 9.931506849315069, + "grad_norm": 0.21143774688243866, + "learning_rate": 7.711821410451549e-08, + "loss": 0.001, + "step": 10875 + }, + { + "epoch": 9.9324200913242, + "grad_norm": 114.20928192138672, + "learning_rate": 7.6103500761035e-08, + "loss": 0.898, + "step": 10876 + }, + { + "epoch": 9.933333333333334, + "grad_norm": 1.0730109214782715, + "learning_rate": 7.508878741755455e-08, + "loss": 0.0058, + "step": 10877 + }, + { + "epoch": 9.934246575342465, + "grad_norm": 0.3789841830730438, + "learning_rate": 7.407407407407409e-08, + "loss": 0.0028, + "step": 10878 + }, + { + "epoch": 9.935159817351598, + "grad_norm": 0.19728346168994904, + "learning_rate": 7.30593607305936e-08, + "loss": 0.0012, + "step": 10879 + }, + { + "epoch": 9.93607305936073, + "grad_norm": 1.3712422847747803, + "learning_rate": 7.204464738711315e-08, + "loss": 0.0102, + "step": 10880 + }, + { + "epoch": 9.936986301369863, + "grad_norm": 0.01431723777204752, + "learning_rate": 7.102993404363267e-08, + "loss": 0.0001, + "step": 10881 + }, + { + "epoch": 9.937899543378995, + "grad_norm": 0.9490299820899963, + "learning_rate": 7.001522070015222e-08, + "loss": 0.0055, + "step": 10882 + }, + { + "epoch": 9.938812785388128, + "grad_norm": 1.7237192392349243, + "learning_rate": 6.900050735667175e-08, + "loss": 0.012, + "step": 10883 + }, + { + "epoch": 9.93972602739726, + "grad_norm": 3.001801013946533, + "learning_rate": 6.798579401319127e-08, + "loss": 0.0211, + "step": 10884 + }, + { + "epoch": 9.940639269406393, + "grad_norm": 2.2021234035491943, + "learning_rate": 6.697108066971082e-08, + "loss": 0.0126, + "step": 10885 + }, + { + "epoch": 9.941552511415525, + "grad_norm": 0.7447320818901062, + "learning_rate": 6.595636732623034e-08, + "loss": 0.0048, + "step": 10886 + }, + { + "epoch": 9.942465753424658, + "grad_norm": 0.2815524637699127, + "learning_rate": 6.494165398274987e-08, + "loss": 0.0013, + "step": 10887 + }, + { + "epoch": 9.94337899543379, + "grad_norm": 1.4987897872924805, + "learning_rate": 6.392694063926942e-08, + "loss": 0.0084, + "step": 10888 + }, + { + "epoch": 9.944292237442923, + "grad_norm": 0.8837406635284424, + "learning_rate": 6.291222729578894e-08, + "loss": 0.0044, + "step": 10889 + }, + { + "epoch": 9.945205479452055, + "grad_norm": 14.11324405670166, + "learning_rate": 6.189751395230848e-08, + "loss": 0.0658, + "step": 10890 + }, + { + "epoch": 9.946118721461188, + "grad_norm": 0.5858466029167175, + "learning_rate": 6.0882800608828e-08, + "loss": 0.0031, + "step": 10891 + }, + { + "epoch": 9.94703196347032, + "grad_norm": 0.7350096106529236, + "learning_rate": 5.986808726534754e-08, + "loss": 0.0048, + "step": 10892 + }, + { + "epoch": 9.947945205479453, + "grad_norm": 9.028334617614746, + "learning_rate": 5.885337392186708e-08, + "loss": 0.0469, + "step": 10893 + }, + { + "epoch": 9.948858447488584, + "grad_norm": 0.13384145498275757, + "learning_rate": 5.783866057838661e-08, + "loss": 0.0009, + "step": 10894 + }, + { + "epoch": 9.949771689497716, + "grad_norm": 0.5129113793373108, + "learning_rate": 5.6823947234906145e-08, + "loss": 0.0032, + "step": 10895 + }, + { + "epoch": 9.95068493150685, + "grad_norm": 2.6130895614624023, + "learning_rate": 5.580923389142567e-08, + "loss": 0.0142, + "step": 10896 + }, + { + "epoch": 9.951598173515983, + "grad_norm": 1.1258387565612793, + "learning_rate": 5.479452054794521e-08, + "loss": 0.0074, + "step": 10897 + }, + { + "epoch": 9.952511415525114, + "grad_norm": 0.36443084478378296, + "learning_rate": 5.3779807204464744e-08, + "loss": 0.0024, + "step": 10898 + }, + { + "epoch": 9.953424657534246, + "grad_norm": 0.2545385956764221, + "learning_rate": 5.276509386098428e-08, + "loss": 0.0017, + "step": 10899 + }, + { + "epoch": 9.954337899543379, + "grad_norm": 22.911148071289062, + "learning_rate": 5.175038051750381e-08, + "loss": 0.2578, + "step": 10900 + }, + { + "epoch": 9.95525114155251, + "grad_norm": 17.528337478637695, + "learning_rate": 5.073566717402334e-08, + "loss": 0.1774, + "step": 10901 + }, + { + "epoch": 9.956164383561644, + "grad_norm": 1.3288336992263794, + "learning_rate": 4.972095383054288e-08, + "loss": 0.0108, + "step": 10902 + }, + { + "epoch": 9.957077625570776, + "grad_norm": 0.7087932825088501, + "learning_rate": 4.870624048706241e-08, + "loss": 0.0032, + "step": 10903 + }, + { + "epoch": 9.957990867579909, + "grad_norm": 0.4314363896846771, + "learning_rate": 4.7691527143581944e-08, + "loss": 0.0018, + "step": 10904 + }, + { + "epoch": 9.95890410958904, + "grad_norm": 0.42461097240448, + "learning_rate": 4.667681380010148e-08, + "loss": 0.0026, + "step": 10905 + }, + { + "epoch": 9.959817351598174, + "grad_norm": 2.9496796131134033, + "learning_rate": 4.5662100456621004e-08, + "loss": 0.0191, + "step": 10906 + }, + { + "epoch": 9.960730593607305, + "grad_norm": 1.3482805490493774, + "learning_rate": 4.4647387113140544e-08, + "loss": 0.0086, + "step": 10907 + }, + { + "epoch": 9.961643835616439, + "grad_norm": 8.383976936340332, + "learning_rate": 4.363267376966007e-08, + "loss": 0.0484, + "step": 10908 + }, + { + "epoch": 9.96255707762557, + "grad_norm": 2.765540361404419, + "learning_rate": 4.261796042617961e-08, + "loss": 0.0246, + "step": 10909 + }, + { + "epoch": 9.963470319634704, + "grad_norm": 11.89219856262207, + "learning_rate": 4.1603247082699143e-08, + "loss": 0.0529, + "step": 10910 + }, + { + "epoch": 9.964383561643835, + "grad_norm": 2.6986584663391113, + "learning_rate": 4.058853373921867e-08, + "loss": 0.0138, + "step": 10911 + }, + { + "epoch": 9.965296803652969, + "grad_norm": 63.114471435546875, + "learning_rate": 3.957382039573821e-08, + "loss": 0.485, + "step": 10912 + }, + { + "epoch": 9.9662100456621, + "grad_norm": 0.08561558276414871, + "learning_rate": 3.855910705225774e-08, + "loss": 0.0007, + "step": 10913 + }, + { + "epoch": 9.967123287671233, + "grad_norm": 0.07242689281702042, + "learning_rate": 3.7544393708777276e-08, + "loss": 0.0005, + "step": 10914 + }, + { + "epoch": 9.968036529680365, + "grad_norm": 28.452295303344727, + "learning_rate": 3.65296803652968e-08, + "loss": 0.1959, + "step": 10915 + }, + { + "epoch": 9.968949771689498, + "grad_norm": 0.23376449942588806, + "learning_rate": 3.5514967021816336e-08, + "loss": 0.0015, + "step": 10916 + }, + { + "epoch": 9.96986301369863, + "grad_norm": 18.46352195739746, + "learning_rate": 3.4500253678335876e-08, + "loss": 0.1153, + "step": 10917 + }, + { + "epoch": 9.970776255707763, + "grad_norm": 1.2335513830184937, + "learning_rate": 3.348554033485541e-08, + "loss": 0.0073, + "step": 10918 + }, + { + "epoch": 9.971689497716895, + "grad_norm": 0.010000630281865597, + "learning_rate": 3.2470826991374936e-08, + "loss": 0.0001, + "step": 10919 + }, + { + "epoch": 9.972602739726028, + "grad_norm": 0.5370012521743774, + "learning_rate": 3.145611364789447e-08, + "loss": 0.0037, + "step": 10920 + }, + { + "epoch": 9.97351598173516, + "grad_norm": 0.02742135338485241, + "learning_rate": 3.0441400304414e-08, + "loss": 0.0001, + "step": 10921 + }, + { + "epoch": 9.974429223744291, + "grad_norm": 1.9026223421096802, + "learning_rate": 2.942668696093354e-08, + "loss": 0.0135, + "step": 10922 + }, + { + "epoch": 9.975342465753425, + "grad_norm": 0.11436855792999268, + "learning_rate": 2.8411973617453072e-08, + "loss": 0.0007, + "step": 10923 + }, + { + "epoch": 9.976255707762558, + "grad_norm": 0.5255295634269714, + "learning_rate": 2.7397260273972606e-08, + "loss": 0.0035, + "step": 10924 + }, + { + "epoch": 9.97716894977169, + "grad_norm": 0.7328878045082092, + "learning_rate": 2.638254693049214e-08, + "loss": 0.0049, + "step": 10925 + }, + { + "epoch": 9.978082191780821, + "grad_norm": 0.15278705954551697, + "learning_rate": 2.536783358701167e-08, + "loss": 0.0008, + "step": 10926 + }, + { + "epoch": 9.978995433789954, + "grad_norm": 0.4828379154205322, + "learning_rate": 2.4353120243531205e-08, + "loss": 0.0024, + "step": 10927 + }, + { + "epoch": 9.979908675799086, + "grad_norm": 14.777907371520996, + "learning_rate": 2.333840690005074e-08, + "loss": 0.0783, + "step": 10928 + }, + { + "epoch": 9.98082191780822, + "grad_norm": 0.038505133241415024, + "learning_rate": 2.2323693556570272e-08, + "loss": 0.0002, + "step": 10929 + }, + { + "epoch": 9.981735159817351, + "grad_norm": 0.09892095625400543, + "learning_rate": 2.1308980213089805e-08, + "loss": 0.0007, + "step": 10930 + }, + { + "epoch": 9.982648401826484, + "grad_norm": 1.4023325443267822, + "learning_rate": 2.0294266869609335e-08, + "loss": 0.0099, + "step": 10931 + }, + { + "epoch": 9.983561643835616, + "grad_norm": 1.5656229257583618, + "learning_rate": 1.927955352612887e-08, + "loss": 0.0123, + "step": 10932 + }, + { + "epoch": 9.98447488584475, + "grad_norm": 0.01988327130675316, + "learning_rate": 1.82648401826484e-08, + "loss": 0.0001, + "step": 10933 + }, + { + "epoch": 9.98538812785388, + "grad_norm": 11.410658836364746, + "learning_rate": 1.7250126839167938e-08, + "loss": 0.0851, + "step": 10934 + }, + { + "epoch": 9.986301369863014, + "grad_norm": 0.2963363826274872, + "learning_rate": 1.6235413495687468e-08, + "loss": 0.0023, + "step": 10935 + }, + { + "epoch": 9.987214611872146, + "grad_norm": 17.04043960571289, + "learning_rate": 1.5220700152207e-08, + "loss": 0.0687, + "step": 10936 + }, + { + "epoch": 9.988127853881279, + "grad_norm": 0.5303340554237366, + "learning_rate": 1.4205986808726536e-08, + "loss": 0.003, + "step": 10937 + }, + { + "epoch": 9.98904109589041, + "grad_norm": 22.14379119873047, + "learning_rate": 1.319127346524607e-08, + "loss": 0.1327, + "step": 10938 + }, + { + "epoch": 9.989954337899544, + "grad_norm": 46.49553298950195, + "learning_rate": 1.2176560121765603e-08, + "loss": 0.3655, + "step": 10939 + }, + { + "epoch": 9.990867579908675, + "grad_norm": 0.5970691442489624, + "learning_rate": 1.1161846778285136e-08, + "loss": 0.004, + "step": 10940 + }, + { + "epoch": 9.991780821917809, + "grad_norm": 7.944029331207275, + "learning_rate": 1.0147133434804667e-08, + "loss": 0.0434, + "step": 10941 + }, + { + "epoch": 9.99269406392694, + "grad_norm": 1.9030860662460327, + "learning_rate": 9.1324200913242e-09, + "loss": 0.0088, + "step": 10942 + }, + { + "epoch": 9.993607305936074, + "grad_norm": 15.690299034118652, + "learning_rate": 8.117706747843734e-09, + "loss": 0.0861, + "step": 10943 + }, + { + "epoch": 9.994520547945205, + "grad_norm": 1.1024819612503052, + "learning_rate": 7.102993404363268e-09, + "loss": 0.0056, + "step": 10944 + }, + { + "epoch": 9.995433789954339, + "grad_norm": 35.23597717285156, + "learning_rate": 6.088280060882801e-09, + "loss": 0.3262, + "step": 10945 + }, + { + "epoch": 9.99634703196347, + "grad_norm": 0.21397601068019867, + "learning_rate": 5.073566717402334e-09, + "loss": 0.0015, + "step": 10946 + }, + { + "epoch": 9.997260273972604, + "grad_norm": 2.6586451530456543, + "learning_rate": 4.058853373921867e-09, + "loss": 0.0165, + "step": 10947 + }, + { + "epoch": 9.998173515981735, + "grad_norm": 15.874773979187012, + "learning_rate": 3.0441400304414007e-09, + "loss": 0.0813, + "step": 10948 + }, + { + "epoch": 9.999086757990867, + "grad_norm": 0.07462238520383835, + "learning_rate": 2.0294266869609335e-09, + "loss": 0.0005, + "step": 10949 + }, + { + "epoch": 10.0, + "grad_norm": 0.09888054430484772, + "learning_rate": 1.0147133434804667e-09, + "loss": 0.0007, + "step": 10950 + } + ], + "logging_steps": 1.0, + "max_steps": 10950, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": -10950, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}