{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 10950, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009132420091324201, "grad_norm": 21.391523361206055, "learning_rate": 0.0, "loss": 0.2286, "step": 1 }, { "epoch": 0.0018264840182648401, "grad_norm": 44.01503372192383, "learning_rate": 9.1324200913242e-09, "loss": 0.9396, "step": 2 }, { "epoch": 0.0027397260273972603, "grad_norm": 11.075825691223145, "learning_rate": 1.82648401826484e-08, "loss": 0.1385, "step": 3 }, { "epoch": 0.0036529680365296802, "grad_norm": 90.4065170288086, "learning_rate": 2.7397260273972606e-08, "loss": 2.1215, "step": 4 }, { "epoch": 0.0045662100456621, "grad_norm": 82.90166473388672, "learning_rate": 3.65296803652968e-08, "loss": 1.4917, "step": 5 }, { "epoch": 0.005479452054794521, "grad_norm": 119.29186248779297, "learning_rate": 4.5662100456621004e-08, "loss": 2.8204, "step": 6 }, { "epoch": 0.006392694063926941, "grad_norm": 12.071160316467285, "learning_rate": 5.479452054794521e-08, "loss": 0.0863, "step": 7 }, { "epoch": 0.0073059360730593605, "grad_norm": 36.26288604736328, "learning_rate": 6.392694063926942e-08, "loss": 0.5237, "step": 8 }, { "epoch": 0.00821917808219178, "grad_norm": 81.33724212646484, "learning_rate": 7.30593607305936e-08, "loss": 3.02, "step": 9 }, { "epoch": 0.0091324200913242, "grad_norm": 79.85484313964844, "learning_rate": 8.219178082191781e-08, "loss": 3.5249, "step": 10 }, { "epoch": 0.01004566210045662, "grad_norm": 75.603271484375, "learning_rate": 9.132420091324201e-08, "loss": 1.3449, "step": 11 }, { "epoch": 0.010958904109589041, "grad_norm": 4.044003486633301, "learning_rate": 1.0045662100456622e-07, "loss": 0.0272, "step": 12 }, { "epoch": 0.011872146118721462, "grad_norm": 84.81195068359375, "learning_rate": 1.0958904109589042e-07, "loss": 1.7132, "step": 13 }, { "epoch": 0.012785388127853882, "grad_norm": 10.691021919250488, "learning_rate": 1.1872146118721462e-07, "loss": 0.0533, "step": 14 }, { "epoch": 0.0136986301369863, "grad_norm": 78.60084533691406, "learning_rate": 1.2785388127853884e-07, "loss": 2.7268, "step": 15 }, { "epoch": 0.014611872146118721, "grad_norm": 57.4786376953125, "learning_rate": 1.36986301369863e-07, "loss": 0.8429, "step": 16 }, { "epoch": 0.015525114155251141, "grad_norm": 78.68571472167969, "learning_rate": 1.461187214611872e-07, "loss": 0.7043, "step": 17 }, { "epoch": 0.01643835616438356, "grad_norm": 8.016889572143555, "learning_rate": 1.5525114155251144e-07, "loss": 0.0546, "step": 18 }, { "epoch": 0.017351598173515982, "grad_norm": 80.82862091064453, "learning_rate": 1.6438356164383561e-07, "loss": 3.1511, "step": 19 }, { "epoch": 0.0182648401826484, "grad_norm": 1.8268852233886719, "learning_rate": 1.7351598173515984e-07, "loss": 0.012, "step": 20 }, { "epoch": 0.019178082191780823, "grad_norm": 57.777896881103516, "learning_rate": 1.8264840182648401e-07, "loss": 0.6508, "step": 21 }, { "epoch": 0.02009132420091324, "grad_norm": 106.17963409423828, "learning_rate": 1.9178082191780824e-07, "loss": 3.8215, "step": 22 }, { "epoch": 0.021004566210045664, "grad_norm": 24.179880142211914, "learning_rate": 2.0091324200913244e-07, "loss": 0.3333, "step": 23 }, { "epoch": 0.021917808219178082, "grad_norm": 46.74858856201172, "learning_rate": 2.1004566210045664e-07, "loss": 0.4937, "step": 24 }, { "epoch": 0.0228310502283105, "grad_norm": 72.15716552734375, "learning_rate": 2.1917808219178084e-07, "loss": 1.1884, "step": 25 }, { "epoch": 0.023744292237442923, "grad_norm": 7.252676963806152, "learning_rate": 2.2831050228310502e-07, "loss": 0.0816, "step": 26 }, { "epoch": 0.024657534246575342, "grad_norm": 74.41161346435547, "learning_rate": 2.3744292237442925e-07, "loss": 1.5595, "step": 27 }, { "epoch": 0.025570776255707764, "grad_norm": 39.77434158325195, "learning_rate": 2.465753424657534e-07, "loss": 0.3381, "step": 28 }, { "epoch": 0.026484018264840183, "grad_norm": 13.793807983398438, "learning_rate": 2.557077625570777e-07, "loss": 0.1528, "step": 29 }, { "epoch": 0.0273972602739726, "grad_norm": 87.62931823730469, "learning_rate": 2.648401826484018e-07, "loss": 5.7142, "step": 30 }, { "epoch": 0.028310502283105023, "grad_norm": 21.584718704223633, "learning_rate": 2.73972602739726e-07, "loss": 0.1953, "step": 31 }, { "epoch": 0.029223744292237442, "grad_norm": 19.3806209564209, "learning_rate": 2.831050228310503e-07, "loss": 0.152, "step": 32 }, { "epoch": 0.030136986301369864, "grad_norm": 72.74107360839844, "learning_rate": 2.922374429223744e-07, "loss": 2.4482, "step": 33 }, { "epoch": 0.031050228310502283, "grad_norm": 7.332827568054199, "learning_rate": 3.013698630136987e-07, "loss": 0.0549, "step": 34 }, { "epoch": 0.0319634703196347, "grad_norm": 24.998149871826172, "learning_rate": 3.105022831050229e-07, "loss": 0.1978, "step": 35 }, { "epoch": 0.03287671232876712, "grad_norm": 93.74128723144531, "learning_rate": 3.19634703196347e-07, "loss": 1.1507, "step": 36 }, { "epoch": 0.033789954337899546, "grad_norm": 17.981266021728516, "learning_rate": 3.2876712328767123e-07, "loss": 0.1842, "step": 37 }, { "epoch": 0.034703196347031964, "grad_norm": 87.39964294433594, "learning_rate": 3.378995433789955e-07, "loss": 3.8143, "step": 38 }, { "epoch": 0.03561643835616438, "grad_norm": 102.60655975341797, "learning_rate": 3.470319634703197e-07, "loss": 2.3436, "step": 39 }, { "epoch": 0.0365296803652968, "grad_norm": 123.80463409423828, "learning_rate": 3.561643835616439e-07, "loss": 1.2547, "step": 40 }, { "epoch": 0.03744292237442922, "grad_norm": 57.686832427978516, "learning_rate": 3.6529680365296803e-07, "loss": 1.1322, "step": 41 }, { "epoch": 0.038356164383561646, "grad_norm": 73.86532592773438, "learning_rate": 3.7442922374429223e-07, "loss": 2.7643, "step": 42 }, { "epoch": 0.039269406392694065, "grad_norm": 56.60433578491211, "learning_rate": 3.835616438356165e-07, "loss": 1.042, "step": 43 }, { "epoch": 0.04018264840182648, "grad_norm": 39.59470748901367, "learning_rate": 3.926940639269407e-07, "loss": 0.3863, "step": 44 }, { "epoch": 0.0410958904109589, "grad_norm": 59.53639602661133, "learning_rate": 4.018264840182649e-07, "loss": 0.845, "step": 45 }, { "epoch": 0.04200913242009133, "grad_norm": 24.52463150024414, "learning_rate": 4.1095890410958903e-07, "loss": 0.2059, "step": 46 }, { "epoch": 0.042922374429223746, "grad_norm": 76.20769500732422, "learning_rate": 4.200913242009133e-07, "loss": 1.9931, "step": 47 }, { "epoch": 0.043835616438356165, "grad_norm": 4.439702987670898, "learning_rate": 4.292237442922375e-07, "loss": 0.0335, "step": 48 }, { "epoch": 0.04474885844748858, "grad_norm": 15.920783042907715, "learning_rate": 4.383561643835617e-07, "loss": 0.2187, "step": 49 }, { "epoch": 0.045662100456621, "grad_norm": 2.664954900741577, "learning_rate": 4.474885844748859e-07, "loss": 0.0233, "step": 50 }, { "epoch": 0.04657534246575343, "grad_norm": 84.77438354492188, "learning_rate": 4.5662100456621004e-07, "loss": 2.4898, "step": 51 }, { "epoch": 0.047488584474885846, "grad_norm": 71.40901184082031, "learning_rate": 4.657534246575343e-07, "loss": 1.8702, "step": 52 }, { "epoch": 0.048401826484018265, "grad_norm": 64.62373352050781, "learning_rate": 4.748858447488585e-07, "loss": 0.9552, "step": 53 }, { "epoch": 0.049315068493150684, "grad_norm": 95.7716293334961, "learning_rate": 4.840182648401827e-07, "loss": 2.9291, "step": 54 }, { "epoch": 0.0502283105022831, "grad_norm": 23.60930061340332, "learning_rate": 4.931506849315068e-07, "loss": 0.2924, "step": 55 }, { "epoch": 0.05114155251141553, "grad_norm": 109.23299407958984, "learning_rate": 5.022831050228311e-07, "loss": 3.874, "step": 56 }, { "epoch": 0.052054794520547946, "grad_norm": 52.192447662353516, "learning_rate": 5.114155251141553e-07, "loss": 0.6127, "step": 57 }, { "epoch": 0.052968036529680365, "grad_norm": 10.945656776428223, "learning_rate": 5.205479452054795e-07, "loss": 0.0686, "step": 58 }, { "epoch": 0.053881278538812784, "grad_norm": 77.34603881835938, "learning_rate": 5.296803652968036e-07, "loss": 0.4914, "step": 59 }, { "epoch": 0.0547945205479452, "grad_norm": 32.043548583984375, "learning_rate": 5.388127853881279e-07, "loss": 0.3618, "step": 60 }, { "epoch": 0.05570776255707763, "grad_norm": 86.48470306396484, "learning_rate": 5.47945205479452e-07, "loss": 2.2013, "step": 61 }, { "epoch": 0.05662100456621005, "grad_norm": 89.89276885986328, "learning_rate": 5.570776255707763e-07, "loss": 5.8493, "step": 62 }, { "epoch": 0.057534246575342465, "grad_norm": 36.72576141357422, "learning_rate": 5.662100456621006e-07, "loss": 0.2271, "step": 63 }, { "epoch": 0.058447488584474884, "grad_norm": 40.40840148925781, "learning_rate": 5.753424657534247e-07, "loss": 0.4123, "step": 64 }, { "epoch": 0.0593607305936073, "grad_norm": 32.67231369018555, "learning_rate": 5.844748858447488e-07, "loss": 0.3074, "step": 65 }, { "epoch": 0.06027397260273973, "grad_norm": 25.564722061157227, "learning_rate": 5.936073059360731e-07, "loss": 0.2295, "step": 66 }, { "epoch": 0.06118721461187215, "grad_norm": 6.4643659591674805, "learning_rate": 6.027397260273974e-07, "loss": 0.0736, "step": 67 }, { "epoch": 0.062100456621004566, "grad_norm": 5.617248058319092, "learning_rate": 6.118721461187215e-07, "loss": 0.056, "step": 68 }, { "epoch": 0.06301369863013699, "grad_norm": 25.794353485107422, "learning_rate": 6.210045662100458e-07, "loss": 0.3036, "step": 69 }, { "epoch": 0.0639269406392694, "grad_norm": 86.64598846435547, "learning_rate": 6.3013698630137e-07, "loss": 2.9403, "step": 70 }, { "epoch": 0.06484018264840183, "grad_norm": 3.5341360569000244, "learning_rate": 6.39269406392694e-07, "loss": 0.0337, "step": 71 }, { "epoch": 0.06575342465753424, "grad_norm": 62.64170455932617, "learning_rate": 6.484018264840183e-07, "loss": 0.8546, "step": 72 }, { "epoch": 0.06666666666666667, "grad_norm": 123.21611785888672, "learning_rate": 6.575342465753425e-07, "loss": 5.5987, "step": 73 }, { "epoch": 0.06757990867579909, "grad_norm": 79.64836883544922, "learning_rate": 6.666666666666667e-07, "loss": 3.8135, "step": 74 }, { "epoch": 0.0684931506849315, "grad_norm": 59.767860412597656, "learning_rate": 6.75799086757991e-07, "loss": 0.9761, "step": 75 }, { "epoch": 0.06940639269406393, "grad_norm": 85.29195404052734, "learning_rate": 6.849315068493151e-07, "loss": 1.5345, "step": 76 }, { "epoch": 0.07031963470319634, "grad_norm": 26.23980140686035, "learning_rate": 6.940639269406394e-07, "loss": 0.2064, "step": 77 }, { "epoch": 0.07123287671232877, "grad_norm": 23.337711334228516, "learning_rate": 7.031963470319635e-07, "loss": 0.1702, "step": 78 }, { "epoch": 0.07214611872146119, "grad_norm": 101.32007598876953, "learning_rate": 7.123287671232878e-07, "loss": 2.8874, "step": 79 }, { "epoch": 0.0730593607305936, "grad_norm": 78.50829315185547, "learning_rate": 7.21461187214612e-07, "loss": 1.8447, "step": 80 }, { "epoch": 0.07397260273972603, "grad_norm": 83.6994857788086, "learning_rate": 7.305936073059361e-07, "loss": 1.8814, "step": 81 }, { "epoch": 0.07488584474885844, "grad_norm": 63.333866119384766, "learning_rate": 7.397260273972603e-07, "loss": 1.9822, "step": 82 }, { "epoch": 0.07579908675799087, "grad_norm": 62.4296989440918, "learning_rate": 7.488584474885845e-07, "loss": 1.174, "step": 83 }, { "epoch": 0.07671232876712329, "grad_norm": 34.8710823059082, "learning_rate": 7.579908675799087e-07, "loss": 0.5543, "step": 84 }, { "epoch": 0.0776255707762557, "grad_norm": 30.02129554748535, "learning_rate": 7.67123287671233e-07, "loss": 0.2558, "step": 85 }, { "epoch": 0.07853881278538813, "grad_norm": 74.21768188476562, "learning_rate": 7.762557077625571e-07, "loss": 1.9787, "step": 86 }, { "epoch": 0.07945205479452055, "grad_norm": 6.5233001708984375, "learning_rate": 7.853881278538814e-07, "loss": 0.0551, "step": 87 }, { "epoch": 0.08036529680365297, "grad_norm": 64.2131118774414, "learning_rate": 7.945205479452056e-07, "loss": 1.4303, "step": 88 }, { "epoch": 0.08127853881278539, "grad_norm": 15.942605018615723, "learning_rate": 8.036529680365298e-07, "loss": 0.1429, "step": 89 }, { "epoch": 0.0821917808219178, "grad_norm": 102.62910461425781, "learning_rate": 8.12785388127854e-07, "loss": 1.5326, "step": 90 }, { "epoch": 0.08310502283105023, "grad_norm": 104.85505676269531, "learning_rate": 8.219178082191781e-07, "loss": 1.7769, "step": 91 }, { "epoch": 0.08401826484018265, "grad_norm": 59.12873840332031, "learning_rate": 8.310502283105023e-07, "loss": 0.55, "step": 92 }, { "epoch": 0.08493150684931507, "grad_norm": 97.06475067138672, "learning_rate": 8.401826484018266e-07, "loss": 2.3773, "step": 93 }, { "epoch": 0.08584474885844749, "grad_norm": 80.14556121826172, "learning_rate": 8.493150684931507e-07, "loss": 3.0267, "step": 94 }, { "epoch": 0.0867579908675799, "grad_norm": 53.780548095703125, "learning_rate": 8.58447488584475e-07, "loss": 1.2047, "step": 95 }, { "epoch": 0.08767123287671233, "grad_norm": 33.636837005615234, "learning_rate": 8.675799086757991e-07, "loss": 0.3371, "step": 96 }, { "epoch": 0.08858447488584476, "grad_norm": 32.083946228027344, "learning_rate": 8.767123287671234e-07, "loss": 0.3724, "step": 97 }, { "epoch": 0.08949771689497717, "grad_norm": 84.13992309570312, "learning_rate": 8.858447488584476e-07, "loss": 4.9017, "step": 98 }, { "epoch": 0.09041095890410959, "grad_norm": 29.854148864746094, "learning_rate": 8.949771689497718e-07, "loss": 0.3602, "step": 99 }, { "epoch": 0.091324200913242, "grad_norm": 111.30364227294922, "learning_rate": 9.04109589041096e-07, "loss": 3.7998, "step": 100 }, { "epoch": 0.09223744292237443, "grad_norm": 21.684873580932617, "learning_rate": 9.132420091324201e-07, "loss": 0.2207, "step": 101 }, { "epoch": 0.09315068493150686, "grad_norm": 22.360368728637695, "learning_rate": 9.223744292237443e-07, "loss": 0.1026, "step": 102 }, { "epoch": 0.09406392694063927, "grad_norm": 73.77265930175781, "learning_rate": 9.315068493150686e-07, "loss": 0.9048, "step": 103 }, { "epoch": 0.09497716894977169, "grad_norm": 71.70698547363281, "learning_rate": 9.406392694063927e-07, "loss": 1.3401, "step": 104 }, { "epoch": 0.0958904109589041, "grad_norm": 28.53301429748535, "learning_rate": 9.49771689497717e-07, "loss": 0.3319, "step": 105 }, { "epoch": 0.09680365296803653, "grad_norm": 49.73991012573242, "learning_rate": 9.589041095890411e-07, "loss": 0.5882, "step": 106 }, { "epoch": 0.09771689497716896, "grad_norm": 37.02308654785156, "learning_rate": 9.680365296803654e-07, "loss": 0.4937, "step": 107 }, { "epoch": 0.09863013698630137, "grad_norm": 87.6978530883789, "learning_rate": 9.771689497716896e-07, "loss": 2.4034, "step": 108 }, { "epoch": 0.09954337899543379, "grad_norm": 13.598104476928711, "learning_rate": 9.863013698630137e-07, "loss": 0.0957, "step": 109 }, { "epoch": 0.1004566210045662, "grad_norm": 33.52530288696289, "learning_rate": 9.95433789954338e-07, "loss": 0.3637, "step": 110 }, { "epoch": 0.10136986301369863, "grad_norm": 157.69155883789062, "learning_rate": 1.0045662100456622e-06, "loss": 2.619, "step": 111 }, { "epoch": 0.10228310502283106, "grad_norm": 53.07205581665039, "learning_rate": 1.0136986301369864e-06, "loss": 0.7578, "step": 112 }, { "epoch": 0.10319634703196347, "grad_norm": 159.8686065673828, "learning_rate": 1.0228310502283107e-06, "loss": 4.0348, "step": 113 }, { "epoch": 0.10410958904109589, "grad_norm": 74.48485565185547, "learning_rate": 1.0319634703196347e-06, "loss": 1.2365, "step": 114 }, { "epoch": 0.1050228310502283, "grad_norm": 64.15182495117188, "learning_rate": 1.041095890410959e-06, "loss": 3.5729, "step": 115 }, { "epoch": 0.10593607305936073, "grad_norm": 32.23042297363281, "learning_rate": 1.050228310502283e-06, "loss": 0.1409, "step": 116 }, { "epoch": 0.10684931506849316, "grad_norm": 66.3401870727539, "learning_rate": 1.0593607305936073e-06, "loss": 2.5087, "step": 117 }, { "epoch": 0.10776255707762557, "grad_norm": 69.31356048583984, "learning_rate": 1.0684931506849318e-06, "loss": 0.9486, "step": 118 }, { "epoch": 0.108675799086758, "grad_norm": 88.50533294677734, "learning_rate": 1.0776255707762558e-06, "loss": 3.5171, "step": 119 }, { "epoch": 0.1095890410958904, "grad_norm": 72.86996459960938, "learning_rate": 1.08675799086758e-06, "loss": 1.4448, "step": 120 }, { "epoch": 0.11050228310502283, "grad_norm": 115.85166931152344, "learning_rate": 1.095890410958904e-06, "loss": 3.3339, "step": 121 }, { "epoch": 0.11141552511415526, "grad_norm": 65.395263671875, "learning_rate": 1.1050228310502283e-06, "loss": 0.5765, "step": 122 }, { "epoch": 0.11232876712328767, "grad_norm": 91.7804946899414, "learning_rate": 1.1141552511415526e-06, "loss": 3.4186, "step": 123 }, { "epoch": 0.1132420091324201, "grad_norm": 10.819440841674805, "learning_rate": 1.1232876712328769e-06, "loss": 0.1308, "step": 124 }, { "epoch": 0.1141552511415525, "grad_norm": 84.6809310913086, "learning_rate": 1.132420091324201e-06, "loss": 3.8376, "step": 125 }, { "epoch": 0.11506849315068493, "grad_norm": 51.78423309326172, "learning_rate": 1.1415525114155251e-06, "loss": 1.0381, "step": 126 }, { "epoch": 0.11598173515981736, "grad_norm": 75.30387878417969, "learning_rate": 1.1506849315068494e-06, "loss": 4.8218, "step": 127 }, { "epoch": 0.11689497716894977, "grad_norm": 117.73250579833984, "learning_rate": 1.1598173515981737e-06, "loss": 3.1005, "step": 128 }, { "epoch": 0.1178082191780822, "grad_norm": 21.198179244995117, "learning_rate": 1.1689497716894977e-06, "loss": 0.1785, "step": 129 }, { "epoch": 0.1187214611872146, "grad_norm": 84.41876983642578, "learning_rate": 1.178082191780822e-06, "loss": 1.3516, "step": 130 }, { "epoch": 0.11963470319634703, "grad_norm": 130.32968139648438, "learning_rate": 1.1872146118721462e-06, "loss": 1.4781, "step": 131 }, { "epoch": 0.12054794520547946, "grad_norm": 83.43211364746094, "learning_rate": 1.1963470319634705e-06, "loss": 1.9406, "step": 132 }, { "epoch": 0.12146118721461187, "grad_norm": 46.62077331542969, "learning_rate": 1.2054794520547947e-06, "loss": 0.5176, "step": 133 }, { "epoch": 0.1223744292237443, "grad_norm": 135.8280487060547, "learning_rate": 1.2146118721461188e-06, "loss": 6.6402, "step": 134 }, { "epoch": 0.1232876712328767, "grad_norm": 38.21268081665039, "learning_rate": 1.223744292237443e-06, "loss": 0.5988, "step": 135 }, { "epoch": 0.12420091324200913, "grad_norm": 26.021713256835938, "learning_rate": 1.2328767123287673e-06, "loss": 0.2831, "step": 136 }, { "epoch": 0.12511415525114156, "grad_norm": 94.76661682128906, "learning_rate": 1.2420091324200915e-06, "loss": 1.4254, "step": 137 }, { "epoch": 0.12602739726027398, "grad_norm": 44.89736557006836, "learning_rate": 1.2511415525114158e-06, "loss": 0.685, "step": 138 }, { "epoch": 0.12694063926940638, "grad_norm": 19.027864456176758, "learning_rate": 1.26027397260274e-06, "loss": 0.2444, "step": 139 }, { "epoch": 0.1278538812785388, "grad_norm": 7.452744007110596, "learning_rate": 1.2694063926940639e-06, "loss": 0.0721, "step": 140 }, { "epoch": 0.12876712328767123, "grad_norm": 7.1494245529174805, "learning_rate": 1.278538812785388e-06, "loss": 0.0583, "step": 141 }, { "epoch": 0.12968036529680366, "grad_norm": 8.19793701171875, "learning_rate": 1.2876712328767124e-06, "loss": 0.0869, "step": 142 }, { "epoch": 0.13059360730593608, "grad_norm": 64.73320770263672, "learning_rate": 1.2968036529680366e-06, "loss": 1.4245, "step": 143 }, { "epoch": 0.13150684931506848, "grad_norm": 14.15156364440918, "learning_rate": 1.3059360730593609e-06, "loss": 0.1468, "step": 144 }, { "epoch": 0.1324200913242009, "grad_norm": 56.9935188293457, "learning_rate": 1.315068493150685e-06, "loss": 1.687, "step": 145 }, { "epoch": 0.13333333333333333, "grad_norm": 121.81184387207031, "learning_rate": 1.3242009132420092e-06, "loss": 1.6632, "step": 146 }, { "epoch": 0.13424657534246576, "grad_norm": 72.38001251220703, "learning_rate": 1.3333333333333334e-06, "loss": 2.8886, "step": 147 }, { "epoch": 0.13515981735159818, "grad_norm": 77.1128158569336, "learning_rate": 1.3424657534246577e-06, "loss": 2.4571, "step": 148 }, { "epoch": 0.13607305936073058, "grad_norm": 49.179771423339844, "learning_rate": 1.351598173515982e-06, "loss": 0.5987, "step": 149 }, { "epoch": 0.136986301369863, "grad_norm": 101.3380126953125, "learning_rate": 1.360730593607306e-06, "loss": 2.1308, "step": 150 }, { "epoch": 0.13789954337899543, "grad_norm": 37.85033416748047, "learning_rate": 1.3698630136986302e-06, "loss": 0.3084, "step": 151 }, { "epoch": 0.13881278538812786, "grad_norm": 10.217961311340332, "learning_rate": 1.3789954337899545e-06, "loss": 0.1193, "step": 152 }, { "epoch": 0.13972602739726028, "grad_norm": 74.41902923583984, "learning_rate": 1.3881278538812787e-06, "loss": 1.2972, "step": 153 }, { "epoch": 0.14063926940639268, "grad_norm": 13.571035385131836, "learning_rate": 1.397260273972603e-06, "loss": 0.1028, "step": 154 }, { "epoch": 0.1415525114155251, "grad_norm": 27.69960594177246, "learning_rate": 1.406392694063927e-06, "loss": 0.3179, "step": 155 }, { "epoch": 0.14246575342465753, "grad_norm": 28.064054489135742, "learning_rate": 1.4155251141552513e-06, "loss": 0.3776, "step": 156 }, { "epoch": 0.14337899543378996, "grad_norm": 79.07239532470703, "learning_rate": 1.4246575342465755e-06, "loss": 1.8641, "step": 157 }, { "epoch": 0.14429223744292238, "grad_norm": 48.881351470947266, "learning_rate": 1.4337899543378998e-06, "loss": 0.6451, "step": 158 }, { "epoch": 0.14520547945205478, "grad_norm": 64.86824035644531, "learning_rate": 1.442922374429224e-06, "loss": 1.2368, "step": 159 }, { "epoch": 0.1461187214611872, "grad_norm": 58.14299011230469, "learning_rate": 1.4520547945205479e-06, "loss": 0.6846, "step": 160 }, { "epoch": 0.14703196347031963, "grad_norm": 31.831552505493164, "learning_rate": 1.4611872146118721e-06, "loss": 0.2992, "step": 161 }, { "epoch": 0.14794520547945206, "grad_norm": 0.5487489700317383, "learning_rate": 1.4703196347031964e-06, "loss": 0.0055, "step": 162 }, { "epoch": 0.14885844748858448, "grad_norm": 43.98978805541992, "learning_rate": 1.4794520547945206e-06, "loss": 0.5457, "step": 163 }, { "epoch": 0.14977168949771688, "grad_norm": 63.15679931640625, "learning_rate": 1.4885844748858449e-06, "loss": 0.9663, "step": 164 }, { "epoch": 0.1506849315068493, "grad_norm": 170.94093322753906, "learning_rate": 1.497716894977169e-06, "loss": 3.9086, "step": 165 }, { "epoch": 0.15159817351598173, "grad_norm": 50.26017379760742, "learning_rate": 1.5068493150684932e-06, "loss": 0.2552, "step": 166 }, { "epoch": 0.15251141552511416, "grad_norm": 87.70808410644531, "learning_rate": 1.5159817351598174e-06, "loss": 3.3637, "step": 167 }, { "epoch": 0.15342465753424658, "grad_norm": 18.71607208251953, "learning_rate": 1.5251141552511417e-06, "loss": 0.2115, "step": 168 }, { "epoch": 0.154337899543379, "grad_norm": 81.34317779541016, "learning_rate": 1.534246575342466e-06, "loss": 2.7696, "step": 169 }, { "epoch": 0.1552511415525114, "grad_norm": 65.98477172851562, "learning_rate": 1.5433789954337902e-06, "loss": 4.0204, "step": 170 }, { "epoch": 0.15616438356164383, "grad_norm": 62.04148864746094, "learning_rate": 1.5525114155251142e-06, "loss": 0.6781, "step": 171 }, { "epoch": 0.15707762557077626, "grad_norm": 32.48359298706055, "learning_rate": 1.5616438356164385e-06, "loss": 0.543, "step": 172 }, { "epoch": 0.15799086757990868, "grad_norm": 9.923048973083496, "learning_rate": 1.5707762557077627e-06, "loss": 0.0791, "step": 173 }, { "epoch": 0.1589041095890411, "grad_norm": 45.146881103515625, "learning_rate": 1.579908675799087e-06, "loss": 0.639, "step": 174 }, { "epoch": 0.1598173515981735, "grad_norm": 54.33723068237305, "learning_rate": 1.5890410958904112e-06, "loss": 0.6016, "step": 175 }, { "epoch": 0.16073059360730593, "grad_norm": 91.33534240722656, "learning_rate": 1.5981735159817353e-06, "loss": 2.1822, "step": 176 }, { "epoch": 0.16164383561643836, "grad_norm": 62.61236572265625, "learning_rate": 1.6073059360730595e-06, "loss": 2.6588, "step": 177 }, { "epoch": 0.16255707762557078, "grad_norm": 96.92342376708984, "learning_rate": 1.6164383561643838e-06, "loss": 3.5204, "step": 178 }, { "epoch": 0.1634703196347032, "grad_norm": 71.39334869384766, "learning_rate": 1.625570776255708e-06, "loss": 1.0842, "step": 179 }, { "epoch": 0.1643835616438356, "grad_norm": 5.3123602867126465, "learning_rate": 1.6347031963470323e-06, "loss": 0.044, "step": 180 }, { "epoch": 0.16529680365296803, "grad_norm": 31.091522216796875, "learning_rate": 1.6438356164383561e-06, "loss": 0.4018, "step": 181 }, { "epoch": 0.16621004566210046, "grad_norm": 78.97769927978516, "learning_rate": 1.6529680365296804e-06, "loss": 1.777, "step": 182 }, { "epoch": 0.16712328767123288, "grad_norm": 45.31652069091797, "learning_rate": 1.6621004566210046e-06, "loss": 0.5627, "step": 183 }, { "epoch": 0.1680365296803653, "grad_norm": 46.89043045043945, "learning_rate": 1.671232876712329e-06, "loss": 0.5218, "step": 184 }, { "epoch": 0.1689497716894977, "grad_norm": 4.860301494598389, "learning_rate": 1.6803652968036531e-06, "loss": 0.0576, "step": 185 }, { "epoch": 0.16986301369863013, "grad_norm": 24.04788589477539, "learning_rate": 1.6894977168949772e-06, "loss": 0.1675, "step": 186 }, { "epoch": 0.17077625570776256, "grad_norm": 48.823036193847656, "learning_rate": 1.6986301369863014e-06, "loss": 0.623, "step": 187 }, { "epoch": 0.17168949771689498, "grad_norm": 85.97288513183594, "learning_rate": 1.7077625570776257e-06, "loss": 0.9011, "step": 188 }, { "epoch": 0.1726027397260274, "grad_norm": 16.192712783813477, "learning_rate": 1.71689497716895e-06, "loss": 0.188, "step": 189 }, { "epoch": 0.1735159817351598, "grad_norm": 29.790760040283203, "learning_rate": 1.7260273972602742e-06, "loss": 0.3554, "step": 190 }, { "epoch": 0.17442922374429223, "grad_norm": 9.308028221130371, "learning_rate": 1.7351598173515982e-06, "loss": 0.0836, "step": 191 }, { "epoch": 0.17534246575342466, "grad_norm": 10.808247566223145, "learning_rate": 1.7442922374429225e-06, "loss": 0.1257, "step": 192 }, { "epoch": 0.17625570776255708, "grad_norm": 80.31394958496094, "learning_rate": 1.7534246575342468e-06, "loss": 1.0496, "step": 193 }, { "epoch": 0.1771689497716895, "grad_norm": 74.02754211425781, "learning_rate": 1.762557077625571e-06, "loss": 1.8277, "step": 194 }, { "epoch": 0.1780821917808219, "grad_norm": 13.256115913391113, "learning_rate": 1.7716894977168953e-06, "loss": 0.1288, "step": 195 }, { "epoch": 0.17899543378995433, "grad_norm": 62.32326126098633, "learning_rate": 1.7808219178082193e-06, "loss": 1.0268, "step": 196 }, { "epoch": 0.17990867579908676, "grad_norm": 120.84483337402344, "learning_rate": 1.7899543378995436e-06, "loss": 1.5425, "step": 197 }, { "epoch": 0.18082191780821918, "grad_norm": 89.96076965332031, "learning_rate": 1.7990867579908678e-06, "loss": 3.9099, "step": 198 }, { "epoch": 0.1817351598173516, "grad_norm": 3.865832805633545, "learning_rate": 1.808219178082192e-06, "loss": 0.0413, "step": 199 }, { "epoch": 0.182648401826484, "grad_norm": 82.54876708984375, "learning_rate": 1.8173515981735163e-06, "loss": 1.97, "step": 200 }, { "epoch": 0.18356164383561643, "grad_norm": 40.990108489990234, "learning_rate": 1.8264840182648401e-06, "loss": 0.7802, "step": 201 }, { "epoch": 0.18447488584474886, "grad_norm": 48.20011901855469, "learning_rate": 1.8356164383561644e-06, "loss": 0.5734, "step": 202 }, { "epoch": 0.18538812785388128, "grad_norm": 98.79656982421875, "learning_rate": 1.8447488584474887e-06, "loss": 1.5952, "step": 203 }, { "epoch": 0.1863013698630137, "grad_norm": 41.95235824584961, "learning_rate": 1.853881278538813e-06, "loss": 0.4139, "step": 204 }, { "epoch": 0.1872146118721461, "grad_norm": 48.992549896240234, "learning_rate": 1.8630136986301372e-06, "loss": 0.5863, "step": 205 }, { "epoch": 0.18812785388127853, "grad_norm": 82.46612548828125, "learning_rate": 1.8721461187214612e-06, "loss": 3.8998, "step": 206 }, { "epoch": 0.18904109589041096, "grad_norm": 63.1313362121582, "learning_rate": 1.8812785388127855e-06, "loss": 0.66, "step": 207 }, { "epoch": 0.18995433789954339, "grad_norm": 60.10609817504883, "learning_rate": 1.8904109589041097e-06, "loss": 1.2861, "step": 208 }, { "epoch": 0.1908675799086758, "grad_norm": 55.839229583740234, "learning_rate": 1.899543378995434e-06, "loss": 0.4086, "step": 209 }, { "epoch": 0.1917808219178082, "grad_norm": 98.7811279296875, "learning_rate": 1.9086757990867582e-06, "loss": 1.5894, "step": 210 }, { "epoch": 0.19269406392694063, "grad_norm": 66.01945495605469, "learning_rate": 1.9178082191780823e-06, "loss": 1.2382, "step": 211 }, { "epoch": 0.19360730593607306, "grad_norm": 108.2985610961914, "learning_rate": 1.9269406392694063e-06, "loss": 1.2299, "step": 212 }, { "epoch": 0.19452054794520549, "grad_norm": 123.5262680053711, "learning_rate": 1.9360730593607308e-06, "loss": 2.2437, "step": 213 }, { "epoch": 0.1954337899543379, "grad_norm": 54.53616714477539, "learning_rate": 1.945205479452055e-06, "loss": 0.6528, "step": 214 }, { "epoch": 0.1963470319634703, "grad_norm": 60.27893829345703, "learning_rate": 1.9543378995433793e-06, "loss": 1.032, "step": 215 }, { "epoch": 0.19726027397260273, "grad_norm": 15.57522964477539, "learning_rate": 1.9634703196347033e-06, "loss": 0.1362, "step": 216 }, { "epoch": 0.19817351598173516, "grad_norm": 34.583194732666016, "learning_rate": 1.9726027397260274e-06, "loss": 0.3173, "step": 217 }, { "epoch": 0.19908675799086759, "grad_norm": 68.4352798461914, "learning_rate": 1.981735159817352e-06, "loss": 0.9722, "step": 218 }, { "epoch": 0.2, "grad_norm": 13.38995361328125, "learning_rate": 1.990867579908676e-06, "loss": 0.1574, "step": 219 }, { "epoch": 0.2009132420091324, "grad_norm": 90.0170669555664, "learning_rate": 2.0000000000000003e-06, "loss": 2.1027, "step": 220 }, { "epoch": 0.20182648401826483, "grad_norm": 43.48049545288086, "learning_rate": 2.0091324200913244e-06, "loss": 1.062, "step": 221 }, { "epoch": 0.20273972602739726, "grad_norm": 12.060029983520508, "learning_rate": 2.0182648401826484e-06, "loss": 0.1165, "step": 222 }, { "epoch": 0.20365296803652969, "grad_norm": 24.98027801513672, "learning_rate": 2.027397260273973e-06, "loss": 0.2872, "step": 223 }, { "epoch": 0.2045662100456621, "grad_norm": 44.939029693603516, "learning_rate": 2.036529680365297e-06, "loss": 0.5725, "step": 224 }, { "epoch": 0.2054794520547945, "grad_norm": 37.89668655395508, "learning_rate": 2.0456621004566214e-06, "loss": 0.5382, "step": 225 }, { "epoch": 0.20639269406392693, "grad_norm": 115.38385772705078, "learning_rate": 2.0547945205479454e-06, "loss": 4.1085, "step": 226 }, { "epoch": 0.20730593607305936, "grad_norm": 8.606084823608398, "learning_rate": 2.0639269406392695e-06, "loss": 0.0942, "step": 227 }, { "epoch": 0.20821917808219179, "grad_norm": 29.8278865814209, "learning_rate": 2.073059360730594e-06, "loss": 0.2022, "step": 228 }, { "epoch": 0.2091324200913242, "grad_norm": 7.333195209503174, "learning_rate": 2.082191780821918e-06, "loss": 0.0688, "step": 229 }, { "epoch": 0.2100456621004566, "grad_norm": 4.891754150390625, "learning_rate": 2.0913242009132424e-06, "loss": 0.0391, "step": 230 }, { "epoch": 0.21095890410958903, "grad_norm": 72.65245819091797, "learning_rate": 2.100456621004566e-06, "loss": 1.5981, "step": 231 }, { "epoch": 0.21187214611872146, "grad_norm": 96.19274139404297, "learning_rate": 2.1095890410958905e-06, "loss": 3.2431, "step": 232 }, { "epoch": 0.21278538812785389, "grad_norm": 7.461207389831543, "learning_rate": 2.1187214611872146e-06, "loss": 0.0786, "step": 233 }, { "epoch": 0.2136986301369863, "grad_norm": 5.991075038909912, "learning_rate": 2.127853881278539e-06, "loss": 0.0739, "step": 234 }, { "epoch": 0.2146118721461187, "grad_norm": 32.752906799316406, "learning_rate": 2.1369863013698635e-06, "loss": 0.5735, "step": 235 }, { "epoch": 0.21552511415525114, "grad_norm": 66.76049041748047, "learning_rate": 2.146118721461187e-06, "loss": 1.9773, "step": 236 }, { "epoch": 0.21643835616438356, "grad_norm": 72.06596374511719, "learning_rate": 2.1552511415525116e-06, "loss": 1.0181, "step": 237 }, { "epoch": 0.217351598173516, "grad_norm": 4.452163219451904, "learning_rate": 2.1643835616438356e-06, "loss": 0.0386, "step": 238 }, { "epoch": 0.2182648401826484, "grad_norm": 43.42335891723633, "learning_rate": 2.17351598173516e-06, "loss": 0.6211, "step": 239 }, { "epoch": 0.2191780821917808, "grad_norm": 100.93522644042969, "learning_rate": 2.182648401826484e-06, "loss": 1.403, "step": 240 }, { "epoch": 0.22009132420091324, "grad_norm": 38.22095489501953, "learning_rate": 2.191780821917808e-06, "loss": 0.3556, "step": 241 }, { "epoch": 0.22100456621004566, "grad_norm": 49.57134246826172, "learning_rate": 2.2009132420091326e-06, "loss": 0.6796, "step": 242 }, { "epoch": 0.2219178082191781, "grad_norm": 15.464509963989258, "learning_rate": 2.2100456621004567e-06, "loss": 0.1916, "step": 243 }, { "epoch": 0.2228310502283105, "grad_norm": 108.40855407714844, "learning_rate": 2.219178082191781e-06, "loss": 2.0656, "step": 244 }, { "epoch": 0.2237442922374429, "grad_norm": 53.5400390625, "learning_rate": 2.228310502283105e-06, "loss": 0.9957, "step": 245 }, { "epoch": 0.22465753424657534, "grad_norm": 138.486572265625, "learning_rate": 2.2374429223744292e-06, "loss": 0.4316, "step": 246 }, { "epoch": 0.22557077625570776, "grad_norm": 74.4454116821289, "learning_rate": 2.2465753424657537e-06, "loss": 0.69, "step": 247 }, { "epoch": 0.2264840182648402, "grad_norm": 18.25001335144043, "learning_rate": 2.2557077625570777e-06, "loss": 0.2543, "step": 248 }, { "epoch": 0.2273972602739726, "grad_norm": 10.366548538208008, "learning_rate": 2.264840182648402e-06, "loss": 0.1036, "step": 249 }, { "epoch": 0.228310502283105, "grad_norm": 94.80410766601562, "learning_rate": 2.2739726027397262e-06, "loss": 2.5502, "step": 250 }, { "epoch": 0.22922374429223744, "grad_norm": 4.967508792877197, "learning_rate": 2.2831050228310503e-06, "loss": 0.0402, "step": 251 }, { "epoch": 0.23013698630136986, "grad_norm": 55.20021438598633, "learning_rate": 2.2922374429223748e-06, "loss": 1.1264, "step": 252 }, { "epoch": 0.2310502283105023, "grad_norm": 88.09471130371094, "learning_rate": 2.301369863013699e-06, "loss": 0.8187, "step": 253 }, { "epoch": 0.2319634703196347, "grad_norm": 7.521374225616455, "learning_rate": 2.3105022831050233e-06, "loss": 0.085, "step": 254 }, { "epoch": 0.2328767123287671, "grad_norm": 27.76314926147461, "learning_rate": 2.3196347031963473e-06, "loss": 0.2384, "step": 255 }, { "epoch": 0.23378995433789954, "grad_norm": 110.51651000976562, "learning_rate": 2.3287671232876713e-06, "loss": 2.2553, "step": 256 }, { "epoch": 0.23470319634703196, "grad_norm": 18.509052276611328, "learning_rate": 2.3378995433789954e-06, "loss": 0.1816, "step": 257 }, { "epoch": 0.2356164383561644, "grad_norm": 74.37081909179688, "learning_rate": 2.34703196347032e-06, "loss": 0.5193, "step": 258 }, { "epoch": 0.2365296803652968, "grad_norm": 130.0029296875, "learning_rate": 2.356164383561644e-06, "loss": 2.7636, "step": 259 }, { "epoch": 0.2374429223744292, "grad_norm": 35.65701675415039, "learning_rate": 2.3652968036529684e-06, "loss": 0.5299, "step": 260 }, { "epoch": 0.23835616438356164, "grad_norm": 40.69527816772461, "learning_rate": 2.3744292237442924e-06, "loss": 0.7395, "step": 261 }, { "epoch": 0.23926940639269406, "grad_norm": 57.1495361328125, "learning_rate": 2.3835616438356164e-06, "loss": 1.2052, "step": 262 }, { "epoch": 0.2401826484018265, "grad_norm": 29.589590072631836, "learning_rate": 2.392694063926941e-06, "loss": 0.2916, "step": 263 }, { "epoch": 0.2410958904109589, "grad_norm": 31.95876693725586, "learning_rate": 2.401826484018265e-06, "loss": 0.3554, "step": 264 }, { "epoch": 0.2420091324200913, "grad_norm": 80.69293212890625, "learning_rate": 2.4109589041095894e-06, "loss": 2.3611, "step": 265 }, { "epoch": 0.24292237442922374, "grad_norm": 31.826501846313477, "learning_rate": 2.4200913242009135e-06, "loss": 0.3796, "step": 266 }, { "epoch": 0.24383561643835616, "grad_norm": 74.19340515136719, "learning_rate": 2.4292237442922375e-06, "loss": 1.0953, "step": 267 }, { "epoch": 0.2447488584474886, "grad_norm": 107.5763168334961, "learning_rate": 2.438356164383562e-06, "loss": 3.7586, "step": 268 }, { "epoch": 0.245662100456621, "grad_norm": 73.18709564208984, "learning_rate": 2.447488584474886e-06, "loss": 1.3936, "step": 269 }, { "epoch": 0.2465753424657534, "grad_norm": 97.62149047851562, "learning_rate": 2.4566210045662105e-06, "loss": 2.8545, "step": 270 }, { "epoch": 0.24748858447488584, "grad_norm": 87.21404266357422, "learning_rate": 2.4657534246575345e-06, "loss": 2.7008, "step": 271 }, { "epoch": 0.24840182648401826, "grad_norm": 37.04034423828125, "learning_rate": 2.4748858447488586e-06, "loss": 0.4386, "step": 272 }, { "epoch": 0.2493150684931507, "grad_norm": 4.715449810028076, "learning_rate": 2.484018264840183e-06, "loss": 0.055, "step": 273 }, { "epoch": 0.2502283105022831, "grad_norm": 63.824398040771484, "learning_rate": 2.493150684931507e-06, "loss": 1.2092, "step": 274 }, { "epoch": 0.2511415525114155, "grad_norm": 44.78385543823242, "learning_rate": 2.5022831050228315e-06, "loss": 0.5602, "step": 275 }, { "epoch": 0.25205479452054796, "grad_norm": 26.599531173706055, "learning_rate": 2.511415525114155e-06, "loss": 0.2662, "step": 276 }, { "epoch": 0.25296803652968036, "grad_norm": 298.20269775390625, "learning_rate": 2.52054794520548e-06, "loss": 1.2789, "step": 277 }, { "epoch": 0.25388127853881276, "grad_norm": 61.410606384277344, "learning_rate": 2.5296803652968037e-06, "loss": 1.7467, "step": 278 }, { "epoch": 0.2547945205479452, "grad_norm": 7.025510311126709, "learning_rate": 2.5388127853881277e-06, "loss": 0.0566, "step": 279 }, { "epoch": 0.2557077625570776, "grad_norm": 5.008991718292236, "learning_rate": 2.547945205479452e-06, "loss": 0.0537, "step": 280 }, { "epoch": 0.25662100456621006, "grad_norm": 39.362709045410156, "learning_rate": 2.557077625570776e-06, "loss": 1.0329, "step": 281 }, { "epoch": 0.25753424657534246, "grad_norm": 13.461565017700195, "learning_rate": 2.5662100456621007e-06, "loss": 0.0581, "step": 282 }, { "epoch": 0.25844748858447486, "grad_norm": 35.72615051269531, "learning_rate": 2.5753424657534247e-06, "loss": 0.2856, "step": 283 }, { "epoch": 0.2593607305936073, "grad_norm": 144.56704711914062, "learning_rate": 2.5844748858447488e-06, "loss": 1.6042, "step": 284 }, { "epoch": 0.2602739726027397, "grad_norm": 39.69789505004883, "learning_rate": 2.5936073059360732e-06, "loss": 0.6375, "step": 285 }, { "epoch": 0.26118721461187216, "grad_norm": 84.33684539794922, "learning_rate": 2.6027397260273973e-06, "loss": 1.8961, "step": 286 }, { "epoch": 0.26210045662100456, "grad_norm": 24.499244689941406, "learning_rate": 2.6118721461187217e-06, "loss": 0.2799, "step": 287 }, { "epoch": 0.26301369863013696, "grad_norm": 20.77275848388672, "learning_rate": 2.6210045662100458e-06, "loss": 0.142, "step": 288 }, { "epoch": 0.2639269406392694, "grad_norm": 67.01052856445312, "learning_rate": 2.63013698630137e-06, "loss": 2.124, "step": 289 }, { "epoch": 0.2648401826484018, "grad_norm": 90.5927505493164, "learning_rate": 2.6392694063926943e-06, "loss": 2.7075, "step": 290 }, { "epoch": 0.26575342465753427, "grad_norm": 13.948375701904297, "learning_rate": 2.6484018264840183e-06, "loss": 0.2668, "step": 291 }, { "epoch": 0.26666666666666666, "grad_norm": 6.923635959625244, "learning_rate": 2.6575342465753428e-06, "loss": 0.0519, "step": 292 }, { "epoch": 0.26757990867579906, "grad_norm": 79.83607482910156, "learning_rate": 2.666666666666667e-06, "loss": 0.9281, "step": 293 }, { "epoch": 0.2684931506849315, "grad_norm": 4.34301233291626, "learning_rate": 2.675799086757991e-06, "loss": 0.0384, "step": 294 }, { "epoch": 0.2694063926940639, "grad_norm": 29.429771423339844, "learning_rate": 2.6849315068493153e-06, "loss": 0.2962, "step": 295 }, { "epoch": 0.27031963470319637, "grad_norm": 0.7147941589355469, "learning_rate": 2.6940639269406394e-06, "loss": 0.0077, "step": 296 }, { "epoch": 0.27123287671232876, "grad_norm": 39.406612396240234, "learning_rate": 2.703196347031964e-06, "loss": 0.3665, "step": 297 }, { "epoch": 0.27214611872146116, "grad_norm": 46.39921188354492, "learning_rate": 2.712328767123288e-06, "loss": 0.8788, "step": 298 }, { "epoch": 0.2730593607305936, "grad_norm": 42.184757232666016, "learning_rate": 2.721461187214612e-06, "loss": 0.5775, "step": 299 }, { "epoch": 0.273972602739726, "grad_norm": 25.515316009521484, "learning_rate": 2.7305936073059364e-06, "loss": 0.1749, "step": 300 }, { "epoch": 0.27488584474885847, "grad_norm": 28.36213493347168, "learning_rate": 2.7397260273972604e-06, "loss": 0.1875, "step": 301 }, { "epoch": 0.27579908675799086, "grad_norm": 27.317750930786133, "learning_rate": 2.748858447488585e-06, "loss": 0.2458, "step": 302 }, { "epoch": 0.27671232876712326, "grad_norm": 107.00208282470703, "learning_rate": 2.757990867579909e-06, "loss": 1.1319, "step": 303 }, { "epoch": 0.2776255707762557, "grad_norm": 22.468372344970703, "learning_rate": 2.767123287671233e-06, "loss": 0.1515, "step": 304 }, { "epoch": 0.2785388127853881, "grad_norm": 6.944116592407227, "learning_rate": 2.7762557077625574e-06, "loss": 0.0726, "step": 305 }, { "epoch": 0.27945205479452057, "grad_norm": 94.0119857788086, "learning_rate": 2.7853881278538815e-06, "loss": 2.509, "step": 306 }, { "epoch": 0.28036529680365296, "grad_norm": 2.0173680782318115, "learning_rate": 2.794520547945206e-06, "loss": 0.0192, "step": 307 }, { "epoch": 0.28127853881278536, "grad_norm": 94.66632080078125, "learning_rate": 2.80365296803653e-06, "loss": 0.8711, "step": 308 }, { "epoch": 0.2821917808219178, "grad_norm": 16.232744216918945, "learning_rate": 2.812785388127854e-06, "loss": 0.1906, "step": 309 }, { "epoch": 0.2831050228310502, "grad_norm": 10.187146186828613, "learning_rate": 2.8219178082191785e-06, "loss": 0.1073, "step": 310 }, { "epoch": 0.28401826484018267, "grad_norm": 12.72319221496582, "learning_rate": 2.8310502283105025e-06, "loss": 0.145, "step": 311 }, { "epoch": 0.28493150684931506, "grad_norm": 73.50216674804688, "learning_rate": 2.840182648401827e-06, "loss": 1.8729, "step": 312 }, { "epoch": 0.28584474885844746, "grad_norm": 3.024461030960083, "learning_rate": 2.849315068493151e-06, "loss": 0.0342, "step": 313 }, { "epoch": 0.2867579908675799, "grad_norm": 30.972646713256836, "learning_rate": 2.8584474885844747e-06, "loss": 0.1213, "step": 314 }, { "epoch": 0.2876712328767123, "grad_norm": 54.240966796875, "learning_rate": 2.8675799086757996e-06, "loss": 0.994, "step": 315 }, { "epoch": 0.28858447488584477, "grad_norm": 73.1288833618164, "learning_rate": 2.876712328767123e-06, "loss": 2.1375, "step": 316 }, { "epoch": 0.28949771689497716, "grad_norm": 54.13544464111328, "learning_rate": 2.885844748858448e-06, "loss": 0.8222, "step": 317 }, { "epoch": 0.29041095890410956, "grad_norm": 152.84681701660156, "learning_rate": 2.8949771689497717e-06, "loss": 6.5166, "step": 318 }, { "epoch": 0.291324200913242, "grad_norm": 55.39650344848633, "learning_rate": 2.9041095890410957e-06, "loss": 0.6333, "step": 319 }, { "epoch": 0.2922374429223744, "grad_norm": 55.22079849243164, "learning_rate": 2.91324200913242e-06, "loss": 0.5813, "step": 320 }, { "epoch": 0.29315068493150687, "grad_norm": 104.08516693115234, "learning_rate": 2.9223744292237442e-06, "loss": 1.4867, "step": 321 }, { "epoch": 0.29406392694063926, "grad_norm": 75.28111267089844, "learning_rate": 2.9315068493150687e-06, "loss": 2.8383, "step": 322 }, { "epoch": 0.29497716894977166, "grad_norm": 27.597333908081055, "learning_rate": 2.9406392694063927e-06, "loss": 0.3835, "step": 323 }, { "epoch": 0.2958904109589041, "grad_norm": 81.01739501953125, "learning_rate": 2.9497716894977168e-06, "loss": 1.1734, "step": 324 }, { "epoch": 0.2968036529680365, "grad_norm": 16.55040168762207, "learning_rate": 2.9589041095890413e-06, "loss": 0.1289, "step": 325 }, { "epoch": 0.29771689497716897, "grad_norm": 49.57678985595703, "learning_rate": 2.9680365296803653e-06, "loss": 0.8088, "step": 326 }, { "epoch": 0.29863013698630136, "grad_norm": 19.920291900634766, "learning_rate": 2.9771689497716898e-06, "loss": 0.148, "step": 327 }, { "epoch": 0.29954337899543376, "grad_norm": 234.65785217285156, "learning_rate": 2.986301369863014e-06, "loss": 2.2054, "step": 328 }, { "epoch": 0.3004566210045662, "grad_norm": 20.0113582611084, "learning_rate": 2.995433789954338e-06, "loss": 0.2417, "step": 329 }, { "epoch": 0.3013698630136986, "grad_norm": 152.71205139160156, "learning_rate": 3.0045662100456623e-06, "loss": 1.6908, "step": 330 }, { "epoch": 0.30228310502283107, "grad_norm": 100.44230651855469, "learning_rate": 3.0136986301369864e-06, "loss": 1.5969, "step": 331 }, { "epoch": 0.30319634703196346, "grad_norm": 97.4519271850586, "learning_rate": 3.022831050228311e-06, "loss": 3.0538, "step": 332 }, { "epoch": 0.3041095890410959, "grad_norm": 8.2625732421875, "learning_rate": 3.031963470319635e-06, "loss": 0.1038, "step": 333 }, { "epoch": 0.3050228310502283, "grad_norm": 52.021427154541016, "learning_rate": 3.0410958904109593e-06, "loss": 0.6112, "step": 334 }, { "epoch": 0.3059360730593607, "grad_norm": 109.416015625, "learning_rate": 3.0502283105022834e-06, "loss": 0.7545, "step": 335 }, { "epoch": 0.30684931506849317, "grad_norm": 32.47154998779297, "learning_rate": 3.0593607305936074e-06, "loss": 0.4285, "step": 336 }, { "epoch": 0.30776255707762556, "grad_norm": 2.595451831817627, "learning_rate": 3.068493150684932e-06, "loss": 0.0155, "step": 337 }, { "epoch": 0.308675799086758, "grad_norm": 3.6003928184509277, "learning_rate": 3.077625570776256e-06, "loss": 0.0221, "step": 338 }, { "epoch": 0.3095890410958904, "grad_norm": 41.01164245605469, "learning_rate": 3.0867579908675804e-06, "loss": 0.4932, "step": 339 }, { "epoch": 0.3105022831050228, "grad_norm": 4.865055084228516, "learning_rate": 3.0958904109589044e-06, "loss": 0.0423, "step": 340 }, { "epoch": 0.31141552511415527, "grad_norm": 87.6878433227539, "learning_rate": 3.1050228310502285e-06, "loss": 4.9514, "step": 341 }, { "epoch": 0.31232876712328766, "grad_norm": 3.847578287124634, "learning_rate": 3.114155251141553e-06, "loss": 0.0369, "step": 342 }, { "epoch": 0.3132420091324201, "grad_norm": 2.0997073650360107, "learning_rate": 3.123287671232877e-06, "loss": 0.0115, "step": 343 }, { "epoch": 0.3141552511415525, "grad_norm": 2.2623438835144043, "learning_rate": 3.1324200913242014e-06, "loss": 0.0245, "step": 344 }, { "epoch": 0.3150684931506849, "grad_norm": 115.09259033203125, "learning_rate": 3.1415525114155255e-06, "loss": 2.3524, "step": 345 }, { "epoch": 0.31598173515981737, "grad_norm": 31.150346755981445, "learning_rate": 3.1506849315068495e-06, "loss": 0.301, "step": 346 }, { "epoch": 0.31689497716894977, "grad_norm": 7.046755313873291, "learning_rate": 3.159817351598174e-06, "loss": 0.0489, "step": 347 }, { "epoch": 0.3178082191780822, "grad_norm": 64.1182861328125, "learning_rate": 3.168949771689498e-06, "loss": 1.0005, "step": 348 }, { "epoch": 0.3187214611872146, "grad_norm": 57.80517578125, "learning_rate": 3.1780821917808225e-06, "loss": 0.6877, "step": 349 }, { "epoch": 0.319634703196347, "grad_norm": 82.0582275390625, "learning_rate": 3.1872146118721465e-06, "loss": 1.6082, "step": 350 }, { "epoch": 0.32054794520547947, "grad_norm": 2.1384828090667725, "learning_rate": 3.1963470319634706e-06, "loss": 0.0147, "step": 351 }, { "epoch": 0.32146118721461187, "grad_norm": 35.577266693115234, "learning_rate": 3.205479452054795e-06, "loss": 0.3443, "step": 352 }, { "epoch": 0.3223744292237443, "grad_norm": 50.61724090576172, "learning_rate": 3.214611872146119e-06, "loss": 0.5292, "step": 353 }, { "epoch": 0.3232876712328767, "grad_norm": 30.18777847290039, "learning_rate": 3.2237442922374436e-06, "loss": 0.3801, "step": 354 }, { "epoch": 0.3242009132420091, "grad_norm": 4.594858646392822, "learning_rate": 3.2328767123287676e-06, "loss": 0.0445, "step": 355 }, { "epoch": 0.32511415525114157, "grad_norm": 3.377143144607544, "learning_rate": 3.242009132420091e-06, "loss": 0.029, "step": 356 }, { "epoch": 0.32602739726027397, "grad_norm": 129.733154296875, "learning_rate": 3.251141552511416e-06, "loss": 3.7756, "step": 357 }, { "epoch": 0.3269406392694064, "grad_norm": 13.042701721191406, "learning_rate": 3.2602739726027397e-06, "loss": 0.1324, "step": 358 }, { "epoch": 0.3278538812785388, "grad_norm": 61.14798355102539, "learning_rate": 3.2694063926940646e-06, "loss": 1.3219, "step": 359 }, { "epoch": 0.3287671232876712, "grad_norm": 45.18826675415039, "learning_rate": 3.2785388127853882e-06, "loss": 0.5063, "step": 360 }, { "epoch": 0.32968036529680367, "grad_norm": 72.35635375976562, "learning_rate": 3.2876712328767123e-06, "loss": 0.8591, "step": 361 }, { "epoch": 0.33059360730593607, "grad_norm": 6.394048690795898, "learning_rate": 3.296803652968037e-06, "loss": 0.0423, "step": 362 }, { "epoch": 0.3315068493150685, "grad_norm": 122.2677001953125, "learning_rate": 3.3059360730593608e-06, "loss": 3.7917, "step": 363 }, { "epoch": 0.3324200913242009, "grad_norm": 67.34833526611328, "learning_rate": 3.3150684931506857e-06, "loss": 0.4078, "step": 364 }, { "epoch": 0.3333333333333333, "grad_norm": 13.769874572753906, "learning_rate": 3.3242009132420093e-06, "loss": 0.091, "step": 365 }, { "epoch": 0.33424657534246577, "grad_norm": 4.036598205566406, "learning_rate": 3.3333333333333333e-06, "loss": 0.017, "step": 366 }, { "epoch": 0.33515981735159817, "grad_norm": 113.74850463867188, "learning_rate": 3.342465753424658e-06, "loss": 3.8546, "step": 367 }, { "epoch": 0.3360730593607306, "grad_norm": 60.095726013183594, "learning_rate": 3.351598173515982e-06, "loss": 0.8249, "step": 368 }, { "epoch": 0.336986301369863, "grad_norm": 72.10465240478516, "learning_rate": 3.3607305936073063e-06, "loss": 0.3766, "step": 369 }, { "epoch": 0.3378995433789954, "grad_norm": 4.72660493850708, "learning_rate": 3.3698630136986303e-06, "loss": 0.0377, "step": 370 }, { "epoch": 0.33881278538812787, "grad_norm": 233.44357299804688, "learning_rate": 3.3789954337899544e-06, "loss": 2.63, "step": 371 }, { "epoch": 0.33972602739726027, "grad_norm": 61.60961151123047, "learning_rate": 3.388127853881279e-06, "loss": 0.6469, "step": 372 }, { "epoch": 0.3406392694063927, "grad_norm": 89.26587677001953, "learning_rate": 3.397260273972603e-06, "loss": 1.4234, "step": 373 }, { "epoch": 0.3415525114155251, "grad_norm": 89.88206481933594, "learning_rate": 3.4063926940639274e-06, "loss": 1.1847, "step": 374 }, { "epoch": 0.3424657534246575, "grad_norm": 32.484413146972656, "learning_rate": 3.4155251141552514e-06, "loss": 0.4132, "step": 375 }, { "epoch": 0.34337899543378997, "grad_norm": 35.568336486816406, "learning_rate": 3.4246575342465754e-06, "loss": 0.4842, "step": 376 }, { "epoch": 0.34429223744292237, "grad_norm": 3.1341066360473633, "learning_rate": 3.4337899543379e-06, "loss": 0.0134, "step": 377 }, { "epoch": 0.3452054794520548, "grad_norm": 23.109111785888672, "learning_rate": 3.442922374429224e-06, "loss": 0.2163, "step": 378 }, { "epoch": 0.3461187214611872, "grad_norm": 109.4838638305664, "learning_rate": 3.4520547945205484e-06, "loss": 1.3807, "step": 379 }, { "epoch": 0.3470319634703196, "grad_norm": 7.516877174377441, "learning_rate": 3.4611872146118725e-06, "loss": 0.0232, "step": 380 }, { "epoch": 0.34794520547945207, "grad_norm": 184.45761108398438, "learning_rate": 3.4703196347031965e-06, "loss": 1.4256, "step": 381 }, { "epoch": 0.34885844748858447, "grad_norm": 0.6221018433570862, "learning_rate": 3.479452054794521e-06, "loss": 0.0062, "step": 382 }, { "epoch": 0.3497716894977169, "grad_norm": 33.57866287231445, "learning_rate": 3.488584474885845e-06, "loss": 0.2674, "step": 383 }, { "epoch": 0.3506849315068493, "grad_norm": 19.23067855834961, "learning_rate": 3.4977168949771695e-06, "loss": 0.115, "step": 384 }, { "epoch": 0.3515981735159817, "grad_norm": 301.8723449707031, "learning_rate": 3.5068493150684935e-06, "loss": 1.9319, "step": 385 }, { "epoch": 0.35251141552511417, "grad_norm": 55.29923629760742, "learning_rate": 3.5159817351598176e-06, "loss": 0.4, "step": 386 }, { "epoch": 0.35342465753424657, "grad_norm": 85.7860336303711, "learning_rate": 3.525114155251142e-06, "loss": 0.4559, "step": 387 }, { "epoch": 0.354337899543379, "grad_norm": 183.87449645996094, "learning_rate": 3.534246575342466e-06, "loss": 0.9421, "step": 388 }, { "epoch": 0.3552511415525114, "grad_norm": 73.13939666748047, "learning_rate": 3.5433789954337905e-06, "loss": 1.956, "step": 389 }, { "epoch": 0.3561643835616438, "grad_norm": 92.12702178955078, "learning_rate": 3.5525114155251146e-06, "loss": 1.8663, "step": 390 }, { "epoch": 0.35707762557077627, "grad_norm": 67.64469909667969, "learning_rate": 3.5616438356164386e-06, "loss": 1.8999, "step": 391 }, { "epoch": 0.35799086757990867, "grad_norm": 38.1673583984375, "learning_rate": 3.570776255707763e-06, "loss": 0.4063, "step": 392 }, { "epoch": 0.3589041095890411, "grad_norm": 26.75269317626953, "learning_rate": 3.579908675799087e-06, "loss": 0.1919, "step": 393 }, { "epoch": 0.3598173515981735, "grad_norm": 86.62921905517578, "learning_rate": 3.5890410958904116e-06, "loss": 4.2046, "step": 394 }, { "epoch": 0.3607305936073059, "grad_norm": 73.33307647705078, "learning_rate": 3.5981735159817356e-06, "loss": 0.6993, "step": 395 }, { "epoch": 0.36164383561643837, "grad_norm": 16.41733741760254, "learning_rate": 3.6073059360730597e-06, "loss": 0.1507, "step": 396 }, { "epoch": 0.36255707762557077, "grad_norm": 84.48392486572266, "learning_rate": 3.616438356164384e-06, "loss": 1.1311, "step": 397 }, { "epoch": 0.3634703196347032, "grad_norm": 7.627520561218262, "learning_rate": 3.625570776255708e-06, "loss": 0.0547, "step": 398 }, { "epoch": 0.3643835616438356, "grad_norm": 63.42316436767578, "learning_rate": 3.6347031963470326e-06, "loss": 1.0286, "step": 399 }, { "epoch": 0.365296803652968, "grad_norm": 36.56929397583008, "learning_rate": 3.6438356164383567e-06, "loss": 0.6189, "step": 400 }, { "epoch": 0.36621004566210047, "grad_norm": 15.963958740234375, "learning_rate": 3.6529680365296803e-06, "loss": 0.1906, "step": 401 }, { "epoch": 0.36712328767123287, "grad_norm": 21.51779556274414, "learning_rate": 3.662100456621005e-06, "loss": 0.2721, "step": 402 }, { "epoch": 0.3680365296803653, "grad_norm": 15.616464614868164, "learning_rate": 3.671232876712329e-06, "loss": 0.1086, "step": 403 }, { "epoch": 0.3689497716894977, "grad_norm": 83.13922882080078, "learning_rate": 3.6803652968036537e-06, "loss": 0.9419, "step": 404 }, { "epoch": 0.3698630136986301, "grad_norm": 162.45516967773438, "learning_rate": 3.6894977168949773e-06, "loss": 1.1526, "step": 405 }, { "epoch": 0.37077625570776257, "grad_norm": 78.32847595214844, "learning_rate": 3.6986301369863014e-06, "loss": 1.2755, "step": 406 }, { "epoch": 0.37168949771689497, "grad_norm": 77.73373413085938, "learning_rate": 3.707762557077626e-06, "loss": 1.4018, "step": 407 }, { "epoch": 0.3726027397260274, "grad_norm": 29.755128860473633, "learning_rate": 3.71689497716895e-06, "loss": 0.3106, "step": 408 }, { "epoch": 0.3735159817351598, "grad_norm": 36.72953414916992, "learning_rate": 3.7260273972602743e-06, "loss": 0.2132, "step": 409 }, { "epoch": 0.3744292237442922, "grad_norm": 98.95490264892578, "learning_rate": 3.7351598173515984e-06, "loss": 2.0899, "step": 410 }, { "epoch": 0.37534246575342467, "grad_norm": 77.04108428955078, "learning_rate": 3.7442922374429224e-06, "loss": 1.9682, "step": 411 }, { "epoch": 0.37625570776255707, "grad_norm": 104.71251678466797, "learning_rate": 3.753424657534247e-06, "loss": 0.8279, "step": 412 }, { "epoch": 0.3771689497716895, "grad_norm": 88.0051040649414, "learning_rate": 3.762557077625571e-06, "loss": 2.2346, "step": 413 }, { "epoch": 0.3780821917808219, "grad_norm": 4.148565292358398, "learning_rate": 3.7716894977168954e-06, "loss": 0.0343, "step": 414 }, { "epoch": 0.3789954337899543, "grad_norm": 1.0950932502746582, "learning_rate": 3.7808219178082194e-06, "loss": 0.012, "step": 415 }, { "epoch": 0.37990867579908677, "grad_norm": 66.29446411132812, "learning_rate": 3.7899543378995435e-06, "loss": 0.7788, "step": 416 }, { "epoch": 0.38082191780821917, "grad_norm": 140.90643310546875, "learning_rate": 3.799086757990868e-06, "loss": 1.1013, "step": 417 }, { "epoch": 0.3817351598173516, "grad_norm": 70.56643676757812, "learning_rate": 3.808219178082192e-06, "loss": 0.8681, "step": 418 }, { "epoch": 0.382648401826484, "grad_norm": 84.06604766845703, "learning_rate": 3.8173515981735164e-06, "loss": 4.7383, "step": 419 }, { "epoch": 0.3835616438356164, "grad_norm": 25.52215003967285, "learning_rate": 3.826484018264841e-06, "loss": 0.2363, "step": 420 }, { "epoch": 0.38447488584474887, "grad_norm": 10.285091400146484, "learning_rate": 3.8356164383561645e-06, "loss": 0.0741, "step": 421 }, { "epoch": 0.38538812785388127, "grad_norm": 21.332162857055664, "learning_rate": 3.844748858447489e-06, "loss": 0.1497, "step": 422 }, { "epoch": 0.3863013698630137, "grad_norm": 38.91802978515625, "learning_rate": 3.853881278538813e-06, "loss": 0.4474, "step": 423 }, { "epoch": 0.3872146118721461, "grad_norm": 53.973175048828125, "learning_rate": 3.863013698630138e-06, "loss": 0.3742, "step": 424 }, { "epoch": 0.3881278538812785, "grad_norm": 14.769797325134277, "learning_rate": 3.8721461187214615e-06, "loss": 0.1737, "step": 425 }, { "epoch": 0.38904109589041097, "grad_norm": 87.75567626953125, "learning_rate": 3.881278538812785e-06, "loss": 7.062, "step": 426 }, { "epoch": 0.38995433789954337, "grad_norm": 64.97599792480469, "learning_rate": 3.89041095890411e-06, "loss": 0.6704, "step": 427 }, { "epoch": 0.3908675799086758, "grad_norm": 4.37594747543335, "learning_rate": 3.899543378995434e-06, "loss": 0.0389, "step": 428 }, { "epoch": 0.3917808219178082, "grad_norm": 20.90996742248535, "learning_rate": 3.9086757990867586e-06, "loss": 0.185, "step": 429 }, { "epoch": 0.3926940639269406, "grad_norm": 63.172691345214844, "learning_rate": 3.917808219178082e-06, "loss": 0.9473, "step": 430 }, { "epoch": 0.39360730593607307, "grad_norm": 42.276268005371094, "learning_rate": 3.926940639269407e-06, "loss": 0.7362, "step": 431 }, { "epoch": 0.39452054794520547, "grad_norm": 20.83365249633789, "learning_rate": 3.936073059360731e-06, "loss": 0.1147, "step": 432 }, { "epoch": 0.3954337899543379, "grad_norm": 71.1503677368164, "learning_rate": 3.945205479452055e-06, "loss": 1.1091, "step": 433 }, { "epoch": 0.3963470319634703, "grad_norm": 57.9481086730957, "learning_rate": 3.954337899543379e-06, "loss": 1.048, "step": 434 }, { "epoch": 0.3972602739726027, "grad_norm": 43.703372955322266, "learning_rate": 3.963470319634704e-06, "loss": 0.2116, "step": 435 }, { "epoch": 0.39817351598173517, "grad_norm": 23.36627769470215, "learning_rate": 3.972602739726027e-06, "loss": 0.2819, "step": 436 }, { "epoch": 0.39908675799086757, "grad_norm": 0.8956753611564636, "learning_rate": 3.981735159817352e-06, "loss": 0.0098, "step": 437 }, { "epoch": 0.4, "grad_norm": 5.04408597946167, "learning_rate": 3.990867579908676e-06, "loss": 0.0432, "step": 438 }, { "epoch": 0.4009132420091324, "grad_norm": 11.662174224853516, "learning_rate": 4.000000000000001e-06, "loss": 0.111, "step": 439 }, { "epoch": 0.4018264840182648, "grad_norm": 18.267024993896484, "learning_rate": 4.009132420091324e-06, "loss": 0.1974, "step": 440 }, { "epoch": 0.40273972602739727, "grad_norm": 78.30281829833984, "learning_rate": 4.018264840182649e-06, "loss": 2.1333, "step": 441 }, { "epoch": 0.40365296803652967, "grad_norm": 17.122106552124023, "learning_rate": 4.027397260273973e-06, "loss": 0.1863, "step": 442 }, { "epoch": 0.4045662100456621, "grad_norm": 44.18665313720703, "learning_rate": 4.036529680365297e-06, "loss": 0.4983, "step": 443 }, { "epoch": 0.4054794520547945, "grad_norm": 6.249471187591553, "learning_rate": 4.045662100456621e-06, "loss": 0.0566, "step": 444 }, { "epoch": 0.4063926940639269, "grad_norm": 7.358841896057129, "learning_rate": 4.054794520547946e-06, "loss": 0.0513, "step": 445 }, { "epoch": 0.40730593607305937, "grad_norm": 55.501285552978516, "learning_rate": 4.063926940639269e-06, "loss": 1.9399, "step": 446 }, { "epoch": 0.40821917808219177, "grad_norm": 69.8716812133789, "learning_rate": 4.073059360730594e-06, "loss": 0.8495, "step": 447 }, { "epoch": 0.4091324200913242, "grad_norm": 17.721437454223633, "learning_rate": 4.082191780821918e-06, "loss": 0.1784, "step": 448 }, { "epoch": 0.4100456621004566, "grad_norm": 93.11967468261719, "learning_rate": 4.091324200913243e-06, "loss": 2.0644, "step": 449 }, { "epoch": 0.410958904109589, "grad_norm": 100.63162231445312, "learning_rate": 4.100456621004566e-06, "loss": 1.9502, "step": 450 }, { "epoch": 0.41187214611872147, "grad_norm": 25.470014572143555, "learning_rate": 4.109589041095891e-06, "loss": 0.2026, "step": 451 }, { "epoch": 0.41278538812785387, "grad_norm": 85.53765106201172, "learning_rate": 4.118721461187215e-06, "loss": 1.3897, "step": 452 }, { "epoch": 0.4136986301369863, "grad_norm": 12.107916831970215, "learning_rate": 4.127853881278539e-06, "loss": 0.1443, "step": 453 }, { "epoch": 0.4146118721461187, "grad_norm": 28.20136260986328, "learning_rate": 4.136986301369863e-06, "loss": 0.2058, "step": 454 }, { "epoch": 0.4155251141552511, "grad_norm": 62.62458801269531, "learning_rate": 4.146118721461188e-06, "loss": 1.9603, "step": 455 }, { "epoch": 0.41643835616438357, "grad_norm": 68.5009765625, "learning_rate": 4.1552511415525115e-06, "loss": 0.8531, "step": 456 }, { "epoch": 0.41735159817351597, "grad_norm": 75.07286071777344, "learning_rate": 4.164383561643836e-06, "loss": 1.5569, "step": 457 }, { "epoch": 0.4182648401826484, "grad_norm": 3.553246259689331, "learning_rate": 4.1735159817351604e-06, "loss": 0.0303, "step": 458 }, { "epoch": 0.4191780821917808, "grad_norm": 13.435628890991211, "learning_rate": 4.182648401826485e-06, "loss": 0.1259, "step": 459 }, { "epoch": 0.4200913242009132, "grad_norm": 77.77692413330078, "learning_rate": 4.1917808219178085e-06, "loss": 2.9656, "step": 460 }, { "epoch": 0.42100456621004567, "grad_norm": 16.580825805664062, "learning_rate": 4.200913242009132e-06, "loss": 0.0528, "step": 461 }, { "epoch": 0.42191780821917807, "grad_norm": 74.73613739013672, "learning_rate": 4.2100456621004574e-06, "loss": 1.97, "step": 462 }, { "epoch": 0.4228310502283105, "grad_norm": 14.165278434753418, "learning_rate": 4.219178082191781e-06, "loss": 0.0858, "step": 463 }, { "epoch": 0.4237442922374429, "grad_norm": 41.02267837524414, "learning_rate": 4.2283105022831055e-06, "loss": 0.6097, "step": 464 }, { "epoch": 0.4246575342465753, "grad_norm": 87.551025390625, "learning_rate": 4.237442922374429e-06, "loss": 2.0929, "step": 465 }, { "epoch": 0.42557077625570777, "grad_norm": 30.911975860595703, "learning_rate": 4.246575342465754e-06, "loss": 0.2518, "step": 466 }, { "epoch": 0.42648401826484017, "grad_norm": 84.69001007080078, "learning_rate": 4.255707762557078e-06, "loss": 2.4747, "step": 467 }, { "epoch": 0.4273972602739726, "grad_norm": 31.759689331054688, "learning_rate": 4.264840182648402e-06, "loss": 0.3633, "step": 468 }, { "epoch": 0.428310502283105, "grad_norm": 24.52104377746582, "learning_rate": 4.273972602739727e-06, "loss": 0.2034, "step": 469 }, { "epoch": 0.4292237442922374, "grad_norm": 35.608882904052734, "learning_rate": 4.283105022831051e-06, "loss": 0.6316, "step": 470 }, { "epoch": 0.4301369863013699, "grad_norm": 62.29707336425781, "learning_rate": 4.292237442922374e-06, "loss": 0.883, "step": 471 }, { "epoch": 0.43105022831050227, "grad_norm": 74.15577697753906, "learning_rate": 4.301369863013699e-06, "loss": 0.9623, "step": 472 }, { "epoch": 0.4319634703196347, "grad_norm": 21.88360023498535, "learning_rate": 4.310502283105023e-06, "loss": 0.1858, "step": 473 }, { "epoch": 0.4328767123287671, "grad_norm": 92.45098114013672, "learning_rate": 4.319634703196348e-06, "loss": 1.3127, "step": 474 }, { "epoch": 0.4337899543378995, "grad_norm": 87.49791717529297, "learning_rate": 4.328767123287671e-06, "loss": 1.8483, "step": 475 }, { "epoch": 0.434703196347032, "grad_norm": 15.451742172241211, "learning_rate": 4.337899543378996e-06, "loss": 0.1594, "step": 476 }, { "epoch": 0.43561643835616437, "grad_norm": 53.38404846191406, "learning_rate": 4.34703196347032e-06, "loss": 0.6191, "step": 477 }, { "epoch": 0.4365296803652968, "grad_norm": 44.61111831665039, "learning_rate": 4.356164383561644e-06, "loss": 0.4397, "step": 478 }, { "epoch": 0.4374429223744292, "grad_norm": 18.68964385986328, "learning_rate": 4.365296803652968e-06, "loss": 0.1194, "step": 479 }, { "epoch": 0.4383561643835616, "grad_norm": 30.11985969543457, "learning_rate": 4.374429223744293e-06, "loss": 0.4049, "step": 480 }, { "epoch": 0.4392694063926941, "grad_norm": 6.739968776702881, "learning_rate": 4.383561643835616e-06, "loss": 0.0725, "step": 481 }, { "epoch": 0.44018264840182647, "grad_norm": 7.491162300109863, "learning_rate": 4.392694063926941e-06, "loss": 0.0688, "step": 482 }, { "epoch": 0.4410958904109589, "grad_norm": 94.14772033691406, "learning_rate": 4.401826484018265e-06, "loss": 5.1216, "step": 483 }, { "epoch": 0.4420091324200913, "grad_norm": 55.48081588745117, "learning_rate": 4.41095890410959e-06, "loss": 0.504, "step": 484 }, { "epoch": 0.4429223744292237, "grad_norm": 14.034695625305176, "learning_rate": 4.420091324200913e-06, "loss": 0.1159, "step": 485 }, { "epoch": 0.4438356164383562, "grad_norm": 57.705467224121094, "learning_rate": 4.429223744292238e-06, "loss": 0.5785, "step": 486 }, { "epoch": 0.44474885844748857, "grad_norm": 106.55850982666016, "learning_rate": 4.438356164383562e-06, "loss": 5.8105, "step": 487 }, { "epoch": 0.445662100456621, "grad_norm": 70.25466918945312, "learning_rate": 4.447488584474886e-06, "loss": 2.1516, "step": 488 }, { "epoch": 0.4465753424657534, "grad_norm": 48.75373840332031, "learning_rate": 4.45662100456621e-06, "loss": 0.6414, "step": 489 }, { "epoch": 0.4474885844748858, "grad_norm": 228.1983184814453, "learning_rate": 4.465753424657535e-06, "loss": 0.2963, "step": 490 }, { "epoch": 0.4484018264840183, "grad_norm": 33.30781173706055, "learning_rate": 4.4748858447488585e-06, "loss": 0.4109, "step": 491 }, { "epoch": 0.44931506849315067, "grad_norm": 25.691844940185547, "learning_rate": 4.484018264840183e-06, "loss": 0.2165, "step": 492 }, { "epoch": 0.4502283105022831, "grad_norm": 73.90200805664062, "learning_rate": 4.493150684931507e-06, "loss": 0.4425, "step": 493 }, { "epoch": 0.4511415525114155, "grad_norm": 36.07059097290039, "learning_rate": 4.502283105022832e-06, "loss": 0.3409, "step": 494 }, { "epoch": 0.4520547945205479, "grad_norm": 37.16450881958008, "learning_rate": 4.5114155251141555e-06, "loss": 0.3034, "step": 495 }, { "epoch": 0.4529680365296804, "grad_norm": 44.616973876953125, "learning_rate": 4.52054794520548e-06, "loss": 0.5433, "step": 496 }, { "epoch": 0.45388127853881277, "grad_norm": 73.91869354248047, "learning_rate": 4.529680365296804e-06, "loss": 0.9204, "step": 497 }, { "epoch": 0.4547945205479452, "grad_norm": 78.1739730834961, "learning_rate": 4.538812785388128e-06, "loss": 1.297, "step": 498 }, { "epoch": 0.4557077625570776, "grad_norm": 21.021888732910156, "learning_rate": 4.5479452054794525e-06, "loss": 0.0845, "step": 499 }, { "epoch": 0.45662100456621, "grad_norm": 60.44771957397461, "learning_rate": 4.557077625570777e-06, "loss": 1.4813, "step": 500 }, { "epoch": 0.4575342465753425, "grad_norm": 50.0435791015625, "learning_rate": 4.566210045662101e-06, "loss": 0.5937, "step": 501 }, { "epoch": 0.45844748858447487, "grad_norm": 39.04544448852539, "learning_rate": 4.575342465753425e-06, "loss": 0.5028, "step": 502 }, { "epoch": 0.4593607305936073, "grad_norm": 108.34681701660156, "learning_rate": 4.5844748858447495e-06, "loss": 1.992, "step": 503 }, { "epoch": 0.4602739726027397, "grad_norm": 50.924800872802734, "learning_rate": 4.593607305936074e-06, "loss": 0.5345, "step": 504 }, { "epoch": 0.4611872146118721, "grad_norm": 30.759721755981445, "learning_rate": 4.602739726027398e-06, "loss": 0.2285, "step": 505 }, { "epoch": 0.4621004566210046, "grad_norm": 0.2566116750240326, "learning_rate": 4.611872146118721e-06, "loss": 0.0021, "step": 506 }, { "epoch": 0.46301369863013697, "grad_norm": 83.99923706054688, "learning_rate": 4.6210045662100465e-06, "loss": 4.4241, "step": 507 }, { "epoch": 0.4639269406392694, "grad_norm": 68.85009765625, "learning_rate": 4.63013698630137e-06, "loss": 1.4038, "step": 508 }, { "epoch": 0.4648401826484018, "grad_norm": 54.418975830078125, "learning_rate": 4.639269406392695e-06, "loss": 0.6086, "step": 509 }, { "epoch": 0.4657534246575342, "grad_norm": 62.78318786621094, "learning_rate": 4.648401826484018e-06, "loss": 0.3617, "step": 510 }, { "epoch": 0.4666666666666667, "grad_norm": 17.959716796875, "learning_rate": 4.657534246575343e-06, "loss": 0.1906, "step": 511 }, { "epoch": 0.46757990867579907, "grad_norm": 80.54129028320312, "learning_rate": 4.666666666666667e-06, "loss": 0.8254, "step": 512 }, { "epoch": 0.4684931506849315, "grad_norm": 0.8481607437133789, "learning_rate": 4.675799086757991e-06, "loss": 0.007, "step": 513 }, { "epoch": 0.4694063926940639, "grad_norm": 94.12175750732422, "learning_rate": 4.684931506849315e-06, "loss": 1.1059, "step": 514 }, { "epoch": 0.4703196347031963, "grad_norm": 67.6685791015625, "learning_rate": 4.69406392694064e-06, "loss": 0.4545, "step": 515 }, { "epoch": 0.4712328767123288, "grad_norm": 15.112743377685547, "learning_rate": 4.703196347031963e-06, "loss": 0.1207, "step": 516 }, { "epoch": 0.47214611872146117, "grad_norm": 57.61674880981445, "learning_rate": 4.712328767123288e-06, "loss": 0.6216, "step": 517 }, { "epoch": 0.4730593607305936, "grad_norm": 1.3140431642532349, "learning_rate": 4.721461187214612e-06, "loss": 0.0105, "step": 518 }, { "epoch": 0.473972602739726, "grad_norm": 11.955789566040039, "learning_rate": 4.730593607305937e-06, "loss": 0.1199, "step": 519 }, { "epoch": 0.4748858447488584, "grad_norm": 99.33483123779297, "learning_rate": 4.73972602739726e-06, "loss": 2.1858, "step": 520 }, { "epoch": 0.4757990867579909, "grad_norm": 16.00244140625, "learning_rate": 4.748858447488585e-06, "loss": 0.1917, "step": 521 }, { "epoch": 0.4767123287671233, "grad_norm": 159.49607849121094, "learning_rate": 4.757990867579909e-06, "loss": 3.0833, "step": 522 }, { "epoch": 0.4776255707762557, "grad_norm": 92.47789764404297, "learning_rate": 4.767123287671233e-06, "loss": 2.6803, "step": 523 }, { "epoch": 0.4785388127853881, "grad_norm": 29.452665328979492, "learning_rate": 4.776255707762557e-06, "loss": 0.2879, "step": 524 }, { "epoch": 0.4794520547945205, "grad_norm": 51.45266342163086, "learning_rate": 4.785388127853882e-06, "loss": 0.6535, "step": 525 }, { "epoch": 0.480365296803653, "grad_norm": 69.06902313232422, "learning_rate": 4.7945205479452054e-06, "loss": 0.577, "step": 526 }, { "epoch": 0.4812785388127854, "grad_norm": 11.619267463684082, "learning_rate": 4.80365296803653e-06, "loss": 0.0777, "step": 527 }, { "epoch": 0.4821917808219178, "grad_norm": 1.349090337753296, "learning_rate": 4.812785388127854e-06, "loss": 0.0118, "step": 528 }, { "epoch": 0.4831050228310502, "grad_norm": 81.71304321289062, "learning_rate": 4.821917808219179e-06, "loss": 1.0052, "step": 529 }, { "epoch": 0.4840182648401826, "grad_norm": 43.37370300292969, "learning_rate": 4.8310502283105025e-06, "loss": 0.728, "step": 530 }, { "epoch": 0.4849315068493151, "grad_norm": 58.55609130859375, "learning_rate": 4.840182648401827e-06, "loss": 0.7235, "step": 531 }, { "epoch": 0.4858447488584475, "grad_norm": 1.048585057258606, "learning_rate": 4.849315068493151e-06, "loss": 0.0094, "step": 532 }, { "epoch": 0.4867579908675799, "grad_norm": 68.27369689941406, "learning_rate": 4.858447488584475e-06, "loss": 0.808, "step": 533 }, { "epoch": 0.4876712328767123, "grad_norm": 16.810874938964844, "learning_rate": 4.8675799086757995e-06, "loss": 0.0831, "step": 534 }, { "epoch": 0.4885844748858447, "grad_norm": 85.35765838623047, "learning_rate": 4.876712328767124e-06, "loss": 1.8749, "step": 535 }, { "epoch": 0.4894977168949772, "grad_norm": 9.11937427520752, "learning_rate": 4.8858447488584476e-06, "loss": 0.0807, "step": 536 }, { "epoch": 0.4904109589041096, "grad_norm": 56.65217590332031, "learning_rate": 4.894977168949772e-06, "loss": 0.8225, "step": 537 }, { "epoch": 0.491324200913242, "grad_norm": 2.7460126876831055, "learning_rate": 4.9041095890410965e-06, "loss": 0.022, "step": 538 }, { "epoch": 0.4922374429223744, "grad_norm": 202.41217041015625, "learning_rate": 4.913242009132421e-06, "loss": 3.3442, "step": 539 }, { "epoch": 0.4931506849315068, "grad_norm": 70.25475311279297, "learning_rate": 4.9223744292237446e-06, "loss": 1.131, "step": 540 }, { "epoch": 0.4940639269406393, "grad_norm": 48.51894760131836, "learning_rate": 4.931506849315069e-06, "loss": 0.6267, "step": 541 }, { "epoch": 0.4949771689497717, "grad_norm": 62.96455383300781, "learning_rate": 4.9406392694063935e-06, "loss": 0.807, "step": 542 }, { "epoch": 0.4958904109589041, "grad_norm": 35.59819793701172, "learning_rate": 4.949771689497717e-06, "loss": 0.5484, "step": 543 }, { "epoch": 0.4968036529680365, "grad_norm": 21.505062103271484, "learning_rate": 4.958904109589042e-06, "loss": 0.1244, "step": 544 }, { "epoch": 0.4977168949771689, "grad_norm": 94.33080291748047, "learning_rate": 4.968036529680366e-06, "loss": 1.9069, "step": 545 }, { "epoch": 0.4986301369863014, "grad_norm": 1.7957226037979126, "learning_rate": 4.97716894977169e-06, "loss": 0.0145, "step": 546 }, { "epoch": 0.4995433789954338, "grad_norm": 18.905752182006836, "learning_rate": 4.986301369863014e-06, "loss": 0.1736, "step": 547 }, { "epoch": 0.5004566210045662, "grad_norm": 40.198402404785156, "learning_rate": 4.995433789954338e-06, "loss": 0.541, "step": 548 }, { "epoch": 0.5013698630136987, "grad_norm": 10.572152137756348, "learning_rate": 5.004566210045663e-06, "loss": 0.1195, "step": 549 }, { "epoch": 0.502283105022831, "grad_norm": 10.037468910217285, "learning_rate": 5.0136986301369875e-06, "loss": 0.068, "step": 550 }, { "epoch": 0.5031963470319635, "grad_norm": 5.679559707641602, "learning_rate": 5.02283105022831e-06, "loss": 0.0492, "step": 551 }, { "epoch": 0.5041095890410959, "grad_norm": 29.160072326660156, "learning_rate": 5.031963470319635e-06, "loss": 0.2632, "step": 552 }, { "epoch": 0.5050228310502283, "grad_norm": 51.015010833740234, "learning_rate": 5.04109589041096e-06, "loss": 0.6477, "step": 553 }, { "epoch": 0.5059360730593607, "grad_norm": 138.4801788330078, "learning_rate": 5.050228310502283e-06, "loss": 3.9054, "step": 554 }, { "epoch": 0.5068493150684932, "grad_norm": 85.4847640991211, "learning_rate": 5.059360730593607e-06, "loss": 1.1309, "step": 555 }, { "epoch": 0.5077625570776255, "grad_norm": 7.715000629425049, "learning_rate": 5.068493150684932e-06, "loss": 0.0398, "step": 556 }, { "epoch": 0.508675799086758, "grad_norm": 8.39102554321289, "learning_rate": 5.077625570776255e-06, "loss": 0.072, "step": 557 }, { "epoch": 0.5095890410958904, "grad_norm": 25.710521697998047, "learning_rate": 5.08675799086758e-06, "loss": 0.2666, "step": 558 }, { "epoch": 0.5105022831050229, "grad_norm": 9.216446876525879, "learning_rate": 5.095890410958904e-06, "loss": 0.0693, "step": 559 }, { "epoch": 0.5114155251141552, "grad_norm": 13.623147010803223, "learning_rate": 5.10502283105023e-06, "loss": 0.0678, "step": 560 }, { "epoch": 0.5123287671232877, "grad_norm": 11.151211738586426, "learning_rate": 5.114155251141552e-06, "loss": 0.0856, "step": 561 }, { "epoch": 0.5132420091324201, "grad_norm": 48.37599563598633, "learning_rate": 5.123287671232877e-06, "loss": 0.665, "step": 562 }, { "epoch": 0.5141552511415525, "grad_norm": 41.175201416015625, "learning_rate": 5.132420091324201e-06, "loss": 0.4106, "step": 563 }, { "epoch": 0.5150684931506849, "grad_norm": 3.8414909839630127, "learning_rate": 5.141552511415525e-06, "loss": 0.0372, "step": 564 }, { "epoch": 0.5159817351598174, "grad_norm": 15.853111267089844, "learning_rate": 5.1506849315068494e-06, "loss": 0.1271, "step": 565 }, { "epoch": 0.5168949771689497, "grad_norm": 182.28721618652344, "learning_rate": 5.159817351598174e-06, "loss": 2.0322, "step": 566 }, { "epoch": 0.5178082191780822, "grad_norm": 90.53395080566406, "learning_rate": 5.1689497716894975e-06, "loss": 2.0942, "step": 567 }, { "epoch": 0.5187214611872146, "grad_norm": 107.71090698242188, "learning_rate": 5.178082191780822e-06, "loss": 0.7124, "step": 568 }, { "epoch": 0.5196347031963471, "grad_norm": 9.314497947692871, "learning_rate": 5.1872146118721464e-06, "loss": 0.0813, "step": 569 }, { "epoch": 0.5205479452054794, "grad_norm": 20.419828414916992, "learning_rate": 5.196347031963471e-06, "loss": 0.1603, "step": 570 }, { "epoch": 0.5214611872146119, "grad_norm": 121.92138671875, "learning_rate": 5.2054794520547945e-06, "loss": 0.5955, "step": 571 }, { "epoch": 0.5223744292237443, "grad_norm": 69.31221771240234, "learning_rate": 5.214611872146119e-06, "loss": 0.9057, "step": 572 }, { "epoch": 0.5232876712328767, "grad_norm": 65.04753875732422, "learning_rate": 5.2237442922374435e-06, "loss": 0.7041, "step": 573 }, { "epoch": 0.5242009132420091, "grad_norm": 94.45277404785156, "learning_rate": 5.232876712328767e-06, "loss": 2.3661, "step": 574 }, { "epoch": 0.5251141552511416, "grad_norm": 50.49311828613281, "learning_rate": 5.2420091324200915e-06, "loss": 0.5881, "step": 575 }, { "epoch": 0.5260273972602739, "grad_norm": 49.70784378051758, "learning_rate": 5.251141552511416e-06, "loss": 0.7981, "step": 576 }, { "epoch": 0.5269406392694064, "grad_norm": 95.01044464111328, "learning_rate": 5.26027397260274e-06, "loss": 1.977, "step": 577 }, { "epoch": 0.5278538812785388, "grad_norm": 91.88674926757812, "learning_rate": 5.269406392694064e-06, "loss": 2.0608, "step": 578 }, { "epoch": 0.5287671232876713, "grad_norm": 5.030986309051514, "learning_rate": 5.2785388127853886e-06, "loss": 0.0384, "step": 579 }, { "epoch": 0.5296803652968036, "grad_norm": 76.63216400146484, "learning_rate": 5.287671232876713e-06, "loss": 1.0611, "step": 580 }, { "epoch": 0.5305936073059361, "grad_norm": 20.983190536499023, "learning_rate": 5.296803652968037e-06, "loss": 0.1932, "step": 581 }, { "epoch": 0.5315068493150685, "grad_norm": 74.58145904541016, "learning_rate": 5.305936073059361e-06, "loss": 1.1044, "step": 582 }, { "epoch": 0.5324200913242009, "grad_norm": 12.040328025817871, "learning_rate": 5.3150684931506856e-06, "loss": 0.0802, "step": 583 }, { "epoch": 0.5333333333333333, "grad_norm": 6.108545303344727, "learning_rate": 5.324200913242009e-06, "loss": 0.0354, "step": 584 }, { "epoch": 0.5342465753424658, "grad_norm": 64.76498413085938, "learning_rate": 5.333333333333334e-06, "loss": 1.3474, "step": 585 }, { "epoch": 0.5351598173515981, "grad_norm": 28.661935806274414, "learning_rate": 5.342465753424658e-06, "loss": 0.3086, "step": 586 }, { "epoch": 0.5360730593607306, "grad_norm": 77.86654663085938, "learning_rate": 5.351598173515982e-06, "loss": 0.9208, "step": 587 }, { "epoch": 0.536986301369863, "grad_norm": 36.82304382324219, "learning_rate": 5.360730593607306e-06, "loss": 0.3891, "step": 588 }, { "epoch": 0.5378995433789955, "grad_norm": 68.4932632446289, "learning_rate": 5.369863013698631e-06, "loss": 1.0345, "step": 589 }, { "epoch": 0.5388127853881278, "grad_norm": 89.85137176513672, "learning_rate": 5.378995433789955e-06, "loss": 1.8116, "step": 590 }, { "epoch": 0.5397260273972603, "grad_norm": 39.44499206542969, "learning_rate": 5.388127853881279e-06, "loss": 0.3203, "step": 591 }, { "epoch": 0.5406392694063927, "grad_norm": 61.73306655883789, "learning_rate": 5.397260273972603e-06, "loss": 0.5035, "step": 592 }, { "epoch": 0.5415525114155251, "grad_norm": 14.703970909118652, "learning_rate": 5.406392694063928e-06, "loss": 0.1531, "step": 593 }, { "epoch": 0.5424657534246575, "grad_norm": 6.391432285308838, "learning_rate": 5.415525114155251e-06, "loss": 0.0614, "step": 594 }, { "epoch": 0.54337899543379, "grad_norm": 20.11590576171875, "learning_rate": 5.424657534246576e-06, "loss": 0.2554, "step": 595 }, { "epoch": 0.5442922374429223, "grad_norm": 1.9555344581604004, "learning_rate": 5.4337899543379e-06, "loss": 0.0188, "step": 596 }, { "epoch": 0.5452054794520548, "grad_norm": 11.602871894836426, "learning_rate": 5.442922374429224e-06, "loss": 0.1282, "step": 597 }, { "epoch": 0.5461187214611872, "grad_norm": 102.5412368774414, "learning_rate": 5.452054794520548e-06, "loss": 0.6223, "step": 598 }, { "epoch": 0.5470319634703197, "grad_norm": 118.12004089355469, "learning_rate": 5.461187214611873e-06, "loss": 1.0346, "step": 599 }, { "epoch": 0.547945205479452, "grad_norm": 53.87303924560547, "learning_rate": 5.470319634703197e-06, "loss": 0.8151, "step": 600 }, { "epoch": 0.5488584474885845, "grad_norm": 138.2152099609375, "learning_rate": 5.479452054794521e-06, "loss": 3.8661, "step": 601 }, { "epoch": 0.5497716894977169, "grad_norm": 51.20602798461914, "learning_rate": 5.488584474885845e-06, "loss": 0.7614, "step": 602 }, { "epoch": 0.5506849315068493, "grad_norm": 93.51461791992188, "learning_rate": 5.49771689497717e-06, "loss": 2.3426, "step": 603 }, { "epoch": 0.5515981735159817, "grad_norm": 113.75943756103516, "learning_rate": 5.506849315068493e-06, "loss": 3.6936, "step": 604 }, { "epoch": 0.5525114155251142, "grad_norm": 10.359208106994629, "learning_rate": 5.515981735159818e-06, "loss": 0.1082, "step": 605 }, { "epoch": 0.5534246575342465, "grad_norm": 129.7666473388672, "learning_rate": 5.525114155251142e-06, "loss": 0.8537, "step": 606 }, { "epoch": 0.554337899543379, "grad_norm": 75.770751953125, "learning_rate": 5.534246575342466e-06, "loss": 1.3526, "step": 607 }, { "epoch": 0.5552511415525114, "grad_norm": 100.25353240966797, "learning_rate": 5.5433789954337904e-06, "loss": 0.9341, "step": 608 }, { "epoch": 0.5561643835616439, "grad_norm": 99.09423065185547, "learning_rate": 5.552511415525115e-06, "loss": 2.3151, "step": 609 }, { "epoch": 0.5570776255707762, "grad_norm": 4.200573444366455, "learning_rate": 5.561643835616439e-06, "loss": 0.0305, "step": 610 }, { "epoch": 0.5579908675799087, "grad_norm": 71.75627899169922, "learning_rate": 5.570776255707763e-06, "loss": 1.8687, "step": 611 }, { "epoch": 0.5589041095890411, "grad_norm": 3.497199773788452, "learning_rate": 5.5799086757990874e-06, "loss": 0.0322, "step": 612 }, { "epoch": 0.5598173515981735, "grad_norm": 10.46337890625, "learning_rate": 5.589041095890412e-06, "loss": 0.1059, "step": 613 }, { "epoch": 0.5607305936073059, "grad_norm": 65.29375457763672, "learning_rate": 5.5981735159817355e-06, "loss": 1.02, "step": 614 }, { "epoch": 0.5616438356164384, "grad_norm": 71.3831787109375, "learning_rate": 5.60730593607306e-06, "loss": 1.0442, "step": 615 }, { "epoch": 0.5625570776255707, "grad_norm": 74.18376922607422, "learning_rate": 5.6164383561643845e-06, "loss": 1.2953, "step": 616 }, { "epoch": 0.5634703196347032, "grad_norm": 9.343914031982422, "learning_rate": 5.625570776255708e-06, "loss": 0.0701, "step": 617 }, { "epoch": 0.5643835616438356, "grad_norm": 31.813344955444336, "learning_rate": 5.6347031963470325e-06, "loss": 0.4208, "step": 618 }, { "epoch": 0.5652968036529681, "grad_norm": 35.702205657958984, "learning_rate": 5.643835616438357e-06, "loss": 0.3154, "step": 619 }, { "epoch": 0.5662100456621004, "grad_norm": 12.789281845092773, "learning_rate": 5.6529680365296815e-06, "loss": 0.1199, "step": 620 }, { "epoch": 0.5671232876712329, "grad_norm": 51.71504592895508, "learning_rate": 5.662100456621005e-06, "loss": 0.323, "step": 621 }, { "epoch": 0.5680365296803653, "grad_norm": 158.00343322753906, "learning_rate": 5.6712328767123296e-06, "loss": 2.8636, "step": 622 }, { "epoch": 0.5689497716894977, "grad_norm": 420.9605712890625, "learning_rate": 5.680365296803654e-06, "loss": 2.2397, "step": 623 }, { "epoch": 0.5698630136986301, "grad_norm": 54.04663848876953, "learning_rate": 5.689497716894977e-06, "loss": 0.5974, "step": 624 }, { "epoch": 0.5707762557077626, "grad_norm": 22.023073196411133, "learning_rate": 5.698630136986302e-06, "loss": 0.2552, "step": 625 }, { "epoch": 0.5716894977168949, "grad_norm": 23.038076400756836, "learning_rate": 5.7077625570776266e-06, "loss": 0.2328, "step": 626 }, { "epoch": 0.5726027397260274, "grad_norm": 41.9617919921875, "learning_rate": 5.716894977168949e-06, "loss": 0.3534, "step": 627 }, { "epoch": 0.5735159817351598, "grad_norm": 20.247339248657227, "learning_rate": 5.726027397260274e-06, "loss": 0.1718, "step": 628 }, { "epoch": 0.5744292237442923, "grad_norm": 48.46094512939453, "learning_rate": 5.735159817351599e-06, "loss": 0.6787, "step": 629 }, { "epoch": 0.5753424657534246, "grad_norm": 83.17173767089844, "learning_rate": 5.744292237442924e-06, "loss": 2.4932, "step": 630 }, { "epoch": 0.5762557077625571, "grad_norm": 39.00690841674805, "learning_rate": 5.753424657534246e-06, "loss": 0.3426, "step": 631 }, { "epoch": 0.5771689497716895, "grad_norm": 23.269601821899414, "learning_rate": 5.762557077625572e-06, "loss": 0.1675, "step": 632 }, { "epoch": 0.5780821917808219, "grad_norm": 15.27685260772705, "learning_rate": 5.771689497716896e-06, "loss": 0.1296, "step": 633 }, { "epoch": 0.5789954337899543, "grad_norm": 67.93006134033203, "learning_rate": 5.780821917808219e-06, "loss": 0.9989, "step": 634 }, { "epoch": 0.5799086757990868, "grad_norm": 13.199803352355957, "learning_rate": 5.789954337899543e-06, "loss": 0.1064, "step": 635 }, { "epoch": 0.5808219178082191, "grad_norm": 7.691871643066406, "learning_rate": 5.799086757990869e-06, "loss": 0.0489, "step": 636 }, { "epoch": 0.5817351598173516, "grad_norm": 108.95758056640625, "learning_rate": 5.8082191780821915e-06, "loss": 2.0501, "step": 637 }, { "epoch": 0.582648401826484, "grad_norm": 43.49346160888672, "learning_rate": 5.817351598173516e-06, "loss": 0.5185, "step": 638 }, { "epoch": 0.5835616438356165, "grad_norm": 38.63212203979492, "learning_rate": 5.82648401826484e-06, "loss": 0.1516, "step": 639 }, { "epoch": 0.5844748858447488, "grad_norm": 12.070369720458984, "learning_rate": 5.835616438356166e-06, "loss": 0.1297, "step": 640 }, { "epoch": 0.5853881278538813, "grad_norm": 13.03789234161377, "learning_rate": 5.8447488584474885e-06, "loss": 0.0784, "step": 641 }, { "epoch": 0.5863013698630137, "grad_norm": 89.5448226928711, "learning_rate": 5.853881278538813e-06, "loss": 1.4512, "step": 642 }, { "epoch": 0.5872146118721461, "grad_norm": 51.449092864990234, "learning_rate": 5.863013698630137e-06, "loss": 0.5801, "step": 643 }, { "epoch": 0.5881278538812785, "grad_norm": 131.5089569091797, "learning_rate": 5.872146118721461e-06, "loss": 2.1322, "step": 644 }, { "epoch": 0.589041095890411, "grad_norm": 103.15483856201172, "learning_rate": 5.8812785388127855e-06, "loss": 2.0786, "step": 645 }, { "epoch": 0.5899543378995433, "grad_norm": 84.44374084472656, "learning_rate": 5.89041095890411e-06, "loss": 2.5609, "step": 646 }, { "epoch": 0.5908675799086758, "grad_norm": 5.0140485763549805, "learning_rate": 5.8995433789954336e-06, "loss": 0.0275, "step": 647 }, { "epoch": 0.5917808219178082, "grad_norm": 10.17607593536377, "learning_rate": 5.908675799086758e-06, "loss": 0.0925, "step": 648 }, { "epoch": 0.5926940639269407, "grad_norm": 5.260303020477295, "learning_rate": 5.9178082191780825e-06, "loss": 0.0421, "step": 649 }, { "epoch": 0.593607305936073, "grad_norm": 114.2519760131836, "learning_rate": 5.926940639269407e-06, "loss": 0.6028, "step": 650 }, { "epoch": 0.5945205479452055, "grad_norm": 2.7146098613739014, "learning_rate": 5.936073059360731e-06, "loss": 0.0171, "step": 651 }, { "epoch": 0.5954337899543379, "grad_norm": 59.947715759277344, "learning_rate": 5.945205479452055e-06, "loss": 1.2505, "step": 652 }, { "epoch": 0.5963470319634703, "grad_norm": 34.647586822509766, "learning_rate": 5.9543378995433795e-06, "loss": 0.4719, "step": 653 }, { "epoch": 0.5972602739726027, "grad_norm": 33.86647033691406, "learning_rate": 5.963470319634703e-06, "loss": 0.3413, "step": 654 }, { "epoch": 0.5981735159817352, "grad_norm": 12.830849647521973, "learning_rate": 5.972602739726028e-06, "loss": 0.077, "step": 655 }, { "epoch": 0.5990867579908675, "grad_norm": 59.27029800415039, "learning_rate": 5.981735159817352e-06, "loss": 0.3765, "step": 656 }, { "epoch": 0.6, "grad_norm": 49.94404983520508, "learning_rate": 5.990867579908676e-06, "loss": 0.4392, "step": 657 }, { "epoch": 0.6009132420091324, "grad_norm": 37.47412872314453, "learning_rate": 6e-06, "loss": 0.2624, "step": 658 }, { "epoch": 0.6018264840182649, "grad_norm": 107.2572021484375, "learning_rate": 6.009132420091325e-06, "loss": 0.696, "step": 659 }, { "epoch": 0.6027397260273972, "grad_norm": 74.31698608398438, "learning_rate": 6.018264840182649e-06, "loss": 1.3177, "step": 660 }, { "epoch": 0.6036529680365297, "grad_norm": 89.86529541015625, "learning_rate": 6.027397260273973e-06, "loss": 2.6322, "step": 661 }, { "epoch": 0.6045662100456621, "grad_norm": 36.94347381591797, "learning_rate": 6.036529680365297e-06, "loss": 0.2521, "step": 662 }, { "epoch": 0.6054794520547945, "grad_norm": 82.0298843383789, "learning_rate": 6.045662100456622e-06, "loss": 2.9425, "step": 663 }, { "epoch": 0.6063926940639269, "grad_norm": 30.658309936523438, "learning_rate": 6.054794520547945e-06, "loss": 0.3148, "step": 664 }, { "epoch": 0.6073059360730594, "grad_norm": 4.137408256530762, "learning_rate": 6.06392694063927e-06, "loss": 0.0307, "step": 665 }, { "epoch": 0.6082191780821918, "grad_norm": 32.02985763549805, "learning_rate": 6.073059360730594e-06, "loss": 0.2422, "step": 666 }, { "epoch": 0.6091324200913242, "grad_norm": 71.75879669189453, "learning_rate": 6.082191780821919e-06, "loss": 0.8436, "step": 667 }, { "epoch": 0.6100456621004566, "grad_norm": 87.88565063476562, "learning_rate": 6.091324200913242e-06, "loss": 6.261, "step": 668 }, { "epoch": 0.6109589041095891, "grad_norm": 23.999401092529297, "learning_rate": 6.100456621004567e-06, "loss": 0.2075, "step": 669 }, { "epoch": 0.6118721461187214, "grad_norm": 7.400008678436279, "learning_rate": 6.109589041095891e-06, "loss": 0.069, "step": 670 }, { "epoch": 0.6127853881278539, "grad_norm": 30.395950317382812, "learning_rate": 6.118721461187215e-06, "loss": 0.1842, "step": 671 }, { "epoch": 0.6136986301369863, "grad_norm": 16.997289657592773, "learning_rate": 6.127853881278539e-06, "loss": 0.1189, "step": 672 }, { "epoch": 0.6146118721461187, "grad_norm": 222.5692901611328, "learning_rate": 6.136986301369864e-06, "loss": 0.6344, "step": 673 }, { "epoch": 0.6155251141552511, "grad_norm": 6.103581428527832, "learning_rate": 6.146118721461187e-06, "loss": 0.0673, "step": 674 }, { "epoch": 0.6164383561643836, "grad_norm": 45.833194732666016, "learning_rate": 6.155251141552512e-06, "loss": 0.4672, "step": 675 }, { "epoch": 0.617351598173516, "grad_norm": 104.49969482421875, "learning_rate": 6.164383561643836e-06, "loss": 1.4774, "step": 676 }, { "epoch": 0.6182648401826484, "grad_norm": 26.78767967224121, "learning_rate": 6.173515981735161e-06, "loss": 0.2922, "step": 677 }, { "epoch": 0.6191780821917808, "grad_norm": 72.42552947998047, "learning_rate": 6.182648401826484e-06, "loss": 1.5342, "step": 678 }, { "epoch": 0.6200913242009133, "grad_norm": 33.32026290893555, "learning_rate": 6.191780821917809e-06, "loss": 0.3992, "step": 679 }, { "epoch": 0.6210045662100456, "grad_norm": 76.49042510986328, "learning_rate": 6.200913242009133e-06, "loss": 0.5747, "step": 680 }, { "epoch": 0.6219178082191781, "grad_norm": 5.051244258880615, "learning_rate": 6.210045662100457e-06, "loss": 0.0382, "step": 681 }, { "epoch": 0.6228310502283105, "grad_norm": 110.7624740600586, "learning_rate": 6.219178082191781e-06, "loss": 3.2644, "step": 682 }, { "epoch": 0.6237442922374429, "grad_norm": 7.914773464202881, "learning_rate": 6.228310502283106e-06, "loss": 0.0578, "step": 683 }, { "epoch": 0.6246575342465753, "grad_norm": 30.735578536987305, "learning_rate": 6.2374429223744295e-06, "loss": 0.399, "step": 684 }, { "epoch": 0.6255707762557078, "grad_norm": 40.25942611694336, "learning_rate": 6.246575342465754e-06, "loss": 0.2454, "step": 685 }, { "epoch": 0.6264840182648402, "grad_norm": 68.85195922851562, "learning_rate": 6.255707762557078e-06, "loss": 0.7263, "step": 686 }, { "epoch": 0.6273972602739726, "grad_norm": 13.469148635864258, "learning_rate": 6.264840182648403e-06, "loss": 0.1068, "step": 687 }, { "epoch": 0.628310502283105, "grad_norm": 5.528262615203857, "learning_rate": 6.2739726027397265e-06, "loss": 0.0517, "step": 688 }, { "epoch": 0.6292237442922375, "grad_norm": 18.469236373901367, "learning_rate": 6.283105022831051e-06, "loss": 0.1695, "step": 689 }, { "epoch": 0.6301369863013698, "grad_norm": 39.63064956665039, "learning_rate": 6.292237442922375e-06, "loss": 0.3574, "step": 690 }, { "epoch": 0.6310502283105023, "grad_norm": 55.685508728027344, "learning_rate": 6.301369863013699e-06, "loss": 0.4657, "step": 691 }, { "epoch": 0.6319634703196347, "grad_norm": 103.70734405517578, "learning_rate": 6.3105022831050235e-06, "loss": 1.4343, "step": 692 }, { "epoch": 0.6328767123287671, "grad_norm": 29.32004165649414, "learning_rate": 6.319634703196348e-06, "loss": 0.4078, "step": 693 }, { "epoch": 0.6337899543378995, "grad_norm": 136.18115234375, "learning_rate": 6.328767123287672e-06, "loss": 3.5814, "step": 694 }, { "epoch": 0.634703196347032, "grad_norm": 86.11778259277344, "learning_rate": 6.337899543378996e-06, "loss": 0.6113, "step": 695 }, { "epoch": 0.6356164383561644, "grad_norm": 59.88497543334961, "learning_rate": 6.3470319634703205e-06, "loss": 1.3761, "step": 696 }, { "epoch": 0.6365296803652968, "grad_norm": 2.2435152530670166, "learning_rate": 6.356164383561645e-06, "loss": 0.0178, "step": 697 }, { "epoch": 0.6374429223744292, "grad_norm": 74.35299682617188, "learning_rate": 6.365296803652969e-06, "loss": 1.7468, "step": 698 }, { "epoch": 0.6383561643835617, "grad_norm": 100.31206512451172, "learning_rate": 6.374429223744293e-06, "loss": 1.2753, "step": 699 }, { "epoch": 0.639269406392694, "grad_norm": 68.11550903320312, "learning_rate": 6.3835616438356175e-06, "loss": 0.8741, "step": 700 }, { "epoch": 0.6401826484018265, "grad_norm": 44.75593185424805, "learning_rate": 6.392694063926941e-06, "loss": 0.4522, "step": 701 }, { "epoch": 0.6410958904109589, "grad_norm": 29.305349349975586, "learning_rate": 6.401826484018266e-06, "loss": 0.2059, "step": 702 }, { "epoch": 0.6420091324200913, "grad_norm": 55.523155212402344, "learning_rate": 6.41095890410959e-06, "loss": 0.4202, "step": 703 }, { "epoch": 0.6429223744292237, "grad_norm": 76.26568603515625, "learning_rate": 6.420091324200914e-06, "loss": 2.8598, "step": 704 }, { "epoch": 0.6438356164383562, "grad_norm": 29.319143295288086, "learning_rate": 6.429223744292238e-06, "loss": 0.2647, "step": 705 }, { "epoch": 0.6447488584474886, "grad_norm": 125.36869049072266, "learning_rate": 6.438356164383563e-06, "loss": 5.9593, "step": 706 }, { "epoch": 0.645662100456621, "grad_norm": 9.248749732971191, "learning_rate": 6.447488584474887e-06, "loss": 0.0684, "step": 707 }, { "epoch": 0.6465753424657534, "grad_norm": 54.652137756347656, "learning_rate": 6.456621004566211e-06, "loss": 0.6732, "step": 708 }, { "epoch": 0.6474885844748859, "grad_norm": 51.53041076660156, "learning_rate": 6.465753424657535e-06, "loss": 0.4567, "step": 709 }, { "epoch": 0.6484018264840182, "grad_norm": 21.30830192565918, "learning_rate": 6.47488584474886e-06, "loss": 0.2276, "step": 710 }, { "epoch": 0.6493150684931507, "grad_norm": 121.29090881347656, "learning_rate": 6.484018264840182e-06, "loss": 4.5455, "step": 711 }, { "epoch": 0.6502283105022831, "grad_norm": 34.73987579345703, "learning_rate": 6.493150684931508e-06, "loss": 0.3418, "step": 712 }, { "epoch": 0.6511415525114155, "grad_norm": 48.02368927001953, "learning_rate": 6.502283105022832e-06, "loss": 0.6841, "step": 713 }, { "epoch": 0.6520547945205479, "grad_norm": 2.176788091659546, "learning_rate": 6.511415525114155e-06, "loss": 0.0088, "step": 714 }, { "epoch": 0.6529680365296804, "grad_norm": 93.09564971923828, "learning_rate": 6.5205479452054794e-06, "loss": 2.1544, "step": 715 }, { "epoch": 0.6538812785388128, "grad_norm": 22.719409942626953, "learning_rate": 6.529680365296805e-06, "loss": 0.2553, "step": 716 }, { "epoch": 0.6547945205479452, "grad_norm": 60.07933044433594, "learning_rate": 6.538812785388129e-06, "loss": 1.7106, "step": 717 }, { "epoch": 0.6557077625570776, "grad_norm": 16.383481979370117, "learning_rate": 6.547945205479452e-06, "loss": 0.1751, "step": 718 }, { "epoch": 0.6566210045662101, "grad_norm": 10.928925514221191, "learning_rate": 6.5570776255707765e-06, "loss": 0.1463, "step": 719 }, { "epoch": 0.6575342465753424, "grad_norm": 225.9449462890625, "learning_rate": 6.566210045662102e-06, "loss": 1.6863, "step": 720 }, { "epoch": 0.6584474885844749, "grad_norm": 29.39130210876465, "learning_rate": 6.5753424657534245e-06, "loss": 0.3788, "step": 721 }, { "epoch": 0.6593607305936073, "grad_norm": 51.85538864135742, "learning_rate": 6.584474885844749e-06, "loss": 0.9818, "step": 722 }, { "epoch": 0.6602739726027397, "grad_norm": 54.62551498413086, "learning_rate": 6.593607305936074e-06, "loss": 1.0698, "step": 723 }, { "epoch": 0.6611872146118721, "grad_norm": 14.397849082946777, "learning_rate": 6.602739726027397e-06, "loss": 0.1231, "step": 724 }, { "epoch": 0.6621004566210046, "grad_norm": 152.4778289794922, "learning_rate": 6.6118721461187215e-06, "loss": 1.2538, "step": 725 }, { "epoch": 0.663013698630137, "grad_norm": 186.18536376953125, "learning_rate": 6.621004566210046e-06, "loss": 1.8257, "step": 726 }, { "epoch": 0.6639269406392694, "grad_norm": 18.526653289794922, "learning_rate": 6.630136986301371e-06, "loss": 0.1346, "step": 727 }, { "epoch": 0.6648401826484018, "grad_norm": 54.73551559448242, "learning_rate": 6.639269406392694e-06, "loss": 1.2106, "step": 728 }, { "epoch": 0.6657534246575343, "grad_norm": 87.05229187011719, "learning_rate": 6.6484018264840186e-06, "loss": 5.8843, "step": 729 }, { "epoch": 0.6666666666666666, "grad_norm": 67.18157958984375, "learning_rate": 6.657534246575343e-06, "loss": 1.2837, "step": 730 }, { "epoch": 0.6675799086757991, "grad_norm": 4.789803981781006, "learning_rate": 6.666666666666667e-06, "loss": 0.043, "step": 731 }, { "epoch": 0.6684931506849315, "grad_norm": 32.38039016723633, "learning_rate": 6.675799086757991e-06, "loss": 0.2555, "step": 732 }, { "epoch": 0.6694063926940639, "grad_norm": 51.865230560302734, "learning_rate": 6.684931506849316e-06, "loss": 0.7784, "step": 733 }, { "epoch": 0.6703196347031963, "grad_norm": 75.58180236816406, "learning_rate": 6.694063926940639e-06, "loss": 1.5669, "step": 734 }, { "epoch": 0.6712328767123288, "grad_norm": 19.203346252441406, "learning_rate": 6.703196347031964e-06, "loss": 0.1364, "step": 735 }, { "epoch": 0.6721461187214612, "grad_norm": 110.83204650878906, "learning_rate": 6.712328767123288e-06, "loss": 3.7913, "step": 736 }, { "epoch": 0.6730593607305936, "grad_norm": 18.850107192993164, "learning_rate": 6.721461187214613e-06, "loss": 0.2072, "step": 737 }, { "epoch": 0.673972602739726, "grad_norm": 19.391618728637695, "learning_rate": 6.730593607305936e-06, "loss": 0.2003, "step": 738 }, { "epoch": 0.6748858447488585, "grad_norm": 19.876264572143555, "learning_rate": 6.739726027397261e-06, "loss": 0.1772, "step": 739 }, { "epoch": 0.6757990867579908, "grad_norm": 77.78189849853516, "learning_rate": 6.748858447488585e-06, "loss": 1.731, "step": 740 }, { "epoch": 0.6767123287671233, "grad_norm": 20.05293083190918, "learning_rate": 6.757990867579909e-06, "loss": 0.2006, "step": 741 }, { "epoch": 0.6776255707762557, "grad_norm": 76.43462371826172, "learning_rate": 6.767123287671233e-06, "loss": 2.2098, "step": 742 }, { "epoch": 0.6785388127853881, "grad_norm": 39.44417953491211, "learning_rate": 6.776255707762558e-06, "loss": 0.4063, "step": 743 }, { "epoch": 0.6794520547945205, "grad_norm": 85.26068878173828, "learning_rate": 6.785388127853881e-06, "loss": 1.3734, "step": 744 }, { "epoch": 0.680365296803653, "grad_norm": 15.553995132446289, "learning_rate": 6.794520547945206e-06, "loss": 0.2678, "step": 745 }, { "epoch": 0.6812785388127854, "grad_norm": 1.352978229522705, "learning_rate": 6.80365296803653e-06, "loss": 0.0128, "step": 746 }, { "epoch": 0.6821917808219178, "grad_norm": 11.593868255615234, "learning_rate": 6.812785388127855e-06, "loss": 0.0641, "step": 747 }, { "epoch": 0.6831050228310502, "grad_norm": 18.21935272216797, "learning_rate": 6.821917808219178e-06, "loss": 0.2175, "step": 748 }, { "epoch": 0.6840182648401827, "grad_norm": 75.02354431152344, "learning_rate": 6.831050228310503e-06, "loss": 1.3184, "step": 749 }, { "epoch": 0.684931506849315, "grad_norm": 17.712610244750977, "learning_rate": 6.840182648401827e-06, "loss": 0.2008, "step": 750 }, { "epoch": 0.6858447488584475, "grad_norm": 56.5174560546875, "learning_rate": 6.849315068493151e-06, "loss": 1.0608, "step": 751 }, { "epoch": 0.6867579908675799, "grad_norm": 0.420840859413147, "learning_rate": 6.858447488584475e-06, "loss": 0.004, "step": 752 }, { "epoch": 0.6876712328767123, "grad_norm": 81.29444122314453, "learning_rate": 6.8675799086758e-06, "loss": 1.8439, "step": 753 }, { "epoch": 0.6885844748858447, "grad_norm": 102.45991516113281, "learning_rate": 6.876712328767123e-06, "loss": 1.709, "step": 754 }, { "epoch": 0.6894977168949772, "grad_norm": 23.705223083496094, "learning_rate": 6.885844748858448e-06, "loss": 0.2248, "step": 755 }, { "epoch": 0.6904109589041096, "grad_norm": 27.318584442138672, "learning_rate": 6.894977168949772e-06, "loss": 0.3984, "step": 756 }, { "epoch": 0.691324200913242, "grad_norm": 9.578939437866211, "learning_rate": 6.904109589041097e-06, "loss": 0.0889, "step": 757 }, { "epoch": 0.6922374429223744, "grad_norm": 31.80471420288086, "learning_rate": 6.9132420091324204e-06, "loss": 0.5046, "step": 758 }, { "epoch": 0.6931506849315069, "grad_norm": 23.249467849731445, "learning_rate": 6.922374429223745e-06, "loss": 0.28, "step": 759 }, { "epoch": 0.6940639269406392, "grad_norm": 76.36686706542969, "learning_rate": 6.931506849315069e-06, "loss": 1.4543, "step": 760 }, { "epoch": 0.6949771689497717, "grad_norm": 62.21699905395508, "learning_rate": 6.940639269406393e-06, "loss": 1.2651, "step": 761 }, { "epoch": 0.6958904109589041, "grad_norm": 3.6439366340637207, "learning_rate": 6.9497716894977175e-06, "loss": 0.0253, "step": 762 }, { "epoch": 0.6968036529680365, "grad_norm": 50.612701416015625, "learning_rate": 6.958904109589042e-06, "loss": 0.5398, "step": 763 }, { "epoch": 0.6977168949771689, "grad_norm": 29.45757293701172, "learning_rate": 6.9680365296803655e-06, "loss": 0.3052, "step": 764 }, { "epoch": 0.6986301369863014, "grad_norm": 1.3611215353012085, "learning_rate": 6.97716894977169e-06, "loss": 0.0132, "step": 765 }, { "epoch": 0.6995433789954338, "grad_norm": 16.462736129760742, "learning_rate": 6.9863013698630145e-06, "loss": 0.1883, "step": 766 }, { "epoch": 0.7004566210045662, "grad_norm": 35.469791412353516, "learning_rate": 6.995433789954339e-06, "loss": 0.4302, "step": 767 }, { "epoch": 0.7013698630136986, "grad_norm": 80.5223388671875, "learning_rate": 7.0045662100456626e-06, "loss": 1.4847, "step": 768 }, { "epoch": 0.7022831050228311, "grad_norm": 48.12818908691406, "learning_rate": 7.013698630136987e-06, "loss": 0.7874, "step": 769 }, { "epoch": 0.7031963470319634, "grad_norm": 72.35318756103516, "learning_rate": 7.0228310502283115e-06, "loss": 1.3635, "step": 770 }, { "epoch": 0.7041095890410959, "grad_norm": 12.12908935546875, "learning_rate": 7.031963470319635e-06, "loss": 0.1378, "step": 771 }, { "epoch": 0.7050228310502283, "grad_norm": 49.895164489746094, "learning_rate": 7.0410958904109596e-06, "loss": 0.4423, "step": 772 }, { "epoch": 0.7059360730593607, "grad_norm": 72.8347396850586, "learning_rate": 7.050228310502284e-06, "loss": 1.0654, "step": 773 }, { "epoch": 0.7068493150684931, "grad_norm": 50.26493835449219, "learning_rate": 7.059360730593608e-06, "loss": 0.6451, "step": 774 }, { "epoch": 0.7077625570776256, "grad_norm": 23.901958465576172, "learning_rate": 7.068493150684932e-06, "loss": 0.2259, "step": 775 }, { "epoch": 0.708675799086758, "grad_norm": 4.540143966674805, "learning_rate": 7.077625570776257e-06, "loss": 0.0427, "step": 776 }, { "epoch": 0.7095890410958904, "grad_norm": 5.269284725189209, "learning_rate": 7.086757990867581e-06, "loss": 0.0514, "step": 777 }, { "epoch": 0.7105022831050228, "grad_norm": 46.46390151977539, "learning_rate": 7.095890410958905e-06, "loss": 0.1924, "step": 778 }, { "epoch": 0.7114155251141553, "grad_norm": 4.731640815734863, "learning_rate": 7.105022831050229e-06, "loss": 0.034, "step": 779 }, { "epoch": 0.7123287671232876, "grad_norm": 76.2733154296875, "learning_rate": 7.114155251141554e-06, "loss": 1.6765, "step": 780 }, { "epoch": 0.7132420091324201, "grad_norm": 4.093043804168701, "learning_rate": 7.123287671232877e-06, "loss": 0.0295, "step": 781 }, { "epoch": 0.7141552511415525, "grad_norm": 89.35159301757812, "learning_rate": 7.132420091324202e-06, "loss": 0.6027, "step": 782 }, { "epoch": 0.7150684931506849, "grad_norm": 65.43269348144531, "learning_rate": 7.141552511415526e-06, "loss": 0.5951, "step": 783 }, { "epoch": 0.7159817351598173, "grad_norm": 49.15220642089844, "learning_rate": 7.15068493150685e-06, "loss": 0.521, "step": 784 }, { "epoch": 0.7168949771689498, "grad_norm": 94.31409454345703, "learning_rate": 7.159817351598174e-06, "loss": 1.6965, "step": 785 }, { "epoch": 0.7178082191780822, "grad_norm": 68.35287475585938, "learning_rate": 7.168949771689499e-06, "loss": 0.8658, "step": 786 }, { "epoch": 0.7187214611872146, "grad_norm": 51.451087951660156, "learning_rate": 7.178082191780823e-06, "loss": 0.5375, "step": 787 }, { "epoch": 0.719634703196347, "grad_norm": 3.395291805267334, "learning_rate": 7.187214611872147e-06, "loss": 0.0222, "step": 788 }, { "epoch": 0.7205479452054795, "grad_norm": 181.33543395996094, "learning_rate": 7.196347031963471e-06, "loss": 4.0141, "step": 789 }, { "epoch": 0.7214611872146118, "grad_norm": 72.24095153808594, "learning_rate": 7.205479452054796e-06, "loss": 1.3899, "step": 790 }, { "epoch": 0.7223744292237443, "grad_norm": 1.8191558122634888, "learning_rate": 7.214611872146119e-06, "loss": 0.017, "step": 791 }, { "epoch": 0.7232876712328767, "grad_norm": 41.806129455566406, "learning_rate": 7.223744292237444e-06, "loss": 0.5773, "step": 792 }, { "epoch": 0.7242009132420091, "grad_norm": 60.515106201171875, "learning_rate": 7.232876712328768e-06, "loss": 0.8215, "step": 793 }, { "epoch": 0.7251141552511415, "grad_norm": 0.4063633680343628, "learning_rate": 7.242009132420091e-06, "loss": 0.0039, "step": 794 }, { "epoch": 0.726027397260274, "grad_norm": 56.64303970336914, "learning_rate": 7.251141552511416e-06, "loss": 0.8959, "step": 795 }, { "epoch": 0.7269406392694064, "grad_norm": 31.166954040527344, "learning_rate": 7.260273972602741e-06, "loss": 0.3992, "step": 796 }, { "epoch": 0.7278538812785388, "grad_norm": 71.80884552001953, "learning_rate": 7.269406392694065e-06, "loss": 1.5443, "step": 797 }, { "epoch": 0.7287671232876712, "grad_norm": 46.326839447021484, "learning_rate": 7.278538812785388e-06, "loss": 0.5639, "step": 798 }, { "epoch": 0.7296803652968037, "grad_norm": 45.73426055908203, "learning_rate": 7.287671232876713e-06, "loss": 0.7943, "step": 799 }, { "epoch": 0.730593607305936, "grad_norm": 10.512808799743652, "learning_rate": 7.296803652968038e-06, "loss": 0.0817, "step": 800 }, { "epoch": 0.7315068493150685, "grad_norm": 17.731801986694336, "learning_rate": 7.305936073059361e-06, "loss": 0.6907, "step": 801 }, { "epoch": 0.7324200913242009, "grad_norm": 0.8850480318069458, "learning_rate": 7.315068493150685e-06, "loss": 0.0097, "step": 802 }, { "epoch": 0.7333333333333333, "grad_norm": 17.45279312133789, "learning_rate": 7.32420091324201e-06, "loss": 0.2381, "step": 803 }, { "epoch": 0.7342465753424657, "grad_norm": 38.780757904052734, "learning_rate": 7.333333333333333e-06, "loss": 0.5285, "step": 804 }, { "epoch": 0.7351598173515982, "grad_norm": 25.51468276977539, "learning_rate": 7.342465753424658e-06, "loss": 0.1902, "step": 805 }, { "epoch": 0.7360730593607306, "grad_norm": 78.06217193603516, "learning_rate": 7.351598173515982e-06, "loss": 1.3697, "step": 806 }, { "epoch": 0.736986301369863, "grad_norm": 74.38336181640625, "learning_rate": 7.360730593607307e-06, "loss": 0.9476, "step": 807 }, { "epoch": 0.7378995433789954, "grad_norm": 146.5282440185547, "learning_rate": 7.36986301369863e-06, "loss": 1.9756, "step": 808 }, { "epoch": 0.7388127853881279, "grad_norm": 1.3368157148361206, "learning_rate": 7.378995433789955e-06, "loss": 0.0137, "step": 809 }, { "epoch": 0.7397260273972602, "grad_norm": 53.878719329833984, "learning_rate": 7.388127853881279e-06, "loss": 1.1495, "step": 810 }, { "epoch": 0.7406392694063927, "grad_norm": 27.839439392089844, "learning_rate": 7.397260273972603e-06, "loss": 0.4471, "step": 811 }, { "epoch": 0.7415525114155251, "grad_norm": 56.839942932128906, "learning_rate": 7.406392694063927e-06, "loss": 0.6532, "step": 812 }, { "epoch": 0.7424657534246575, "grad_norm": 41.6190299987793, "learning_rate": 7.415525114155252e-06, "loss": 0.9412, "step": 813 }, { "epoch": 0.7433789954337899, "grad_norm": 35.71660614013672, "learning_rate": 7.424657534246575e-06, "loss": 0.4096, "step": 814 }, { "epoch": 0.7442922374429224, "grad_norm": 4.349606513977051, "learning_rate": 7.4337899543379e-06, "loss": 0.0585, "step": 815 }, { "epoch": 0.7452054794520548, "grad_norm": 74.30144500732422, "learning_rate": 7.442922374429224e-06, "loss": 1.1566, "step": 816 }, { "epoch": 0.7461187214611872, "grad_norm": 31.598617553710938, "learning_rate": 7.452054794520549e-06, "loss": 0.23, "step": 817 }, { "epoch": 0.7470319634703196, "grad_norm": 61.847686767578125, "learning_rate": 7.461187214611872e-06, "loss": 1.0406, "step": 818 }, { "epoch": 0.7479452054794521, "grad_norm": 70.2628402709961, "learning_rate": 7.470319634703197e-06, "loss": 1.7974, "step": 819 }, { "epoch": 0.7488584474885844, "grad_norm": 25.507667541503906, "learning_rate": 7.479452054794521e-06, "loss": 0.3389, "step": 820 }, { "epoch": 0.7497716894977169, "grad_norm": 1.494282603263855, "learning_rate": 7.488584474885845e-06, "loss": 0.0163, "step": 821 }, { "epoch": 0.7506849315068493, "grad_norm": 5.9981865882873535, "learning_rate": 7.497716894977169e-06, "loss": 0.0637, "step": 822 }, { "epoch": 0.7515981735159817, "grad_norm": 1.203965663909912, "learning_rate": 7.506849315068494e-06, "loss": 0.0139, "step": 823 }, { "epoch": 0.7525114155251141, "grad_norm": 40.46773147583008, "learning_rate": 7.515981735159817e-06, "loss": 0.3273, "step": 824 }, { "epoch": 0.7534246575342466, "grad_norm": 57.76207733154297, "learning_rate": 7.525114155251142e-06, "loss": 1.7633, "step": 825 }, { "epoch": 0.754337899543379, "grad_norm": 29.157705307006836, "learning_rate": 7.534246575342466e-06, "loss": 0.4148, "step": 826 }, { "epoch": 0.7552511415525114, "grad_norm": 64.84636688232422, "learning_rate": 7.543378995433791e-06, "loss": 0.8333, "step": 827 }, { "epoch": 0.7561643835616438, "grad_norm": 37.79376983642578, "learning_rate": 7.552511415525114e-06, "loss": 0.29, "step": 828 }, { "epoch": 0.7570776255707763, "grad_norm": 2.3556969165802, "learning_rate": 7.561643835616439e-06, "loss": 0.0161, "step": 829 }, { "epoch": 0.7579908675799086, "grad_norm": 10.036351203918457, "learning_rate": 7.570776255707763e-06, "loss": 0.0867, "step": 830 }, { "epoch": 0.7589041095890411, "grad_norm": 12.992441177368164, "learning_rate": 7.579908675799087e-06, "loss": 0.12, "step": 831 }, { "epoch": 0.7598173515981735, "grad_norm": 70.09133911132812, "learning_rate": 7.589041095890411e-06, "loss": 0.9791, "step": 832 }, { "epoch": 0.7607305936073059, "grad_norm": 117.23416900634766, "learning_rate": 7.598173515981736e-06, "loss": 0.6044, "step": 833 }, { "epoch": 0.7616438356164383, "grad_norm": 13.131667137145996, "learning_rate": 7.6073059360730595e-06, "loss": 0.0676, "step": 834 }, { "epoch": 0.7625570776255708, "grad_norm": 24.759632110595703, "learning_rate": 7.616438356164384e-06, "loss": 0.3089, "step": 835 }, { "epoch": 0.7634703196347032, "grad_norm": 24.51546287536621, "learning_rate": 7.625570776255708e-06, "loss": 0.2178, "step": 836 }, { "epoch": 0.7643835616438356, "grad_norm": 29.57969093322754, "learning_rate": 7.634703196347033e-06, "loss": 0.4208, "step": 837 }, { "epoch": 0.765296803652968, "grad_norm": 5.768768310546875, "learning_rate": 7.643835616438356e-06, "loss": 0.068, "step": 838 }, { "epoch": 0.7662100456621005, "grad_norm": 56.811859130859375, "learning_rate": 7.652968036529682e-06, "loss": 0.895, "step": 839 }, { "epoch": 0.7671232876712328, "grad_norm": 22.25271987915039, "learning_rate": 7.662100456621005e-06, "loss": 0.2339, "step": 840 }, { "epoch": 0.7680365296803653, "grad_norm": 59.75829315185547, "learning_rate": 7.671232876712329e-06, "loss": 0.7298, "step": 841 }, { "epoch": 0.7689497716894977, "grad_norm": 93.41728210449219, "learning_rate": 7.680365296803653e-06, "loss": 3.1627, "step": 842 }, { "epoch": 0.7698630136986301, "grad_norm": 58.349693298339844, "learning_rate": 7.689497716894978e-06, "loss": 0.6429, "step": 843 }, { "epoch": 0.7707762557077625, "grad_norm": 74.78717803955078, "learning_rate": 7.698630136986302e-06, "loss": 4.2642, "step": 844 }, { "epoch": 0.771689497716895, "grad_norm": 27.12296485900879, "learning_rate": 7.707762557077625e-06, "loss": 0.2216, "step": 845 }, { "epoch": 0.7726027397260274, "grad_norm": 2.235957622528076, "learning_rate": 7.71689497716895e-06, "loss": 0.0206, "step": 846 }, { "epoch": 0.7735159817351598, "grad_norm": 20.594039916992188, "learning_rate": 7.726027397260276e-06, "loss": 0.2301, "step": 847 }, { "epoch": 0.7744292237442922, "grad_norm": 5.405617713928223, "learning_rate": 7.735159817351598e-06, "loss": 0.0504, "step": 848 }, { "epoch": 0.7753424657534247, "grad_norm": 93.87007141113281, "learning_rate": 7.744292237442923e-06, "loss": 2.1932, "step": 849 }, { "epoch": 0.776255707762557, "grad_norm": 145.31246948242188, "learning_rate": 7.753424657534248e-06, "loss": 3.5402, "step": 850 }, { "epoch": 0.7771689497716895, "grad_norm": 72.2968978881836, "learning_rate": 7.76255707762557e-06, "loss": 2.6043, "step": 851 }, { "epoch": 0.7780821917808219, "grad_norm": 28.888498306274414, "learning_rate": 7.771689497716896e-06, "loss": 0.433, "step": 852 }, { "epoch": 0.7789954337899543, "grad_norm": 64.17280578613281, "learning_rate": 7.78082191780822e-06, "loss": 1.4925, "step": 853 }, { "epoch": 0.7799086757990867, "grad_norm": 49.39548873901367, "learning_rate": 7.789954337899543e-06, "loss": 0.7047, "step": 854 }, { "epoch": 0.7808219178082192, "grad_norm": 10.145587921142578, "learning_rate": 7.799086757990868e-06, "loss": 0.121, "step": 855 }, { "epoch": 0.7817351598173516, "grad_norm": 23.83386993408203, "learning_rate": 7.808219178082192e-06, "loss": 0.2159, "step": 856 }, { "epoch": 0.782648401826484, "grad_norm": 2.0447001457214355, "learning_rate": 7.817351598173517e-06, "loss": 0.0185, "step": 857 }, { "epoch": 0.7835616438356164, "grad_norm": 65.36552429199219, "learning_rate": 7.82648401826484e-06, "loss": 1.3615, "step": 858 }, { "epoch": 0.7844748858447489, "grad_norm": 6.036379814147949, "learning_rate": 7.835616438356164e-06, "loss": 0.04, "step": 859 }, { "epoch": 0.7853881278538812, "grad_norm": 6.738356113433838, "learning_rate": 7.84474885844749e-06, "loss": 0.0622, "step": 860 }, { "epoch": 0.7863013698630137, "grad_norm": 12.137633323669434, "learning_rate": 7.853881278538813e-06, "loss": 0.1391, "step": 861 }, { "epoch": 0.7872146118721461, "grad_norm": 44.109535217285156, "learning_rate": 7.863013698630137e-06, "loss": 0.8242, "step": 862 }, { "epoch": 0.7881278538812785, "grad_norm": 19.427846908569336, "learning_rate": 7.872146118721462e-06, "loss": 0.2294, "step": 863 }, { "epoch": 0.7890410958904109, "grad_norm": 3.3380181789398193, "learning_rate": 7.881278538812786e-06, "loss": 0.0325, "step": 864 }, { "epoch": 0.7899543378995434, "grad_norm": 60.59779357910156, "learning_rate": 7.89041095890411e-06, "loss": 1.7027, "step": 865 }, { "epoch": 0.7908675799086758, "grad_norm": 19.73265266418457, "learning_rate": 7.899543378995435e-06, "loss": 0.2769, "step": 866 }, { "epoch": 0.7917808219178082, "grad_norm": 22.81819725036621, "learning_rate": 7.908675799086758e-06, "loss": 0.2719, "step": 867 }, { "epoch": 0.7926940639269406, "grad_norm": 49.575218200683594, "learning_rate": 7.917808219178082e-06, "loss": 1.0908, "step": 868 }, { "epoch": 0.7936073059360731, "grad_norm": 60.7605094909668, "learning_rate": 7.926940639269407e-06, "loss": 0.6842, "step": 869 }, { "epoch": 0.7945205479452054, "grad_norm": 36.01530456542969, "learning_rate": 7.936073059360731e-06, "loss": 0.3038, "step": 870 }, { "epoch": 0.7954337899543379, "grad_norm": 28.539276123046875, "learning_rate": 7.945205479452055e-06, "loss": 0.3491, "step": 871 }, { "epoch": 0.7963470319634703, "grad_norm": 24.733802795410156, "learning_rate": 7.95433789954338e-06, "loss": 0.3153, "step": 872 }, { "epoch": 0.7972602739726027, "grad_norm": 5.164364814758301, "learning_rate": 7.963470319634703e-06, "loss": 0.0468, "step": 873 }, { "epoch": 0.7981735159817351, "grad_norm": 58.47385787963867, "learning_rate": 7.972602739726027e-06, "loss": 0.9592, "step": 874 }, { "epoch": 0.7990867579908676, "grad_norm": 9.891631126403809, "learning_rate": 7.981735159817352e-06, "loss": 0.1308, "step": 875 }, { "epoch": 0.8, "grad_norm": 19.36839485168457, "learning_rate": 7.990867579908676e-06, "loss": 0.2471, "step": 876 }, { "epoch": 0.8009132420091324, "grad_norm": 36.61865997314453, "learning_rate": 8.000000000000001e-06, "loss": 0.4644, "step": 877 }, { "epoch": 0.8018264840182648, "grad_norm": 33.58241653442383, "learning_rate": 8.009132420091325e-06, "loss": 0.3489, "step": 878 }, { "epoch": 0.8027397260273973, "grad_norm": 3.8187968730926514, "learning_rate": 8.018264840182649e-06, "loss": 0.0387, "step": 879 }, { "epoch": 0.8036529680365296, "grad_norm": 54.7425651550293, "learning_rate": 8.027397260273974e-06, "loss": 0.7164, "step": 880 }, { "epoch": 0.8045662100456621, "grad_norm": 66.9697494506836, "learning_rate": 8.036529680365297e-06, "loss": 1.6232, "step": 881 }, { "epoch": 0.8054794520547945, "grad_norm": 47.34202194213867, "learning_rate": 8.045662100456621e-06, "loss": 0.5944, "step": 882 }, { "epoch": 0.806392694063927, "grad_norm": 18.831586837768555, "learning_rate": 8.054794520547946e-06, "loss": 0.2767, "step": 883 }, { "epoch": 0.8073059360730593, "grad_norm": 54.63134002685547, "learning_rate": 8.06392694063927e-06, "loss": 0.845, "step": 884 }, { "epoch": 0.8082191780821918, "grad_norm": 42.8266487121582, "learning_rate": 8.073059360730594e-06, "loss": 0.4746, "step": 885 }, { "epoch": 0.8091324200913242, "grad_norm": 71.9335708618164, "learning_rate": 8.082191780821919e-06, "loss": 1.6966, "step": 886 }, { "epoch": 0.8100456621004566, "grad_norm": 28.22928237915039, "learning_rate": 8.091324200913243e-06, "loss": 0.5609, "step": 887 }, { "epoch": 0.810958904109589, "grad_norm": 10.829813957214355, "learning_rate": 8.100456621004566e-06, "loss": 0.1123, "step": 888 }, { "epoch": 0.8118721461187215, "grad_norm": 41.28998565673828, "learning_rate": 8.109589041095892e-06, "loss": 0.5289, "step": 889 }, { "epoch": 0.8127853881278538, "grad_norm": 16.83150863647461, "learning_rate": 8.118721461187215e-06, "loss": 0.1374, "step": 890 }, { "epoch": 0.8136986301369863, "grad_norm": 70.11990356445312, "learning_rate": 8.127853881278539e-06, "loss": 1.7532, "step": 891 }, { "epoch": 0.8146118721461187, "grad_norm": 102.17564392089844, "learning_rate": 8.136986301369864e-06, "loss": 1.0865, "step": 892 }, { "epoch": 0.8155251141552512, "grad_norm": 85.51142120361328, "learning_rate": 8.146118721461188e-06, "loss": 1.2636, "step": 893 }, { "epoch": 0.8164383561643835, "grad_norm": 6.818487644195557, "learning_rate": 8.155251141552513e-06, "loss": 0.0786, "step": 894 }, { "epoch": 0.817351598173516, "grad_norm": 4.10952091217041, "learning_rate": 8.164383561643837e-06, "loss": 0.0298, "step": 895 }, { "epoch": 0.8182648401826484, "grad_norm": 66.06404113769531, "learning_rate": 8.17351598173516e-06, "loss": 0.2382, "step": 896 }, { "epoch": 0.8191780821917808, "grad_norm": 14.432676315307617, "learning_rate": 8.182648401826486e-06, "loss": 0.1301, "step": 897 }, { "epoch": 0.8200913242009132, "grad_norm": 24.433887481689453, "learning_rate": 8.19178082191781e-06, "loss": 0.2368, "step": 898 }, { "epoch": 0.8210045662100457, "grad_norm": 76.58094787597656, "learning_rate": 8.200913242009133e-06, "loss": 2.8918, "step": 899 }, { "epoch": 0.821917808219178, "grad_norm": 82.50822448730469, "learning_rate": 8.210045662100458e-06, "loss": 2.2073, "step": 900 }, { "epoch": 0.8228310502283105, "grad_norm": 81.0888900756836, "learning_rate": 8.219178082191782e-06, "loss": 1.1319, "step": 901 }, { "epoch": 0.8237442922374429, "grad_norm": 9.347799301147461, "learning_rate": 8.228310502283105e-06, "loss": 0.0957, "step": 902 }, { "epoch": 0.8246575342465754, "grad_norm": 36.187843322753906, "learning_rate": 8.23744292237443e-06, "loss": 0.3495, "step": 903 }, { "epoch": 0.8255707762557077, "grad_norm": 2.4059019088745117, "learning_rate": 8.246575342465754e-06, "loss": 0.0236, "step": 904 }, { "epoch": 0.8264840182648402, "grad_norm": 4.6065826416015625, "learning_rate": 8.255707762557078e-06, "loss": 0.0392, "step": 905 }, { "epoch": 0.8273972602739726, "grad_norm": 51.971893310546875, "learning_rate": 8.264840182648403e-06, "loss": 0.5048, "step": 906 }, { "epoch": 0.828310502283105, "grad_norm": 26.928512573242188, "learning_rate": 8.273972602739727e-06, "loss": 0.3248, "step": 907 }, { "epoch": 0.8292237442922374, "grad_norm": 6.627527236938477, "learning_rate": 8.28310502283105e-06, "loss": 0.05, "step": 908 }, { "epoch": 0.8301369863013699, "grad_norm": 50.87287521362305, "learning_rate": 8.292237442922376e-06, "loss": 0.7007, "step": 909 }, { "epoch": 0.8310502283105022, "grad_norm": 5.692317962646484, "learning_rate": 8.3013698630137e-06, "loss": 0.0708, "step": 910 }, { "epoch": 0.8319634703196347, "grad_norm": 52.37306213378906, "learning_rate": 8.310502283105023e-06, "loss": 0.5269, "step": 911 }, { "epoch": 0.8328767123287671, "grad_norm": 1.2745229005813599, "learning_rate": 8.319634703196348e-06, "loss": 0.0101, "step": 912 }, { "epoch": 0.8337899543378996, "grad_norm": 11.736579895019531, "learning_rate": 8.328767123287672e-06, "loss": 0.1494, "step": 913 }, { "epoch": 0.8347031963470319, "grad_norm": 18.552549362182617, "learning_rate": 8.337899543378997e-06, "loss": 0.1965, "step": 914 }, { "epoch": 0.8356164383561644, "grad_norm": 2.539126396179199, "learning_rate": 8.347031963470321e-06, "loss": 0.034, "step": 915 }, { "epoch": 0.8365296803652968, "grad_norm": 29.611709594726562, "learning_rate": 8.356164383561644e-06, "loss": 0.3267, "step": 916 }, { "epoch": 0.8374429223744292, "grad_norm": 17.503543853759766, "learning_rate": 8.36529680365297e-06, "loss": 0.173, "step": 917 }, { "epoch": 0.8383561643835616, "grad_norm": 3.3015260696411133, "learning_rate": 8.374429223744293e-06, "loss": 0.0332, "step": 918 }, { "epoch": 0.8392694063926941, "grad_norm": 20.720430374145508, "learning_rate": 8.383561643835617e-06, "loss": 0.2175, "step": 919 }, { "epoch": 0.8401826484018264, "grad_norm": 59.799842834472656, "learning_rate": 8.392694063926942e-06, "loss": 1.3366, "step": 920 }, { "epoch": 0.8410958904109589, "grad_norm": 13.419408798217773, "learning_rate": 8.401826484018264e-06, "loss": 0.1614, "step": 921 }, { "epoch": 0.8420091324200913, "grad_norm": 1.3591128587722778, "learning_rate": 8.41095890410959e-06, "loss": 0.0118, "step": 922 }, { "epoch": 0.8429223744292238, "grad_norm": 8.493356704711914, "learning_rate": 8.420091324200915e-06, "loss": 0.0707, "step": 923 }, { "epoch": 0.8438356164383561, "grad_norm": 66.2013168334961, "learning_rate": 8.429223744292239e-06, "loss": 1.0796, "step": 924 }, { "epoch": 0.8447488584474886, "grad_norm": 4.677180290222168, "learning_rate": 8.438356164383562e-06, "loss": 0.0553, "step": 925 }, { "epoch": 0.845662100456621, "grad_norm": 83.22639465332031, "learning_rate": 8.447488584474887e-06, "loss": 4.3073, "step": 926 }, { "epoch": 0.8465753424657534, "grad_norm": 1.2402905225753784, "learning_rate": 8.456621004566211e-06, "loss": 0.0113, "step": 927 }, { "epoch": 0.8474885844748858, "grad_norm": 26.701187133789062, "learning_rate": 8.465753424657535e-06, "loss": 0.3739, "step": 928 }, { "epoch": 0.8484018264840183, "grad_norm": 74.10446166992188, "learning_rate": 8.474885844748858e-06, "loss": 2.4759, "step": 929 }, { "epoch": 0.8493150684931506, "grad_norm": 6.928402900695801, "learning_rate": 8.484018264840184e-06, "loss": 0.0728, "step": 930 }, { "epoch": 0.8502283105022831, "grad_norm": 12.224347114562988, "learning_rate": 8.493150684931507e-06, "loss": 0.0842, "step": 931 }, { "epoch": 0.8511415525114155, "grad_norm": 3.352992534637451, "learning_rate": 8.50228310502283e-06, "loss": 0.0308, "step": 932 }, { "epoch": 0.852054794520548, "grad_norm": 45.37390899658203, "learning_rate": 8.511415525114156e-06, "loss": 0.2715, "step": 933 }, { "epoch": 0.8529680365296803, "grad_norm": 1.4490890502929688, "learning_rate": 8.520547945205481e-06, "loss": 0.0146, "step": 934 }, { "epoch": 0.8538812785388128, "grad_norm": 16.73929214477539, "learning_rate": 8.529680365296803e-06, "loss": 0.1768, "step": 935 }, { "epoch": 0.8547945205479452, "grad_norm": 21.930335998535156, "learning_rate": 8.538812785388129e-06, "loss": 0.2849, "step": 936 }, { "epoch": 0.8557077625570776, "grad_norm": 106.13126373291016, "learning_rate": 8.547945205479454e-06, "loss": 3.4204, "step": 937 }, { "epoch": 0.85662100456621, "grad_norm": 41.47927474975586, "learning_rate": 8.557077625570776e-06, "loss": 0.5022, "step": 938 }, { "epoch": 0.8575342465753425, "grad_norm": 50.02931594848633, "learning_rate": 8.566210045662101e-06, "loss": 0.4584, "step": 939 }, { "epoch": 0.8584474885844748, "grad_norm": 57.955020904541016, "learning_rate": 8.575342465753425e-06, "loss": 0.8607, "step": 940 }, { "epoch": 0.8593607305936073, "grad_norm": 63.3349494934082, "learning_rate": 8.584474885844748e-06, "loss": 1.1207, "step": 941 }, { "epoch": 0.8602739726027397, "grad_norm": 3.438372850418091, "learning_rate": 8.593607305936074e-06, "loss": 0.0386, "step": 942 }, { "epoch": 0.8611872146118722, "grad_norm": 87.9367446899414, "learning_rate": 8.602739726027397e-06, "loss": 3.4933, "step": 943 }, { "epoch": 0.8621004566210045, "grad_norm": 62.14993667602539, "learning_rate": 8.611872146118723e-06, "loss": 0.6779, "step": 944 }, { "epoch": 0.863013698630137, "grad_norm": 4.179379940032959, "learning_rate": 8.621004566210046e-06, "loss": 0.0503, "step": 945 }, { "epoch": 0.8639269406392694, "grad_norm": 58.699562072753906, "learning_rate": 8.63013698630137e-06, "loss": 0.9934, "step": 946 }, { "epoch": 0.8648401826484018, "grad_norm": 10.352327346801758, "learning_rate": 8.639269406392695e-06, "loss": 0.0824, "step": 947 }, { "epoch": 0.8657534246575342, "grad_norm": 86.40420532226562, "learning_rate": 8.648401826484019e-06, "loss": 1.492, "step": 948 }, { "epoch": 0.8666666666666667, "grad_norm": 29.50115203857422, "learning_rate": 8.657534246575343e-06, "loss": 0.441, "step": 949 }, { "epoch": 0.867579908675799, "grad_norm": 80.31971740722656, "learning_rate": 8.666666666666668e-06, "loss": 1.9008, "step": 950 }, { "epoch": 0.8684931506849315, "grad_norm": 47.99782943725586, "learning_rate": 8.675799086757991e-06, "loss": 0.6175, "step": 951 }, { "epoch": 0.869406392694064, "grad_norm": 1.068830132484436, "learning_rate": 8.684931506849315e-06, "loss": 0.008, "step": 952 }, { "epoch": 0.8703196347031964, "grad_norm": 69.39022064208984, "learning_rate": 8.69406392694064e-06, "loss": 0.7755, "step": 953 }, { "epoch": 0.8712328767123287, "grad_norm": 5.713808536529541, "learning_rate": 8.703196347031964e-06, "loss": 0.0403, "step": 954 }, { "epoch": 0.8721461187214612, "grad_norm": 39.51560974121094, "learning_rate": 8.712328767123288e-06, "loss": 0.4446, "step": 955 }, { "epoch": 0.8730593607305936, "grad_norm": 0.78012615442276, "learning_rate": 8.721461187214613e-06, "loss": 0.0086, "step": 956 }, { "epoch": 0.873972602739726, "grad_norm": 51.36018371582031, "learning_rate": 8.730593607305937e-06, "loss": 0.605, "step": 957 }, { "epoch": 0.8748858447488584, "grad_norm": 38.60447692871094, "learning_rate": 8.73972602739726e-06, "loss": 0.369, "step": 958 }, { "epoch": 0.8757990867579909, "grad_norm": 62.723304748535156, "learning_rate": 8.748858447488585e-06, "loss": 1.1756, "step": 959 }, { "epoch": 0.8767123287671232, "grad_norm": 140.06748962402344, "learning_rate": 8.757990867579909e-06, "loss": 1.7893, "step": 960 }, { "epoch": 0.8776255707762557, "grad_norm": 81.41402435302734, "learning_rate": 8.767123287671233e-06, "loss": 4.2715, "step": 961 }, { "epoch": 0.8785388127853881, "grad_norm": 24.591522216796875, "learning_rate": 8.776255707762558e-06, "loss": 0.0928, "step": 962 }, { "epoch": 0.8794520547945206, "grad_norm": 60.04916763305664, "learning_rate": 8.785388127853882e-06, "loss": 1.0945, "step": 963 }, { "epoch": 0.8803652968036529, "grad_norm": 138.3867645263672, "learning_rate": 8.794520547945207e-06, "loss": 4.1193, "step": 964 }, { "epoch": 0.8812785388127854, "grad_norm": 34.108863830566406, "learning_rate": 8.80365296803653e-06, "loss": 0.5098, "step": 965 }, { "epoch": 0.8821917808219178, "grad_norm": 2.771101951599121, "learning_rate": 8.812785388127854e-06, "loss": 0.0216, "step": 966 }, { "epoch": 0.8831050228310502, "grad_norm": 61.02241516113281, "learning_rate": 8.82191780821918e-06, "loss": 1.5311, "step": 967 }, { "epoch": 0.8840182648401826, "grad_norm": 49.709678649902344, "learning_rate": 8.831050228310503e-06, "loss": 0.4803, "step": 968 }, { "epoch": 0.8849315068493151, "grad_norm": 1.7478625774383545, "learning_rate": 8.840182648401827e-06, "loss": 0.0142, "step": 969 }, { "epoch": 0.8858447488584474, "grad_norm": 41.46023178100586, "learning_rate": 8.849315068493152e-06, "loss": 0.7774, "step": 970 }, { "epoch": 0.8867579908675799, "grad_norm": 37.73106384277344, "learning_rate": 8.858447488584476e-06, "loss": 0.4881, "step": 971 }, { "epoch": 0.8876712328767123, "grad_norm": 78.57492065429688, "learning_rate": 8.8675799086758e-06, "loss": 1.1989, "step": 972 }, { "epoch": 0.8885844748858448, "grad_norm": 28.322036743164062, "learning_rate": 8.876712328767125e-06, "loss": 0.3866, "step": 973 }, { "epoch": 0.8894977168949771, "grad_norm": 10.822772026062012, "learning_rate": 8.885844748858448e-06, "loss": 0.1002, "step": 974 }, { "epoch": 0.8904109589041096, "grad_norm": 40.74002456665039, "learning_rate": 8.894977168949772e-06, "loss": 0.5908, "step": 975 }, { "epoch": 0.891324200913242, "grad_norm": 50.914703369140625, "learning_rate": 8.904109589041097e-06, "loss": 1.0153, "step": 976 }, { "epoch": 0.8922374429223744, "grad_norm": 4.508623123168945, "learning_rate": 8.91324200913242e-06, "loss": 0.0351, "step": 977 }, { "epoch": 0.8931506849315068, "grad_norm": 40.58057403564453, "learning_rate": 8.922374429223744e-06, "loss": 0.5285, "step": 978 }, { "epoch": 0.8940639269406393, "grad_norm": 15.811490058898926, "learning_rate": 8.93150684931507e-06, "loss": 0.2242, "step": 979 }, { "epoch": 0.8949771689497716, "grad_norm": 6.925943374633789, "learning_rate": 8.940639269406393e-06, "loss": 0.0749, "step": 980 }, { "epoch": 0.8958904109589041, "grad_norm": 66.78372192382812, "learning_rate": 8.949771689497717e-06, "loss": 3.0534, "step": 981 }, { "epoch": 0.8968036529680365, "grad_norm": 67.84355163574219, "learning_rate": 8.958904109589042e-06, "loss": 3.7699, "step": 982 }, { "epoch": 0.897716894977169, "grad_norm": 22.243507385253906, "learning_rate": 8.968036529680366e-06, "loss": 0.329, "step": 983 }, { "epoch": 0.8986301369863013, "grad_norm": 75.46118927001953, "learning_rate": 8.977168949771691e-06, "loss": 2.6779, "step": 984 }, { "epoch": 0.8995433789954338, "grad_norm": 11.231538772583008, "learning_rate": 8.986301369863015e-06, "loss": 0.1186, "step": 985 }, { "epoch": 0.9004566210045662, "grad_norm": 13.163168907165527, "learning_rate": 8.995433789954338e-06, "loss": 0.1546, "step": 986 }, { "epoch": 0.9013698630136986, "grad_norm": 32.76507568359375, "learning_rate": 9.004566210045664e-06, "loss": 0.4387, "step": 987 }, { "epoch": 0.902283105022831, "grad_norm": 11.011016845703125, "learning_rate": 9.013698630136987e-06, "loss": 0.145, "step": 988 }, { "epoch": 0.9031963470319635, "grad_norm": 96.09173583984375, "learning_rate": 9.022831050228311e-06, "loss": 1.3936, "step": 989 }, { "epoch": 0.9041095890410958, "grad_norm": 12.148883819580078, "learning_rate": 9.031963470319636e-06, "loss": 0.0904, "step": 990 }, { "epoch": 0.9050228310502283, "grad_norm": 1.546993374824524, "learning_rate": 9.04109589041096e-06, "loss": 0.0171, "step": 991 }, { "epoch": 0.9059360730593607, "grad_norm": 61.08338928222656, "learning_rate": 9.050228310502284e-06, "loss": 1.7705, "step": 992 }, { "epoch": 0.9068493150684932, "grad_norm": 9.069503784179688, "learning_rate": 9.059360730593609e-06, "loss": 0.1429, "step": 993 }, { "epoch": 0.9077625570776255, "grad_norm": 33.34361267089844, "learning_rate": 9.068493150684932e-06, "loss": 0.6147, "step": 994 }, { "epoch": 0.908675799086758, "grad_norm": 20.801328659057617, "learning_rate": 9.077625570776256e-06, "loss": 0.3051, "step": 995 }, { "epoch": 0.9095890410958904, "grad_norm": 54.938377380371094, "learning_rate": 9.086757990867581e-06, "loss": 0.7078, "step": 996 }, { "epoch": 0.9105022831050228, "grad_norm": 34.967105865478516, "learning_rate": 9.095890410958905e-06, "loss": 0.3511, "step": 997 }, { "epoch": 0.9114155251141552, "grad_norm": 48.611759185791016, "learning_rate": 9.105022831050229e-06, "loss": 1.2243, "step": 998 }, { "epoch": 0.9123287671232877, "grad_norm": 77.32306671142578, "learning_rate": 9.114155251141554e-06, "loss": 1.2584, "step": 999 }, { "epoch": 0.91324200913242, "grad_norm": 11.596403121948242, "learning_rate": 9.123287671232878e-06, "loss": 0.1386, "step": 1000 }, { "epoch": 0.9141552511415525, "grad_norm": 24.616884231567383, "learning_rate": 9.132420091324201e-06, "loss": 0.4289, "step": 1001 }, { "epoch": 0.915068493150685, "grad_norm": 49.06389236450195, "learning_rate": 9.141552511415526e-06, "loss": 0.7092, "step": 1002 }, { "epoch": 0.9159817351598174, "grad_norm": 1.3050354719161987, "learning_rate": 9.15068493150685e-06, "loss": 0.0145, "step": 1003 }, { "epoch": 0.9168949771689497, "grad_norm": 69.6685562133789, "learning_rate": 9.159817351598175e-06, "loss": 1.0987, "step": 1004 }, { "epoch": 0.9178082191780822, "grad_norm": 41.250125885009766, "learning_rate": 9.168949771689499e-06, "loss": 0.5744, "step": 1005 }, { "epoch": 0.9187214611872146, "grad_norm": 76.94418334960938, "learning_rate": 9.178082191780823e-06, "loss": 2.043, "step": 1006 }, { "epoch": 0.919634703196347, "grad_norm": 5.865431308746338, "learning_rate": 9.187214611872148e-06, "loss": 0.0573, "step": 1007 }, { "epoch": 0.9205479452054794, "grad_norm": 64.3332290649414, "learning_rate": 9.19634703196347e-06, "loss": 1.6433, "step": 1008 }, { "epoch": 0.9214611872146119, "grad_norm": 2.6878578662872314, "learning_rate": 9.205479452054795e-06, "loss": 0.0337, "step": 1009 }, { "epoch": 0.9223744292237442, "grad_norm": 83.88431549072266, "learning_rate": 9.21461187214612e-06, "loss": 3.0047, "step": 1010 }, { "epoch": 0.9232876712328767, "grad_norm": 7.2621235847473145, "learning_rate": 9.223744292237442e-06, "loss": 0.1097, "step": 1011 }, { "epoch": 0.9242009132420091, "grad_norm": 3.3354856967926025, "learning_rate": 9.232876712328768e-06, "loss": 0.0321, "step": 1012 }, { "epoch": 0.9251141552511416, "grad_norm": 231.2224578857422, "learning_rate": 9.242009132420093e-06, "loss": 0.9796, "step": 1013 }, { "epoch": 0.9260273972602739, "grad_norm": 70.92475891113281, "learning_rate": 9.251141552511417e-06, "loss": 1.7528, "step": 1014 }, { "epoch": 0.9269406392694064, "grad_norm": 68.71794128417969, "learning_rate": 9.26027397260274e-06, "loss": 1.0955, "step": 1015 }, { "epoch": 0.9278538812785389, "grad_norm": 55.17373275756836, "learning_rate": 9.269406392694064e-06, "loss": 1.4974, "step": 1016 }, { "epoch": 0.9287671232876712, "grad_norm": 27.333791732788086, "learning_rate": 9.27853881278539e-06, "loss": 0.5192, "step": 1017 }, { "epoch": 0.9296803652968036, "grad_norm": 24.92716407775879, "learning_rate": 9.287671232876713e-06, "loss": 0.3497, "step": 1018 }, { "epoch": 0.9305936073059361, "grad_norm": 15.102202415466309, "learning_rate": 9.296803652968036e-06, "loss": 0.2126, "step": 1019 }, { "epoch": 0.9315068493150684, "grad_norm": 48.84443664550781, "learning_rate": 9.305936073059362e-06, "loss": 0.8363, "step": 1020 }, { "epoch": 0.9324200913242009, "grad_norm": 35.52340316772461, "learning_rate": 9.315068493150685e-06, "loss": 0.3442, "step": 1021 }, { "epoch": 0.9333333333333333, "grad_norm": 33.71098709106445, "learning_rate": 9.324200913242009e-06, "loss": 0.5333, "step": 1022 }, { "epoch": 0.9342465753424658, "grad_norm": 91.07613372802734, "learning_rate": 9.333333333333334e-06, "loss": 2.1525, "step": 1023 }, { "epoch": 0.9351598173515981, "grad_norm": 24.473636627197266, "learning_rate": 9.342465753424658e-06, "loss": 0.2778, "step": 1024 }, { "epoch": 0.9360730593607306, "grad_norm": 29.479713439941406, "learning_rate": 9.351598173515982e-06, "loss": 0.3371, "step": 1025 }, { "epoch": 0.936986301369863, "grad_norm": 37.94594192504883, "learning_rate": 9.360730593607307e-06, "loss": 0.5433, "step": 1026 }, { "epoch": 0.9378995433789954, "grad_norm": 46.76959991455078, "learning_rate": 9.36986301369863e-06, "loss": 1.4611, "step": 1027 }, { "epoch": 0.9388127853881278, "grad_norm": 6.452788352966309, "learning_rate": 9.378995433789954e-06, "loss": 0.027, "step": 1028 }, { "epoch": 0.9397260273972603, "grad_norm": 13.208540916442871, "learning_rate": 9.38812785388128e-06, "loss": 0.1776, "step": 1029 }, { "epoch": 0.9406392694063926, "grad_norm": 29.534236907958984, "learning_rate": 9.397260273972603e-06, "loss": 0.3376, "step": 1030 }, { "epoch": 0.9415525114155251, "grad_norm": 39.64462661743164, "learning_rate": 9.406392694063927e-06, "loss": 1.1708, "step": 1031 }, { "epoch": 0.9424657534246575, "grad_norm": 2.9740350246429443, "learning_rate": 9.415525114155252e-06, "loss": 0.0277, "step": 1032 }, { "epoch": 0.94337899543379, "grad_norm": 29.226961135864258, "learning_rate": 9.424657534246576e-06, "loss": 0.3236, "step": 1033 }, { "epoch": 0.9442922374429223, "grad_norm": 18.694047927856445, "learning_rate": 9.433789954337901e-06, "loss": 0.1607, "step": 1034 }, { "epoch": 0.9452054794520548, "grad_norm": 19.928442001342773, "learning_rate": 9.442922374429225e-06, "loss": 0.2661, "step": 1035 }, { "epoch": 0.9461187214611873, "grad_norm": 3.5326671600341797, "learning_rate": 9.452054794520548e-06, "loss": 0.0276, "step": 1036 }, { "epoch": 0.9470319634703196, "grad_norm": 44.471500396728516, "learning_rate": 9.461187214611873e-06, "loss": 0.7224, "step": 1037 }, { "epoch": 0.947945205479452, "grad_norm": 3.3180744647979736, "learning_rate": 9.470319634703197e-06, "loss": 0.0241, "step": 1038 }, { "epoch": 0.9488584474885845, "grad_norm": 67.83837127685547, "learning_rate": 9.47945205479452e-06, "loss": 1.6464, "step": 1039 }, { "epoch": 0.9497716894977168, "grad_norm": 6.6102986335754395, "learning_rate": 9.488584474885846e-06, "loss": 0.0582, "step": 1040 }, { "epoch": 0.9506849315068493, "grad_norm": 4.554635524749756, "learning_rate": 9.49771689497717e-06, "loss": 0.0416, "step": 1041 }, { "epoch": 0.9515981735159817, "grad_norm": 0.49115318059921265, "learning_rate": 9.506849315068493e-06, "loss": 0.0059, "step": 1042 }, { "epoch": 0.9525114155251142, "grad_norm": 31.593637466430664, "learning_rate": 9.515981735159819e-06, "loss": 0.4017, "step": 1043 }, { "epoch": 0.9534246575342465, "grad_norm": 113.18373107910156, "learning_rate": 9.525114155251142e-06, "loss": 2.0083, "step": 1044 }, { "epoch": 0.954337899543379, "grad_norm": 95.70514678955078, "learning_rate": 9.534246575342466e-06, "loss": 5.8413, "step": 1045 }, { "epoch": 0.9552511415525115, "grad_norm": 18.37888526916504, "learning_rate": 9.543378995433791e-06, "loss": 0.2096, "step": 1046 }, { "epoch": 0.9561643835616438, "grad_norm": 55.90707778930664, "learning_rate": 9.552511415525115e-06, "loss": 1.5142, "step": 1047 }, { "epoch": 0.9570776255707762, "grad_norm": 84.21673583984375, "learning_rate": 9.561643835616438e-06, "loss": 3.5924, "step": 1048 }, { "epoch": 0.9579908675799087, "grad_norm": 56.64280319213867, "learning_rate": 9.570776255707764e-06, "loss": 1.0257, "step": 1049 }, { "epoch": 0.958904109589041, "grad_norm": 20.309429168701172, "learning_rate": 9.579908675799087e-06, "loss": 0.1649, "step": 1050 }, { "epoch": 0.9598173515981735, "grad_norm": 94.84056854248047, "learning_rate": 9.589041095890411e-06, "loss": 4.9871, "step": 1051 }, { "epoch": 0.960730593607306, "grad_norm": 71.98633575439453, "learning_rate": 9.598173515981736e-06, "loss": 2.1097, "step": 1052 }, { "epoch": 0.9616438356164384, "grad_norm": 2.4573309421539307, "learning_rate": 9.60730593607306e-06, "loss": 0.0204, "step": 1053 }, { "epoch": 0.9625570776255707, "grad_norm": 2.997326374053955, "learning_rate": 9.616438356164385e-06, "loss": 0.0223, "step": 1054 }, { "epoch": 0.9634703196347032, "grad_norm": 26.25538444519043, "learning_rate": 9.625570776255709e-06, "loss": 0.3042, "step": 1055 }, { "epoch": 0.9643835616438357, "grad_norm": 70.764892578125, "learning_rate": 9.634703196347032e-06, "loss": 2.6048, "step": 1056 }, { "epoch": 0.965296803652968, "grad_norm": 19.452800750732422, "learning_rate": 9.643835616438358e-06, "loss": 0.176, "step": 1057 }, { "epoch": 0.9662100456621004, "grad_norm": 40.98455810546875, "learning_rate": 9.652968036529681e-06, "loss": 0.6803, "step": 1058 }, { "epoch": 0.9671232876712329, "grad_norm": 16.13430404663086, "learning_rate": 9.662100456621005e-06, "loss": 0.1246, "step": 1059 }, { "epoch": 0.9680365296803652, "grad_norm": 4.095152378082275, "learning_rate": 9.67123287671233e-06, "loss": 0.0387, "step": 1060 }, { "epoch": 0.9689497716894977, "grad_norm": 72.1044692993164, "learning_rate": 9.680365296803654e-06, "loss": 1.2633, "step": 1061 }, { "epoch": 0.9698630136986301, "grad_norm": 23.498537063598633, "learning_rate": 9.689497716894977e-06, "loss": 0.3377, "step": 1062 }, { "epoch": 0.9707762557077626, "grad_norm": 54.07660675048828, "learning_rate": 9.698630136986303e-06, "loss": 0.682, "step": 1063 }, { "epoch": 0.971689497716895, "grad_norm": 68.09193420410156, "learning_rate": 9.707762557077626e-06, "loss": 1.3597, "step": 1064 }, { "epoch": 0.9726027397260274, "grad_norm": 10.265414237976074, "learning_rate": 9.71689497716895e-06, "loss": 0.1336, "step": 1065 }, { "epoch": 0.9735159817351599, "grad_norm": 40.03880310058594, "learning_rate": 9.726027397260275e-06, "loss": 0.423, "step": 1066 }, { "epoch": 0.9744292237442922, "grad_norm": 79.60580444335938, "learning_rate": 9.735159817351599e-06, "loss": 1.591, "step": 1067 }, { "epoch": 0.9753424657534246, "grad_norm": 78.22001647949219, "learning_rate": 9.744292237442923e-06, "loss": 0.9753, "step": 1068 }, { "epoch": 0.9762557077625571, "grad_norm": 9.016551971435547, "learning_rate": 9.753424657534248e-06, "loss": 0.1042, "step": 1069 }, { "epoch": 0.9771689497716894, "grad_norm": 8.183223724365234, "learning_rate": 9.762557077625571e-06, "loss": 0.0991, "step": 1070 }, { "epoch": 0.9780821917808219, "grad_norm": 37.56437683105469, "learning_rate": 9.771689497716895e-06, "loss": 0.3283, "step": 1071 }, { "epoch": 0.9789954337899544, "grad_norm": 42.96230697631836, "learning_rate": 9.78082191780822e-06, "loss": 0.9624, "step": 1072 }, { "epoch": 0.9799086757990868, "grad_norm": 5.678774833679199, "learning_rate": 9.789954337899544e-06, "loss": 0.0626, "step": 1073 }, { "epoch": 0.9808219178082191, "grad_norm": 31.877199172973633, "learning_rate": 9.79908675799087e-06, "loss": 0.5717, "step": 1074 }, { "epoch": 0.9817351598173516, "grad_norm": 67.55469512939453, "learning_rate": 9.808219178082193e-06, "loss": 2.7307, "step": 1075 }, { "epoch": 0.982648401826484, "grad_norm": 75.89823913574219, "learning_rate": 9.817351598173517e-06, "loss": 1.4566, "step": 1076 }, { "epoch": 0.9835616438356164, "grad_norm": 5.821112155914307, "learning_rate": 9.826484018264842e-06, "loss": 0.077, "step": 1077 }, { "epoch": 0.9844748858447488, "grad_norm": 36.96324157714844, "learning_rate": 9.835616438356166e-06, "loss": 0.6725, "step": 1078 }, { "epoch": 0.9853881278538813, "grad_norm": 91.95108795166016, "learning_rate": 9.844748858447489e-06, "loss": 1.0632, "step": 1079 }, { "epoch": 0.9863013698630136, "grad_norm": 37.950531005859375, "learning_rate": 9.853881278538814e-06, "loss": 0.4003, "step": 1080 }, { "epoch": 0.9872146118721461, "grad_norm": 21.441457748413086, "learning_rate": 9.863013698630138e-06, "loss": 0.3083, "step": 1081 }, { "epoch": 0.9881278538812786, "grad_norm": 67.9062271118164, "learning_rate": 9.872146118721462e-06, "loss": 2.7518, "step": 1082 }, { "epoch": 0.989041095890411, "grad_norm": 29.73728370666504, "learning_rate": 9.881278538812787e-06, "loss": 0.2998, "step": 1083 }, { "epoch": 0.9899543378995433, "grad_norm": 11.437714576721191, "learning_rate": 9.89041095890411e-06, "loss": 0.1187, "step": 1084 }, { "epoch": 0.9908675799086758, "grad_norm": 81.91477966308594, "learning_rate": 9.899543378995434e-06, "loss": 0.5256, "step": 1085 }, { "epoch": 0.9917808219178083, "grad_norm": 1.452174186706543, "learning_rate": 9.90867579908676e-06, "loss": 0.007, "step": 1086 }, { "epoch": 0.9926940639269406, "grad_norm": 62.54002380371094, "learning_rate": 9.917808219178083e-06, "loss": 2.1733, "step": 1087 }, { "epoch": 0.993607305936073, "grad_norm": 3.0866756439208984, "learning_rate": 9.926940639269407e-06, "loss": 0.0288, "step": 1088 }, { "epoch": 0.9945205479452055, "grad_norm": 35.28541564941406, "learning_rate": 9.936073059360732e-06, "loss": 0.4841, "step": 1089 }, { "epoch": 0.9954337899543378, "grad_norm": 90.3548583984375, "learning_rate": 9.945205479452056e-06, "loss": 2.3181, "step": 1090 }, { "epoch": 0.9963470319634703, "grad_norm": 4.630880832672119, "learning_rate": 9.95433789954338e-06, "loss": 0.034, "step": 1091 }, { "epoch": 0.9972602739726028, "grad_norm": 28.214872360229492, "learning_rate": 9.963470319634703e-06, "loss": 0.2672, "step": 1092 }, { "epoch": 0.9981735159817352, "grad_norm": 73.82799530029297, "learning_rate": 9.972602739726028e-06, "loss": 1.2089, "step": 1093 }, { "epoch": 0.9990867579908675, "grad_norm": 72.52216339111328, "learning_rate": 9.981735159817354e-06, "loss": 1.5099, "step": 1094 }, { "epoch": 1.0, "grad_norm": 33.92768096923828, "learning_rate": 9.990867579908676e-06, "loss": 0.4051, "step": 1095 }, { "epoch": 1.0009132420091325, "grad_norm": 58.76395797729492, "learning_rate": 1e-05, "loss": 1.0428, "step": 1096 }, { "epoch": 1.001826484018265, "grad_norm": 68.3316879272461, "learning_rate": 9.998985286656521e-06, "loss": 1.613, "step": 1097 }, { "epoch": 1.0027397260273974, "grad_norm": 3.8774664402008057, "learning_rate": 9.99797057331304e-06, "loss": 0.0396, "step": 1098 }, { "epoch": 1.0036529680365296, "grad_norm": 9.00130844116211, "learning_rate": 9.996955859969559e-06, "loss": 0.0684, "step": 1099 }, { "epoch": 1.004566210045662, "grad_norm": 15.294543266296387, "learning_rate": 9.99594114662608e-06, "loss": 0.1671, "step": 1100 }, { "epoch": 1.0054794520547945, "grad_norm": 23.27090835571289, "learning_rate": 9.994926433282598e-06, "loss": 0.2375, "step": 1101 }, { "epoch": 1.006392694063927, "grad_norm": 80.45234680175781, "learning_rate": 9.993911719939117e-06, "loss": 2.0194, "step": 1102 }, { "epoch": 1.0073059360730594, "grad_norm": 0.48725858330726624, "learning_rate": 9.992897006595638e-06, "loss": 0.0046, "step": 1103 }, { "epoch": 1.0082191780821919, "grad_norm": 58.36763000488281, "learning_rate": 9.991882293252158e-06, "loss": 1.1879, "step": 1104 }, { "epoch": 1.009132420091324, "grad_norm": 20.934370040893555, "learning_rate": 9.990867579908676e-06, "loss": 0.2603, "step": 1105 }, { "epoch": 1.0100456621004565, "grad_norm": 38.523555755615234, "learning_rate": 9.989852866565196e-06, "loss": 0.3532, "step": 1106 }, { "epoch": 1.010958904109589, "grad_norm": 5.984973430633545, "learning_rate": 9.988838153221717e-06, "loss": 0.0684, "step": 1107 }, { "epoch": 1.0118721461187214, "grad_norm": 9.424091339111328, "learning_rate": 9.987823439878235e-06, "loss": 0.1151, "step": 1108 }, { "epoch": 1.012785388127854, "grad_norm": 19.735050201416016, "learning_rate": 9.986808726534754e-06, "loss": 0.1741, "step": 1109 }, { "epoch": 1.0136986301369864, "grad_norm": 7.511282920837402, "learning_rate": 9.985794013191275e-06, "loss": 0.0583, "step": 1110 }, { "epoch": 1.0146118721461188, "grad_norm": 25.47519874572754, "learning_rate": 9.984779299847794e-06, "loss": 0.2139, "step": 1111 }, { "epoch": 1.015525114155251, "grad_norm": 2.6127161979675293, "learning_rate": 9.983764586504313e-06, "loss": 0.0212, "step": 1112 }, { "epoch": 1.0164383561643835, "grad_norm": 33.31047439575195, "learning_rate": 9.982749873160833e-06, "loss": 0.354, "step": 1113 }, { "epoch": 1.017351598173516, "grad_norm": 85.97967529296875, "learning_rate": 9.981735159817354e-06, "loss": 1.4512, "step": 1114 }, { "epoch": 1.0182648401826484, "grad_norm": 28.457290649414062, "learning_rate": 9.980720446473872e-06, "loss": 0.2795, "step": 1115 }, { "epoch": 1.0191780821917809, "grad_norm": 10.6680908203125, "learning_rate": 9.979705733130391e-06, "loss": 0.1489, "step": 1116 }, { "epoch": 1.0200913242009133, "grad_norm": 90.9328842163086, "learning_rate": 9.978691019786912e-06, "loss": 2.0147, "step": 1117 }, { "epoch": 1.0210045662100458, "grad_norm": 21.467376708984375, "learning_rate": 9.97767630644343e-06, "loss": 0.2101, "step": 1118 }, { "epoch": 1.021917808219178, "grad_norm": 11.998880386352539, "learning_rate": 9.97666159309995e-06, "loss": 0.0982, "step": 1119 }, { "epoch": 1.0228310502283104, "grad_norm": 20.687955856323242, "learning_rate": 9.97564687975647e-06, "loss": 0.2758, "step": 1120 }, { "epoch": 1.023744292237443, "grad_norm": 3.9462201595306396, "learning_rate": 9.974632166412989e-06, "loss": 0.0315, "step": 1121 }, { "epoch": 1.0246575342465754, "grad_norm": 24.153099060058594, "learning_rate": 9.973617453069508e-06, "loss": 0.1702, "step": 1122 }, { "epoch": 1.0255707762557078, "grad_norm": 12.701769828796387, "learning_rate": 9.972602739726028e-06, "loss": 0.1231, "step": 1123 }, { "epoch": 1.0264840182648403, "grad_norm": 26.309234619140625, "learning_rate": 9.971588026382549e-06, "loss": 0.2262, "step": 1124 }, { "epoch": 1.0273972602739727, "grad_norm": 3.7165868282318115, "learning_rate": 9.970573313039068e-06, "loss": 0.0406, "step": 1125 }, { "epoch": 1.028310502283105, "grad_norm": 35.78733825683594, "learning_rate": 9.969558599695586e-06, "loss": 0.4569, "step": 1126 }, { "epoch": 1.0292237442922374, "grad_norm": 11.338293075561523, "learning_rate": 9.968543886352107e-06, "loss": 0.0772, "step": 1127 }, { "epoch": 1.0301369863013699, "grad_norm": 7.976831912994385, "learning_rate": 9.967529173008626e-06, "loss": 0.0729, "step": 1128 }, { "epoch": 1.0310502283105023, "grad_norm": 27.675090789794922, "learning_rate": 9.966514459665145e-06, "loss": 0.2924, "step": 1129 }, { "epoch": 1.0319634703196348, "grad_norm": 51.487098693847656, "learning_rate": 9.965499746321665e-06, "loss": 0.4104, "step": 1130 }, { "epoch": 1.0328767123287672, "grad_norm": 94.9079360961914, "learning_rate": 9.964485032978184e-06, "loss": 1.819, "step": 1131 }, { "epoch": 1.0337899543378994, "grad_norm": 7.40438985824585, "learning_rate": 9.963470319634703e-06, "loss": 0.0573, "step": 1132 }, { "epoch": 1.034703196347032, "grad_norm": 50.488128662109375, "learning_rate": 9.962455606291223e-06, "loss": 0.8045, "step": 1133 }, { "epoch": 1.0356164383561643, "grad_norm": 14.560175895690918, "learning_rate": 9.961440892947744e-06, "loss": 0.1342, "step": 1134 }, { "epoch": 1.0365296803652968, "grad_norm": 17.991836547851562, "learning_rate": 9.960426179604263e-06, "loss": 0.2765, "step": 1135 }, { "epoch": 1.0374429223744293, "grad_norm": 49.077030181884766, "learning_rate": 9.959411466260782e-06, "loss": 0.5084, "step": 1136 }, { "epoch": 1.0383561643835617, "grad_norm": 77.55750274658203, "learning_rate": 9.958396752917302e-06, "loss": 2.6864, "step": 1137 }, { "epoch": 1.0392694063926942, "grad_norm": 2.1163997650146484, "learning_rate": 9.957382039573821e-06, "loss": 0.0208, "step": 1138 }, { "epoch": 1.0401826484018264, "grad_norm": 35.199378967285156, "learning_rate": 9.95636732623034e-06, "loss": 0.1974, "step": 1139 }, { "epoch": 1.0410958904109588, "grad_norm": 0.16024017333984375, "learning_rate": 9.95535261288686e-06, "loss": 0.0015, "step": 1140 }, { "epoch": 1.0420091324200913, "grad_norm": 65.89723205566406, "learning_rate": 9.95433789954338e-06, "loss": 1.2224, "step": 1141 }, { "epoch": 1.0429223744292238, "grad_norm": 6.578869342803955, "learning_rate": 9.9533231861999e-06, "loss": 0.066, "step": 1142 }, { "epoch": 1.0438356164383562, "grad_norm": 0.40872734785079956, "learning_rate": 9.952308472856419e-06, "loss": 0.0032, "step": 1143 }, { "epoch": 1.0447488584474887, "grad_norm": 11.861063003540039, "learning_rate": 9.95129375951294e-06, "loss": 0.072, "step": 1144 }, { "epoch": 1.045662100456621, "grad_norm": 1.8664146661758423, "learning_rate": 9.950279046169458e-06, "loss": 0.0183, "step": 1145 }, { "epoch": 1.0465753424657533, "grad_norm": 22.742250442504883, "learning_rate": 9.949264332825977e-06, "loss": 0.3707, "step": 1146 }, { "epoch": 1.0474885844748858, "grad_norm": 10.703137397766113, "learning_rate": 9.948249619482497e-06, "loss": 0.0847, "step": 1147 }, { "epoch": 1.0484018264840183, "grad_norm": 52.962486267089844, "learning_rate": 9.947234906139016e-06, "loss": 0.7025, "step": 1148 }, { "epoch": 1.0493150684931507, "grad_norm": 4.599348068237305, "learning_rate": 9.946220192795535e-06, "loss": 0.0319, "step": 1149 }, { "epoch": 1.0502283105022832, "grad_norm": 10.473045349121094, "learning_rate": 9.945205479452056e-06, "loss": 0.091, "step": 1150 }, { "epoch": 1.0511415525114156, "grad_norm": 78.52191925048828, "learning_rate": 9.944190766108575e-06, "loss": 2.0727, "step": 1151 }, { "epoch": 1.0520547945205478, "grad_norm": 102.7319107055664, "learning_rate": 9.943176052765095e-06, "loss": 7.1639, "step": 1152 }, { "epoch": 1.0529680365296803, "grad_norm": 3.7668824195861816, "learning_rate": 9.942161339421614e-06, "loss": 0.0311, "step": 1153 }, { "epoch": 1.0538812785388127, "grad_norm": 86.81398010253906, "learning_rate": 9.941146626078134e-06, "loss": 0.6332, "step": 1154 }, { "epoch": 1.0547945205479452, "grad_norm": 11.730681419372559, "learning_rate": 9.940131912734653e-06, "loss": 0.1218, "step": 1155 }, { "epoch": 1.0557077625570777, "grad_norm": 30.43528175354004, "learning_rate": 9.939117199391172e-06, "loss": 0.2932, "step": 1156 }, { "epoch": 1.05662100456621, "grad_norm": 6.660315036773682, "learning_rate": 9.938102486047693e-06, "loss": 0.0526, "step": 1157 }, { "epoch": 1.0575342465753426, "grad_norm": 0.49565446376800537, "learning_rate": 9.937087772704212e-06, "loss": 0.0056, "step": 1158 }, { "epoch": 1.0584474885844748, "grad_norm": 1.474202036857605, "learning_rate": 9.936073059360732e-06, "loss": 0.015, "step": 1159 }, { "epoch": 1.0593607305936072, "grad_norm": 80.6923599243164, "learning_rate": 9.935058346017251e-06, "loss": 0.9787, "step": 1160 }, { "epoch": 1.0602739726027397, "grad_norm": 36.380611419677734, "learning_rate": 9.93404363267377e-06, "loss": 0.7743, "step": 1161 }, { "epoch": 1.0611872146118722, "grad_norm": 2.620955228805542, "learning_rate": 9.93302891933029e-06, "loss": 0.0235, "step": 1162 }, { "epoch": 1.0621004566210046, "grad_norm": 0.5856183171272278, "learning_rate": 9.93201420598681e-06, "loss": 0.0066, "step": 1163 }, { "epoch": 1.063013698630137, "grad_norm": 93.04810333251953, "learning_rate": 9.93099949264333e-06, "loss": 2.0086, "step": 1164 }, { "epoch": 1.0639269406392695, "grad_norm": 68.9166488647461, "learning_rate": 9.929984779299849e-06, "loss": 0.6471, "step": 1165 }, { "epoch": 1.0648401826484017, "grad_norm": 83.09068298339844, "learning_rate": 9.928970065956367e-06, "loss": 1.9209, "step": 1166 }, { "epoch": 1.0657534246575342, "grad_norm": 46.69468307495117, "learning_rate": 9.927955352612888e-06, "loss": 0.5906, "step": 1167 }, { "epoch": 1.0666666666666667, "grad_norm": 70.38320922851562, "learning_rate": 9.926940639269407e-06, "loss": 0.9854, "step": 1168 }, { "epoch": 1.067579908675799, "grad_norm": 89.33210754394531, "learning_rate": 9.925925925925927e-06, "loss": 1.1099, "step": 1169 }, { "epoch": 1.0684931506849316, "grad_norm": 7.836672306060791, "learning_rate": 9.924911212582446e-06, "loss": 0.08, "step": 1170 }, { "epoch": 1.069406392694064, "grad_norm": 0.5433726906776428, "learning_rate": 9.923896499238965e-06, "loss": 0.0051, "step": 1171 }, { "epoch": 1.0703196347031962, "grad_norm": 68.13092041015625, "learning_rate": 9.922881785895486e-06, "loss": 1.9116, "step": 1172 }, { "epoch": 1.0712328767123287, "grad_norm": 99.43687438964844, "learning_rate": 9.921867072552004e-06, "loss": 1.6476, "step": 1173 }, { "epoch": 1.0721461187214611, "grad_norm": 61.36214828491211, "learning_rate": 9.920852359208525e-06, "loss": 1.1333, "step": 1174 }, { "epoch": 1.0730593607305936, "grad_norm": 36.09506607055664, "learning_rate": 9.919837645865044e-06, "loss": 0.5307, "step": 1175 }, { "epoch": 1.073972602739726, "grad_norm": 5.20225715637207, "learning_rate": 9.918822932521563e-06, "loss": 0.0541, "step": 1176 }, { "epoch": 1.0748858447488585, "grad_norm": 68.9091796875, "learning_rate": 9.917808219178083e-06, "loss": 1.7289, "step": 1177 }, { "epoch": 1.075799086757991, "grad_norm": 60.07439422607422, "learning_rate": 9.916793505834602e-06, "loss": 0.9895, "step": 1178 }, { "epoch": 1.0767123287671232, "grad_norm": 88.56982421875, "learning_rate": 9.915778792491123e-06, "loss": 0.2737, "step": 1179 }, { "epoch": 1.0776255707762556, "grad_norm": 36.36375427246094, "learning_rate": 9.914764079147641e-06, "loss": 0.4368, "step": 1180 }, { "epoch": 1.078538812785388, "grad_norm": 26.063766479492188, "learning_rate": 9.91374936580416e-06, "loss": 0.3544, "step": 1181 }, { "epoch": 1.0794520547945206, "grad_norm": 76.00108337402344, "learning_rate": 9.91273465246068e-06, "loss": 1.5228, "step": 1182 }, { "epoch": 1.080365296803653, "grad_norm": 1.4761919975280762, "learning_rate": 9.9117199391172e-06, "loss": 0.0121, "step": 1183 }, { "epoch": 1.0812785388127855, "grad_norm": 1.6589564085006714, "learning_rate": 9.91070522577372e-06, "loss": 0.018, "step": 1184 }, { "epoch": 1.0821917808219177, "grad_norm": 0.2870868146419525, "learning_rate": 9.909690512430239e-06, "loss": 0.0019, "step": 1185 }, { "epoch": 1.0831050228310501, "grad_norm": 58.25961685180664, "learning_rate": 9.90867579908676e-06, "loss": 0.9305, "step": 1186 }, { "epoch": 1.0840182648401826, "grad_norm": 22.113845825195312, "learning_rate": 9.907661085743278e-06, "loss": 0.2041, "step": 1187 }, { "epoch": 1.084931506849315, "grad_norm": 64.53033447265625, "learning_rate": 9.906646372399797e-06, "loss": 0.9773, "step": 1188 }, { "epoch": 1.0858447488584475, "grad_norm": 8.665323257446289, "learning_rate": 9.905631659056318e-06, "loss": 0.0531, "step": 1189 }, { "epoch": 1.08675799086758, "grad_norm": 37.795223236083984, "learning_rate": 9.904616945712837e-06, "loss": 0.5059, "step": 1190 }, { "epoch": 1.0876712328767124, "grad_norm": 1.063546895980835, "learning_rate": 9.903602232369355e-06, "loss": 0.0098, "step": 1191 }, { "epoch": 1.0885844748858449, "grad_norm": 3.537407636642456, "learning_rate": 9.902587519025876e-06, "loss": 0.0313, "step": 1192 }, { "epoch": 1.089497716894977, "grad_norm": 13.776947021484375, "learning_rate": 9.901572805682395e-06, "loss": 0.1751, "step": 1193 }, { "epoch": 1.0904109589041096, "grad_norm": 54.28927993774414, "learning_rate": 9.900558092338915e-06, "loss": 1.3358, "step": 1194 }, { "epoch": 1.091324200913242, "grad_norm": 1.2236112356185913, "learning_rate": 9.899543378995434e-06, "loss": 0.01, "step": 1195 }, { "epoch": 1.0922374429223745, "grad_norm": 89.52770233154297, "learning_rate": 9.898528665651955e-06, "loss": 1.7113, "step": 1196 }, { "epoch": 1.093150684931507, "grad_norm": 51.918888092041016, "learning_rate": 9.897513952308474e-06, "loss": 1.0027, "step": 1197 }, { "epoch": 1.0940639269406394, "grad_norm": 9.872076034545898, "learning_rate": 9.896499238964992e-06, "loss": 0.087, "step": 1198 }, { "epoch": 1.0949771689497716, "grad_norm": 25.707347869873047, "learning_rate": 9.895484525621513e-06, "loss": 0.272, "step": 1199 }, { "epoch": 1.095890410958904, "grad_norm": 7.869113445281982, "learning_rate": 9.894469812278032e-06, "loss": 0.0804, "step": 1200 }, { "epoch": 1.0968036529680365, "grad_norm": 1.1539344787597656, "learning_rate": 9.89345509893455e-06, "loss": 0.0119, "step": 1201 }, { "epoch": 1.097716894977169, "grad_norm": 14.54737377166748, "learning_rate": 9.892440385591071e-06, "loss": 0.1362, "step": 1202 }, { "epoch": 1.0986301369863014, "grad_norm": 88.53337097167969, "learning_rate": 9.891425672247592e-06, "loss": 3.1771, "step": 1203 }, { "epoch": 1.0995433789954339, "grad_norm": 30.743600845336914, "learning_rate": 9.89041095890411e-06, "loss": 0.3834, "step": 1204 }, { "epoch": 1.1004566210045663, "grad_norm": 86.07425689697266, "learning_rate": 9.88939624556063e-06, "loss": 5.013, "step": 1205 }, { "epoch": 1.1013698630136985, "grad_norm": 14.068975448608398, "learning_rate": 9.88838153221715e-06, "loss": 0.2061, "step": 1206 }, { "epoch": 1.102283105022831, "grad_norm": 37.346771240234375, "learning_rate": 9.887366818873669e-06, "loss": 0.6176, "step": 1207 }, { "epoch": 1.1031963470319635, "grad_norm": 48.72676467895508, "learning_rate": 9.886352105530188e-06, "loss": 1.2269, "step": 1208 }, { "epoch": 1.104109589041096, "grad_norm": 25.352697372436523, "learning_rate": 9.885337392186708e-06, "loss": 0.2713, "step": 1209 }, { "epoch": 1.1050228310502284, "grad_norm": 58.837581634521484, "learning_rate": 9.884322678843227e-06, "loss": 1.7474, "step": 1210 }, { "epoch": 1.1059360730593608, "grad_norm": 15.198026657104492, "learning_rate": 9.883307965499746e-06, "loss": 0.2106, "step": 1211 }, { "epoch": 1.106849315068493, "grad_norm": 85.50592041015625, "learning_rate": 9.882293252156266e-06, "loss": 3.2844, "step": 1212 }, { "epoch": 1.1077625570776255, "grad_norm": 28.039081573486328, "learning_rate": 9.881278538812787e-06, "loss": 0.4675, "step": 1213 }, { "epoch": 1.108675799086758, "grad_norm": 84.47945404052734, "learning_rate": 9.880263825469306e-06, "loss": 2.6365, "step": 1214 }, { "epoch": 1.1095890410958904, "grad_norm": 71.70197296142578, "learning_rate": 9.879249112125825e-06, "loss": 1.5257, "step": 1215 }, { "epoch": 1.1105022831050229, "grad_norm": 3.8099379539489746, "learning_rate": 9.878234398782345e-06, "loss": 0.0347, "step": 1216 }, { "epoch": 1.1114155251141553, "grad_norm": 46.55938720703125, "learning_rate": 9.877219685438864e-06, "loss": 0.8499, "step": 1217 }, { "epoch": 1.1123287671232878, "grad_norm": 9.178156852722168, "learning_rate": 9.876204972095383e-06, "loss": 0.1055, "step": 1218 }, { "epoch": 1.11324200913242, "grad_norm": 4.809138774871826, "learning_rate": 9.875190258751903e-06, "loss": 0.0428, "step": 1219 }, { "epoch": 1.1141552511415524, "grad_norm": 13.74797534942627, "learning_rate": 9.874175545408424e-06, "loss": 0.1551, "step": 1220 }, { "epoch": 1.115068493150685, "grad_norm": 25.54538917541504, "learning_rate": 9.873160832064941e-06, "loss": 0.1996, "step": 1221 }, { "epoch": 1.1159817351598174, "grad_norm": 26.78656768798828, "learning_rate": 9.872146118721462e-06, "loss": 0.3042, "step": 1222 }, { "epoch": 1.1168949771689498, "grad_norm": 9.422928810119629, "learning_rate": 9.871131405377982e-06, "loss": 0.1006, "step": 1223 }, { "epoch": 1.1178082191780823, "grad_norm": 14.787313461303711, "learning_rate": 9.870116692034501e-06, "loss": 0.1657, "step": 1224 }, { "epoch": 1.1187214611872145, "grad_norm": 88.80024719238281, "learning_rate": 9.86910197869102e-06, "loss": 5.0056, "step": 1225 }, { "epoch": 1.119634703196347, "grad_norm": 20.862699508666992, "learning_rate": 9.86808726534754e-06, "loss": 0.2317, "step": 1226 }, { "epoch": 1.1205479452054794, "grad_norm": 35.51770782470703, "learning_rate": 9.86707255200406e-06, "loss": 0.3372, "step": 1227 }, { "epoch": 1.1214611872146119, "grad_norm": 13.203450202941895, "learning_rate": 9.866057838660578e-06, "loss": 0.1776, "step": 1228 }, { "epoch": 1.1223744292237443, "grad_norm": 25.109724044799805, "learning_rate": 9.865043125317099e-06, "loss": 0.3474, "step": 1229 }, { "epoch": 1.1232876712328768, "grad_norm": 25.62274932861328, "learning_rate": 9.86402841197362e-06, "loss": 0.4813, "step": 1230 }, { "epoch": 1.1242009132420092, "grad_norm": 7.363849639892578, "learning_rate": 9.863013698630138e-06, "loss": 0.0952, "step": 1231 }, { "epoch": 1.1251141552511417, "grad_norm": 47.50288391113281, "learning_rate": 9.861998985286657e-06, "loss": 0.894, "step": 1232 }, { "epoch": 1.126027397260274, "grad_norm": 20.199132919311523, "learning_rate": 9.860984271943177e-06, "loss": 0.2741, "step": 1233 }, { "epoch": 1.1269406392694064, "grad_norm": 17.281333923339844, "learning_rate": 9.859969558599696e-06, "loss": 0.1132, "step": 1234 }, { "epoch": 1.1278538812785388, "grad_norm": 8.892801284790039, "learning_rate": 9.858954845256215e-06, "loss": 0.0746, "step": 1235 }, { "epoch": 1.1287671232876713, "grad_norm": 65.82695007324219, "learning_rate": 9.857940131912736e-06, "loss": 1.7299, "step": 1236 }, { "epoch": 1.1296803652968037, "grad_norm": 5.248528957366943, "learning_rate": 9.856925418569255e-06, "loss": 0.0551, "step": 1237 }, { "epoch": 1.1305936073059362, "grad_norm": 4.645214557647705, "learning_rate": 9.855910705225773e-06, "loss": 0.0663, "step": 1238 }, { "epoch": 1.1315068493150684, "grad_norm": 35.816802978515625, "learning_rate": 9.854895991882294e-06, "loss": 0.2922, "step": 1239 }, { "epoch": 1.1324200913242009, "grad_norm": 63.349647521972656, "learning_rate": 9.853881278538814e-06, "loss": 1.6879, "step": 1240 }, { "epoch": 1.1333333333333333, "grad_norm": 5.047916889190674, "learning_rate": 9.852866565195333e-06, "loss": 0.0465, "step": 1241 }, { "epoch": 1.1342465753424658, "grad_norm": 0.4489988088607788, "learning_rate": 9.851851851851852e-06, "loss": 0.0056, "step": 1242 }, { "epoch": 1.1351598173515982, "grad_norm": 12.529107093811035, "learning_rate": 9.850837138508373e-06, "loss": 0.1145, "step": 1243 }, { "epoch": 1.1360730593607307, "grad_norm": 16.505146026611328, "learning_rate": 9.849822425164892e-06, "loss": 0.2158, "step": 1244 }, { "epoch": 1.1369863013698631, "grad_norm": 0.7900995016098022, "learning_rate": 9.84880771182141e-06, "loss": 0.006, "step": 1245 }, { "epoch": 1.1378995433789953, "grad_norm": 29.352359771728516, "learning_rate": 9.847792998477931e-06, "loss": 0.3354, "step": 1246 }, { "epoch": 1.1388127853881278, "grad_norm": 156.06890869140625, "learning_rate": 9.846778285134451e-06, "loss": 4.2934, "step": 1247 }, { "epoch": 1.1397260273972603, "grad_norm": 7.336793899536133, "learning_rate": 9.845763571790969e-06, "loss": 0.0539, "step": 1248 }, { "epoch": 1.1406392694063927, "grad_norm": 4.084846496582031, "learning_rate": 9.844748858447489e-06, "loss": 0.0437, "step": 1249 }, { "epoch": 1.1415525114155252, "grad_norm": 9.914817810058594, "learning_rate": 9.84373414510401e-06, "loss": 0.0918, "step": 1250 }, { "epoch": 1.1424657534246576, "grad_norm": 6.705550193786621, "learning_rate": 9.842719431760529e-06, "loss": 0.0682, "step": 1251 }, { "epoch": 1.1433789954337898, "grad_norm": 81.59422302246094, "learning_rate": 9.841704718417047e-06, "loss": 3.6612, "step": 1252 }, { "epoch": 1.1442922374429223, "grad_norm": 66.08964538574219, "learning_rate": 9.840690005073568e-06, "loss": 1.1194, "step": 1253 }, { "epoch": 1.1452054794520548, "grad_norm": 60.591819763183594, "learning_rate": 9.839675291730087e-06, "loss": 0.8931, "step": 1254 }, { "epoch": 1.1461187214611872, "grad_norm": 67.6300048828125, "learning_rate": 9.838660578386606e-06, "loss": 1.2345, "step": 1255 }, { "epoch": 1.1470319634703197, "grad_norm": 28.406105041503906, "learning_rate": 9.837645865043126e-06, "loss": 0.1933, "step": 1256 }, { "epoch": 1.1479452054794521, "grad_norm": 4.289877891540527, "learning_rate": 9.836631151699647e-06, "loss": 0.0303, "step": 1257 }, { "epoch": 1.1488584474885846, "grad_norm": 86.62110137939453, "learning_rate": 9.835616438356166e-06, "loss": 0.9639, "step": 1258 }, { "epoch": 1.1497716894977168, "grad_norm": 78.43213653564453, "learning_rate": 9.834601725012684e-06, "loss": 2.5271, "step": 1259 }, { "epoch": 1.1506849315068493, "grad_norm": 44.90906524658203, "learning_rate": 9.833587011669205e-06, "loss": 0.5943, "step": 1260 }, { "epoch": 1.1515981735159817, "grad_norm": 2.6936566829681396, "learning_rate": 9.832572298325724e-06, "loss": 0.0219, "step": 1261 }, { "epoch": 1.1525114155251142, "grad_norm": 23.749540328979492, "learning_rate": 9.831557584982243e-06, "loss": 0.2505, "step": 1262 }, { "epoch": 1.1534246575342466, "grad_norm": 23.713918685913086, "learning_rate": 9.830542871638763e-06, "loss": 0.2019, "step": 1263 }, { "epoch": 1.154337899543379, "grad_norm": 12.522826194763184, "learning_rate": 9.829528158295284e-06, "loss": 0.1732, "step": 1264 }, { "epoch": 1.1552511415525113, "grad_norm": 1.2397854328155518, "learning_rate": 9.8285134449518e-06, "loss": 0.0135, "step": 1265 }, { "epoch": 1.1561643835616437, "grad_norm": 78.26264953613281, "learning_rate": 9.827498731608321e-06, "loss": 1.6792, "step": 1266 }, { "epoch": 1.1570776255707762, "grad_norm": 10.646299362182617, "learning_rate": 9.826484018264842e-06, "loss": 0.088, "step": 1267 }, { "epoch": 1.1579908675799087, "grad_norm": 44.26567840576172, "learning_rate": 9.82546930492136e-06, "loss": 0.2697, "step": 1268 }, { "epoch": 1.158904109589041, "grad_norm": 80.51653289794922, "learning_rate": 9.82445459157788e-06, "loss": 0.498, "step": 1269 }, { "epoch": 1.1598173515981736, "grad_norm": 21.342660903930664, "learning_rate": 9.8234398782344e-06, "loss": 0.1611, "step": 1270 }, { "epoch": 1.160730593607306, "grad_norm": 39.30296325683594, "learning_rate": 9.822425164890919e-06, "loss": 0.5821, "step": 1271 }, { "epoch": 1.1616438356164385, "grad_norm": 40.893211364746094, "learning_rate": 9.821410451547438e-06, "loss": 0.6536, "step": 1272 }, { "epoch": 1.1625570776255707, "grad_norm": 30.313264846801758, "learning_rate": 9.820395738203958e-06, "loss": 0.1934, "step": 1273 }, { "epoch": 1.1634703196347032, "grad_norm": 20.10514259338379, "learning_rate": 9.819381024860479e-06, "loss": 0.264, "step": 1274 }, { "epoch": 1.1643835616438356, "grad_norm": 82.0440902709961, "learning_rate": 9.818366311516998e-06, "loss": 2.2911, "step": 1275 }, { "epoch": 1.165296803652968, "grad_norm": 12.376786231994629, "learning_rate": 9.817351598173517e-06, "loss": 0.098, "step": 1276 }, { "epoch": 1.1662100456621005, "grad_norm": 53.29960632324219, "learning_rate": 9.816336884830037e-06, "loss": 0.3396, "step": 1277 }, { "epoch": 1.167123287671233, "grad_norm": 25.352275848388672, "learning_rate": 9.815322171486556e-06, "loss": 0.3198, "step": 1278 }, { "epoch": 1.1680365296803652, "grad_norm": 5.3171868324279785, "learning_rate": 9.814307458143075e-06, "loss": 0.0537, "step": 1279 }, { "epoch": 1.1689497716894977, "grad_norm": 58.578399658203125, "learning_rate": 9.813292744799595e-06, "loss": 0.6184, "step": 1280 }, { "epoch": 1.16986301369863, "grad_norm": 3.704026699066162, "learning_rate": 9.812278031456114e-06, "loss": 0.0348, "step": 1281 }, { "epoch": 1.1707762557077626, "grad_norm": 16.801334381103516, "learning_rate": 9.811263318112633e-06, "loss": 0.1181, "step": 1282 }, { "epoch": 1.171689497716895, "grad_norm": 14.507014274597168, "learning_rate": 9.810248604769154e-06, "loss": 0.1711, "step": 1283 }, { "epoch": 1.1726027397260275, "grad_norm": 2.0689210891723633, "learning_rate": 9.809233891425674e-06, "loss": 0.0206, "step": 1284 }, { "epoch": 1.17351598173516, "grad_norm": 32.27437210083008, "learning_rate": 9.808219178082193e-06, "loss": 0.3382, "step": 1285 }, { "epoch": 1.1744292237442922, "grad_norm": 43.41175842285156, "learning_rate": 9.807204464738712e-06, "loss": 0.5345, "step": 1286 }, { "epoch": 1.1753424657534246, "grad_norm": 59.642433166503906, "learning_rate": 9.806189751395232e-06, "loss": 0.7169, "step": 1287 }, { "epoch": 1.176255707762557, "grad_norm": 22.348304748535156, "learning_rate": 9.805175038051751e-06, "loss": 0.2099, "step": 1288 }, { "epoch": 1.1771689497716895, "grad_norm": 35.98337173461914, "learning_rate": 9.80416032470827e-06, "loss": 0.2489, "step": 1289 }, { "epoch": 1.178082191780822, "grad_norm": 75.41336822509766, "learning_rate": 9.80314561136479e-06, "loss": 1.3984, "step": 1290 }, { "epoch": 1.1789954337899544, "grad_norm": 134.04562377929688, "learning_rate": 9.80213089802131e-06, "loss": 0.8437, "step": 1291 }, { "epoch": 1.1799086757990866, "grad_norm": 12.860993385314941, "learning_rate": 9.801116184677828e-06, "loss": 0.0911, "step": 1292 }, { "epoch": 1.180821917808219, "grad_norm": 99.05829620361328, "learning_rate": 9.800101471334349e-06, "loss": 1.8396, "step": 1293 }, { "epoch": 1.1817351598173516, "grad_norm": 108.53589630126953, "learning_rate": 9.79908675799087e-06, "loss": 1.5642, "step": 1294 }, { "epoch": 1.182648401826484, "grad_norm": 76.76710510253906, "learning_rate": 9.798072044647388e-06, "loss": 1.7132, "step": 1295 }, { "epoch": 1.1835616438356165, "grad_norm": 29.778417587280273, "learning_rate": 9.797057331303907e-06, "loss": 0.3266, "step": 1296 }, { "epoch": 1.184474885844749, "grad_norm": 104.66461181640625, "learning_rate": 9.796042617960428e-06, "loss": 1.4737, "step": 1297 }, { "epoch": 1.1853881278538814, "grad_norm": 5.323672771453857, "learning_rate": 9.795027904616946e-06, "loss": 0.0355, "step": 1298 }, { "epoch": 1.1863013698630138, "grad_norm": 1.5514048337936401, "learning_rate": 9.794013191273465e-06, "loss": 0.0128, "step": 1299 }, { "epoch": 1.187214611872146, "grad_norm": 67.87527465820312, "learning_rate": 9.792998477929986e-06, "loss": 1.7109, "step": 1300 }, { "epoch": 1.1881278538812785, "grad_norm": 7.436203956604004, "learning_rate": 9.791983764586505e-06, "loss": 0.0661, "step": 1301 }, { "epoch": 1.189041095890411, "grad_norm": 34.19457244873047, "learning_rate": 9.790969051243025e-06, "loss": 0.394, "step": 1302 }, { "epoch": 1.1899543378995434, "grad_norm": 35.89522933959961, "learning_rate": 9.789954337899544e-06, "loss": 0.4812, "step": 1303 }, { "epoch": 1.1908675799086759, "grad_norm": 23.62456512451172, "learning_rate": 9.788939624556065e-06, "loss": 0.2636, "step": 1304 }, { "epoch": 1.191780821917808, "grad_norm": 1.1131627559661865, "learning_rate": 9.787924911212583e-06, "loss": 0.0087, "step": 1305 }, { "epoch": 1.1926940639269406, "grad_norm": 1.5946546792984009, "learning_rate": 9.786910197869102e-06, "loss": 0.0166, "step": 1306 }, { "epoch": 1.193607305936073, "grad_norm": 90.08351135253906, "learning_rate": 9.785895484525623e-06, "loss": 2.2185, "step": 1307 }, { "epoch": 1.1945205479452055, "grad_norm": 0.1915072202682495, "learning_rate": 9.784880771182142e-06, "loss": 0.0013, "step": 1308 }, { "epoch": 1.195433789954338, "grad_norm": 56.974666595458984, "learning_rate": 9.78386605783866e-06, "loss": 0.5118, "step": 1309 }, { "epoch": 1.1963470319634704, "grad_norm": 79.21725463867188, "learning_rate": 9.782851344495181e-06, "loss": 0.8692, "step": 1310 }, { "epoch": 1.1972602739726028, "grad_norm": 1.5976619720458984, "learning_rate": 9.7818366311517e-06, "loss": 0.0149, "step": 1311 }, { "epoch": 1.1981735159817353, "grad_norm": 89.97797393798828, "learning_rate": 9.78082191780822e-06, "loss": 1.8761, "step": 1312 }, { "epoch": 1.1990867579908675, "grad_norm": 40.5387077331543, "learning_rate": 9.77980720446474e-06, "loss": 0.5213, "step": 1313 }, { "epoch": 1.2, "grad_norm": 72.33155822753906, "learning_rate": 9.77879249112126e-06, "loss": 0.8937, "step": 1314 }, { "epoch": 1.2009132420091324, "grad_norm": 2.2777836322784424, "learning_rate": 9.777777777777779e-06, "loss": 0.017, "step": 1315 }, { "epoch": 1.2018264840182649, "grad_norm": 101.13744354248047, "learning_rate": 9.776763064434297e-06, "loss": 4.5838, "step": 1316 }, { "epoch": 1.2027397260273973, "grad_norm": 22.500551223754883, "learning_rate": 9.775748351090818e-06, "loss": 0.2362, "step": 1317 }, { "epoch": 1.2036529680365298, "grad_norm": 26.798280715942383, "learning_rate": 9.774733637747337e-06, "loss": 0.2356, "step": 1318 }, { "epoch": 1.204566210045662, "grad_norm": 77.63356018066406, "learning_rate": 9.773718924403857e-06, "loss": 0.7928, "step": 1319 }, { "epoch": 1.2054794520547945, "grad_norm": 46.75981140136719, "learning_rate": 9.772704211060376e-06, "loss": 0.5747, "step": 1320 }, { "epoch": 1.206392694063927, "grad_norm": 28.960811614990234, "learning_rate": 9.771689497716895e-06, "loss": 0.2103, "step": 1321 }, { "epoch": 1.2073059360730594, "grad_norm": 52.42885208129883, "learning_rate": 9.770674784373416e-06, "loss": 0.1923, "step": 1322 }, { "epoch": 1.2082191780821918, "grad_norm": 42.40227127075195, "learning_rate": 9.769660071029934e-06, "loss": 0.4148, "step": 1323 }, { "epoch": 1.2091324200913243, "grad_norm": 13.182405471801758, "learning_rate": 9.768645357686455e-06, "loss": 0.091, "step": 1324 }, { "epoch": 1.2100456621004567, "grad_norm": 6.859556674957275, "learning_rate": 9.767630644342974e-06, "loss": 0.0467, "step": 1325 }, { "epoch": 1.210958904109589, "grad_norm": 9.867477416992188, "learning_rate": 9.766615930999493e-06, "loss": 0.0899, "step": 1326 }, { "epoch": 1.2118721461187214, "grad_norm": 65.77630615234375, "learning_rate": 9.765601217656013e-06, "loss": 0.5433, "step": 1327 }, { "epoch": 1.2127853881278539, "grad_norm": 102.79207611083984, "learning_rate": 9.764586504312532e-06, "loss": 2.869, "step": 1328 }, { "epoch": 1.2136986301369863, "grad_norm": 60.08284378051758, "learning_rate": 9.763571790969053e-06, "loss": 0.772, "step": 1329 }, { "epoch": 1.2146118721461188, "grad_norm": 18.89305877685547, "learning_rate": 9.762557077625571e-06, "loss": 0.1614, "step": 1330 }, { "epoch": 1.2155251141552512, "grad_norm": 76.47148132324219, "learning_rate": 9.76154236428209e-06, "loss": 1.0893, "step": 1331 }, { "epoch": 1.2164383561643834, "grad_norm": 75.61084747314453, "learning_rate": 9.760527650938611e-06, "loss": 1.5073, "step": 1332 }, { "epoch": 1.217351598173516, "grad_norm": 27.764156341552734, "learning_rate": 9.75951293759513e-06, "loss": 0.2101, "step": 1333 }, { "epoch": 1.2182648401826484, "grad_norm": 20.110763549804688, "learning_rate": 9.75849822425165e-06, "loss": 0.224, "step": 1334 }, { "epoch": 1.2191780821917808, "grad_norm": 6.982361793518066, "learning_rate": 9.757483510908169e-06, "loss": 0.0682, "step": 1335 }, { "epoch": 1.2200913242009133, "grad_norm": 9.039687156677246, "learning_rate": 9.756468797564688e-06, "loss": 0.1055, "step": 1336 }, { "epoch": 1.2210045662100457, "grad_norm": 8.757218360900879, "learning_rate": 9.755454084221208e-06, "loss": 0.0509, "step": 1337 }, { "epoch": 1.2219178082191782, "grad_norm": 1.4776160717010498, "learning_rate": 9.754439370877727e-06, "loss": 0.0162, "step": 1338 }, { "epoch": 1.2228310502283106, "grad_norm": 20.996078491210938, "learning_rate": 9.753424657534248e-06, "loss": 0.2399, "step": 1339 }, { "epoch": 1.2237442922374429, "grad_norm": 2.496030569076538, "learning_rate": 9.752409944190767e-06, "loss": 0.0215, "step": 1340 }, { "epoch": 1.2246575342465753, "grad_norm": 14.291552543640137, "learning_rate": 9.751395230847286e-06, "loss": 0.1462, "step": 1341 }, { "epoch": 1.2255707762557078, "grad_norm": 44.46611022949219, "learning_rate": 9.750380517503806e-06, "loss": 0.4847, "step": 1342 }, { "epoch": 1.2264840182648402, "grad_norm": 17.44870948791504, "learning_rate": 9.749365804160325e-06, "loss": 0.1933, "step": 1343 }, { "epoch": 1.2273972602739727, "grad_norm": 100.36543273925781, "learning_rate": 9.748351090816845e-06, "loss": 3.6484, "step": 1344 }, { "epoch": 1.228310502283105, "grad_norm": 13.496602058410645, "learning_rate": 9.747336377473364e-06, "loss": 0.0786, "step": 1345 }, { "epoch": 1.2292237442922374, "grad_norm": 74.28704833984375, "learning_rate": 9.746321664129885e-06, "loss": 0.2385, "step": 1346 }, { "epoch": 1.2301369863013698, "grad_norm": 69.9590835571289, "learning_rate": 9.745306950786404e-06, "loss": 2.012, "step": 1347 }, { "epoch": 1.2310502283105023, "grad_norm": 8.23176097869873, "learning_rate": 9.744292237442923e-06, "loss": 0.0781, "step": 1348 }, { "epoch": 1.2319634703196347, "grad_norm": 26.78479766845703, "learning_rate": 9.743277524099443e-06, "loss": 0.321, "step": 1349 }, { "epoch": 1.2328767123287672, "grad_norm": 52.98798370361328, "learning_rate": 9.742262810755962e-06, "loss": 0.5822, "step": 1350 }, { "epoch": 1.2337899543378996, "grad_norm": 16.104337692260742, "learning_rate": 9.74124809741248e-06, "loss": 0.1356, "step": 1351 }, { "epoch": 1.234703196347032, "grad_norm": 17.398014068603516, "learning_rate": 9.740233384069001e-06, "loss": 0.1574, "step": 1352 }, { "epoch": 1.2356164383561643, "grad_norm": 53.93695068359375, "learning_rate": 9.73921867072552e-06, "loss": 0.6395, "step": 1353 }, { "epoch": 1.2365296803652968, "grad_norm": 101.99588012695312, "learning_rate": 9.73820395738204e-06, "loss": 1.111, "step": 1354 }, { "epoch": 1.2374429223744292, "grad_norm": 75.9369125366211, "learning_rate": 9.73718924403856e-06, "loss": 1.1366, "step": 1355 }, { "epoch": 1.2383561643835617, "grad_norm": 20.180665969848633, "learning_rate": 9.73617453069508e-06, "loss": 0.1388, "step": 1356 }, { "epoch": 1.2392694063926941, "grad_norm": 65.64923858642578, "learning_rate": 9.735159817351599e-06, "loss": 0.8174, "step": 1357 }, { "epoch": 1.2401826484018266, "grad_norm": 49.45367431640625, "learning_rate": 9.734145104008118e-06, "loss": 0.2589, "step": 1358 }, { "epoch": 1.2410958904109588, "grad_norm": 31.095754623413086, "learning_rate": 9.733130390664638e-06, "loss": 0.1899, "step": 1359 }, { "epoch": 1.2420091324200913, "grad_norm": 32.06346130371094, "learning_rate": 9.732115677321157e-06, "loss": 0.2166, "step": 1360 }, { "epoch": 1.2429223744292237, "grad_norm": 10.49650764465332, "learning_rate": 9.731100963977676e-06, "loss": 0.0818, "step": 1361 }, { "epoch": 1.2438356164383562, "grad_norm": 39.6473388671875, "learning_rate": 9.730086250634197e-06, "loss": 0.3423, "step": 1362 }, { "epoch": 1.2447488584474886, "grad_norm": 7.177674770355225, "learning_rate": 9.729071537290717e-06, "loss": 0.0632, "step": 1363 }, { "epoch": 1.245662100456621, "grad_norm": 66.9870376586914, "learning_rate": 9.728056823947236e-06, "loss": 1.1498, "step": 1364 }, { "epoch": 1.2465753424657535, "grad_norm": 5.909432888031006, "learning_rate": 9.727042110603755e-06, "loss": 0.0654, "step": 1365 }, { "epoch": 1.2474885844748858, "grad_norm": 33.9631462097168, "learning_rate": 9.726027397260275e-06, "loss": 0.2934, "step": 1366 }, { "epoch": 1.2484018264840182, "grad_norm": 22.62592315673828, "learning_rate": 9.725012683916794e-06, "loss": 0.1214, "step": 1367 }, { "epoch": 1.2493150684931507, "grad_norm": 61.025943756103516, "learning_rate": 9.723997970573313e-06, "loss": 0.5251, "step": 1368 }, { "epoch": 1.2502283105022831, "grad_norm": 84.50248718261719, "learning_rate": 9.722983257229834e-06, "loss": 0.904, "step": 1369 }, { "epoch": 1.2511415525114156, "grad_norm": 45.88552474975586, "learning_rate": 9.721968543886352e-06, "loss": 0.4643, "step": 1370 }, { "epoch": 1.252054794520548, "grad_norm": 8.397671699523926, "learning_rate": 9.720953830542871e-06, "loss": 0.0325, "step": 1371 }, { "epoch": 1.2529680365296803, "grad_norm": 57.79999542236328, "learning_rate": 9.719939117199392e-06, "loss": 0.5758, "step": 1372 }, { "epoch": 1.2538812785388127, "grad_norm": 67.5996322631836, "learning_rate": 9.718924403855912e-06, "loss": 1.3255, "step": 1373 }, { "epoch": 1.2547945205479452, "grad_norm": 112.09172058105469, "learning_rate": 9.717909690512431e-06, "loss": 0.3841, "step": 1374 }, { "epoch": 1.2557077625570776, "grad_norm": 69.1429443359375, "learning_rate": 9.71689497716895e-06, "loss": 0.5281, "step": 1375 }, { "epoch": 1.25662100456621, "grad_norm": 50.741634368896484, "learning_rate": 9.71588026382547e-06, "loss": 0.5349, "step": 1376 }, { "epoch": 1.2575342465753425, "grad_norm": 83.62986755371094, "learning_rate": 9.71486555048199e-06, "loss": 1.026, "step": 1377 }, { "epoch": 1.258447488584475, "grad_norm": 73.45326232910156, "learning_rate": 9.713850837138508e-06, "loss": 1.1464, "step": 1378 }, { "epoch": 1.2593607305936074, "grad_norm": 112.7414321899414, "learning_rate": 9.712836123795029e-06, "loss": 2.3833, "step": 1379 }, { "epoch": 1.2602739726027397, "grad_norm": 97.99798583984375, "learning_rate": 9.711821410451548e-06, "loss": 1.4879, "step": 1380 }, { "epoch": 1.261187214611872, "grad_norm": 72.68624877929688, "learning_rate": 9.710806697108066e-06, "loss": 1.1093, "step": 1381 }, { "epoch": 1.2621004566210046, "grad_norm": 74.87551879882812, "learning_rate": 9.709791983764587e-06, "loss": 0.6938, "step": 1382 }, { "epoch": 1.263013698630137, "grad_norm": 63.7757568359375, "learning_rate": 9.708777270421108e-06, "loss": 0.7102, "step": 1383 }, { "epoch": 1.2639269406392695, "grad_norm": 71.97911071777344, "learning_rate": 9.707762557077626e-06, "loss": 0.8395, "step": 1384 }, { "epoch": 1.2648401826484017, "grad_norm": 138.13546752929688, "learning_rate": 9.706747843734145e-06, "loss": 4.1725, "step": 1385 }, { "epoch": 1.2657534246575342, "grad_norm": 4.150033950805664, "learning_rate": 9.705733130390666e-06, "loss": 0.0281, "step": 1386 }, { "epoch": 1.2666666666666666, "grad_norm": 17.694217681884766, "learning_rate": 9.704718417047185e-06, "loss": 0.1207, "step": 1387 }, { "epoch": 1.267579908675799, "grad_norm": 1.1916723251342773, "learning_rate": 9.703703703703703e-06, "loss": 0.0085, "step": 1388 }, { "epoch": 1.2684931506849315, "grad_norm": 0.2847817838191986, "learning_rate": 9.702688990360224e-06, "loss": 0.0017, "step": 1389 }, { "epoch": 1.269406392694064, "grad_norm": 53.59150695800781, "learning_rate": 9.701674277016745e-06, "loss": 0.2502, "step": 1390 }, { "epoch": 1.2703196347031964, "grad_norm": 9.923317909240723, "learning_rate": 9.700659563673263e-06, "loss": 0.0683, "step": 1391 }, { "epoch": 1.2712328767123289, "grad_norm": 10.773807525634766, "learning_rate": 9.699644850329782e-06, "loss": 0.0727, "step": 1392 }, { "epoch": 1.272146118721461, "grad_norm": 21.299779891967773, "learning_rate": 9.698630136986303e-06, "loss": 0.1404, "step": 1393 }, { "epoch": 1.2730593607305936, "grad_norm": 64.33821105957031, "learning_rate": 9.697615423642822e-06, "loss": 0.6946, "step": 1394 }, { "epoch": 1.273972602739726, "grad_norm": 90.3368148803711, "learning_rate": 9.69660071029934e-06, "loss": 0.6992, "step": 1395 }, { "epoch": 1.2748858447488585, "grad_norm": 4.694770336151123, "learning_rate": 9.695585996955861e-06, "loss": 0.0391, "step": 1396 }, { "epoch": 1.275799086757991, "grad_norm": 25.669994354248047, "learning_rate": 9.69457128361238e-06, "loss": 0.2137, "step": 1397 }, { "epoch": 1.2767123287671232, "grad_norm": 5.037447452545166, "learning_rate": 9.693556570268899e-06, "loss": 0.0386, "step": 1398 }, { "epoch": 1.2776255707762556, "grad_norm": 30.558486938476562, "learning_rate": 9.69254185692542e-06, "loss": 0.3864, "step": 1399 }, { "epoch": 1.278538812785388, "grad_norm": 90.134765625, "learning_rate": 9.69152714358194e-06, "loss": 1.7082, "step": 1400 }, { "epoch": 1.2794520547945205, "grad_norm": 146.46066284179688, "learning_rate": 9.690512430238459e-06, "loss": 1.5407, "step": 1401 }, { "epoch": 1.280365296803653, "grad_norm": 29.380475997924805, "learning_rate": 9.689497716894977e-06, "loss": 0.2179, "step": 1402 }, { "epoch": 1.2812785388127854, "grad_norm": 3.8395276069641113, "learning_rate": 9.688483003551498e-06, "loss": 0.0194, "step": 1403 }, { "epoch": 1.2821917808219179, "grad_norm": 2.5694339275360107, "learning_rate": 9.687468290208017e-06, "loss": 0.0132, "step": 1404 }, { "epoch": 1.2831050228310503, "grad_norm": 20.256622314453125, "learning_rate": 9.686453576864536e-06, "loss": 0.1638, "step": 1405 }, { "epoch": 1.2840182648401828, "grad_norm": 10.597224235534668, "learning_rate": 9.685438863521056e-06, "loss": 0.0702, "step": 1406 }, { "epoch": 1.284931506849315, "grad_norm": 2.3567984104156494, "learning_rate": 9.684424150177577e-06, "loss": 0.0061, "step": 1407 }, { "epoch": 1.2858447488584475, "grad_norm": 2.721822738647461, "learning_rate": 9.683409436834094e-06, "loss": 0.0102, "step": 1408 }, { "epoch": 1.28675799086758, "grad_norm": 24.570236206054688, "learning_rate": 9.682394723490614e-06, "loss": 0.1116, "step": 1409 }, { "epoch": 1.2876712328767124, "grad_norm": 3.2016706466674805, "learning_rate": 9.681380010147135e-06, "loss": 0.0177, "step": 1410 }, { "epoch": 1.2885844748858448, "grad_norm": 171.8626251220703, "learning_rate": 9.680365296803654e-06, "loss": 1.2564, "step": 1411 }, { "epoch": 1.289497716894977, "grad_norm": 107.90061950683594, "learning_rate": 9.679350583460173e-06, "loss": 1.9399, "step": 1412 }, { "epoch": 1.2904109589041095, "grad_norm": 65.92833709716797, "learning_rate": 9.678335870116693e-06, "loss": 0.3302, "step": 1413 }, { "epoch": 1.291324200913242, "grad_norm": 4.622359752655029, "learning_rate": 9.677321156773212e-06, "loss": 0.0235, "step": 1414 }, { "epoch": 1.2922374429223744, "grad_norm": 19.300851821899414, "learning_rate": 9.676306443429731e-06, "loss": 0.1043, "step": 1415 }, { "epoch": 1.2931506849315069, "grad_norm": 61.5306396484375, "learning_rate": 9.675291730086251e-06, "loss": 0.7189, "step": 1416 }, { "epoch": 1.2940639269406393, "grad_norm": 16.39863395690918, "learning_rate": 9.674277016742772e-06, "loss": 0.1084, "step": 1417 }, { "epoch": 1.2949771689497718, "grad_norm": 9.938925743103027, "learning_rate": 9.67326230339929e-06, "loss": 0.0858, "step": 1418 }, { "epoch": 1.2958904109589042, "grad_norm": 99.3412857055664, "learning_rate": 9.67224759005581e-06, "loss": 1.7747, "step": 1419 }, { "epoch": 1.2968036529680365, "grad_norm": 1.001049280166626, "learning_rate": 9.67123287671233e-06, "loss": 0.0076, "step": 1420 }, { "epoch": 1.297716894977169, "grad_norm": 44.922786712646484, "learning_rate": 9.670218163368849e-06, "loss": 0.6992, "step": 1421 }, { "epoch": 1.2986301369863014, "grad_norm": 65.34244537353516, "learning_rate": 9.669203450025368e-06, "loss": 0.8955, "step": 1422 }, { "epoch": 1.2995433789954338, "grad_norm": 98.58489227294922, "learning_rate": 9.668188736681888e-06, "loss": 3.581, "step": 1423 }, { "epoch": 1.3004566210045663, "grad_norm": 15.355813980102539, "learning_rate": 9.667174023338409e-06, "loss": 0.1275, "step": 1424 }, { "epoch": 1.3013698630136985, "grad_norm": 189.42855834960938, "learning_rate": 9.666159309994926e-06, "loss": 1.0924, "step": 1425 }, { "epoch": 1.302283105022831, "grad_norm": 18.816686630249023, "learning_rate": 9.665144596651447e-06, "loss": 0.1841, "step": 1426 }, { "epoch": 1.3031963470319634, "grad_norm": 82.831298828125, "learning_rate": 9.664129883307967e-06, "loss": 0.8549, "step": 1427 }, { "epoch": 1.3041095890410959, "grad_norm": 38.54985809326172, "learning_rate": 9.663115169964486e-06, "loss": 0.4426, "step": 1428 }, { "epoch": 1.3050228310502283, "grad_norm": 4.896639823913574, "learning_rate": 9.662100456621005e-06, "loss": 0.0395, "step": 1429 }, { "epoch": 1.3059360730593608, "grad_norm": 80.0697250366211, "learning_rate": 9.661085743277525e-06, "loss": 4.1802, "step": 1430 }, { "epoch": 1.3068493150684932, "grad_norm": 46.15632247924805, "learning_rate": 9.660071029934044e-06, "loss": 0.341, "step": 1431 }, { "epoch": 1.3077625570776257, "grad_norm": 31.489274978637695, "learning_rate": 9.659056316590563e-06, "loss": 0.4029, "step": 1432 }, { "epoch": 1.3086757990867581, "grad_norm": 61.999786376953125, "learning_rate": 9.658041603247084e-06, "loss": 1.0149, "step": 1433 }, { "epoch": 1.3095890410958904, "grad_norm": 33.68089294433594, "learning_rate": 9.657026889903604e-06, "loss": 0.2838, "step": 1434 }, { "epoch": 1.3105022831050228, "grad_norm": 5.500439643859863, "learning_rate": 9.656012176560123e-06, "loss": 0.044, "step": 1435 }, { "epoch": 1.3114155251141553, "grad_norm": 16.047311782836914, "learning_rate": 9.654997463216642e-06, "loss": 0.171, "step": 1436 }, { "epoch": 1.3123287671232877, "grad_norm": 87.92000579833984, "learning_rate": 9.653982749873162e-06, "loss": 3.0954, "step": 1437 }, { "epoch": 1.3132420091324202, "grad_norm": 80.80741119384766, "learning_rate": 9.652968036529681e-06, "loss": 2.3257, "step": 1438 }, { "epoch": 1.3141552511415524, "grad_norm": 31.766555786132812, "learning_rate": 9.6519533231862e-06, "loss": 0.3942, "step": 1439 }, { "epoch": 1.3150684931506849, "grad_norm": 7.00796365737915, "learning_rate": 9.65093860984272e-06, "loss": 0.074, "step": 1440 }, { "epoch": 1.3159817351598173, "grad_norm": 43.61451721191406, "learning_rate": 9.64992389649924e-06, "loss": 0.7853, "step": 1441 }, { "epoch": 1.3168949771689498, "grad_norm": 37.971988677978516, "learning_rate": 9.648909183155758e-06, "loss": 0.4117, "step": 1442 }, { "epoch": 1.3178082191780822, "grad_norm": 49.832969665527344, "learning_rate": 9.647894469812279e-06, "loss": 1.4936, "step": 1443 }, { "epoch": 1.3187214611872147, "grad_norm": 50.70769500732422, "learning_rate": 9.6468797564688e-06, "loss": 0.7108, "step": 1444 }, { "epoch": 1.3196347031963471, "grad_norm": 6.926607608795166, "learning_rate": 9.645865043125318e-06, "loss": 0.0557, "step": 1445 }, { "epoch": 1.3205479452054796, "grad_norm": 48.434635162353516, "learning_rate": 9.644850329781837e-06, "loss": 0.6785, "step": 1446 }, { "epoch": 1.3214611872146118, "grad_norm": 26.447168350219727, "learning_rate": 9.643835616438358e-06, "loss": 0.3194, "step": 1447 }, { "epoch": 1.3223744292237443, "grad_norm": 84.07521057128906, "learning_rate": 9.642820903094877e-06, "loss": 2.3346, "step": 1448 }, { "epoch": 1.3232876712328767, "grad_norm": 21.902244567871094, "learning_rate": 9.641806189751395e-06, "loss": 0.234, "step": 1449 }, { "epoch": 1.3242009132420092, "grad_norm": 3.658726215362549, "learning_rate": 9.640791476407916e-06, "loss": 0.0278, "step": 1450 }, { "epoch": 1.3251141552511416, "grad_norm": 36.34769821166992, "learning_rate": 9.639776763064435e-06, "loss": 0.4927, "step": 1451 }, { "epoch": 1.3260273972602739, "grad_norm": 29.01278305053711, "learning_rate": 9.638762049720954e-06, "loss": 0.1039, "step": 1452 }, { "epoch": 1.3269406392694063, "grad_norm": 35.09388732910156, "learning_rate": 9.637747336377474e-06, "loss": 0.4738, "step": 1453 }, { "epoch": 1.3278538812785388, "grad_norm": 47.38325119018555, "learning_rate": 9.636732623033995e-06, "loss": 0.534, "step": 1454 }, { "epoch": 1.3287671232876712, "grad_norm": 44.09049987792969, "learning_rate": 9.635717909690514e-06, "loss": 0.477, "step": 1455 }, { "epoch": 1.3296803652968037, "grad_norm": 62.19829177856445, "learning_rate": 9.634703196347032e-06, "loss": 1.2007, "step": 1456 }, { "epoch": 1.3305936073059361, "grad_norm": 35.42786407470703, "learning_rate": 9.633688483003553e-06, "loss": 0.2203, "step": 1457 }, { "epoch": 1.3315068493150686, "grad_norm": 6.3199849128723145, "learning_rate": 9.632673769660072e-06, "loss": 0.0662, "step": 1458 }, { "epoch": 1.332420091324201, "grad_norm": 1.4242088794708252, "learning_rate": 9.63165905631659e-06, "loss": 0.0096, "step": 1459 }, { "epoch": 1.3333333333333333, "grad_norm": 25.812896728515625, "learning_rate": 9.630644342973111e-06, "loss": 0.1839, "step": 1460 }, { "epoch": 1.3342465753424657, "grad_norm": 90.57601165771484, "learning_rate": 9.62962962962963e-06, "loss": 0.9005, "step": 1461 }, { "epoch": 1.3351598173515982, "grad_norm": 71.58470153808594, "learning_rate": 9.62861491628615e-06, "loss": 1.2591, "step": 1462 }, { "epoch": 1.3360730593607306, "grad_norm": 73.86967468261719, "learning_rate": 9.62760020294267e-06, "loss": 1.2841, "step": 1463 }, { "epoch": 1.336986301369863, "grad_norm": 7.551920413970947, "learning_rate": 9.62658548959919e-06, "loss": 0.0651, "step": 1464 }, { "epoch": 1.3378995433789953, "grad_norm": 43.48785400390625, "learning_rate": 9.625570776255709e-06, "loss": 0.255, "step": 1465 }, { "epoch": 1.3388127853881278, "grad_norm": 19.011568069458008, "learning_rate": 9.624556062912228e-06, "loss": 0.284, "step": 1466 }, { "epoch": 1.3397260273972602, "grad_norm": 1.5431352853775024, "learning_rate": 9.623541349568748e-06, "loss": 0.0096, "step": 1467 }, { "epoch": 1.3406392694063927, "grad_norm": 54.45808029174805, "learning_rate": 9.622526636225267e-06, "loss": 1.1732, "step": 1468 }, { "epoch": 1.3415525114155251, "grad_norm": 78.44134521484375, "learning_rate": 9.621511922881786e-06, "loss": 1.4068, "step": 1469 }, { "epoch": 1.3424657534246576, "grad_norm": 84.48384094238281, "learning_rate": 9.620497209538306e-06, "loss": 2.0141, "step": 1470 }, { "epoch": 1.34337899543379, "grad_norm": 4.08621883392334, "learning_rate": 9.619482496194825e-06, "loss": 0.0346, "step": 1471 }, { "epoch": 1.3442922374429225, "grad_norm": 38.54029846191406, "learning_rate": 9.618467782851346e-06, "loss": 0.5788, "step": 1472 }, { "epoch": 1.345205479452055, "grad_norm": 81.01033782958984, "learning_rate": 9.617453069507865e-06, "loss": 3.0452, "step": 1473 }, { "epoch": 1.3461187214611872, "grad_norm": 1.3637975454330444, "learning_rate": 9.616438356164385e-06, "loss": 0.0088, "step": 1474 }, { "epoch": 1.3470319634703196, "grad_norm": 24.143051147460938, "learning_rate": 9.615423642820904e-06, "loss": 0.2372, "step": 1475 }, { "epoch": 1.347945205479452, "grad_norm": 5.817459583282471, "learning_rate": 9.614408929477423e-06, "loss": 0.0546, "step": 1476 }, { "epoch": 1.3488584474885845, "grad_norm": 0.9851129055023193, "learning_rate": 9.613394216133943e-06, "loss": 0.0068, "step": 1477 }, { "epoch": 1.349771689497717, "grad_norm": 1.123574137687683, "learning_rate": 9.612379502790462e-06, "loss": 0.0121, "step": 1478 }, { "epoch": 1.3506849315068492, "grad_norm": 51.4036865234375, "learning_rate": 9.611364789446983e-06, "loss": 0.9576, "step": 1479 }, { "epoch": 1.3515981735159817, "grad_norm": 30.933441162109375, "learning_rate": 9.610350076103502e-06, "loss": 0.359, "step": 1480 }, { "epoch": 1.3525114155251141, "grad_norm": 7.64380407333374, "learning_rate": 9.60933536276002e-06, "loss": 0.0413, "step": 1481 }, { "epoch": 1.3534246575342466, "grad_norm": 103.71795654296875, "learning_rate": 9.608320649416541e-06, "loss": 2.5036, "step": 1482 }, { "epoch": 1.354337899543379, "grad_norm": 54.01983642578125, "learning_rate": 9.60730593607306e-06, "loss": 1.2138, "step": 1483 }, { "epoch": 1.3552511415525115, "grad_norm": 5.22285795211792, "learning_rate": 9.60629122272958e-06, "loss": 0.0465, "step": 1484 }, { "epoch": 1.356164383561644, "grad_norm": 71.15897369384766, "learning_rate": 9.6052765093861e-06, "loss": 4.1138, "step": 1485 }, { "epoch": 1.3570776255707764, "grad_norm": 22.409990310668945, "learning_rate": 9.604261796042618e-06, "loss": 0.3304, "step": 1486 }, { "epoch": 1.3579908675799086, "grad_norm": 83.15522003173828, "learning_rate": 9.603247082699139e-06, "loss": 1.2905, "step": 1487 }, { "epoch": 1.358904109589041, "grad_norm": 37.775856018066406, "learning_rate": 9.602232369355657e-06, "loss": 0.3779, "step": 1488 }, { "epoch": 1.3598173515981735, "grad_norm": 3.7423958778381348, "learning_rate": 9.601217656012178e-06, "loss": 0.0443, "step": 1489 }, { "epoch": 1.360730593607306, "grad_norm": 224.20973205566406, "learning_rate": 9.600202942668697e-06, "loss": 2.6999, "step": 1490 }, { "epoch": 1.3616438356164384, "grad_norm": 7.968358039855957, "learning_rate": 9.599188229325216e-06, "loss": 0.097, "step": 1491 }, { "epoch": 1.3625570776255707, "grad_norm": 5.74074125289917, "learning_rate": 9.598173515981736e-06, "loss": 0.0579, "step": 1492 }, { "epoch": 1.363470319634703, "grad_norm": 70.32708740234375, "learning_rate": 9.597158802638255e-06, "loss": 1.1187, "step": 1493 }, { "epoch": 1.3643835616438356, "grad_norm": 14.546177864074707, "learning_rate": 9.596144089294776e-06, "loss": 0.2209, "step": 1494 }, { "epoch": 1.365296803652968, "grad_norm": 31.634342193603516, "learning_rate": 9.595129375951294e-06, "loss": 0.3553, "step": 1495 }, { "epoch": 1.3662100456621005, "grad_norm": 76.03375244140625, "learning_rate": 9.594114662607813e-06, "loss": 3.4942, "step": 1496 }, { "epoch": 1.367123287671233, "grad_norm": 69.96204376220703, "learning_rate": 9.593099949264334e-06, "loss": 1.8429, "step": 1497 }, { "epoch": 1.3680365296803654, "grad_norm": 6.5434136390686035, "learning_rate": 9.592085235920853e-06, "loss": 0.0635, "step": 1498 }, { "epoch": 1.3689497716894978, "grad_norm": 16.487802505493164, "learning_rate": 9.591070522577373e-06, "loss": 0.2702, "step": 1499 }, { "epoch": 1.36986301369863, "grad_norm": 71.136474609375, "learning_rate": 9.590055809233892e-06, "loss": 4.033, "step": 1500 }, { "epoch": 1.3707762557077625, "grad_norm": 59.7105598449707, "learning_rate": 9.589041095890411e-06, "loss": 0.7157, "step": 1501 }, { "epoch": 1.371689497716895, "grad_norm": 6.653661251068115, "learning_rate": 9.588026382546931e-06, "loss": 0.0735, "step": 1502 }, { "epoch": 1.3726027397260274, "grad_norm": 30.954513549804688, "learning_rate": 9.58701166920345e-06, "loss": 0.7808, "step": 1503 }, { "epoch": 1.3735159817351599, "grad_norm": 42.57072830200195, "learning_rate": 9.58599695585997e-06, "loss": 0.6605, "step": 1504 }, { "epoch": 1.374429223744292, "grad_norm": 94.22811889648438, "learning_rate": 9.58498224251649e-06, "loss": 2.019, "step": 1505 }, { "epoch": 1.3753424657534246, "grad_norm": 61.55830383300781, "learning_rate": 9.58396752917301e-06, "loss": 1.4202, "step": 1506 }, { "epoch": 1.376255707762557, "grad_norm": 18.51468849182129, "learning_rate": 9.582952815829529e-06, "loss": 0.2506, "step": 1507 }, { "epoch": 1.3771689497716895, "grad_norm": 13.365358352661133, "learning_rate": 9.581938102486048e-06, "loss": 0.2119, "step": 1508 }, { "epoch": 1.378082191780822, "grad_norm": 77.93798828125, "learning_rate": 9.580923389142568e-06, "loss": 1.7609, "step": 1509 }, { "epoch": 1.3789954337899544, "grad_norm": 54.19511795043945, "learning_rate": 9.579908675799087e-06, "loss": 1.0986, "step": 1510 }, { "epoch": 1.3799086757990868, "grad_norm": 57.66780471801758, "learning_rate": 9.578893962455606e-06, "loss": 2.1229, "step": 1511 }, { "epoch": 1.3808219178082193, "grad_norm": 32.741493225097656, "learning_rate": 9.577879249112127e-06, "loss": 0.4042, "step": 1512 }, { "epoch": 1.3817351598173517, "grad_norm": 16.21018409729004, "learning_rate": 9.576864535768645e-06, "loss": 0.1832, "step": 1513 }, { "epoch": 1.382648401826484, "grad_norm": 4.552210330963135, "learning_rate": 9.575849822425166e-06, "loss": 0.0704, "step": 1514 }, { "epoch": 1.3835616438356164, "grad_norm": 7.412917137145996, "learning_rate": 9.574835109081685e-06, "loss": 0.0889, "step": 1515 }, { "epoch": 1.3844748858447489, "grad_norm": 28.872234344482422, "learning_rate": 9.573820395738205e-06, "loss": 0.3458, "step": 1516 }, { "epoch": 1.3853881278538813, "grad_norm": 51.31852722167969, "learning_rate": 9.572805682394724e-06, "loss": 0.6722, "step": 1517 }, { "epoch": 1.3863013698630138, "grad_norm": 46.49010467529297, "learning_rate": 9.571790969051243e-06, "loss": 0.6966, "step": 1518 }, { "epoch": 1.387214611872146, "grad_norm": 1.3255218267440796, "learning_rate": 9.570776255707764e-06, "loss": 0.0135, "step": 1519 }, { "epoch": 1.3881278538812785, "grad_norm": 6.05112886428833, "learning_rate": 9.569761542364282e-06, "loss": 0.071, "step": 1520 }, { "epoch": 1.389041095890411, "grad_norm": 38.630558013916016, "learning_rate": 9.568746829020801e-06, "loss": 0.3717, "step": 1521 }, { "epoch": 1.3899543378995434, "grad_norm": 25.53802490234375, "learning_rate": 9.567732115677322e-06, "loss": 0.3433, "step": 1522 }, { "epoch": 1.3908675799086758, "grad_norm": 65.20556640625, "learning_rate": 9.566717402333842e-06, "loss": 0.6252, "step": 1523 }, { "epoch": 1.3917808219178083, "grad_norm": 21.739166259765625, "learning_rate": 9.565702688990361e-06, "loss": 0.251, "step": 1524 }, { "epoch": 1.3926940639269407, "grad_norm": 55.90096664428711, "learning_rate": 9.56468797564688e-06, "loss": 0.9529, "step": 1525 }, { "epoch": 1.3936073059360732, "grad_norm": 66.39762878417969, "learning_rate": 9.5636732623034e-06, "loss": 1.9983, "step": 1526 }, { "epoch": 1.3945205479452054, "grad_norm": 32.10663604736328, "learning_rate": 9.56265854895992e-06, "loss": 0.3597, "step": 1527 }, { "epoch": 1.3954337899543379, "grad_norm": 30.248992919921875, "learning_rate": 9.561643835616438e-06, "loss": 0.8644, "step": 1528 }, { "epoch": 1.3963470319634703, "grad_norm": 0.8164404034614563, "learning_rate": 9.560629122272959e-06, "loss": 0.01, "step": 1529 }, { "epoch": 1.3972602739726028, "grad_norm": 3.1497678756713867, "learning_rate": 9.559614408929478e-06, "loss": 0.0318, "step": 1530 }, { "epoch": 1.3981735159817352, "grad_norm": 12.861664772033691, "learning_rate": 9.558599695585997e-06, "loss": 0.1965, "step": 1531 }, { "epoch": 1.3990867579908675, "grad_norm": 22.98348617553711, "learning_rate": 9.557584982242517e-06, "loss": 0.2828, "step": 1532 }, { "epoch": 1.4, "grad_norm": 39.568336486816406, "learning_rate": 9.556570268899038e-06, "loss": 0.5672, "step": 1533 }, { "epoch": 1.4009132420091324, "grad_norm": 38.919342041015625, "learning_rate": 9.555555555555556e-06, "loss": 0.4044, "step": 1534 }, { "epoch": 1.4018264840182648, "grad_norm": 30.428747177124023, "learning_rate": 9.554540842212075e-06, "loss": 0.3197, "step": 1535 }, { "epoch": 1.4027397260273973, "grad_norm": 21.503053665161133, "learning_rate": 9.553526128868596e-06, "loss": 0.1955, "step": 1536 }, { "epoch": 1.4036529680365297, "grad_norm": 58.029319763183594, "learning_rate": 9.552511415525115e-06, "loss": 2.2867, "step": 1537 }, { "epoch": 1.4045662100456622, "grad_norm": 23.19981575012207, "learning_rate": 9.551496702181634e-06, "loss": 0.2823, "step": 1538 }, { "epoch": 1.4054794520547946, "grad_norm": 70.9052963256836, "learning_rate": 9.550481988838154e-06, "loss": 2.0096, "step": 1539 }, { "epoch": 1.4063926940639269, "grad_norm": 53.16992950439453, "learning_rate": 9.549467275494673e-06, "loss": 0.5198, "step": 1540 }, { "epoch": 1.4073059360730593, "grad_norm": 7.446700096130371, "learning_rate": 9.548452562151192e-06, "loss": 0.0948, "step": 1541 }, { "epoch": 1.4082191780821918, "grad_norm": 15.08086109161377, "learning_rate": 9.547437848807712e-06, "loss": 0.0844, "step": 1542 }, { "epoch": 1.4091324200913242, "grad_norm": 15.049654960632324, "learning_rate": 9.546423135464233e-06, "loss": 0.1636, "step": 1543 }, { "epoch": 1.4100456621004567, "grad_norm": 6.431410312652588, "learning_rate": 9.545408422120752e-06, "loss": 0.0629, "step": 1544 }, { "epoch": 1.410958904109589, "grad_norm": 83.44270324707031, "learning_rate": 9.54439370877727e-06, "loss": 3.5292, "step": 1545 }, { "epoch": 1.4118721461187214, "grad_norm": 11.698816299438477, "learning_rate": 9.543378995433791e-06, "loss": 0.1356, "step": 1546 }, { "epoch": 1.4127853881278538, "grad_norm": 9.764521598815918, "learning_rate": 9.54236428209031e-06, "loss": 0.1543, "step": 1547 }, { "epoch": 1.4136986301369863, "grad_norm": 17.27014923095703, "learning_rate": 9.541349568746829e-06, "loss": 0.2516, "step": 1548 }, { "epoch": 1.4146118721461187, "grad_norm": 29.205245971679688, "learning_rate": 9.54033485540335e-06, "loss": 0.4321, "step": 1549 }, { "epoch": 1.4155251141552512, "grad_norm": 2.134336233139038, "learning_rate": 9.53932014205987e-06, "loss": 0.0293, "step": 1550 }, { "epoch": 1.4164383561643836, "grad_norm": 70.25287628173828, "learning_rate": 9.538305428716389e-06, "loss": 3.4964, "step": 1551 }, { "epoch": 1.417351598173516, "grad_norm": 73.83332824707031, "learning_rate": 9.537290715372908e-06, "loss": 1.3027, "step": 1552 }, { "epoch": 1.4182648401826485, "grad_norm": 6.622846603393555, "learning_rate": 9.536276002029428e-06, "loss": 0.0756, "step": 1553 }, { "epoch": 1.4191780821917808, "grad_norm": 10.546354293823242, "learning_rate": 9.535261288685947e-06, "loss": 0.1193, "step": 1554 }, { "epoch": 1.4200913242009132, "grad_norm": 55.4562873840332, "learning_rate": 9.534246575342466e-06, "loss": 0.8673, "step": 1555 }, { "epoch": 1.4210045662100457, "grad_norm": 55.955326080322266, "learning_rate": 9.533231861998986e-06, "loss": 2.3767, "step": 1556 }, { "epoch": 1.4219178082191781, "grad_norm": 64.64836120605469, "learning_rate": 9.532217148655505e-06, "loss": 1.1621, "step": 1557 }, { "epoch": 1.4228310502283106, "grad_norm": 2.0497853755950928, "learning_rate": 9.531202435312024e-06, "loss": 0.0268, "step": 1558 }, { "epoch": 1.4237442922374428, "grad_norm": 11.41779613494873, "learning_rate": 9.530187721968545e-06, "loss": 0.1001, "step": 1559 }, { "epoch": 1.4246575342465753, "grad_norm": 68.68453216552734, "learning_rate": 9.529173008625065e-06, "loss": 2.9501, "step": 1560 }, { "epoch": 1.4255707762557077, "grad_norm": 0.9464531540870667, "learning_rate": 9.528158295281584e-06, "loss": 0.0097, "step": 1561 }, { "epoch": 1.4264840182648402, "grad_norm": 74.70503234863281, "learning_rate": 9.527143581938103e-06, "loss": 1.4782, "step": 1562 }, { "epoch": 1.4273972602739726, "grad_norm": 29.338958740234375, "learning_rate": 9.526128868594623e-06, "loss": 0.21, "step": 1563 }, { "epoch": 1.428310502283105, "grad_norm": 53.442543029785156, "learning_rate": 9.525114155251142e-06, "loss": 0.6548, "step": 1564 }, { "epoch": 1.4292237442922375, "grad_norm": 22.638837814331055, "learning_rate": 9.524099441907661e-06, "loss": 0.2928, "step": 1565 }, { "epoch": 1.43013698630137, "grad_norm": 39.10401916503906, "learning_rate": 9.523084728564182e-06, "loss": 0.4156, "step": 1566 }, { "epoch": 1.4310502283105022, "grad_norm": 109.79373931884766, "learning_rate": 9.522070015220702e-06, "loss": 0.7524, "step": 1567 }, { "epoch": 1.4319634703196347, "grad_norm": 4.062772750854492, "learning_rate": 9.52105530187722e-06, "loss": 0.0329, "step": 1568 }, { "epoch": 1.4328767123287671, "grad_norm": 32.68654251098633, "learning_rate": 9.52004058853374e-06, "loss": 0.3369, "step": 1569 }, { "epoch": 1.4337899543378996, "grad_norm": 4.201336860656738, "learning_rate": 9.51902587519026e-06, "loss": 0.0289, "step": 1570 }, { "epoch": 1.434703196347032, "grad_norm": 18.691307067871094, "learning_rate": 9.51801116184678e-06, "loss": 0.1233, "step": 1571 }, { "epoch": 1.4356164383561643, "grad_norm": 15.714521408081055, "learning_rate": 9.516996448503298e-06, "loss": 0.2088, "step": 1572 }, { "epoch": 1.4365296803652967, "grad_norm": 11.052412033081055, "learning_rate": 9.515981735159819e-06, "loss": 0.1382, "step": 1573 }, { "epoch": 1.4374429223744292, "grad_norm": 22.710851669311523, "learning_rate": 9.514967021816337e-06, "loss": 0.3964, "step": 1574 }, { "epoch": 1.4383561643835616, "grad_norm": 111.91967010498047, "learning_rate": 9.513952308472856e-06, "loss": 1.1223, "step": 1575 }, { "epoch": 1.439269406392694, "grad_norm": 16.076980590820312, "learning_rate": 9.512937595129377e-06, "loss": 0.2195, "step": 1576 }, { "epoch": 1.4401826484018265, "grad_norm": 1.0582917928695679, "learning_rate": 9.511922881785897e-06, "loss": 0.0132, "step": 1577 }, { "epoch": 1.441095890410959, "grad_norm": 4.027010440826416, "learning_rate": 9.510908168442416e-06, "loss": 0.0381, "step": 1578 }, { "epoch": 1.4420091324200914, "grad_norm": 10.182439804077148, "learning_rate": 9.509893455098935e-06, "loss": 0.0881, "step": 1579 }, { "epoch": 1.4429223744292237, "grad_norm": 11.512910842895508, "learning_rate": 9.508878741755456e-06, "loss": 0.1204, "step": 1580 }, { "epoch": 1.4438356164383561, "grad_norm": 6.788900375366211, "learning_rate": 9.507864028411974e-06, "loss": 0.0495, "step": 1581 }, { "epoch": 1.4447488584474886, "grad_norm": 168.98052978515625, "learning_rate": 9.506849315068493e-06, "loss": 0.4361, "step": 1582 }, { "epoch": 1.445662100456621, "grad_norm": 1.506465196609497, "learning_rate": 9.505834601725014e-06, "loss": 0.0147, "step": 1583 }, { "epoch": 1.4465753424657535, "grad_norm": 94.01361846923828, "learning_rate": 9.504819888381533e-06, "loss": 3.532, "step": 1584 }, { "epoch": 1.4474885844748857, "grad_norm": 60.66515350341797, "learning_rate": 9.503805175038051e-06, "loss": 1.2916, "step": 1585 }, { "epoch": 1.4484018264840182, "grad_norm": 2.251344919204712, "learning_rate": 9.502790461694572e-06, "loss": 0.0212, "step": 1586 }, { "epoch": 1.4493150684931506, "grad_norm": 8.240509986877441, "learning_rate": 9.501775748351093e-06, "loss": 0.065, "step": 1587 }, { "epoch": 1.450228310502283, "grad_norm": 17.288837432861328, "learning_rate": 9.500761035007611e-06, "loss": 0.1363, "step": 1588 }, { "epoch": 1.4511415525114155, "grad_norm": 12.358570098876953, "learning_rate": 9.49974632166413e-06, "loss": 0.1279, "step": 1589 }, { "epoch": 1.452054794520548, "grad_norm": 2.6537132263183594, "learning_rate": 9.49873160832065e-06, "loss": 0.0323, "step": 1590 }, { "epoch": 1.4529680365296804, "grad_norm": 32.00151443481445, "learning_rate": 9.49771689497717e-06, "loss": 0.4817, "step": 1591 }, { "epoch": 1.4538812785388129, "grad_norm": 17.92694664001465, "learning_rate": 9.496702181633688e-06, "loss": 0.1605, "step": 1592 }, { "epoch": 1.4547945205479453, "grad_norm": 8.430505752563477, "learning_rate": 9.495687468290209e-06, "loss": 0.0781, "step": 1593 }, { "epoch": 1.4557077625570776, "grad_norm": 3.1785147190093994, "learning_rate": 9.49467275494673e-06, "loss": 0.0301, "step": 1594 }, { "epoch": 1.45662100456621, "grad_norm": 26.774803161621094, "learning_rate": 9.493658041603248e-06, "loss": 0.2981, "step": 1595 }, { "epoch": 1.4575342465753425, "grad_norm": 13.582262992858887, "learning_rate": 9.492643328259767e-06, "loss": 0.1524, "step": 1596 }, { "epoch": 1.458447488584475, "grad_norm": 3.0308339595794678, "learning_rate": 9.491628614916288e-06, "loss": 0.0327, "step": 1597 }, { "epoch": 1.4593607305936074, "grad_norm": 8.058058738708496, "learning_rate": 9.490613901572807e-06, "loss": 0.087, "step": 1598 }, { "epoch": 1.4602739726027396, "grad_norm": 2.233325958251953, "learning_rate": 9.489599188229325e-06, "loss": 0.0182, "step": 1599 }, { "epoch": 1.461187214611872, "grad_norm": 110.89195251464844, "learning_rate": 9.488584474885846e-06, "loss": 1.2701, "step": 1600 }, { "epoch": 1.4621004566210045, "grad_norm": 1.653867483139038, "learning_rate": 9.487569761542365e-06, "loss": 0.0181, "step": 1601 }, { "epoch": 1.463013698630137, "grad_norm": 54.10320281982422, "learning_rate": 9.486555048198884e-06, "loss": 0.2272, "step": 1602 }, { "epoch": 1.4639269406392694, "grad_norm": 19.972335815429688, "learning_rate": 9.485540334855404e-06, "loss": 0.1764, "step": 1603 }, { "epoch": 1.4648401826484019, "grad_norm": 28.75660514831543, "learning_rate": 9.484525621511925e-06, "loss": 0.4653, "step": 1604 }, { "epoch": 1.4657534246575343, "grad_norm": 23.844585418701172, "learning_rate": 9.483510908168444e-06, "loss": 0.2678, "step": 1605 }, { "epoch": 1.4666666666666668, "grad_norm": 46.42759323120117, "learning_rate": 9.482496194824962e-06, "loss": 0.9345, "step": 1606 }, { "epoch": 1.467579908675799, "grad_norm": 52.71480941772461, "learning_rate": 9.481481481481483e-06, "loss": 0.4306, "step": 1607 }, { "epoch": 1.4684931506849315, "grad_norm": 18.120468139648438, "learning_rate": 9.480466768138002e-06, "loss": 0.0443, "step": 1608 }, { "epoch": 1.469406392694064, "grad_norm": 0.4596172869205475, "learning_rate": 9.47945205479452e-06, "loss": 0.0038, "step": 1609 }, { "epoch": 1.4703196347031964, "grad_norm": 33.53336715698242, "learning_rate": 9.478437341451041e-06, "loss": 0.5415, "step": 1610 }, { "epoch": 1.4712328767123288, "grad_norm": 66.99646759033203, "learning_rate": 9.47742262810756e-06, "loss": 1.6276, "step": 1611 }, { "epoch": 1.472146118721461, "grad_norm": 73.13235473632812, "learning_rate": 9.476407914764079e-06, "loss": 1.7571, "step": 1612 }, { "epoch": 1.4730593607305935, "grad_norm": 0.5338083505630493, "learning_rate": 9.4753932014206e-06, "loss": 0.0058, "step": 1613 }, { "epoch": 1.473972602739726, "grad_norm": 93.96912384033203, "learning_rate": 9.47437848807712e-06, "loss": 2.8065, "step": 1614 }, { "epoch": 1.4748858447488584, "grad_norm": 1.6879627704620361, "learning_rate": 9.473363774733639e-06, "loss": 0.0101, "step": 1615 }, { "epoch": 1.4757990867579909, "grad_norm": 30.691696166992188, "learning_rate": 9.472349061390158e-06, "loss": 0.5927, "step": 1616 }, { "epoch": 1.4767123287671233, "grad_norm": 46.1328239440918, "learning_rate": 9.471334348046678e-06, "loss": 0.5572, "step": 1617 }, { "epoch": 1.4776255707762558, "grad_norm": 3.7443790435791016, "learning_rate": 9.470319634703197e-06, "loss": 0.0359, "step": 1618 }, { "epoch": 1.4785388127853882, "grad_norm": 46.01026916503906, "learning_rate": 9.469304921359716e-06, "loss": 0.6352, "step": 1619 }, { "epoch": 1.4794520547945205, "grad_norm": 2.315256357192993, "learning_rate": 9.468290208016236e-06, "loss": 0.0224, "step": 1620 }, { "epoch": 1.480365296803653, "grad_norm": 2.0677030086517334, "learning_rate": 9.467275494672755e-06, "loss": 0.0232, "step": 1621 }, { "epoch": 1.4812785388127854, "grad_norm": 4.016256332397461, "learning_rate": 9.466260781329276e-06, "loss": 0.0365, "step": 1622 }, { "epoch": 1.4821917808219178, "grad_norm": 3.6809675693511963, "learning_rate": 9.465246067985795e-06, "loss": 0.0303, "step": 1623 }, { "epoch": 1.4831050228310503, "grad_norm": 22.754255294799805, "learning_rate": 9.464231354642315e-06, "loss": 0.2686, "step": 1624 }, { "epoch": 1.4840182648401825, "grad_norm": 1.5533910989761353, "learning_rate": 9.463216641298834e-06, "loss": 0.014, "step": 1625 }, { "epoch": 1.484931506849315, "grad_norm": 5.197054386138916, "learning_rate": 9.462201927955353e-06, "loss": 0.0512, "step": 1626 }, { "epoch": 1.4858447488584474, "grad_norm": 1.9355757236480713, "learning_rate": 9.461187214611873e-06, "loss": 0.0148, "step": 1627 }, { "epoch": 1.4867579908675799, "grad_norm": 0.7837859988212585, "learning_rate": 9.460172501268392e-06, "loss": 0.0081, "step": 1628 }, { "epoch": 1.4876712328767123, "grad_norm": 4.971139430999756, "learning_rate": 9.459157787924911e-06, "loss": 0.0371, "step": 1629 }, { "epoch": 1.4885844748858448, "grad_norm": 53.293548583984375, "learning_rate": 9.458143074581432e-06, "loss": 0.3732, "step": 1630 }, { "epoch": 1.4894977168949772, "grad_norm": 27.215333938598633, "learning_rate": 9.45712836123795e-06, "loss": 0.3193, "step": 1631 }, { "epoch": 1.4904109589041097, "grad_norm": 10.62795639038086, "learning_rate": 9.456113647894471e-06, "loss": 0.0912, "step": 1632 }, { "epoch": 1.4913242009132421, "grad_norm": 30.904953002929688, "learning_rate": 9.45509893455099e-06, "loss": 0.4904, "step": 1633 }, { "epoch": 1.4922374429223744, "grad_norm": 31.799448013305664, "learning_rate": 9.45408422120751e-06, "loss": 0.412, "step": 1634 }, { "epoch": 1.4931506849315068, "grad_norm": 45.001068115234375, "learning_rate": 9.45306950786403e-06, "loss": 1.0935, "step": 1635 }, { "epoch": 1.4940639269406393, "grad_norm": 17.78378677368164, "learning_rate": 9.452054794520548e-06, "loss": 0.2171, "step": 1636 }, { "epoch": 1.4949771689497717, "grad_norm": 6.406611919403076, "learning_rate": 9.451040081177069e-06, "loss": 0.0665, "step": 1637 }, { "epoch": 1.4958904109589042, "grad_norm": 0.49299925565719604, "learning_rate": 9.450025367833588e-06, "loss": 0.0038, "step": 1638 }, { "epoch": 1.4968036529680364, "grad_norm": 53.2421760559082, "learning_rate": 9.449010654490108e-06, "loss": 1.5172, "step": 1639 }, { "epoch": 1.4977168949771689, "grad_norm": 58.49087905883789, "learning_rate": 9.447995941146627e-06, "loss": 0.5895, "step": 1640 }, { "epoch": 1.4986301369863013, "grad_norm": 66.22061920166016, "learning_rate": 9.446981227803146e-06, "loss": 3.1682, "step": 1641 }, { "epoch": 1.4995433789954338, "grad_norm": 0.31840917468070984, "learning_rate": 9.445966514459666e-06, "loss": 0.0029, "step": 1642 }, { "epoch": 1.5004566210045662, "grad_norm": 40.277462005615234, "learning_rate": 9.444951801116185e-06, "loss": 1.2193, "step": 1643 }, { "epoch": 1.5013698630136987, "grad_norm": 18.676170349121094, "learning_rate": 9.443937087772706e-06, "loss": 0.1394, "step": 1644 }, { "epoch": 1.5022831050228311, "grad_norm": 65.12431335449219, "learning_rate": 9.442922374429225e-06, "loss": 1.5533, "step": 1645 }, { "epoch": 1.5031963470319636, "grad_norm": 21.80479621887207, "learning_rate": 9.441907661085743e-06, "loss": 0.2172, "step": 1646 }, { "epoch": 1.504109589041096, "grad_norm": 68.75554656982422, "learning_rate": 9.440892947742264e-06, "loss": 2.0709, "step": 1647 }, { "epoch": 1.5050228310502283, "grad_norm": 36.85853958129883, "learning_rate": 9.439878234398783e-06, "loss": 0.5051, "step": 1648 }, { "epoch": 1.5059360730593607, "grad_norm": 78.93167877197266, "learning_rate": 9.438863521055303e-06, "loss": 1.6896, "step": 1649 }, { "epoch": 1.5068493150684932, "grad_norm": 7.157394886016846, "learning_rate": 9.437848807711822e-06, "loss": 0.0756, "step": 1650 }, { "epoch": 1.5077625570776254, "grad_norm": 6.848533630371094, "learning_rate": 9.436834094368341e-06, "loss": 0.0724, "step": 1651 }, { "epoch": 1.5086757990867579, "grad_norm": 0.7932449579238892, "learning_rate": 9.435819381024862e-06, "loss": 0.0082, "step": 1652 }, { "epoch": 1.5095890410958903, "grad_norm": 73.73672485351562, "learning_rate": 9.43480466768138e-06, "loss": 0.5356, "step": 1653 }, { "epoch": 1.5105022831050228, "grad_norm": 85.70979309082031, "learning_rate": 9.433789954337901e-06, "loss": 0.8389, "step": 1654 }, { "epoch": 1.5114155251141552, "grad_norm": 0.6449397802352905, "learning_rate": 9.43277524099442e-06, "loss": 0.0051, "step": 1655 }, { "epoch": 1.5123287671232877, "grad_norm": 4.678313732147217, "learning_rate": 9.431760527650939e-06, "loss": 0.0488, "step": 1656 }, { "epoch": 1.5132420091324201, "grad_norm": 2.237295389175415, "learning_rate": 9.430745814307459e-06, "loss": 0.0261, "step": 1657 }, { "epoch": 1.5141552511415526, "grad_norm": 5.8085036277771, "learning_rate": 9.429731100963978e-06, "loss": 0.0577, "step": 1658 }, { "epoch": 1.515068493150685, "grad_norm": 84.51726531982422, "learning_rate": 9.428716387620499e-06, "loss": 3.0303, "step": 1659 }, { "epoch": 1.5159817351598175, "grad_norm": 4.483886241912842, "learning_rate": 9.427701674277017e-06, "loss": 0.0475, "step": 1660 }, { "epoch": 1.5168949771689497, "grad_norm": 61.96615982055664, "learning_rate": 9.426686960933536e-06, "loss": 1.4547, "step": 1661 }, { "epoch": 1.5178082191780822, "grad_norm": 77.22016906738281, "learning_rate": 9.425672247590057e-06, "loss": 1.3273, "step": 1662 }, { "epoch": 1.5187214611872146, "grad_norm": 72.09870910644531, "learning_rate": 9.424657534246576e-06, "loss": 1.2017, "step": 1663 }, { "epoch": 1.519634703196347, "grad_norm": 1.1992692947387695, "learning_rate": 9.423642820903096e-06, "loss": 0.0118, "step": 1664 }, { "epoch": 1.5205479452054793, "grad_norm": 13.550102233886719, "learning_rate": 9.422628107559615e-06, "loss": 0.1394, "step": 1665 }, { "epoch": 1.5214611872146118, "grad_norm": 21.8265438079834, "learning_rate": 9.421613394216136e-06, "loss": 0.2074, "step": 1666 }, { "epoch": 1.5223744292237442, "grad_norm": 16.794158935546875, "learning_rate": 9.420598680872654e-06, "loss": 0.1519, "step": 1667 }, { "epoch": 1.5232876712328767, "grad_norm": 19.31711196899414, "learning_rate": 9.419583967529173e-06, "loss": 0.1917, "step": 1668 }, { "epoch": 1.5242009132420091, "grad_norm": 30.596538543701172, "learning_rate": 9.418569254185694e-06, "loss": 0.3675, "step": 1669 }, { "epoch": 1.5251141552511416, "grad_norm": 24.64959716796875, "learning_rate": 9.417554540842213e-06, "loss": 0.3328, "step": 1670 }, { "epoch": 1.526027397260274, "grad_norm": 17.835603713989258, "learning_rate": 9.416539827498731e-06, "loss": 0.2978, "step": 1671 }, { "epoch": 1.5269406392694065, "grad_norm": 6.41143798828125, "learning_rate": 9.415525114155252e-06, "loss": 0.0675, "step": 1672 }, { "epoch": 1.527853881278539, "grad_norm": 12.846447944641113, "learning_rate": 9.41451040081177e-06, "loss": 0.1083, "step": 1673 }, { "epoch": 1.5287671232876714, "grad_norm": 7.338199615478516, "learning_rate": 9.413495687468291e-06, "loss": 0.0777, "step": 1674 }, { "epoch": 1.5296803652968036, "grad_norm": 15.604384422302246, "learning_rate": 9.41248097412481e-06, "loss": 0.1907, "step": 1675 }, { "epoch": 1.530593607305936, "grad_norm": 29.077919006347656, "learning_rate": 9.41146626078133e-06, "loss": 0.1703, "step": 1676 }, { "epoch": 1.5315068493150685, "grad_norm": 113.65913391113281, "learning_rate": 9.41045154743785e-06, "loss": 1.2247, "step": 1677 }, { "epoch": 1.5324200913242008, "grad_norm": 20.612377166748047, "learning_rate": 9.409436834094368e-06, "loss": 0.1276, "step": 1678 }, { "epoch": 1.5333333333333332, "grad_norm": 40.47608947753906, "learning_rate": 9.408422120750889e-06, "loss": 0.6325, "step": 1679 }, { "epoch": 1.5342465753424657, "grad_norm": 78.9966812133789, "learning_rate": 9.407407407407408e-06, "loss": 1.7829, "step": 1680 }, { "epoch": 1.5351598173515981, "grad_norm": 1.6894316673278809, "learning_rate": 9.406392694063927e-06, "loss": 0.0145, "step": 1681 }, { "epoch": 1.5360730593607306, "grad_norm": 2.171049118041992, "learning_rate": 9.405377980720447e-06, "loss": 0.0184, "step": 1682 }, { "epoch": 1.536986301369863, "grad_norm": 1.5307786464691162, "learning_rate": 9.404363267376968e-06, "loss": 0.0166, "step": 1683 }, { "epoch": 1.5378995433789955, "grad_norm": 93.18677520751953, "learning_rate": 9.403348554033487e-06, "loss": 1.0945, "step": 1684 }, { "epoch": 1.538812785388128, "grad_norm": 65.91613006591797, "learning_rate": 9.402333840690005e-06, "loss": 1.1147, "step": 1685 }, { "epoch": 1.5397260273972604, "grad_norm": 95.72000122070312, "learning_rate": 9.401319127346526e-06, "loss": 3.049, "step": 1686 }, { "epoch": 1.5406392694063928, "grad_norm": 28.510665893554688, "learning_rate": 9.400304414003045e-06, "loss": 0.2949, "step": 1687 }, { "epoch": 1.541552511415525, "grad_norm": 0.6316760778427124, "learning_rate": 9.399289700659564e-06, "loss": 0.0057, "step": 1688 }, { "epoch": 1.5424657534246575, "grad_norm": 32.58909225463867, "learning_rate": 9.398274987316084e-06, "loss": 0.3892, "step": 1689 }, { "epoch": 1.54337899543379, "grad_norm": 10.530635833740234, "learning_rate": 9.397260273972603e-06, "loss": 0.0752, "step": 1690 }, { "epoch": 1.5442922374429222, "grad_norm": 69.63558197021484, "learning_rate": 9.396245560629122e-06, "loss": 1.9929, "step": 1691 }, { "epoch": 1.5452054794520547, "grad_norm": 37.69184494018555, "learning_rate": 9.395230847285642e-06, "loss": 0.5123, "step": 1692 }, { "epoch": 1.5461187214611871, "grad_norm": 42.29705047607422, "learning_rate": 9.394216133942163e-06, "loss": 0.4093, "step": 1693 }, { "epoch": 1.5470319634703196, "grad_norm": 51.601749420166016, "learning_rate": 9.393201420598682e-06, "loss": 0.75, "step": 1694 }, { "epoch": 1.547945205479452, "grad_norm": 66.9555435180664, "learning_rate": 9.3921867072552e-06, "loss": 0.7971, "step": 1695 }, { "epoch": 1.5488584474885845, "grad_norm": 63.71406555175781, "learning_rate": 9.391171993911721e-06, "loss": 1.4741, "step": 1696 }, { "epoch": 1.549771689497717, "grad_norm": 12.082016944885254, "learning_rate": 9.39015728056824e-06, "loss": 0.1521, "step": 1697 }, { "epoch": 1.5506849315068494, "grad_norm": 19.628549575805664, "learning_rate": 9.389142567224759e-06, "loss": 0.1224, "step": 1698 }, { "epoch": 1.5515981735159818, "grad_norm": 12.197189331054688, "learning_rate": 9.38812785388128e-06, "loss": 0.1298, "step": 1699 }, { "epoch": 1.5525114155251143, "grad_norm": 101.14119720458984, "learning_rate": 9.387113140537798e-06, "loss": 2.7516, "step": 1700 }, { "epoch": 1.5534246575342465, "grad_norm": 61.14510726928711, "learning_rate": 9.386098427194317e-06, "loss": 2.1511, "step": 1701 }, { "epoch": 1.554337899543379, "grad_norm": 34.78244400024414, "learning_rate": 9.385083713850838e-06, "loss": 0.3481, "step": 1702 }, { "epoch": 1.5552511415525114, "grad_norm": 33.63381576538086, "learning_rate": 9.384069000507358e-06, "loss": 0.583, "step": 1703 }, { "epoch": 1.5561643835616439, "grad_norm": 55.66820526123047, "learning_rate": 9.383054287163877e-06, "loss": 0.8714, "step": 1704 }, { "epoch": 1.5570776255707761, "grad_norm": 64.96987915039062, "learning_rate": 9.382039573820396e-06, "loss": 0.5498, "step": 1705 }, { "epoch": 1.5579908675799086, "grad_norm": 75.64623260498047, "learning_rate": 9.381024860476916e-06, "loss": 1.6923, "step": 1706 }, { "epoch": 1.558904109589041, "grad_norm": 35.592952728271484, "learning_rate": 9.380010147133435e-06, "loss": 0.7901, "step": 1707 }, { "epoch": 1.5598173515981735, "grad_norm": 1.1255065202713013, "learning_rate": 9.378995433789954e-06, "loss": 0.0118, "step": 1708 }, { "epoch": 1.560730593607306, "grad_norm": 1.4739474058151245, "learning_rate": 9.377980720446475e-06, "loss": 0.009, "step": 1709 }, { "epoch": 1.5616438356164384, "grad_norm": 4.682794094085693, "learning_rate": 9.376966007102995e-06, "loss": 0.0539, "step": 1710 }, { "epoch": 1.5625570776255708, "grad_norm": 17.36832046508789, "learning_rate": 9.375951293759512e-06, "loss": 0.1862, "step": 1711 }, { "epoch": 1.5634703196347033, "grad_norm": 77.43868255615234, "learning_rate": 9.374936580416033e-06, "loss": 1.4705, "step": 1712 }, { "epoch": 1.5643835616438357, "grad_norm": 57.1617546081543, "learning_rate": 9.373921867072553e-06, "loss": 1.8399, "step": 1713 }, { "epoch": 1.5652968036529682, "grad_norm": 3.3832473754882812, "learning_rate": 9.372907153729072e-06, "loss": 0.03, "step": 1714 }, { "epoch": 1.5662100456621004, "grad_norm": 2.814337730407715, "learning_rate": 9.371892440385591e-06, "loss": 0.03, "step": 1715 }, { "epoch": 1.5671232876712329, "grad_norm": 42.995582580566406, "learning_rate": 9.370877727042112e-06, "loss": 0.7441, "step": 1716 }, { "epoch": 1.5680365296803653, "grad_norm": 52.48352813720703, "learning_rate": 9.36986301369863e-06, "loss": 0.7684, "step": 1717 }, { "epoch": 1.5689497716894976, "grad_norm": 2.5801584720611572, "learning_rate": 9.36884830035515e-06, "loss": 0.0297, "step": 1718 }, { "epoch": 1.56986301369863, "grad_norm": 4.055255889892578, "learning_rate": 9.36783358701167e-06, "loss": 0.0291, "step": 1719 }, { "epoch": 1.5707762557077625, "grad_norm": 98.08202362060547, "learning_rate": 9.36681887366819e-06, "loss": 1.1471, "step": 1720 }, { "epoch": 1.571689497716895, "grad_norm": 40.09896469116211, "learning_rate": 9.36580416032471e-06, "loss": 1.0395, "step": 1721 }, { "epoch": 1.5726027397260274, "grad_norm": 34.517799377441406, "learning_rate": 9.364789446981228e-06, "loss": 0.4182, "step": 1722 }, { "epoch": 1.5735159817351598, "grad_norm": 76.44532012939453, "learning_rate": 9.363774733637749e-06, "loss": 0.9117, "step": 1723 }, { "epoch": 1.5744292237442923, "grad_norm": 206.65391540527344, "learning_rate": 9.362760020294267e-06, "loss": 2.8039, "step": 1724 }, { "epoch": 1.5753424657534247, "grad_norm": 34.48017120361328, "learning_rate": 9.361745306950786e-06, "loss": 0.279, "step": 1725 }, { "epoch": 1.5762557077625572, "grad_norm": 62.63832473754883, "learning_rate": 9.360730593607307e-06, "loss": 1.1299, "step": 1726 }, { "epoch": 1.5771689497716896, "grad_norm": 41.68679428100586, "learning_rate": 9.359715880263827e-06, "loss": 0.5774, "step": 1727 }, { "epoch": 1.5780821917808219, "grad_norm": 44.833866119384766, "learning_rate": 9.358701166920345e-06, "loss": 0.7835, "step": 1728 }, { "epoch": 1.5789954337899543, "grad_norm": 35.84267807006836, "learning_rate": 9.357686453576865e-06, "loss": 0.3435, "step": 1729 }, { "epoch": 1.5799086757990868, "grad_norm": 41.98551559448242, "learning_rate": 9.356671740233386e-06, "loss": 0.3138, "step": 1730 }, { "epoch": 1.580821917808219, "grad_norm": 13.64596176147461, "learning_rate": 9.355657026889904e-06, "loss": 0.1047, "step": 1731 }, { "epoch": 1.5817351598173515, "grad_norm": 6.811862468719482, "learning_rate": 9.354642313546423e-06, "loss": 0.0668, "step": 1732 }, { "epoch": 1.582648401826484, "grad_norm": 95.25035858154297, "learning_rate": 9.353627600202944e-06, "loss": 2.6809, "step": 1733 }, { "epoch": 1.5835616438356164, "grad_norm": 41.56516647338867, "learning_rate": 9.352612886859463e-06, "loss": 0.2723, "step": 1734 }, { "epoch": 1.5844748858447488, "grad_norm": 2.782647132873535, "learning_rate": 9.351598173515982e-06, "loss": 0.0199, "step": 1735 }, { "epoch": 1.5853881278538813, "grad_norm": 12.598660469055176, "learning_rate": 9.350583460172502e-06, "loss": 0.1366, "step": 1736 }, { "epoch": 1.5863013698630137, "grad_norm": 55.3587760925293, "learning_rate": 9.349568746829023e-06, "loss": 0.7277, "step": 1737 }, { "epoch": 1.5872146118721462, "grad_norm": 14.473475456237793, "learning_rate": 9.348554033485541e-06, "loss": 0.1491, "step": 1738 }, { "epoch": 1.5881278538812786, "grad_norm": 13.321983337402344, "learning_rate": 9.34753932014206e-06, "loss": 0.1151, "step": 1739 }, { "epoch": 1.589041095890411, "grad_norm": 7.175190448760986, "learning_rate": 9.346524606798581e-06, "loss": 0.0646, "step": 1740 }, { "epoch": 1.5899543378995433, "grad_norm": 22.476730346679688, "learning_rate": 9.3455098934551e-06, "loss": 0.3169, "step": 1741 }, { "epoch": 1.5908675799086758, "grad_norm": 8.820850372314453, "learning_rate": 9.344495180111619e-06, "loss": 0.0463, "step": 1742 }, { "epoch": 1.5917808219178082, "grad_norm": 39.36225128173828, "learning_rate": 9.343480466768139e-06, "loss": 0.6888, "step": 1743 }, { "epoch": 1.5926940639269407, "grad_norm": 36.69620132446289, "learning_rate": 9.342465753424658e-06, "loss": 0.2718, "step": 1744 }, { "epoch": 1.593607305936073, "grad_norm": 21.037860870361328, "learning_rate": 9.341451040081177e-06, "loss": 0.2584, "step": 1745 }, { "epoch": 1.5945205479452054, "grad_norm": 26.929424285888672, "learning_rate": 9.340436326737697e-06, "loss": 0.2332, "step": 1746 }, { "epoch": 1.5954337899543378, "grad_norm": 3.851919651031494, "learning_rate": 9.339421613394218e-06, "loss": 0.0471, "step": 1747 }, { "epoch": 1.5963470319634703, "grad_norm": 29.17400360107422, "learning_rate": 9.338406900050737e-06, "loss": 0.1669, "step": 1748 }, { "epoch": 1.5972602739726027, "grad_norm": 5.818373203277588, "learning_rate": 9.337392186707256e-06, "loss": 0.034, "step": 1749 }, { "epoch": 1.5981735159817352, "grad_norm": 98.17304229736328, "learning_rate": 9.336377473363776e-06, "loss": 3.1654, "step": 1750 }, { "epoch": 1.5990867579908676, "grad_norm": 49.078895568847656, "learning_rate": 9.335362760020295e-06, "loss": 0.8053, "step": 1751 }, { "epoch": 1.6, "grad_norm": 0.9777005314826965, "learning_rate": 9.334348046676814e-06, "loss": 0.0095, "step": 1752 }, { "epoch": 1.6009132420091325, "grad_norm": 69.71762084960938, "learning_rate": 9.333333333333334e-06, "loss": 0.977, "step": 1753 }, { "epoch": 1.601826484018265, "grad_norm": 26.358436584472656, "learning_rate": 9.332318619989855e-06, "loss": 0.2425, "step": 1754 }, { "epoch": 1.6027397260273972, "grad_norm": 4.6054205894470215, "learning_rate": 9.331303906646374e-06, "loss": 0.043, "step": 1755 }, { "epoch": 1.6036529680365297, "grad_norm": 51.743900299072266, "learning_rate": 9.330289193302893e-06, "loss": 0.5712, "step": 1756 }, { "epoch": 1.6045662100456621, "grad_norm": 86.42259979248047, "learning_rate": 9.329274479959413e-06, "loss": 1.5773, "step": 1757 }, { "epoch": 1.6054794520547944, "grad_norm": 0.7437431216239929, "learning_rate": 9.328259766615932e-06, "loss": 0.007, "step": 1758 }, { "epoch": 1.6063926940639268, "grad_norm": 55.66876983642578, "learning_rate": 9.32724505327245e-06, "loss": 0.6733, "step": 1759 }, { "epoch": 1.6073059360730593, "grad_norm": 19.723712921142578, "learning_rate": 9.326230339928971e-06, "loss": 0.1969, "step": 1760 }, { "epoch": 1.6082191780821917, "grad_norm": 2.9923746585845947, "learning_rate": 9.32521562658549e-06, "loss": 0.0277, "step": 1761 }, { "epoch": 1.6091324200913242, "grad_norm": 2.5568714141845703, "learning_rate": 9.324200913242009e-06, "loss": 0.0203, "step": 1762 }, { "epoch": 1.6100456621004566, "grad_norm": 1.4901961088180542, "learning_rate": 9.32318619989853e-06, "loss": 0.0139, "step": 1763 }, { "epoch": 1.610958904109589, "grad_norm": 28.67121696472168, "learning_rate": 9.32217148655505e-06, "loss": 0.3437, "step": 1764 }, { "epoch": 1.6118721461187215, "grad_norm": 26.04372215270996, "learning_rate": 9.321156773211569e-06, "loss": 0.4084, "step": 1765 }, { "epoch": 1.612785388127854, "grad_norm": 54.6910285949707, "learning_rate": 9.320142059868088e-06, "loss": 0.3663, "step": 1766 }, { "epoch": 1.6136986301369864, "grad_norm": 11.13100528717041, "learning_rate": 9.319127346524608e-06, "loss": 0.115, "step": 1767 }, { "epoch": 1.6146118721461187, "grad_norm": 60.05731201171875, "learning_rate": 9.318112633181127e-06, "loss": 0.6433, "step": 1768 }, { "epoch": 1.6155251141552511, "grad_norm": 9.003053665161133, "learning_rate": 9.317097919837646e-06, "loss": 0.0791, "step": 1769 }, { "epoch": 1.6164383561643836, "grad_norm": 2.295954465866089, "learning_rate": 9.316083206494167e-06, "loss": 0.0244, "step": 1770 }, { "epoch": 1.617351598173516, "grad_norm": 41.701255798339844, "learning_rate": 9.315068493150685e-06, "loss": 0.5243, "step": 1771 }, { "epoch": 1.6182648401826483, "grad_norm": 1.4190937280654907, "learning_rate": 9.314053779807204e-06, "loss": 0.0141, "step": 1772 }, { "epoch": 1.6191780821917807, "grad_norm": 1.7445731163024902, "learning_rate": 9.313039066463725e-06, "loss": 0.0152, "step": 1773 }, { "epoch": 1.6200913242009132, "grad_norm": 0.3723667860031128, "learning_rate": 9.312024353120245e-06, "loss": 0.0025, "step": 1774 }, { "epoch": 1.6210045662100456, "grad_norm": 1.0183368921279907, "learning_rate": 9.311009639776764e-06, "loss": 0.0087, "step": 1775 }, { "epoch": 1.621917808219178, "grad_norm": 19.137887954711914, "learning_rate": 9.309994926433283e-06, "loss": 0.2044, "step": 1776 }, { "epoch": 1.6228310502283105, "grad_norm": 41.09137725830078, "learning_rate": 9.308980213089804e-06, "loss": 0.7001, "step": 1777 }, { "epoch": 1.623744292237443, "grad_norm": 18.5150146484375, "learning_rate": 9.307965499746322e-06, "loss": 0.2104, "step": 1778 }, { "epoch": 1.6246575342465754, "grad_norm": 17.84613609313965, "learning_rate": 9.306950786402841e-06, "loss": 0.1916, "step": 1779 }, { "epoch": 1.625570776255708, "grad_norm": 100.67161560058594, "learning_rate": 9.305936073059362e-06, "loss": 3.5933, "step": 1780 }, { "epoch": 1.6264840182648403, "grad_norm": 11.31849193572998, "learning_rate": 9.30492135971588e-06, "loss": 0.1011, "step": 1781 }, { "epoch": 1.6273972602739726, "grad_norm": 23.973161697387695, "learning_rate": 9.303906646372401e-06, "loss": 0.3219, "step": 1782 }, { "epoch": 1.628310502283105, "grad_norm": 0.8255593776702881, "learning_rate": 9.30289193302892e-06, "loss": 0.0064, "step": 1783 }, { "epoch": 1.6292237442922375, "grad_norm": 60.44802474975586, "learning_rate": 9.30187721968544e-06, "loss": 1.1647, "step": 1784 }, { "epoch": 1.6301369863013697, "grad_norm": 26.541452407836914, "learning_rate": 9.30086250634196e-06, "loss": 0.2243, "step": 1785 }, { "epoch": 1.6310502283105022, "grad_norm": 35.50346374511719, "learning_rate": 9.299847792998478e-06, "loss": 0.4309, "step": 1786 }, { "epoch": 1.6319634703196346, "grad_norm": 23.611053466796875, "learning_rate": 9.298833079654999e-06, "loss": 0.2733, "step": 1787 }, { "epoch": 1.632876712328767, "grad_norm": 4.081919193267822, "learning_rate": 9.297818366311518e-06, "loss": 0.0395, "step": 1788 }, { "epoch": 1.6337899543378995, "grad_norm": 62.454132080078125, "learning_rate": 9.296803652968036e-06, "loss": 2.5142, "step": 1789 }, { "epoch": 1.634703196347032, "grad_norm": 0.5355783104896545, "learning_rate": 9.295788939624557e-06, "loss": 0.0054, "step": 1790 }, { "epoch": 1.6356164383561644, "grad_norm": 16.853822708129883, "learning_rate": 9.294774226281076e-06, "loss": 0.1075, "step": 1791 }, { "epoch": 1.636529680365297, "grad_norm": 3.852691888809204, "learning_rate": 9.293759512937596e-06, "loss": 0.0428, "step": 1792 }, { "epoch": 1.6374429223744293, "grad_norm": 37.94844055175781, "learning_rate": 9.292744799594115e-06, "loss": 0.3388, "step": 1793 }, { "epoch": 1.6383561643835618, "grad_norm": 404.57745361328125, "learning_rate": 9.291730086250636e-06, "loss": 1.6808, "step": 1794 }, { "epoch": 1.639269406392694, "grad_norm": 55.38078308105469, "learning_rate": 9.290715372907155e-06, "loss": 1.2691, "step": 1795 }, { "epoch": 1.6401826484018265, "grad_norm": 50.291786193847656, "learning_rate": 9.289700659563673e-06, "loss": 0.8189, "step": 1796 }, { "epoch": 1.641095890410959, "grad_norm": 2.058061122894287, "learning_rate": 9.288685946220194e-06, "loss": 0.0145, "step": 1797 }, { "epoch": 1.6420091324200912, "grad_norm": 9.684267044067383, "learning_rate": 9.287671232876713e-06, "loss": 0.0791, "step": 1798 }, { "epoch": 1.6429223744292236, "grad_norm": 48.10397720336914, "learning_rate": 9.286656519533233e-06, "loss": 0.6078, "step": 1799 }, { "epoch": 1.643835616438356, "grad_norm": 2.670443296432495, "learning_rate": 9.285641806189752e-06, "loss": 0.0275, "step": 1800 }, { "epoch": 1.6447488584474885, "grad_norm": 20.268016815185547, "learning_rate": 9.284627092846271e-06, "loss": 0.2134, "step": 1801 }, { "epoch": 1.645662100456621, "grad_norm": 26.293569564819336, "learning_rate": 9.283612379502792e-06, "loss": 0.3027, "step": 1802 }, { "epoch": 1.6465753424657534, "grad_norm": 14.600412368774414, "learning_rate": 9.28259766615931e-06, "loss": 0.095, "step": 1803 }, { "epoch": 1.6474885844748859, "grad_norm": 18.53495979309082, "learning_rate": 9.281582952815831e-06, "loss": 0.2048, "step": 1804 }, { "epoch": 1.6484018264840183, "grad_norm": 28.780906677246094, "learning_rate": 9.28056823947235e-06, "loss": 0.4875, "step": 1805 }, { "epoch": 1.6493150684931508, "grad_norm": 10.304615020751953, "learning_rate": 9.279553526128869e-06, "loss": 0.1126, "step": 1806 }, { "epoch": 1.6502283105022832, "grad_norm": 85.30982971191406, "learning_rate": 9.27853881278539e-06, "loss": 0.9784, "step": 1807 }, { "epoch": 1.6511415525114155, "grad_norm": 11.46893310546875, "learning_rate": 9.277524099441908e-06, "loss": 0.1183, "step": 1808 }, { "epoch": 1.652054794520548, "grad_norm": 5.775902271270752, "learning_rate": 9.276509386098429e-06, "loss": 0.0593, "step": 1809 }, { "epoch": 1.6529680365296804, "grad_norm": 46.86652374267578, "learning_rate": 9.275494672754947e-06, "loss": 1.0033, "step": 1810 }, { "epoch": 1.6538812785388128, "grad_norm": 82.04986572265625, "learning_rate": 9.274479959411466e-06, "loss": 1.8387, "step": 1811 }, { "epoch": 1.654794520547945, "grad_norm": 8.578059196472168, "learning_rate": 9.273465246067987e-06, "loss": 0.0794, "step": 1812 }, { "epoch": 1.6557077625570775, "grad_norm": 81.45525360107422, "learning_rate": 9.272450532724506e-06, "loss": 5.5071, "step": 1813 }, { "epoch": 1.65662100456621, "grad_norm": 28.97995948791504, "learning_rate": 9.271435819381026e-06, "loss": 0.2717, "step": 1814 }, { "epoch": 1.6575342465753424, "grad_norm": 783.1406860351562, "learning_rate": 9.270421106037545e-06, "loss": 6.2688, "step": 1815 }, { "epoch": 1.6584474885844749, "grad_norm": 34.59329605102539, "learning_rate": 9.269406392694064e-06, "loss": 0.4164, "step": 1816 }, { "epoch": 1.6593607305936073, "grad_norm": 1.8684055805206299, "learning_rate": 9.268391679350584e-06, "loss": 0.0168, "step": 1817 }, { "epoch": 1.6602739726027398, "grad_norm": 2.4761626720428467, "learning_rate": 9.267376966007103e-06, "loss": 0.0224, "step": 1818 }, { "epoch": 1.6611872146118722, "grad_norm": 73.86084747314453, "learning_rate": 9.266362252663624e-06, "loss": 0.1894, "step": 1819 }, { "epoch": 1.6621004566210047, "grad_norm": 1560.723388671875, "learning_rate": 9.265347539320143e-06, "loss": 3.1573, "step": 1820 }, { "epoch": 1.6630136986301371, "grad_norm": 77.53546142578125, "learning_rate": 9.264332825976662e-06, "loss": 0.6503, "step": 1821 }, { "epoch": 1.6639269406392694, "grad_norm": 44.713375091552734, "learning_rate": 9.263318112633182e-06, "loss": 0.3961, "step": 1822 }, { "epoch": 1.6648401826484018, "grad_norm": 49.047672271728516, "learning_rate": 9.262303399289701e-06, "loss": 0.2275, "step": 1823 }, { "epoch": 1.6657534246575343, "grad_norm": 166.7714080810547, "learning_rate": 9.261288685946221e-06, "loss": 0.8831, "step": 1824 }, { "epoch": 1.6666666666666665, "grad_norm": 237.414794921875, "learning_rate": 9.26027397260274e-06, "loss": 1.095, "step": 1825 }, { "epoch": 1.667579908675799, "grad_norm": 356.0032043457031, "learning_rate": 9.25925925925926e-06, "loss": 2.6537, "step": 1826 }, { "epoch": 1.6684931506849314, "grad_norm": 84.18987274169922, "learning_rate": 9.25824454591578e-06, "loss": 0.9602, "step": 1827 }, { "epoch": 1.6694063926940639, "grad_norm": 58.92682647705078, "learning_rate": 9.257229832572299e-06, "loss": 0.3032, "step": 1828 }, { "epoch": 1.6703196347031963, "grad_norm": 568.2027587890625, "learning_rate": 9.256215119228819e-06, "loss": 10.1731, "step": 1829 }, { "epoch": 1.6712328767123288, "grad_norm": 250.87376403808594, "learning_rate": 9.255200405885338e-06, "loss": 5.8011, "step": 1830 }, { "epoch": 1.6721461187214612, "grad_norm": 218.23167419433594, "learning_rate": 9.254185692541857e-06, "loss": 6.3268, "step": 1831 }, { "epoch": 1.6730593607305937, "grad_norm": 165.59510803222656, "learning_rate": 9.253170979198377e-06, "loss": 1.5956, "step": 1832 }, { "epoch": 1.6739726027397261, "grad_norm": 83.76448059082031, "learning_rate": 9.252156265854896e-06, "loss": 0.6118, "step": 1833 }, { "epoch": 1.6748858447488586, "grad_norm": 109.41250610351562, "learning_rate": 9.251141552511417e-06, "loss": 1.8506, "step": 1834 }, { "epoch": 1.6757990867579908, "grad_norm": 54.289085388183594, "learning_rate": 9.250126839167936e-06, "loss": 0.4282, "step": 1835 }, { "epoch": 1.6767123287671233, "grad_norm": 92.7637710571289, "learning_rate": 9.249112125824456e-06, "loss": 0.4782, "step": 1836 }, { "epoch": 1.6776255707762557, "grad_norm": 51.34032440185547, "learning_rate": 9.248097412480975e-06, "loss": 0.7539, "step": 1837 }, { "epoch": 1.678538812785388, "grad_norm": 461.9293518066406, "learning_rate": 9.247082699137494e-06, "loss": 3.5281, "step": 1838 }, { "epoch": 1.6794520547945204, "grad_norm": 42.789527893066406, "learning_rate": 9.246067985794014e-06, "loss": 0.1846, "step": 1839 }, { "epoch": 1.6803652968036529, "grad_norm": 20.349037170410156, "learning_rate": 9.245053272450533e-06, "loss": 0.0764, "step": 1840 }, { "epoch": 1.6812785388127853, "grad_norm": 240.30709838867188, "learning_rate": 9.244038559107052e-06, "loss": 2.6771, "step": 1841 }, { "epoch": 1.6821917808219178, "grad_norm": 65.51016998291016, "learning_rate": 9.243023845763573e-06, "loss": 0.7842, "step": 1842 }, { "epoch": 1.6831050228310502, "grad_norm": 69.19259643554688, "learning_rate": 9.242009132420093e-06, "loss": 0.9635, "step": 1843 }, { "epoch": 1.6840182648401827, "grad_norm": 63.454734802246094, "learning_rate": 9.240994419076612e-06, "loss": 0.6843, "step": 1844 }, { "epoch": 1.6849315068493151, "grad_norm": 94.57384490966797, "learning_rate": 9.23997970573313e-06, "loss": 1.7106, "step": 1845 }, { "epoch": 1.6858447488584476, "grad_norm": 91.61274719238281, "learning_rate": 9.238964992389651e-06, "loss": 1.9941, "step": 1846 }, { "epoch": 1.68675799086758, "grad_norm": 186.0961151123047, "learning_rate": 9.23795027904617e-06, "loss": 2.3685, "step": 1847 }, { "epoch": 1.6876712328767123, "grad_norm": 62.2406005859375, "learning_rate": 9.236935565702689e-06, "loss": 1.5317, "step": 1848 }, { "epoch": 1.6885844748858447, "grad_norm": 14.828817367553711, "learning_rate": 9.23592085235921e-06, "loss": 0.1394, "step": 1849 }, { "epoch": 1.6894977168949772, "grad_norm": 38.857730865478516, "learning_rate": 9.234906139015728e-06, "loss": 0.5195, "step": 1850 }, { "epoch": 1.6904109589041096, "grad_norm": 48.4982795715332, "learning_rate": 9.233891425672247e-06, "loss": 0.3598, "step": 1851 }, { "epoch": 1.6913242009132419, "grad_norm": 25.349807739257812, "learning_rate": 9.232876712328768e-06, "loss": 0.3013, "step": 1852 }, { "epoch": 1.6922374429223743, "grad_norm": 76.53164672851562, "learning_rate": 9.231861998985288e-06, "loss": 2.3896, "step": 1853 }, { "epoch": 1.6931506849315068, "grad_norm": 53.86687088012695, "learning_rate": 9.230847285641807e-06, "loss": 1.5833, "step": 1854 }, { "epoch": 1.6940639269406392, "grad_norm": 21.19904136657715, "learning_rate": 9.229832572298326e-06, "loss": 0.1619, "step": 1855 }, { "epoch": 1.6949771689497717, "grad_norm": 65.88201904296875, "learning_rate": 9.228817858954847e-06, "loss": 0.7028, "step": 1856 }, { "epoch": 1.6958904109589041, "grad_norm": 29.788286209106445, "learning_rate": 9.227803145611365e-06, "loss": 0.1619, "step": 1857 }, { "epoch": 1.6968036529680366, "grad_norm": 142.55796813964844, "learning_rate": 9.226788432267884e-06, "loss": 4.3321, "step": 1858 }, { "epoch": 1.697716894977169, "grad_norm": 31.21364402770996, "learning_rate": 9.225773718924405e-06, "loss": 0.38, "step": 1859 }, { "epoch": 1.6986301369863015, "grad_norm": 50.14790725708008, "learning_rate": 9.224759005580924e-06, "loss": 0.5792, "step": 1860 }, { "epoch": 1.699543378995434, "grad_norm": 41.719539642333984, "learning_rate": 9.223744292237442e-06, "loss": 0.2277, "step": 1861 }, { "epoch": 1.7004566210045662, "grad_norm": 42.959476470947266, "learning_rate": 9.222729578893963e-06, "loss": 0.2908, "step": 1862 }, { "epoch": 1.7013698630136986, "grad_norm": 8.114343643188477, "learning_rate": 9.221714865550484e-06, "loss": 0.0776, "step": 1863 }, { "epoch": 1.702283105022831, "grad_norm": 27.67634391784668, "learning_rate": 9.220700152207002e-06, "loss": 0.3946, "step": 1864 }, { "epoch": 1.7031963470319633, "grad_norm": 42.5159912109375, "learning_rate": 9.219685438863521e-06, "loss": 0.7119, "step": 1865 }, { "epoch": 1.7041095890410958, "grad_norm": 1.5835663080215454, "learning_rate": 9.218670725520042e-06, "loss": 0.0148, "step": 1866 }, { "epoch": 1.7050228310502282, "grad_norm": 7.530608177185059, "learning_rate": 9.21765601217656e-06, "loss": 0.0765, "step": 1867 }, { "epoch": 1.7059360730593607, "grad_norm": 15.047529220581055, "learning_rate": 9.21664129883308e-06, "loss": 0.1127, "step": 1868 }, { "epoch": 1.7068493150684931, "grad_norm": 42.243412017822266, "learning_rate": 9.2156265854896e-06, "loss": 0.5812, "step": 1869 }, { "epoch": 1.7077625570776256, "grad_norm": 26.484119415283203, "learning_rate": 9.21461187214612e-06, "loss": 0.2597, "step": 1870 }, { "epoch": 1.708675799086758, "grad_norm": 72.80657958984375, "learning_rate": 9.213597158802638e-06, "loss": 0.8402, "step": 1871 }, { "epoch": 1.7095890410958905, "grad_norm": 15.285073280334473, "learning_rate": 9.212582445459158e-06, "loss": 0.1077, "step": 1872 }, { "epoch": 1.710502283105023, "grad_norm": 6.9834442138671875, "learning_rate": 9.211567732115679e-06, "loss": 0.0578, "step": 1873 }, { "epoch": 1.7114155251141554, "grad_norm": 0.7351130843162537, "learning_rate": 9.210553018772198e-06, "loss": 0.0057, "step": 1874 }, { "epoch": 1.7123287671232876, "grad_norm": 116.81783294677734, "learning_rate": 9.209538305428716e-06, "loss": 3.1785, "step": 1875 }, { "epoch": 1.71324200913242, "grad_norm": 21.254663467407227, "learning_rate": 9.208523592085237e-06, "loss": 0.122, "step": 1876 }, { "epoch": 1.7141552511415525, "grad_norm": 56.767730712890625, "learning_rate": 9.207508878741756e-06, "loss": 0.8627, "step": 1877 }, { "epoch": 1.7150684931506848, "grad_norm": 81.18311309814453, "learning_rate": 9.206494165398275e-06, "loss": 0.9114, "step": 1878 }, { "epoch": 1.7159817351598172, "grad_norm": 126.0943603515625, "learning_rate": 9.205479452054795e-06, "loss": 1.6191, "step": 1879 }, { "epoch": 1.7168949771689497, "grad_norm": 62.02207565307617, "learning_rate": 9.204464738711316e-06, "loss": 1.3372, "step": 1880 }, { "epoch": 1.7178082191780821, "grad_norm": 141.25535583496094, "learning_rate": 9.203450025367835e-06, "loss": 0.6365, "step": 1881 }, { "epoch": 1.7187214611872146, "grad_norm": 34.99830627441406, "learning_rate": 9.202435312024353e-06, "loss": 0.2719, "step": 1882 }, { "epoch": 1.719634703196347, "grad_norm": 16.879953384399414, "learning_rate": 9.201420598680874e-06, "loss": 0.1334, "step": 1883 }, { "epoch": 1.7205479452054795, "grad_norm": 88.07173919677734, "learning_rate": 9.200405885337393e-06, "loss": 2.2456, "step": 1884 }, { "epoch": 1.721461187214612, "grad_norm": 66.4643325805664, "learning_rate": 9.199391171993912e-06, "loss": 0.4112, "step": 1885 }, { "epoch": 1.7223744292237444, "grad_norm": 70.3757095336914, "learning_rate": 9.198376458650432e-06, "loss": 2.4193, "step": 1886 }, { "epoch": 1.7232876712328768, "grad_norm": 21.39786148071289, "learning_rate": 9.197361745306953e-06, "loss": 0.2878, "step": 1887 }, { "epoch": 1.724200913242009, "grad_norm": 19.54376983642578, "learning_rate": 9.19634703196347e-06, "loss": 0.168, "step": 1888 }, { "epoch": 1.7251141552511415, "grad_norm": 45.343170166015625, "learning_rate": 9.19533231861999e-06, "loss": 0.5976, "step": 1889 }, { "epoch": 1.726027397260274, "grad_norm": 28.737350463867188, "learning_rate": 9.194317605276511e-06, "loss": 0.2174, "step": 1890 }, { "epoch": 1.7269406392694064, "grad_norm": 28.650737762451172, "learning_rate": 9.19330289193303e-06, "loss": 0.2825, "step": 1891 }, { "epoch": 1.7278538812785387, "grad_norm": 33.096927642822266, "learning_rate": 9.192288178589549e-06, "loss": 0.2538, "step": 1892 }, { "epoch": 1.7287671232876711, "grad_norm": 17.22200584411621, "learning_rate": 9.19127346524607e-06, "loss": 0.1604, "step": 1893 }, { "epoch": 1.7296803652968036, "grad_norm": 37.8046989440918, "learning_rate": 9.190258751902588e-06, "loss": 0.4941, "step": 1894 }, { "epoch": 1.730593607305936, "grad_norm": 26.19607162475586, "learning_rate": 9.189244038559107e-06, "loss": 0.1929, "step": 1895 }, { "epoch": 1.7315068493150685, "grad_norm": 107.32279968261719, "learning_rate": 9.188229325215627e-06, "loss": 2.7816, "step": 1896 }, { "epoch": 1.732420091324201, "grad_norm": 62.6724853515625, "learning_rate": 9.187214611872148e-06, "loss": 0.4105, "step": 1897 }, { "epoch": 1.7333333333333334, "grad_norm": 15.343182563781738, "learning_rate": 9.186199898528667e-06, "loss": 0.1335, "step": 1898 }, { "epoch": 1.7342465753424658, "grad_norm": 72.21351623535156, "learning_rate": 9.185185185185186e-06, "loss": 2.4417, "step": 1899 }, { "epoch": 1.7351598173515983, "grad_norm": 27.630952835083008, "learning_rate": 9.184170471841706e-06, "loss": 0.4048, "step": 1900 }, { "epoch": 1.7360730593607308, "grad_norm": 2.4508416652679443, "learning_rate": 9.183155758498225e-06, "loss": 0.0232, "step": 1901 }, { "epoch": 1.736986301369863, "grad_norm": 67.03666687011719, "learning_rate": 9.182141045154744e-06, "loss": 1.1351, "step": 1902 }, { "epoch": 1.7378995433789954, "grad_norm": 137.68812561035156, "learning_rate": 9.181126331811264e-06, "loss": 2.5583, "step": 1903 }, { "epoch": 1.738812785388128, "grad_norm": 11.243698120117188, "learning_rate": 9.180111618467783e-06, "loss": 0.1381, "step": 1904 }, { "epoch": 1.7397260273972601, "grad_norm": 53.407100677490234, "learning_rate": 9.179096905124302e-06, "loss": 0.7849, "step": 1905 }, { "epoch": 1.7406392694063926, "grad_norm": 108.97261810302734, "learning_rate": 9.178082191780823e-06, "loss": 1.2735, "step": 1906 }, { "epoch": 1.741552511415525, "grad_norm": 35.150596618652344, "learning_rate": 9.177067478437343e-06, "loss": 0.3389, "step": 1907 }, { "epoch": 1.7424657534246575, "grad_norm": 65.57857513427734, "learning_rate": 9.176052765093862e-06, "loss": 1.6201, "step": 1908 }, { "epoch": 1.74337899543379, "grad_norm": 74.63565826416016, "learning_rate": 9.175038051750381e-06, "loss": 1.5825, "step": 1909 }, { "epoch": 1.7442922374429224, "grad_norm": 40.85036087036133, "learning_rate": 9.174023338406901e-06, "loss": 0.462, "step": 1910 }, { "epoch": 1.7452054794520548, "grad_norm": 33.870094299316406, "learning_rate": 9.17300862506342e-06, "loss": 0.4935, "step": 1911 }, { "epoch": 1.7461187214611873, "grad_norm": 11.17979621887207, "learning_rate": 9.171993911719939e-06, "loss": 0.0619, "step": 1912 }, { "epoch": 1.7470319634703197, "grad_norm": 34.78786849975586, "learning_rate": 9.17097919837646e-06, "loss": 0.3014, "step": 1913 }, { "epoch": 1.7479452054794522, "grad_norm": 10.656474113464355, "learning_rate": 9.16996448503298e-06, "loss": 0.0892, "step": 1914 }, { "epoch": 1.7488584474885844, "grad_norm": 42.0554084777832, "learning_rate": 9.168949771689499e-06, "loss": 0.4908, "step": 1915 }, { "epoch": 1.7497716894977169, "grad_norm": 8.72321891784668, "learning_rate": 9.167935058346018e-06, "loss": 0.086, "step": 1916 }, { "epoch": 1.7506849315068493, "grad_norm": 50.5660285949707, "learning_rate": 9.166920345002538e-06, "loss": 0.5115, "step": 1917 }, { "epoch": 1.7515981735159816, "grad_norm": 11.400227546691895, "learning_rate": 9.165905631659057e-06, "loss": 0.1023, "step": 1918 }, { "epoch": 1.752511415525114, "grad_norm": 10.25698184967041, "learning_rate": 9.164890918315576e-06, "loss": 0.0954, "step": 1919 }, { "epoch": 1.7534246575342465, "grad_norm": 36.68915557861328, "learning_rate": 9.163876204972097e-06, "loss": 0.8182, "step": 1920 }, { "epoch": 1.754337899543379, "grad_norm": 49.26555252075195, "learning_rate": 9.162861491628615e-06, "loss": 0.9273, "step": 1921 }, { "epoch": 1.7552511415525114, "grad_norm": 10.161992073059082, "learning_rate": 9.161846778285134e-06, "loss": 0.1162, "step": 1922 }, { "epoch": 1.7561643835616438, "grad_norm": 22.96100616455078, "learning_rate": 9.160832064941655e-06, "loss": 0.2365, "step": 1923 }, { "epoch": 1.7570776255707763, "grad_norm": 41.11116409301758, "learning_rate": 9.159817351598175e-06, "loss": 0.779, "step": 1924 }, { "epoch": 1.7579908675799087, "grad_norm": 17.087438583374023, "learning_rate": 9.158802638254694e-06, "loss": 0.1612, "step": 1925 }, { "epoch": 1.7589041095890412, "grad_norm": 49.481719970703125, "learning_rate": 9.157787924911213e-06, "loss": 0.7936, "step": 1926 }, { "epoch": 1.7598173515981737, "grad_norm": 32.37746047973633, "learning_rate": 9.156773211567734e-06, "loss": 0.3075, "step": 1927 }, { "epoch": 1.7607305936073059, "grad_norm": 75.52328491210938, "learning_rate": 9.155758498224252e-06, "loss": 1.3685, "step": 1928 }, { "epoch": 1.7616438356164383, "grad_norm": 29.107797622680664, "learning_rate": 9.154743784880771e-06, "loss": 0.3489, "step": 1929 }, { "epoch": 1.7625570776255708, "grad_norm": 51.282588958740234, "learning_rate": 9.153729071537292e-06, "loss": 0.4123, "step": 1930 }, { "epoch": 1.7634703196347032, "grad_norm": 52.18727111816406, "learning_rate": 9.15271435819381e-06, "loss": 0.5185, "step": 1931 }, { "epoch": 1.7643835616438355, "grad_norm": 41.76936340332031, "learning_rate": 9.15169964485033e-06, "loss": 0.4787, "step": 1932 }, { "epoch": 1.765296803652968, "grad_norm": 66.59158325195312, "learning_rate": 9.15068493150685e-06, "loss": 0.9562, "step": 1933 }, { "epoch": 1.7662100456621004, "grad_norm": 12.391529083251953, "learning_rate": 9.14967021816337e-06, "loss": 0.1216, "step": 1934 }, { "epoch": 1.7671232876712328, "grad_norm": 70.15039825439453, "learning_rate": 9.14865550481989e-06, "loss": 0.3611, "step": 1935 }, { "epoch": 1.7680365296803653, "grad_norm": 8.94403076171875, "learning_rate": 9.147640791476408e-06, "loss": 0.0804, "step": 1936 }, { "epoch": 1.7689497716894977, "grad_norm": 72.50226593017578, "learning_rate": 9.146626078132929e-06, "loss": 2.8947, "step": 1937 }, { "epoch": 1.7698630136986302, "grad_norm": 68.8460922241211, "learning_rate": 9.145611364789448e-06, "loss": 1.3243, "step": 1938 }, { "epoch": 1.7707762557077626, "grad_norm": 60.34086990356445, "learning_rate": 9.144596651445967e-06, "loss": 0.8771, "step": 1939 }, { "epoch": 1.771689497716895, "grad_norm": 27.859050750732422, "learning_rate": 9.143581938102487e-06, "loss": 0.2083, "step": 1940 }, { "epoch": 1.7726027397260276, "grad_norm": 53.37602233886719, "learning_rate": 9.142567224759006e-06, "loss": 0.7057, "step": 1941 }, { "epoch": 1.7735159817351598, "grad_norm": 21.419878005981445, "learning_rate": 9.141552511415526e-06, "loss": 0.1846, "step": 1942 }, { "epoch": 1.7744292237442922, "grad_norm": 133.52626037597656, "learning_rate": 9.140537798072045e-06, "loss": 1.6691, "step": 1943 }, { "epoch": 1.7753424657534247, "grad_norm": 2.110581874847412, "learning_rate": 9.139523084728566e-06, "loss": 0.0207, "step": 1944 }, { "epoch": 1.776255707762557, "grad_norm": 5.923244476318359, "learning_rate": 9.138508371385085e-06, "loss": 0.049, "step": 1945 }, { "epoch": 1.7771689497716894, "grad_norm": 70.59261322021484, "learning_rate": 9.137493658041604e-06, "loss": 0.6931, "step": 1946 }, { "epoch": 1.7780821917808218, "grad_norm": 21.090532302856445, "learning_rate": 9.136478944698124e-06, "loss": 0.1353, "step": 1947 }, { "epoch": 1.7789954337899543, "grad_norm": 110.44983673095703, "learning_rate": 9.135464231354643e-06, "loss": 5.2696, "step": 1948 }, { "epoch": 1.7799086757990867, "grad_norm": 14.909265518188477, "learning_rate": 9.134449518011162e-06, "loss": 0.1779, "step": 1949 }, { "epoch": 1.7808219178082192, "grad_norm": 2.753995180130005, "learning_rate": 9.133434804667682e-06, "loss": 0.0267, "step": 1950 }, { "epoch": 1.7817351598173516, "grad_norm": 64.25704193115234, "learning_rate": 9.132420091324201e-06, "loss": 0.8157, "step": 1951 }, { "epoch": 1.782648401826484, "grad_norm": 7.601568222045898, "learning_rate": 9.131405377980722e-06, "loss": 0.0622, "step": 1952 }, { "epoch": 1.7835616438356166, "grad_norm": 7.499879360198975, "learning_rate": 9.13039066463724e-06, "loss": 0.0439, "step": 1953 }, { "epoch": 1.784474885844749, "grad_norm": 1.577714204788208, "learning_rate": 9.129375951293761e-06, "loss": 0.0131, "step": 1954 }, { "epoch": 1.7853881278538812, "grad_norm": 43.30174255371094, "learning_rate": 9.12836123795028e-06, "loss": 0.3139, "step": 1955 }, { "epoch": 1.7863013698630137, "grad_norm": 49.46064376831055, "learning_rate": 9.127346524606799e-06, "loss": 0.2497, "step": 1956 }, { "epoch": 1.7872146118721461, "grad_norm": 39.489192962646484, "learning_rate": 9.12633181126332e-06, "loss": 0.3812, "step": 1957 }, { "epoch": 1.7881278538812784, "grad_norm": 58.880462646484375, "learning_rate": 9.125317097919838e-06, "loss": 0.6966, "step": 1958 }, { "epoch": 1.7890410958904108, "grad_norm": 100.13616943359375, "learning_rate": 9.124302384576359e-06, "loss": 0.3404, "step": 1959 }, { "epoch": 1.7899543378995433, "grad_norm": 30.6699275970459, "learning_rate": 9.123287671232878e-06, "loss": 0.6705, "step": 1960 }, { "epoch": 1.7908675799086757, "grad_norm": 0.3288891911506653, "learning_rate": 9.122272957889396e-06, "loss": 0.0028, "step": 1961 }, { "epoch": 1.7917808219178082, "grad_norm": 21.41047477722168, "learning_rate": 9.121258244545917e-06, "loss": 0.1396, "step": 1962 }, { "epoch": 1.7926940639269406, "grad_norm": 4.837810039520264, "learning_rate": 9.120243531202436e-06, "loss": 0.0251, "step": 1963 }, { "epoch": 1.793607305936073, "grad_norm": 50.24432373046875, "learning_rate": 9.119228817858956e-06, "loss": 0.6402, "step": 1964 }, { "epoch": 1.7945205479452055, "grad_norm": 49.258399963378906, "learning_rate": 9.118214104515475e-06, "loss": 0.4965, "step": 1965 }, { "epoch": 1.795433789954338, "grad_norm": 65.41264343261719, "learning_rate": 9.117199391171994e-06, "loss": 0.5349, "step": 1966 }, { "epoch": 1.7963470319634705, "grad_norm": 100.44697570800781, "learning_rate": 9.116184677828515e-06, "loss": 1.8716, "step": 1967 }, { "epoch": 1.7972602739726027, "grad_norm": 6.1049723625183105, "learning_rate": 9.115169964485033e-06, "loss": 0.0583, "step": 1968 }, { "epoch": 1.7981735159817351, "grad_norm": 66.09585571289062, "learning_rate": 9.114155251141554e-06, "loss": 0.7739, "step": 1969 }, { "epoch": 1.7990867579908676, "grad_norm": 71.42376708984375, "learning_rate": 9.113140537798073e-06, "loss": 1.3133, "step": 1970 }, { "epoch": 1.8, "grad_norm": 5.373698711395264, "learning_rate": 9.112125824454592e-06, "loss": 0.0332, "step": 1971 }, { "epoch": 1.8009132420091323, "grad_norm": 38.9451789855957, "learning_rate": 9.111111111111112e-06, "loss": 0.4412, "step": 1972 }, { "epoch": 1.8018264840182647, "grad_norm": 29.29280662536621, "learning_rate": 9.110096397767631e-06, "loss": 0.3728, "step": 1973 }, { "epoch": 1.8027397260273972, "grad_norm": 15.788755416870117, "learning_rate": 9.109081684424152e-06, "loss": 0.1179, "step": 1974 }, { "epoch": 1.8036529680365296, "grad_norm": 4.584473609924316, "learning_rate": 9.10806697108067e-06, "loss": 0.0419, "step": 1975 }, { "epoch": 1.804566210045662, "grad_norm": 48.57199478149414, "learning_rate": 9.10705225773719e-06, "loss": 0.7127, "step": 1976 }, { "epoch": 1.8054794520547945, "grad_norm": 51.92372131347656, "learning_rate": 9.10603754439371e-06, "loss": 0.8451, "step": 1977 }, { "epoch": 1.806392694063927, "grad_norm": 71.15345001220703, "learning_rate": 9.105022831050229e-06, "loss": 1.8644, "step": 1978 }, { "epoch": 1.8073059360730594, "grad_norm": 45.000797271728516, "learning_rate": 9.10400811770675e-06, "loss": 0.3801, "step": 1979 }, { "epoch": 1.808219178082192, "grad_norm": 7.4897661209106445, "learning_rate": 9.102993404363268e-06, "loss": 0.0757, "step": 1980 }, { "epoch": 1.8091324200913244, "grad_norm": 24.59771156311035, "learning_rate": 9.101978691019787e-06, "loss": 0.2061, "step": 1981 }, { "epoch": 1.8100456621004566, "grad_norm": 113.43928527832031, "learning_rate": 9.100963977676307e-06, "loss": 2.1701, "step": 1982 }, { "epoch": 1.810958904109589, "grad_norm": 100.77299499511719, "learning_rate": 9.099949264332826e-06, "loss": 2.1611, "step": 1983 }, { "epoch": 1.8118721461187215, "grad_norm": 39.850135803222656, "learning_rate": 9.098934550989347e-06, "loss": 0.3508, "step": 1984 }, { "epoch": 1.8127853881278537, "grad_norm": 35.18355178833008, "learning_rate": 9.097919837645866e-06, "loss": 0.391, "step": 1985 }, { "epoch": 1.8136986301369862, "grad_norm": 57.90007781982422, "learning_rate": 9.096905124302386e-06, "loss": 0.5886, "step": 1986 }, { "epoch": 1.8146118721461186, "grad_norm": 0.7002186179161072, "learning_rate": 9.095890410958905e-06, "loss": 0.0043, "step": 1987 }, { "epoch": 1.815525114155251, "grad_norm": 6.402116298675537, "learning_rate": 9.094875697615424e-06, "loss": 0.0324, "step": 1988 }, { "epoch": 1.8164383561643835, "grad_norm": 33.19042205810547, "learning_rate": 9.093860984271944e-06, "loss": 0.3134, "step": 1989 }, { "epoch": 1.817351598173516, "grad_norm": 3.7935919761657715, "learning_rate": 9.092846270928463e-06, "loss": 0.0249, "step": 1990 }, { "epoch": 1.8182648401826484, "grad_norm": 93.76913452148438, "learning_rate": 9.091831557584982e-06, "loss": 1.5681, "step": 1991 }, { "epoch": 1.819178082191781, "grad_norm": 21.046783447265625, "learning_rate": 9.090816844241503e-06, "loss": 0.2076, "step": 1992 }, { "epoch": 1.8200913242009134, "grad_norm": 10.657425880432129, "learning_rate": 9.089802130898021e-06, "loss": 0.0934, "step": 1993 }, { "epoch": 1.8210045662100458, "grad_norm": 26.261058807373047, "learning_rate": 9.088787417554542e-06, "loss": 0.2326, "step": 1994 }, { "epoch": 1.821917808219178, "grad_norm": 27.5227108001709, "learning_rate": 9.08777270421106e-06, "loss": 0.2124, "step": 1995 }, { "epoch": 1.8228310502283105, "grad_norm": 66.00247955322266, "learning_rate": 9.086757990867581e-06, "loss": 0.3211, "step": 1996 }, { "epoch": 1.823744292237443, "grad_norm": 56.07461929321289, "learning_rate": 9.0857432775241e-06, "loss": 0.5281, "step": 1997 }, { "epoch": 1.8246575342465754, "grad_norm": 886.9507446289062, "learning_rate": 9.084728564180619e-06, "loss": 1.7065, "step": 1998 }, { "epoch": 1.8255707762557076, "grad_norm": 18.453020095825195, "learning_rate": 9.08371385083714e-06, "loss": 0.1692, "step": 1999 }, { "epoch": 1.82648401826484, "grad_norm": 43.35569381713867, "learning_rate": 9.082699137493658e-06, "loss": 0.4975, "step": 2000 }, { "epoch": 1.8273972602739725, "grad_norm": 10.078710556030273, "learning_rate": 9.081684424150177e-06, "loss": 0.0805, "step": 2001 }, { "epoch": 1.828310502283105, "grad_norm": 2.1738669872283936, "learning_rate": 9.080669710806698e-06, "loss": 0.0156, "step": 2002 }, { "epoch": 1.8292237442922374, "grad_norm": 7.329278469085693, "learning_rate": 9.079654997463218e-06, "loss": 0.0787, "step": 2003 }, { "epoch": 1.83013698630137, "grad_norm": 99.05628967285156, "learning_rate": 9.078640284119737e-06, "loss": 0.9095, "step": 2004 }, { "epoch": 1.8310502283105023, "grad_norm": 4.969605922698975, "learning_rate": 9.077625570776256e-06, "loss": 0.0449, "step": 2005 }, { "epoch": 1.8319634703196348, "grad_norm": 17.713829040527344, "learning_rate": 9.076610857432777e-06, "loss": 0.072, "step": 2006 }, { "epoch": 1.8328767123287673, "grad_norm": 34.96578598022461, "learning_rate": 9.075596144089295e-06, "loss": 0.6289, "step": 2007 }, { "epoch": 1.8337899543378997, "grad_norm": 11.026759147644043, "learning_rate": 9.074581430745814e-06, "loss": 0.1132, "step": 2008 }, { "epoch": 1.834703196347032, "grad_norm": 18.049636840820312, "learning_rate": 9.073566717402335e-06, "loss": 0.206, "step": 2009 }, { "epoch": 1.8356164383561644, "grad_norm": 9.03206729888916, "learning_rate": 9.072552004058854e-06, "loss": 0.0903, "step": 2010 }, { "epoch": 1.8365296803652968, "grad_norm": 127.9749984741211, "learning_rate": 9.071537290715373e-06, "loss": 1.0422, "step": 2011 }, { "epoch": 1.837442922374429, "grad_norm": 88.6496353149414, "learning_rate": 9.070522577371893e-06, "loss": 2.4762, "step": 2012 }, { "epoch": 1.8383561643835615, "grad_norm": 0.7034335136413574, "learning_rate": 9.069507864028414e-06, "loss": 0.0052, "step": 2013 }, { "epoch": 1.839269406392694, "grad_norm": 45.914642333984375, "learning_rate": 9.068493150684932e-06, "loss": 0.525, "step": 2014 }, { "epoch": 1.8401826484018264, "grad_norm": 40.32078552246094, "learning_rate": 9.067478437341451e-06, "loss": 0.3675, "step": 2015 }, { "epoch": 1.841095890410959, "grad_norm": 61.60209274291992, "learning_rate": 9.066463723997972e-06, "loss": 0.5103, "step": 2016 }, { "epoch": 1.8420091324200913, "grad_norm": 38.651485443115234, "learning_rate": 9.06544901065449e-06, "loss": 0.5785, "step": 2017 }, { "epoch": 1.8429223744292238, "grad_norm": 40.590065002441406, "learning_rate": 9.06443429731101e-06, "loss": 0.4382, "step": 2018 }, { "epoch": 1.8438356164383563, "grad_norm": 5.962835311889648, "learning_rate": 9.06341958396753e-06, "loss": 0.0531, "step": 2019 }, { "epoch": 1.8447488584474887, "grad_norm": 19.962556838989258, "learning_rate": 9.062404870624049e-06, "loss": 0.1698, "step": 2020 }, { "epoch": 1.8456621004566212, "grad_norm": 112.81644439697266, "learning_rate": 9.061390157280568e-06, "loss": 1.1885, "step": 2021 }, { "epoch": 1.8465753424657534, "grad_norm": 114.34662628173828, "learning_rate": 9.060375443937088e-06, "loss": 2.0947, "step": 2022 }, { "epoch": 1.8474885844748858, "grad_norm": 80.37203216552734, "learning_rate": 9.059360730593609e-06, "loss": 2.893, "step": 2023 }, { "epoch": 1.8484018264840183, "grad_norm": 97.98184204101562, "learning_rate": 9.058346017250128e-06, "loss": 2.628, "step": 2024 }, { "epoch": 1.8493150684931505, "grad_norm": 3.3681082725524902, "learning_rate": 9.057331303906647e-06, "loss": 0.0226, "step": 2025 }, { "epoch": 1.850228310502283, "grad_norm": 68.62054443359375, "learning_rate": 9.056316590563167e-06, "loss": 0.6859, "step": 2026 }, { "epoch": 1.8511415525114154, "grad_norm": 134.1426544189453, "learning_rate": 9.055301877219686e-06, "loss": 1.282, "step": 2027 }, { "epoch": 1.8520547945205479, "grad_norm": 123.05347442626953, "learning_rate": 9.054287163876205e-06, "loss": 2.5038, "step": 2028 }, { "epoch": 1.8529680365296803, "grad_norm": 34.67249298095703, "learning_rate": 9.053272450532725e-06, "loss": 0.2775, "step": 2029 }, { "epoch": 1.8538812785388128, "grad_norm": 86.56396484375, "learning_rate": 9.052257737189246e-06, "loss": 2.1346, "step": 2030 }, { "epoch": 1.8547945205479452, "grad_norm": 53.78947067260742, "learning_rate": 9.051243023845763e-06, "loss": 0.6933, "step": 2031 }, { "epoch": 1.8557077625570777, "grad_norm": 43.93201446533203, "learning_rate": 9.050228310502284e-06, "loss": 0.2618, "step": 2032 }, { "epoch": 1.8566210045662102, "grad_norm": 35.4183235168457, "learning_rate": 9.049213597158804e-06, "loss": 0.0934, "step": 2033 }, { "epoch": 1.8575342465753426, "grad_norm": 64.22933197021484, "learning_rate": 9.048198883815323e-06, "loss": 0.7015, "step": 2034 }, { "epoch": 1.8584474885844748, "grad_norm": 92.60957336425781, "learning_rate": 9.047184170471842e-06, "loss": 2.313, "step": 2035 }, { "epoch": 1.8593607305936073, "grad_norm": 63.098331451416016, "learning_rate": 9.046169457128362e-06, "loss": 0.7759, "step": 2036 }, { "epoch": 1.8602739726027397, "grad_norm": 125.38679504394531, "learning_rate": 9.045154743784881e-06, "loss": 5.3305, "step": 2037 }, { "epoch": 1.8611872146118722, "grad_norm": 60.244163513183594, "learning_rate": 9.0441400304414e-06, "loss": 0.6828, "step": 2038 }, { "epoch": 1.8621004566210044, "grad_norm": 30.633689880371094, "learning_rate": 9.04312531709792e-06, "loss": 0.2683, "step": 2039 }, { "epoch": 1.8630136986301369, "grad_norm": 38.5473518371582, "learning_rate": 9.042110603754441e-06, "loss": 0.2671, "step": 2040 }, { "epoch": 1.8639269406392693, "grad_norm": 40.69294357299805, "learning_rate": 9.04109589041096e-06, "loss": 0.4167, "step": 2041 }, { "epoch": 1.8648401826484018, "grad_norm": 76.73490142822266, "learning_rate": 9.040081177067479e-06, "loss": 1.4737, "step": 2042 }, { "epoch": 1.8657534246575342, "grad_norm": 102.77165985107422, "learning_rate": 9.039066463724e-06, "loss": 4.0337, "step": 2043 }, { "epoch": 1.8666666666666667, "grad_norm": 157.94288635253906, "learning_rate": 9.038051750380518e-06, "loss": 1.0684, "step": 2044 }, { "epoch": 1.8675799086757991, "grad_norm": 58.05318832397461, "learning_rate": 9.037037037037037e-06, "loss": 1.0821, "step": 2045 }, { "epoch": 1.8684931506849316, "grad_norm": 1.5013848543167114, "learning_rate": 9.036022323693558e-06, "loss": 0.0126, "step": 2046 }, { "epoch": 1.869406392694064, "grad_norm": 17.449634552001953, "learning_rate": 9.035007610350078e-06, "loss": 0.1376, "step": 2047 }, { "epoch": 1.8703196347031965, "grad_norm": 3.041198968887329, "learning_rate": 9.033992897006595e-06, "loss": 0.0288, "step": 2048 }, { "epoch": 1.8712328767123287, "grad_norm": 29.298572540283203, "learning_rate": 9.032978183663116e-06, "loss": 0.2559, "step": 2049 }, { "epoch": 1.8721461187214612, "grad_norm": 95.78778839111328, "learning_rate": 9.031963470319636e-06, "loss": 6.0665, "step": 2050 }, { "epoch": 1.8730593607305936, "grad_norm": 64.89616394042969, "learning_rate": 9.030948756976155e-06, "loss": 0.77, "step": 2051 }, { "epoch": 1.8739726027397259, "grad_norm": 70.418212890625, "learning_rate": 9.029934043632674e-06, "loss": 0.7708, "step": 2052 }, { "epoch": 1.8748858447488583, "grad_norm": 4.052464485168457, "learning_rate": 9.028919330289195e-06, "loss": 0.0358, "step": 2053 }, { "epoch": 1.8757990867579908, "grad_norm": 272.1617736816406, "learning_rate": 9.027904616945713e-06, "loss": 1.3087, "step": 2054 }, { "epoch": 1.8767123287671232, "grad_norm": 18.4796199798584, "learning_rate": 9.026889903602232e-06, "loss": 0.2847, "step": 2055 }, { "epoch": 1.8776255707762557, "grad_norm": 49.537025451660156, "learning_rate": 9.025875190258753e-06, "loss": 0.7804, "step": 2056 }, { "epoch": 1.8785388127853881, "grad_norm": 33.27109146118164, "learning_rate": 9.024860476915273e-06, "loss": 0.4035, "step": 2057 }, { "epoch": 1.8794520547945206, "grad_norm": 17.45095443725586, "learning_rate": 9.023845763571792e-06, "loss": 0.2494, "step": 2058 }, { "epoch": 1.880365296803653, "grad_norm": 30.15760612487793, "learning_rate": 9.022831050228311e-06, "loss": 0.2187, "step": 2059 }, { "epoch": 1.8812785388127855, "grad_norm": 45.50331497192383, "learning_rate": 9.021816336884832e-06, "loss": 0.2688, "step": 2060 }, { "epoch": 1.882191780821918, "grad_norm": 10.531085968017578, "learning_rate": 9.02080162354135e-06, "loss": 0.0971, "step": 2061 }, { "epoch": 1.8831050228310502, "grad_norm": 5.816117763519287, "learning_rate": 9.01978691019787e-06, "loss": 0.0456, "step": 2062 }, { "epoch": 1.8840182648401826, "grad_norm": 49.80198669433594, "learning_rate": 9.01877219685439e-06, "loss": 0.6065, "step": 2063 }, { "epoch": 1.884931506849315, "grad_norm": 18.024486541748047, "learning_rate": 9.017757483510909e-06, "loss": 0.1652, "step": 2064 }, { "epoch": 1.8858447488584473, "grad_norm": 86.56661224365234, "learning_rate": 9.016742770167427e-06, "loss": 1.4094, "step": 2065 }, { "epoch": 1.8867579908675798, "grad_norm": 49.43673324584961, "learning_rate": 9.015728056823948e-06, "loss": 0.6257, "step": 2066 }, { "epoch": 1.8876712328767122, "grad_norm": 105.66466522216797, "learning_rate": 9.014713343480469e-06, "loss": 4.6587, "step": 2067 }, { "epoch": 1.8885844748858447, "grad_norm": 61.946067810058594, "learning_rate": 9.013698630136987e-06, "loss": 0.942, "step": 2068 }, { "epoch": 1.8894977168949771, "grad_norm": 50.043792724609375, "learning_rate": 9.012683916793506e-06, "loss": 0.4496, "step": 2069 }, { "epoch": 1.8904109589041096, "grad_norm": 52.21540069580078, "learning_rate": 9.011669203450027e-06, "loss": 1.9089, "step": 2070 }, { "epoch": 1.891324200913242, "grad_norm": 85.8218765258789, "learning_rate": 9.010654490106546e-06, "loss": 1.0763, "step": 2071 }, { "epoch": 1.8922374429223745, "grad_norm": 24.541057586669922, "learning_rate": 9.009639776763064e-06, "loss": 0.1906, "step": 2072 }, { "epoch": 1.893150684931507, "grad_norm": 31.39955711364746, "learning_rate": 9.008625063419585e-06, "loss": 0.3834, "step": 2073 }, { "epoch": 1.8940639269406394, "grad_norm": 40.885833740234375, "learning_rate": 9.007610350076106e-06, "loss": 0.6539, "step": 2074 }, { "epoch": 1.8949771689497716, "grad_norm": 30.76819610595703, "learning_rate": 9.006595636732623e-06, "loss": 0.243, "step": 2075 }, { "epoch": 1.895890410958904, "grad_norm": 35.70730972290039, "learning_rate": 9.005580923389143e-06, "loss": 0.5199, "step": 2076 }, { "epoch": 1.8968036529680365, "grad_norm": 2.171206474304199, "learning_rate": 9.004566210045664e-06, "loss": 0.016, "step": 2077 }, { "epoch": 1.897716894977169, "grad_norm": 63.403282165527344, "learning_rate": 9.003551496702183e-06, "loss": 0.7576, "step": 2078 }, { "epoch": 1.8986301369863012, "grad_norm": 74.31417846679688, "learning_rate": 9.002536783358701e-06, "loss": 1.1051, "step": 2079 }, { "epoch": 1.8995433789954337, "grad_norm": 14.0780668258667, "learning_rate": 9.001522070015222e-06, "loss": 0.1261, "step": 2080 }, { "epoch": 1.9004566210045661, "grad_norm": 3.793886423110962, "learning_rate": 9.00050735667174e-06, "loss": 0.031, "step": 2081 }, { "epoch": 1.9013698630136986, "grad_norm": 139.6371612548828, "learning_rate": 8.99949264332826e-06, "loss": 3.3979, "step": 2082 }, { "epoch": 1.902283105022831, "grad_norm": 61.92314147949219, "learning_rate": 8.99847792998478e-06, "loss": 1.0646, "step": 2083 }, { "epoch": 1.9031963470319635, "grad_norm": 63.21108627319336, "learning_rate": 8.9974632166413e-06, "loss": 0.8266, "step": 2084 }, { "epoch": 1.904109589041096, "grad_norm": 25.9259090423584, "learning_rate": 8.99644850329782e-06, "loss": 0.1624, "step": 2085 }, { "epoch": 1.9050228310502284, "grad_norm": 7.3349928855896, "learning_rate": 8.995433789954338e-06, "loss": 0.0685, "step": 2086 }, { "epoch": 1.9059360730593609, "grad_norm": 107.1559066772461, "learning_rate": 8.994419076610859e-06, "loss": 1.3657, "step": 2087 }, { "epoch": 1.9068493150684933, "grad_norm": 3.2557199001312256, "learning_rate": 8.993404363267378e-06, "loss": 0.0313, "step": 2088 }, { "epoch": 1.9077625570776255, "grad_norm": 29.964149475097656, "learning_rate": 8.992389649923897e-06, "loss": 0.1958, "step": 2089 }, { "epoch": 1.908675799086758, "grad_norm": 151.62435913085938, "learning_rate": 8.991374936580417e-06, "loss": 2.9902, "step": 2090 }, { "epoch": 1.9095890410958904, "grad_norm": 53.52703857421875, "learning_rate": 8.990360223236936e-06, "loss": 1.1366, "step": 2091 }, { "epoch": 1.9105022831050227, "grad_norm": 46.46689987182617, "learning_rate": 8.989345509893455e-06, "loss": 0.7624, "step": 2092 }, { "epoch": 1.9114155251141551, "grad_norm": 14.07602310180664, "learning_rate": 8.988330796549975e-06, "loss": 0.1764, "step": 2093 }, { "epoch": 1.9123287671232876, "grad_norm": 4.9611358642578125, "learning_rate": 8.987316083206496e-06, "loss": 0.0458, "step": 2094 }, { "epoch": 1.91324200913242, "grad_norm": 101.65821075439453, "learning_rate": 8.986301369863015e-06, "loss": 2.2792, "step": 2095 }, { "epoch": 1.9141552511415525, "grad_norm": 13.209863662719727, "learning_rate": 8.985286656519534e-06, "loss": 0.1461, "step": 2096 }, { "epoch": 1.915068493150685, "grad_norm": 73.53060150146484, "learning_rate": 8.984271943176054e-06, "loss": 1.0103, "step": 2097 }, { "epoch": 1.9159817351598174, "grad_norm": 2.1831541061401367, "learning_rate": 8.983257229832573e-06, "loss": 0.014, "step": 2098 }, { "epoch": 1.9168949771689499, "grad_norm": 23.50394058227539, "learning_rate": 8.982242516489092e-06, "loss": 0.2195, "step": 2099 }, { "epoch": 1.9178082191780823, "grad_norm": 28.80841064453125, "learning_rate": 8.981227803145612e-06, "loss": 0.3719, "step": 2100 }, { "epoch": 1.9187214611872148, "grad_norm": 82.91236877441406, "learning_rate": 8.980213089802131e-06, "loss": 1.3559, "step": 2101 }, { "epoch": 1.919634703196347, "grad_norm": 119.52164459228516, "learning_rate": 8.979198376458652e-06, "loss": 4.7369, "step": 2102 }, { "epoch": 1.9205479452054794, "grad_norm": 47.183048248291016, "learning_rate": 8.97818366311517e-06, "loss": 0.3875, "step": 2103 }, { "epoch": 1.921461187214612, "grad_norm": 13.251933097839355, "learning_rate": 8.977168949771691e-06, "loss": 0.117, "step": 2104 }, { "epoch": 1.9223744292237441, "grad_norm": 122.78868865966797, "learning_rate": 8.97615423642821e-06, "loss": 2.0935, "step": 2105 }, { "epoch": 1.9232876712328766, "grad_norm": 28.559097290039062, "learning_rate": 8.975139523084729e-06, "loss": 0.1412, "step": 2106 }, { "epoch": 1.924200913242009, "grad_norm": 27.587587356567383, "learning_rate": 8.97412480974125e-06, "loss": 0.3645, "step": 2107 }, { "epoch": 1.9251141552511415, "grad_norm": 66.5941390991211, "learning_rate": 8.973110096397768e-06, "loss": 1.9978, "step": 2108 }, { "epoch": 1.926027397260274, "grad_norm": 18.3697452545166, "learning_rate": 8.972095383054287e-06, "loss": 0.1852, "step": 2109 }, { "epoch": 1.9269406392694064, "grad_norm": 38.971885681152344, "learning_rate": 8.971080669710808e-06, "loss": 0.4789, "step": 2110 }, { "epoch": 1.9278538812785389, "grad_norm": 35.434715270996094, "learning_rate": 8.970065956367326e-06, "loss": 0.1887, "step": 2111 }, { "epoch": 1.9287671232876713, "grad_norm": 28.313854217529297, "learning_rate": 8.969051243023847e-06, "loss": 0.2349, "step": 2112 }, { "epoch": 1.9296803652968038, "grad_norm": 14.868274688720703, "learning_rate": 8.968036529680366e-06, "loss": 0.1516, "step": 2113 }, { "epoch": 1.9305936073059362, "grad_norm": 45.27061080932617, "learning_rate": 8.967021816336886e-06, "loss": 0.5746, "step": 2114 }, { "epoch": 1.9315068493150684, "grad_norm": 39.812618255615234, "learning_rate": 8.966007102993405e-06, "loss": 0.353, "step": 2115 }, { "epoch": 1.932420091324201, "grad_norm": 0.8794688582420349, "learning_rate": 8.964992389649924e-06, "loss": 0.0082, "step": 2116 }, { "epoch": 1.9333333333333333, "grad_norm": 39.293643951416016, "learning_rate": 8.963977676306445e-06, "loss": 0.3908, "step": 2117 }, { "epoch": 1.9342465753424658, "grad_norm": 4.051852226257324, "learning_rate": 8.962962962962963e-06, "loss": 0.0351, "step": 2118 }, { "epoch": 1.935159817351598, "grad_norm": 1.1762282848358154, "learning_rate": 8.961948249619484e-06, "loss": 0.0125, "step": 2119 }, { "epoch": 1.9360730593607305, "grad_norm": 9.22496509552002, "learning_rate": 8.960933536276003e-06, "loss": 0.0225, "step": 2120 }, { "epoch": 1.936986301369863, "grad_norm": 1.8947044610977173, "learning_rate": 8.959918822932522e-06, "loss": 0.0184, "step": 2121 }, { "epoch": 1.9378995433789954, "grad_norm": 2.82311749458313, "learning_rate": 8.958904109589042e-06, "loss": 0.0161, "step": 2122 }, { "epoch": 1.9388127853881278, "grad_norm": 74.18487548828125, "learning_rate": 8.957889396245561e-06, "loss": 1.5869, "step": 2123 }, { "epoch": 1.9397260273972603, "grad_norm": 35.58678436279297, "learning_rate": 8.956874682902082e-06, "loss": 0.2535, "step": 2124 }, { "epoch": 1.9406392694063928, "grad_norm": 53.88751220703125, "learning_rate": 8.9558599695586e-06, "loss": 0.4907, "step": 2125 }, { "epoch": 1.9415525114155252, "grad_norm": 24.089092254638672, "learning_rate": 8.95484525621512e-06, "loss": 0.1675, "step": 2126 }, { "epoch": 1.9424657534246577, "grad_norm": 44.88922119140625, "learning_rate": 8.95383054287164e-06, "loss": 0.4196, "step": 2127 }, { "epoch": 1.9433789954337901, "grad_norm": 45.2702522277832, "learning_rate": 8.952815829528159e-06, "loss": 0.6048, "step": 2128 }, { "epoch": 1.9442922374429223, "grad_norm": 45.796627044677734, "learning_rate": 8.95180111618468e-06, "loss": 0.2907, "step": 2129 }, { "epoch": 1.9452054794520548, "grad_norm": 45.180946350097656, "learning_rate": 8.950786402841198e-06, "loss": 0.7298, "step": 2130 }, { "epoch": 1.9461187214611873, "grad_norm": 72.19673156738281, "learning_rate": 8.949771689497717e-06, "loss": 0.9667, "step": 2131 }, { "epoch": 1.9470319634703195, "grad_norm": 43.51890563964844, "learning_rate": 8.948756976154237e-06, "loss": 0.6041, "step": 2132 }, { "epoch": 1.947945205479452, "grad_norm": 21.06615447998047, "learning_rate": 8.947742262810756e-06, "loss": 0.1211, "step": 2133 }, { "epoch": 1.9488584474885844, "grad_norm": 53.6763801574707, "learning_rate": 8.946727549467277e-06, "loss": 0.644, "step": 2134 }, { "epoch": 1.9497716894977168, "grad_norm": 58.93217086791992, "learning_rate": 8.945712836123796e-06, "loss": 0.6196, "step": 2135 }, { "epoch": 1.9506849315068493, "grad_norm": 80.7812728881836, "learning_rate": 8.944698122780315e-06, "loss": 1.5533, "step": 2136 }, { "epoch": 1.9515981735159817, "grad_norm": 38.058013916015625, "learning_rate": 8.943683409436835e-06, "loss": 0.5912, "step": 2137 }, { "epoch": 1.9525114155251142, "grad_norm": 51.9195442199707, "learning_rate": 8.942668696093354e-06, "loss": 1.4728, "step": 2138 }, { "epoch": 1.9534246575342467, "grad_norm": 12.99720287322998, "learning_rate": 8.941653982749874e-06, "loss": 0.1485, "step": 2139 }, { "epoch": 1.954337899543379, "grad_norm": 5.239146709442139, "learning_rate": 8.940639269406393e-06, "loss": 0.0382, "step": 2140 }, { "epoch": 1.9552511415525116, "grad_norm": 25.44478416442871, "learning_rate": 8.939624556062912e-06, "loss": 0.2689, "step": 2141 }, { "epoch": 1.9561643835616438, "grad_norm": 7.3026957511901855, "learning_rate": 8.938609842719433e-06, "loss": 0.0648, "step": 2142 }, { "epoch": 1.9570776255707762, "grad_norm": 21.20894432067871, "learning_rate": 8.937595129375952e-06, "loss": 0.2595, "step": 2143 }, { "epoch": 1.9579908675799087, "grad_norm": 8.61834716796875, "learning_rate": 8.936580416032472e-06, "loss": 0.0573, "step": 2144 }, { "epoch": 1.958904109589041, "grad_norm": 20.921550750732422, "learning_rate": 8.935565702688991e-06, "loss": 0.1732, "step": 2145 }, { "epoch": 1.9598173515981734, "grad_norm": 29.81536102294922, "learning_rate": 8.934550989345511e-06, "loss": 0.3624, "step": 2146 }, { "epoch": 1.9607305936073058, "grad_norm": 12.833561897277832, "learning_rate": 8.93353627600203e-06, "loss": 0.1188, "step": 2147 }, { "epoch": 1.9616438356164383, "grad_norm": 9.357948303222656, "learning_rate": 8.93252156265855e-06, "loss": 0.0698, "step": 2148 }, { "epoch": 1.9625570776255707, "grad_norm": 107.42261505126953, "learning_rate": 8.93150684931507e-06, "loss": 3.8369, "step": 2149 }, { "epoch": 1.9634703196347032, "grad_norm": 1.825657606124878, "learning_rate": 8.930492135971589e-06, "loss": 0.0132, "step": 2150 }, { "epoch": 1.9643835616438357, "grad_norm": 5.180931568145752, "learning_rate": 8.929477422628107e-06, "loss": 0.024, "step": 2151 }, { "epoch": 1.965296803652968, "grad_norm": 107.80252838134766, "learning_rate": 8.928462709284628e-06, "loss": 4.6854, "step": 2152 }, { "epoch": 1.9662100456621006, "grad_norm": 12.699347496032715, "learning_rate": 8.927447995941147e-06, "loss": 0.1272, "step": 2153 }, { "epoch": 1.967123287671233, "grad_norm": 96.3140640258789, "learning_rate": 8.926433282597667e-06, "loss": 1.9106, "step": 2154 }, { "epoch": 1.9680365296803652, "grad_norm": 0.8505117893218994, "learning_rate": 8.925418569254186e-06, "loss": 0.0083, "step": 2155 }, { "epoch": 1.9689497716894977, "grad_norm": 12.798935890197754, "learning_rate": 8.924403855910707e-06, "loss": 0.0914, "step": 2156 }, { "epoch": 1.9698630136986301, "grad_norm": 6.1002516746521, "learning_rate": 8.923389142567226e-06, "loss": 0.0351, "step": 2157 }, { "epoch": 1.9707762557077626, "grad_norm": 26.935382843017578, "learning_rate": 8.922374429223744e-06, "loss": 0.1818, "step": 2158 }, { "epoch": 1.9716894977168948, "grad_norm": 3.5023021697998047, "learning_rate": 8.921359715880265e-06, "loss": 0.0302, "step": 2159 }, { "epoch": 1.9726027397260273, "grad_norm": 42.78627395629883, "learning_rate": 8.920345002536784e-06, "loss": 0.3104, "step": 2160 }, { "epoch": 1.9735159817351597, "grad_norm": 75.3594741821289, "learning_rate": 8.919330289193303e-06, "loss": 1.2194, "step": 2161 }, { "epoch": 1.9744292237442922, "grad_norm": 9.41092586517334, "learning_rate": 8.918315575849823e-06, "loss": 0.0747, "step": 2162 }, { "epoch": 1.9753424657534246, "grad_norm": 22.969688415527344, "learning_rate": 8.917300862506344e-06, "loss": 0.2112, "step": 2163 }, { "epoch": 1.976255707762557, "grad_norm": 18.0825138092041, "learning_rate": 8.916286149162863e-06, "loss": 0.2161, "step": 2164 }, { "epoch": 1.9771689497716896, "grad_norm": 2.5777781009674072, "learning_rate": 8.915271435819381e-06, "loss": 0.0145, "step": 2165 }, { "epoch": 1.978082191780822, "grad_norm": 21.040603637695312, "learning_rate": 8.914256722475902e-06, "loss": 0.1958, "step": 2166 }, { "epoch": 1.9789954337899545, "grad_norm": 14.477423667907715, "learning_rate": 8.91324200913242e-06, "loss": 0.0754, "step": 2167 }, { "epoch": 1.979908675799087, "grad_norm": 59.9955940246582, "learning_rate": 8.91222729578894e-06, "loss": 0.3095, "step": 2168 }, { "epoch": 1.9808219178082191, "grad_norm": 9.251941680908203, "learning_rate": 8.91121258244546e-06, "loss": 0.0941, "step": 2169 }, { "epoch": 1.9817351598173516, "grad_norm": 0.9055978059768677, "learning_rate": 8.910197869101979e-06, "loss": 0.0078, "step": 2170 }, { "epoch": 1.982648401826484, "grad_norm": 129.89646911621094, "learning_rate": 8.909183155758498e-06, "loss": 2.399, "step": 2171 }, { "epoch": 1.9835616438356163, "grad_norm": 56.921836853027344, "learning_rate": 8.908168442415018e-06, "loss": 0.5168, "step": 2172 }, { "epoch": 1.9844748858447487, "grad_norm": 92.27546691894531, "learning_rate": 8.907153729071539e-06, "loss": 1.38, "step": 2173 }, { "epoch": 1.9853881278538812, "grad_norm": 5.915555953979492, "learning_rate": 8.906139015728058e-06, "loss": 0.0493, "step": 2174 }, { "epoch": 1.9863013698630136, "grad_norm": 0.783058762550354, "learning_rate": 8.905124302384577e-06, "loss": 0.0055, "step": 2175 }, { "epoch": 1.987214611872146, "grad_norm": 121.16291809082031, "learning_rate": 8.904109589041097e-06, "loss": 4.7634, "step": 2176 }, { "epoch": 1.9881278538812786, "grad_norm": 7.533426284790039, "learning_rate": 8.903094875697616e-06, "loss": 0.0707, "step": 2177 }, { "epoch": 1.989041095890411, "grad_norm": 112.55916595458984, "learning_rate": 8.902080162354135e-06, "loss": 2.9896, "step": 2178 }, { "epoch": 1.9899543378995435, "grad_norm": 94.37251281738281, "learning_rate": 8.901065449010655e-06, "loss": 1.9637, "step": 2179 }, { "epoch": 1.990867579908676, "grad_norm": 58.674407958984375, "learning_rate": 8.900050735667174e-06, "loss": 0.6669, "step": 2180 }, { "epoch": 1.9917808219178084, "grad_norm": 70.93487548828125, "learning_rate": 8.899036022323693e-06, "loss": 0.8337, "step": 2181 }, { "epoch": 1.9926940639269406, "grad_norm": 9.741381645202637, "learning_rate": 8.898021308980214e-06, "loss": 0.1036, "step": 2182 }, { "epoch": 1.993607305936073, "grad_norm": 15.018641471862793, "learning_rate": 8.897006595636734e-06, "loss": 0.1134, "step": 2183 }, { "epoch": 1.9945205479452055, "grad_norm": 0.34646815061569214, "learning_rate": 8.895991882293253e-06, "loss": 0.004, "step": 2184 }, { "epoch": 1.9954337899543377, "grad_norm": 6.360834121704102, "learning_rate": 8.894977168949772e-06, "loss": 0.0679, "step": 2185 }, { "epoch": 1.9963470319634702, "grad_norm": 13.920827865600586, "learning_rate": 8.893962455606292e-06, "loss": 0.0738, "step": 2186 }, { "epoch": 1.9972602739726026, "grad_norm": 70.66716003417969, "learning_rate": 8.892947742262811e-06, "loss": 1.74, "step": 2187 }, { "epoch": 1.998173515981735, "grad_norm": 25.20996856689453, "learning_rate": 8.89193302891933e-06, "loss": 0.1586, "step": 2188 }, { "epoch": 1.9990867579908675, "grad_norm": 68.49055480957031, "learning_rate": 8.89091831557585e-06, "loss": 1.412, "step": 2189 }, { "epoch": 2.0, "grad_norm": 2.319622755050659, "learning_rate": 8.889903602232371e-06, "loss": 0.0191, "step": 2190 }, { "epoch": 2.0009132420091325, "grad_norm": 53.75041580200195, "learning_rate": 8.888888888888888e-06, "loss": 0.6251, "step": 2191 }, { "epoch": 2.001826484018265, "grad_norm": 42.29996871948242, "learning_rate": 8.887874175545409e-06, "loss": 0.5939, "step": 2192 }, { "epoch": 2.0027397260273974, "grad_norm": 1.393972635269165, "learning_rate": 8.88685946220193e-06, "loss": 0.014, "step": 2193 }, { "epoch": 2.00365296803653, "grad_norm": 2.397780656814575, "learning_rate": 8.885844748858448e-06, "loss": 0.0231, "step": 2194 }, { "epoch": 2.0045662100456623, "grad_norm": 1.2833857536315918, "learning_rate": 8.884830035514967e-06, "loss": 0.0089, "step": 2195 }, { "epoch": 2.0054794520547947, "grad_norm": 37.789215087890625, "learning_rate": 8.883815322171488e-06, "loss": 0.3611, "step": 2196 }, { "epoch": 2.0063926940639267, "grad_norm": 65.50315856933594, "learning_rate": 8.882800608828006e-06, "loss": 0.7197, "step": 2197 }, { "epoch": 2.007305936073059, "grad_norm": 42.30763244628906, "learning_rate": 8.881785895484525e-06, "loss": 0.3905, "step": 2198 }, { "epoch": 2.0082191780821916, "grad_norm": 3.307783365249634, "learning_rate": 8.880771182141046e-06, "loss": 0.0349, "step": 2199 }, { "epoch": 2.009132420091324, "grad_norm": 0.6765236854553223, "learning_rate": 8.879756468797566e-06, "loss": 0.006, "step": 2200 }, { "epoch": 2.0100456621004565, "grad_norm": 4.5930986404418945, "learning_rate": 8.878741755454085e-06, "loss": 0.0571, "step": 2201 }, { "epoch": 2.010958904109589, "grad_norm": 13.533751487731934, "learning_rate": 8.877727042110604e-06, "loss": 0.0402, "step": 2202 }, { "epoch": 2.0118721461187214, "grad_norm": 1.290871262550354, "learning_rate": 8.876712328767125e-06, "loss": 0.0129, "step": 2203 }, { "epoch": 2.012785388127854, "grad_norm": 9.713951110839844, "learning_rate": 8.875697615423643e-06, "loss": 0.0792, "step": 2204 }, { "epoch": 2.0136986301369864, "grad_norm": 16.289609909057617, "learning_rate": 8.874682902080162e-06, "loss": 0.1331, "step": 2205 }, { "epoch": 2.014611872146119, "grad_norm": 38.39680099487305, "learning_rate": 8.873668188736683e-06, "loss": 0.2925, "step": 2206 }, { "epoch": 2.0155251141552513, "grad_norm": 24.0375919342041, "learning_rate": 8.872653475393203e-06, "loss": 0.1881, "step": 2207 }, { "epoch": 2.0164383561643837, "grad_norm": 15.00643539428711, "learning_rate": 8.87163876204972e-06, "loss": 0.1526, "step": 2208 }, { "epoch": 2.017351598173516, "grad_norm": 19.935077667236328, "learning_rate": 8.870624048706241e-06, "loss": 0.2112, "step": 2209 }, { "epoch": 2.018264840182648, "grad_norm": 7.908957481384277, "learning_rate": 8.869609335362762e-06, "loss": 0.0787, "step": 2210 }, { "epoch": 2.0191780821917806, "grad_norm": 11.636625289916992, "learning_rate": 8.86859462201928e-06, "loss": 0.0482, "step": 2211 }, { "epoch": 2.020091324200913, "grad_norm": 81.78958892822266, "learning_rate": 8.8675799086758e-06, "loss": 0.5995, "step": 2212 }, { "epoch": 2.0210045662100455, "grad_norm": 7.852067470550537, "learning_rate": 8.86656519533232e-06, "loss": 0.0582, "step": 2213 }, { "epoch": 2.021917808219178, "grad_norm": 7.866483211517334, "learning_rate": 8.865550481988839e-06, "loss": 0.0728, "step": 2214 }, { "epoch": 2.0228310502283104, "grad_norm": 129.35989379882812, "learning_rate": 8.864535768645358e-06, "loss": 3.2827, "step": 2215 }, { "epoch": 2.023744292237443, "grad_norm": 78.2619857788086, "learning_rate": 8.863521055301878e-06, "loss": 1.1207, "step": 2216 }, { "epoch": 2.0246575342465754, "grad_norm": 67.58443450927734, "learning_rate": 8.862506341958399e-06, "loss": 1.3472, "step": 2217 }, { "epoch": 2.025570776255708, "grad_norm": 1.1303250789642334, "learning_rate": 8.861491628614917e-06, "loss": 0.0105, "step": 2218 }, { "epoch": 2.0264840182648403, "grad_norm": 0.9576665163040161, "learning_rate": 8.860476915271436e-06, "loss": 0.0065, "step": 2219 }, { "epoch": 2.0273972602739727, "grad_norm": 3.2147092819213867, "learning_rate": 8.859462201927957e-06, "loss": 0.0192, "step": 2220 }, { "epoch": 2.028310502283105, "grad_norm": 8.509336471557617, "learning_rate": 8.858447488584476e-06, "loss": 0.0979, "step": 2221 }, { "epoch": 2.0292237442922376, "grad_norm": 6.56364631652832, "learning_rate": 8.857432775240995e-06, "loss": 0.0425, "step": 2222 }, { "epoch": 2.03013698630137, "grad_norm": 132.05426025390625, "learning_rate": 8.856418061897515e-06, "loss": 0.469, "step": 2223 }, { "epoch": 2.031050228310502, "grad_norm": 9.26961898803711, "learning_rate": 8.855403348554034e-06, "loss": 0.0515, "step": 2224 }, { "epoch": 2.0319634703196345, "grad_norm": 1.7091401815414429, "learning_rate": 8.854388635210553e-06, "loss": 0.0048, "step": 2225 }, { "epoch": 2.032876712328767, "grad_norm": 13.313980102539062, "learning_rate": 8.853373921867073e-06, "loss": 0.095, "step": 2226 }, { "epoch": 2.0337899543378994, "grad_norm": 77.70919036865234, "learning_rate": 8.852359208523594e-06, "loss": 0.4045, "step": 2227 }, { "epoch": 2.034703196347032, "grad_norm": 10.917083740234375, "learning_rate": 8.851344495180113e-06, "loss": 0.0873, "step": 2228 }, { "epoch": 2.0356164383561643, "grad_norm": 20.6254825592041, "learning_rate": 8.850329781836632e-06, "loss": 0.4911, "step": 2229 }, { "epoch": 2.036529680365297, "grad_norm": 4.360234260559082, "learning_rate": 8.849315068493152e-06, "loss": 0.0262, "step": 2230 }, { "epoch": 2.0374429223744293, "grad_norm": 42.76108169555664, "learning_rate": 8.848300355149671e-06, "loss": 0.3376, "step": 2231 }, { "epoch": 2.0383561643835617, "grad_norm": 5.876658916473389, "learning_rate": 8.84728564180619e-06, "loss": 0.0497, "step": 2232 }, { "epoch": 2.039269406392694, "grad_norm": 45.148597717285156, "learning_rate": 8.84627092846271e-06, "loss": 0.6017, "step": 2233 }, { "epoch": 2.0401826484018266, "grad_norm": 66.85814666748047, "learning_rate": 8.84525621511923e-06, "loss": 1.1923, "step": 2234 }, { "epoch": 2.041095890410959, "grad_norm": 4.188924789428711, "learning_rate": 8.844241501775748e-06, "loss": 0.0362, "step": 2235 }, { "epoch": 2.0420091324200915, "grad_norm": 1.225584626197815, "learning_rate": 8.843226788432269e-06, "loss": 0.0118, "step": 2236 }, { "epoch": 2.0429223744292235, "grad_norm": 31.637392044067383, "learning_rate": 8.842212075088789e-06, "loss": 0.0877, "step": 2237 }, { "epoch": 2.043835616438356, "grad_norm": 90.91632843017578, "learning_rate": 8.841197361745308e-06, "loss": 1.6186, "step": 2238 }, { "epoch": 2.0447488584474884, "grad_norm": 6.086756229400635, "learning_rate": 8.840182648401827e-06, "loss": 0.0662, "step": 2239 }, { "epoch": 2.045662100456621, "grad_norm": 0.9402700066566467, "learning_rate": 8.839167935058347e-06, "loss": 0.0081, "step": 2240 }, { "epoch": 2.0465753424657533, "grad_norm": 7.596081733703613, "learning_rate": 8.838153221714866e-06, "loss": 0.0657, "step": 2241 }, { "epoch": 2.047488584474886, "grad_norm": 0.06576841324567795, "learning_rate": 8.837138508371385e-06, "loss": 0.0004, "step": 2242 }, { "epoch": 2.0484018264840183, "grad_norm": 0.4570503830909729, "learning_rate": 8.836123795027906e-06, "loss": 0.0043, "step": 2243 }, { "epoch": 2.0493150684931507, "grad_norm": 7.612755298614502, "learning_rate": 8.835109081684426e-06, "loss": 0.0579, "step": 2244 }, { "epoch": 2.050228310502283, "grad_norm": 127.57798767089844, "learning_rate": 8.834094368340945e-06, "loss": 0.7568, "step": 2245 }, { "epoch": 2.0511415525114156, "grad_norm": 16.2823486328125, "learning_rate": 8.833079654997464e-06, "loss": 0.139, "step": 2246 }, { "epoch": 2.052054794520548, "grad_norm": 38.66204071044922, "learning_rate": 8.832064941653984e-06, "loss": 0.3998, "step": 2247 }, { "epoch": 2.0529680365296805, "grad_norm": 2.8646035194396973, "learning_rate": 8.831050228310503e-06, "loss": 0.0246, "step": 2248 }, { "epoch": 2.053881278538813, "grad_norm": 9.128894805908203, "learning_rate": 8.830035514967022e-06, "loss": 0.0302, "step": 2249 }, { "epoch": 2.0547945205479454, "grad_norm": 64.38107299804688, "learning_rate": 8.829020801623543e-06, "loss": 0.5217, "step": 2250 }, { "epoch": 2.0557077625570774, "grad_norm": 11.033976554870605, "learning_rate": 8.828006088280061e-06, "loss": 0.1428, "step": 2251 }, { "epoch": 2.05662100456621, "grad_norm": 13.176627159118652, "learning_rate": 8.82699137493658e-06, "loss": 0.0902, "step": 2252 }, { "epoch": 2.0575342465753423, "grad_norm": 2.8041718006134033, "learning_rate": 8.8259766615931e-06, "loss": 0.0172, "step": 2253 }, { "epoch": 2.058447488584475, "grad_norm": 6.440333843231201, "learning_rate": 8.824961948249621e-06, "loss": 0.0399, "step": 2254 }, { "epoch": 2.0593607305936072, "grad_norm": 79.16975402832031, "learning_rate": 8.82394723490614e-06, "loss": 1.3826, "step": 2255 }, { "epoch": 2.0602739726027397, "grad_norm": 1.5098994970321655, "learning_rate": 8.822932521562659e-06, "loss": 0.011, "step": 2256 }, { "epoch": 2.061187214611872, "grad_norm": 2.437436103820801, "learning_rate": 8.82191780821918e-06, "loss": 0.0177, "step": 2257 }, { "epoch": 2.0621004566210046, "grad_norm": 66.6534194946289, "learning_rate": 8.820903094875698e-06, "loss": 0.8509, "step": 2258 }, { "epoch": 2.063013698630137, "grad_norm": 98.36062622070312, "learning_rate": 8.819888381532217e-06, "loss": 2.9562, "step": 2259 }, { "epoch": 2.0639269406392695, "grad_norm": 23.148418426513672, "learning_rate": 8.818873668188738e-06, "loss": 0.2435, "step": 2260 }, { "epoch": 2.064840182648402, "grad_norm": 35.03211975097656, "learning_rate": 8.817858954845257e-06, "loss": 0.4595, "step": 2261 }, { "epoch": 2.0657534246575344, "grad_norm": 1.5684926509857178, "learning_rate": 8.816844241501777e-06, "loss": 0.0091, "step": 2262 }, { "epoch": 2.066666666666667, "grad_norm": 3.0227372646331787, "learning_rate": 8.815829528158296e-06, "loss": 0.0237, "step": 2263 }, { "epoch": 2.067579908675799, "grad_norm": 148.41465759277344, "learning_rate": 8.814814814814817e-06, "loss": 1.7685, "step": 2264 }, { "epoch": 2.0684931506849313, "grad_norm": 29.01453971862793, "learning_rate": 8.813800101471335e-06, "loss": 0.2672, "step": 2265 }, { "epoch": 2.069406392694064, "grad_norm": 20.339080810546875, "learning_rate": 8.812785388127854e-06, "loss": 0.1632, "step": 2266 }, { "epoch": 2.0703196347031962, "grad_norm": 10.346718788146973, "learning_rate": 8.811770674784375e-06, "loss": 0.092, "step": 2267 }, { "epoch": 2.0712328767123287, "grad_norm": 88.18811798095703, "learning_rate": 8.810755961440894e-06, "loss": 1.2395, "step": 2268 }, { "epoch": 2.072146118721461, "grad_norm": 3.7770121097564697, "learning_rate": 8.809741248097412e-06, "loss": 0.0269, "step": 2269 }, { "epoch": 2.0730593607305936, "grad_norm": 0.7464116811752319, "learning_rate": 8.808726534753933e-06, "loss": 0.0055, "step": 2270 }, { "epoch": 2.073972602739726, "grad_norm": 16.065885543823242, "learning_rate": 8.807711821410452e-06, "loss": 0.077, "step": 2271 }, { "epoch": 2.0748858447488585, "grad_norm": 0.9674680829048157, "learning_rate": 8.806697108066972e-06, "loss": 0.0067, "step": 2272 }, { "epoch": 2.075799086757991, "grad_norm": 2.122314453125, "learning_rate": 8.805682394723491e-06, "loss": 0.0234, "step": 2273 }, { "epoch": 2.0767123287671234, "grad_norm": 10.181023597717285, "learning_rate": 8.804667681380012e-06, "loss": 0.0843, "step": 2274 }, { "epoch": 2.077625570776256, "grad_norm": 11.259839057922363, "learning_rate": 8.80365296803653e-06, "loss": 0.087, "step": 2275 }, { "epoch": 2.0785388127853883, "grad_norm": 103.39250946044922, "learning_rate": 8.80263825469305e-06, "loss": 3.3908, "step": 2276 }, { "epoch": 2.0794520547945208, "grad_norm": 20.530115127563477, "learning_rate": 8.80162354134957e-06, "loss": 0.1646, "step": 2277 }, { "epoch": 2.080365296803653, "grad_norm": 380.44244384765625, "learning_rate": 8.800608828006089e-06, "loss": 4.3344, "step": 2278 }, { "epoch": 2.0812785388127852, "grad_norm": 5.479150772094727, "learning_rate": 8.799594114662608e-06, "loss": 0.0351, "step": 2279 }, { "epoch": 2.0821917808219177, "grad_norm": 0.35062605142593384, "learning_rate": 8.798579401319128e-06, "loss": 0.0033, "step": 2280 }, { "epoch": 2.08310502283105, "grad_norm": 12.83180046081543, "learning_rate": 8.797564687975647e-06, "loss": 0.1169, "step": 2281 }, { "epoch": 2.0840182648401826, "grad_norm": 46.2896614074707, "learning_rate": 8.796549974632168e-06, "loss": 0.4104, "step": 2282 }, { "epoch": 2.084931506849315, "grad_norm": 37.01845932006836, "learning_rate": 8.795535261288686e-06, "loss": 0.3865, "step": 2283 }, { "epoch": 2.0858447488584475, "grad_norm": 0.698616087436676, "learning_rate": 8.794520547945207e-06, "loss": 0.0062, "step": 2284 }, { "epoch": 2.08675799086758, "grad_norm": 1.541617751121521, "learning_rate": 8.793505834601726e-06, "loss": 0.0131, "step": 2285 }, { "epoch": 2.0876712328767124, "grad_norm": 28.47223472595215, "learning_rate": 8.792491121258245e-06, "loss": 0.1051, "step": 2286 }, { "epoch": 2.088584474885845, "grad_norm": 83.17826080322266, "learning_rate": 8.791476407914765e-06, "loss": 0.8669, "step": 2287 }, { "epoch": 2.0894977168949773, "grad_norm": 5.999943733215332, "learning_rate": 8.790461694571284e-06, "loss": 0.046, "step": 2288 }, { "epoch": 2.0904109589041098, "grad_norm": 4.725938320159912, "learning_rate": 8.789446981227805e-06, "loss": 0.0399, "step": 2289 }, { "epoch": 2.091324200913242, "grad_norm": 69.20633697509766, "learning_rate": 8.788432267884323e-06, "loss": 0.8283, "step": 2290 }, { "epoch": 2.0922374429223742, "grad_norm": 12.402261734008789, "learning_rate": 8.787417554540842e-06, "loss": 0.0816, "step": 2291 }, { "epoch": 2.0931506849315067, "grad_norm": 17.460695266723633, "learning_rate": 8.786402841197363e-06, "loss": 0.1349, "step": 2292 }, { "epoch": 2.094063926940639, "grad_norm": 10.270130157470703, "learning_rate": 8.785388127853882e-06, "loss": 0.0658, "step": 2293 }, { "epoch": 2.0949771689497716, "grad_norm": 24.938568115234375, "learning_rate": 8.784373414510402e-06, "loss": 0.1886, "step": 2294 }, { "epoch": 2.095890410958904, "grad_norm": 48.318084716796875, "learning_rate": 8.783358701166921e-06, "loss": 0.3607, "step": 2295 }, { "epoch": 2.0968036529680365, "grad_norm": 37.45723342895508, "learning_rate": 8.78234398782344e-06, "loss": 0.3597, "step": 2296 }, { "epoch": 2.097716894977169, "grad_norm": 1.8173013925552368, "learning_rate": 8.78132927447996e-06, "loss": 0.0153, "step": 2297 }, { "epoch": 2.0986301369863014, "grad_norm": 64.9857406616211, "learning_rate": 8.78031456113648e-06, "loss": 0.5146, "step": 2298 }, { "epoch": 2.099543378995434, "grad_norm": 35.07210159301758, "learning_rate": 8.779299847793e-06, "loss": 0.275, "step": 2299 }, { "epoch": 2.1004566210045663, "grad_norm": 4.099186420440674, "learning_rate": 8.778285134449519e-06, "loss": 0.044, "step": 2300 }, { "epoch": 2.1013698630136988, "grad_norm": 36.494468688964844, "learning_rate": 8.777270421106037e-06, "loss": 0.3617, "step": 2301 }, { "epoch": 2.1022831050228312, "grad_norm": 27.099103927612305, "learning_rate": 8.776255707762558e-06, "loss": 0.3181, "step": 2302 }, { "epoch": 2.1031963470319637, "grad_norm": 6.143426895141602, "learning_rate": 8.775240994419077e-06, "loss": 0.0591, "step": 2303 }, { "epoch": 2.1041095890410957, "grad_norm": 3.8425464630126953, "learning_rate": 8.774226281075597e-06, "loss": 0.0292, "step": 2304 }, { "epoch": 2.105022831050228, "grad_norm": 89.19825744628906, "learning_rate": 8.773211567732116e-06, "loss": 0.1807, "step": 2305 }, { "epoch": 2.1059360730593606, "grad_norm": 13.391586303710938, "learning_rate": 8.772196854388637e-06, "loss": 0.0952, "step": 2306 }, { "epoch": 2.106849315068493, "grad_norm": 75.52375030517578, "learning_rate": 8.771182141045156e-06, "loss": 2.101, "step": 2307 }, { "epoch": 2.1077625570776255, "grad_norm": 9.773487091064453, "learning_rate": 8.770167427701674e-06, "loss": 0.0896, "step": 2308 }, { "epoch": 2.108675799086758, "grad_norm": 5.961336612701416, "learning_rate": 8.769152714358195e-06, "loss": 0.0511, "step": 2309 }, { "epoch": 2.1095890410958904, "grad_norm": 5.567636966705322, "learning_rate": 8.768138001014714e-06, "loss": 0.0462, "step": 2310 }, { "epoch": 2.110502283105023, "grad_norm": 33.32500076293945, "learning_rate": 8.767123287671233e-06, "loss": 0.2832, "step": 2311 }, { "epoch": 2.1114155251141553, "grad_norm": 28.578380584716797, "learning_rate": 8.766108574327753e-06, "loss": 0.3058, "step": 2312 }, { "epoch": 2.1123287671232878, "grad_norm": 82.65984344482422, "learning_rate": 8.765093860984272e-06, "loss": 1.7249, "step": 2313 }, { "epoch": 2.11324200913242, "grad_norm": 83.26168823242188, "learning_rate": 8.764079147640793e-06, "loss": 0.8171, "step": 2314 }, { "epoch": 2.1141552511415527, "grad_norm": 2.3421621322631836, "learning_rate": 8.763064434297311e-06, "loss": 0.0195, "step": 2315 }, { "epoch": 2.115068493150685, "grad_norm": 31.663619995117188, "learning_rate": 8.762049720953832e-06, "loss": 0.3319, "step": 2316 }, { "epoch": 2.115981735159817, "grad_norm": 0.4523659348487854, "learning_rate": 8.761035007610351e-06, "loss": 0.0048, "step": 2317 }, { "epoch": 2.1168949771689496, "grad_norm": 7.011453151702881, "learning_rate": 8.76002029426687e-06, "loss": 0.0483, "step": 2318 }, { "epoch": 2.117808219178082, "grad_norm": 52.62917709350586, "learning_rate": 8.75900558092339e-06, "loss": 0.3457, "step": 2319 }, { "epoch": 2.1187214611872145, "grad_norm": 0.4017098546028137, "learning_rate": 8.757990867579909e-06, "loss": 0.003, "step": 2320 }, { "epoch": 2.119634703196347, "grad_norm": 2.903404951095581, "learning_rate": 8.756976154236428e-06, "loss": 0.0302, "step": 2321 }, { "epoch": 2.1205479452054794, "grad_norm": 1.1591991186141968, "learning_rate": 8.755961440892948e-06, "loss": 0.0079, "step": 2322 }, { "epoch": 2.121461187214612, "grad_norm": 60.8623161315918, "learning_rate": 8.754946727549469e-06, "loss": 0.7461, "step": 2323 }, { "epoch": 2.1223744292237443, "grad_norm": 26.32874870300293, "learning_rate": 8.753932014205988e-06, "loss": 0.2056, "step": 2324 }, { "epoch": 2.1232876712328768, "grad_norm": 14.319098472595215, "learning_rate": 8.752917300862507e-06, "loss": 0.0509, "step": 2325 }, { "epoch": 2.124200913242009, "grad_norm": 0.6230924129486084, "learning_rate": 8.751902587519027e-06, "loss": 0.0054, "step": 2326 }, { "epoch": 2.1251141552511417, "grad_norm": 7.061282634735107, "learning_rate": 8.750887874175546e-06, "loss": 0.0409, "step": 2327 }, { "epoch": 2.126027397260274, "grad_norm": 8.876286506652832, "learning_rate": 8.749873160832065e-06, "loss": 0.0839, "step": 2328 }, { "epoch": 2.1269406392694066, "grad_norm": 19.98558807373047, "learning_rate": 8.748858447488585e-06, "loss": 0.1574, "step": 2329 }, { "epoch": 2.127853881278539, "grad_norm": 47.813045501708984, "learning_rate": 8.747843734145104e-06, "loss": 0.4887, "step": 2330 }, { "epoch": 2.128767123287671, "grad_norm": 2.543769598007202, "learning_rate": 8.746829020801623e-06, "loss": 0.012, "step": 2331 }, { "epoch": 2.1296803652968035, "grad_norm": 73.3026351928711, "learning_rate": 8.745814307458144e-06, "loss": 1.2522, "step": 2332 }, { "epoch": 2.130593607305936, "grad_norm": 64.5268325805664, "learning_rate": 8.744799594114664e-06, "loss": 1.1692, "step": 2333 }, { "epoch": 2.1315068493150684, "grad_norm": 1.3635770082473755, "learning_rate": 8.743784880771183e-06, "loss": 0.0116, "step": 2334 }, { "epoch": 2.132420091324201, "grad_norm": 0.6603602170944214, "learning_rate": 8.742770167427702e-06, "loss": 0.0051, "step": 2335 }, { "epoch": 2.1333333333333333, "grad_norm": 7.909700870513916, "learning_rate": 8.741755454084222e-06, "loss": 0.0718, "step": 2336 }, { "epoch": 2.1342465753424658, "grad_norm": 65.05419158935547, "learning_rate": 8.740740740740741e-06, "loss": 0.7452, "step": 2337 }, { "epoch": 2.135159817351598, "grad_norm": 3.2342333793640137, "learning_rate": 8.73972602739726e-06, "loss": 0.0272, "step": 2338 }, { "epoch": 2.1360730593607307, "grad_norm": 17.595882415771484, "learning_rate": 8.73871131405378e-06, "loss": 0.1501, "step": 2339 }, { "epoch": 2.136986301369863, "grad_norm": 30.10987091064453, "learning_rate": 8.7376966007103e-06, "loss": 0.1225, "step": 2340 }, { "epoch": 2.1378995433789956, "grad_norm": 0.6486276388168335, "learning_rate": 8.736681887366818e-06, "loss": 0.0037, "step": 2341 }, { "epoch": 2.138812785388128, "grad_norm": 6.763646602630615, "learning_rate": 8.735667174023339e-06, "loss": 0.0654, "step": 2342 }, { "epoch": 2.1397260273972605, "grad_norm": 68.76266479492188, "learning_rate": 8.73465246067986e-06, "loss": 0.6293, "step": 2343 }, { "epoch": 2.1406392694063925, "grad_norm": 92.92091369628906, "learning_rate": 8.733637747336378e-06, "loss": 4.4805, "step": 2344 }, { "epoch": 2.141552511415525, "grad_norm": 39.71450424194336, "learning_rate": 8.732623033992897e-06, "loss": 0.6198, "step": 2345 }, { "epoch": 2.1424657534246574, "grad_norm": 1.9500880241394043, "learning_rate": 8.731608320649418e-06, "loss": 0.0141, "step": 2346 }, { "epoch": 2.14337899543379, "grad_norm": 20.15951919555664, "learning_rate": 8.730593607305937e-06, "loss": 0.1657, "step": 2347 }, { "epoch": 2.1442922374429223, "grad_norm": 10.137978553771973, "learning_rate": 8.729578893962455e-06, "loss": 0.0618, "step": 2348 }, { "epoch": 2.1452054794520548, "grad_norm": 11.562357902526855, "learning_rate": 8.728564180618976e-06, "loss": 0.0476, "step": 2349 }, { "epoch": 2.146118721461187, "grad_norm": 1.5936073064804077, "learning_rate": 8.727549467275496e-06, "loss": 0.0127, "step": 2350 }, { "epoch": 2.1470319634703197, "grad_norm": 2.0431156158447266, "learning_rate": 8.726534753932014e-06, "loss": 0.0151, "step": 2351 }, { "epoch": 2.147945205479452, "grad_norm": 25.680828094482422, "learning_rate": 8.725520040588534e-06, "loss": 0.2669, "step": 2352 }, { "epoch": 2.1488584474885846, "grad_norm": 82.71753692626953, "learning_rate": 8.724505327245055e-06, "loss": 1.3521, "step": 2353 }, { "epoch": 2.149771689497717, "grad_norm": 0.12569443881511688, "learning_rate": 8.723490613901574e-06, "loss": 0.001, "step": 2354 }, { "epoch": 2.1506849315068495, "grad_norm": 11.27730655670166, "learning_rate": 8.722475900558092e-06, "loss": 0.0949, "step": 2355 }, { "epoch": 2.151598173515982, "grad_norm": 0.8796870708465576, "learning_rate": 8.721461187214613e-06, "loss": 0.0071, "step": 2356 }, { "epoch": 2.1525114155251144, "grad_norm": 7.23440408706665, "learning_rate": 8.720446473871132e-06, "loss": 0.0488, "step": 2357 }, { "epoch": 2.1534246575342464, "grad_norm": 18.87354278564453, "learning_rate": 8.71943176052765e-06, "loss": 0.186, "step": 2358 }, { "epoch": 2.154337899543379, "grad_norm": 25.65412712097168, "learning_rate": 8.718417047184171e-06, "loss": 0.2295, "step": 2359 }, { "epoch": 2.1552511415525113, "grad_norm": 4.458056926727295, "learning_rate": 8.717402333840692e-06, "loss": 0.0421, "step": 2360 }, { "epoch": 2.1561643835616437, "grad_norm": 84.79215240478516, "learning_rate": 8.71638762049721e-06, "loss": 1.5438, "step": 2361 }, { "epoch": 2.157077625570776, "grad_norm": 2.8441741466522217, "learning_rate": 8.71537290715373e-06, "loss": 0.0244, "step": 2362 }, { "epoch": 2.1579908675799087, "grad_norm": 4.641557216644287, "learning_rate": 8.71435819381025e-06, "loss": 0.037, "step": 2363 }, { "epoch": 2.158904109589041, "grad_norm": 3.427715539932251, "learning_rate": 8.713343480466769e-06, "loss": 0.0315, "step": 2364 }, { "epoch": 2.1598173515981736, "grad_norm": 39.472408294677734, "learning_rate": 8.712328767123288e-06, "loss": 0.2528, "step": 2365 }, { "epoch": 2.160730593607306, "grad_norm": 39.55055618286133, "learning_rate": 8.711314053779808e-06, "loss": 0.5322, "step": 2366 }, { "epoch": 2.1616438356164385, "grad_norm": 22.996116638183594, "learning_rate": 8.710299340436329e-06, "loss": 0.1631, "step": 2367 }, { "epoch": 2.162557077625571, "grad_norm": 4.5301513671875, "learning_rate": 8.709284627092846e-06, "loss": 0.0433, "step": 2368 }, { "epoch": 2.1634703196347034, "grad_norm": 2.19413685798645, "learning_rate": 8.708269913749366e-06, "loss": 0.0183, "step": 2369 }, { "epoch": 2.1643835616438354, "grad_norm": 0.6664886474609375, "learning_rate": 8.707255200405887e-06, "loss": 0.0057, "step": 2370 }, { "epoch": 2.165296803652968, "grad_norm": 19.5582332611084, "learning_rate": 8.706240487062406e-06, "loss": 0.1535, "step": 2371 }, { "epoch": 2.1662100456621003, "grad_norm": 91.54218292236328, "learning_rate": 8.705225773718925e-06, "loss": 1.575, "step": 2372 }, { "epoch": 2.1671232876712327, "grad_norm": 1.3542368412017822, "learning_rate": 8.704211060375445e-06, "loss": 0.0083, "step": 2373 }, { "epoch": 2.168036529680365, "grad_norm": 85.09370422363281, "learning_rate": 8.703196347031964e-06, "loss": 1.536, "step": 2374 }, { "epoch": 2.1689497716894977, "grad_norm": 47.32329177856445, "learning_rate": 8.702181633688483e-06, "loss": 0.4121, "step": 2375 }, { "epoch": 2.16986301369863, "grad_norm": 5.756678581237793, "learning_rate": 8.701166920345003e-06, "loss": 0.0507, "step": 2376 }, { "epoch": 2.1707762557077626, "grad_norm": 104.81929779052734, "learning_rate": 8.700152207001524e-06, "loss": 4.6861, "step": 2377 }, { "epoch": 2.171689497716895, "grad_norm": 50.25559616088867, "learning_rate": 8.699137493658043e-06, "loss": 0.6148, "step": 2378 }, { "epoch": 2.1726027397260275, "grad_norm": 0.07933866232633591, "learning_rate": 8.698122780314562e-06, "loss": 0.0005, "step": 2379 }, { "epoch": 2.17351598173516, "grad_norm": 3.3334295749664307, "learning_rate": 8.697108066971082e-06, "loss": 0.0303, "step": 2380 }, { "epoch": 2.1744292237442924, "grad_norm": 3.427678346633911, "learning_rate": 8.696093353627601e-06, "loss": 0.0326, "step": 2381 }, { "epoch": 2.175342465753425, "grad_norm": 18.026668548583984, "learning_rate": 8.69507864028412e-06, "loss": 0.0908, "step": 2382 }, { "epoch": 2.1762557077625573, "grad_norm": 0.38889873027801514, "learning_rate": 8.69406392694064e-06, "loss": 0.0031, "step": 2383 }, { "epoch": 2.1771689497716897, "grad_norm": 60.77109146118164, "learning_rate": 8.69304921359716e-06, "loss": 1.147, "step": 2384 }, { "epoch": 2.1780821917808217, "grad_norm": 70.95074462890625, "learning_rate": 8.692034500253678e-06, "loss": 0.613, "step": 2385 }, { "epoch": 2.178995433789954, "grad_norm": 3.6931586265563965, "learning_rate": 8.691019786910199e-06, "loss": 0.0302, "step": 2386 }, { "epoch": 2.1799086757990866, "grad_norm": 0.7059317827224731, "learning_rate": 8.690005073566719e-06, "loss": 0.0076, "step": 2387 }, { "epoch": 2.180821917808219, "grad_norm": 42.24913024902344, "learning_rate": 8.688990360223238e-06, "loss": 0.4675, "step": 2388 }, { "epoch": 2.1817351598173516, "grad_norm": 3.1929759979248047, "learning_rate": 8.687975646879757e-06, "loss": 0.0263, "step": 2389 }, { "epoch": 2.182648401826484, "grad_norm": 37.83641052246094, "learning_rate": 8.686960933536277e-06, "loss": 0.3962, "step": 2390 }, { "epoch": 2.1835616438356165, "grad_norm": 0.4312041103839874, "learning_rate": 8.685946220192796e-06, "loss": 0.0041, "step": 2391 }, { "epoch": 2.184474885844749, "grad_norm": 2.719486713409424, "learning_rate": 8.684931506849315e-06, "loss": 0.019, "step": 2392 }, { "epoch": 2.1853881278538814, "grad_norm": 7.999295711517334, "learning_rate": 8.683916793505836e-06, "loss": 0.0705, "step": 2393 }, { "epoch": 2.186301369863014, "grad_norm": 15.826421737670898, "learning_rate": 8.682902080162356e-06, "loss": 0.0764, "step": 2394 }, { "epoch": 2.1872146118721463, "grad_norm": 9.037505149841309, "learning_rate": 8.681887366818873e-06, "loss": 0.0358, "step": 2395 }, { "epoch": 2.1881278538812787, "grad_norm": 80.59357452392578, "learning_rate": 8.680872653475394e-06, "loss": 2.0407, "step": 2396 }, { "epoch": 2.1890410958904107, "grad_norm": 52.163841247558594, "learning_rate": 8.679857940131914e-06, "loss": 0.5369, "step": 2397 }, { "epoch": 2.189954337899543, "grad_norm": 8.505536079406738, "learning_rate": 8.678843226788433e-06, "loss": 0.0277, "step": 2398 }, { "epoch": 2.1908675799086756, "grad_norm": 31.609678268432617, "learning_rate": 8.677828513444952e-06, "loss": 0.3083, "step": 2399 }, { "epoch": 2.191780821917808, "grad_norm": 79.28275299072266, "learning_rate": 8.676813800101473e-06, "loss": 1.2141, "step": 2400 }, { "epoch": 2.1926940639269406, "grad_norm": 9.546438217163086, "learning_rate": 8.675799086757991e-06, "loss": 0.1017, "step": 2401 }, { "epoch": 2.193607305936073, "grad_norm": 4.9992194175720215, "learning_rate": 8.67478437341451e-06, "loss": 0.0272, "step": 2402 }, { "epoch": 2.1945205479452055, "grad_norm": 27.268413543701172, "learning_rate": 8.67376966007103e-06, "loss": 0.3037, "step": 2403 }, { "epoch": 2.195433789954338, "grad_norm": 35.00524139404297, "learning_rate": 8.672754946727551e-06, "loss": 0.4146, "step": 2404 }, { "epoch": 2.1963470319634704, "grad_norm": 28.715280532836914, "learning_rate": 8.67174023338407e-06, "loss": 0.2744, "step": 2405 }, { "epoch": 2.197260273972603, "grad_norm": 7.5192131996154785, "learning_rate": 8.670725520040589e-06, "loss": 0.0715, "step": 2406 }, { "epoch": 2.1981735159817353, "grad_norm": 12.799283027648926, "learning_rate": 8.66971080669711e-06, "loss": 0.116, "step": 2407 }, { "epoch": 2.1990867579908677, "grad_norm": 17.58650779724121, "learning_rate": 8.668696093353628e-06, "loss": 0.1319, "step": 2408 }, { "epoch": 2.2, "grad_norm": 1.7044732570648193, "learning_rate": 8.667681380010147e-06, "loss": 0.01, "step": 2409 }, { "epoch": 2.2009132420091326, "grad_norm": 165.29962158203125, "learning_rate": 8.666666666666668e-06, "loss": 1.3498, "step": 2410 }, { "epoch": 2.2018264840182646, "grad_norm": 23.030548095703125, "learning_rate": 8.665651953323187e-06, "loss": 0.2568, "step": 2411 }, { "epoch": 2.202739726027397, "grad_norm": 1.6163567304611206, "learning_rate": 8.664637239979706e-06, "loss": 0.0137, "step": 2412 }, { "epoch": 2.2036529680365295, "grad_norm": 19.261648178100586, "learning_rate": 8.663622526636226e-06, "loss": 0.1887, "step": 2413 }, { "epoch": 2.204566210045662, "grad_norm": 9.481266021728516, "learning_rate": 8.662607813292747e-06, "loss": 0.0837, "step": 2414 }, { "epoch": 2.2054794520547945, "grad_norm": 82.10902404785156, "learning_rate": 8.661593099949265e-06, "loss": 0.5737, "step": 2415 }, { "epoch": 2.206392694063927, "grad_norm": 82.76190185546875, "learning_rate": 8.660578386605784e-06, "loss": 2.2111, "step": 2416 }, { "epoch": 2.2073059360730594, "grad_norm": 16.722604751586914, "learning_rate": 8.659563673262305e-06, "loss": 0.0933, "step": 2417 }, { "epoch": 2.208219178082192, "grad_norm": 16.196727752685547, "learning_rate": 8.658548959918824e-06, "loss": 0.1485, "step": 2418 }, { "epoch": 2.2091324200913243, "grad_norm": 7.364970684051514, "learning_rate": 8.657534246575343e-06, "loss": 0.0526, "step": 2419 }, { "epoch": 2.2100456621004567, "grad_norm": 33.25299072265625, "learning_rate": 8.656519533231863e-06, "loss": 0.1407, "step": 2420 }, { "epoch": 2.210958904109589, "grad_norm": 24.49693489074707, "learning_rate": 8.655504819888382e-06, "loss": 0.2674, "step": 2421 }, { "epoch": 2.2118721461187216, "grad_norm": 3.931703567504883, "learning_rate": 8.654490106544902e-06, "loss": 0.0366, "step": 2422 }, { "epoch": 2.212785388127854, "grad_norm": 773.9075927734375, "learning_rate": 8.653475393201421e-06, "loss": 0.5238, "step": 2423 }, { "epoch": 2.213698630136986, "grad_norm": 0.27111783623695374, "learning_rate": 8.652460679857942e-06, "loss": 0.0027, "step": 2424 }, { "epoch": 2.2146118721461185, "grad_norm": 15.412158966064453, "learning_rate": 8.65144596651446e-06, "loss": 0.1335, "step": 2425 }, { "epoch": 2.215525114155251, "grad_norm": 7.280797958374023, "learning_rate": 8.65043125317098e-06, "loss": 0.071, "step": 2426 }, { "epoch": 2.2164383561643834, "grad_norm": 42.53511428833008, "learning_rate": 8.6494165398275e-06, "loss": 0.803, "step": 2427 }, { "epoch": 2.217351598173516, "grad_norm": 53.86223220825195, "learning_rate": 8.648401826484019e-06, "loss": 0.1413, "step": 2428 }, { "epoch": 2.2182648401826484, "grad_norm": 6.819436073303223, "learning_rate": 8.647387113140538e-06, "loss": 0.0581, "step": 2429 }, { "epoch": 2.219178082191781, "grad_norm": 16.823110580444336, "learning_rate": 8.646372399797058e-06, "loss": 0.0941, "step": 2430 }, { "epoch": 2.2200913242009133, "grad_norm": 26.10361099243164, "learning_rate": 8.645357686453577e-06, "loss": 0.3425, "step": 2431 }, { "epoch": 2.2210045662100457, "grad_norm": 5.350671291351318, "learning_rate": 8.644342973110098e-06, "loss": 0.0364, "step": 2432 }, { "epoch": 2.221917808219178, "grad_norm": 12.738986015319824, "learning_rate": 8.643328259766617e-06, "loss": 0.0883, "step": 2433 }, { "epoch": 2.2228310502283106, "grad_norm": 30.419538497924805, "learning_rate": 8.642313546423137e-06, "loss": 0.1759, "step": 2434 }, { "epoch": 2.223744292237443, "grad_norm": 1.9305768013000488, "learning_rate": 8.641298833079656e-06, "loss": 0.0132, "step": 2435 }, { "epoch": 2.2246575342465755, "grad_norm": 12.909948348999023, "learning_rate": 8.640284119736175e-06, "loss": 0.1325, "step": 2436 }, { "epoch": 2.225570776255708, "grad_norm": 254.49472045898438, "learning_rate": 8.639269406392695e-06, "loss": 0.4271, "step": 2437 }, { "epoch": 2.22648401826484, "grad_norm": 42.746131896972656, "learning_rate": 8.638254693049214e-06, "loss": 0.4614, "step": 2438 }, { "epoch": 2.2273972602739724, "grad_norm": 199.93299865722656, "learning_rate": 8.637239979705733e-06, "loss": 0.4575, "step": 2439 }, { "epoch": 2.228310502283105, "grad_norm": 3.084794044494629, "learning_rate": 8.636225266362254e-06, "loss": 0.0216, "step": 2440 }, { "epoch": 2.2292237442922374, "grad_norm": 5.346216201782227, "learning_rate": 8.635210553018772e-06, "loss": 0.0447, "step": 2441 }, { "epoch": 2.23013698630137, "grad_norm": 4.930089950561523, "learning_rate": 8.634195839675293e-06, "loss": 0.0542, "step": 2442 }, { "epoch": 2.2310502283105023, "grad_norm": 10.712746620178223, "learning_rate": 8.633181126331812e-06, "loss": 0.071, "step": 2443 }, { "epoch": 2.2319634703196347, "grad_norm": 12.585335731506348, "learning_rate": 8.632166412988332e-06, "loss": 0.0745, "step": 2444 }, { "epoch": 2.232876712328767, "grad_norm": 0.7024602890014648, "learning_rate": 8.631151699644851e-06, "loss": 0.0058, "step": 2445 }, { "epoch": 2.2337899543378996, "grad_norm": 0.9726001024246216, "learning_rate": 8.63013698630137e-06, "loss": 0.0085, "step": 2446 }, { "epoch": 2.234703196347032, "grad_norm": 1.854085922241211, "learning_rate": 8.62912227295789e-06, "loss": 0.0168, "step": 2447 }, { "epoch": 2.2356164383561645, "grad_norm": 1.9019869565963745, "learning_rate": 8.62810755961441e-06, "loss": 0.0079, "step": 2448 }, { "epoch": 2.236529680365297, "grad_norm": 23.307764053344727, "learning_rate": 8.62709284627093e-06, "loss": 0.1765, "step": 2449 }, { "epoch": 2.237442922374429, "grad_norm": 60.45918655395508, "learning_rate": 8.626078132927449e-06, "loss": 0.7671, "step": 2450 }, { "epoch": 2.2383561643835614, "grad_norm": 0.9395273327827454, "learning_rate": 8.625063419583968e-06, "loss": 0.0072, "step": 2451 }, { "epoch": 2.239269406392694, "grad_norm": 26.97034454345703, "learning_rate": 8.624048706240488e-06, "loss": 0.3032, "step": 2452 }, { "epoch": 2.2401826484018263, "grad_norm": 0.7236809730529785, "learning_rate": 8.623033992897007e-06, "loss": 0.0059, "step": 2453 }, { "epoch": 2.241095890410959, "grad_norm": 57.639015197753906, "learning_rate": 8.622019279553528e-06, "loss": 0.2818, "step": 2454 }, { "epoch": 2.2420091324200913, "grad_norm": 43.13634490966797, "learning_rate": 8.621004566210046e-06, "loss": 0.3236, "step": 2455 }, { "epoch": 2.2429223744292237, "grad_norm": 0.7453742623329163, "learning_rate": 8.619989852866565e-06, "loss": 0.002, "step": 2456 }, { "epoch": 2.243835616438356, "grad_norm": 70.43594360351562, "learning_rate": 8.618975139523086e-06, "loss": 0.8587, "step": 2457 }, { "epoch": 2.2447488584474886, "grad_norm": 30.73253059387207, "learning_rate": 8.617960426179605e-06, "loss": 0.2588, "step": 2458 }, { "epoch": 2.245662100456621, "grad_norm": 5.765751838684082, "learning_rate": 8.616945712836125e-06, "loss": 0.0287, "step": 2459 }, { "epoch": 2.2465753424657535, "grad_norm": 83.2552490234375, "learning_rate": 8.615930999492644e-06, "loss": 0.9367, "step": 2460 }, { "epoch": 2.247488584474886, "grad_norm": 85.60025787353516, "learning_rate": 8.614916286149163e-06, "loss": 1.3203, "step": 2461 }, { "epoch": 2.2484018264840184, "grad_norm": 9.426651954650879, "learning_rate": 8.613901572805683e-06, "loss": 0.0775, "step": 2462 }, { "epoch": 2.249315068493151, "grad_norm": 29.816129684448242, "learning_rate": 8.612886859462202e-06, "loss": 0.6026, "step": 2463 }, { "epoch": 2.2502283105022833, "grad_norm": 0.9021149277687073, "learning_rate": 8.611872146118723e-06, "loss": 0.0081, "step": 2464 }, { "epoch": 2.2511415525114153, "grad_norm": 2.0306429862976074, "learning_rate": 8.610857432775242e-06, "loss": 0.0187, "step": 2465 }, { "epoch": 2.252054794520548, "grad_norm": 48.08165740966797, "learning_rate": 8.609842719431762e-06, "loss": 0.7583, "step": 2466 }, { "epoch": 2.2529680365296803, "grad_norm": 0.08424849808216095, "learning_rate": 8.608828006088281e-06, "loss": 0.0008, "step": 2467 }, { "epoch": 2.2538812785388127, "grad_norm": 35.65494918823242, "learning_rate": 8.6078132927448e-06, "loss": 0.4643, "step": 2468 }, { "epoch": 2.254794520547945, "grad_norm": 29.339635848999023, "learning_rate": 8.60679857940132e-06, "loss": 0.1883, "step": 2469 }, { "epoch": 2.2557077625570776, "grad_norm": 1.4071487188339233, "learning_rate": 8.60578386605784e-06, "loss": 0.0125, "step": 2470 }, { "epoch": 2.25662100456621, "grad_norm": 14.988067626953125, "learning_rate": 8.604769152714358e-06, "loss": 0.0988, "step": 2471 }, { "epoch": 2.2575342465753425, "grad_norm": 4.3095927238464355, "learning_rate": 8.603754439370879e-06, "loss": 0.0274, "step": 2472 }, { "epoch": 2.258447488584475, "grad_norm": 0.372743159532547, "learning_rate": 8.602739726027397e-06, "loss": 0.0025, "step": 2473 }, { "epoch": 2.2593607305936074, "grad_norm": 16.860912322998047, "learning_rate": 8.601725012683918e-06, "loss": 0.1713, "step": 2474 }, { "epoch": 2.26027397260274, "grad_norm": 24.288524627685547, "learning_rate": 8.600710299340437e-06, "loss": 0.1901, "step": 2475 }, { "epoch": 2.2611872146118723, "grad_norm": 12.981990814208984, "learning_rate": 8.599695585996957e-06, "loss": 0.1152, "step": 2476 }, { "epoch": 2.2621004566210043, "grad_norm": 62.63956832885742, "learning_rate": 8.598680872653476e-06, "loss": 0.7586, "step": 2477 }, { "epoch": 2.263013698630137, "grad_norm": 13.838021278381348, "learning_rate": 8.597666159309995e-06, "loss": 0.145, "step": 2478 }, { "epoch": 2.2639269406392692, "grad_norm": 0.27017173171043396, "learning_rate": 8.596651445966516e-06, "loss": 0.0018, "step": 2479 }, { "epoch": 2.2648401826484017, "grad_norm": 62.18226623535156, "learning_rate": 8.595636732623034e-06, "loss": 0.8361, "step": 2480 }, { "epoch": 2.265753424657534, "grad_norm": 0.6056771278381348, "learning_rate": 8.594622019279553e-06, "loss": 0.0046, "step": 2481 }, { "epoch": 2.2666666666666666, "grad_norm": 4.501248359680176, "learning_rate": 8.593607305936074e-06, "loss": 0.0373, "step": 2482 }, { "epoch": 2.267579908675799, "grad_norm": 10.51222038269043, "learning_rate": 8.592592592592593e-06, "loss": 0.0985, "step": 2483 }, { "epoch": 2.2684931506849315, "grad_norm": 130.10894775390625, "learning_rate": 8.591577879249113e-06, "loss": 3.9296, "step": 2484 }, { "epoch": 2.269406392694064, "grad_norm": 52.3765983581543, "learning_rate": 8.590563165905632e-06, "loss": 0.475, "step": 2485 }, { "epoch": 2.2703196347031964, "grad_norm": 6.957731246948242, "learning_rate": 8.589548452562153e-06, "loss": 0.0453, "step": 2486 }, { "epoch": 2.271232876712329, "grad_norm": 22.482452392578125, "learning_rate": 8.588533739218671e-06, "loss": 0.195, "step": 2487 }, { "epoch": 2.2721461187214613, "grad_norm": 95.20319366455078, "learning_rate": 8.58751902587519e-06, "loss": 1.8435, "step": 2488 }, { "epoch": 2.273059360730594, "grad_norm": 51.341651916503906, "learning_rate": 8.58650431253171e-06, "loss": 0.4847, "step": 2489 }, { "epoch": 2.2739726027397262, "grad_norm": 68.45936584472656, "learning_rate": 8.58548959918823e-06, "loss": 0.5999, "step": 2490 }, { "epoch": 2.2748858447488587, "grad_norm": 50.35246658325195, "learning_rate": 8.584474885844748e-06, "loss": 0.5558, "step": 2491 }, { "epoch": 2.2757990867579907, "grad_norm": 126.10161590576172, "learning_rate": 8.583460172501269e-06, "loss": 2.6868, "step": 2492 }, { "epoch": 2.276712328767123, "grad_norm": 41.42343521118164, "learning_rate": 8.58244545915779e-06, "loss": 0.3702, "step": 2493 }, { "epoch": 2.2776255707762556, "grad_norm": 0.8584371209144592, "learning_rate": 8.581430745814308e-06, "loss": 0.0058, "step": 2494 }, { "epoch": 2.278538812785388, "grad_norm": 21.242612838745117, "learning_rate": 8.580416032470827e-06, "loss": 0.2079, "step": 2495 }, { "epoch": 2.2794520547945205, "grad_norm": 19.75535011291504, "learning_rate": 8.579401319127348e-06, "loss": 0.1582, "step": 2496 }, { "epoch": 2.280365296803653, "grad_norm": 48.327693939208984, "learning_rate": 8.578386605783867e-06, "loss": 0.2083, "step": 2497 }, { "epoch": 2.2812785388127854, "grad_norm": 17.792198181152344, "learning_rate": 8.577371892440385e-06, "loss": 0.1646, "step": 2498 }, { "epoch": 2.282191780821918, "grad_norm": 4.357776165008545, "learning_rate": 8.576357179096906e-06, "loss": 0.0279, "step": 2499 }, { "epoch": 2.2831050228310503, "grad_norm": 1.8096137046813965, "learning_rate": 8.575342465753425e-06, "loss": 0.0085, "step": 2500 }, { "epoch": 2.2840182648401828, "grad_norm": 3.4466521739959717, "learning_rate": 8.574327752409944e-06, "loss": 0.0236, "step": 2501 }, { "epoch": 2.2849315068493152, "grad_norm": 42.754486083984375, "learning_rate": 8.573313039066464e-06, "loss": 0.3139, "step": 2502 }, { "epoch": 2.2858447488584472, "grad_norm": 1.2215241193771362, "learning_rate": 8.572298325722985e-06, "loss": 0.0084, "step": 2503 }, { "epoch": 2.2867579908675797, "grad_norm": 82.46865844726562, "learning_rate": 8.571283612379504e-06, "loss": 1.7307, "step": 2504 }, { "epoch": 2.287671232876712, "grad_norm": 28.89000701904297, "learning_rate": 8.570268899036022e-06, "loss": 0.2653, "step": 2505 }, { "epoch": 2.2885844748858446, "grad_norm": 1.4031422138214111, "learning_rate": 8.569254185692543e-06, "loss": 0.0095, "step": 2506 }, { "epoch": 2.289497716894977, "grad_norm": 74.97960662841797, "learning_rate": 8.568239472349062e-06, "loss": 0.5205, "step": 2507 }, { "epoch": 2.2904109589041095, "grad_norm": 80.41458129882812, "learning_rate": 8.56722475900558e-06, "loss": 0.3856, "step": 2508 }, { "epoch": 2.291324200913242, "grad_norm": 18.18034553527832, "learning_rate": 8.566210045662101e-06, "loss": 0.1523, "step": 2509 }, { "epoch": 2.2922374429223744, "grad_norm": 58.976627349853516, "learning_rate": 8.565195332318622e-06, "loss": 0.8635, "step": 2510 }, { "epoch": 2.293150684931507, "grad_norm": 23.252843856811523, "learning_rate": 8.564180618975139e-06, "loss": 0.2515, "step": 2511 }, { "epoch": 2.2940639269406393, "grad_norm": 66.21784973144531, "learning_rate": 8.56316590563166e-06, "loss": 0.942, "step": 2512 }, { "epoch": 2.2949771689497718, "grad_norm": 10.947118759155273, "learning_rate": 8.56215119228818e-06, "loss": 0.088, "step": 2513 }, { "epoch": 2.2958904109589042, "grad_norm": 24.78120994567871, "learning_rate": 8.561136478944699e-06, "loss": 0.2205, "step": 2514 }, { "epoch": 2.2968036529680367, "grad_norm": 0.26194748282432556, "learning_rate": 8.560121765601218e-06, "loss": 0.0021, "step": 2515 }, { "epoch": 2.297716894977169, "grad_norm": 25.064491271972656, "learning_rate": 8.559107052257738e-06, "loss": 0.2025, "step": 2516 }, { "epoch": 2.2986301369863016, "grad_norm": 76.2420425415039, "learning_rate": 8.558092338914257e-06, "loss": 0.5557, "step": 2517 }, { "epoch": 2.2995433789954336, "grad_norm": 27.10629653930664, "learning_rate": 8.557077625570776e-06, "loss": 0.3345, "step": 2518 }, { "epoch": 2.300456621004566, "grad_norm": 0.18027420341968536, "learning_rate": 8.556062912227296e-06, "loss": 0.0019, "step": 2519 }, { "epoch": 2.3013698630136985, "grad_norm": 8.017088890075684, "learning_rate": 8.555048198883817e-06, "loss": 0.0644, "step": 2520 }, { "epoch": 2.302283105022831, "grad_norm": 45.8594856262207, "learning_rate": 8.554033485540336e-06, "loss": 0.4588, "step": 2521 }, { "epoch": 2.3031963470319634, "grad_norm": 1.744215488433838, "learning_rate": 8.553018772196855e-06, "loss": 0.0137, "step": 2522 }, { "epoch": 2.304109589041096, "grad_norm": 0.8887469172477722, "learning_rate": 8.552004058853375e-06, "loss": 0.007, "step": 2523 }, { "epoch": 2.3050228310502283, "grad_norm": 30.417871475219727, "learning_rate": 8.550989345509894e-06, "loss": 0.2903, "step": 2524 }, { "epoch": 2.3059360730593608, "grad_norm": 38.414459228515625, "learning_rate": 8.549974632166413e-06, "loss": 0.4883, "step": 2525 }, { "epoch": 2.3068493150684932, "grad_norm": 96.76921081542969, "learning_rate": 8.548959918822933e-06, "loss": 1.0263, "step": 2526 }, { "epoch": 2.3077625570776257, "grad_norm": 46.314151763916016, "learning_rate": 8.547945205479454e-06, "loss": 0.3451, "step": 2527 }, { "epoch": 2.308675799086758, "grad_norm": 9.37238883972168, "learning_rate": 8.546930492135971e-06, "loss": 0.0806, "step": 2528 }, { "epoch": 2.3095890410958906, "grad_norm": 55.08688735961914, "learning_rate": 8.545915778792492e-06, "loss": 0.6991, "step": 2529 }, { "epoch": 2.3105022831050226, "grad_norm": 27.596269607543945, "learning_rate": 8.544901065449012e-06, "loss": 0.1305, "step": 2530 }, { "epoch": 2.311415525114155, "grad_norm": 30.365703582763672, "learning_rate": 8.543886352105531e-06, "loss": 0.2087, "step": 2531 }, { "epoch": 2.3123287671232875, "grad_norm": 5.415970325469971, "learning_rate": 8.54287163876205e-06, "loss": 0.044, "step": 2532 }, { "epoch": 2.31324200913242, "grad_norm": 1.7419556379318237, "learning_rate": 8.54185692541857e-06, "loss": 0.0128, "step": 2533 }, { "epoch": 2.3141552511415524, "grad_norm": 1.3461403846740723, "learning_rate": 8.54084221207509e-06, "loss": 0.0116, "step": 2534 }, { "epoch": 2.315068493150685, "grad_norm": 7.123410701751709, "learning_rate": 8.539827498731608e-06, "loss": 0.0459, "step": 2535 }, { "epoch": 2.3159817351598173, "grad_norm": 5.269051551818848, "learning_rate": 8.538812785388129e-06, "loss": 0.0414, "step": 2536 }, { "epoch": 2.3168949771689498, "grad_norm": 1.262459635734558, "learning_rate": 8.53779807204465e-06, "loss": 0.0122, "step": 2537 }, { "epoch": 2.317808219178082, "grad_norm": 0.15343360602855682, "learning_rate": 8.536783358701168e-06, "loss": 0.0011, "step": 2538 }, { "epoch": 2.3187214611872147, "grad_norm": 32.9000244140625, "learning_rate": 8.535768645357687e-06, "loss": 0.153, "step": 2539 }, { "epoch": 2.319634703196347, "grad_norm": 7.4058146476745605, "learning_rate": 8.534753932014207e-06, "loss": 0.0569, "step": 2540 }, { "epoch": 2.3205479452054796, "grad_norm": 27.194496154785156, "learning_rate": 8.533739218670726e-06, "loss": 0.3007, "step": 2541 }, { "epoch": 2.321461187214612, "grad_norm": 1.189505934715271, "learning_rate": 8.532724505327245e-06, "loss": 0.0089, "step": 2542 }, { "epoch": 2.3223744292237445, "grad_norm": 12.362743377685547, "learning_rate": 8.531709791983766e-06, "loss": 0.0284, "step": 2543 }, { "epoch": 2.323287671232877, "grad_norm": 36.64046096801758, "learning_rate": 8.530695078640285e-06, "loss": 0.4641, "step": 2544 }, { "epoch": 2.324200913242009, "grad_norm": 2.432210922241211, "learning_rate": 8.529680365296803e-06, "loss": 0.0178, "step": 2545 }, { "epoch": 2.3251141552511414, "grad_norm": 0.47886380553245544, "learning_rate": 8.528665651953324e-06, "loss": 0.0029, "step": 2546 }, { "epoch": 2.326027397260274, "grad_norm": 62.86466598510742, "learning_rate": 8.527650938609844e-06, "loss": 1.0092, "step": 2547 }, { "epoch": 2.3269406392694063, "grad_norm": 0.26562756299972534, "learning_rate": 8.526636225266363e-06, "loss": 0.002, "step": 2548 }, { "epoch": 2.3278538812785388, "grad_norm": 17.621583938598633, "learning_rate": 8.525621511922882e-06, "loss": 0.0805, "step": 2549 }, { "epoch": 2.328767123287671, "grad_norm": 28.811891555786133, "learning_rate": 8.524606798579403e-06, "loss": 0.2434, "step": 2550 }, { "epoch": 2.3296803652968037, "grad_norm": 33.36982727050781, "learning_rate": 8.523592085235922e-06, "loss": 0.2822, "step": 2551 }, { "epoch": 2.330593607305936, "grad_norm": 65.22868347167969, "learning_rate": 8.52257737189244e-06, "loss": 0.6498, "step": 2552 }, { "epoch": 2.3315068493150686, "grad_norm": 0.7052599787712097, "learning_rate": 8.521562658548961e-06, "loss": 0.0062, "step": 2553 }, { "epoch": 2.332420091324201, "grad_norm": 3.314591407775879, "learning_rate": 8.520547945205481e-06, "loss": 0.0236, "step": 2554 }, { "epoch": 2.3333333333333335, "grad_norm": 31.273948669433594, "learning_rate": 8.519533231861999e-06, "loss": 0.274, "step": 2555 }, { "epoch": 2.334246575342466, "grad_norm": 18.58838653564453, "learning_rate": 8.518518518518519e-06, "loss": 0.2245, "step": 2556 }, { "epoch": 2.335159817351598, "grad_norm": 38.571903228759766, "learning_rate": 8.51750380517504e-06, "loss": 0.2009, "step": 2557 }, { "epoch": 2.3360730593607304, "grad_norm": 4.841696739196777, "learning_rate": 8.516489091831559e-06, "loss": 0.0407, "step": 2558 }, { "epoch": 2.336986301369863, "grad_norm": 38.94816207885742, "learning_rate": 8.515474378488077e-06, "loss": 0.4243, "step": 2559 }, { "epoch": 2.3378995433789953, "grad_norm": 0.7147652506828308, "learning_rate": 8.514459665144598e-06, "loss": 0.0062, "step": 2560 }, { "epoch": 2.3388127853881278, "grad_norm": 76.3510971069336, "learning_rate": 8.513444951801117e-06, "loss": 0.9447, "step": 2561 }, { "epoch": 2.33972602739726, "grad_norm": 88.10295104980469, "learning_rate": 8.512430238457636e-06, "loss": 3.8926, "step": 2562 }, { "epoch": 2.3406392694063927, "grad_norm": 42.47578048706055, "learning_rate": 8.511415525114156e-06, "loss": 0.3393, "step": 2563 }, { "epoch": 2.341552511415525, "grad_norm": 16.667280197143555, "learning_rate": 8.510400811770677e-06, "loss": 0.1064, "step": 2564 }, { "epoch": 2.3424657534246576, "grad_norm": 37.20976257324219, "learning_rate": 8.509386098427196e-06, "loss": 0.2249, "step": 2565 }, { "epoch": 2.34337899543379, "grad_norm": 35.04728317260742, "learning_rate": 8.508371385083714e-06, "loss": 0.2469, "step": 2566 }, { "epoch": 2.3442922374429225, "grad_norm": 70.7943115234375, "learning_rate": 8.507356671740235e-06, "loss": 1.2883, "step": 2567 }, { "epoch": 2.345205479452055, "grad_norm": 1.339434266090393, "learning_rate": 8.506341958396754e-06, "loss": 0.0096, "step": 2568 }, { "epoch": 2.3461187214611874, "grad_norm": 7.990341663360596, "learning_rate": 8.505327245053273e-06, "loss": 0.0743, "step": 2569 }, { "epoch": 2.34703196347032, "grad_norm": 54.75538635253906, "learning_rate": 8.504312531709793e-06, "loss": 0.3341, "step": 2570 }, { "epoch": 2.3479452054794523, "grad_norm": 32.01702117919922, "learning_rate": 8.503297818366312e-06, "loss": 0.2816, "step": 2571 }, { "epoch": 2.3488584474885843, "grad_norm": 35.025753021240234, "learning_rate": 8.50228310502283e-06, "loss": 0.2502, "step": 2572 }, { "epoch": 2.3497716894977168, "grad_norm": 11.007740020751953, "learning_rate": 8.501268391679351e-06, "loss": 0.1059, "step": 2573 }, { "epoch": 2.350684931506849, "grad_norm": 4.7091779708862305, "learning_rate": 8.500253678335872e-06, "loss": 0.0369, "step": 2574 }, { "epoch": 2.3515981735159817, "grad_norm": 18.184593200683594, "learning_rate": 8.49923896499239e-06, "loss": 0.2074, "step": 2575 }, { "epoch": 2.352511415525114, "grad_norm": 48.99995803833008, "learning_rate": 8.49822425164891e-06, "loss": 0.1622, "step": 2576 }, { "epoch": 2.3534246575342466, "grad_norm": 5.245512008666992, "learning_rate": 8.49720953830543e-06, "loss": 0.0331, "step": 2577 }, { "epoch": 2.354337899543379, "grad_norm": 35.65060806274414, "learning_rate": 8.496194824961949e-06, "loss": 0.2846, "step": 2578 }, { "epoch": 2.3552511415525115, "grad_norm": 0.839098334312439, "learning_rate": 8.495180111618468e-06, "loss": 0.0054, "step": 2579 }, { "epoch": 2.356164383561644, "grad_norm": 2.112802743911743, "learning_rate": 8.494165398274988e-06, "loss": 0.0171, "step": 2580 }, { "epoch": 2.3570776255707764, "grad_norm": 67.97174835205078, "learning_rate": 8.493150684931507e-06, "loss": 0.4622, "step": 2581 }, { "epoch": 2.357990867579909, "grad_norm": 0.15388856828212738, "learning_rate": 8.492135971588028e-06, "loss": 0.0012, "step": 2582 }, { "epoch": 2.3589041095890413, "grad_norm": 18.922983169555664, "learning_rate": 8.491121258244547e-06, "loss": 0.2043, "step": 2583 }, { "epoch": 2.3598173515981733, "grad_norm": 2.993868350982666, "learning_rate": 8.490106544901067e-06, "loss": 0.0126, "step": 2584 }, { "epoch": 2.3607305936073057, "grad_norm": 2.42808198928833, "learning_rate": 8.489091831557586e-06, "loss": 0.0168, "step": 2585 }, { "epoch": 2.361643835616438, "grad_norm": 3.9183032512664795, "learning_rate": 8.488077118214105e-06, "loss": 0.0265, "step": 2586 }, { "epoch": 2.3625570776255707, "grad_norm": 5.216090202331543, "learning_rate": 8.487062404870625e-06, "loss": 0.0353, "step": 2587 }, { "epoch": 2.363470319634703, "grad_norm": 0.8040135502815247, "learning_rate": 8.486047691527144e-06, "loss": 0.0065, "step": 2588 }, { "epoch": 2.3643835616438356, "grad_norm": 2.307651996612549, "learning_rate": 8.485032978183663e-06, "loss": 0.0115, "step": 2589 }, { "epoch": 2.365296803652968, "grad_norm": 61.18443298339844, "learning_rate": 8.484018264840184e-06, "loss": 0.6708, "step": 2590 }, { "epoch": 2.3662100456621005, "grad_norm": 0.7214462757110596, "learning_rate": 8.483003551496702e-06, "loss": 0.0055, "step": 2591 }, { "epoch": 2.367123287671233, "grad_norm": 56.883121490478516, "learning_rate": 8.481988838153223e-06, "loss": 0.7549, "step": 2592 }, { "epoch": 2.3680365296803654, "grad_norm": 3.491386890411377, "learning_rate": 8.480974124809742e-06, "loss": 0.0242, "step": 2593 }, { "epoch": 2.368949771689498, "grad_norm": 9.125188827514648, "learning_rate": 8.479959411466262e-06, "loss": 0.0951, "step": 2594 }, { "epoch": 2.3698630136986303, "grad_norm": 17.459369659423828, "learning_rate": 8.478944698122781e-06, "loss": 0.1491, "step": 2595 }, { "epoch": 2.3707762557077627, "grad_norm": 20.480167388916016, "learning_rate": 8.4779299847793e-06, "loss": 0.0978, "step": 2596 }, { "epoch": 2.371689497716895, "grad_norm": 36.10066223144531, "learning_rate": 8.47691527143582e-06, "loss": 0.3624, "step": 2597 }, { "epoch": 2.3726027397260276, "grad_norm": 69.29745483398438, "learning_rate": 8.47590055809234e-06, "loss": 0.656, "step": 2598 }, { "epoch": 2.3735159817351597, "grad_norm": 42.89850616455078, "learning_rate": 8.474885844748858e-06, "loss": 0.2327, "step": 2599 }, { "epoch": 2.374429223744292, "grad_norm": 1.0298452377319336, "learning_rate": 8.473871131405379e-06, "loss": 0.0076, "step": 2600 }, { "epoch": 2.3753424657534246, "grad_norm": 0.34965938329696655, "learning_rate": 8.472856418061898e-06, "loss": 0.0018, "step": 2601 }, { "epoch": 2.376255707762557, "grad_norm": 48.080718994140625, "learning_rate": 8.471841704718418e-06, "loss": 0.4588, "step": 2602 }, { "epoch": 2.3771689497716895, "grad_norm": 194.02146911621094, "learning_rate": 8.470826991374937e-06, "loss": 5.2181, "step": 2603 }, { "epoch": 2.378082191780822, "grad_norm": 0.19772282242774963, "learning_rate": 8.469812278031458e-06, "loss": 0.0021, "step": 2604 }, { "epoch": 2.3789954337899544, "grad_norm": 86.7620849609375, "learning_rate": 8.468797564687976e-06, "loss": 1.584, "step": 2605 }, { "epoch": 2.379908675799087, "grad_norm": 0.6401365399360657, "learning_rate": 8.467782851344495e-06, "loss": 0.0045, "step": 2606 }, { "epoch": 2.3808219178082193, "grad_norm": 33.23946762084961, "learning_rate": 8.466768138001016e-06, "loss": 0.2894, "step": 2607 }, { "epoch": 2.3817351598173517, "grad_norm": 9.038766860961914, "learning_rate": 8.465753424657535e-06, "loss": 0.0601, "step": 2608 }, { "epoch": 2.382648401826484, "grad_norm": 3.1428275108337402, "learning_rate": 8.464738711314055e-06, "loss": 0.0213, "step": 2609 }, { "epoch": 2.383561643835616, "grad_norm": 4.136348724365234, "learning_rate": 8.463723997970574e-06, "loss": 0.0271, "step": 2610 }, { "epoch": 2.3844748858447486, "grad_norm": 64.18885803222656, "learning_rate": 8.462709284627093e-06, "loss": 0.4092, "step": 2611 }, { "epoch": 2.385388127853881, "grad_norm": 1.317162275314331, "learning_rate": 8.461694571283613e-06, "loss": 0.0089, "step": 2612 }, { "epoch": 2.3863013698630136, "grad_norm": 7.054538726806641, "learning_rate": 8.460679857940132e-06, "loss": 0.0431, "step": 2613 }, { "epoch": 2.387214611872146, "grad_norm": 67.70823669433594, "learning_rate": 8.459665144596653e-06, "loss": 0.9816, "step": 2614 }, { "epoch": 2.3881278538812785, "grad_norm": 46.63129806518555, "learning_rate": 8.458650431253172e-06, "loss": 0.3292, "step": 2615 }, { "epoch": 2.389041095890411, "grad_norm": 6.787561893463135, "learning_rate": 8.45763571790969e-06, "loss": 0.0455, "step": 2616 }, { "epoch": 2.3899543378995434, "grad_norm": 44.674983978271484, "learning_rate": 8.456621004566211e-06, "loss": 0.5185, "step": 2617 }, { "epoch": 2.390867579908676, "grad_norm": 8.09572696685791, "learning_rate": 8.45560629122273e-06, "loss": 0.0407, "step": 2618 }, { "epoch": 2.3917808219178083, "grad_norm": 11.151247024536133, "learning_rate": 8.45459157787925e-06, "loss": 0.0684, "step": 2619 }, { "epoch": 2.3926940639269407, "grad_norm": 100.16734313964844, "learning_rate": 8.45357686453577e-06, "loss": 0.7579, "step": 2620 }, { "epoch": 2.393607305936073, "grad_norm": 9.567873001098633, "learning_rate": 8.452562151192288e-06, "loss": 0.0508, "step": 2621 }, { "epoch": 2.3945205479452056, "grad_norm": 5.985392093658447, "learning_rate": 8.451547437848809e-06, "loss": 0.057, "step": 2622 }, { "epoch": 2.395433789954338, "grad_norm": 57.48337173461914, "learning_rate": 8.450532724505328e-06, "loss": 0.6655, "step": 2623 }, { "epoch": 2.3963470319634705, "grad_norm": 8.49120044708252, "learning_rate": 8.449518011161848e-06, "loss": 0.0804, "step": 2624 }, { "epoch": 2.3972602739726026, "grad_norm": 3.5174942016601562, "learning_rate": 8.448503297818367e-06, "loss": 0.0249, "step": 2625 }, { "epoch": 2.398173515981735, "grad_norm": 10.08123779296875, "learning_rate": 8.447488584474887e-06, "loss": 0.0948, "step": 2626 }, { "epoch": 2.3990867579908675, "grad_norm": 13.46921157836914, "learning_rate": 8.446473871131406e-06, "loss": 0.0858, "step": 2627 }, { "epoch": 2.4, "grad_norm": 58.69728469848633, "learning_rate": 8.445459157787925e-06, "loss": 1.0326, "step": 2628 }, { "epoch": 2.4009132420091324, "grad_norm": 144.04852294921875, "learning_rate": 8.444444444444446e-06, "loss": 1.4672, "step": 2629 }, { "epoch": 2.401826484018265, "grad_norm": 25.32732582092285, "learning_rate": 8.443429731100965e-06, "loss": 0.1319, "step": 2630 }, { "epoch": 2.4027397260273973, "grad_norm": 11.119710922241211, "learning_rate": 8.442415017757483e-06, "loss": 0.0895, "step": 2631 }, { "epoch": 2.4036529680365297, "grad_norm": 4.643616676330566, "learning_rate": 8.441400304414004e-06, "loss": 0.0383, "step": 2632 }, { "epoch": 2.404566210045662, "grad_norm": 14.459583282470703, "learning_rate": 8.440385591070523e-06, "loss": 0.1069, "step": 2633 }, { "epoch": 2.4054794520547946, "grad_norm": 123.27848052978516, "learning_rate": 8.439370877727043e-06, "loss": 0.566, "step": 2634 }, { "epoch": 2.406392694063927, "grad_norm": 4.399123191833496, "learning_rate": 8.438356164383562e-06, "loss": 0.0392, "step": 2635 }, { "epoch": 2.4073059360730595, "grad_norm": 7.537052154541016, "learning_rate": 8.437341451040083e-06, "loss": 0.0599, "step": 2636 }, { "epoch": 2.4082191780821915, "grad_norm": 5.644503116607666, "learning_rate": 8.436326737696602e-06, "loss": 0.0406, "step": 2637 }, { "epoch": 2.409132420091324, "grad_norm": 3.9865849018096924, "learning_rate": 8.43531202435312e-06, "loss": 0.0356, "step": 2638 }, { "epoch": 2.4100456621004565, "grad_norm": 3.7506871223449707, "learning_rate": 8.434297311009641e-06, "loss": 0.0244, "step": 2639 }, { "epoch": 2.410958904109589, "grad_norm": 75.88584899902344, "learning_rate": 8.43328259766616e-06, "loss": 2.5223, "step": 2640 }, { "epoch": 2.4118721461187214, "grad_norm": 26.104732513427734, "learning_rate": 8.432267884322679e-06, "loss": 0.237, "step": 2641 }, { "epoch": 2.412785388127854, "grad_norm": 73.81430053710938, "learning_rate": 8.431253170979199e-06, "loss": 2.498, "step": 2642 }, { "epoch": 2.4136986301369863, "grad_norm": 45.52847671508789, "learning_rate": 8.430238457635718e-06, "loss": 0.6176, "step": 2643 }, { "epoch": 2.4146118721461187, "grad_norm": 15.68456745147705, "learning_rate": 8.429223744292239e-06, "loss": 0.178, "step": 2644 }, { "epoch": 2.415525114155251, "grad_norm": 2.634263277053833, "learning_rate": 8.428209030948757e-06, "loss": 0.0214, "step": 2645 }, { "epoch": 2.4164383561643836, "grad_norm": 0.06786513328552246, "learning_rate": 8.427194317605278e-06, "loss": 0.0005, "step": 2646 }, { "epoch": 2.417351598173516, "grad_norm": 40.90235900878906, "learning_rate": 8.426179604261797e-06, "loss": 0.2784, "step": 2647 }, { "epoch": 2.4182648401826485, "grad_norm": 2.9356648921966553, "learning_rate": 8.425164890918316e-06, "loss": 0.0244, "step": 2648 }, { "epoch": 2.419178082191781, "grad_norm": 0.679267168045044, "learning_rate": 8.424150177574836e-06, "loss": 0.0059, "step": 2649 }, { "epoch": 2.4200913242009134, "grad_norm": 42.27705383300781, "learning_rate": 8.423135464231355e-06, "loss": 0.3274, "step": 2650 }, { "epoch": 2.421004566210046, "grad_norm": 1.8208473920822144, "learning_rate": 8.422120750887874e-06, "loss": 0.0158, "step": 2651 }, { "epoch": 2.421917808219178, "grad_norm": 21.079381942749023, "learning_rate": 8.421106037544394e-06, "loss": 0.1387, "step": 2652 }, { "epoch": 2.4228310502283104, "grad_norm": 3.3985981941223145, "learning_rate": 8.420091324200915e-06, "loss": 0.0245, "step": 2653 }, { "epoch": 2.423744292237443, "grad_norm": 6.0880279541015625, "learning_rate": 8.419076610857434e-06, "loss": 0.0379, "step": 2654 }, { "epoch": 2.4246575342465753, "grad_norm": 20.751741409301758, "learning_rate": 8.418061897513953e-06, "loss": 0.1384, "step": 2655 }, { "epoch": 2.4255707762557077, "grad_norm": 16.809478759765625, "learning_rate": 8.417047184170473e-06, "loss": 0.058, "step": 2656 }, { "epoch": 2.42648401826484, "grad_norm": 5.170472145080566, "learning_rate": 8.416032470826992e-06, "loss": 0.0328, "step": 2657 }, { "epoch": 2.4273972602739726, "grad_norm": 2.4152414798736572, "learning_rate": 8.41501775748351e-06, "loss": 0.0168, "step": 2658 }, { "epoch": 2.428310502283105, "grad_norm": 55.60622024536133, "learning_rate": 8.414003044140031e-06, "loss": 0.5926, "step": 2659 }, { "epoch": 2.4292237442922375, "grad_norm": 0.46358898282051086, "learning_rate": 8.41298833079655e-06, "loss": 0.0024, "step": 2660 }, { "epoch": 2.43013698630137, "grad_norm": 0.36607563495635986, "learning_rate": 8.411973617453069e-06, "loss": 0.0024, "step": 2661 }, { "epoch": 2.4310502283105024, "grad_norm": 11.013490676879883, "learning_rate": 8.41095890410959e-06, "loss": 0.0934, "step": 2662 }, { "epoch": 2.431963470319635, "grad_norm": 8.779145240783691, "learning_rate": 8.40994419076611e-06, "loss": 0.0816, "step": 2663 }, { "epoch": 2.432876712328767, "grad_norm": 70.0237808227539, "learning_rate": 8.408929477422629e-06, "loss": 0.8467, "step": 2664 }, { "epoch": 2.4337899543378994, "grad_norm": 37.024269104003906, "learning_rate": 8.407914764079148e-06, "loss": 0.4089, "step": 2665 }, { "epoch": 2.434703196347032, "grad_norm": 3.5686779022216797, "learning_rate": 8.406900050735668e-06, "loss": 0.0204, "step": 2666 }, { "epoch": 2.4356164383561643, "grad_norm": 33.99445343017578, "learning_rate": 8.405885337392187e-06, "loss": 0.2591, "step": 2667 }, { "epoch": 2.4365296803652967, "grad_norm": 114.45307159423828, "learning_rate": 8.404870624048706e-06, "loss": 2.9379, "step": 2668 }, { "epoch": 2.437442922374429, "grad_norm": 97.61579132080078, "learning_rate": 8.403855910705227e-06, "loss": 1.5741, "step": 2669 }, { "epoch": 2.4383561643835616, "grad_norm": 1.2475357055664062, "learning_rate": 8.402841197361747e-06, "loss": 0.0096, "step": 2670 }, { "epoch": 2.439269406392694, "grad_norm": 6.158553600311279, "learning_rate": 8.401826484018264e-06, "loss": 0.0274, "step": 2671 }, { "epoch": 2.4401826484018265, "grad_norm": 0.8173367381095886, "learning_rate": 8.400811770674785e-06, "loss": 0.0075, "step": 2672 }, { "epoch": 2.441095890410959, "grad_norm": 46.58840560913086, "learning_rate": 8.399797057331305e-06, "loss": 0.4872, "step": 2673 }, { "epoch": 2.4420091324200914, "grad_norm": 8.600563049316406, "learning_rate": 8.398782343987824e-06, "loss": 0.0839, "step": 2674 }, { "epoch": 2.442922374429224, "grad_norm": 56.649845123291016, "learning_rate": 8.397767630644343e-06, "loss": 0.859, "step": 2675 }, { "epoch": 2.4438356164383563, "grad_norm": 1.860815167427063, "learning_rate": 8.396752917300864e-06, "loss": 0.0121, "step": 2676 }, { "epoch": 2.444748858447489, "grad_norm": 0.13422195613384247, "learning_rate": 8.395738203957382e-06, "loss": 0.0011, "step": 2677 }, { "epoch": 2.4456621004566212, "grad_norm": 4.185128688812256, "learning_rate": 8.394723490613901e-06, "loss": 0.0415, "step": 2678 }, { "epoch": 2.4465753424657533, "grad_norm": 0.0968497097492218, "learning_rate": 8.393708777270422e-06, "loss": 0.0008, "step": 2679 }, { "epoch": 2.4474885844748857, "grad_norm": 3.130887269973755, "learning_rate": 8.392694063926942e-06, "loss": 0.0187, "step": 2680 }, { "epoch": 2.448401826484018, "grad_norm": 1.05530846118927, "learning_rate": 8.391679350583461e-06, "loss": 0.0095, "step": 2681 }, { "epoch": 2.4493150684931506, "grad_norm": 45.96403121948242, "learning_rate": 8.39066463723998e-06, "loss": 0.6178, "step": 2682 }, { "epoch": 2.450228310502283, "grad_norm": 45.904239654541016, "learning_rate": 8.3896499238965e-06, "loss": 0.4281, "step": 2683 }, { "epoch": 2.4511415525114155, "grad_norm": 20.13577651977539, "learning_rate": 8.38863521055302e-06, "loss": 0.0732, "step": 2684 }, { "epoch": 2.452054794520548, "grad_norm": 6.028274059295654, "learning_rate": 8.387620497209538e-06, "loss": 0.0393, "step": 2685 }, { "epoch": 2.4529680365296804, "grad_norm": 93.70053100585938, "learning_rate": 8.386605783866059e-06, "loss": 4.7389, "step": 2686 }, { "epoch": 2.453881278538813, "grad_norm": 105.02470397949219, "learning_rate": 8.385591070522578e-06, "loss": 1.9925, "step": 2687 }, { "epoch": 2.4547945205479453, "grad_norm": 7.734983921051025, "learning_rate": 8.384576357179096e-06, "loss": 0.0741, "step": 2688 }, { "epoch": 2.455707762557078, "grad_norm": 1.161905288696289, "learning_rate": 8.383561643835617e-06, "loss": 0.0085, "step": 2689 }, { "epoch": 2.45662100456621, "grad_norm": 2.7133963108062744, "learning_rate": 8.382546930492138e-06, "loss": 0.0139, "step": 2690 }, { "epoch": 2.4575342465753423, "grad_norm": 4.14703893661499, "learning_rate": 8.381532217148656e-06, "loss": 0.0203, "step": 2691 }, { "epoch": 2.4584474885844747, "grad_norm": 44.27305603027344, "learning_rate": 8.380517503805175e-06, "loss": 0.3313, "step": 2692 }, { "epoch": 2.459360730593607, "grad_norm": 1.7487560510635376, "learning_rate": 8.379502790461696e-06, "loss": 0.0133, "step": 2693 }, { "epoch": 2.4602739726027396, "grad_norm": 4.010412216186523, "learning_rate": 8.378488077118215e-06, "loss": 0.0342, "step": 2694 }, { "epoch": 2.461187214611872, "grad_norm": 125.5676040649414, "learning_rate": 8.377473363774733e-06, "loss": 1.2748, "step": 2695 }, { "epoch": 2.4621004566210045, "grad_norm": 4.779362201690674, "learning_rate": 8.376458650431254e-06, "loss": 0.0473, "step": 2696 }, { "epoch": 2.463013698630137, "grad_norm": 37.10575485229492, "learning_rate": 8.375443937087775e-06, "loss": 0.2094, "step": 2697 }, { "epoch": 2.4639269406392694, "grad_norm": 78.64927673339844, "learning_rate": 8.374429223744293e-06, "loss": 2.1138, "step": 2698 }, { "epoch": 2.464840182648402, "grad_norm": 68.82449340820312, "learning_rate": 8.373414510400812e-06, "loss": 1.0188, "step": 2699 }, { "epoch": 2.4657534246575343, "grad_norm": 0.7693138122558594, "learning_rate": 8.372399797057333e-06, "loss": 0.0079, "step": 2700 }, { "epoch": 2.466666666666667, "grad_norm": 0.6519371867179871, "learning_rate": 8.371385083713852e-06, "loss": 0.0024, "step": 2701 }, { "epoch": 2.4675799086757992, "grad_norm": 24.71098518371582, "learning_rate": 8.37037037037037e-06, "loss": 0.2351, "step": 2702 }, { "epoch": 2.4684931506849317, "grad_norm": 1.9002573490142822, "learning_rate": 8.369355657026891e-06, "loss": 0.0148, "step": 2703 }, { "epoch": 2.469406392694064, "grad_norm": 1.9843095541000366, "learning_rate": 8.36834094368341e-06, "loss": 0.0144, "step": 2704 }, { "epoch": 2.470319634703196, "grad_norm": 0.5767043828964233, "learning_rate": 8.367326230339929e-06, "loss": 0.0046, "step": 2705 }, { "epoch": 2.4712328767123286, "grad_norm": 4.837329864501953, "learning_rate": 8.36631151699645e-06, "loss": 0.0352, "step": 2706 }, { "epoch": 2.472146118721461, "grad_norm": 9.89062213897705, "learning_rate": 8.36529680365297e-06, "loss": 0.0788, "step": 2707 }, { "epoch": 2.4730593607305935, "grad_norm": 44.87291717529297, "learning_rate": 8.364282090309489e-06, "loss": 0.5027, "step": 2708 }, { "epoch": 2.473972602739726, "grad_norm": 5.90809440612793, "learning_rate": 8.363267376966007e-06, "loss": 0.0339, "step": 2709 }, { "epoch": 2.4748858447488584, "grad_norm": 58.14946746826172, "learning_rate": 8.362252663622528e-06, "loss": 1.1171, "step": 2710 }, { "epoch": 2.475799086757991, "grad_norm": 20.325328826904297, "learning_rate": 8.361237950279047e-06, "loss": 0.2007, "step": 2711 }, { "epoch": 2.4767123287671233, "grad_norm": 18.483951568603516, "learning_rate": 8.360223236935566e-06, "loss": 0.1404, "step": 2712 }, { "epoch": 2.477625570776256, "grad_norm": 52.97976303100586, "learning_rate": 8.359208523592086e-06, "loss": 0.5518, "step": 2713 }, { "epoch": 2.4785388127853882, "grad_norm": 53.25471878051758, "learning_rate": 8.358193810248607e-06, "loss": 0.6739, "step": 2714 }, { "epoch": 2.4794520547945207, "grad_norm": 2.0633704662323, "learning_rate": 8.357179096905124e-06, "loss": 0.0103, "step": 2715 }, { "epoch": 2.480365296803653, "grad_norm": 14.269100189208984, "learning_rate": 8.356164383561644e-06, "loss": 0.0912, "step": 2716 }, { "epoch": 2.481278538812785, "grad_norm": 72.6993179321289, "learning_rate": 8.355149670218165e-06, "loss": 0.4099, "step": 2717 }, { "epoch": 2.4821917808219176, "grad_norm": 4.224847793579102, "learning_rate": 8.354134956874684e-06, "loss": 0.0336, "step": 2718 }, { "epoch": 2.48310502283105, "grad_norm": 64.01741790771484, "learning_rate": 8.353120243531203e-06, "loss": 0.848, "step": 2719 }, { "epoch": 2.4840182648401825, "grad_norm": 68.12596130371094, "learning_rate": 8.352105530187723e-06, "loss": 0.5254, "step": 2720 }, { "epoch": 2.484931506849315, "grad_norm": 14.54881477355957, "learning_rate": 8.351090816844242e-06, "loss": 0.1192, "step": 2721 }, { "epoch": 2.4858447488584474, "grad_norm": 7.19291877746582, "learning_rate": 8.350076103500761e-06, "loss": 0.0461, "step": 2722 }, { "epoch": 2.48675799086758, "grad_norm": 2.8000917434692383, "learning_rate": 8.349061390157281e-06, "loss": 0.0109, "step": 2723 }, { "epoch": 2.4876712328767123, "grad_norm": 34.55055618286133, "learning_rate": 8.348046676813802e-06, "loss": 0.2284, "step": 2724 }, { "epoch": 2.4885844748858448, "grad_norm": 36.613704681396484, "learning_rate": 8.347031963470321e-06, "loss": 0.2444, "step": 2725 }, { "epoch": 2.4894977168949772, "grad_norm": 12.818254470825195, "learning_rate": 8.34601725012684e-06, "loss": 0.0907, "step": 2726 }, { "epoch": 2.4904109589041097, "grad_norm": 0.8005800843238831, "learning_rate": 8.34500253678336e-06, "loss": 0.0062, "step": 2727 }, { "epoch": 2.491324200913242, "grad_norm": 29.016504287719727, "learning_rate": 8.343987823439879e-06, "loss": 0.2811, "step": 2728 }, { "epoch": 2.4922374429223746, "grad_norm": 0.3395978808403015, "learning_rate": 8.342973110096398e-06, "loss": 0.0034, "step": 2729 }, { "epoch": 2.493150684931507, "grad_norm": 18.228878021240234, "learning_rate": 8.341958396752918e-06, "loss": 0.1616, "step": 2730 }, { "epoch": 2.4940639269406395, "grad_norm": 87.80709075927734, "learning_rate": 8.340943683409437e-06, "loss": 1.066, "step": 2731 }, { "epoch": 2.4949771689497715, "grad_norm": 2.922797441482544, "learning_rate": 8.339928970065956e-06, "loss": 0.0154, "step": 2732 }, { "epoch": 2.495890410958904, "grad_norm": 4.117770671844482, "learning_rate": 8.338914256722477e-06, "loss": 0.0222, "step": 2733 }, { "epoch": 2.4968036529680364, "grad_norm": 0.16188199818134308, "learning_rate": 8.337899543378997e-06, "loss": 0.0014, "step": 2734 }, { "epoch": 2.497716894977169, "grad_norm": 208.51487731933594, "learning_rate": 8.336884830035516e-06, "loss": 1.5136, "step": 2735 }, { "epoch": 2.4986301369863013, "grad_norm": 0.18690522015094757, "learning_rate": 8.335870116692035e-06, "loss": 0.0019, "step": 2736 }, { "epoch": 2.4995433789954338, "grad_norm": 17.007526397705078, "learning_rate": 8.334855403348555e-06, "loss": 0.1035, "step": 2737 }, { "epoch": 2.5004566210045662, "grad_norm": 2.3334038257598877, "learning_rate": 8.333840690005074e-06, "loss": 0.0124, "step": 2738 }, { "epoch": 2.5013698630136987, "grad_norm": 37.590301513671875, "learning_rate": 8.332825976661593e-06, "loss": 0.1914, "step": 2739 }, { "epoch": 2.502283105022831, "grad_norm": 9.039206504821777, "learning_rate": 8.331811263318114e-06, "loss": 0.0541, "step": 2740 }, { "epoch": 2.5031963470319636, "grad_norm": 85.79569244384766, "learning_rate": 8.330796549974633e-06, "loss": 0.6621, "step": 2741 }, { "epoch": 2.504109589041096, "grad_norm": 13.343842506408691, "learning_rate": 8.329781836631153e-06, "loss": 0.1353, "step": 2742 }, { "epoch": 2.505022831050228, "grad_norm": 12.608297348022461, "learning_rate": 8.328767123287672e-06, "loss": 0.1009, "step": 2743 }, { "epoch": 2.5059360730593605, "grad_norm": 2.2368295192718506, "learning_rate": 8.327752409944192e-06, "loss": 0.0189, "step": 2744 }, { "epoch": 2.506849315068493, "grad_norm": 64.31807708740234, "learning_rate": 8.326737696600711e-06, "loss": 0.5865, "step": 2745 }, { "epoch": 2.5077625570776254, "grad_norm": 5.517115116119385, "learning_rate": 8.32572298325723e-06, "loss": 0.0321, "step": 2746 }, { "epoch": 2.508675799086758, "grad_norm": 28.406673431396484, "learning_rate": 8.32470826991375e-06, "loss": 0.2758, "step": 2747 }, { "epoch": 2.5095890410958903, "grad_norm": 51.2845573425293, "learning_rate": 8.32369355657027e-06, "loss": 0.4336, "step": 2748 }, { "epoch": 2.5105022831050228, "grad_norm": 2.9117066860198975, "learning_rate": 8.322678843226788e-06, "loss": 0.0144, "step": 2749 }, { "epoch": 2.5114155251141552, "grad_norm": 0.40310072898864746, "learning_rate": 8.321664129883309e-06, "loss": 0.0039, "step": 2750 }, { "epoch": 2.5123287671232877, "grad_norm": 62.71851348876953, "learning_rate": 8.320649416539828e-06, "loss": 0.4961, "step": 2751 }, { "epoch": 2.51324200913242, "grad_norm": 5.881555557250977, "learning_rate": 8.319634703196348e-06, "loss": 0.0426, "step": 2752 }, { "epoch": 2.5141552511415526, "grad_norm": 5.152280807495117, "learning_rate": 8.318619989852867e-06, "loss": 0.043, "step": 2753 }, { "epoch": 2.515068493150685, "grad_norm": 41.87147903442383, "learning_rate": 8.317605276509388e-06, "loss": 0.5395, "step": 2754 }, { "epoch": 2.5159817351598175, "grad_norm": 6.9257893562316895, "learning_rate": 8.316590563165907e-06, "loss": 0.059, "step": 2755 }, { "epoch": 2.51689497716895, "grad_norm": 7.058032035827637, "learning_rate": 8.315575849822425e-06, "loss": 0.053, "step": 2756 }, { "epoch": 2.5178082191780824, "grad_norm": 8.147515296936035, "learning_rate": 8.314561136478946e-06, "loss": 0.0857, "step": 2757 }, { "epoch": 2.518721461187215, "grad_norm": 12.444226264953613, "learning_rate": 8.313546423135465e-06, "loss": 0.0695, "step": 2758 }, { "epoch": 2.5196347031963473, "grad_norm": 7.476531028747559, "learning_rate": 8.312531709791984e-06, "loss": 0.0669, "step": 2759 }, { "epoch": 2.5205479452054793, "grad_norm": 2.595909595489502, "learning_rate": 8.311516996448504e-06, "loss": 0.0243, "step": 2760 }, { "epoch": 2.5214611872146118, "grad_norm": 2.986842632293701, "learning_rate": 8.310502283105023e-06, "loss": 0.0187, "step": 2761 }, { "epoch": 2.522374429223744, "grad_norm": 1.1786538362503052, "learning_rate": 8.309487569761544e-06, "loss": 0.0111, "step": 2762 }, { "epoch": 2.5232876712328767, "grad_norm": 56.16426467895508, "learning_rate": 8.308472856418062e-06, "loss": 0.618, "step": 2763 }, { "epoch": 2.524200913242009, "grad_norm": 23.597211837768555, "learning_rate": 8.307458143074583e-06, "loss": 0.1699, "step": 2764 }, { "epoch": 2.5251141552511416, "grad_norm": 0.8539754152297974, "learning_rate": 8.306443429731102e-06, "loss": 0.0068, "step": 2765 }, { "epoch": 2.526027397260274, "grad_norm": 25.58540153503418, "learning_rate": 8.30542871638762e-06, "loss": 0.1878, "step": 2766 }, { "epoch": 2.5269406392694065, "grad_norm": 125.21698760986328, "learning_rate": 8.304414003044141e-06, "loss": 1.3737, "step": 2767 }, { "epoch": 2.527853881278539, "grad_norm": 43.84666061401367, "learning_rate": 8.30339928970066e-06, "loss": 0.5831, "step": 2768 }, { "epoch": 2.5287671232876714, "grad_norm": 0.5876354575157166, "learning_rate": 8.30238457635718e-06, "loss": 0.0047, "step": 2769 }, { "epoch": 2.5296803652968034, "grad_norm": 71.01434326171875, "learning_rate": 8.3013698630137e-06, "loss": 1.0704, "step": 2770 }, { "epoch": 2.530593607305936, "grad_norm": 132.9503936767578, "learning_rate": 8.300355149670218e-06, "loss": 2.1001, "step": 2771 }, { "epoch": 2.5315068493150683, "grad_norm": 27.613555908203125, "learning_rate": 8.299340436326739e-06, "loss": 0.3348, "step": 2772 }, { "epoch": 2.5324200913242008, "grad_norm": 14.226507186889648, "learning_rate": 8.298325722983258e-06, "loss": 0.1203, "step": 2773 }, { "epoch": 2.533333333333333, "grad_norm": 12.60416316986084, "learning_rate": 8.297311009639778e-06, "loss": 0.0979, "step": 2774 }, { "epoch": 2.5342465753424657, "grad_norm": 25.883888244628906, "learning_rate": 8.296296296296297e-06, "loss": 0.0974, "step": 2775 }, { "epoch": 2.535159817351598, "grad_norm": 12.398091316223145, "learning_rate": 8.295281582952816e-06, "loss": 0.0615, "step": 2776 }, { "epoch": 2.5360730593607306, "grad_norm": 2.109067440032959, "learning_rate": 8.294266869609336e-06, "loss": 0.0172, "step": 2777 }, { "epoch": 2.536986301369863, "grad_norm": 38.20925521850586, "learning_rate": 8.293252156265855e-06, "loss": 0.3749, "step": 2778 }, { "epoch": 2.5378995433789955, "grad_norm": 1.864423155784607, "learning_rate": 8.292237442922376e-06, "loss": 0.0133, "step": 2779 }, { "epoch": 2.538812785388128, "grad_norm": 9.737853050231934, "learning_rate": 8.291222729578895e-06, "loss": 0.059, "step": 2780 }, { "epoch": 2.5397260273972604, "grad_norm": 15.021849632263184, "learning_rate": 8.290208016235413e-06, "loss": 0.0872, "step": 2781 }, { "epoch": 2.540639269406393, "grad_norm": 24.528278350830078, "learning_rate": 8.289193302891934e-06, "loss": 0.1617, "step": 2782 }, { "epoch": 2.5415525114155253, "grad_norm": 18.74258804321289, "learning_rate": 8.288178589548453e-06, "loss": 0.1963, "step": 2783 }, { "epoch": 2.5424657534246577, "grad_norm": 45.81221008300781, "learning_rate": 8.287163876204973e-06, "loss": 0.7136, "step": 2784 }, { "epoch": 2.54337899543379, "grad_norm": 32.07410430908203, "learning_rate": 8.286149162861492e-06, "loss": 0.2037, "step": 2785 }, { "epoch": 2.544292237442922, "grad_norm": 2.9954214096069336, "learning_rate": 8.285134449518013e-06, "loss": 0.0209, "step": 2786 }, { "epoch": 2.5452054794520547, "grad_norm": 0.27439939975738525, "learning_rate": 8.284119736174532e-06, "loss": 0.0016, "step": 2787 }, { "epoch": 2.546118721461187, "grad_norm": 0.505601704120636, "learning_rate": 8.28310502283105e-06, "loss": 0.0043, "step": 2788 }, { "epoch": 2.5470319634703196, "grad_norm": 74.30067443847656, "learning_rate": 8.282090309487571e-06, "loss": 0.8605, "step": 2789 }, { "epoch": 2.547945205479452, "grad_norm": 9.099204063415527, "learning_rate": 8.28107559614409e-06, "loss": 0.0647, "step": 2790 }, { "epoch": 2.5488584474885845, "grad_norm": 41.725807189941406, "learning_rate": 8.280060882800609e-06, "loss": 0.2028, "step": 2791 }, { "epoch": 2.549771689497717, "grad_norm": 11.972780227661133, "learning_rate": 8.27904616945713e-06, "loss": 0.0786, "step": 2792 }, { "epoch": 2.5506849315068494, "grad_norm": 3.433547258377075, "learning_rate": 8.278031456113648e-06, "loss": 0.0228, "step": 2793 }, { "epoch": 2.551598173515982, "grad_norm": 3.212928533554077, "learning_rate": 8.277016742770169e-06, "loss": 0.0184, "step": 2794 }, { "epoch": 2.5525114155251143, "grad_norm": 0.8230350613594055, "learning_rate": 8.276002029426687e-06, "loss": 0.0067, "step": 2795 }, { "epoch": 2.5534246575342463, "grad_norm": 19.24207878112793, "learning_rate": 8.274987316083208e-06, "loss": 0.1535, "step": 2796 }, { "epoch": 2.5543378995433788, "grad_norm": 17.559083938598633, "learning_rate": 8.273972602739727e-06, "loss": 0.179, "step": 2797 }, { "epoch": 2.555251141552511, "grad_norm": 3.281402111053467, "learning_rate": 8.272957889396246e-06, "loss": 0.0357, "step": 2798 }, { "epoch": 2.5561643835616437, "grad_norm": 21.17262077331543, "learning_rate": 8.271943176052766e-06, "loss": 0.177, "step": 2799 }, { "epoch": 2.557077625570776, "grad_norm": 34.457000732421875, "learning_rate": 8.270928462709285e-06, "loss": 0.31, "step": 2800 }, { "epoch": 2.5579908675799086, "grad_norm": 113.40647888183594, "learning_rate": 8.269913749365804e-06, "loss": 0.9942, "step": 2801 }, { "epoch": 2.558904109589041, "grad_norm": 0.728276789188385, "learning_rate": 8.268899036022324e-06, "loss": 0.0056, "step": 2802 }, { "epoch": 2.5598173515981735, "grad_norm": 18.628814697265625, "learning_rate": 8.267884322678843e-06, "loss": 0.1403, "step": 2803 }, { "epoch": 2.560730593607306, "grad_norm": 33.84230422973633, "learning_rate": 8.266869609335364e-06, "loss": 0.2812, "step": 2804 }, { "epoch": 2.5616438356164384, "grad_norm": 1.518782615661621, "learning_rate": 8.265854895991883e-06, "loss": 0.011, "step": 2805 }, { "epoch": 2.562557077625571, "grad_norm": 8.245772361755371, "learning_rate": 8.264840182648403e-06, "loss": 0.0765, "step": 2806 }, { "epoch": 2.5634703196347033, "grad_norm": 14.376304626464844, "learning_rate": 8.263825469304922e-06, "loss": 0.0899, "step": 2807 }, { "epoch": 2.5643835616438357, "grad_norm": 78.84542083740234, "learning_rate": 8.262810755961441e-06, "loss": 1.0527, "step": 2808 }, { "epoch": 2.565296803652968, "grad_norm": 46.85639572143555, "learning_rate": 8.261796042617961e-06, "loss": 0.5303, "step": 2809 }, { "epoch": 2.5662100456621006, "grad_norm": 0.41253748536109924, "learning_rate": 8.26078132927448e-06, "loss": 0.0035, "step": 2810 }, { "epoch": 2.567123287671233, "grad_norm": 68.69620513916016, "learning_rate": 8.259766615930999e-06, "loss": 0.4042, "step": 2811 }, { "epoch": 2.5680365296803656, "grad_norm": 0.7641323804855347, "learning_rate": 8.25875190258752e-06, "loss": 0.0074, "step": 2812 }, { "epoch": 2.5689497716894976, "grad_norm": 22.025087356567383, "learning_rate": 8.25773718924404e-06, "loss": 0.1808, "step": 2813 }, { "epoch": 2.56986301369863, "grad_norm": 0.7632309198379517, "learning_rate": 8.256722475900559e-06, "loss": 0.0038, "step": 2814 }, { "epoch": 2.5707762557077625, "grad_norm": 61.36615753173828, "learning_rate": 8.255707762557078e-06, "loss": 0.9333, "step": 2815 }, { "epoch": 2.571689497716895, "grad_norm": 64.92212677001953, "learning_rate": 8.254693049213598e-06, "loss": 0.5679, "step": 2816 }, { "epoch": 2.5726027397260274, "grad_norm": 0.4373197555541992, "learning_rate": 8.253678335870117e-06, "loss": 0.0034, "step": 2817 }, { "epoch": 2.57351598173516, "grad_norm": 2.902284622192383, "learning_rate": 8.252663622526636e-06, "loss": 0.013, "step": 2818 }, { "epoch": 2.5744292237442923, "grad_norm": 74.0420150756836, "learning_rate": 8.251648909183157e-06, "loss": 1.1067, "step": 2819 }, { "epoch": 2.5753424657534247, "grad_norm": 1.0027633905410767, "learning_rate": 8.250634195839676e-06, "loss": 0.0061, "step": 2820 }, { "epoch": 2.576255707762557, "grad_norm": 77.19893646240234, "learning_rate": 8.249619482496194e-06, "loss": 0.5663, "step": 2821 }, { "epoch": 2.5771689497716896, "grad_norm": 0.5517758727073669, "learning_rate": 8.248604769152715e-06, "loss": 0.0052, "step": 2822 }, { "epoch": 2.5780821917808217, "grad_norm": 2.15366530418396, "learning_rate": 8.247590055809235e-06, "loss": 0.0174, "step": 2823 }, { "epoch": 2.578995433789954, "grad_norm": 44.69366455078125, "learning_rate": 8.246575342465754e-06, "loss": 0.6709, "step": 2824 }, { "epoch": 2.5799086757990866, "grad_norm": 0.6084710955619812, "learning_rate": 8.245560629122273e-06, "loss": 0.0049, "step": 2825 }, { "epoch": 2.580821917808219, "grad_norm": 6.343081951141357, "learning_rate": 8.244545915778794e-06, "loss": 0.0437, "step": 2826 }, { "epoch": 2.5817351598173515, "grad_norm": 14.80178165435791, "learning_rate": 8.243531202435313e-06, "loss": 0.0984, "step": 2827 }, { "epoch": 2.582648401826484, "grad_norm": 1.299707293510437, "learning_rate": 8.242516489091831e-06, "loss": 0.0105, "step": 2828 }, { "epoch": 2.5835616438356164, "grad_norm": 8.09721565246582, "learning_rate": 8.241501775748352e-06, "loss": 0.0743, "step": 2829 }, { "epoch": 2.584474885844749, "grad_norm": 15.408824920654297, "learning_rate": 8.240487062404872e-06, "loss": 0.0952, "step": 2830 }, { "epoch": 2.5853881278538813, "grad_norm": 57.28983688354492, "learning_rate": 8.23947234906139e-06, "loss": 0.3112, "step": 2831 }, { "epoch": 2.5863013698630137, "grad_norm": 0.7079506516456604, "learning_rate": 8.23845763571791e-06, "loss": 0.0039, "step": 2832 }, { "epoch": 2.587214611872146, "grad_norm": 4.121103763580322, "learning_rate": 8.23744292237443e-06, "loss": 0.0316, "step": 2833 }, { "epoch": 2.5881278538812786, "grad_norm": 0.9222578406333923, "learning_rate": 8.23642820903095e-06, "loss": 0.0085, "step": 2834 }, { "epoch": 2.589041095890411, "grad_norm": 34.55817413330078, "learning_rate": 8.235413495687468e-06, "loss": 0.532, "step": 2835 }, { "epoch": 2.5899543378995435, "grad_norm": 16.70937728881836, "learning_rate": 8.234398782343989e-06, "loss": 0.0782, "step": 2836 }, { "epoch": 2.590867579908676, "grad_norm": 2.020969867706299, "learning_rate": 8.233384069000508e-06, "loss": 0.0119, "step": 2837 }, { "epoch": 2.5917808219178085, "grad_norm": 7.623410224914551, "learning_rate": 8.232369355657027e-06, "loss": 0.0364, "step": 2838 }, { "epoch": 2.592694063926941, "grad_norm": 67.67659759521484, "learning_rate": 8.231354642313547e-06, "loss": 0.6709, "step": 2839 }, { "epoch": 2.593607305936073, "grad_norm": 42.45478439331055, "learning_rate": 8.230339928970068e-06, "loss": 0.3111, "step": 2840 }, { "epoch": 2.5945205479452054, "grad_norm": 0.25732696056365967, "learning_rate": 8.229325215626586e-06, "loss": 0.0021, "step": 2841 }, { "epoch": 2.595433789954338, "grad_norm": 5.327232360839844, "learning_rate": 8.228310502283105e-06, "loss": 0.038, "step": 2842 }, { "epoch": 2.5963470319634703, "grad_norm": 46.66969299316406, "learning_rate": 8.227295788939626e-06, "loss": 0.3866, "step": 2843 }, { "epoch": 2.5972602739726027, "grad_norm": 1.0748778581619263, "learning_rate": 8.226281075596145e-06, "loss": 0.0078, "step": 2844 }, { "epoch": 2.598173515981735, "grad_norm": 7.80003023147583, "learning_rate": 8.225266362252664e-06, "loss": 0.0713, "step": 2845 }, { "epoch": 2.5990867579908676, "grad_norm": 79.63758087158203, "learning_rate": 8.224251648909184e-06, "loss": 0.1469, "step": 2846 }, { "epoch": 2.6, "grad_norm": 0.11063020676374435, "learning_rate": 8.223236935565703e-06, "loss": 0.0011, "step": 2847 }, { "epoch": 2.6009132420091325, "grad_norm": 0.26344406604766846, "learning_rate": 8.222222222222222e-06, "loss": 0.0023, "step": 2848 }, { "epoch": 2.601826484018265, "grad_norm": 25.2447509765625, "learning_rate": 8.221207508878742e-06, "loss": 0.2656, "step": 2849 }, { "epoch": 2.602739726027397, "grad_norm": 5.647955417633057, "learning_rate": 8.220192795535263e-06, "loss": 0.0266, "step": 2850 }, { "epoch": 2.6036529680365295, "grad_norm": 17.57743263244629, "learning_rate": 8.219178082191782e-06, "loss": 0.2349, "step": 2851 }, { "epoch": 2.604566210045662, "grad_norm": 13.09521198272705, "learning_rate": 8.2181633688483e-06, "loss": 0.0812, "step": 2852 }, { "epoch": 2.6054794520547944, "grad_norm": 26.925092697143555, "learning_rate": 8.217148655504821e-06, "loss": 0.205, "step": 2853 }, { "epoch": 2.606392694063927, "grad_norm": 1.007900357246399, "learning_rate": 8.21613394216134e-06, "loss": 0.0093, "step": 2854 }, { "epoch": 2.6073059360730593, "grad_norm": 15.440055847167969, "learning_rate": 8.215119228817859e-06, "loss": 0.0981, "step": 2855 }, { "epoch": 2.6082191780821917, "grad_norm": 3.810971736907959, "learning_rate": 8.21410451547438e-06, "loss": 0.0213, "step": 2856 }, { "epoch": 2.609132420091324, "grad_norm": 5.649807929992676, "learning_rate": 8.2130898021309e-06, "loss": 0.0471, "step": 2857 }, { "epoch": 2.6100456621004566, "grad_norm": 39.905582427978516, "learning_rate": 8.212075088787419e-06, "loss": 0.3609, "step": 2858 }, { "epoch": 2.610958904109589, "grad_norm": 23.947254180908203, "learning_rate": 8.211060375443938e-06, "loss": 0.1669, "step": 2859 }, { "epoch": 2.6118721461187215, "grad_norm": 44.56889724731445, "learning_rate": 8.210045662100458e-06, "loss": 0.2903, "step": 2860 }, { "epoch": 2.612785388127854, "grad_norm": 57.517784118652344, "learning_rate": 8.209030948756977e-06, "loss": 0.7538, "step": 2861 }, { "epoch": 2.6136986301369864, "grad_norm": 93.6302719116211, "learning_rate": 8.208016235413496e-06, "loss": 2.8765, "step": 2862 }, { "epoch": 2.614611872146119, "grad_norm": 47.82414245605469, "learning_rate": 8.207001522070016e-06, "loss": 0.6292, "step": 2863 }, { "epoch": 2.6155251141552514, "grad_norm": 3.3950881958007812, "learning_rate": 8.205986808726535e-06, "loss": 0.0166, "step": 2864 }, { "epoch": 2.616438356164384, "grad_norm": 164.77734375, "learning_rate": 8.204972095383054e-06, "loss": 1.0318, "step": 2865 }, { "epoch": 2.6173515981735163, "grad_norm": 23.321969985961914, "learning_rate": 8.203957382039575e-06, "loss": 0.2817, "step": 2866 }, { "epoch": 2.6182648401826483, "grad_norm": 0.6691644787788391, "learning_rate": 8.202942668696095e-06, "loss": 0.0056, "step": 2867 }, { "epoch": 2.6191780821917807, "grad_norm": 19.329591751098633, "learning_rate": 8.201927955352614e-06, "loss": 0.1805, "step": 2868 }, { "epoch": 2.620091324200913, "grad_norm": 15.866915702819824, "learning_rate": 8.200913242009133e-06, "loss": 0.1363, "step": 2869 }, { "epoch": 2.6210045662100456, "grad_norm": 72.39543151855469, "learning_rate": 8.199898528665653e-06, "loss": 0.6908, "step": 2870 }, { "epoch": 2.621917808219178, "grad_norm": 0.4266784191131592, "learning_rate": 8.198883815322172e-06, "loss": 0.0036, "step": 2871 }, { "epoch": 2.6228310502283105, "grad_norm": 35.06610870361328, "learning_rate": 8.197869101978691e-06, "loss": 0.2718, "step": 2872 }, { "epoch": 2.623744292237443, "grad_norm": 4.348167419433594, "learning_rate": 8.196854388635212e-06, "loss": 0.044, "step": 2873 }, { "epoch": 2.6246575342465754, "grad_norm": 2.9086625576019287, "learning_rate": 8.195839675291732e-06, "loss": 0.0241, "step": 2874 }, { "epoch": 2.625570776255708, "grad_norm": 37.66954803466797, "learning_rate": 8.19482496194825e-06, "loss": 0.3621, "step": 2875 }, { "epoch": 2.6264840182648403, "grad_norm": 5.513151168823242, "learning_rate": 8.19381024860477e-06, "loss": 0.0359, "step": 2876 }, { "epoch": 2.6273972602739724, "grad_norm": 3.366361618041992, "learning_rate": 8.19279553526129e-06, "loss": 0.0217, "step": 2877 }, { "epoch": 2.628310502283105, "grad_norm": 0.9314400553703308, "learning_rate": 8.19178082191781e-06, "loss": 0.0059, "step": 2878 }, { "epoch": 2.6292237442922373, "grad_norm": 110.93016815185547, "learning_rate": 8.190766108574328e-06, "loss": 2.4018, "step": 2879 }, { "epoch": 2.6301369863013697, "grad_norm": 56.838741302490234, "learning_rate": 8.189751395230849e-06, "loss": 1.1005, "step": 2880 }, { "epoch": 2.631050228310502, "grad_norm": 4.885944843292236, "learning_rate": 8.188736681887367e-06, "loss": 0.0518, "step": 2881 }, { "epoch": 2.6319634703196346, "grad_norm": 17.457500457763672, "learning_rate": 8.187721968543886e-06, "loss": 0.1266, "step": 2882 }, { "epoch": 2.632876712328767, "grad_norm": 1.7778713703155518, "learning_rate": 8.186707255200407e-06, "loss": 0.0105, "step": 2883 }, { "epoch": 2.6337899543378995, "grad_norm": 7.325479984283447, "learning_rate": 8.185692541856927e-06, "loss": 0.0748, "step": 2884 }, { "epoch": 2.634703196347032, "grad_norm": 8.376510620117188, "learning_rate": 8.184677828513446e-06, "loss": 0.0582, "step": 2885 }, { "epoch": 2.6356164383561644, "grad_norm": 60.184207916259766, "learning_rate": 8.183663115169965e-06, "loss": 0.4618, "step": 2886 }, { "epoch": 2.636529680365297, "grad_norm": 82.42273712158203, "learning_rate": 8.182648401826486e-06, "loss": 0.9238, "step": 2887 }, { "epoch": 2.6374429223744293, "grad_norm": 3.972982406616211, "learning_rate": 8.181633688483004e-06, "loss": 0.0298, "step": 2888 }, { "epoch": 2.638356164383562, "grad_norm": 2.111670970916748, "learning_rate": 8.180618975139523e-06, "loss": 0.0123, "step": 2889 }, { "epoch": 2.6392694063926943, "grad_norm": 123.10643768310547, "learning_rate": 8.179604261796044e-06, "loss": 0.4993, "step": 2890 }, { "epoch": 2.6401826484018267, "grad_norm": 3.7031285762786865, "learning_rate": 8.178589548452563e-06, "loss": 0.0292, "step": 2891 }, { "epoch": 2.641095890410959, "grad_norm": 0.3018522262573242, "learning_rate": 8.177574835109081e-06, "loss": 0.0018, "step": 2892 }, { "epoch": 2.642009132420091, "grad_norm": 0.22862352430820465, "learning_rate": 8.176560121765602e-06, "loss": 0.0013, "step": 2893 }, { "epoch": 2.6429223744292236, "grad_norm": 27.63709259033203, "learning_rate": 8.175545408422123e-06, "loss": 0.1651, "step": 2894 }, { "epoch": 2.643835616438356, "grad_norm": 1.129773736000061, "learning_rate": 8.174530695078641e-06, "loss": 0.0081, "step": 2895 }, { "epoch": 2.6447488584474885, "grad_norm": 41.250396728515625, "learning_rate": 8.17351598173516e-06, "loss": 0.2898, "step": 2896 }, { "epoch": 2.645662100456621, "grad_norm": 32.897705078125, "learning_rate": 8.17250126839168e-06, "loss": 0.3251, "step": 2897 }, { "epoch": 2.6465753424657534, "grad_norm": 6.170634746551514, "learning_rate": 8.1714865550482e-06, "loss": 0.0614, "step": 2898 }, { "epoch": 2.647488584474886, "grad_norm": 2.028827428817749, "learning_rate": 8.170471841704718e-06, "loss": 0.0205, "step": 2899 }, { "epoch": 2.6484018264840183, "grad_norm": 16.339832305908203, "learning_rate": 8.169457128361239e-06, "loss": 0.105, "step": 2900 }, { "epoch": 2.649315068493151, "grad_norm": 20.583698272705078, "learning_rate": 8.168442415017758e-06, "loss": 0.2145, "step": 2901 }, { "epoch": 2.6502283105022832, "grad_norm": 72.4263916015625, "learning_rate": 8.167427701674278e-06, "loss": 0.631, "step": 2902 }, { "epoch": 2.6511415525114153, "grad_norm": 93.81385040283203, "learning_rate": 8.166412988330797e-06, "loss": 2.5109, "step": 2903 }, { "epoch": 2.6520547945205477, "grad_norm": 6.704494953155518, "learning_rate": 8.165398274987318e-06, "loss": 0.0447, "step": 2904 }, { "epoch": 2.65296803652968, "grad_norm": 53.382747650146484, "learning_rate": 8.164383561643837e-06, "loss": 0.4708, "step": 2905 }, { "epoch": 2.6538812785388126, "grad_norm": 97.28400421142578, "learning_rate": 8.163368848300355e-06, "loss": 1.9943, "step": 2906 }, { "epoch": 2.654794520547945, "grad_norm": 25.23044204711914, "learning_rate": 8.162354134956876e-06, "loss": 0.2213, "step": 2907 }, { "epoch": 2.6557077625570775, "grad_norm": 4.403652667999268, "learning_rate": 8.161339421613395e-06, "loss": 0.0437, "step": 2908 }, { "epoch": 2.65662100456621, "grad_norm": 10.921778678894043, "learning_rate": 8.160324708269914e-06, "loss": 0.0725, "step": 2909 }, { "epoch": 2.6575342465753424, "grad_norm": 50.69652557373047, "learning_rate": 8.159309994926434e-06, "loss": 0.61, "step": 2910 }, { "epoch": 2.658447488584475, "grad_norm": 24.78170394897461, "learning_rate": 8.158295281582953e-06, "loss": 0.1411, "step": 2911 }, { "epoch": 2.6593607305936073, "grad_norm": 5.458840847015381, "learning_rate": 8.157280568239474e-06, "loss": 0.033, "step": 2912 }, { "epoch": 2.66027397260274, "grad_norm": 5.758886814117432, "learning_rate": 8.156265854895992e-06, "loss": 0.0518, "step": 2913 }, { "epoch": 2.6611872146118722, "grad_norm": 45.41533660888672, "learning_rate": 8.155251141552513e-06, "loss": 0.7189, "step": 2914 }, { "epoch": 2.6621004566210047, "grad_norm": 16.058372497558594, "learning_rate": 8.154236428209032e-06, "loss": 0.158, "step": 2915 }, { "epoch": 2.663013698630137, "grad_norm": 2.5350067615509033, "learning_rate": 8.15322171486555e-06, "loss": 0.0146, "step": 2916 }, { "epoch": 2.6639269406392696, "grad_norm": 55.924476623535156, "learning_rate": 8.152207001522071e-06, "loss": 0.6063, "step": 2917 }, { "epoch": 2.664840182648402, "grad_norm": 38.79718780517578, "learning_rate": 8.15119228817859e-06, "loss": 0.3665, "step": 2918 }, { "epoch": 2.6657534246575345, "grad_norm": 10.930707931518555, "learning_rate": 8.150177574835109e-06, "loss": 0.1003, "step": 2919 }, { "epoch": 2.6666666666666665, "grad_norm": 56.050533294677734, "learning_rate": 8.14916286149163e-06, "loss": 0.5651, "step": 2920 }, { "epoch": 2.667579908675799, "grad_norm": 3.0187854766845703, "learning_rate": 8.148148148148148e-06, "loss": 0.0114, "step": 2921 }, { "epoch": 2.6684931506849314, "grad_norm": 11.669353485107422, "learning_rate": 8.147133434804669e-06, "loss": 0.076, "step": 2922 }, { "epoch": 2.669406392694064, "grad_norm": 45.291748046875, "learning_rate": 8.146118721461188e-06, "loss": 0.6286, "step": 2923 }, { "epoch": 2.6703196347031963, "grad_norm": 142.81967163085938, "learning_rate": 8.145104008117708e-06, "loss": 2.4246, "step": 2924 }, { "epoch": 2.671232876712329, "grad_norm": 44.89841842651367, "learning_rate": 8.144089294774227e-06, "loss": 0.1072, "step": 2925 }, { "epoch": 2.6721461187214612, "grad_norm": 38.193241119384766, "learning_rate": 8.143074581430746e-06, "loss": 0.2871, "step": 2926 }, { "epoch": 2.6730593607305937, "grad_norm": 0.567014217376709, "learning_rate": 8.142059868087266e-06, "loss": 0.0047, "step": 2927 }, { "epoch": 2.673972602739726, "grad_norm": 61.0821533203125, "learning_rate": 8.141045154743785e-06, "loss": 1.2836, "step": 2928 }, { "epoch": 2.6748858447488586, "grad_norm": 21.975589752197266, "learning_rate": 8.140030441400306e-06, "loss": 0.2138, "step": 2929 }, { "epoch": 2.6757990867579906, "grad_norm": 128.5587158203125, "learning_rate": 8.139015728056825e-06, "loss": 3.1834, "step": 2930 }, { "epoch": 2.676712328767123, "grad_norm": 9.727609634399414, "learning_rate": 8.138001014713344e-06, "loss": 0.079, "step": 2931 }, { "epoch": 2.6776255707762555, "grad_norm": 29.7974853515625, "learning_rate": 8.136986301369864e-06, "loss": 0.3243, "step": 2932 }, { "epoch": 2.678538812785388, "grad_norm": 1.0726855993270874, "learning_rate": 8.135971588026383e-06, "loss": 0.0072, "step": 2933 }, { "epoch": 2.6794520547945204, "grad_norm": 1.8915883302688599, "learning_rate": 8.134956874682903e-06, "loss": 0.0071, "step": 2934 }, { "epoch": 2.680365296803653, "grad_norm": 96.72482299804688, "learning_rate": 8.133942161339422e-06, "loss": 1.3131, "step": 2935 }, { "epoch": 2.6812785388127853, "grad_norm": 1.464916467666626, "learning_rate": 8.132927447995941e-06, "loss": 0.0118, "step": 2936 }, { "epoch": 2.682191780821918, "grad_norm": 15.304017066955566, "learning_rate": 8.131912734652462e-06, "loss": 0.0951, "step": 2937 }, { "epoch": 2.6831050228310502, "grad_norm": 7.822527885437012, "learning_rate": 8.13089802130898e-06, "loss": 0.0597, "step": 2938 }, { "epoch": 2.6840182648401827, "grad_norm": 53.69606018066406, "learning_rate": 8.129883307965501e-06, "loss": 0.6138, "step": 2939 }, { "epoch": 2.684931506849315, "grad_norm": 29.976083755493164, "learning_rate": 8.12886859462202e-06, "loss": 0.2337, "step": 2940 }, { "epoch": 2.6858447488584476, "grad_norm": 13.870539665222168, "learning_rate": 8.127853881278539e-06, "loss": 0.0657, "step": 2941 }, { "epoch": 2.68675799086758, "grad_norm": 67.52738952636719, "learning_rate": 8.12683916793506e-06, "loss": 0.766, "step": 2942 }, { "epoch": 2.6876712328767125, "grad_norm": 13.854265213012695, "learning_rate": 8.125824454591578e-06, "loss": 0.1336, "step": 2943 }, { "epoch": 2.688584474885845, "grad_norm": 82.02283477783203, "learning_rate": 8.124809741248099e-06, "loss": 0.5965, "step": 2944 }, { "epoch": 2.6894977168949774, "grad_norm": 1.435977816581726, "learning_rate": 8.123795027904618e-06, "loss": 0.0136, "step": 2945 }, { "epoch": 2.69041095890411, "grad_norm": 1.0773353576660156, "learning_rate": 8.122780314561138e-06, "loss": 0.0083, "step": 2946 }, { "epoch": 2.691324200913242, "grad_norm": 6.072841167449951, "learning_rate": 8.121765601217657e-06, "loss": 0.0315, "step": 2947 }, { "epoch": 2.6922374429223743, "grad_norm": 30.965953826904297, "learning_rate": 8.120750887874176e-06, "loss": 0.2122, "step": 2948 }, { "epoch": 2.6931506849315068, "grad_norm": 10.535497665405273, "learning_rate": 8.119736174530696e-06, "loss": 0.0331, "step": 2949 }, { "epoch": 2.6940639269406392, "grad_norm": 2.1066067218780518, "learning_rate": 8.118721461187215e-06, "loss": 0.0167, "step": 2950 }, { "epoch": 2.6949771689497717, "grad_norm": 46.97057342529297, "learning_rate": 8.117706747843734e-06, "loss": 0.424, "step": 2951 }, { "epoch": 2.695890410958904, "grad_norm": 55.260459899902344, "learning_rate": 8.116692034500255e-06, "loss": 0.3524, "step": 2952 }, { "epoch": 2.6968036529680366, "grad_norm": 78.2962646484375, "learning_rate": 8.115677321156773e-06, "loss": 0.3373, "step": 2953 }, { "epoch": 2.697716894977169, "grad_norm": 2.7914175987243652, "learning_rate": 8.114662607813294e-06, "loss": 0.0183, "step": 2954 }, { "epoch": 2.6986301369863015, "grad_norm": 2.872180700302124, "learning_rate": 8.113647894469813e-06, "loss": 0.021, "step": 2955 }, { "epoch": 2.699543378995434, "grad_norm": 10.521028518676758, "learning_rate": 8.112633181126333e-06, "loss": 0.0732, "step": 2956 }, { "epoch": 2.700456621004566, "grad_norm": 102.7131118774414, "learning_rate": 8.111618467782852e-06, "loss": 1.3344, "step": 2957 }, { "epoch": 2.7013698630136984, "grad_norm": 0.3488234579563141, "learning_rate": 8.110603754439371e-06, "loss": 0.0018, "step": 2958 }, { "epoch": 2.702283105022831, "grad_norm": 5.555052280426025, "learning_rate": 8.109589041095892e-06, "loss": 0.038, "step": 2959 }, { "epoch": 2.7031963470319633, "grad_norm": 161.7754364013672, "learning_rate": 8.10857432775241e-06, "loss": 1.8188, "step": 2960 }, { "epoch": 2.7041095890410958, "grad_norm": 9.408537864685059, "learning_rate": 8.10755961440893e-06, "loss": 0.0901, "step": 2961 }, { "epoch": 2.7050228310502282, "grad_norm": 13.825603485107422, "learning_rate": 8.10654490106545e-06, "loss": 0.0853, "step": 2962 }, { "epoch": 2.7059360730593607, "grad_norm": 15.09700870513916, "learning_rate": 8.105530187721969e-06, "loss": 0.0666, "step": 2963 }, { "epoch": 2.706849315068493, "grad_norm": 89.93899536132812, "learning_rate": 8.104515474378489e-06, "loss": 1.6856, "step": 2964 }, { "epoch": 2.7077625570776256, "grad_norm": 107.51963806152344, "learning_rate": 8.103500761035008e-06, "loss": 1.8938, "step": 2965 }, { "epoch": 2.708675799086758, "grad_norm": 1.6585386991500854, "learning_rate": 8.102486047691529e-06, "loss": 0.0116, "step": 2966 }, { "epoch": 2.7095890410958905, "grad_norm": 12.070363998413086, "learning_rate": 8.101471334348047e-06, "loss": 0.1085, "step": 2967 }, { "epoch": 2.710502283105023, "grad_norm": 8.27624225616455, "learning_rate": 8.100456621004566e-06, "loss": 0.0585, "step": 2968 }, { "epoch": 2.7114155251141554, "grad_norm": 0.3327777683734894, "learning_rate": 8.099441907661087e-06, "loss": 0.002, "step": 2969 }, { "epoch": 2.712328767123288, "grad_norm": 49.30712890625, "learning_rate": 8.098427194317606e-06, "loss": 0.6371, "step": 2970 }, { "epoch": 2.7132420091324203, "grad_norm": 2.874703884124756, "learning_rate": 8.097412480974124e-06, "loss": 0.0183, "step": 2971 }, { "epoch": 2.7141552511415528, "grad_norm": 71.7579574584961, "learning_rate": 8.096397767630645e-06, "loss": 0.7374, "step": 2972 }, { "epoch": 2.7150684931506848, "grad_norm": 36.315521240234375, "learning_rate": 8.095383054287166e-06, "loss": 0.0974, "step": 2973 }, { "epoch": 2.7159817351598172, "grad_norm": 208.60629272460938, "learning_rate": 8.094368340943684e-06, "loss": 8.8103, "step": 2974 }, { "epoch": 2.7168949771689497, "grad_norm": 30.744543075561523, "learning_rate": 8.093353627600203e-06, "loss": 0.1497, "step": 2975 }, { "epoch": 2.717808219178082, "grad_norm": 23.263673782348633, "learning_rate": 8.092338914256724e-06, "loss": 0.1401, "step": 2976 }, { "epoch": 2.7187214611872146, "grad_norm": 69.95497131347656, "learning_rate": 8.091324200913243e-06, "loss": 1.8951, "step": 2977 }, { "epoch": 2.719634703196347, "grad_norm": 54.90087890625, "learning_rate": 8.090309487569761e-06, "loss": 0.3253, "step": 2978 }, { "epoch": 2.7205479452054795, "grad_norm": 83.96247863769531, "learning_rate": 8.089294774226282e-06, "loss": 3.0474, "step": 2979 }, { "epoch": 2.721461187214612, "grad_norm": 9.045310020446777, "learning_rate": 8.0882800608828e-06, "loss": 0.0563, "step": 2980 }, { "epoch": 2.7223744292237444, "grad_norm": 2.2316513061523438, "learning_rate": 8.08726534753932e-06, "loss": 0.019, "step": 2981 }, { "epoch": 2.723287671232877, "grad_norm": 78.92584228515625, "learning_rate": 8.08625063419584e-06, "loss": 0.7824, "step": 2982 }, { "epoch": 2.724200913242009, "grad_norm": 65.18009185791016, "learning_rate": 8.08523592085236e-06, "loss": 1.108, "step": 2983 }, { "epoch": 2.7251141552511413, "grad_norm": 18.598474502563477, "learning_rate": 8.08422120750888e-06, "loss": 0.1368, "step": 2984 }, { "epoch": 2.7260273972602738, "grad_norm": 1.706636905670166, "learning_rate": 8.083206494165398e-06, "loss": 0.0108, "step": 2985 }, { "epoch": 2.726940639269406, "grad_norm": 41.91183090209961, "learning_rate": 8.082191780821919e-06, "loss": 0.4324, "step": 2986 }, { "epoch": 2.7278538812785387, "grad_norm": 3.016400098800659, "learning_rate": 8.081177067478438e-06, "loss": 0.0209, "step": 2987 }, { "epoch": 2.728767123287671, "grad_norm": 57.540809631347656, "learning_rate": 8.080162354134957e-06, "loss": 0.5466, "step": 2988 }, { "epoch": 2.7296803652968036, "grad_norm": 31.535541534423828, "learning_rate": 8.079147640791477e-06, "loss": 0.2372, "step": 2989 }, { "epoch": 2.730593607305936, "grad_norm": 78.12457275390625, "learning_rate": 8.078132927447998e-06, "loss": 0.8918, "step": 2990 }, { "epoch": 2.7315068493150685, "grad_norm": 8.253474235534668, "learning_rate": 8.077118214104515e-06, "loss": 0.0597, "step": 2991 }, { "epoch": 2.732420091324201, "grad_norm": 7.077469825744629, "learning_rate": 8.076103500761035e-06, "loss": 0.0378, "step": 2992 }, { "epoch": 2.7333333333333334, "grad_norm": 9.036526679992676, "learning_rate": 8.075088787417556e-06, "loss": 0.0888, "step": 2993 }, { "epoch": 2.734246575342466, "grad_norm": 30.60350799560547, "learning_rate": 8.074074074074075e-06, "loss": 0.2042, "step": 2994 }, { "epoch": 2.7351598173515983, "grad_norm": 0.25081878900527954, "learning_rate": 8.073059360730594e-06, "loss": 0.0016, "step": 2995 }, { "epoch": 2.7360730593607308, "grad_norm": 79.36372375488281, "learning_rate": 8.072044647387114e-06, "loss": 0.6916, "step": 2996 }, { "epoch": 2.736986301369863, "grad_norm": 64.79337310791016, "learning_rate": 8.071029934043633e-06, "loss": 0.9889, "step": 2997 }, { "epoch": 2.7378995433789957, "grad_norm": 59.537288665771484, "learning_rate": 8.070015220700152e-06, "loss": 0.8112, "step": 2998 }, { "epoch": 2.738812785388128, "grad_norm": 13.158120155334473, "learning_rate": 8.069000507356672e-06, "loss": 0.0856, "step": 2999 }, { "epoch": 2.73972602739726, "grad_norm": 15.083517074584961, "learning_rate": 8.067985794013193e-06, "loss": 0.1006, "step": 3000 }, { "epoch": 2.7406392694063926, "grad_norm": 21.489288330078125, "learning_rate": 8.066971080669712e-06, "loss": 0.1981, "step": 3001 }, { "epoch": 2.741552511415525, "grad_norm": 6.446374893188477, "learning_rate": 8.06595636732623e-06, "loss": 0.0511, "step": 3002 }, { "epoch": 2.7424657534246575, "grad_norm": 4.437100887298584, "learning_rate": 8.064941653982751e-06, "loss": 0.0167, "step": 3003 }, { "epoch": 2.74337899543379, "grad_norm": 47.679439544677734, "learning_rate": 8.06392694063927e-06, "loss": 0.3046, "step": 3004 }, { "epoch": 2.7442922374429224, "grad_norm": 79.41419219970703, "learning_rate": 8.062912227295789e-06, "loss": 2.1463, "step": 3005 }, { "epoch": 2.745205479452055, "grad_norm": 8.081478118896484, "learning_rate": 8.06189751395231e-06, "loss": 0.0692, "step": 3006 }, { "epoch": 2.7461187214611873, "grad_norm": 9.28551959991455, "learning_rate": 8.060882800608828e-06, "loss": 0.0495, "step": 3007 }, { "epoch": 2.7470319634703197, "grad_norm": 110.31033325195312, "learning_rate": 8.059868087265347e-06, "loss": 4.0835, "step": 3008 }, { "epoch": 2.747945205479452, "grad_norm": 11.146405220031738, "learning_rate": 8.058853373921868e-06, "loss": 0.0878, "step": 3009 }, { "epoch": 2.748858447488584, "grad_norm": 0.7324406504631042, "learning_rate": 8.057838660578388e-06, "loss": 0.0043, "step": 3010 }, { "epoch": 2.7497716894977167, "grad_norm": 30.090476989746094, "learning_rate": 8.056823947234907e-06, "loss": 0.3111, "step": 3011 }, { "epoch": 2.750684931506849, "grad_norm": 48.97830581665039, "learning_rate": 8.055809233891426e-06, "loss": 0.6903, "step": 3012 }, { "epoch": 2.7515981735159816, "grad_norm": 28.10066032409668, "learning_rate": 8.054794520547946e-06, "loss": 0.2922, "step": 3013 }, { "epoch": 2.752511415525114, "grad_norm": 2.633758544921875, "learning_rate": 8.053779807204465e-06, "loss": 0.0185, "step": 3014 }, { "epoch": 2.7534246575342465, "grad_norm": 26.312705993652344, "learning_rate": 8.052765093860984e-06, "loss": 0.1306, "step": 3015 }, { "epoch": 2.754337899543379, "grad_norm": 48.58531188964844, "learning_rate": 8.051750380517505e-06, "loss": 0.3235, "step": 3016 }, { "epoch": 2.7552511415525114, "grad_norm": 2.7957756519317627, "learning_rate": 8.050735667174025e-06, "loss": 0.0164, "step": 3017 }, { "epoch": 2.756164383561644, "grad_norm": 30.10289192199707, "learning_rate": 8.049720953830542e-06, "loss": 0.1904, "step": 3018 }, { "epoch": 2.7570776255707763, "grad_norm": 63.22462463378906, "learning_rate": 8.048706240487063e-06, "loss": 0.6887, "step": 3019 }, { "epoch": 2.7579908675799087, "grad_norm": 9.511448860168457, "learning_rate": 8.047691527143583e-06, "loss": 0.0552, "step": 3020 }, { "epoch": 2.758904109589041, "grad_norm": 2.7753303050994873, "learning_rate": 8.046676813800102e-06, "loss": 0.0217, "step": 3021 }, { "epoch": 2.7598173515981737, "grad_norm": 14.360926628112793, "learning_rate": 8.045662100456621e-06, "loss": 0.132, "step": 3022 }, { "epoch": 2.760730593607306, "grad_norm": 2.278665065765381, "learning_rate": 8.044647387113142e-06, "loss": 0.0147, "step": 3023 }, { "epoch": 2.7616438356164386, "grad_norm": 4.681558609008789, "learning_rate": 8.04363267376966e-06, "loss": 0.0319, "step": 3024 }, { "epoch": 2.762557077625571, "grad_norm": 4.032317161560059, "learning_rate": 8.04261796042618e-06, "loss": 0.0301, "step": 3025 }, { "epoch": 2.7634703196347035, "grad_norm": 73.9208755493164, "learning_rate": 8.0416032470827e-06, "loss": 1.5089, "step": 3026 }, { "epoch": 2.7643835616438355, "grad_norm": 1.8896968364715576, "learning_rate": 8.04058853373922e-06, "loss": 0.0145, "step": 3027 }, { "epoch": 2.765296803652968, "grad_norm": 24.294408798217773, "learning_rate": 8.03957382039574e-06, "loss": 0.2239, "step": 3028 }, { "epoch": 2.7662100456621004, "grad_norm": 24.709394454956055, "learning_rate": 8.038559107052258e-06, "loss": 0.2215, "step": 3029 }, { "epoch": 2.767123287671233, "grad_norm": 91.00978088378906, "learning_rate": 8.037544393708779e-06, "loss": 1.556, "step": 3030 }, { "epoch": 2.7680365296803653, "grad_norm": 85.281494140625, "learning_rate": 8.036529680365297e-06, "loss": 1.7473, "step": 3031 }, { "epoch": 2.7689497716894977, "grad_norm": 2.8635058403015137, "learning_rate": 8.035514967021816e-06, "loss": 0.0213, "step": 3032 }, { "epoch": 2.76986301369863, "grad_norm": 58.15858459472656, "learning_rate": 8.034500253678337e-06, "loss": 0.614, "step": 3033 }, { "epoch": 2.7707762557077626, "grad_norm": 0.30131059885025024, "learning_rate": 8.033485540334857e-06, "loss": 0.0023, "step": 3034 }, { "epoch": 2.771689497716895, "grad_norm": 9.100533485412598, "learning_rate": 8.032470826991375e-06, "loss": 0.0791, "step": 3035 }, { "epoch": 2.7726027397260276, "grad_norm": 3.118727445602417, "learning_rate": 8.031456113647895e-06, "loss": 0.0213, "step": 3036 }, { "epoch": 2.7735159817351596, "grad_norm": 1.2788058519363403, "learning_rate": 8.030441400304416e-06, "loss": 0.0115, "step": 3037 }, { "epoch": 2.774429223744292, "grad_norm": 2.0812385082244873, "learning_rate": 8.029426686960934e-06, "loss": 0.0118, "step": 3038 }, { "epoch": 2.7753424657534245, "grad_norm": 34.64133834838867, "learning_rate": 8.028411973617453e-06, "loss": 0.2479, "step": 3039 }, { "epoch": 2.776255707762557, "grad_norm": 37.83424377441406, "learning_rate": 8.027397260273974e-06, "loss": 0.3247, "step": 3040 }, { "epoch": 2.7771689497716894, "grad_norm": 96.18263244628906, "learning_rate": 8.026382546930493e-06, "loss": 1.1631, "step": 3041 }, { "epoch": 2.778082191780822, "grad_norm": 66.52919006347656, "learning_rate": 8.025367833587012e-06, "loss": 0.5055, "step": 3042 }, { "epoch": 2.7789954337899543, "grad_norm": 0.8617256879806519, "learning_rate": 8.024353120243532e-06, "loss": 0.0061, "step": 3043 }, { "epoch": 2.7799086757990867, "grad_norm": 20.408855438232422, "learning_rate": 8.023338406900053e-06, "loss": 0.1676, "step": 3044 }, { "epoch": 2.780821917808219, "grad_norm": 68.33033752441406, "learning_rate": 8.022323693556571e-06, "loss": 0.6755, "step": 3045 }, { "epoch": 2.7817351598173516, "grad_norm": 1.1210901737213135, "learning_rate": 8.02130898021309e-06, "loss": 0.0071, "step": 3046 }, { "epoch": 2.782648401826484, "grad_norm": 1.4695677757263184, "learning_rate": 8.020294266869611e-06, "loss": 0.0107, "step": 3047 }, { "epoch": 2.7835616438356166, "grad_norm": 31.809833526611328, "learning_rate": 8.01927955352613e-06, "loss": 0.2279, "step": 3048 }, { "epoch": 2.784474885844749, "grad_norm": 28.434715270996094, "learning_rate": 8.018264840182649e-06, "loss": 0.1484, "step": 3049 }, { "epoch": 2.7853881278538815, "grad_norm": 4.288319110870361, "learning_rate": 8.017250126839169e-06, "loss": 0.0258, "step": 3050 }, { "epoch": 2.786301369863014, "grad_norm": 92.2739028930664, "learning_rate": 8.016235413495688e-06, "loss": 0.5885, "step": 3051 }, { "epoch": 2.7872146118721464, "grad_norm": 4.557638168334961, "learning_rate": 8.015220700152207e-06, "loss": 0.0406, "step": 3052 }, { "epoch": 2.7881278538812784, "grad_norm": 4.964922904968262, "learning_rate": 8.014205986808727e-06, "loss": 0.0449, "step": 3053 }, { "epoch": 2.789041095890411, "grad_norm": 26.404373168945312, "learning_rate": 8.013191273465248e-06, "loss": 0.1443, "step": 3054 }, { "epoch": 2.7899543378995433, "grad_norm": 0.2738451361656189, "learning_rate": 8.012176560121767e-06, "loss": 0.0018, "step": 3055 }, { "epoch": 2.7908675799086757, "grad_norm": 50.813392639160156, "learning_rate": 8.011161846778286e-06, "loss": 0.6556, "step": 3056 }, { "epoch": 2.791780821917808, "grad_norm": 56.69446563720703, "learning_rate": 8.010147133434806e-06, "loss": 0.8016, "step": 3057 }, { "epoch": 2.7926940639269406, "grad_norm": 27.71690559387207, "learning_rate": 8.009132420091325e-06, "loss": 0.2757, "step": 3058 }, { "epoch": 2.793607305936073, "grad_norm": 0.13136719167232513, "learning_rate": 8.008117706747844e-06, "loss": 0.0011, "step": 3059 }, { "epoch": 2.7945205479452055, "grad_norm": 99.74613952636719, "learning_rate": 8.007102993404364e-06, "loss": 1.9079, "step": 3060 }, { "epoch": 2.795433789954338, "grad_norm": 108.78013610839844, "learning_rate": 8.006088280060883e-06, "loss": 0.6284, "step": 3061 }, { "epoch": 2.7963470319634705, "grad_norm": 89.15933227539062, "learning_rate": 8.005073566717404e-06, "loss": 0.6018, "step": 3062 }, { "epoch": 2.7972602739726025, "grad_norm": 38.616329193115234, "learning_rate": 8.004058853373923e-06, "loss": 0.3626, "step": 3063 }, { "epoch": 2.798173515981735, "grad_norm": 50.63517761230469, "learning_rate": 8.003044140030443e-06, "loss": 0.5395, "step": 3064 }, { "epoch": 2.7990867579908674, "grad_norm": 0.6406169533729553, "learning_rate": 8.002029426686962e-06, "loss": 0.0024, "step": 3065 }, { "epoch": 2.8, "grad_norm": 0.45366379618644714, "learning_rate": 8.00101471334348e-06, "loss": 0.0026, "step": 3066 }, { "epoch": 2.8009132420091323, "grad_norm": 1.5288500785827637, "learning_rate": 8.000000000000001e-06, "loss": 0.0081, "step": 3067 }, { "epoch": 2.8018264840182647, "grad_norm": 3.149904489517212, "learning_rate": 7.99898528665652e-06, "loss": 0.0202, "step": 3068 }, { "epoch": 2.802739726027397, "grad_norm": 58.15799331665039, "learning_rate": 7.997970573313039e-06, "loss": 0.6071, "step": 3069 }, { "epoch": 2.8036529680365296, "grad_norm": 6.680530548095703, "learning_rate": 7.99695585996956e-06, "loss": 0.0448, "step": 3070 }, { "epoch": 2.804566210045662, "grad_norm": 22.226484298706055, "learning_rate": 7.995941146626078e-06, "loss": 0.2003, "step": 3071 }, { "epoch": 2.8054794520547945, "grad_norm": 0.19896382093429565, "learning_rate": 7.994926433282599e-06, "loss": 0.0018, "step": 3072 }, { "epoch": 2.806392694063927, "grad_norm": 7.358794212341309, "learning_rate": 7.993911719939118e-06, "loss": 0.0784, "step": 3073 }, { "epoch": 2.8073059360730594, "grad_norm": 93.14344787597656, "learning_rate": 7.992897006595637e-06, "loss": 4.5755, "step": 3074 }, { "epoch": 2.808219178082192, "grad_norm": 56.34929656982422, "learning_rate": 7.991882293252157e-06, "loss": 0.5439, "step": 3075 }, { "epoch": 2.8091324200913244, "grad_norm": 73.88063049316406, "learning_rate": 7.990867579908676e-06, "loss": 2.3959, "step": 3076 }, { "epoch": 2.810045662100457, "grad_norm": 13.251025199890137, "learning_rate": 7.989852866565197e-06, "loss": 0.122, "step": 3077 }, { "epoch": 2.8109589041095893, "grad_norm": 19.927377700805664, "learning_rate": 7.988838153221715e-06, "loss": 0.1396, "step": 3078 }, { "epoch": 2.8118721461187217, "grad_norm": 17.410757064819336, "learning_rate": 7.987823439878234e-06, "loss": 0.1194, "step": 3079 }, { "epoch": 2.8127853881278537, "grad_norm": 7.258286476135254, "learning_rate": 7.986808726534755e-06, "loss": 0.0697, "step": 3080 }, { "epoch": 2.813698630136986, "grad_norm": 35.21329116821289, "learning_rate": 7.985794013191274e-06, "loss": 0.24, "step": 3081 }, { "epoch": 2.8146118721461186, "grad_norm": 33.792205810546875, "learning_rate": 7.984779299847794e-06, "loss": 0.2439, "step": 3082 }, { "epoch": 2.815525114155251, "grad_norm": 10.46406078338623, "learning_rate": 7.983764586504313e-06, "loss": 0.0787, "step": 3083 }, { "epoch": 2.8164383561643835, "grad_norm": 1.9029821157455444, "learning_rate": 7.982749873160832e-06, "loss": 0.0125, "step": 3084 }, { "epoch": 2.817351598173516, "grad_norm": 69.59188079833984, "learning_rate": 7.981735159817352e-06, "loss": 0.6321, "step": 3085 }, { "epoch": 2.8182648401826484, "grad_norm": 10.069334983825684, "learning_rate": 7.980720446473871e-06, "loss": 0.0671, "step": 3086 }, { "epoch": 2.819178082191781, "grad_norm": 29.72774887084961, "learning_rate": 7.979705733130392e-06, "loss": 0.4181, "step": 3087 }, { "epoch": 2.8200913242009134, "grad_norm": 14.060053825378418, "learning_rate": 7.97869101978691e-06, "loss": 0.1394, "step": 3088 }, { "epoch": 2.821004566210046, "grad_norm": 89.87550354003906, "learning_rate": 7.977676306443431e-06, "loss": 3.3856, "step": 3089 }, { "epoch": 2.821917808219178, "grad_norm": 2.248884677886963, "learning_rate": 7.97666159309995e-06, "loss": 0.0124, "step": 3090 }, { "epoch": 2.8228310502283103, "grad_norm": 6.341608047485352, "learning_rate": 7.975646879756469e-06, "loss": 0.058, "step": 3091 }, { "epoch": 2.8237442922374427, "grad_norm": 12.606329917907715, "learning_rate": 7.97463216641299e-06, "loss": 0.1102, "step": 3092 }, { "epoch": 2.824657534246575, "grad_norm": 7.1816582679748535, "learning_rate": 7.973617453069508e-06, "loss": 0.0685, "step": 3093 }, { "epoch": 2.8255707762557076, "grad_norm": 145.23634338378906, "learning_rate": 7.972602739726027e-06, "loss": 1.0972, "step": 3094 }, { "epoch": 2.82648401826484, "grad_norm": 0.04162093624472618, "learning_rate": 7.971588026382548e-06, "loss": 0.0003, "step": 3095 }, { "epoch": 2.8273972602739725, "grad_norm": 21.174360275268555, "learning_rate": 7.970573313039066e-06, "loss": 0.1163, "step": 3096 }, { "epoch": 2.828310502283105, "grad_norm": 17.638214111328125, "learning_rate": 7.969558599695587e-06, "loss": 0.161, "step": 3097 }, { "epoch": 2.8292237442922374, "grad_norm": 12.50955867767334, "learning_rate": 7.968543886352106e-06, "loss": 0.1166, "step": 3098 }, { "epoch": 2.83013698630137, "grad_norm": 56.9157600402832, "learning_rate": 7.967529173008626e-06, "loss": 0.5926, "step": 3099 }, { "epoch": 2.8310502283105023, "grad_norm": 21.15333366394043, "learning_rate": 7.966514459665145e-06, "loss": 0.2228, "step": 3100 }, { "epoch": 2.831963470319635, "grad_norm": 3.4321703910827637, "learning_rate": 7.965499746321664e-06, "loss": 0.0262, "step": 3101 }, { "epoch": 2.8328767123287673, "grad_norm": 43.315181732177734, "learning_rate": 7.964485032978185e-06, "loss": 0.2938, "step": 3102 }, { "epoch": 2.8337899543378997, "grad_norm": 14.414800643920898, "learning_rate": 7.963470319634703e-06, "loss": 0.1065, "step": 3103 }, { "epoch": 2.834703196347032, "grad_norm": 0.4132011830806732, "learning_rate": 7.962455606291222e-06, "loss": 0.0049, "step": 3104 }, { "epoch": 2.8356164383561646, "grad_norm": 128.06753540039062, "learning_rate": 7.961440892947743e-06, "loss": 4.4926, "step": 3105 }, { "epoch": 2.836529680365297, "grad_norm": 2.318410634994507, "learning_rate": 7.960426179604263e-06, "loss": 0.0139, "step": 3106 }, { "epoch": 2.837442922374429, "grad_norm": 46.59068298339844, "learning_rate": 7.959411466260782e-06, "loss": 0.546, "step": 3107 }, { "epoch": 2.8383561643835615, "grad_norm": 5.291081428527832, "learning_rate": 7.958396752917301e-06, "loss": 0.0448, "step": 3108 }, { "epoch": 2.839269406392694, "grad_norm": 6.669362545013428, "learning_rate": 7.957382039573822e-06, "loss": 0.0647, "step": 3109 }, { "epoch": 2.8401826484018264, "grad_norm": 11.204620361328125, "learning_rate": 7.95636732623034e-06, "loss": 0.1007, "step": 3110 }, { "epoch": 2.841095890410959, "grad_norm": 16.142751693725586, "learning_rate": 7.95535261288686e-06, "loss": 0.1053, "step": 3111 }, { "epoch": 2.8420091324200913, "grad_norm": 3.2666714191436768, "learning_rate": 7.95433789954338e-06, "loss": 0.0346, "step": 3112 }, { "epoch": 2.842922374429224, "grad_norm": 90.99591827392578, "learning_rate": 7.953323186199899e-06, "loss": 1.2164, "step": 3113 }, { "epoch": 2.8438356164383563, "grad_norm": 9.729612350463867, "learning_rate": 7.952308472856418e-06, "loss": 0.0733, "step": 3114 }, { "epoch": 2.8447488584474887, "grad_norm": 13.655997276306152, "learning_rate": 7.951293759512938e-06, "loss": 0.1004, "step": 3115 }, { "epoch": 2.845662100456621, "grad_norm": 66.5294189453125, "learning_rate": 7.950279046169459e-06, "loss": 0.8303, "step": 3116 }, { "epoch": 2.846575342465753, "grad_norm": 1.494009017944336, "learning_rate": 7.949264332825977e-06, "loss": 0.0108, "step": 3117 }, { "epoch": 2.8474885844748856, "grad_norm": 33.483009338378906, "learning_rate": 7.948249619482496e-06, "loss": 0.2778, "step": 3118 }, { "epoch": 2.848401826484018, "grad_norm": 3.954094886779785, "learning_rate": 7.947234906139017e-06, "loss": 0.0399, "step": 3119 }, { "epoch": 2.8493150684931505, "grad_norm": 19.599096298217773, "learning_rate": 7.946220192795536e-06, "loss": 0.0967, "step": 3120 }, { "epoch": 2.850228310502283, "grad_norm": 42.27423095703125, "learning_rate": 7.945205479452055e-06, "loss": 0.3192, "step": 3121 }, { "epoch": 2.8511415525114154, "grad_norm": 5.023266792297363, "learning_rate": 7.944190766108575e-06, "loss": 0.0351, "step": 3122 }, { "epoch": 2.852054794520548, "grad_norm": 1.5322500467300415, "learning_rate": 7.943176052765094e-06, "loss": 0.0109, "step": 3123 }, { "epoch": 2.8529680365296803, "grad_norm": 4.413599014282227, "learning_rate": 7.942161339421613e-06, "loss": 0.0254, "step": 3124 }, { "epoch": 2.853881278538813, "grad_norm": 0.9112880229949951, "learning_rate": 7.941146626078133e-06, "loss": 0.0068, "step": 3125 }, { "epoch": 2.8547945205479452, "grad_norm": 0.35005074739456177, "learning_rate": 7.940131912734654e-06, "loss": 0.0025, "step": 3126 }, { "epoch": 2.8557077625570777, "grad_norm": 94.50645446777344, "learning_rate": 7.939117199391173e-06, "loss": 2.5069, "step": 3127 }, { "epoch": 2.85662100456621, "grad_norm": 60.78480911254883, "learning_rate": 7.938102486047692e-06, "loss": 0.6792, "step": 3128 }, { "epoch": 2.8575342465753426, "grad_norm": 3.2643816471099854, "learning_rate": 7.937087772704212e-06, "loss": 0.0293, "step": 3129 }, { "epoch": 2.858447488584475, "grad_norm": 47.91776657104492, "learning_rate": 7.936073059360731e-06, "loss": 0.3296, "step": 3130 }, { "epoch": 2.8593607305936075, "grad_norm": 6.958486557006836, "learning_rate": 7.93505834601725e-06, "loss": 0.0452, "step": 3131 }, { "epoch": 2.86027397260274, "grad_norm": 70.81829833984375, "learning_rate": 7.93404363267377e-06, "loss": 0.558, "step": 3132 }, { "epoch": 2.8611872146118724, "grad_norm": 3.3625659942626953, "learning_rate": 7.93302891933029e-06, "loss": 0.028, "step": 3133 }, { "epoch": 2.8621004566210044, "grad_norm": 0.48226994276046753, "learning_rate": 7.932014205986808e-06, "loss": 0.0041, "step": 3134 }, { "epoch": 2.863013698630137, "grad_norm": 19.199539184570312, "learning_rate": 7.930999492643329e-06, "loss": 0.1317, "step": 3135 }, { "epoch": 2.8639269406392693, "grad_norm": 10.342900276184082, "learning_rate": 7.929984779299849e-06, "loss": 0.0616, "step": 3136 }, { "epoch": 2.864840182648402, "grad_norm": 127.32177734375, "learning_rate": 7.928970065956368e-06, "loss": 3.7447, "step": 3137 }, { "epoch": 2.8657534246575342, "grad_norm": 24.015905380249023, "learning_rate": 7.927955352612887e-06, "loss": 0.1925, "step": 3138 }, { "epoch": 2.8666666666666667, "grad_norm": 19.959712982177734, "learning_rate": 7.926940639269407e-06, "loss": 0.1458, "step": 3139 }, { "epoch": 2.867579908675799, "grad_norm": 43.467620849609375, "learning_rate": 7.925925925925926e-06, "loss": 0.3468, "step": 3140 }, { "epoch": 2.8684931506849316, "grad_norm": 27.09088706970215, "learning_rate": 7.924911212582445e-06, "loss": 0.266, "step": 3141 }, { "epoch": 2.869406392694064, "grad_norm": 3.3077285289764404, "learning_rate": 7.923896499238966e-06, "loss": 0.0215, "step": 3142 }, { "epoch": 2.8703196347031965, "grad_norm": 71.0665054321289, "learning_rate": 7.922881785895486e-06, "loss": 1.0873, "step": 3143 }, { "epoch": 2.8712328767123285, "grad_norm": 30.813350677490234, "learning_rate": 7.921867072552005e-06, "loss": 0.4156, "step": 3144 }, { "epoch": 2.872146118721461, "grad_norm": 30.669374465942383, "learning_rate": 7.920852359208524e-06, "loss": 0.2362, "step": 3145 }, { "epoch": 2.8730593607305934, "grad_norm": 28.907398223876953, "learning_rate": 7.919837645865044e-06, "loss": 0.1977, "step": 3146 }, { "epoch": 2.873972602739726, "grad_norm": 24.353200912475586, "learning_rate": 7.918822932521563e-06, "loss": 0.2519, "step": 3147 }, { "epoch": 2.8748858447488583, "grad_norm": 22.044750213623047, "learning_rate": 7.917808219178082e-06, "loss": 0.1373, "step": 3148 }, { "epoch": 2.875799086757991, "grad_norm": 0.6922821402549744, "learning_rate": 7.916793505834603e-06, "loss": 0.0045, "step": 3149 }, { "epoch": 2.8767123287671232, "grad_norm": 6.155999183654785, "learning_rate": 7.915778792491123e-06, "loss": 0.0389, "step": 3150 }, { "epoch": 2.8776255707762557, "grad_norm": 5.517825126647949, "learning_rate": 7.91476407914764e-06, "loss": 0.0378, "step": 3151 }, { "epoch": 2.878538812785388, "grad_norm": 19.298160552978516, "learning_rate": 7.91374936580416e-06, "loss": 0.1889, "step": 3152 }, { "epoch": 2.8794520547945206, "grad_norm": 0.9775688648223877, "learning_rate": 7.912734652460681e-06, "loss": 0.0057, "step": 3153 }, { "epoch": 2.880365296803653, "grad_norm": 2.612004518508911, "learning_rate": 7.9117199391172e-06, "loss": 0.0254, "step": 3154 }, { "epoch": 2.8812785388127855, "grad_norm": 9.838980674743652, "learning_rate": 7.910705225773719e-06, "loss": 0.081, "step": 3155 }, { "epoch": 2.882191780821918, "grad_norm": 53.64801788330078, "learning_rate": 7.90969051243024e-06, "loss": 0.6731, "step": 3156 }, { "epoch": 2.8831050228310504, "grad_norm": 24.366952896118164, "learning_rate": 7.908675799086758e-06, "loss": 0.2247, "step": 3157 }, { "epoch": 2.884018264840183, "grad_norm": 9.73653507232666, "learning_rate": 7.907661085743277e-06, "loss": 0.0743, "step": 3158 }, { "epoch": 2.8849315068493153, "grad_norm": 0.9018043279647827, "learning_rate": 7.906646372399798e-06, "loss": 0.0075, "step": 3159 }, { "epoch": 2.8858447488584473, "grad_norm": 2.1581356525421143, "learning_rate": 7.905631659056318e-06, "loss": 0.0151, "step": 3160 }, { "epoch": 2.88675799086758, "grad_norm": 2.054783821105957, "learning_rate": 7.904616945712837e-06, "loss": 0.0122, "step": 3161 }, { "epoch": 2.8876712328767122, "grad_norm": 2.2354140281677246, "learning_rate": 7.903602232369356e-06, "loss": 0.0184, "step": 3162 }, { "epoch": 2.8885844748858447, "grad_norm": 7.096502304077148, "learning_rate": 7.902587519025877e-06, "loss": 0.0455, "step": 3163 }, { "epoch": 2.889497716894977, "grad_norm": 0.12971079349517822, "learning_rate": 7.901572805682395e-06, "loss": 0.0007, "step": 3164 }, { "epoch": 2.8904109589041096, "grad_norm": 4.829239845275879, "learning_rate": 7.900558092338914e-06, "loss": 0.0294, "step": 3165 }, { "epoch": 2.891324200913242, "grad_norm": 0.4271834194660187, "learning_rate": 7.899543378995435e-06, "loss": 0.0033, "step": 3166 }, { "epoch": 2.8922374429223745, "grad_norm": 72.8016357421875, "learning_rate": 7.898528665651954e-06, "loss": 0.7739, "step": 3167 }, { "epoch": 2.893150684931507, "grad_norm": 43.34067916870117, "learning_rate": 7.897513952308472e-06, "loss": 0.2964, "step": 3168 }, { "epoch": 2.8940639269406394, "grad_norm": 17.547590255737305, "learning_rate": 7.896499238964993e-06, "loss": 0.1099, "step": 3169 }, { "epoch": 2.8949771689497714, "grad_norm": 68.18296813964844, "learning_rate": 7.895484525621514e-06, "loss": 0.2414, "step": 3170 }, { "epoch": 2.895890410958904, "grad_norm": 25.01630210876465, "learning_rate": 7.894469812278032e-06, "loss": 0.2215, "step": 3171 }, { "epoch": 2.8968036529680363, "grad_norm": 12.633068084716797, "learning_rate": 7.893455098934551e-06, "loss": 0.1376, "step": 3172 }, { "epoch": 2.8977168949771688, "grad_norm": 0.5673684477806091, "learning_rate": 7.892440385591072e-06, "loss": 0.0039, "step": 3173 }, { "epoch": 2.8986301369863012, "grad_norm": 34.50189208984375, "learning_rate": 7.89142567224759e-06, "loss": 0.1924, "step": 3174 }, { "epoch": 2.8995433789954337, "grad_norm": 20.249000549316406, "learning_rate": 7.89041095890411e-06, "loss": 0.1703, "step": 3175 }, { "epoch": 2.900456621004566, "grad_norm": 0.7934989929199219, "learning_rate": 7.88939624556063e-06, "loss": 0.0052, "step": 3176 }, { "epoch": 2.9013698630136986, "grad_norm": 3.3326315879821777, "learning_rate": 7.88838153221715e-06, "loss": 0.0267, "step": 3177 }, { "epoch": 2.902283105022831, "grad_norm": 122.21971893310547, "learning_rate": 7.887366818873668e-06, "loss": 0.5949, "step": 3178 }, { "epoch": 2.9031963470319635, "grad_norm": 0.3499526381492615, "learning_rate": 7.886352105530188e-06, "loss": 0.0025, "step": 3179 }, { "epoch": 2.904109589041096, "grad_norm": 95.0014419555664, "learning_rate": 7.885337392186709e-06, "loss": 1.1354, "step": 3180 }, { "epoch": 2.9050228310502284, "grad_norm": 0.7601897120475769, "learning_rate": 7.884322678843228e-06, "loss": 0.0071, "step": 3181 }, { "epoch": 2.905936073059361, "grad_norm": 4.879162788391113, "learning_rate": 7.883307965499746e-06, "loss": 0.04, "step": 3182 }, { "epoch": 2.9068493150684933, "grad_norm": 0.02834239788353443, "learning_rate": 7.882293252156267e-06, "loss": 0.0002, "step": 3183 }, { "epoch": 2.9077625570776258, "grad_norm": 92.22254943847656, "learning_rate": 7.881278538812786e-06, "loss": 0.8753, "step": 3184 }, { "epoch": 2.908675799086758, "grad_norm": 23.269010543823242, "learning_rate": 7.880263825469305e-06, "loss": 0.1389, "step": 3185 }, { "epoch": 2.9095890410958907, "grad_norm": 21.252208709716797, "learning_rate": 7.879249112125825e-06, "loss": 0.2021, "step": 3186 }, { "epoch": 2.9105022831050227, "grad_norm": 2.6693038940429688, "learning_rate": 7.878234398782346e-06, "loss": 0.0122, "step": 3187 }, { "epoch": 2.911415525114155, "grad_norm": 2.4709081649780273, "learning_rate": 7.877219685438865e-06, "loss": 0.0141, "step": 3188 }, { "epoch": 2.9123287671232876, "grad_norm": 0.5830668807029724, "learning_rate": 7.876204972095383e-06, "loss": 0.0022, "step": 3189 }, { "epoch": 2.91324200913242, "grad_norm": 24.455589294433594, "learning_rate": 7.875190258751904e-06, "loss": 0.1305, "step": 3190 }, { "epoch": 2.9141552511415525, "grad_norm": 67.5163803100586, "learning_rate": 7.874175545408423e-06, "loss": 0.6412, "step": 3191 }, { "epoch": 2.915068493150685, "grad_norm": 1.1517926454544067, "learning_rate": 7.873160832064942e-06, "loss": 0.0067, "step": 3192 }, { "epoch": 2.9159817351598174, "grad_norm": 10.33597469329834, "learning_rate": 7.872146118721462e-06, "loss": 0.0708, "step": 3193 }, { "epoch": 2.91689497716895, "grad_norm": 1.6565618515014648, "learning_rate": 7.871131405377981e-06, "loss": 0.0078, "step": 3194 }, { "epoch": 2.9178082191780823, "grad_norm": 25.263511657714844, "learning_rate": 7.8701166920345e-06, "loss": 0.2159, "step": 3195 }, { "epoch": 2.9187214611872148, "grad_norm": 1.3640406131744385, "learning_rate": 7.86910197869102e-06, "loss": 0.0101, "step": 3196 }, { "epoch": 2.9196347031963468, "grad_norm": 27.615297317504883, "learning_rate": 7.868087265347541e-06, "loss": 0.1386, "step": 3197 }, { "epoch": 2.9205479452054792, "grad_norm": 2.71144962310791, "learning_rate": 7.86707255200406e-06, "loss": 0.0207, "step": 3198 }, { "epoch": 2.9214611872146117, "grad_norm": 0.2840903103351593, "learning_rate": 7.866057838660579e-06, "loss": 0.0019, "step": 3199 }, { "epoch": 2.922374429223744, "grad_norm": 10.946453094482422, "learning_rate": 7.8650431253171e-06, "loss": 0.089, "step": 3200 }, { "epoch": 2.9232876712328766, "grad_norm": 14.448545455932617, "learning_rate": 7.864028411973618e-06, "loss": 0.0754, "step": 3201 }, { "epoch": 2.924200913242009, "grad_norm": 1.2737749814987183, "learning_rate": 7.863013698630137e-06, "loss": 0.0108, "step": 3202 }, { "epoch": 2.9251141552511415, "grad_norm": 3.514235734939575, "learning_rate": 7.861998985286657e-06, "loss": 0.0215, "step": 3203 }, { "epoch": 2.926027397260274, "grad_norm": 16.86905288696289, "learning_rate": 7.860984271943176e-06, "loss": 0.0923, "step": 3204 }, { "epoch": 2.9269406392694064, "grad_norm": 2.2947285175323486, "learning_rate": 7.859969558599697e-06, "loss": 0.01, "step": 3205 }, { "epoch": 2.927853881278539, "grad_norm": 2.7766215801239014, "learning_rate": 7.858954845256216e-06, "loss": 0.0177, "step": 3206 }, { "epoch": 2.9287671232876713, "grad_norm": 30.59978675842285, "learning_rate": 7.857940131912736e-06, "loss": 0.306, "step": 3207 }, { "epoch": 2.9296803652968038, "grad_norm": 60.721405029296875, "learning_rate": 7.856925418569255e-06, "loss": 0.6776, "step": 3208 }, { "epoch": 2.930593607305936, "grad_norm": 39.87887191772461, "learning_rate": 7.855910705225774e-06, "loss": 0.4, "step": 3209 }, { "epoch": 2.9315068493150687, "grad_norm": 36.32297134399414, "learning_rate": 7.854895991882294e-06, "loss": 0.292, "step": 3210 }, { "epoch": 2.932420091324201, "grad_norm": 75.15916442871094, "learning_rate": 7.853881278538813e-06, "loss": 1.086, "step": 3211 }, { "epoch": 2.9333333333333336, "grad_norm": 32.18380355834961, "learning_rate": 7.852866565195332e-06, "loss": 0.1848, "step": 3212 }, { "epoch": 2.934246575342466, "grad_norm": 107.6502456665039, "learning_rate": 7.851851851851853e-06, "loss": 1.8878, "step": 3213 }, { "epoch": 2.935159817351598, "grad_norm": 2.9096367359161377, "learning_rate": 7.850837138508372e-06, "loss": 0.0247, "step": 3214 }, { "epoch": 2.9360730593607305, "grad_norm": 35.0159797668457, "learning_rate": 7.849822425164892e-06, "loss": 0.1929, "step": 3215 }, { "epoch": 2.936986301369863, "grad_norm": 0.3632410168647766, "learning_rate": 7.848807711821411e-06, "loss": 0.0021, "step": 3216 }, { "epoch": 2.9378995433789954, "grad_norm": 19.453399658203125, "learning_rate": 7.847792998477931e-06, "loss": 0.1567, "step": 3217 }, { "epoch": 2.938812785388128, "grad_norm": 40.85439682006836, "learning_rate": 7.84677828513445e-06, "loss": 0.3266, "step": 3218 }, { "epoch": 2.9397260273972603, "grad_norm": 9.291061401367188, "learning_rate": 7.845763571790969e-06, "loss": 0.0479, "step": 3219 }, { "epoch": 2.9406392694063928, "grad_norm": 0.8991797566413879, "learning_rate": 7.84474885844749e-06, "loss": 0.0048, "step": 3220 }, { "epoch": 2.941552511415525, "grad_norm": 47.612335205078125, "learning_rate": 7.843734145104008e-06, "loss": 0.3731, "step": 3221 }, { "epoch": 2.9424657534246577, "grad_norm": 28.188106536865234, "learning_rate": 7.842719431760527e-06, "loss": 0.1929, "step": 3222 }, { "epoch": 2.94337899543379, "grad_norm": 150.15304565429688, "learning_rate": 7.841704718417048e-06, "loss": 8.6392, "step": 3223 }, { "epoch": 2.944292237442922, "grad_norm": 3.337117910385132, "learning_rate": 7.840690005073567e-06, "loss": 0.0258, "step": 3224 }, { "epoch": 2.9452054794520546, "grad_norm": 0.07909257709980011, "learning_rate": 7.839675291730087e-06, "loss": 0.0005, "step": 3225 }, { "epoch": 2.946118721461187, "grad_norm": 2.1064178943634033, "learning_rate": 7.838660578386606e-06, "loss": 0.0177, "step": 3226 }, { "epoch": 2.9470319634703195, "grad_norm": 96.23159790039062, "learning_rate": 7.837645865043127e-06, "loss": 1.2787, "step": 3227 }, { "epoch": 2.947945205479452, "grad_norm": 48.1169319152832, "learning_rate": 7.836631151699645e-06, "loss": 0.3493, "step": 3228 }, { "epoch": 2.9488584474885844, "grad_norm": 10.688518524169922, "learning_rate": 7.835616438356164e-06, "loss": 0.0539, "step": 3229 }, { "epoch": 2.949771689497717, "grad_norm": 52.267330169677734, "learning_rate": 7.834601725012685e-06, "loss": 0.6077, "step": 3230 }, { "epoch": 2.9506849315068493, "grad_norm": 45.2080192565918, "learning_rate": 7.833587011669204e-06, "loss": 0.3546, "step": 3231 }, { "epoch": 2.9515981735159817, "grad_norm": 3.053802013397217, "learning_rate": 7.832572298325724e-06, "loss": 0.0213, "step": 3232 }, { "epoch": 2.952511415525114, "grad_norm": 16.952655792236328, "learning_rate": 7.831557584982243e-06, "loss": 0.0432, "step": 3233 }, { "epoch": 2.9534246575342467, "grad_norm": 60.68433380126953, "learning_rate": 7.830542871638762e-06, "loss": 0.9578, "step": 3234 }, { "epoch": 2.954337899543379, "grad_norm": 0.9234883189201355, "learning_rate": 7.829528158295282e-06, "loss": 0.0048, "step": 3235 }, { "epoch": 2.9552511415525116, "grad_norm": 5.863003730773926, "learning_rate": 7.828513444951801e-06, "loss": 0.0433, "step": 3236 }, { "epoch": 2.956164383561644, "grad_norm": 2.866511106491089, "learning_rate": 7.827498731608322e-06, "loss": 0.0203, "step": 3237 }, { "epoch": 2.9570776255707765, "grad_norm": 13.64544677734375, "learning_rate": 7.82648401826484e-06, "loss": 0.1291, "step": 3238 }, { "epoch": 2.957990867579909, "grad_norm": 0.36025622487068176, "learning_rate": 7.82546930492136e-06, "loss": 0.0028, "step": 3239 }, { "epoch": 2.958904109589041, "grad_norm": 0.3324816823005676, "learning_rate": 7.82445459157788e-06, "loss": 0.0026, "step": 3240 }, { "epoch": 2.9598173515981734, "grad_norm": 67.02420043945312, "learning_rate": 7.823439878234399e-06, "loss": 0.7242, "step": 3241 }, { "epoch": 2.960730593607306, "grad_norm": 3.169466257095337, "learning_rate": 7.82242516489092e-06, "loss": 0.0336, "step": 3242 }, { "epoch": 2.9616438356164383, "grad_norm": 9.606684684753418, "learning_rate": 7.821410451547438e-06, "loss": 0.0858, "step": 3243 }, { "epoch": 2.9625570776255707, "grad_norm": 2.433851957321167, "learning_rate": 7.820395738203957e-06, "loss": 0.0225, "step": 3244 }, { "epoch": 2.963470319634703, "grad_norm": 3.265474557876587, "learning_rate": 7.819381024860478e-06, "loss": 0.0191, "step": 3245 }, { "epoch": 2.9643835616438357, "grad_norm": 54.38724899291992, "learning_rate": 7.818366311516997e-06, "loss": 0.6045, "step": 3246 }, { "epoch": 2.965296803652968, "grad_norm": 53.745452880859375, "learning_rate": 7.817351598173517e-06, "loss": 0.4767, "step": 3247 }, { "epoch": 2.9662100456621006, "grad_norm": 58.337162017822266, "learning_rate": 7.816336884830036e-06, "loss": 0.4749, "step": 3248 }, { "epoch": 2.967123287671233, "grad_norm": 9.844382286071777, "learning_rate": 7.815322171486556e-06, "loss": 0.054, "step": 3249 }, { "epoch": 2.968036529680365, "grad_norm": 123.23094940185547, "learning_rate": 7.814307458143075e-06, "loss": 1.3036, "step": 3250 }, { "epoch": 2.9689497716894975, "grad_norm": 0.9734125137329102, "learning_rate": 7.813292744799594e-06, "loss": 0.0071, "step": 3251 }, { "epoch": 2.96986301369863, "grad_norm": 130.47573852539062, "learning_rate": 7.812278031456115e-06, "loss": 1.9408, "step": 3252 }, { "epoch": 2.9707762557077624, "grad_norm": 7.10814094543457, "learning_rate": 7.811263318112634e-06, "loss": 0.0661, "step": 3253 }, { "epoch": 2.971689497716895, "grad_norm": 5.194034099578857, "learning_rate": 7.810248604769152e-06, "loss": 0.0505, "step": 3254 }, { "epoch": 2.9726027397260273, "grad_norm": 22.88471794128418, "learning_rate": 7.809233891425673e-06, "loss": 0.1289, "step": 3255 }, { "epoch": 2.9735159817351597, "grad_norm": 1304.61181640625, "learning_rate": 7.808219178082192e-06, "loss": 0.4823, "step": 3256 }, { "epoch": 2.974429223744292, "grad_norm": 53.96678161621094, "learning_rate": 7.807204464738712e-06, "loss": 0.2352, "step": 3257 }, { "epoch": 2.9753424657534246, "grad_norm": 76.58149719238281, "learning_rate": 7.806189751395231e-06, "loss": 1.2647, "step": 3258 }, { "epoch": 2.976255707762557, "grad_norm": 0.12895701825618744, "learning_rate": 7.805175038051752e-06, "loss": 0.0011, "step": 3259 }, { "epoch": 2.9771689497716896, "grad_norm": 38.82307815551758, "learning_rate": 7.80416032470827e-06, "loss": 0.4117, "step": 3260 }, { "epoch": 2.978082191780822, "grad_norm": 6.740635871887207, "learning_rate": 7.80314561136479e-06, "loss": 0.0383, "step": 3261 }, { "epoch": 2.9789954337899545, "grad_norm": 0.7946199774742126, "learning_rate": 7.80213089802131e-06, "loss": 0.0049, "step": 3262 }, { "epoch": 2.979908675799087, "grad_norm": 5.229825973510742, "learning_rate": 7.801116184677829e-06, "loss": 0.0393, "step": 3263 }, { "epoch": 2.9808219178082194, "grad_norm": 0.993942379951477, "learning_rate": 7.800101471334348e-06, "loss": 0.0056, "step": 3264 }, { "epoch": 2.981735159817352, "grad_norm": 40.857749938964844, "learning_rate": 7.799086757990868e-06, "loss": 0.3748, "step": 3265 }, { "epoch": 2.9826484018264843, "grad_norm": 4.621337413787842, "learning_rate": 7.798072044647389e-06, "loss": 0.0166, "step": 3266 }, { "epoch": 2.9835616438356163, "grad_norm": 16.005617141723633, "learning_rate": 7.797057331303908e-06, "loss": 0.062, "step": 3267 }, { "epoch": 2.9844748858447487, "grad_norm": 46.55233383178711, "learning_rate": 7.796042617960426e-06, "loss": 0.2583, "step": 3268 }, { "epoch": 2.985388127853881, "grad_norm": 42.8541145324707, "learning_rate": 7.795027904616947e-06, "loss": 0.3887, "step": 3269 }, { "epoch": 2.9863013698630136, "grad_norm": 2.3610966205596924, "learning_rate": 7.794013191273466e-06, "loss": 0.0109, "step": 3270 }, { "epoch": 2.987214611872146, "grad_norm": 0.7520325183868408, "learning_rate": 7.792998477929985e-06, "loss": 0.0055, "step": 3271 }, { "epoch": 2.9881278538812786, "grad_norm": 37.968299865722656, "learning_rate": 7.791983764586505e-06, "loss": 0.3689, "step": 3272 }, { "epoch": 2.989041095890411, "grad_norm": 35.04014587402344, "learning_rate": 7.790969051243024e-06, "loss": 0.2921, "step": 3273 }, { "epoch": 2.9899543378995435, "grad_norm": 5.163416862487793, "learning_rate": 7.789954337899543e-06, "loss": 0.0329, "step": 3274 }, { "epoch": 2.990867579908676, "grad_norm": 5.013493061065674, "learning_rate": 7.788939624556063e-06, "loss": 0.0364, "step": 3275 }, { "epoch": 2.9917808219178084, "grad_norm": 78.79368591308594, "learning_rate": 7.787924911212584e-06, "loss": 0.6783, "step": 3276 }, { "epoch": 2.9926940639269404, "grad_norm": 17.758813858032227, "learning_rate": 7.786910197869103e-06, "loss": 0.142, "step": 3277 }, { "epoch": 2.993607305936073, "grad_norm": 28.714187622070312, "learning_rate": 7.785895484525622e-06, "loss": 0.2438, "step": 3278 }, { "epoch": 2.9945205479452053, "grad_norm": 1.2688223123550415, "learning_rate": 7.784880771182142e-06, "loss": 0.0077, "step": 3279 }, { "epoch": 2.9954337899543377, "grad_norm": 10.634551048278809, "learning_rate": 7.783866057838661e-06, "loss": 0.0627, "step": 3280 }, { "epoch": 2.99634703196347, "grad_norm": 5.7458343505859375, "learning_rate": 7.78285134449518e-06, "loss": 0.0412, "step": 3281 }, { "epoch": 2.9972602739726026, "grad_norm": 0.11730367690324783, "learning_rate": 7.7818366311517e-06, "loss": 0.0009, "step": 3282 }, { "epoch": 2.998173515981735, "grad_norm": 14.24971866607666, "learning_rate": 7.78082191780822e-06, "loss": 0.0767, "step": 3283 }, { "epoch": 2.9990867579908675, "grad_norm": 392.1225891113281, "learning_rate": 7.779807204464738e-06, "loss": 3.8291, "step": 3284 }, { "epoch": 3.0, "grad_norm": 112.13819122314453, "learning_rate": 7.778792491121259e-06, "loss": 1.5534, "step": 3285 }, { "epoch": 3.0009132420091325, "grad_norm": 1.9359291791915894, "learning_rate": 7.77777777777778e-06, "loss": 0.014, "step": 3286 }, { "epoch": 3.001826484018265, "grad_norm": 6.144457817077637, "learning_rate": 7.776763064434298e-06, "loss": 0.0317, "step": 3287 }, { "epoch": 3.0027397260273974, "grad_norm": 6.067310333251953, "learning_rate": 7.775748351090817e-06, "loss": 0.0381, "step": 3288 }, { "epoch": 3.00365296803653, "grad_norm": 0.8824125528335571, "learning_rate": 7.774733637747337e-06, "loss": 0.0063, "step": 3289 }, { "epoch": 3.0045662100456623, "grad_norm": 5.09751033782959, "learning_rate": 7.773718924403856e-06, "loss": 0.026, "step": 3290 }, { "epoch": 3.0054794520547947, "grad_norm": 14.226649284362793, "learning_rate": 7.772704211060375e-06, "loss": 0.1007, "step": 3291 }, { "epoch": 3.0063926940639267, "grad_norm": 6.138105869293213, "learning_rate": 7.771689497716896e-06, "loss": 0.0325, "step": 3292 }, { "epoch": 3.007305936073059, "grad_norm": 26.920230865478516, "learning_rate": 7.770674784373416e-06, "loss": 0.1437, "step": 3293 }, { "epoch": 3.0082191780821916, "grad_norm": 69.66954803466797, "learning_rate": 7.769660071029933e-06, "loss": 1.1388, "step": 3294 }, { "epoch": 3.009132420091324, "grad_norm": 3.934523820877075, "learning_rate": 7.768645357686454e-06, "loss": 0.0261, "step": 3295 }, { "epoch": 3.0100456621004565, "grad_norm": 7.00266170501709, "learning_rate": 7.767630644342974e-06, "loss": 0.0423, "step": 3296 }, { "epoch": 3.010958904109589, "grad_norm": 66.51272583007812, "learning_rate": 7.766615930999493e-06, "loss": 0.6213, "step": 3297 }, { "epoch": 3.0118721461187214, "grad_norm": 43.4732666015625, "learning_rate": 7.765601217656012e-06, "loss": 0.3143, "step": 3298 }, { "epoch": 3.012785388127854, "grad_norm": 1.8052071332931519, "learning_rate": 7.764586504312533e-06, "loss": 0.0149, "step": 3299 }, { "epoch": 3.0136986301369864, "grad_norm": 29.648948669433594, "learning_rate": 7.763571790969051e-06, "loss": 0.1528, "step": 3300 }, { "epoch": 3.014611872146119, "grad_norm": 2.856417417526245, "learning_rate": 7.76255707762557e-06, "loss": 0.005, "step": 3301 }, { "epoch": 3.0155251141552513, "grad_norm": 20.00061798095703, "learning_rate": 7.761542364282091e-06, "loss": 0.1273, "step": 3302 }, { "epoch": 3.0164383561643837, "grad_norm": 96.86392211914062, "learning_rate": 7.760527650938611e-06, "loss": 1.5947, "step": 3303 }, { "epoch": 3.017351598173516, "grad_norm": 4.385748386383057, "learning_rate": 7.75951293759513e-06, "loss": 0.043, "step": 3304 }, { "epoch": 3.018264840182648, "grad_norm": 4.304587364196777, "learning_rate": 7.758498224251649e-06, "loss": 0.0242, "step": 3305 }, { "epoch": 3.0191780821917806, "grad_norm": 2.321790933609009, "learning_rate": 7.75748351090817e-06, "loss": 0.0073, "step": 3306 }, { "epoch": 3.020091324200913, "grad_norm": 33.595481872558594, "learning_rate": 7.756468797564688e-06, "loss": 0.195, "step": 3307 }, { "epoch": 3.0210045662100455, "grad_norm": 11.87073802947998, "learning_rate": 7.755454084221207e-06, "loss": 0.0728, "step": 3308 }, { "epoch": 3.021917808219178, "grad_norm": 1.7662891149520874, "learning_rate": 7.754439370877728e-06, "loss": 0.0131, "step": 3309 }, { "epoch": 3.0228310502283104, "grad_norm": 32.12920379638672, "learning_rate": 7.753424657534248e-06, "loss": 0.1586, "step": 3310 }, { "epoch": 3.023744292237443, "grad_norm": 32.575958251953125, "learning_rate": 7.752409944190766e-06, "loss": 0.2533, "step": 3311 }, { "epoch": 3.0246575342465754, "grad_norm": 81.94894409179688, "learning_rate": 7.751395230847286e-06, "loss": 1.559, "step": 3312 }, { "epoch": 3.025570776255708, "grad_norm": 0.5941957831382751, "learning_rate": 7.750380517503807e-06, "loss": 0.0035, "step": 3313 }, { "epoch": 3.0264840182648403, "grad_norm": 2.4860637187957764, "learning_rate": 7.749365804160325e-06, "loss": 0.0118, "step": 3314 }, { "epoch": 3.0273972602739727, "grad_norm": 9.573290824890137, "learning_rate": 7.748351090816844e-06, "loss": 0.0545, "step": 3315 }, { "epoch": 3.028310502283105, "grad_norm": 63.7360954284668, "learning_rate": 7.747336377473365e-06, "loss": 0.782, "step": 3316 }, { "epoch": 3.0292237442922376, "grad_norm": 0.7375425100326538, "learning_rate": 7.746321664129884e-06, "loss": 0.004, "step": 3317 }, { "epoch": 3.03013698630137, "grad_norm": 109.62568664550781, "learning_rate": 7.745306950786403e-06, "loss": 0.9871, "step": 3318 }, { "epoch": 3.031050228310502, "grad_norm": 7.567119598388672, "learning_rate": 7.744292237442923e-06, "loss": 0.042, "step": 3319 }, { "epoch": 3.0319634703196345, "grad_norm": 1.7292691469192505, "learning_rate": 7.743277524099444e-06, "loss": 0.0092, "step": 3320 }, { "epoch": 3.032876712328767, "grad_norm": 26.251482009887695, "learning_rate": 7.742262810755962e-06, "loss": 0.0927, "step": 3321 }, { "epoch": 3.0337899543378994, "grad_norm": 14.34410572052002, "learning_rate": 7.741248097412481e-06, "loss": 0.1019, "step": 3322 }, { "epoch": 3.034703196347032, "grad_norm": 5.898282527923584, "learning_rate": 7.740233384069002e-06, "loss": 0.0435, "step": 3323 }, { "epoch": 3.0356164383561643, "grad_norm": 6.465744972229004, "learning_rate": 7.73921867072552e-06, "loss": 0.0419, "step": 3324 }, { "epoch": 3.036529680365297, "grad_norm": 0.23637831211090088, "learning_rate": 7.73820395738204e-06, "loss": 0.0015, "step": 3325 }, { "epoch": 3.0374429223744293, "grad_norm": 1.657773494720459, "learning_rate": 7.73718924403856e-06, "loss": 0.0086, "step": 3326 }, { "epoch": 3.0383561643835617, "grad_norm": 0.3516515791416168, "learning_rate": 7.736174530695079e-06, "loss": 0.0022, "step": 3327 }, { "epoch": 3.039269406392694, "grad_norm": 4.529101848602295, "learning_rate": 7.735159817351598e-06, "loss": 0.0349, "step": 3328 }, { "epoch": 3.0401826484018266, "grad_norm": 2.6109957695007324, "learning_rate": 7.734145104008118e-06, "loss": 0.0147, "step": 3329 }, { "epoch": 3.041095890410959, "grad_norm": 22.761137008666992, "learning_rate": 7.733130390664639e-06, "loss": 0.1395, "step": 3330 }, { "epoch": 3.0420091324200915, "grad_norm": 7.950385093688965, "learning_rate": 7.732115677321158e-06, "loss": 0.055, "step": 3331 }, { "epoch": 3.0429223744292235, "grad_norm": 2.741245746612549, "learning_rate": 7.731100963977677e-06, "loss": 0.0198, "step": 3332 }, { "epoch": 3.043835616438356, "grad_norm": 3.750687837600708, "learning_rate": 7.730086250634197e-06, "loss": 0.0149, "step": 3333 }, { "epoch": 3.0447488584474884, "grad_norm": 6.158517360687256, "learning_rate": 7.729071537290716e-06, "loss": 0.0357, "step": 3334 }, { "epoch": 3.045662100456621, "grad_norm": 8.854698181152344, "learning_rate": 7.728056823947235e-06, "loss": 0.0541, "step": 3335 }, { "epoch": 3.0465753424657533, "grad_norm": 46.42654800415039, "learning_rate": 7.727042110603755e-06, "loss": 0.3063, "step": 3336 }, { "epoch": 3.047488584474886, "grad_norm": 20.052587509155273, "learning_rate": 7.726027397260276e-06, "loss": 0.166, "step": 3337 }, { "epoch": 3.0484018264840183, "grad_norm": 110.32380676269531, "learning_rate": 7.725012683916793e-06, "loss": 0.8864, "step": 3338 }, { "epoch": 3.0493150684931507, "grad_norm": 3.440967321395874, "learning_rate": 7.723997970573314e-06, "loss": 0.0276, "step": 3339 }, { "epoch": 3.050228310502283, "grad_norm": 9.497747421264648, "learning_rate": 7.722983257229834e-06, "loss": 0.0699, "step": 3340 }, { "epoch": 3.0511415525114156, "grad_norm": 74.26344299316406, "learning_rate": 7.721968543886353e-06, "loss": 0.6078, "step": 3341 }, { "epoch": 3.052054794520548, "grad_norm": 15.193007469177246, "learning_rate": 7.720953830542872e-06, "loss": 0.0434, "step": 3342 }, { "epoch": 3.0529680365296805, "grad_norm": 7.569250106811523, "learning_rate": 7.719939117199392e-06, "loss": 0.0434, "step": 3343 }, { "epoch": 3.053881278538813, "grad_norm": 0.14047689735889435, "learning_rate": 7.718924403855911e-06, "loss": 0.0009, "step": 3344 }, { "epoch": 3.0547945205479454, "grad_norm": 1.2038418054580688, "learning_rate": 7.71790969051243e-06, "loss": 0.0046, "step": 3345 }, { "epoch": 3.0557077625570774, "grad_norm": 29.263084411621094, "learning_rate": 7.71689497716895e-06, "loss": 0.0492, "step": 3346 }, { "epoch": 3.05662100456621, "grad_norm": 5.362860202789307, "learning_rate": 7.715880263825471e-06, "loss": 0.0331, "step": 3347 }, { "epoch": 3.0575342465753423, "grad_norm": 86.25035858154297, "learning_rate": 7.71486555048199e-06, "loss": 1.0982, "step": 3348 }, { "epoch": 3.058447488584475, "grad_norm": 50.16056823730469, "learning_rate": 7.713850837138509e-06, "loss": 0.282, "step": 3349 }, { "epoch": 3.0593607305936072, "grad_norm": 51.04742431640625, "learning_rate": 7.71283612379503e-06, "loss": 0.3121, "step": 3350 }, { "epoch": 3.0602739726027397, "grad_norm": 0.8156310319900513, "learning_rate": 7.711821410451548e-06, "loss": 0.0053, "step": 3351 }, { "epoch": 3.061187214611872, "grad_norm": 3.1268038749694824, "learning_rate": 7.710806697108067e-06, "loss": 0.0071, "step": 3352 }, { "epoch": 3.0621004566210046, "grad_norm": 2.561541795730591, "learning_rate": 7.709791983764588e-06, "loss": 0.0161, "step": 3353 }, { "epoch": 3.063013698630137, "grad_norm": 31.098371505737305, "learning_rate": 7.708777270421106e-06, "loss": 0.2564, "step": 3354 }, { "epoch": 3.0639269406392695, "grad_norm": 21.92746353149414, "learning_rate": 7.707762557077625e-06, "loss": 0.2028, "step": 3355 }, { "epoch": 3.064840182648402, "grad_norm": 0.2973043918609619, "learning_rate": 7.706747843734146e-06, "loss": 0.0023, "step": 3356 }, { "epoch": 3.0657534246575344, "grad_norm": 18.10230827331543, "learning_rate": 7.705733130390666e-06, "loss": 0.1496, "step": 3357 }, { "epoch": 3.066666666666667, "grad_norm": 7.831422805786133, "learning_rate": 7.704718417047185e-06, "loss": 0.0325, "step": 3358 }, { "epoch": 3.067579908675799, "grad_norm": 0.7828724384307861, "learning_rate": 7.703703703703704e-06, "loss": 0.0043, "step": 3359 }, { "epoch": 3.0684931506849313, "grad_norm": 35.72775650024414, "learning_rate": 7.702688990360225e-06, "loss": 0.2888, "step": 3360 }, { "epoch": 3.069406392694064, "grad_norm": 2.6078414916992188, "learning_rate": 7.701674277016743e-06, "loss": 0.0143, "step": 3361 }, { "epoch": 3.0703196347031962, "grad_norm": 0.5311610698699951, "learning_rate": 7.700659563673262e-06, "loss": 0.0032, "step": 3362 }, { "epoch": 3.0712328767123287, "grad_norm": 13.572375297546387, "learning_rate": 7.699644850329783e-06, "loss": 0.0478, "step": 3363 }, { "epoch": 3.072146118721461, "grad_norm": 133.95359802246094, "learning_rate": 7.698630136986302e-06, "loss": 0.5535, "step": 3364 }, { "epoch": 3.0730593607305936, "grad_norm": 1.8215230703353882, "learning_rate": 7.697615423642822e-06, "loss": 0.0155, "step": 3365 }, { "epoch": 3.073972602739726, "grad_norm": 19.01931381225586, "learning_rate": 7.696600710299341e-06, "loss": 0.0774, "step": 3366 }, { "epoch": 3.0748858447488585, "grad_norm": 8.168445587158203, "learning_rate": 7.695585996955862e-06, "loss": 0.0541, "step": 3367 }, { "epoch": 3.075799086757991, "grad_norm": 0.11076727509498596, "learning_rate": 7.69457128361238e-06, "loss": 0.0006, "step": 3368 }, { "epoch": 3.0767123287671234, "grad_norm": 1.7977510690689087, "learning_rate": 7.6935565702689e-06, "loss": 0.0119, "step": 3369 }, { "epoch": 3.077625570776256, "grad_norm": 11.946340560913086, "learning_rate": 7.69254185692542e-06, "loss": 0.1025, "step": 3370 }, { "epoch": 3.0785388127853883, "grad_norm": 56.20178985595703, "learning_rate": 7.691527143581939e-06, "loss": 0.2157, "step": 3371 }, { "epoch": 3.0794520547945208, "grad_norm": 79.32460021972656, "learning_rate": 7.690512430238457e-06, "loss": 1.3546, "step": 3372 }, { "epoch": 3.080365296803653, "grad_norm": 129.33901977539062, "learning_rate": 7.689497716894978e-06, "loss": 0.4721, "step": 3373 }, { "epoch": 3.0812785388127852, "grad_norm": 2.706991195678711, "learning_rate": 7.688483003551497e-06, "loss": 0.0116, "step": 3374 }, { "epoch": 3.0821917808219177, "grad_norm": 23.184907913208008, "learning_rate": 7.687468290208017e-06, "loss": 0.2268, "step": 3375 }, { "epoch": 3.08310502283105, "grad_norm": 5.866180896759033, "learning_rate": 7.686453576864536e-06, "loss": 0.0363, "step": 3376 }, { "epoch": 3.0840182648401826, "grad_norm": 16.757753372192383, "learning_rate": 7.685438863521057e-06, "loss": 0.1204, "step": 3377 }, { "epoch": 3.084931506849315, "grad_norm": 35.72077178955078, "learning_rate": 7.684424150177576e-06, "loss": 0.3097, "step": 3378 }, { "epoch": 3.0858447488584475, "grad_norm": 10.910176277160645, "learning_rate": 7.683409436834094e-06, "loss": 0.0741, "step": 3379 }, { "epoch": 3.08675799086758, "grad_norm": 24.31657600402832, "learning_rate": 7.682394723490615e-06, "loss": 0.1793, "step": 3380 }, { "epoch": 3.0876712328767124, "grad_norm": 0.439799040555954, "learning_rate": 7.681380010147134e-06, "loss": 0.0028, "step": 3381 }, { "epoch": 3.088584474885845, "grad_norm": 6.18697452545166, "learning_rate": 7.680365296803653e-06, "loss": 0.0365, "step": 3382 }, { "epoch": 3.0894977168949773, "grad_norm": 4.00884485244751, "learning_rate": 7.679350583460173e-06, "loss": 0.0288, "step": 3383 }, { "epoch": 3.0904109589041098, "grad_norm": 21.6267147064209, "learning_rate": 7.678335870116692e-06, "loss": 0.1917, "step": 3384 }, { "epoch": 3.091324200913242, "grad_norm": 8.496014595031738, "learning_rate": 7.677321156773213e-06, "loss": 0.0724, "step": 3385 }, { "epoch": 3.0922374429223742, "grad_norm": 0.04286034405231476, "learning_rate": 7.676306443429731e-06, "loss": 0.0004, "step": 3386 }, { "epoch": 3.0931506849315067, "grad_norm": 6.083943843841553, "learning_rate": 7.675291730086252e-06, "loss": 0.0359, "step": 3387 }, { "epoch": 3.094063926940639, "grad_norm": 3.072995901107788, "learning_rate": 7.67427701674277e-06, "loss": 0.0131, "step": 3388 }, { "epoch": 3.0949771689497716, "grad_norm": 1.0058895349502563, "learning_rate": 7.67326230339929e-06, "loss": 0.0036, "step": 3389 }, { "epoch": 3.095890410958904, "grad_norm": 31.087202072143555, "learning_rate": 7.67224759005581e-06, "loss": 0.2316, "step": 3390 }, { "epoch": 3.0968036529680365, "grad_norm": 6.4866719245910645, "learning_rate": 7.671232876712329e-06, "loss": 0.0505, "step": 3391 }, { "epoch": 3.097716894977169, "grad_norm": 4.284134864807129, "learning_rate": 7.67021816336885e-06, "loss": 0.0245, "step": 3392 }, { "epoch": 3.0986301369863014, "grad_norm": 0.26499098539352417, "learning_rate": 7.669203450025368e-06, "loss": 0.0017, "step": 3393 }, { "epoch": 3.099543378995434, "grad_norm": 1.4935342073440552, "learning_rate": 7.668188736681887e-06, "loss": 0.0135, "step": 3394 }, { "epoch": 3.1004566210045663, "grad_norm": 89.02904510498047, "learning_rate": 7.667174023338408e-06, "loss": 1.1319, "step": 3395 }, { "epoch": 3.1013698630136988, "grad_norm": 3.1688740253448486, "learning_rate": 7.666159309994927e-06, "loss": 0.0172, "step": 3396 }, { "epoch": 3.1022831050228312, "grad_norm": 1.0320394039154053, "learning_rate": 7.665144596651447e-06, "loss": 0.0075, "step": 3397 }, { "epoch": 3.1031963470319637, "grad_norm": 18.073776245117188, "learning_rate": 7.664129883307966e-06, "loss": 0.0736, "step": 3398 }, { "epoch": 3.1041095890410957, "grad_norm": 67.64559936523438, "learning_rate": 7.663115169964485e-06, "loss": 0.7271, "step": 3399 }, { "epoch": 3.105022831050228, "grad_norm": 11.879597663879395, "learning_rate": 7.662100456621005e-06, "loss": 0.0619, "step": 3400 }, { "epoch": 3.1059360730593606, "grad_norm": 15.207701683044434, "learning_rate": 7.661085743277524e-06, "loss": 0.0748, "step": 3401 }, { "epoch": 3.106849315068493, "grad_norm": 1.5864222049713135, "learning_rate": 7.660071029934045e-06, "loss": 0.0127, "step": 3402 }, { "epoch": 3.1077625570776255, "grad_norm": 3.5246951580047607, "learning_rate": 7.659056316590564e-06, "loss": 0.0282, "step": 3403 }, { "epoch": 3.108675799086758, "grad_norm": 2.2213008403778076, "learning_rate": 7.658041603247083e-06, "loss": 0.0177, "step": 3404 }, { "epoch": 3.1095890410958904, "grad_norm": 19.288009643554688, "learning_rate": 7.657026889903603e-06, "loss": 0.1228, "step": 3405 }, { "epoch": 3.110502283105023, "grad_norm": 21.368257522583008, "learning_rate": 7.656012176560122e-06, "loss": 0.1242, "step": 3406 }, { "epoch": 3.1114155251141553, "grad_norm": 0.1481909304857254, "learning_rate": 7.654997463216642e-06, "loss": 0.0011, "step": 3407 }, { "epoch": 3.1123287671232878, "grad_norm": 0.08499597758054733, "learning_rate": 7.653982749873161e-06, "loss": 0.0007, "step": 3408 }, { "epoch": 3.11324200913242, "grad_norm": 22.088485717773438, "learning_rate": 7.652968036529682e-06, "loss": 0.1289, "step": 3409 }, { "epoch": 3.1141552511415527, "grad_norm": 5.96906852722168, "learning_rate": 7.6519533231862e-06, "loss": 0.039, "step": 3410 }, { "epoch": 3.115068493150685, "grad_norm": 2.74743914604187, "learning_rate": 7.65093860984272e-06, "loss": 0.0169, "step": 3411 }, { "epoch": 3.115981735159817, "grad_norm": 35.07072830200195, "learning_rate": 7.64992389649924e-06, "loss": 0.2222, "step": 3412 }, { "epoch": 3.1168949771689496, "grad_norm": 61.46548080444336, "learning_rate": 7.648909183155759e-06, "loss": 0.2958, "step": 3413 }, { "epoch": 3.117808219178082, "grad_norm": 12.847156524658203, "learning_rate": 7.647894469812278e-06, "loss": 0.0795, "step": 3414 }, { "epoch": 3.1187214611872145, "grad_norm": 12.652872085571289, "learning_rate": 7.646879756468798e-06, "loss": 0.0937, "step": 3415 }, { "epoch": 3.119634703196347, "grad_norm": 9.641746520996094, "learning_rate": 7.645865043125317e-06, "loss": 0.0396, "step": 3416 }, { "epoch": 3.1205479452054794, "grad_norm": 20.096948623657227, "learning_rate": 7.644850329781838e-06, "loss": 0.1082, "step": 3417 }, { "epoch": 3.121461187214612, "grad_norm": 16.843229293823242, "learning_rate": 7.643835616438356e-06, "loss": 0.1062, "step": 3418 }, { "epoch": 3.1223744292237443, "grad_norm": 42.25606918334961, "learning_rate": 7.642820903094877e-06, "loss": 0.3909, "step": 3419 }, { "epoch": 3.1232876712328768, "grad_norm": 1.1990721225738525, "learning_rate": 7.641806189751396e-06, "loss": 0.009, "step": 3420 }, { "epoch": 3.124200913242009, "grad_norm": 20.354938507080078, "learning_rate": 7.640791476407915e-06, "loss": 0.1178, "step": 3421 }, { "epoch": 3.1251141552511417, "grad_norm": 13.223504066467285, "learning_rate": 7.639776763064435e-06, "loss": 0.1093, "step": 3422 }, { "epoch": 3.126027397260274, "grad_norm": 2.087050199508667, "learning_rate": 7.638762049720954e-06, "loss": 0.0191, "step": 3423 }, { "epoch": 3.1269406392694066, "grad_norm": 18.576169967651367, "learning_rate": 7.637747336377473e-06, "loss": 0.0827, "step": 3424 }, { "epoch": 3.127853881278539, "grad_norm": 3.072613000869751, "learning_rate": 7.636732623033993e-06, "loss": 0.0239, "step": 3425 }, { "epoch": 3.128767123287671, "grad_norm": 1.457724928855896, "learning_rate": 7.635717909690512e-06, "loss": 0.0131, "step": 3426 }, { "epoch": 3.1296803652968035, "grad_norm": 0.22610938549041748, "learning_rate": 7.634703196347033e-06, "loss": 0.0022, "step": 3427 }, { "epoch": 3.130593607305936, "grad_norm": 0.19728852808475494, "learning_rate": 7.633688483003552e-06, "loss": 0.0014, "step": 3428 }, { "epoch": 3.1315068493150684, "grad_norm": 0.42929813265800476, "learning_rate": 7.632673769660072e-06, "loss": 0.0041, "step": 3429 }, { "epoch": 3.132420091324201, "grad_norm": 1.9405300617218018, "learning_rate": 7.631659056316591e-06, "loss": 0.0132, "step": 3430 }, { "epoch": 3.1333333333333333, "grad_norm": 0.40050265192985535, "learning_rate": 7.63064434297311e-06, "loss": 0.0019, "step": 3431 }, { "epoch": 3.1342465753424658, "grad_norm": 13.49007797241211, "learning_rate": 7.62962962962963e-06, "loss": 0.0757, "step": 3432 }, { "epoch": 3.135159817351598, "grad_norm": 0.2574663758277893, "learning_rate": 7.62861491628615e-06, "loss": 0.0018, "step": 3433 }, { "epoch": 3.1360730593607307, "grad_norm": 3.006826400756836, "learning_rate": 7.627600202942669e-06, "loss": 0.0151, "step": 3434 }, { "epoch": 3.136986301369863, "grad_norm": 4.902612209320068, "learning_rate": 7.626585489599189e-06, "loss": 0.0331, "step": 3435 }, { "epoch": 3.1378995433789956, "grad_norm": 0.34924501180648804, "learning_rate": 7.625570776255708e-06, "loss": 0.0018, "step": 3436 }, { "epoch": 3.138812785388128, "grad_norm": 17.499942779541016, "learning_rate": 7.624556062912228e-06, "loss": 0.1128, "step": 3437 }, { "epoch": 3.1397260273972605, "grad_norm": 72.13036346435547, "learning_rate": 7.623541349568747e-06, "loss": 1.0716, "step": 3438 }, { "epoch": 3.1406392694063925, "grad_norm": 0.004095870535820723, "learning_rate": 7.622526636225267e-06, "loss": 0.0, "step": 3439 }, { "epoch": 3.141552511415525, "grad_norm": 4.6342902183532715, "learning_rate": 7.621511922881787e-06, "loss": 0.0277, "step": 3440 }, { "epoch": 3.1424657534246574, "grad_norm": 0.1293715238571167, "learning_rate": 7.620497209538306e-06, "loss": 0.0008, "step": 3441 }, { "epoch": 3.14337899543379, "grad_norm": 4.812154293060303, "learning_rate": 7.619482496194826e-06, "loss": 0.0328, "step": 3442 }, { "epoch": 3.1442922374429223, "grad_norm": 28.379854202270508, "learning_rate": 7.618467782851345e-06, "loss": 0.1952, "step": 3443 }, { "epoch": 3.1452054794520548, "grad_norm": 19.322864532470703, "learning_rate": 7.617453069507864e-06, "loss": 0.1101, "step": 3444 }, { "epoch": 3.146118721461187, "grad_norm": 0.0118585005402565, "learning_rate": 7.616438356164384e-06, "loss": 0.0001, "step": 3445 }, { "epoch": 3.1470319634703197, "grad_norm": 1.1969294548034668, "learning_rate": 7.615423642820904e-06, "loss": 0.0099, "step": 3446 }, { "epoch": 3.147945205479452, "grad_norm": 27.1462459564209, "learning_rate": 7.614408929477423e-06, "loss": 0.1379, "step": 3447 }, { "epoch": 3.1488584474885846, "grad_norm": 98.71218872070312, "learning_rate": 7.613394216133942e-06, "loss": 5.1208, "step": 3448 }, { "epoch": 3.149771689497717, "grad_norm": 20.199119567871094, "learning_rate": 7.612379502790463e-06, "loss": 0.0782, "step": 3449 }, { "epoch": 3.1506849315068495, "grad_norm": 20.077850341796875, "learning_rate": 7.611364789446982e-06, "loss": 0.0989, "step": 3450 }, { "epoch": 3.151598173515982, "grad_norm": 13.973917961120605, "learning_rate": 7.610350076103501e-06, "loss": 0.0636, "step": 3451 }, { "epoch": 3.1525114155251144, "grad_norm": 11.519784927368164, "learning_rate": 7.609335362760021e-06, "loss": 0.0845, "step": 3452 }, { "epoch": 3.1534246575342464, "grad_norm": 3.383307456970215, "learning_rate": 7.608320649416541e-06, "loss": 0.0271, "step": 3453 }, { "epoch": 3.154337899543379, "grad_norm": 7.239487171173096, "learning_rate": 7.6073059360730595e-06, "loss": 0.0596, "step": 3454 }, { "epoch": 3.1552511415525113, "grad_norm": 4.103237628936768, "learning_rate": 7.606291222729579e-06, "loss": 0.0333, "step": 3455 }, { "epoch": 3.1561643835616437, "grad_norm": 10.470491409301758, "learning_rate": 7.605276509386099e-06, "loss": 0.0536, "step": 3456 }, { "epoch": 3.157077625570776, "grad_norm": 21.32865333557129, "learning_rate": 7.604261796042619e-06, "loss": 0.1839, "step": 3457 }, { "epoch": 3.1579908675799087, "grad_norm": 0.14188171923160553, "learning_rate": 7.603247082699137e-06, "loss": 0.0013, "step": 3458 }, { "epoch": 3.158904109589041, "grad_norm": 14.97177505493164, "learning_rate": 7.602232369355658e-06, "loss": 0.0809, "step": 3459 }, { "epoch": 3.1598173515981736, "grad_norm": 38.81141662597656, "learning_rate": 7.601217656012178e-06, "loss": 0.2303, "step": 3460 }, { "epoch": 3.160730593607306, "grad_norm": 18.035093307495117, "learning_rate": 7.6002029426686965e-06, "loss": 0.1592, "step": 3461 }, { "epoch": 3.1616438356164385, "grad_norm": 0.9999459385871887, "learning_rate": 7.599188229325216e-06, "loss": 0.0065, "step": 3462 }, { "epoch": 3.162557077625571, "grad_norm": 4.335775375366211, "learning_rate": 7.598173515981736e-06, "loss": 0.0288, "step": 3463 }, { "epoch": 3.1634703196347034, "grad_norm": 1.0028005838394165, "learning_rate": 7.597158802638255e-06, "loss": 0.0096, "step": 3464 }, { "epoch": 3.1643835616438354, "grad_norm": 4.781137466430664, "learning_rate": 7.596144089294774e-06, "loss": 0.0196, "step": 3465 }, { "epoch": 3.165296803652968, "grad_norm": 1.111965537071228, "learning_rate": 7.595129375951294e-06, "loss": 0.0055, "step": 3466 }, { "epoch": 3.1662100456621003, "grad_norm": 4.576335430145264, "learning_rate": 7.594114662607815e-06, "loss": 0.0328, "step": 3467 }, { "epoch": 3.1671232876712327, "grad_norm": 0.6059512495994568, "learning_rate": 7.5930999492643335e-06, "loss": 0.0039, "step": 3468 }, { "epoch": 3.168036529680365, "grad_norm": 3.0875394344329834, "learning_rate": 7.592085235920853e-06, "loss": 0.0237, "step": 3469 }, { "epoch": 3.1689497716894977, "grad_norm": 10.88535213470459, "learning_rate": 7.591070522577373e-06, "loss": 0.0443, "step": 3470 }, { "epoch": 3.16986301369863, "grad_norm": 1.4505090713500977, "learning_rate": 7.590055809233892e-06, "loss": 0.0114, "step": 3471 }, { "epoch": 3.1707762557077626, "grad_norm": 0.47388172149658203, "learning_rate": 7.589041095890411e-06, "loss": 0.003, "step": 3472 }, { "epoch": 3.171689497716895, "grad_norm": 40.314117431640625, "learning_rate": 7.588026382546931e-06, "loss": 0.2887, "step": 3473 }, { "epoch": 3.1726027397260275, "grad_norm": 3.172438859939575, "learning_rate": 7.58701166920345e-06, "loss": 0.0211, "step": 3474 }, { "epoch": 3.17351598173516, "grad_norm": 2.926518440246582, "learning_rate": 7.58599695585997e-06, "loss": 0.0198, "step": 3475 }, { "epoch": 3.1744292237442924, "grad_norm": 12.299490928649902, "learning_rate": 7.58498224251649e-06, "loss": 0.1109, "step": 3476 }, { "epoch": 3.175342465753425, "grad_norm": 28.45994758605957, "learning_rate": 7.58396752917301e-06, "loss": 0.1844, "step": 3477 }, { "epoch": 3.1762557077625573, "grad_norm": 9.06960678100586, "learning_rate": 7.582952815829529e-06, "loss": 0.0647, "step": 3478 }, { "epoch": 3.1771689497716897, "grad_norm": 51.90496826171875, "learning_rate": 7.581938102486048e-06, "loss": 0.6063, "step": 3479 }, { "epoch": 3.1780821917808217, "grad_norm": 0.5363051891326904, "learning_rate": 7.580923389142568e-06, "loss": 0.0037, "step": 3480 }, { "epoch": 3.178995433789954, "grad_norm": 2.2349212169647217, "learning_rate": 7.579908675799087e-06, "loss": 0.012, "step": 3481 }, { "epoch": 3.1799086757990866, "grad_norm": 0.502032995223999, "learning_rate": 7.578893962455607e-06, "loss": 0.0045, "step": 3482 }, { "epoch": 3.180821917808219, "grad_norm": 3.7383620738983154, "learning_rate": 7.577879249112126e-06, "loss": 0.0229, "step": 3483 }, { "epoch": 3.1817351598173516, "grad_norm": 22.944576263427734, "learning_rate": 7.576864535768645e-06, "loss": 0.227, "step": 3484 }, { "epoch": 3.182648401826484, "grad_norm": 0.2781229615211487, "learning_rate": 7.575849822425166e-06, "loss": 0.0013, "step": 3485 }, { "epoch": 3.1835616438356165, "grad_norm": 114.0041732788086, "learning_rate": 7.574835109081685e-06, "loss": 2.7305, "step": 3486 }, { "epoch": 3.184474885844749, "grad_norm": 90.5605697631836, "learning_rate": 7.573820395738205e-06, "loss": 0.9947, "step": 3487 }, { "epoch": 3.1853881278538814, "grad_norm": 0.5265812277793884, "learning_rate": 7.572805682394724e-06, "loss": 0.0034, "step": 3488 }, { "epoch": 3.186301369863014, "grad_norm": 39.349063873291016, "learning_rate": 7.571790969051244e-06, "loss": 0.3238, "step": 3489 }, { "epoch": 3.1872146118721463, "grad_norm": 95.1288833618164, "learning_rate": 7.570776255707763e-06, "loss": 2.9176, "step": 3490 }, { "epoch": 3.1881278538812787, "grad_norm": 0.5886423587799072, "learning_rate": 7.569761542364282e-06, "loss": 0.0033, "step": 3491 }, { "epoch": 3.1890410958904107, "grad_norm": 111.7139892578125, "learning_rate": 7.568746829020802e-06, "loss": 1.421, "step": 3492 }, { "epoch": 3.189954337899543, "grad_norm": 2.7591710090637207, "learning_rate": 7.567732115677322e-06, "loss": 0.0172, "step": 3493 }, { "epoch": 3.1908675799086756, "grad_norm": 0.36517277359962463, "learning_rate": 7.56671740233384e-06, "loss": 0.0027, "step": 3494 }, { "epoch": 3.191780821917808, "grad_norm": 44.43657302856445, "learning_rate": 7.565702688990361e-06, "loss": 0.4728, "step": 3495 }, { "epoch": 3.1926940639269406, "grad_norm": 24.835594177246094, "learning_rate": 7.564687975646881e-06, "loss": 0.1267, "step": 3496 }, { "epoch": 3.193607305936073, "grad_norm": 0.09052451699972153, "learning_rate": 7.5636732623034e-06, "loss": 0.0007, "step": 3497 }, { "epoch": 3.1945205479452055, "grad_norm": 0.56050705909729, "learning_rate": 7.562658548959919e-06, "loss": 0.003, "step": 3498 }, { "epoch": 3.195433789954338, "grad_norm": 2.644914150238037, "learning_rate": 7.561643835616439e-06, "loss": 0.0198, "step": 3499 }, { "epoch": 3.1963470319634704, "grad_norm": 12.532938003540039, "learning_rate": 7.5606291222729585e-06, "loss": 0.072, "step": 3500 }, { "epoch": 3.197260273972603, "grad_norm": 0.5194905400276184, "learning_rate": 7.559614408929477e-06, "loss": 0.0043, "step": 3501 }, { "epoch": 3.1981735159817353, "grad_norm": 9.281118392944336, "learning_rate": 7.558599695585997e-06, "loss": 0.0685, "step": 3502 }, { "epoch": 3.1990867579908677, "grad_norm": 0.7775249481201172, "learning_rate": 7.557584982242518e-06, "loss": 0.0029, "step": 3503 }, { "epoch": 3.2, "grad_norm": 51.29029846191406, "learning_rate": 7.5565702688990365e-06, "loss": 0.3219, "step": 3504 }, { "epoch": 3.2009132420091326, "grad_norm": 7.156871318817139, "learning_rate": 7.555555555555556e-06, "loss": 0.0304, "step": 3505 }, { "epoch": 3.2018264840182646, "grad_norm": 4.681075096130371, "learning_rate": 7.554540842212076e-06, "loss": 0.0266, "step": 3506 }, { "epoch": 3.202739726027397, "grad_norm": 3.037872076034546, "learning_rate": 7.5535261288685955e-06, "loss": 0.0202, "step": 3507 }, { "epoch": 3.2036529680365295, "grad_norm": 2.131383180618286, "learning_rate": 7.552511415525114e-06, "loss": 0.0176, "step": 3508 }, { "epoch": 3.204566210045662, "grad_norm": 2.6967928409576416, "learning_rate": 7.551496702181634e-06, "loss": 0.0206, "step": 3509 }, { "epoch": 3.2054794520547945, "grad_norm": 2.1517343521118164, "learning_rate": 7.550481988838155e-06, "loss": 0.0195, "step": 3510 }, { "epoch": 3.206392694063927, "grad_norm": 13.098370552062988, "learning_rate": 7.549467275494673e-06, "loss": 0.0827, "step": 3511 }, { "epoch": 3.2073059360730594, "grad_norm": 2.2225804328918457, "learning_rate": 7.548452562151193e-06, "loss": 0.0189, "step": 3512 }, { "epoch": 3.208219178082192, "grad_norm": 2.625025510787964, "learning_rate": 7.547437848807713e-06, "loss": 0.0138, "step": 3513 }, { "epoch": 3.2091324200913243, "grad_norm": 6.537289619445801, "learning_rate": 7.546423135464232e-06, "loss": 0.041, "step": 3514 }, { "epoch": 3.2100456621004567, "grad_norm": 57.94194412231445, "learning_rate": 7.545408422120751e-06, "loss": 0.5022, "step": 3515 }, { "epoch": 3.210958904109589, "grad_norm": 0.10651525110006332, "learning_rate": 7.544393708777271e-06, "loss": 0.0007, "step": 3516 }, { "epoch": 3.2118721461187216, "grad_norm": 5.453615665435791, "learning_rate": 7.543378995433791e-06, "loss": 0.0343, "step": 3517 }, { "epoch": 3.212785388127854, "grad_norm": 1.5613294839859009, "learning_rate": 7.54236428209031e-06, "loss": 0.0104, "step": 3518 }, { "epoch": 3.213698630136986, "grad_norm": 82.3962173461914, "learning_rate": 7.541349568746829e-06, "loss": 2.5313, "step": 3519 }, { "epoch": 3.2146118721461185, "grad_norm": 9.251988410949707, "learning_rate": 7.54033485540335e-06, "loss": 0.0463, "step": 3520 }, { "epoch": 3.215525114155251, "grad_norm": 10.359053611755371, "learning_rate": 7.539320142059869e-06, "loss": 0.0779, "step": 3521 }, { "epoch": 3.2164383561643834, "grad_norm": 28.157611846923828, "learning_rate": 7.538305428716388e-06, "loss": 0.1814, "step": 3522 }, { "epoch": 3.217351598173516, "grad_norm": 10.265131950378418, "learning_rate": 7.537290715372908e-06, "loss": 0.0714, "step": 3523 }, { "epoch": 3.2182648401826484, "grad_norm": 5.77241325378418, "learning_rate": 7.536276002029427e-06, "loss": 0.0288, "step": 3524 }, { "epoch": 3.219178082191781, "grad_norm": 15.5018949508667, "learning_rate": 7.535261288685947e-06, "loss": 0.0962, "step": 3525 }, { "epoch": 3.2200913242009133, "grad_norm": 31.9439640045166, "learning_rate": 7.534246575342466e-06, "loss": 0.2689, "step": 3526 }, { "epoch": 3.2210045662100457, "grad_norm": 15.69792652130127, "learning_rate": 7.533231861998986e-06, "loss": 0.1046, "step": 3527 }, { "epoch": 3.221917808219178, "grad_norm": 15.611227035522461, "learning_rate": 7.532217148655505e-06, "loss": 0.0807, "step": 3528 }, { "epoch": 3.2228310502283106, "grad_norm": 58.35602951049805, "learning_rate": 7.531202435312025e-06, "loss": 0.2561, "step": 3529 }, { "epoch": 3.223744292237443, "grad_norm": 36.75550079345703, "learning_rate": 7.530187721968545e-06, "loss": 0.4174, "step": 3530 }, { "epoch": 3.2246575342465755, "grad_norm": 0.9545445442199707, "learning_rate": 7.529173008625064e-06, "loss": 0.0049, "step": 3531 }, { "epoch": 3.225570776255708, "grad_norm": 9.688504219055176, "learning_rate": 7.528158295281584e-06, "loss": 0.0669, "step": 3532 }, { "epoch": 3.22648401826484, "grad_norm": 55.56958770751953, "learning_rate": 7.527143581938103e-06, "loss": 0.3597, "step": 3533 }, { "epoch": 3.2273972602739724, "grad_norm": 4.0988688468933105, "learning_rate": 7.526128868594622e-06, "loss": 0.0178, "step": 3534 }, { "epoch": 3.228310502283105, "grad_norm": 69.23278045654297, "learning_rate": 7.525114155251142e-06, "loss": 0.676, "step": 3535 }, { "epoch": 3.2292237442922374, "grad_norm": 118.35858154296875, "learning_rate": 7.5240994419076615e-06, "loss": 2.5454, "step": 3536 }, { "epoch": 3.23013698630137, "grad_norm": 8.668741226196289, "learning_rate": 7.523084728564182e-06, "loss": 0.086, "step": 3537 }, { "epoch": 3.2310502283105023, "grad_norm": 28.18177604675293, "learning_rate": 7.5220700152207e-06, "loss": 0.1054, "step": 3538 }, { "epoch": 3.2319634703196347, "grad_norm": 8.704841613769531, "learning_rate": 7.521055301877221e-06, "loss": 0.0731, "step": 3539 }, { "epoch": 3.232876712328767, "grad_norm": 3.4462897777557373, "learning_rate": 7.52004058853374e-06, "loss": 0.018, "step": 3540 }, { "epoch": 3.2337899543378996, "grad_norm": 2.121567964553833, "learning_rate": 7.519025875190259e-06, "loss": 0.0137, "step": 3541 }, { "epoch": 3.234703196347032, "grad_norm": 2.4170544147491455, "learning_rate": 7.518011161846779e-06, "loss": 0.0126, "step": 3542 }, { "epoch": 3.2356164383561645, "grad_norm": 38.89559555053711, "learning_rate": 7.5169964485032985e-06, "loss": 0.3484, "step": 3543 }, { "epoch": 3.236529680365297, "grad_norm": 1.1219946146011353, "learning_rate": 7.515981735159817e-06, "loss": 0.0057, "step": 3544 }, { "epoch": 3.237442922374429, "grad_norm": 7.997583866119385, "learning_rate": 7.514967021816337e-06, "loss": 0.0459, "step": 3545 }, { "epoch": 3.2383561643835614, "grad_norm": 1.429696798324585, "learning_rate": 7.513952308472857e-06, "loss": 0.0108, "step": 3546 }, { "epoch": 3.239269406392694, "grad_norm": 95.25442504882812, "learning_rate": 7.512937595129377e-06, "loss": 0.8496, "step": 3547 }, { "epoch": 3.2401826484018263, "grad_norm": 59.93385696411133, "learning_rate": 7.511922881785896e-06, "loss": 0.4321, "step": 3548 }, { "epoch": 3.241095890410959, "grad_norm": 3.842982530593872, "learning_rate": 7.510908168442416e-06, "loss": 0.0192, "step": 3549 }, { "epoch": 3.2420091324200913, "grad_norm": 4.096563816070557, "learning_rate": 7.5098934550989355e-06, "loss": 0.0274, "step": 3550 }, { "epoch": 3.2429223744292237, "grad_norm": 9.120248794555664, "learning_rate": 7.508878741755454e-06, "loss": 0.0742, "step": 3551 }, { "epoch": 3.243835616438356, "grad_norm": 0.22175101935863495, "learning_rate": 7.507864028411974e-06, "loss": 0.0014, "step": 3552 }, { "epoch": 3.2447488584474886, "grad_norm": 2.0687825679779053, "learning_rate": 7.506849315068494e-06, "loss": 0.014, "step": 3553 }, { "epoch": 3.245662100456621, "grad_norm": 16.917144775390625, "learning_rate": 7.505834601725013e-06, "loss": 0.1451, "step": 3554 }, { "epoch": 3.2465753424657535, "grad_norm": 0.33887720108032227, "learning_rate": 7.504819888381532e-06, "loss": 0.0021, "step": 3555 }, { "epoch": 3.247488584474886, "grad_norm": 7.5532121658325195, "learning_rate": 7.503805175038053e-06, "loss": 0.0458, "step": 3556 }, { "epoch": 3.2484018264840184, "grad_norm": 23.372941970825195, "learning_rate": 7.5027904616945725e-06, "loss": 0.1606, "step": 3557 }, { "epoch": 3.249315068493151, "grad_norm": 8.38725471496582, "learning_rate": 7.501775748351091e-06, "loss": 0.0685, "step": 3558 }, { "epoch": 3.2502283105022833, "grad_norm": 3.6604340076446533, "learning_rate": 7.500761035007611e-06, "loss": 0.0301, "step": 3559 }, { "epoch": 3.2511415525114153, "grad_norm": 0.46645650267601013, "learning_rate": 7.499746321664131e-06, "loss": 0.0025, "step": 3560 }, { "epoch": 3.252054794520548, "grad_norm": 0.6778218150138855, "learning_rate": 7.49873160832065e-06, "loss": 0.0031, "step": 3561 }, { "epoch": 3.2529680365296803, "grad_norm": 3.879269599914551, "learning_rate": 7.497716894977169e-06, "loss": 0.0341, "step": 3562 }, { "epoch": 3.2538812785388127, "grad_norm": 73.10774993896484, "learning_rate": 7.496702181633689e-06, "loss": 0.4389, "step": 3563 }, { "epoch": 3.254794520547945, "grad_norm": 0.11257393658161163, "learning_rate": 7.495687468290208e-06, "loss": 0.0008, "step": 3564 }, { "epoch": 3.2557077625570776, "grad_norm": 1.8661631345748901, "learning_rate": 7.494672754946728e-06, "loss": 0.0115, "step": 3565 }, { "epoch": 3.25662100456621, "grad_norm": 3.1486246585845947, "learning_rate": 7.493658041603248e-06, "loss": 0.0164, "step": 3566 }, { "epoch": 3.2575342465753425, "grad_norm": 2.7382571697235107, "learning_rate": 7.492643328259768e-06, "loss": 0.012, "step": 3567 }, { "epoch": 3.258447488584475, "grad_norm": 3.695483684539795, "learning_rate": 7.491628614916287e-06, "loss": 0.0274, "step": 3568 }, { "epoch": 3.2593607305936074, "grad_norm": 16.366336822509766, "learning_rate": 7.490613901572806e-06, "loss": 0.0819, "step": 3569 }, { "epoch": 3.26027397260274, "grad_norm": 0.09189748764038086, "learning_rate": 7.489599188229326e-06, "loss": 0.0004, "step": 3570 }, { "epoch": 3.2611872146118723, "grad_norm": 0.06946808844804764, "learning_rate": 7.488584474885845e-06, "loss": 0.0007, "step": 3571 }, { "epoch": 3.2621004566210043, "grad_norm": 1.8418506383895874, "learning_rate": 7.4875697615423645e-06, "loss": 0.0098, "step": 3572 }, { "epoch": 3.263013698630137, "grad_norm": 1.632932424545288, "learning_rate": 7.486555048198885e-06, "loss": 0.0108, "step": 3573 }, { "epoch": 3.2639269406392692, "grad_norm": 4.530028820037842, "learning_rate": 7.485540334855403e-06, "loss": 0.0379, "step": 3574 }, { "epoch": 3.2648401826484017, "grad_norm": 2.934908866882324, "learning_rate": 7.484525621511924e-06, "loss": 0.0247, "step": 3575 }, { "epoch": 3.265753424657534, "grad_norm": 48.9134521484375, "learning_rate": 7.483510908168443e-06, "loss": 0.4224, "step": 3576 }, { "epoch": 3.2666666666666666, "grad_norm": 4.9484663009643555, "learning_rate": 7.482496194824963e-06, "loss": 0.0235, "step": 3577 }, { "epoch": 3.267579908675799, "grad_norm": 4.823935031890869, "learning_rate": 7.481481481481482e-06, "loss": 0.0381, "step": 3578 }, { "epoch": 3.2684931506849315, "grad_norm": 4.736446380615234, "learning_rate": 7.4804667681380015e-06, "loss": 0.0277, "step": 3579 }, { "epoch": 3.269406392694064, "grad_norm": 7.324808597564697, "learning_rate": 7.479452054794521e-06, "loss": 0.0396, "step": 3580 }, { "epoch": 3.2703196347031964, "grad_norm": 67.70584869384766, "learning_rate": 7.47843734145104e-06, "loss": 0.6478, "step": 3581 }, { "epoch": 3.271232876712329, "grad_norm": 1.1237562894821167, "learning_rate": 7.47742262810756e-06, "loss": 0.0094, "step": 3582 }, { "epoch": 3.2721461187214613, "grad_norm": 8.338029861450195, "learning_rate": 7.47640791476408e-06, "loss": 0.0405, "step": 3583 }, { "epoch": 3.273059360730594, "grad_norm": 8.668842315673828, "learning_rate": 7.475393201420599e-06, "loss": 0.0507, "step": 3584 }, { "epoch": 3.2739726027397262, "grad_norm": 22.566940307617188, "learning_rate": 7.474378488077119e-06, "loss": 0.157, "step": 3585 }, { "epoch": 3.2748858447488587, "grad_norm": 3.638829231262207, "learning_rate": 7.4733637747336385e-06, "loss": 0.0196, "step": 3586 }, { "epoch": 3.2757990867579907, "grad_norm": 1.1304469108581543, "learning_rate": 7.472349061390158e-06, "loss": 0.009, "step": 3587 }, { "epoch": 3.276712328767123, "grad_norm": 10.080090522766113, "learning_rate": 7.471334348046677e-06, "loss": 0.0475, "step": 3588 }, { "epoch": 3.2776255707762556, "grad_norm": 1.8613121509552002, "learning_rate": 7.470319634703197e-06, "loss": 0.0104, "step": 3589 }, { "epoch": 3.278538812785388, "grad_norm": 12.346779823303223, "learning_rate": 7.469304921359717e-06, "loss": 0.0715, "step": 3590 }, { "epoch": 3.2794520547945205, "grad_norm": 59.0296745300293, "learning_rate": 7.468290208016235e-06, "loss": 0.1982, "step": 3591 }, { "epoch": 3.280365296803653, "grad_norm": 0.6687883138656616, "learning_rate": 7.467275494672756e-06, "loss": 0.0058, "step": 3592 }, { "epoch": 3.2812785388127854, "grad_norm": 6.626013278961182, "learning_rate": 7.4662607813292755e-06, "loss": 0.0449, "step": 3593 }, { "epoch": 3.282191780821918, "grad_norm": 0.028767826035618782, "learning_rate": 7.465246067985794e-06, "loss": 0.0002, "step": 3594 }, { "epoch": 3.2831050228310503, "grad_norm": 1.1557378768920898, "learning_rate": 7.464231354642314e-06, "loss": 0.0064, "step": 3595 }, { "epoch": 3.2840182648401828, "grad_norm": 5.200647354125977, "learning_rate": 7.463216641298834e-06, "loss": 0.0358, "step": 3596 }, { "epoch": 3.2849315068493152, "grad_norm": 13.301665306091309, "learning_rate": 7.4622019279553534e-06, "loss": 0.0893, "step": 3597 }, { "epoch": 3.2858447488584472, "grad_norm": 71.97579193115234, "learning_rate": 7.461187214611872e-06, "loss": 1.6822, "step": 3598 }, { "epoch": 3.2867579908675797, "grad_norm": 118.28839111328125, "learning_rate": 7.460172501268392e-06, "loss": 2.8238, "step": 3599 }, { "epoch": 3.287671232876712, "grad_norm": 1.9661259651184082, "learning_rate": 7.4591577879249125e-06, "loss": 0.0085, "step": 3600 }, { "epoch": 3.2885844748858446, "grad_norm": 2.694995880126953, "learning_rate": 7.458143074581431e-06, "loss": 0.0239, "step": 3601 }, { "epoch": 3.289497716894977, "grad_norm": 36.63172149658203, "learning_rate": 7.457128361237951e-06, "loss": 0.0612, "step": 3602 }, { "epoch": 3.2904109589041095, "grad_norm": 0.6868838667869568, "learning_rate": 7.456113647894471e-06, "loss": 0.0047, "step": 3603 }, { "epoch": 3.291324200913242, "grad_norm": 25.743389129638672, "learning_rate": 7.4550989345509896e-06, "loss": 0.1955, "step": 3604 }, { "epoch": 3.2922374429223744, "grad_norm": 8.271951675415039, "learning_rate": 7.454084221207509e-06, "loss": 0.0454, "step": 3605 }, { "epoch": 3.293150684931507, "grad_norm": 9.31554889678955, "learning_rate": 7.453069507864029e-06, "loss": 0.0566, "step": 3606 }, { "epoch": 3.2940639269406393, "grad_norm": 7.554903030395508, "learning_rate": 7.452054794520549e-06, "loss": 0.0533, "step": 3607 }, { "epoch": 3.2949771689497718, "grad_norm": 99.0049819946289, "learning_rate": 7.4510400811770675e-06, "loss": 2.0645, "step": 3608 }, { "epoch": 3.2958904109589042, "grad_norm": 1.584564208984375, "learning_rate": 7.450025367833588e-06, "loss": 0.0092, "step": 3609 }, { "epoch": 3.2968036529680367, "grad_norm": 45.867794036865234, "learning_rate": 7.449010654490108e-06, "loss": 0.3754, "step": 3610 }, { "epoch": 3.297716894977169, "grad_norm": 41.71500015258789, "learning_rate": 7.4479959411466266e-06, "loss": 0.3786, "step": 3611 }, { "epoch": 3.2986301369863016, "grad_norm": 0.13717322051525116, "learning_rate": 7.446981227803146e-06, "loss": 0.0008, "step": 3612 }, { "epoch": 3.2995433789954336, "grad_norm": 4.00123405456543, "learning_rate": 7.445966514459666e-06, "loss": 0.0281, "step": 3613 }, { "epoch": 3.300456621004566, "grad_norm": 0.569036066532135, "learning_rate": 7.444951801116185e-06, "loss": 0.0029, "step": 3614 }, { "epoch": 3.3013698630136985, "grad_norm": 122.89127349853516, "learning_rate": 7.4439370877727045e-06, "loss": 3.4961, "step": 3615 }, { "epoch": 3.302283105022831, "grad_norm": 2.006329298019409, "learning_rate": 7.442922374429224e-06, "loss": 0.021, "step": 3616 }, { "epoch": 3.3031963470319634, "grad_norm": 0.2531399428844452, "learning_rate": 7.441907661085745e-06, "loss": 0.0018, "step": 3617 }, { "epoch": 3.304109589041096, "grad_norm": 66.33006286621094, "learning_rate": 7.440892947742263e-06, "loss": 0.4545, "step": 3618 }, { "epoch": 3.3050228310502283, "grad_norm": 11.680542945861816, "learning_rate": 7.439878234398783e-06, "loss": 0.0703, "step": 3619 }, { "epoch": 3.3059360730593608, "grad_norm": 4.659170150756836, "learning_rate": 7.438863521055303e-06, "loss": 0.0338, "step": 3620 }, { "epoch": 3.3068493150684932, "grad_norm": 23.45325469970703, "learning_rate": 7.437848807711822e-06, "loss": 0.154, "step": 3621 }, { "epoch": 3.3077625570776257, "grad_norm": 20.31644058227539, "learning_rate": 7.4368340943683415e-06, "loss": 0.1365, "step": 3622 }, { "epoch": 3.308675799086758, "grad_norm": 34.741241455078125, "learning_rate": 7.435819381024861e-06, "loss": 0.2722, "step": 3623 }, { "epoch": 3.3095890410958906, "grad_norm": 11.815503120422363, "learning_rate": 7.43480466768138e-06, "loss": 0.1061, "step": 3624 }, { "epoch": 3.3105022831050226, "grad_norm": 10.324026107788086, "learning_rate": 7.4337899543379e-06, "loss": 0.0865, "step": 3625 }, { "epoch": 3.311415525114155, "grad_norm": 2.863905906677246, "learning_rate": 7.432775240994419e-06, "loss": 0.0201, "step": 3626 }, { "epoch": 3.3123287671232875, "grad_norm": 0.847305953502655, "learning_rate": 7.43176052765094e-06, "loss": 0.0052, "step": 3627 }, { "epoch": 3.31324200913242, "grad_norm": 76.23734283447266, "learning_rate": 7.430745814307459e-06, "loss": 0.9466, "step": 3628 }, { "epoch": 3.3141552511415524, "grad_norm": 0.38884034752845764, "learning_rate": 7.4297311009639785e-06, "loss": 0.0031, "step": 3629 }, { "epoch": 3.315068493150685, "grad_norm": 50.440635681152344, "learning_rate": 7.428716387620498e-06, "loss": 0.3549, "step": 3630 }, { "epoch": 3.3159817351598173, "grad_norm": 0.19988805055618286, "learning_rate": 7.427701674277017e-06, "loss": 0.0009, "step": 3631 }, { "epoch": 3.3168949771689498, "grad_norm": 1.9710009098052979, "learning_rate": 7.426686960933537e-06, "loss": 0.0152, "step": 3632 }, { "epoch": 3.317808219178082, "grad_norm": 16.49820899963379, "learning_rate": 7.425672247590056e-06, "loss": 0.1418, "step": 3633 }, { "epoch": 3.3187214611872147, "grad_norm": 21.574296951293945, "learning_rate": 7.424657534246575e-06, "loss": 0.0853, "step": 3634 }, { "epoch": 3.319634703196347, "grad_norm": 0.4194672107696533, "learning_rate": 7.423642820903095e-06, "loss": 0.0031, "step": 3635 }, { "epoch": 3.3205479452054796, "grad_norm": 8.465167999267578, "learning_rate": 7.4226281075596155e-06, "loss": 0.0708, "step": 3636 }, { "epoch": 3.321461187214612, "grad_norm": 1.5175652503967285, "learning_rate": 7.421613394216135e-06, "loss": 0.0099, "step": 3637 }, { "epoch": 3.3223744292237445, "grad_norm": 0.6712175011634827, "learning_rate": 7.420598680872654e-06, "loss": 0.004, "step": 3638 }, { "epoch": 3.323287671232877, "grad_norm": 37.79096603393555, "learning_rate": 7.419583967529174e-06, "loss": 0.283, "step": 3639 }, { "epoch": 3.324200913242009, "grad_norm": 5.537343502044678, "learning_rate": 7.418569254185693e-06, "loss": 0.0341, "step": 3640 }, { "epoch": 3.3251141552511414, "grad_norm": 20.27739906311035, "learning_rate": 7.417554540842212e-06, "loss": 0.081, "step": 3641 }, { "epoch": 3.326027397260274, "grad_norm": 10.24923324584961, "learning_rate": 7.416539827498732e-06, "loss": 0.069, "step": 3642 }, { "epoch": 3.3269406392694063, "grad_norm": 72.85472869873047, "learning_rate": 7.415525114155252e-06, "loss": 0.88, "step": 3643 }, { "epoch": 3.3278538812785388, "grad_norm": 1.5507358312606812, "learning_rate": 7.4145104008117705e-06, "loss": 0.0098, "step": 3644 }, { "epoch": 3.328767123287671, "grad_norm": 7.623997211456299, "learning_rate": 7.413495687468291e-06, "loss": 0.0558, "step": 3645 }, { "epoch": 3.3296803652968037, "grad_norm": 21.25470542907715, "learning_rate": 7.412480974124811e-06, "loss": 0.0796, "step": 3646 }, { "epoch": 3.330593607305936, "grad_norm": 1.2044024467468262, "learning_rate": 7.41146626078133e-06, "loss": 0.0072, "step": 3647 }, { "epoch": 3.3315068493150686, "grad_norm": 2.169628143310547, "learning_rate": 7.410451547437849e-06, "loss": 0.0145, "step": 3648 }, { "epoch": 3.332420091324201, "grad_norm": 0.7470640540122986, "learning_rate": 7.409436834094369e-06, "loss": 0.0063, "step": 3649 }, { "epoch": 3.3333333333333335, "grad_norm": 2.755319833755493, "learning_rate": 7.408422120750889e-06, "loss": 0.0199, "step": 3650 }, { "epoch": 3.334246575342466, "grad_norm": 59.14192199707031, "learning_rate": 7.4074074074074075e-06, "loss": 0.3558, "step": 3651 }, { "epoch": 3.335159817351598, "grad_norm": 45.19364547729492, "learning_rate": 7.406392694063927e-06, "loss": 0.2812, "step": 3652 }, { "epoch": 3.3360730593607304, "grad_norm": 0.39767828583717346, "learning_rate": 7.405377980720448e-06, "loss": 0.0023, "step": 3653 }, { "epoch": 3.336986301369863, "grad_norm": 1.6238713264465332, "learning_rate": 7.404363267376966e-06, "loss": 0.01, "step": 3654 }, { "epoch": 3.3378995433789953, "grad_norm": 7.021391868591309, "learning_rate": 7.403348554033486e-06, "loss": 0.0559, "step": 3655 }, { "epoch": 3.3388127853881278, "grad_norm": 0.4394574463367462, "learning_rate": 7.402333840690006e-06, "loss": 0.0028, "step": 3656 }, { "epoch": 3.33972602739726, "grad_norm": 9.454804420471191, "learning_rate": 7.401319127346526e-06, "loss": 0.0667, "step": 3657 }, { "epoch": 3.3406392694063927, "grad_norm": 0.2720225155353546, "learning_rate": 7.4003044140030445e-06, "loss": 0.0025, "step": 3658 }, { "epoch": 3.341552511415525, "grad_norm": 0.14381174743175507, "learning_rate": 7.399289700659564e-06, "loss": 0.001, "step": 3659 }, { "epoch": 3.3424657534246576, "grad_norm": 10.981610298156738, "learning_rate": 7.398274987316084e-06, "loss": 0.0835, "step": 3660 }, { "epoch": 3.34337899543379, "grad_norm": 3.955305576324463, "learning_rate": 7.397260273972603e-06, "loss": 0.0242, "step": 3661 }, { "epoch": 3.3442922374429225, "grad_norm": 14.822775840759277, "learning_rate": 7.396245560629122e-06, "loss": 0.1174, "step": 3662 }, { "epoch": 3.345205479452055, "grad_norm": 31.049524307250977, "learning_rate": 7.395230847285643e-06, "loss": 0.2556, "step": 3663 }, { "epoch": 3.3461187214611874, "grad_norm": 11.201703071594238, "learning_rate": 7.394216133942162e-06, "loss": 0.079, "step": 3664 }, { "epoch": 3.34703196347032, "grad_norm": 9.533329963684082, "learning_rate": 7.3932014205986815e-06, "loss": 0.0679, "step": 3665 }, { "epoch": 3.3479452054794523, "grad_norm": 31.638357162475586, "learning_rate": 7.392186707255201e-06, "loss": 0.1876, "step": 3666 }, { "epoch": 3.3488584474885843, "grad_norm": 0.17661525309085846, "learning_rate": 7.391171993911721e-06, "loss": 0.0012, "step": 3667 }, { "epoch": 3.3497716894977168, "grad_norm": 21.32923698425293, "learning_rate": 7.39015728056824e-06, "loss": 0.1969, "step": 3668 }, { "epoch": 3.350684931506849, "grad_norm": 9.739097595214844, "learning_rate": 7.389142567224759e-06, "loss": 0.0623, "step": 3669 }, { "epoch": 3.3515981735159817, "grad_norm": 86.26960754394531, "learning_rate": 7.388127853881279e-06, "loss": 1.5363, "step": 3670 }, { "epoch": 3.352511415525114, "grad_norm": 57.81304931640625, "learning_rate": 7.387113140537798e-06, "loss": 0.479, "step": 3671 }, { "epoch": 3.3534246575342466, "grad_norm": 15.844071388244629, "learning_rate": 7.3860984271943185e-06, "loss": 0.1192, "step": 3672 }, { "epoch": 3.354337899543379, "grad_norm": 1.984391689300537, "learning_rate": 7.385083713850838e-06, "loss": 0.0125, "step": 3673 }, { "epoch": 3.3552511415525115, "grad_norm": 14.122801780700684, "learning_rate": 7.384069000507357e-06, "loss": 0.1053, "step": 3674 }, { "epoch": 3.356164383561644, "grad_norm": 3.030710458755493, "learning_rate": 7.383054287163877e-06, "loss": 0.0256, "step": 3675 }, { "epoch": 3.3570776255707764, "grad_norm": 4.146261215209961, "learning_rate": 7.382039573820396e-06, "loss": 0.03, "step": 3676 }, { "epoch": 3.357990867579909, "grad_norm": 2.7591731548309326, "learning_rate": 7.381024860476916e-06, "loss": 0.0269, "step": 3677 }, { "epoch": 3.3589041095890413, "grad_norm": 1.042052149772644, "learning_rate": 7.380010147133435e-06, "loss": 0.0047, "step": 3678 }, { "epoch": 3.3598173515981733, "grad_norm": 8.259452819824219, "learning_rate": 7.378995433789955e-06, "loss": 0.0507, "step": 3679 }, { "epoch": 3.3607305936073057, "grad_norm": 104.76571655273438, "learning_rate": 7.377980720446475e-06, "loss": 5.6542, "step": 3680 }, { "epoch": 3.361643835616438, "grad_norm": 5.870633602142334, "learning_rate": 7.376966007102994e-06, "loss": 0.0482, "step": 3681 }, { "epoch": 3.3625570776255707, "grad_norm": 13.612985610961914, "learning_rate": 7.375951293759514e-06, "loss": 0.0888, "step": 3682 }, { "epoch": 3.363470319634703, "grad_norm": 0.07719674706459045, "learning_rate": 7.374936580416033e-06, "loss": 0.0006, "step": 3683 }, { "epoch": 3.3643835616438356, "grad_norm": 4.0553178787231445, "learning_rate": 7.373921867072552e-06, "loss": 0.024, "step": 3684 }, { "epoch": 3.365296803652968, "grad_norm": 27.186477661132812, "learning_rate": 7.372907153729072e-06, "loss": 0.1758, "step": 3685 }, { "epoch": 3.3662100456621005, "grad_norm": 16.144926071166992, "learning_rate": 7.371892440385592e-06, "loss": 0.1057, "step": 3686 }, { "epoch": 3.367123287671233, "grad_norm": 15.404406547546387, "learning_rate": 7.370877727042111e-06, "loss": 0.0749, "step": 3687 }, { "epoch": 3.3680365296803654, "grad_norm": 89.45437622070312, "learning_rate": 7.36986301369863e-06, "loss": 1.7859, "step": 3688 }, { "epoch": 3.368949771689498, "grad_norm": 1.5996657609939575, "learning_rate": 7.368848300355151e-06, "loss": 0.01, "step": 3689 }, { "epoch": 3.3698630136986303, "grad_norm": 67.76899719238281, "learning_rate": 7.36783358701167e-06, "loss": 0.4006, "step": 3690 }, { "epoch": 3.3707762557077627, "grad_norm": 3.0847675800323486, "learning_rate": 7.366818873668189e-06, "loss": 0.019, "step": 3691 }, { "epoch": 3.371689497716895, "grad_norm": 14.615703582763672, "learning_rate": 7.365804160324709e-06, "loss": 0.0848, "step": 3692 }, { "epoch": 3.3726027397260276, "grad_norm": 4.289705276489258, "learning_rate": 7.364789446981229e-06, "loss": 0.0246, "step": 3693 }, { "epoch": 3.3735159817351597, "grad_norm": 9.04980182647705, "learning_rate": 7.3637747336377475e-06, "loss": 0.0651, "step": 3694 }, { "epoch": 3.374429223744292, "grad_norm": 7.061894416809082, "learning_rate": 7.362760020294267e-06, "loss": 0.0844, "step": 3695 }, { "epoch": 3.3753424657534246, "grad_norm": 9.826672554016113, "learning_rate": 7.361745306950787e-06, "loss": 0.0542, "step": 3696 }, { "epoch": 3.376255707762557, "grad_norm": 60.5388069152832, "learning_rate": 7.360730593607307e-06, "loss": 0.7713, "step": 3697 }, { "epoch": 3.3771689497716895, "grad_norm": 0.3341805934906006, "learning_rate": 7.359715880263825e-06, "loss": 0.0018, "step": 3698 }, { "epoch": 3.378082191780822, "grad_norm": 1.0581783056259155, "learning_rate": 7.358701166920346e-06, "loss": 0.0087, "step": 3699 }, { "epoch": 3.3789954337899544, "grad_norm": 2.2726218700408936, "learning_rate": 7.357686453576866e-06, "loss": 0.0159, "step": 3700 }, { "epoch": 3.379908675799087, "grad_norm": 81.34866333007812, "learning_rate": 7.3566717402333845e-06, "loss": 3.441, "step": 3701 }, { "epoch": 3.3808219178082193, "grad_norm": 1.4125254154205322, "learning_rate": 7.355657026889904e-06, "loss": 0.0067, "step": 3702 }, { "epoch": 3.3817351598173517, "grad_norm": 0.4240553379058838, "learning_rate": 7.354642313546424e-06, "loss": 0.0032, "step": 3703 }, { "epoch": 3.382648401826484, "grad_norm": 8.527451515197754, "learning_rate": 7.353627600202943e-06, "loss": 0.0613, "step": 3704 }, { "epoch": 3.383561643835616, "grad_norm": 15.292994499206543, "learning_rate": 7.352612886859462e-06, "loss": 0.1285, "step": 3705 }, { "epoch": 3.3844748858447486, "grad_norm": 7.816246032714844, "learning_rate": 7.351598173515982e-06, "loss": 0.0294, "step": 3706 }, { "epoch": 3.385388127853881, "grad_norm": 28.867033004760742, "learning_rate": 7.350583460172503e-06, "loss": 0.3433, "step": 3707 }, { "epoch": 3.3863013698630136, "grad_norm": 7.572982311248779, "learning_rate": 7.3495687468290215e-06, "loss": 0.037, "step": 3708 }, { "epoch": 3.387214611872146, "grad_norm": 6.796749591827393, "learning_rate": 7.348554033485541e-06, "loss": 0.0451, "step": 3709 }, { "epoch": 3.3881278538812785, "grad_norm": 1.0572062730789185, "learning_rate": 7.347539320142061e-06, "loss": 0.0078, "step": 3710 }, { "epoch": 3.389041095890411, "grad_norm": 7.695361137390137, "learning_rate": 7.34652460679858e-06, "loss": 0.0481, "step": 3711 }, { "epoch": 3.3899543378995434, "grad_norm": 5.335010528564453, "learning_rate": 7.345509893455099e-06, "loss": 0.0429, "step": 3712 }, { "epoch": 3.390867579908676, "grad_norm": 25.611835479736328, "learning_rate": 7.344495180111619e-06, "loss": 0.1293, "step": 3713 }, { "epoch": 3.3917808219178083, "grad_norm": 30.549577713012695, "learning_rate": 7.343480466768138e-06, "loss": 0.1488, "step": 3714 }, { "epoch": 3.3926940639269407, "grad_norm": 37.36494445800781, "learning_rate": 7.342465753424658e-06, "loss": 0.2808, "step": 3715 }, { "epoch": 3.393607305936073, "grad_norm": 79.72049713134766, "learning_rate": 7.341451040081178e-06, "loss": 1.2684, "step": 3716 }, { "epoch": 3.3945205479452056, "grad_norm": 38.158241271972656, "learning_rate": 7.340436326737698e-06, "loss": 0.4367, "step": 3717 }, { "epoch": 3.395433789954338, "grad_norm": 0.2716071307659149, "learning_rate": 7.339421613394217e-06, "loss": 0.0021, "step": 3718 }, { "epoch": 3.3963470319634705, "grad_norm": 4.02541971206665, "learning_rate": 7.338406900050736e-06, "loss": 0.0392, "step": 3719 }, { "epoch": 3.3972602739726026, "grad_norm": 0.5440673828125, "learning_rate": 7.337392186707256e-06, "loss": 0.0041, "step": 3720 }, { "epoch": 3.398173515981735, "grad_norm": 105.88372039794922, "learning_rate": 7.336377473363775e-06, "loss": 3.9295, "step": 3721 }, { "epoch": 3.3990867579908675, "grad_norm": 48.82932662963867, "learning_rate": 7.335362760020295e-06, "loss": 0.5681, "step": 3722 }, { "epoch": 3.4, "grad_norm": 71.87686920166016, "learning_rate": 7.334348046676814e-06, "loss": 0.1088, "step": 3723 }, { "epoch": 3.4009132420091324, "grad_norm": 4.437734603881836, "learning_rate": 7.333333333333333e-06, "loss": 0.0332, "step": 3724 }, { "epoch": 3.401826484018265, "grad_norm": 1.099267840385437, "learning_rate": 7.332318619989854e-06, "loss": 0.0052, "step": 3725 }, { "epoch": 3.4027397260273973, "grad_norm": 16.294675827026367, "learning_rate": 7.331303906646373e-06, "loss": 0.1428, "step": 3726 }, { "epoch": 3.4036529680365297, "grad_norm": 0.42377808690071106, "learning_rate": 7.330289193302893e-06, "loss": 0.0027, "step": 3727 }, { "epoch": 3.404566210045662, "grad_norm": 24.12274932861328, "learning_rate": 7.329274479959412e-06, "loss": 0.2386, "step": 3728 }, { "epoch": 3.4054794520547946, "grad_norm": 1.738348364830017, "learning_rate": 7.328259766615932e-06, "loss": 0.0149, "step": 3729 }, { "epoch": 3.406392694063927, "grad_norm": 1.79509699344635, "learning_rate": 7.327245053272451e-06, "loss": 0.0101, "step": 3730 }, { "epoch": 3.4073059360730595, "grad_norm": 36.639381408691406, "learning_rate": 7.32623033992897e-06, "loss": 0.4039, "step": 3731 }, { "epoch": 3.4082191780821915, "grad_norm": 0.12255147099494934, "learning_rate": 7.32521562658549e-06, "loss": 0.0008, "step": 3732 }, { "epoch": 3.409132420091324, "grad_norm": 3.0694432258605957, "learning_rate": 7.32420091324201e-06, "loss": 0.0222, "step": 3733 }, { "epoch": 3.4100456621004565, "grad_norm": 1.5381029844284058, "learning_rate": 7.323186199898528e-06, "loss": 0.008, "step": 3734 }, { "epoch": 3.410958904109589, "grad_norm": 13.34021282196045, "learning_rate": 7.322171486555049e-06, "loss": 0.1018, "step": 3735 }, { "epoch": 3.4118721461187214, "grad_norm": 47.72376251220703, "learning_rate": 7.321156773211569e-06, "loss": 0.5109, "step": 3736 }, { "epoch": 3.412785388127854, "grad_norm": 25.081588745117188, "learning_rate": 7.320142059868088e-06, "loss": 0.1865, "step": 3737 }, { "epoch": 3.4136986301369863, "grad_norm": 5.9820942878723145, "learning_rate": 7.319127346524607e-06, "loss": 0.0149, "step": 3738 }, { "epoch": 3.4146118721461187, "grad_norm": 0.7937784194946289, "learning_rate": 7.318112633181127e-06, "loss": 0.0051, "step": 3739 }, { "epoch": 3.415525114155251, "grad_norm": 5.752017021179199, "learning_rate": 7.3170979198376465e-06, "loss": 0.041, "step": 3740 }, { "epoch": 3.4164383561643836, "grad_norm": 3.8300938606262207, "learning_rate": 7.316083206494165e-06, "loss": 0.0188, "step": 3741 }, { "epoch": 3.417351598173516, "grad_norm": 10.870438575744629, "learning_rate": 7.315068493150685e-06, "loss": 0.0807, "step": 3742 }, { "epoch": 3.4182648401826485, "grad_norm": 1.2507004737854004, "learning_rate": 7.314053779807206e-06, "loss": 0.0092, "step": 3743 }, { "epoch": 3.419178082191781, "grad_norm": 12.487606048583984, "learning_rate": 7.3130390664637244e-06, "loss": 0.0873, "step": 3744 }, { "epoch": 3.4200913242009134, "grad_norm": 13.040355682373047, "learning_rate": 7.312024353120244e-06, "loss": 0.1259, "step": 3745 }, { "epoch": 3.421004566210046, "grad_norm": 57.24106216430664, "learning_rate": 7.311009639776764e-06, "loss": 0.4166, "step": 3746 }, { "epoch": 3.421917808219178, "grad_norm": 2.307727098464966, "learning_rate": 7.3099949264332835e-06, "loss": 0.0138, "step": 3747 }, { "epoch": 3.4228310502283104, "grad_norm": 87.69284057617188, "learning_rate": 7.308980213089802e-06, "loss": 2.0406, "step": 3748 }, { "epoch": 3.423744292237443, "grad_norm": 36.2689094543457, "learning_rate": 7.307965499746322e-06, "loss": 0.2072, "step": 3749 }, { "epoch": 3.4246575342465753, "grad_norm": 24.426593780517578, "learning_rate": 7.306950786402842e-06, "loss": 0.1641, "step": 3750 }, { "epoch": 3.4255707762557077, "grad_norm": 71.6814956665039, "learning_rate": 7.305936073059361e-06, "loss": 0.9308, "step": 3751 }, { "epoch": 3.42648401826484, "grad_norm": 44.03071975708008, "learning_rate": 7.304921359715881e-06, "loss": 0.2932, "step": 3752 }, { "epoch": 3.4273972602739726, "grad_norm": 45.83868408203125, "learning_rate": 7.303906646372401e-06, "loss": 0.3447, "step": 3753 }, { "epoch": 3.428310502283105, "grad_norm": 101.52962493896484, "learning_rate": 7.30289193302892e-06, "loss": 0.6476, "step": 3754 }, { "epoch": 3.4292237442922375, "grad_norm": 1.0514473915100098, "learning_rate": 7.301877219685439e-06, "loss": 0.0063, "step": 3755 }, { "epoch": 3.43013698630137, "grad_norm": 16.45037841796875, "learning_rate": 7.300862506341959e-06, "loss": 0.1846, "step": 3756 }, { "epoch": 3.4310502283105024, "grad_norm": 7.508545398712158, "learning_rate": 7.299847792998479e-06, "loss": 0.0371, "step": 3757 }, { "epoch": 3.431963470319635, "grad_norm": 1.3728002309799194, "learning_rate": 7.298833079654998e-06, "loss": 0.0113, "step": 3758 }, { "epoch": 3.432876712328767, "grad_norm": 12.84211540222168, "learning_rate": 7.297818366311517e-06, "loss": 0.0602, "step": 3759 }, { "epoch": 3.4337899543378994, "grad_norm": 15.577217102050781, "learning_rate": 7.296803652968038e-06, "loss": 0.1048, "step": 3760 }, { "epoch": 3.434703196347032, "grad_norm": 1.1227039098739624, "learning_rate": 7.295788939624557e-06, "loss": 0.0081, "step": 3761 }, { "epoch": 3.4356164383561643, "grad_norm": 0.23906098306179047, "learning_rate": 7.294774226281076e-06, "loss": 0.0019, "step": 3762 }, { "epoch": 3.4365296803652967, "grad_norm": 70.1905517578125, "learning_rate": 7.293759512937596e-06, "loss": 0.8704, "step": 3763 }, { "epoch": 3.437442922374429, "grad_norm": 23.16960906982422, "learning_rate": 7.292744799594115e-06, "loss": 0.1762, "step": 3764 }, { "epoch": 3.4383561643835616, "grad_norm": 0.7389726638793945, "learning_rate": 7.291730086250635e-06, "loss": 0.0034, "step": 3765 }, { "epoch": 3.439269406392694, "grad_norm": 11.15634822845459, "learning_rate": 7.290715372907154e-06, "loss": 0.0791, "step": 3766 }, { "epoch": 3.4401826484018265, "grad_norm": 109.9561767578125, "learning_rate": 7.289700659563674e-06, "loss": 0.581, "step": 3767 }, { "epoch": 3.441095890410959, "grad_norm": 5.827457427978516, "learning_rate": 7.288685946220193e-06, "loss": 0.0321, "step": 3768 }, { "epoch": 3.4420091324200914, "grad_norm": 2.065551280975342, "learning_rate": 7.287671232876713e-06, "loss": 0.0151, "step": 3769 }, { "epoch": 3.442922374429224, "grad_norm": 18.731693267822266, "learning_rate": 7.286656519533233e-06, "loss": 0.1046, "step": 3770 }, { "epoch": 3.4438356164383563, "grad_norm": 0.49910464882850647, "learning_rate": 7.285641806189752e-06, "loss": 0.0055, "step": 3771 }, { "epoch": 3.444748858447489, "grad_norm": 40.476802825927734, "learning_rate": 7.284627092846272e-06, "loss": 0.4055, "step": 3772 }, { "epoch": 3.4456621004566212, "grad_norm": 53.264244079589844, "learning_rate": 7.283612379502791e-06, "loss": 0.5943, "step": 3773 }, { "epoch": 3.4465753424657533, "grad_norm": 5.3927764892578125, "learning_rate": 7.28259766615931e-06, "loss": 0.0323, "step": 3774 }, { "epoch": 3.4474885844748857, "grad_norm": 5.707638263702393, "learning_rate": 7.28158295281583e-06, "loss": 0.0116, "step": 3775 }, { "epoch": 3.448401826484018, "grad_norm": 22.957929611206055, "learning_rate": 7.2805682394723495e-06, "loss": 0.1462, "step": 3776 }, { "epoch": 3.4493150684931506, "grad_norm": 8.346916198730469, "learning_rate": 7.27955352612887e-06, "loss": 0.0282, "step": 3777 }, { "epoch": 3.450228310502283, "grad_norm": 9.330735206604004, "learning_rate": 7.278538812785388e-06, "loss": 0.0455, "step": 3778 }, { "epoch": 3.4511415525114155, "grad_norm": 6.985513687133789, "learning_rate": 7.277524099441909e-06, "loss": 0.0449, "step": 3779 }, { "epoch": 3.452054794520548, "grad_norm": 1.7732867002487183, "learning_rate": 7.276509386098428e-06, "loss": 0.0081, "step": 3780 }, { "epoch": 3.4529680365296804, "grad_norm": 45.62754821777344, "learning_rate": 7.275494672754947e-06, "loss": 0.3846, "step": 3781 }, { "epoch": 3.453881278538813, "grad_norm": 1.8106329441070557, "learning_rate": 7.274479959411467e-06, "loss": 0.0147, "step": 3782 }, { "epoch": 3.4547945205479453, "grad_norm": 4.283328533172607, "learning_rate": 7.2734652460679865e-06, "loss": 0.0376, "step": 3783 }, { "epoch": 3.455707762557078, "grad_norm": 3.9953486919403076, "learning_rate": 7.272450532724505e-06, "loss": 0.0311, "step": 3784 }, { "epoch": 3.45662100456621, "grad_norm": 2.6236326694488525, "learning_rate": 7.271435819381025e-06, "loss": 0.0165, "step": 3785 }, { "epoch": 3.4575342465753423, "grad_norm": 0.3535451292991638, "learning_rate": 7.270421106037545e-06, "loss": 0.0023, "step": 3786 }, { "epoch": 3.4584474885844747, "grad_norm": 108.26703643798828, "learning_rate": 7.269406392694065e-06, "loss": 2.1887, "step": 3787 }, { "epoch": 3.459360730593607, "grad_norm": 6.001184463500977, "learning_rate": 7.268391679350584e-06, "loss": 0.0377, "step": 3788 }, { "epoch": 3.4602739726027396, "grad_norm": 3.3301119804382324, "learning_rate": 7.267376966007104e-06, "loss": 0.0161, "step": 3789 }, { "epoch": 3.461187214611872, "grad_norm": 4.809671401977539, "learning_rate": 7.2663622526636235e-06, "loss": 0.0349, "step": 3790 }, { "epoch": 3.4621004566210045, "grad_norm": 1.5231049060821533, "learning_rate": 7.265347539320142e-06, "loss": 0.0088, "step": 3791 }, { "epoch": 3.463013698630137, "grad_norm": 26.014551162719727, "learning_rate": 7.264332825976662e-06, "loss": 0.1855, "step": 3792 }, { "epoch": 3.4639269406392694, "grad_norm": 31.538755416870117, "learning_rate": 7.263318112633182e-06, "loss": 0.344, "step": 3793 }, { "epoch": 3.464840182648402, "grad_norm": 0.7636082172393799, "learning_rate": 7.2623033992897006e-06, "loss": 0.0043, "step": 3794 }, { "epoch": 3.4657534246575343, "grad_norm": 11.433036804199219, "learning_rate": 7.26128868594622e-06, "loss": 0.1196, "step": 3795 }, { "epoch": 3.466666666666667, "grad_norm": 31.810413360595703, "learning_rate": 7.260273972602741e-06, "loss": 0.2695, "step": 3796 }, { "epoch": 3.4675799086757992, "grad_norm": 1.6510900259017944, "learning_rate": 7.2592592592592605e-06, "loss": 0.015, "step": 3797 }, { "epoch": 3.4684931506849317, "grad_norm": 1.203249216079712, "learning_rate": 7.258244545915779e-06, "loss": 0.0076, "step": 3798 }, { "epoch": 3.469406392694064, "grad_norm": 3.8971521854400635, "learning_rate": 7.257229832572299e-06, "loss": 0.0337, "step": 3799 }, { "epoch": 3.470319634703196, "grad_norm": 16.510637283325195, "learning_rate": 7.256215119228819e-06, "loss": 0.1513, "step": 3800 }, { "epoch": 3.4712328767123286, "grad_norm": 15.071669578552246, "learning_rate": 7.2552004058853376e-06, "loss": 0.1428, "step": 3801 }, { "epoch": 3.472146118721461, "grad_norm": 1.644382357597351, "learning_rate": 7.254185692541857e-06, "loss": 0.0112, "step": 3802 }, { "epoch": 3.4730593607305935, "grad_norm": 46.3703498840332, "learning_rate": 7.253170979198377e-06, "loss": 0.6492, "step": 3803 }, { "epoch": 3.473972602739726, "grad_norm": 34.43634796142578, "learning_rate": 7.252156265854896e-06, "loss": 0.1997, "step": 3804 }, { "epoch": 3.4748858447488584, "grad_norm": 3.7406015396118164, "learning_rate": 7.251141552511416e-06, "loss": 0.0193, "step": 3805 }, { "epoch": 3.475799086757991, "grad_norm": 1.3400381803512573, "learning_rate": 7.250126839167936e-06, "loss": 0.0112, "step": 3806 }, { "epoch": 3.4767123287671233, "grad_norm": 0.4806782603263855, "learning_rate": 7.249112125824456e-06, "loss": 0.0039, "step": 3807 }, { "epoch": 3.477625570776256, "grad_norm": 72.78699493408203, "learning_rate": 7.2480974124809746e-06, "loss": 2.4934, "step": 3808 }, { "epoch": 3.4785388127853882, "grad_norm": 47.53947830200195, "learning_rate": 7.247082699137494e-06, "loss": 0.7026, "step": 3809 }, { "epoch": 3.4794520547945207, "grad_norm": 82.76289367675781, "learning_rate": 7.246067985794014e-06, "loss": 3.0721, "step": 3810 }, { "epoch": 3.480365296803653, "grad_norm": 25.791507720947266, "learning_rate": 7.245053272450533e-06, "loss": 0.2674, "step": 3811 }, { "epoch": 3.481278538812785, "grad_norm": 39.618045806884766, "learning_rate": 7.2440385591070525e-06, "loss": 0.3057, "step": 3812 }, { "epoch": 3.4821917808219176, "grad_norm": 0.17752037942409515, "learning_rate": 7.243023845763573e-06, "loss": 0.0012, "step": 3813 }, { "epoch": 3.48310502283105, "grad_norm": 3.468825101852417, "learning_rate": 7.242009132420091e-06, "loss": 0.0236, "step": 3814 }, { "epoch": 3.4840182648401825, "grad_norm": 10.200716972351074, "learning_rate": 7.2409944190766116e-06, "loss": 0.0711, "step": 3815 }, { "epoch": 3.484931506849315, "grad_norm": 0.15001150965690613, "learning_rate": 7.239979705733131e-06, "loss": 0.0012, "step": 3816 }, { "epoch": 3.4858447488584474, "grad_norm": 2.57749080657959, "learning_rate": 7.238964992389651e-06, "loss": 0.0193, "step": 3817 }, { "epoch": 3.48675799086758, "grad_norm": 6.683781147003174, "learning_rate": 7.23795027904617e-06, "loss": 0.0321, "step": 3818 }, { "epoch": 3.4876712328767123, "grad_norm": 0.5205693244934082, "learning_rate": 7.2369355657026895e-06, "loss": 0.0028, "step": 3819 }, { "epoch": 3.4885844748858448, "grad_norm": 12.555866241455078, "learning_rate": 7.235920852359209e-06, "loss": 0.0749, "step": 3820 }, { "epoch": 3.4894977168949772, "grad_norm": 2.4212746620178223, "learning_rate": 7.234906139015728e-06, "loss": 0.0154, "step": 3821 }, { "epoch": 3.4904109589041097, "grad_norm": 0.6936039924621582, "learning_rate": 7.233891425672248e-06, "loss": 0.0054, "step": 3822 }, { "epoch": 3.491324200913242, "grad_norm": 18.31840705871582, "learning_rate": 7.232876712328768e-06, "loss": 0.07, "step": 3823 }, { "epoch": 3.4922374429223746, "grad_norm": 1.5887982845306396, "learning_rate": 7.231861998985287e-06, "loss": 0.0104, "step": 3824 }, { "epoch": 3.493150684931507, "grad_norm": 3.194211483001709, "learning_rate": 7.230847285641807e-06, "loss": 0.0257, "step": 3825 }, { "epoch": 3.4940639269406395, "grad_norm": 4.3318610191345215, "learning_rate": 7.2298325722983265e-06, "loss": 0.0257, "step": 3826 }, { "epoch": 3.4949771689497715, "grad_norm": 2.0852019786834717, "learning_rate": 7.228817858954846e-06, "loss": 0.0139, "step": 3827 }, { "epoch": 3.495890410958904, "grad_norm": 71.24475860595703, "learning_rate": 7.227803145611365e-06, "loss": 0.5236, "step": 3828 }, { "epoch": 3.4968036529680364, "grad_norm": 7.4897637367248535, "learning_rate": 7.226788432267885e-06, "loss": 0.0594, "step": 3829 }, { "epoch": 3.497716894977169, "grad_norm": 0.516537070274353, "learning_rate": 7.225773718924404e-06, "loss": 0.0034, "step": 3830 }, { "epoch": 3.4986301369863013, "grad_norm": 0.1694604605436325, "learning_rate": 7.224759005580923e-06, "loss": 0.001, "step": 3831 }, { "epoch": 3.4995433789954338, "grad_norm": 1.0902032852172852, "learning_rate": 7.223744292237444e-06, "loss": 0.0058, "step": 3832 }, { "epoch": 3.5004566210045662, "grad_norm": 21.688091278076172, "learning_rate": 7.2227295788939635e-06, "loss": 0.1564, "step": 3833 }, { "epoch": 3.5013698630136987, "grad_norm": 7.047280311584473, "learning_rate": 7.221714865550482e-06, "loss": 0.025, "step": 3834 }, { "epoch": 3.502283105022831, "grad_norm": 0.053374677896499634, "learning_rate": 7.220700152207002e-06, "loss": 0.0004, "step": 3835 }, { "epoch": 3.5031963470319636, "grad_norm": 2.6354451179504395, "learning_rate": 7.219685438863522e-06, "loss": 0.0221, "step": 3836 }, { "epoch": 3.504109589041096, "grad_norm": 4.425342082977295, "learning_rate": 7.218670725520041e-06, "loss": 0.0294, "step": 3837 }, { "epoch": 3.505022831050228, "grad_norm": 0.4652149975299835, "learning_rate": 7.21765601217656e-06, "loss": 0.004, "step": 3838 }, { "epoch": 3.5059360730593605, "grad_norm": 15.69769287109375, "learning_rate": 7.21664129883308e-06, "loss": 0.1565, "step": 3839 }, { "epoch": 3.506849315068493, "grad_norm": 61.81621170043945, "learning_rate": 7.2156265854896005e-06, "loss": 1.2831, "step": 3840 }, { "epoch": 3.5077625570776254, "grad_norm": 0.8411372900009155, "learning_rate": 7.214611872146119e-06, "loss": 0.0041, "step": 3841 }, { "epoch": 3.508675799086758, "grad_norm": 0.17241981625556946, "learning_rate": 7.213597158802639e-06, "loss": 0.0012, "step": 3842 }, { "epoch": 3.5095890410958903, "grad_norm": 75.29191589355469, "learning_rate": 7.212582445459159e-06, "loss": 0.8953, "step": 3843 }, { "epoch": 3.5105022831050228, "grad_norm": 4.823570728302002, "learning_rate": 7.2115677321156776e-06, "loss": 0.0268, "step": 3844 }, { "epoch": 3.5114155251141552, "grad_norm": 14.730896949768066, "learning_rate": 7.210553018772197e-06, "loss": 0.1115, "step": 3845 }, { "epoch": 3.5123287671232877, "grad_norm": 23.522342681884766, "learning_rate": 7.209538305428717e-06, "loss": 0.1773, "step": 3846 }, { "epoch": 3.51324200913242, "grad_norm": 6.528955936431885, "learning_rate": 7.208523592085237e-06, "loss": 0.0627, "step": 3847 }, { "epoch": 3.5141552511415526, "grad_norm": 89.47245025634766, "learning_rate": 7.2075088787417555e-06, "loss": 1.5264, "step": 3848 }, { "epoch": 3.515068493150685, "grad_norm": 7.9062886238098145, "learning_rate": 7.206494165398276e-06, "loss": 0.0336, "step": 3849 }, { "epoch": 3.5159817351598175, "grad_norm": 37.23404312133789, "learning_rate": 7.205479452054796e-06, "loss": 0.7116, "step": 3850 }, { "epoch": 3.51689497716895, "grad_norm": 15.793161392211914, "learning_rate": 7.2044647387113146e-06, "loss": 0.0693, "step": 3851 }, { "epoch": 3.5178082191780824, "grad_norm": 95.07591247558594, "learning_rate": 7.203450025367834e-06, "loss": 0.9602, "step": 3852 }, { "epoch": 3.518721461187215, "grad_norm": 1.1596765518188477, "learning_rate": 7.202435312024354e-06, "loss": 0.0096, "step": 3853 }, { "epoch": 3.5196347031963473, "grad_norm": 3.4994125366210938, "learning_rate": 7.201420598680873e-06, "loss": 0.0218, "step": 3854 }, { "epoch": 3.5205479452054793, "grad_norm": 6.973701477050781, "learning_rate": 7.2004058853373925e-06, "loss": 0.0464, "step": 3855 }, { "epoch": 3.5214611872146118, "grad_norm": 0.2071882039308548, "learning_rate": 7.199391171993912e-06, "loss": 0.0013, "step": 3856 }, { "epoch": 3.522374429223744, "grad_norm": 1.9021739959716797, "learning_rate": 7.198376458650433e-06, "loss": 0.0112, "step": 3857 }, { "epoch": 3.5232876712328767, "grad_norm": 7.177318572998047, "learning_rate": 7.197361745306951e-06, "loss": 0.0649, "step": 3858 }, { "epoch": 3.524200913242009, "grad_norm": 0.21210676431655884, "learning_rate": 7.196347031963471e-06, "loss": 0.001, "step": 3859 }, { "epoch": 3.5251141552511416, "grad_norm": 2.8765363693237305, "learning_rate": 7.195332318619991e-06, "loss": 0.0279, "step": 3860 }, { "epoch": 3.526027397260274, "grad_norm": 4.579653739929199, "learning_rate": 7.19431760527651e-06, "loss": 0.0263, "step": 3861 }, { "epoch": 3.5269406392694065, "grad_norm": 17.682897567749023, "learning_rate": 7.1933028919330295e-06, "loss": 0.1364, "step": 3862 }, { "epoch": 3.527853881278539, "grad_norm": 3.3675642013549805, "learning_rate": 7.192288178589549e-06, "loss": 0.0277, "step": 3863 }, { "epoch": 3.5287671232876714, "grad_norm": 0.10556744784116745, "learning_rate": 7.191273465246068e-06, "loss": 0.0011, "step": 3864 }, { "epoch": 3.5296803652968034, "grad_norm": 1.6192764043807983, "learning_rate": 7.190258751902588e-06, "loss": 0.0111, "step": 3865 }, { "epoch": 3.530593607305936, "grad_norm": 2.4976682662963867, "learning_rate": 7.189244038559107e-06, "loss": 0.0194, "step": 3866 }, { "epoch": 3.5315068493150683, "grad_norm": 103.74982452392578, "learning_rate": 7.188229325215628e-06, "loss": 0.9047, "step": 3867 }, { "epoch": 3.5324200913242008, "grad_norm": 32.91465377807617, "learning_rate": 7.187214611872147e-06, "loss": 0.2575, "step": 3868 }, { "epoch": 3.533333333333333, "grad_norm": 0.01844627968966961, "learning_rate": 7.1861998985286665e-06, "loss": 0.0001, "step": 3869 }, { "epoch": 3.5342465753424657, "grad_norm": 0.6078088879585266, "learning_rate": 7.185185185185186e-06, "loss": 0.0038, "step": 3870 }, { "epoch": 3.535159817351598, "grad_norm": 5.854267120361328, "learning_rate": 7.184170471841705e-06, "loss": 0.0213, "step": 3871 }, { "epoch": 3.5360730593607306, "grad_norm": 4.971067428588867, "learning_rate": 7.183155758498225e-06, "loss": 0.0311, "step": 3872 }, { "epoch": 3.536986301369863, "grad_norm": 40.719181060791016, "learning_rate": 7.182141045154744e-06, "loss": 0.2221, "step": 3873 }, { "epoch": 3.5378995433789955, "grad_norm": 6.988062858581543, "learning_rate": 7.181126331811263e-06, "loss": 0.0869, "step": 3874 }, { "epoch": 3.538812785388128, "grad_norm": 52.04302215576172, "learning_rate": 7.180111618467783e-06, "loss": 0.3744, "step": 3875 }, { "epoch": 3.5397260273972604, "grad_norm": 1.2306979894638062, "learning_rate": 7.1790969051243035e-06, "loss": 0.0117, "step": 3876 }, { "epoch": 3.540639269406393, "grad_norm": 3.457324504852295, "learning_rate": 7.178082191780823e-06, "loss": 0.0242, "step": 3877 }, { "epoch": 3.5415525114155253, "grad_norm": 21.839353561401367, "learning_rate": 7.177067478437342e-06, "loss": 0.171, "step": 3878 }, { "epoch": 3.5424657534246577, "grad_norm": 8.456737518310547, "learning_rate": 7.176052765093862e-06, "loss": 0.0395, "step": 3879 }, { "epoch": 3.54337899543379, "grad_norm": 47.341064453125, "learning_rate": 7.175038051750381e-06, "loss": 0.3326, "step": 3880 }, { "epoch": 3.544292237442922, "grad_norm": 0.3547138571739197, "learning_rate": 7.1740233384069e-06, "loss": 0.0028, "step": 3881 }, { "epoch": 3.5452054794520547, "grad_norm": 0.7934607267379761, "learning_rate": 7.17300862506342e-06, "loss": 0.0055, "step": 3882 }, { "epoch": 3.546118721461187, "grad_norm": 40.333621978759766, "learning_rate": 7.17199391171994e-06, "loss": 0.4208, "step": 3883 }, { "epoch": 3.5470319634703196, "grad_norm": 8.228065490722656, "learning_rate": 7.1709791983764585e-06, "loss": 0.0815, "step": 3884 }, { "epoch": 3.547945205479452, "grad_norm": 7.753324508666992, "learning_rate": 7.169964485032979e-06, "loss": 0.0608, "step": 3885 }, { "epoch": 3.5488584474885845, "grad_norm": 7.930552959442139, "learning_rate": 7.168949771689499e-06, "loss": 0.0395, "step": 3886 }, { "epoch": 3.549771689497717, "grad_norm": 7.080461502075195, "learning_rate": 7.167935058346018e-06, "loss": 0.0385, "step": 3887 }, { "epoch": 3.5506849315068494, "grad_norm": 10.502224922180176, "learning_rate": 7.166920345002537e-06, "loss": 0.0807, "step": 3888 }, { "epoch": 3.551598173515982, "grad_norm": 0.8608444929122925, "learning_rate": 7.165905631659057e-06, "loss": 0.0074, "step": 3889 }, { "epoch": 3.5525114155251143, "grad_norm": 0.35398322343826294, "learning_rate": 7.164890918315577e-06, "loss": 0.0024, "step": 3890 }, { "epoch": 3.5534246575342463, "grad_norm": 2.368138074874878, "learning_rate": 7.1638762049720955e-06, "loss": 0.0155, "step": 3891 }, { "epoch": 3.5543378995433788, "grad_norm": 2.549717426300049, "learning_rate": 7.162861491628615e-06, "loss": 0.0129, "step": 3892 }, { "epoch": 3.555251141552511, "grad_norm": 0.570517897605896, "learning_rate": 7.161846778285136e-06, "loss": 0.0039, "step": 3893 }, { "epoch": 3.5561643835616437, "grad_norm": 5.55166482925415, "learning_rate": 7.160832064941654e-06, "loss": 0.0306, "step": 3894 }, { "epoch": 3.557077625570776, "grad_norm": 5.948043346405029, "learning_rate": 7.159817351598174e-06, "loss": 0.0265, "step": 3895 }, { "epoch": 3.5579908675799086, "grad_norm": 0.755236029624939, "learning_rate": 7.158802638254694e-06, "loss": 0.0068, "step": 3896 }, { "epoch": 3.558904109589041, "grad_norm": 43.65530776977539, "learning_rate": 7.157787924911214e-06, "loss": 0.8781, "step": 3897 }, { "epoch": 3.5598173515981735, "grad_norm": 9.926082611083984, "learning_rate": 7.1567732115677325e-06, "loss": 0.0827, "step": 3898 }, { "epoch": 3.560730593607306, "grad_norm": 0.9233574867248535, "learning_rate": 7.155758498224252e-06, "loss": 0.006, "step": 3899 }, { "epoch": 3.5616438356164384, "grad_norm": 3.8253707885742188, "learning_rate": 7.154743784880772e-06, "loss": 0.0252, "step": 3900 }, { "epoch": 3.562557077625571, "grad_norm": 20.146486282348633, "learning_rate": 7.153729071537291e-06, "loss": 0.1308, "step": 3901 }, { "epoch": 3.5634703196347033, "grad_norm": 26.027549743652344, "learning_rate": 7.15271435819381e-06, "loss": 0.2453, "step": 3902 }, { "epoch": 3.5643835616438357, "grad_norm": 0.414549320936203, "learning_rate": 7.151699644850331e-06, "loss": 0.0031, "step": 3903 }, { "epoch": 3.565296803652968, "grad_norm": 32.960567474365234, "learning_rate": 7.15068493150685e-06, "loss": 0.2435, "step": 3904 }, { "epoch": 3.5662100456621006, "grad_norm": 4.490262985229492, "learning_rate": 7.1496702181633695e-06, "loss": 0.0269, "step": 3905 }, { "epoch": 3.567123287671233, "grad_norm": 47.26699447631836, "learning_rate": 7.148655504819889e-06, "loss": 0.7494, "step": 3906 }, { "epoch": 3.5680365296803656, "grad_norm": 4.057979583740234, "learning_rate": 7.147640791476409e-06, "loss": 0.0255, "step": 3907 }, { "epoch": 3.5689497716894976, "grad_norm": 1.3839387893676758, "learning_rate": 7.146626078132928e-06, "loss": 0.0066, "step": 3908 }, { "epoch": 3.56986301369863, "grad_norm": 83.08975982666016, "learning_rate": 7.145611364789447e-06, "loss": 0.577, "step": 3909 }, { "epoch": 3.5707762557077625, "grad_norm": 82.52487182617188, "learning_rate": 7.144596651445967e-06, "loss": 1.1554, "step": 3910 }, { "epoch": 3.571689497716895, "grad_norm": 0.442374050617218, "learning_rate": 7.143581938102486e-06, "loss": 0.0037, "step": 3911 }, { "epoch": 3.5726027397260274, "grad_norm": 3.2456653118133545, "learning_rate": 7.1425672247590065e-06, "loss": 0.0246, "step": 3912 }, { "epoch": 3.57351598173516, "grad_norm": 0.16614645719528198, "learning_rate": 7.141552511415526e-06, "loss": 0.0013, "step": 3913 }, { "epoch": 3.5744292237442923, "grad_norm": 4.620035171508789, "learning_rate": 7.140537798072045e-06, "loss": 0.0346, "step": 3914 }, { "epoch": 3.5753424657534247, "grad_norm": 9.550459861755371, "learning_rate": 7.139523084728565e-06, "loss": 0.0485, "step": 3915 }, { "epoch": 3.576255707762557, "grad_norm": 15.832300186157227, "learning_rate": 7.138508371385084e-06, "loss": 0.1344, "step": 3916 }, { "epoch": 3.5771689497716896, "grad_norm": 33.68806457519531, "learning_rate": 7.137493658041604e-06, "loss": 0.2314, "step": 3917 }, { "epoch": 3.5780821917808217, "grad_norm": 23.536638259887695, "learning_rate": 7.136478944698123e-06, "loss": 0.1656, "step": 3918 }, { "epoch": 3.578995433789954, "grad_norm": 0.022804217413067818, "learning_rate": 7.135464231354643e-06, "loss": 0.0002, "step": 3919 }, { "epoch": 3.5799086757990866, "grad_norm": 67.79097747802734, "learning_rate": 7.134449518011163e-06, "loss": 0.1969, "step": 3920 }, { "epoch": 3.580821917808219, "grad_norm": 0.8100586533546448, "learning_rate": 7.133434804667682e-06, "loss": 0.007, "step": 3921 }, { "epoch": 3.5817351598173515, "grad_norm": 10.155292510986328, "learning_rate": 7.132420091324202e-06, "loss": 0.059, "step": 3922 }, { "epoch": 3.582648401826484, "grad_norm": 0.9901970624923706, "learning_rate": 7.131405377980721e-06, "loss": 0.0066, "step": 3923 }, { "epoch": 3.5835616438356164, "grad_norm": 36.56309127807617, "learning_rate": 7.13039066463724e-06, "loss": 0.2673, "step": 3924 }, { "epoch": 3.584474885844749, "grad_norm": 17.104843139648438, "learning_rate": 7.12937595129376e-06, "loss": 0.0775, "step": 3925 }, { "epoch": 3.5853881278538813, "grad_norm": 37.08780288696289, "learning_rate": 7.12836123795028e-06, "loss": 0.2771, "step": 3926 }, { "epoch": 3.5863013698630137, "grad_norm": 6.221035003662109, "learning_rate": 7.127346524606799e-06, "loss": 0.0409, "step": 3927 }, { "epoch": 3.587214611872146, "grad_norm": 0.9469850063323975, "learning_rate": 7.126331811263318e-06, "loss": 0.0079, "step": 3928 }, { "epoch": 3.5881278538812786, "grad_norm": 1.23285710811615, "learning_rate": 7.125317097919839e-06, "loss": 0.0114, "step": 3929 }, { "epoch": 3.589041095890411, "grad_norm": 27.573734283447266, "learning_rate": 7.124302384576358e-06, "loss": 0.3164, "step": 3930 }, { "epoch": 3.5899543378995435, "grad_norm": 435.3406677246094, "learning_rate": 7.123287671232877e-06, "loss": 1.1075, "step": 3931 }, { "epoch": 3.590867579908676, "grad_norm": 0.9783831238746643, "learning_rate": 7.122272957889397e-06, "loss": 0.0083, "step": 3932 }, { "epoch": 3.5917808219178085, "grad_norm": 45.652618408203125, "learning_rate": 7.121258244545917e-06, "loss": 0.2578, "step": 3933 }, { "epoch": 3.592694063926941, "grad_norm": 30.18583106994629, "learning_rate": 7.1202435312024354e-06, "loss": 0.1654, "step": 3934 }, { "epoch": 3.593607305936073, "grad_norm": 0.3490384817123413, "learning_rate": 7.119228817858955e-06, "loss": 0.0021, "step": 3935 }, { "epoch": 3.5945205479452054, "grad_norm": 0.9050902724266052, "learning_rate": 7.118214104515475e-06, "loss": 0.0033, "step": 3936 }, { "epoch": 3.595433789954338, "grad_norm": 60.401512145996094, "learning_rate": 7.117199391171995e-06, "loss": 0.3342, "step": 3937 }, { "epoch": 3.5963470319634703, "grad_norm": 106.36408233642578, "learning_rate": 7.116184677828513e-06, "loss": 1.8738, "step": 3938 }, { "epoch": 3.5972602739726027, "grad_norm": 0.880193293094635, "learning_rate": 7.115169964485034e-06, "loss": 0.0054, "step": 3939 }, { "epoch": 3.598173515981735, "grad_norm": 68.57185363769531, "learning_rate": 7.114155251141554e-06, "loss": 0.424, "step": 3940 }, { "epoch": 3.5990867579908676, "grad_norm": 1.3962392807006836, "learning_rate": 7.1131405377980724e-06, "loss": 0.0098, "step": 3941 }, { "epoch": 3.6, "grad_norm": 5.629729270935059, "learning_rate": 7.112125824454592e-06, "loss": 0.026, "step": 3942 }, { "epoch": 3.6009132420091325, "grad_norm": 49.93651580810547, "learning_rate": 7.111111111111112e-06, "loss": 0.3666, "step": 3943 }, { "epoch": 3.601826484018265, "grad_norm": 0.13709698617458344, "learning_rate": 7.110096397767631e-06, "loss": 0.0012, "step": 3944 }, { "epoch": 3.602739726027397, "grad_norm": 61.44957733154297, "learning_rate": 7.10908168442415e-06, "loss": 0.6591, "step": 3945 }, { "epoch": 3.6036529680365295, "grad_norm": 0.33458805084228516, "learning_rate": 7.10806697108067e-06, "loss": 0.002, "step": 3946 }, { "epoch": 3.604566210045662, "grad_norm": 8.159640312194824, "learning_rate": 7.107052257737191e-06, "loss": 0.0529, "step": 3947 }, { "epoch": 3.6054794520547944, "grad_norm": 88.94961547851562, "learning_rate": 7.1060375443937094e-06, "loss": 1.2861, "step": 3948 }, { "epoch": 3.606392694063927, "grad_norm": 115.87762451171875, "learning_rate": 7.105022831050229e-06, "loss": 0.8472, "step": 3949 }, { "epoch": 3.6073059360730593, "grad_norm": 3.7478983402252197, "learning_rate": 7.104008117706749e-06, "loss": 0.019, "step": 3950 }, { "epoch": 3.6082191780821917, "grad_norm": 58.832889556884766, "learning_rate": 7.102993404363268e-06, "loss": 0.7594, "step": 3951 }, { "epoch": 3.609132420091324, "grad_norm": 13.946168899536133, "learning_rate": 7.101978691019787e-06, "loss": 0.0932, "step": 3952 }, { "epoch": 3.6100456621004566, "grad_norm": 14.033349990844727, "learning_rate": 7.100963977676307e-06, "loss": 0.0992, "step": 3953 }, { "epoch": 3.610958904109589, "grad_norm": 9.284667015075684, "learning_rate": 7.099949264332826e-06, "loss": 0.0458, "step": 3954 }, { "epoch": 3.6118721461187215, "grad_norm": 8.5177001953125, "learning_rate": 7.098934550989346e-06, "loss": 0.0617, "step": 3955 }, { "epoch": 3.612785388127854, "grad_norm": 47.44496154785156, "learning_rate": 7.097919837645866e-06, "loss": 0.4555, "step": 3956 }, { "epoch": 3.6136986301369864, "grad_norm": 28.526330947875977, "learning_rate": 7.096905124302386e-06, "loss": 0.2409, "step": 3957 }, { "epoch": 3.614611872146119, "grad_norm": 3.948920488357544, "learning_rate": 7.095890410958905e-06, "loss": 0.034, "step": 3958 }, { "epoch": 3.6155251141552514, "grad_norm": 6.998459815979004, "learning_rate": 7.094875697615424e-06, "loss": 0.0377, "step": 3959 }, { "epoch": 3.616438356164384, "grad_norm": 0.7454842925071716, "learning_rate": 7.093860984271944e-06, "loss": 0.0038, "step": 3960 }, { "epoch": 3.6173515981735163, "grad_norm": 7.3564229011535645, "learning_rate": 7.092846270928463e-06, "loss": 0.0512, "step": 3961 }, { "epoch": 3.6182648401826483, "grad_norm": 37.90184783935547, "learning_rate": 7.091831557584983e-06, "loss": 0.259, "step": 3962 }, { "epoch": 3.6191780821917807, "grad_norm": 41.01169204711914, "learning_rate": 7.090816844241502e-06, "loss": 0.1227, "step": 3963 }, { "epoch": 3.620091324200913, "grad_norm": 6.892489910125732, "learning_rate": 7.089802130898021e-06, "loss": 0.0493, "step": 3964 }, { "epoch": 3.6210045662100456, "grad_norm": 107.44596862792969, "learning_rate": 7.088787417554542e-06, "loss": 2.0008, "step": 3965 }, { "epoch": 3.621917808219178, "grad_norm": 1.1774977445602417, "learning_rate": 7.087772704211061e-06, "loss": 0.01, "step": 3966 }, { "epoch": 3.6228310502283105, "grad_norm": 27.346546173095703, "learning_rate": 7.086757990867581e-06, "loss": 0.245, "step": 3967 }, { "epoch": 3.623744292237443, "grad_norm": 4.203376770019531, "learning_rate": 7.0857432775241e-06, "loss": 0.0302, "step": 3968 }, { "epoch": 3.6246575342465754, "grad_norm": 0.5334810018539429, "learning_rate": 7.08472856418062e-06, "loss": 0.003, "step": 3969 }, { "epoch": 3.625570776255708, "grad_norm": 33.44697570800781, "learning_rate": 7.083713850837139e-06, "loss": 0.1533, "step": 3970 }, { "epoch": 3.6264840182648403, "grad_norm": 35.336639404296875, "learning_rate": 7.082699137493658e-06, "loss": 0.2372, "step": 3971 }, { "epoch": 3.6273972602739724, "grad_norm": 20.999622344970703, "learning_rate": 7.081684424150178e-06, "loss": 0.1371, "step": 3972 }, { "epoch": 3.628310502283105, "grad_norm": 0.6492531299591064, "learning_rate": 7.080669710806698e-06, "loss": 0.0033, "step": 3973 }, { "epoch": 3.6292237442922373, "grad_norm": 1.7983455657958984, "learning_rate": 7.079654997463216e-06, "loss": 0.0101, "step": 3974 }, { "epoch": 3.6301369863013697, "grad_norm": 7.428617000579834, "learning_rate": 7.078640284119737e-06, "loss": 0.0296, "step": 3975 }, { "epoch": 3.631050228310502, "grad_norm": 158.716552734375, "learning_rate": 7.077625570776257e-06, "loss": 0.6255, "step": 3976 }, { "epoch": 3.6319634703196346, "grad_norm": 22.671388626098633, "learning_rate": 7.076610857432776e-06, "loss": 0.1717, "step": 3977 }, { "epoch": 3.632876712328767, "grad_norm": 44.71002960205078, "learning_rate": 7.075596144089295e-06, "loss": 0.4193, "step": 3978 }, { "epoch": 3.6337899543378995, "grad_norm": 5.166435718536377, "learning_rate": 7.074581430745815e-06, "loss": 0.0201, "step": 3979 }, { "epoch": 3.634703196347032, "grad_norm": 10.185637474060059, "learning_rate": 7.0735667174023345e-06, "loss": 0.0619, "step": 3980 }, { "epoch": 3.6356164383561644, "grad_norm": 48.98384475708008, "learning_rate": 7.072552004058853e-06, "loss": 0.2636, "step": 3981 }, { "epoch": 3.636529680365297, "grad_norm": 41.377288818359375, "learning_rate": 7.071537290715373e-06, "loss": 0.2002, "step": 3982 }, { "epoch": 3.6374429223744293, "grad_norm": 2.3487229347229004, "learning_rate": 7.070522577371894e-06, "loss": 0.0186, "step": 3983 }, { "epoch": 3.638356164383562, "grad_norm": 12.072942733764648, "learning_rate": 7.069507864028412e-06, "loss": 0.0725, "step": 3984 }, { "epoch": 3.6392694063926943, "grad_norm": 8.854228019714355, "learning_rate": 7.068493150684932e-06, "loss": 0.0448, "step": 3985 }, { "epoch": 3.6401826484018267, "grad_norm": 2.480205774307251, "learning_rate": 7.067478437341452e-06, "loss": 0.0127, "step": 3986 }, { "epoch": 3.641095890410959, "grad_norm": 40.99736404418945, "learning_rate": 7.0664637239979715e-06, "loss": 0.2174, "step": 3987 }, { "epoch": 3.642009132420091, "grad_norm": 32.59159469604492, "learning_rate": 7.06544901065449e-06, "loss": 0.1678, "step": 3988 }, { "epoch": 3.6429223744292236, "grad_norm": 100.78633117675781, "learning_rate": 7.06443429731101e-06, "loss": 3.1183, "step": 3989 }, { "epoch": 3.643835616438356, "grad_norm": 22.504304885864258, "learning_rate": 7.06341958396753e-06, "loss": 0.1482, "step": 3990 }, { "epoch": 3.6447488584474885, "grad_norm": 0.5219756960868835, "learning_rate": 7.0624048706240486e-06, "loss": 0.0033, "step": 3991 }, { "epoch": 3.645662100456621, "grad_norm": 6.285340309143066, "learning_rate": 7.061390157280569e-06, "loss": 0.0385, "step": 3992 }, { "epoch": 3.6465753424657534, "grad_norm": 19.511857986450195, "learning_rate": 7.060375443937089e-06, "loss": 0.1347, "step": 3993 }, { "epoch": 3.647488584474886, "grad_norm": 1.8400152921676636, "learning_rate": 7.059360730593608e-06, "loss": 0.0106, "step": 3994 }, { "epoch": 3.6484018264840183, "grad_norm": 6.925300121307373, "learning_rate": 7.058346017250127e-06, "loss": 0.0309, "step": 3995 }, { "epoch": 3.649315068493151, "grad_norm": 5.427379131317139, "learning_rate": 7.057331303906647e-06, "loss": 0.0305, "step": 3996 }, { "epoch": 3.6502283105022832, "grad_norm": 6.8236541748046875, "learning_rate": 7.056316590563167e-06, "loss": 0.037, "step": 3997 }, { "epoch": 3.6511415525114153, "grad_norm": 1.1638531684875488, "learning_rate": 7.0553018772196856e-06, "loss": 0.0057, "step": 3998 }, { "epoch": 3.6520547945205477, "grad_norm": 3.3812997341156006, "learning_rate": 7.054287163876205e-06, "loss": 0.021, "step": 3999 }, { "epoch": 3.65296803652968, "grad_norm": 0.7798053622245789, "learning_rate": 7.053272450532726e-06, "loss": 0.0043, "step": 4000 }, { "epoch": 3.6538812785388126, "grad_norm": 48.711517333984375, "learning_rate": 7.052257737189245e-06, "loss": 0.3491, "step": 4001 }, { "epoch": 3.654794520547945, "grad_norm": 16.452186584472656, "learning_rate": 7.051243023845764e-06, "loss": 0.1137, "step": 4002 }, { "epoch": 3.6557077625570775, "grad_norm": 3.0623531341552734, "learning_rate": 7.050228310502284e-06, "loss": 0.0113, "step": 4003 }, { "epoch": 3.65662100456621, "grad_norm": 234.0429229736328, "learning_rate": 7.049213597158803e-06, "loss": 1.7759, "step": 4004 }, { "epoch": 3.6575342465753424, "grad_norm": 87.96124267578125, "learning_rate": 7.0481988838153226e-06, "loss": 0.5654, "step": 4005 }, { "epoch": 3.658447488584475, "grad_norm": 1.9789228439331055, "learning_rate": 7.047184170471842e-06, "loss": 0.0098, "step": 4006 }, { "epoch": 3.6593607305936073, "grad_norm": 95.02983093261719, "learning_rate": 7.046169457128362e-06, "loss": 0.4165, "step": 4007 }, { "epoch": 3.66027397260274, "grad_norm": 137.53915405273438, "learning_rate": 7.045154743784881e-06, "loss": 1.5326, "step": 4008 }, { "epoch": 3.6611872146118722, "grad_norm": 94.191650390625, "learning_rate": 7.044140030441401e-06, "loss": 1.0696, "step": 4009 }, { "epoch": 3.6621004566210047, "grad_norm": 21.4315185546875, "learning_rate": 7.043125317097921e-06, "loss": 0.1198, "step": 4010 }, { "epoch": 3.663013698630137, "grad_norm": 47.48586654663086, "learning_rate": 7.04211060375444e-06, "loss": 0.6388, "step": 4011 }, { "epoch": 3.6639269406392696, "grad_norm": 71.43972778320312, "learning_rate": 7.0410958904109596e-06, "loss": 0.8181, "step": 4012 }, { "epoch": 3.664840182648402, "grad_norm": 43.06103515625, "learning_rate": 7.040081177067479e-06, "loss": 0.2842, "step": 4013 }, { "epoch": 3.6657534246575345, "grad_norm": 2.273784637451172, "learning_rate": 7.039066463723998e-06, "loss": 0.015, "step": 4014 }, { "epoch": 3.6666666666666665, "grad_norm": 17.972858428955078, "learning_rate": 7.038051750380518e-06, "loss": 0.0808, "step": 4015 }, { "epoch": 3.667579908675799, "grad_norm": 10.206771850585938, "learning_rate": 7.0370370370370375e-06, "loss": 0.0556, "step": 4016 }, { "epoch": 3.6684931506849314, "grad_norm": 0.043171826750040054, "learning_rate": 7.036022323693558e-06, "loss": 0.0002, "step": 4017 }, { "epoch": 3.669406392694064, "grad_norm": 71.7262191772461, "learning_rate": 7.035007610350076e-06, "loss": 1.0441, "step": 4018 }, { "epoch": 3.6703196347031963, "grad_norm": 70.1703109741211, "learning_rate": 7.0339928970065966e-06, "loss": 0.7797, "step": 4019 }, { "epoch": 3.671232876712329, "grad_norm": 13.922669410705566, "learning_rate": 7.032978183663116e-06, "loss": 0.083, "step": 4020 }, { "epoch": 3.6721461187214612, "grad_norm": 52.800697326660156, "learning_rate": 7.031963470319635e-06, "loss": 0.4931, "step": 4021 }, { "epoch": 3.6730593607305937, "grad_norm": 35.30528259277344, "learning_rate": 7.030948756976155e-06, "loss": 0.2014, "step": 4022 }, { "epoch": 3.673972602739726, "grad_norm": 2.621122360229492, "learning_rate": 7.0299340436326745e-06, "loss": 0.0179, "step": 4023 }, { "epoch": 3.6748858447488586, "grad_norm": 7.388402938842773, "learning_rate": 7.028919330289193e-06, "loss": 0.0458, "step": 4024 }, { "epoch": 3.6757990867579906, "grad_norm": 9.217000007629395, "learning_rate": 7.027904616945713e-06, "loss": 0.0598, "step": 4025 }, { "epoch": 3.676712328767123, "grad_norm": 5.311054229736328, "learning_rate": 7.026889903602233e-06, "loss": 0.041, "step": 4026 }, { "epoch": 3.6776255707762555, "grad_norm": 21.07688331604004, "learning_rate": 7.025875190258753e-06, "loss": 0.2461, "step": 4027 }, { "epoch": 3.678538812785388, "grad_norm": 38.438201904296875, "learning_rate": 7.024860476915272e-06, "loss": 0.2463, "step": 4028 }, { "epoch": 3.6794520547945204, "grad_norm": 21.81747817993164, "learning_rate": 7.023845763571792e-06, "loss": 0.1374, "step": 4029 }, { "epoch": 3.680365296803653, "grad_norm": 0.665556788444519, "learning_rate": 7.0228310502283115e-06, "loss": 0.0057, "step": 4030 }, { "epoch": 3.6812785388127853, "grad_norm": 6.552459239959717, "learning_rate": 7.02181633688483e-06, "loss": 0.052, "step": 4031 }, { "epoch": 3.682191780821918, "grad_norm": 21.572532653808594, "learning_rate": 7.02080162354135e-06, "loss": 0.2386, "step": 4032 }, { "epoch": 3.6831050228310502, "grad_norm": 96.43976593017578, "learning_rate": 7.01978691019787e-06, "loss": 0.7328, "step": 4033 }, { "epoch": 3.6840182648401827, "grad_norm": 5.148290634155273, "learning_rate": 7.0187721968543886e-06, "loss": 0.0258, "step": 4034 }, { "epoch": 3.684931506849315, "grad_norm": 542.0838012695312, "learning_rate": 7.017757483510908e-06, "loss": 2.7728, "step": 4035 }, { "epoch": 3.6858447488584476, "grad_norm": 12.62264347076416, "learning_rate": 7.016742770167429e-06, "loss": 0.0667, "step": 4036 }, { "epoch": 3.68675799086758, "grad_norm": 3.0730535984039307, "learning_rate": 7.0157280568239485e-06, "loss": 0.02, "step": 4037 }, { "epoch": 3.6876712328767125, "grad_norm": 5.960460662841797, "learning_rate": 7.014713343480467e-06, "loss": 0.0632, "step": 4038 }, { "epoch": 3.688584474885845, "grad_norm": 0.3170708119869232, "learning_rate": 7.013698630136987e-06, "loss": 0.0024, "step": 4039 }, { "epoch": 3.6894977168949774, "grad_norm": 13.229220390319824, "learning_rate": 7.012683916793507e-06, "loss": 0.0729, "step": 4040 }, { "epoch": 3.69041095890411, "grad_norm": 8.33135986328125, "learning_rate": 7.0116692034500256e-06, "loss": 0.0726, "step": 4041 }, { "epoch": 3.691324200913242, "grad_norm": 120.3056869506836, "learning_rate": 7.010654490106545e-06, "loss": 2.0558, "step": 4042 }, { "epoch": 3.6922374429223743, "grad_norm": 31.706113815307617, "learning_rate": 7.009639776763065e-06, "loss": 0.1586, "step": 4043 }, { "epoch": 3.6931506849315068, "grad_norm": 3.6524999141693115, "learning_rate": 7.008625063419584e-06, "loss": 0.0201, "step": 4044 }, { "epoch": 3.6940639269406392, "grad_norm": 1.1569920778274536, "learning_rate": 7.007610350076104e-06, "loss": 0.0072, "step": 4045 }, { "epoch": 3.6949771689497717, "grad_norm": 41.60550308227539, "learning_rate": 7.006595636732624e-06, "loss": 0.4196, "step": 4046 }, { "epoch": 3.695890410958904, "grad_norm": 114.11224365234375, "learning_rate": 7.005580923389144e-06, "loss": 1.0599, "step": 4047 }, { "epoch": 3.6968036529680366, "grad_norm": 13.36036205291748, "learning_rate": 7.0045662100456626e-06, "loss": 0.1349, "step": 4048 }, { "epoch": 3.697716894977169, "grad_norm": 4.0995049476623535, "learning_rate": 7.003551496702182e-06, "loss": 0.0223, "step": 4049 }, { "epoch": 3.6986301369863015, "grad_norm": 42.915340423583984, "learning_rate": 7.002536783358702e-06, "loss": 0.3087, "step": 4050 }, { "epoch": 3.699543378995434, "grad_norm": 5.305718421936035, "learning_rate": 7.001522070015221e-06, "loss": 0.0388, "step": 4051 }, { "epoch": 3.700456621004566, "grad_norm": 40.06791687011719, "learning_rate": 7.0005073566717405e-06, "loss": 0.3854, "step": 4052 }, { "epoch": 3.7013698630136984, "grad_norm": 1.8199594020843506, "learning_rate": 6.999492643328261e-06, "loss": 0.0181, "step": 4053 }, { "epoch": 3.702283105022831, "grad_norm": 5.552797794342041, "learning_rate": 6.998477929984779e-06, "loss": 0.0334, "step": 4054 }, { "epoch": 3.7031963470319633, "grad_norm": 47.89922332763672, "learning_rate": 6.9974632166412995e-06, "loss": 0.2946, "step": 4055 }, { "epoch": 3.7041095890410958, "grad_norm": 1.0213416814804077, "learning_rate": 6.996448503297819e-06, "loss": 0.0062, "step": 4056 }, { "epoch": 3.7050228310502282, "grad_norm": 4.065206527709961, "learning_rate": 6.995433789954339e-06, "loss": 0.0301, "step": 4057 }, { "epoch": 3.7059360730593607, "grad_norm": 20.163471221923828, "learning_rate": 6.994419076610858e-06, "loss": 0.1222, "step": 4058 }, { "epoch": 3.706849315068493, "grad_norm": 3.4599099159240723, "learning_rate": 6.9934043632673775e-06, "loss": 0.0156, "step": 4059 }, { "epoch": 3.7077625570776256, "grad_norm": 0.2708364725112915, "learning_rate": 6.992389649923897e-06, "loss": 0.002, "step": 4060 }, { "epoch": 3.708675799086758, "grad_norm": 3.0465660095214844, "learning_rate": 6.991374936580416e-06, "loss": 0.0198, "step": 4061 }, { "epoch": 3.7095890410958905, "grad_norm": 0.5306707620620728, "learning_rate": 6.990360223236936e-06, "loss": 0.0032, "step": 4062 }, { "epoch": 3.710502283105023, "grad_norm": 4.356563091278076, "learning_rate": 6.989345509893456e-06, "loss": 0.0278, "step": 4063 }, { "epoch": 3.7114155251141554, "grad_norm": 1.8681972026824951, "learning_rate": 6.988330796549975e-06, "loss": 0.0148, "step": 4064 }, { "epoch": 3.712328767123288, "grad_norm": 16.50705909729004, "learning_rate": 6.987316083206495e-06, "loss": 0.1398, "step": 4065 }, { "epoch": 3.7132420091324203, "grad_norm": 26.417564392089844, "learning_rate": 6.9863013698630145e-06, "loss": 0.1906, "step": 4066 }, { "epoch": 3.7141552511415528, "grad_norm": 8.473560333251953, "learning_rate": 6.985286656519534e-06, "loss": 0.0591, "step": 4067 }, { "epoch": 3.7150684931506848, "grad_norm": 6.991457462310791, "learning_rate": 6.984271943176053e-06, "loss": 0.0398, "step": 4068 }, { "epoch": 3.7159817351598172, "grad_norm": 6.716320037841797, "learning_rate": 6.983257229832573e-06, "loss": 0.0456, "step": 4069 }, { "epoch": 3.7168949771689497, "grad_norm": 8.017003059387207, "learning_rate": 6.982242516489092e-06, "loss": 0.0618, "step": 4070 }, { "epoch": 3.717808219178082, "grad_norm": 35.659095764160156, "learning_rate": 6.981227803145611e-06, "loss": 0.1936, "step": 4071 }, { "epoch": 3.7187214611872146, "grad_norm": 2.1863439083099365, "learning_rate": 6.980213089802132e-06, "loss": 0.0122, "step": 4072 }, { "epoch": 3.719634703196347, "grad_norm": 1.959411382675171, "learning_rate": 6.9791983764586515e-06, "loss": 0.006, "step": 4073 }, { "epoch": 3.7205479452054795, "grad_norm": 0.7640106081962585, "learning_rate": 6.97818366311517e-06, "loss": 0.0055, "step": 4074 }, { "epoch": 3.721461187214612, "grad_norm": 3.7789463996887207, "learning_rate": 6.97716894977169e-06, "loss": 0.0259, "step": 4075 }, { "epoch": 3.7223744292237444, "grad_norm": 39.139617919921875, "learning_rate": 6.97615423642821e-06, "loss": 0.237, "step": 4076 }, { "epoch": 3.723287671232877, "grad_norm": 3.53363037109375, "learning_rate": 6.975139523084729e-06, "loss": 0.0117, "step": 4077 }, { "epoch": 3.724200913242009, "grad_norm": 13.659538269042969, "learning_rate": 6.974124809741248e-06, "loss": 0.1018, "step": 4078 }, { "epoch": 3.7251141552511413, "grad_norm": 4.098698616027832, "learning_rate": 6.973110096397768e-06, "loss": 0.0353, "step": 4079 }, { "epoch": 3.7260273972602738, "grad_norm": 58.61143112182617, "learning_rate": 6.9720953830542885e-06, "loss": 0.7442, "step": 4080 }, { "epoch": 3.726940639269406, "grad_norm": 126.29373931884766, "learning_rate": 6.9710806697108065e-06, "loss": 1.1938, "step": 4081 }, { "epoch": 3.7278538812785387, "grad_norm": 109.4899673461914, "learning_rate": 6.970065956367327e-06, "loss": 2.1657, "step": 4082 }, { "epoch": 3.728767123287671, "grad_norm": 2.008014678955078, "learning_rate": 6.969051243023847e-06, "loss": 0.0149, "step": 4083 }, { "epoch": 3.7296803652968036, "grad_norm": 74.58202362060547, "learning_rate": 6.9680365296803655e-06, "loss": 0.841, "step": 4084 }, { "epoch": 3.730593607305936, "grad_norm": 17.43114471435547, "learning_rate": 6.967021816336885e-06, "loss": 0.074, "step": 4085 }, { "epoch": 3.7315068493150685, "grad_norm": 1.4320755004882812, "learning_rate": 6.966007102993405e-06, "loss": 0.0078, "step": 4086 }, { "epoch": 3.732420091324201, "grad_norm": 16.314573287963867, "learning_rate": 6.964992389649925e-06, "loss": 0.1018, "step": 4087 }, { "epoch": 3.7333333333333334, "grad_norm": 20.53571891784668, "learning_rate": 6.9639776763064435e-06, "loss": 0.1141, "step": 4088 }, { "epoch": 3.734246575342466, "grad_norm": 12.184585571289062, "learning_rate": 6.962962962962964e-06, "loss": 0.0876, "step": 4089 }, { "epoch": 3.7351598173515983, "grad_norm": 24.66888427734375, "learning_rate": 6.961948249619484e-06, "loss": 0.1693, "step": 4090 }, { "epoch": 3.7360730593607308, "grad_norm": 14.997069358825684, "learning_rate": 6.9609335362760025e-06, "loss": 0.0914, "step": 4091 }, { "epoch": 3.736986301369863, "grad_norm": 2.9353489875793457, "learning_rate": 6.959918822932522e-06, "loss": 0.0144, "step": 4092 }, { "epoch": 3.7378995433789957, "grad_norm": 2.516324996948242, "learning_rate": 6.958904109589042e-06, "loss": 0.0161, "step": 4093 }, { "epoch": 3.738812785388128, "grad_norm": 1.3832494020462036, "learning_rate": 6.957889396245561e-06, "loss": 0.0034, "step": 4094 }, { "epoch": 3.73972602739726, "grad_norm": 1.0503665208816528, "learning_rate": 6.9568746829020805e-06, "loss": 0.0072, "step": 4095 }, { "epoch": 3.7406392694063926, "grad_norm": 109.10620880126953, "learning_rate": 6.9558599695586e-06, "loss": 4.4546, "step": 4096 }, { "epoch": 3.741552511415525, "grad_norm": 0.5482844710350037, "learning_rate": 6.954845256215121e-06, "loss": 0.0043, "step": 4097 }, { "epoch": 3.7424657534246575, "grad_norm": 6.241611957550049, "learning_rate": 6.953830542871639e-06, "loss": 0.0374, "step": 4098 }, { "epoch": 3.74337899543379, "grad_norm": 3.9202399253845215, "learning_rate": 6.952815829528159e-06, "loss": 0.017, "step": 4099 }, { "epoch": 3.7442922374429224, "grad_norm": 7.371834754943848, "learning_rate": 6.951801116184679e-06, "loss": 0.0481, "step": 4100 }, { "epoch": 3.745205479452055, "grad_norm": 1.2963567972183228, "learning_rate": 6.950786402841198e-06, "loss": 0.01, "step": 4101 }, { "epoch": 3.7461187214611873, "grad_norm": 1.0390487909317017, "learning_rate": 6.9497716894977175e-06, "loss": 0.0083, "step": 4102 }, { "epoch": 3.7470319634703197, "grad_norm": 41.51789855957031, "learning_rate": 6.948756976154237e-06, "loss": 0.3137, "step": 4103 }, { "epoch": 3.747945205479452, "grad_norm": 81.28630828857422, "learning_rate": 6.947742262810756e-06, "loss": 1.0569, "step": 4104 }, { "epoch": 3.748858447488584, "grad_norm": 52.914424896240234, "learning_rate": 6.946727549467276e-06, "loss": 0.4725, "step": 4105 }, { "epoch": 3.7497716894977167, "grad_norm": 14.031608581542969, "learning_rate": 6.945712836123795e-06, "loss": 0.1193, "step": 4106 }, { "epoch": 3.750684931506849, "grad_norm": 2.639314651489258, "learning_rate": 6.944698122780316e-06, "loss": 0.0158, "step": 4107 }, { "epoch": 3.7515981735159816, "grad_norm": 4.093026638031006, "learning_rate": 6.943683409436835e-06, "loss": 0.0213, "step": 4108 }, { "epoch": 3.752511415525114, "grad_norm": 0.24712808430194855, "learning_rate": 6.9426686960933545e-06, "loss": 0.0018, "step": 4109 }, { "epoch": 3.7534246575342465, "grad_norm": 105.69403839111328, "learning_rate": 6.941653982749874e-06, "loss": 1.1497, "step": 4110 }, { "epoch": 3.754337899543379, "grad_norm": 1.8981376886367798, "learning_rate": 6.940639269406393e-06, "loss": 0.0156, "step": 4111 }, { "epoch": 3.7552511415525114, "grad_norm": 27.98189926147461, "learning_rate": 6.939624556062913e-06, "loss": 0.2554, "step": 4112 }, { "epoch": 3.756164383561644, "grad_norm": 145.48001098632812, "learning_rate": 6.938609842719432e-06, "loss": 0.8795, "step": 4113 }, { "epoch": 3.7570776255707763, "grad_norm": 0.59312504529953, "learning_rate": 6.937595129375951e-06, "loss": 0.0053, "step": 4114 }, { "epoch": 3.7579908675799087, "grad_norm": 27.473167419433594, "learning_rate": 6.936580416032471e-06, "loss": 0.1693, "step": 4115 }, { "epoch": 3.758904109589041, "grad_norm": 5.14956521987915, "learning_rate": 6.9355657026889914e-06, "loss": 0.0472, "step": 4116 }, { "epoch": 3.7598173515981737, "grad_norm": 1.295670509338379, "learning_rate": 6.934550989345511e-06, "loss": 0.0079, "step": 4117 }, { "epoch": 3.760730593607306, "grad_norm": 6.5451483726501465, "learning_rate": 6.93353627600203e-06, "loss": 0.0343, "step": 4118 }, { "epoch": 3.7616438356164386, "grad_norm": 9.811758041381836, "learning_rate": 6.93252156265855e-06, "loss": 0.0597, "step": 4119 }, { "epoch": 3.762557077625571, "grad_norm": 6.644251823425293, "learning_rate": 6.931506849315069e-06, "loss": 0.0392, "step": 4120 }, { "epoch": 3.7634703196347035, "grad_norm": 1.143447995185852, "learning_rate": 6.930492135971588e-06, "loss": 0.0078, "step": 4121 }, { "epoch": 3.7643835616438355, "grad_norm": 0.4280158281326294, "learning_rate": 6.929477422628108e-06, "loss": 0.0027, "step": 4122 }, { "epoch": 3.765296803652968, "grad_norm": 79.28455352783203, "learning_rate": 6.928462709284628e-06, "loss": 1.3989, "step": 4123 }, { "epoch": 3.7662100456621004, "grad_norm": 5.953413486480713, "learning_rate": 6.9274479959411464e-06, "loss": 0.0529, "step": 4124 }, { "epoch": 3.767123287671233, "grad_norm": 5.452071666717529, "learning_rate": 6.926433282597667e-06, "loss": 0.0399, "step": 4125 }, { "epoch": 3.7680365296803653, "grad_norm": 27.501461029052734, "learning_rate": 6.925418569254187e-06, "loss": 0.1809, "step": 4126 }, { "epoch": 3.7689497716894977, "grad_norm": 7.581355094909668, "learning_rate": 6.924403855910706e-06, "loss": 0.0415, "step": 4127 }, { "epoch": 3.76986301369863, "grad_norm": 11.400049209594727, "learning_rate": 6.923389142567225e-06, "loss": 0.0627, "step": 4128 }, { "epoch": 3.7707762557077626, "grad_norm": 9.149380683898926, "learning_rate": 6.922374429223745e-06, "loss": 0.048, "step": 4129 }, { "epoch": 3.771689497716895, "grad_norm": 71.33353424072266, "learning_rate": 6.921359715880265e-06, "loss": 0.9406, "step": 4130 }, { "epoch": 3.7726027397260276, "grad_norm": 19.431795120239258, "learning_rate": 6.9203450025367834e-06, "loss": 0.1012, "step": 4131 }, { "epoch": 3.7735159817351596, "grad_norm": 118.15940856933594, "learning_rate": 6.919330289193303e-06, "loss": 5.0299, "step": 4132 }, { "epoch": 3.774429223744292, "grad_norm": 12.054444313049316, "learning_rate": 6.918315575849824e-06, "loss": 0.1108, "step": 4133 }, { "epoch": 3.7753424657534245, "grad_norm": 62.17317199707031, "learning_rate": 6.917300862506342e-06, "loss": 0.3791, "step": 4134 }, { "epoch": 3.776255707762557, "grad_norm": 89.4162826538086, "learning_rate": 6.916286149162862e-06, "loss": 0.7822, "step": 4135 }, { "epoch": 3.7771689497716894, "grad_norm": 86.65149688720703, "learning_rate": 6.915271435819382e-06, "loss": 1.1942, "step": 4136 }, { "epoch": 3.778082191780822, "grad_norm": 0.8460943698883057, "learning_rate": 6.914256722475902e-06, "loss": 0.0061, "step": 4137 }, { "epoch": 3.7789954337899543, "grad_norm": 5.573483467102051, "learning_rate": 6.9132420091324204e-06, "loss": 0.0441, "step": 4138 }, { "epoch": 3.7799086757990867, "grad_norm": 1.036279320716858, "learning_rate": 6.91222729578894e-06, "loss": 0.008, "step": 4139 }, { "epoch": 3.780821917808219, "grad_norm": 62.27153015136719, "learning_rate": 6.91121258244546e-06, "loss": 0.6849, "step": 4140 }, { "epoch": 3.7817351598173516, "grad_norm": 3.741307497024536, "learning_rate": 6.910197869101979e-06, "loss": 0.0324, "step": 4141 }, { "epoch": 3.782648401826484, "grad_norm": 13.447831153869629, "learning_rate": 6.909183155758498e-06, "loss": 0.1052, "step": 4142 }, { "epoch": 3.7835616438356166, "grad_norm": 422.4764404296875, "learning_rate": 6.908168442415019e-06, "loss": 2.2298, "step": 4143 }, { "epoch": 3.784474885844749, "grad_norm": 1.2639265060424805, "learning_rate": 6.907153729071538e-06, "loss": 0.0087, "step": 4144 }, { "epoch": 3.7853881278538815, "grad_norm": 0.8616583943367004, "learning_rate": 6.9061390157280574e-06, "loss": 0.0061, "step": 4145 }, { "epoch": 3.786301369863014, "grad_norm": 8.775202751159668, "learning_rate": 6.905124302384577e-06, "loss": 0.0547, "step": 4146 }, { "epoch": 3.7872146118721464, "grad_norm": 29.729658126831055, "learning_rate": 6.904109589041097e-06, "loss": 0.3011, "step": 4147 }, { "epoch": 3.7881278538812784, "grad_norm": 7.511214256286621, "learning_rate": 6.903094875697616e-06, "loss": 0.0418, "step": 4148 }, { "epoch": 3.789041095890411, "grad_norm": 5.960559844970703, "learning_rate": 6.902080162354135e-06, "loss": 0.032, "step": 4149 }, { "epoch": 3.7899543378995433, "grad_norm": 37.1842041015625, "learning_rate": 6.901065449010655e-06, "loss": 0.0443, "step": 4150 }, { "epoch": 3.7908675799086757, "grad_norm": 5.511570930480957, "learning_rate": 6.900050735667174e-06, "loss": 0.0487, "step": 4151 }, { "epoch": 3.791780821917808, "grad_norm": 2.0279786586761475, "learning_rate": 6.8990360223236944e-06, "loss": 0.0177, "step": 4152 }, { "epoch": 3.7926940639269406, "grad_norm": 7.663942337036133, "learning_rate": 6.898021308980214e-06, "loss": 0.0645, "step": 4153 }, { "epoch": 3.793607305936073, "grad_norm": 101.89940643310547, "learning_rate": 6.897006595636733e-06, "loss": 0.9309, "step": 4154 }, { "epoch": 3.7945205479452055, "grad_norm": 62.04570007324219, "learning_rate": 6.895991882293253e-06, "loss": 0.4487, "step": 4155 }, { "epoch": 3.795433789954338, "grad_norm": 12.074592590332031, "learning_rate": 6.894977168949772e-06, "loss": 0.0248, "step": 4156 }, { "epoch": 3.7963470319634705, "grad_norm": 10.245457649230957, "learning_rate": 6.893962455606292e-06, "loss": 0.0798, "step": 4157 }, { "epoch": 3.7972602739726025, "grad_norm": 21.70946502685547, "learning_rate": 6.892947742262811e-06, "loss": 0.1231, "step": 4158 }, { "epoch": 3.798173515981735, "grad_norm": 84.71639251708984, "learning_rate": 6.891933028919331e-06, "loss": 0.4883, "step": 4159 }, { "epoch": 3.7990867579908674, "grad_norm": 10.046184539794922, "learning_rate": 6.890918315575851e-06, "loss": 0.0695, "step": 4160 }, { "epoch": 3.8, "grad_norm": 14.210148811340332, "learning_rate": 6.889903602232369e-06, "loss": 0.0742, "step": 4161 }, { "epoch": 3.8009132420091323, "grad_norm": 36.52666091918945, "learning_rate": 6.88888888888889e-06, "loss": 0.187, "step": 4162 }, { "epoch": 3.8018264840182647, "grad_norm": 15.129415512084961, "learning_rate": 6.887874175545409e-06, "loss": 0.1213, "step": 4163 }, { "epoch": 3.802739726027397, "grad_norm": 19.742328643798828, "learning_rate": 6.886859462201928e-06, "loss": 0.212, "step": 4164 }, { "epoch": 3.8036529680365296, "grad_norm": 3.6549644470214844, "learning_rate": 6.885844748858448e-06, "loss": 0.0217, "step": 4165 }, { "epoch": 3.804566210045662, "grad_norm": 87.8946533203125, "learning_rate": 6.884830035514968e-06, "loss": 1.4855, "step": 4166 }, { "epoch": 3.8054794520547945, "grad_norm": 0.991487979888916, "learning_rate": 6.883815322171487e-06, "loss": 0.0071, "step": 4167 }, { "epoch": 3.806392694063927, "grad_norm": 2.3696539402008057, "learning_rate": 6.882800608828006e-06, "loss": 0.0195, "step": 4168 }, { "epoch": 3.8073059360730594, "grad_norm": 3.5429322719573975, "learning_rate": 6.881785895484527e-06, "loss": 0.0051, "step": 4169 }, { "epoch": 3.808219178082192, "grad_norm": 9.317170143127441, "learning_rate": 6.880771182141046e-06, "loss": 0.055, "step": 4170 }, { "epoch": 3.8091324200913244, "grad_norm": 7.9872589111328125, "learning_rate": 6.879756468797565e-06, "loss": 0.0449, "step": 4171 }, { "epoch": 3.810045662100457, "grad_norm": 39.1112060546875, "learning_rate": 6.878741755454085e-06, "loss": 0.4469, "step": 4172 }, { "epoch": 3.8109589041095893, "grad_norm": 1.7946118116378784, "learning_rate": 6.877727042110605e-06, "loss": 0.0072, "step": 4173 }, { "epoch": 3.8118721461187217, "grad_norm": 46.90468978881836, "learning_rate": 6.876712328767123e-06, "loss": 0.3404, "step": 4174 }, { "epoch": 3.8127853881278537, "grad_norm": 1.407321810722351, "learning_rate": 6.875697615423643e-06, "loss": 0.0065, "step": 4175 }, { "epoch": 3.813698630136986, "grad_norm": 59.544578552246094, "learning_rate": 6.874682902080163e-06, "loss": 0.902, "step": 4176 }, { "epoch": 3.8146118721461186, "grad_norm": 5.6052327156066895, "learning_rate": 6.873668188736683e-06, "loss": 0.0267, "step": 4177 }, { "epoch": 3.815525114155251, "grad_norm": 0.20944111049175262, "learning_rate": 6.872653475393201e-06, "loss": 0.0013, "step": 4178 }, { "epoch": 3.8164383561643835, "grad_norm": 4.855275630950928, "learning_rate": 6.871638762049722e-06, "loss": 0.0273, "step": 4179 }, { "epoch": 3.817351598173516, "grad_norm": 59.557395935058594, "learning_rate": 6.8706240487062416e-06, "loss": 0.3938, "step": 4180 }, { "epoch": 3.8182648401826484, "grad_norm": 10.160359382629395, "learning_rate": 6.86960933536276e-06, "loss": 0.0708, "step": 4181 }, { "epoch": 3.819178082191781, "grad_norm": 13.069756507873535, "learning_rate": 6.86859462201928e-06, "loss": 0.0899, "step": 4182 }, { "epoch": 3.8200913242009134, "grad_norm": 2.5031349658966064, "learning_rate": 6.8675799086758e-06, "loss": 0.0176, "step": 4183 }, { "epoch": 3.821004566210046, "grad_norm": 0.2658337652683258, "learning_rate": 6.866565195332319e-06, "loss": 0.0015, "step": 4184 }, { "epoch": 3.821917808219178, "grad_norm": 17.546396255493164, "learning_rate": 6.865550481988838e-06, "loss": 0.1359, "step": 4185 }, { "epoch": 3.8228310502283103, "grad_norm": 0.7065362334251404, "learning_rate": 6.864535768645358e-06, "loss": 0.0057, "step": 4186 }, { "epoch": 3.8237442922374427, "grad_norm": 62.40631103515625, "learning_rate": 6.8635210553018786e-06, "loss": 0.4445, "step": 4187 }, { "epoch": 3.824657534246575, "grad_norm": 0.4294723868370056, "learning_rate": 6.862506341958397e-06, "loss": 0.0024, "step": 4188 }, { "epoch": 3.8255707762557076, "grad_norm": 15.883291244506836, "learning_rate": 6.861491628614917e-06, "loss": 0.1042, "step": 4189 }, { "epoch": 3.82648401826484, "grad_norm": 1.882327914237976, "learning_rate": 6.860476915271437e-06, "loss": 0.0109, "step": 4190 }, { "epoch": 3.8273972602739725, "grad_norm": 40.00183868408203, "learning_rate": 6.859462201927956e-06, "loss": 0.2897, "step": 4191 }, { "epoch": 3.828310502283105, "grad_norm": 25.39442253112793, "learning_rate": 6.858447488584475e-06, "loss": 0.2001, "step": 4192 }, { "epoch": 3.8292237442922374, "grad_norm": 40.63257598876953, "learning_rate": 6.857432775240995e-06, "loss": 0.2026, "step": 4193 }, { "epoch": 3.83013698630137, "grad_norm": 0.35772374272346497, "learning_rate": 6.856418061897514e-06, "loss": 0.003, "step": 4194 }, { "epoch": 3.8310502283105023, "grad_norm": 1.7907055616378784, "learning_rate": 6.8554033485540336e-06, "loss": 0.0107, "step": 4195 }, { "epoch": 3.831963470319635, "grad_norm": 2.4288313388824463, "learning_rate": 6.854388635210554e-06, "loss": 0.0136, "step": 4196 }, { "epoch": 3.8328767123287673, "grad_norm": 1.6993095874786377, "learning_rate": 6.853373921867074e-06, "loss": 0.0153, "step": 4197 }, { "epoch": 3.8337899543378997, "grad_norm": 7.912725448608398, "learning_rate": 6.852359208523593e-06, "loss": 0.0372, "step": 4198 }, { "epoch": 3.834703196347032, "grad_norm": 2.8120343685150146, "learning_rate": 6.851344495180112e-06, "loss": 0.0223, "step": 4199 }, { "epoch": 3.8356164383561646, "grad_norm": 41.13436508178711, "learning_rate": 6.850329781836632e-06, "loss": 0.2271, "step": 4200 }, { "epoch": 3.836529680365297, "grad_norm": 85.16776275634766, "learning_rate": 6.849315068493151e-06, "loss": 0.6243, "step": 4201 }, { "epoch": 3.837442922374429, "grad_norm": 101.4961929321289, "learning_rate": 6.8483003551496706e-06, "loss": 2.2549, "step": 4202 }, { "epoch": 3.8383561643835615, "grad_norm": 3.70737886428833, "learning_rate": 6.84728564180619e-06, "loss": 0.0229, "step": 4203 }, { "epoch": 3.839269406392694, "grad_norm": 70.27249908447266, "learning_rate": 6.846270928462709e-06, "loss": 0.2941, "step": 4204 }, { "epoch": 3.8401826484018264, "grad_norm": 45.6216926574707, "learning_rate": 6.84525621511923e-06, "loss": 0.3296, "step": 4205 }, { "epoch": 3.841095890410959, "grad_norm": 4.009867191314697, "learning_rate": 6.844241501775749e-06, "loss": 0.0199, "step": 4206 }, { "epoch": 3.8420091324200913, "grad_norm": 0.15008734166622162, "learning_rate": 6.843226788432269e-06, "loss": 0.001, "step": 4207 }, { "epoch": 3.842922374429224, "grad_norm": 23.056087493896484, "learning_rate": 6.842212075088788e-06, "loss": 0.1818, "step": 4208 }, { "epoch": 3.8438356164383563, "grad_norm": 5.124761581420898, "learning_rate": 6.8411973617453076e-06, "loss": 0.0492, "step": 4209 }, { "epoch": 3.8447488584474887, "grad_norm": 4.765244483947754, "learning_rate": 6.840182648401827e-06, "loss": 0.0273, "step": 4210 }, { "epoch": 3.845662100456621, "grad_norm": 49.503639221191406, "learning_rate": 6.839167935058346e-06, "loss": 0.3917, "step": 4211 }, { "epoch": 3.846575342465753, "grad_norm": 18.182170867919922, "learning_rate": 6.838153221714866e-06, "loss": 0.1211, "step": 4212 }, { "epoch": 3.8474885844748856, "grad_norm": 0.4466243088245392, "learning_rate": 6.837138508371386e-06, "loss": 0.0028, "step": 4213 }, { "epoch": 3.848401826484018, "grad_norm": 0.88437819480896, "learning_rate": 6.836123795027904e-06, "loss": 0.0085, "step": 4214 }, { "epoch": 3.8493150684931505, "grad_norm": 1.047544002532959, "learning_rate": 6.835109081684425e-06, "loss": 0.0051, "step": 4215 }, { "epoch": 3.850228310502283, "grad_norm": 1.6131069660186768, "learning_rate": 6.8340943683409446e-06, "loss": 0.0025, "step": 4216 }, { "epoch": 3.8511415525114154, "grad_norm": 23.878164291381836, "learning_rate": 6.833079654997464e-06, "loss": 0.0438, "step": 4217 }, { "epoch": 3.852054794520548, "grad_norm": 1.8611100912094116, "learning_rate": 6.832064941653983e-06, "loss": 0.0129, "step": 4218 }, { "epoch": 3.8529680365296803, "grad_norm": 0.23206472396850586, "learning_rate": 6.831050228310503e-06, "loss": 0.0015, "step": 4219 }, { "epoch": 3.853881278538813, "grad_norm": 2.8302743434906006, "learning_rate": 6.8300355149670225e-06, "loss": 0.0157, "step": 4220 }, { "epoch": 3.8547945205479452, "grad_norm": 28.946989059448242, "learning_rate": 6.829020801623541e-06, "loss": 0.1675, "step": 4221 }, { "epoch": 3.8557077625570777, "grad_norm": 2.7978975772857666, "learning_rate": 6.828006088280061e-06, "loss": 0.0171, "step": 4222 }, { "epoch": 3.85662100456621, "grad_norm": 1.144190788269043, "learning_rate": 6.8269913749365816e-06, "loss": 0.0069, "step": 4223 }, { "epoch": 3.8575342465753426, "grad_norm": 2.02266788482666, "learning_rate": 6.8259766615931e-06, "loss": 0.0144, "step": 4224 }, { "epoch": 3.858447488584475, "grad_norm": 160.9676055908203, "learning_rate": 6.82496194824962e-06, "loss": 1.2369, "step": 4225 }, { "epoch": 3.8593607305936075, "grad_norm": 22.599836349487305, "learning_rate": 6.82394723490614e-06, "loss": 0.1495, "step": 4226 }, { "epoch": 3.86027397260274, "grad_norm": 8.724139213562012, "learning_rate": 6.8229325215626595e-06, "loss": 0.0446, "step": 4227 }, { "epoch": 3.8611872146118724, "grad_norm": 0.1451530009508133, "learning_rate": 6.821917808219178e-06, "loss": 0.001, "step": 4228 }, { "epoch": 3.8621004566210044, "grad_norm": 2.2594685554504395, "learning_rate": 6.820903094875698e-06, "loss": 0.0099, "step": 4229 }, { "epoch": 3.863013698630137, "grad_norm": 2.436340093612671, "learning_rate": 6.819888381532218e-06, "loss": 0.0117, "step": 4230 }, { "epoch": 3.8639269406392693, "grad_norm": 42.27296447753906, "learning_rate": 6.8188736681887366e-06, "loss": 0.2856, "step": 4231 }, { "epoch": 3.864840182648402, "grad_norm": 46.97990798950195, "learning_rate": 6.817858954845257e-06, "loss": 0.3857, "step": 4232 }, { "epoch": 3.8657534246575342, "grad_norm": 1.7711389064788818, "learning_rate": 6.816844241501777e-06, "loss": 0.0099, "step": 4233 }, { "epoch": 3.8666666666666667, "grad_norm": 59.80369567871094, "learning_rate": 6.815829528158296e-06, "loss": 0.2002, "step": 4234 }, { "epoch": 3.867579908675799, "grad_norm": 0.2928674519062042, "learning_rate": 6.814814814814815e-06, "loss": 0.001, "step": 4235 }, { "epoch": 3.8684931506849316, "grad_norm": 0.9540826678276062, "learning_rate": 6.813800101471335e-06, "loss": 0.0076, "step": 4236 }, { "epoch": 3.869406392694064, "grad_norm": 1.8615493774414062, "learning_rate": 6.812785388127855e-06, "loss": 0.0126, "step": 4237 }, { "epoch": 3.8703196347031965, "grad_norm": 0.9492036700248718, "learning_rate": 6.8117706747843736e-06, "loss": 0.0054, "step": 4238 }, { "epoch": 3.8712328767123285, "grad_norm": 285.1290588378906, "learning_rate": 6.810755961440893e-06, "loss": 3.4513, "step": 4239 }, { "epoch": 3.872146118721461, "grad_norm": 0.04039780795574188, "learning_rate": 6.809741248097414e-06, "loss": 0.0004, "step": 4240 }, { "epoch": 3.8730593607305934, "grad_norm": 7.815289497375488, "learning_rate": 6.808726534753932e-06, "loss": 0.0708, "step": 4241 }, { "epoch": 3.873972602739726, "grad_norm": 1.4843251705169678, "learning_rate": 6.807711821410452e-06, "loss": 0.0104, "step": 4242 }, { "epoch": 3.8748858447488583, "grad_norm": 1.3468239307403564, "learning_rate": 6.806697108066972e-06, "loss": 0.0105, "step": 4243 }, { "epoch": 3.875799086757991, "grad_norm": 15.438456535339355, "learning_rate": 6.805682394723491e-06, "loss": 0.0854, "step": 4244 }, { "epoch": 3.8767123287671232, "grad_norm": 51.13793182373047, "learning_rate": 6.8046676813800105e-06, "loss": 0.3023, "step": 4245 }, { "epoch": 3.8776255707762557, "grad_norm": 0.27386948466300964, "learning_rate": 6.80365296803653e-06, "loss": 0.0011, "step": 4246 }, { "epoch": 3.878538812785388, "grad_norm": 147.39222717285156, "learning_rate": 6.80263825469305e-06, "loss": 3.3281, "step": 4247 }, { "epoch": 3.8794520547945206, "grad_norm": 102.33924102783203, "learning_rate": 6.801623541349569e-06, "loss": 3.3643, "step": 4248 }, { "epoch": 3.880365296803653, "grad_norm": 14.268165588378906, "learning_rate": 6.800608828006089e-06, "loss": 0.0826, "step": 4249 }, { "epoch": 3.8812785388127855, "grad_norm": 4.188077926635742, "learning_rate": 6.799594114662609e-06, "loss": 0.026, "step": 4250 }, { "epoch": 3.882191780821918, "grad_norm": 2.989499568939209, "learning_rate": 6.798579401319128e-06, "loss": 0.0227, "step": 4251 }, { "epoch": 3.8831050228310504, "grad_norm": 47.27411651611328, "learning_rate": 6.7975646879756475e-06, "loss": 0.6942, "step": 4252 }, { "epoch": 3.884018264840183, "grad_norm": 21.502912521362305, "learning_rate": 6.796549974632167e-06, "loss": 0.1287, "step": 4253 }, { "epoch": 3.8849315068493153, "grad_norm": 0.18859194219112396, "learning_rate": 6.795535261288686e-06, "loss": 0.0009, "step": 4254 }, { "epoch": 3.8858447488584473, "grad_norm": 0.43195727467536926, "learning_rate": 6.794520547945206e-06, "loss": 0.0018, "step": 4255 }, { "epoch": 3.88675799086758, "grad_norm": 0.4441935420036316, "learning_rate": 6.7935058346017255e-06, "loss": 0.0034, "step": 4256 }, { "epoch": 3.8876712328767122, "grad_norm": 49.4456901550293, "learning_rate": 6.792491121258246e-06, "loss": 0.4818, "step": 4257 }, { "epoch": 3.8885844748858447, "grad_norm": 10.015874862670898, "learning_rate": 6.791476407914764e-06, "loss": 0.0536, "step": 4258 }, { "epoch": 3.889497716894977, "grad_norm": 8.760370254516602, "learning_rate": 6.7904616945712845e-06, "loss": 0.0616, "step": 4259 }, { "epoch": 3.8904109589041096, "grad_norm": 0.05287552252411842, "learning_rate": 6.789446981227804e-06, "loss": 0.0004, "step": 4260 }, { "epoch": 3.891324200913242, "grad_norm": 15.092194557189941, "learning_rate": 6.788432267884323e-06, "loss": 0.0836, "step": 4261 }, { "epoch": 3.8922374429223745, "grad_norm": 0.33864066004753113, "learning_rate": 6.787417554540843e-06, "loss": 0.0013, "step": 4262 }, { "epoch": 3.893150684931507, "grad_norm": 0.976861298084259, "learning_rate": 6.7864028411973625e-06, "loss": 0.0071, "step": 4263 }, { "epoch": 3.8940639269406394, "grad_norm": 0.6568773984909058, "learning_rate": 6.785388127853881e-06, "loss": 0.0034, "step": 4264 }, { "epoch": 3.8949771689497714, "grad_norm": 26.505502700805664, "learning_rate": 6.784373414510401e-06, "loss": 0.106, "step": 4265 }, { "epoch": 3.895890410958904, "grad_norm": 5.384793758392334, "learning_rate": 6.783358701166921e-06, "loss": 0.0337, "step": 4266 }, { "epoch": 3.8968036529680363, "grad_norm": 0.5215028524398804, "learning_rate": 6.782343987823441e-06, "loss": 0.0032, "step": 4267 }, { "epoch": 3.8977168949771688, "grad_norm": 88.91465759277344, "learning_rate": 6.78132927447996e-06, "loss": 0.8361, "step": 4268 }, { "epoch": 3.8986301369863012, "grad_norm": 12.004197120666504, "learning_rate": 6.78031456113648e-06, "loss": 0.0416, "step": 4269 }, { "epoch": 3.8995433789954337, "grad_norm": 1.187647819519043, "learning_rate": 6.7792998477929995e-06, "loss": 0.0087, "step": 4270 }, { "epoch": 3.900456621004566, "grad_norm": 3.0877041816711426, "learning_rate": 6.778285134449518e-06, "loss": 0.0197, "step": 4271 }, { "epoch": 3.9013698630136986, "grad_norm": 0.3406069576740265, "learning_rate": 6.777270421106038e-06, "loss": 0.002, "step": 4272 }, { "epoch": 3.902283105022831, "grad_norm": 9.347029685974121, "learning_rate": 6.776255707762558e-06, "loss": 0.0493, "step": 4273 }, { "epoch": 3.9031963470319635, "grad_norm": 3.262897491455078, "learning_rate": 6.7752409944190765e-06, "loss": 0.0237, "step": 4274 }, { "epoch": 3.904109589041096, "grad_norm": 0.33271265029907227, "learning_rate": 6.774226281075596e-06, "loss": 0.0021, "step": 4275 }, { "epoch": 3.9050228310502284, "grad_norm": 0.2868013083934784, "learning_rate": 6.773211567732117e-06, "loss": 0.0021, "step": 4276 }, { "epoch": 3.905936073059361, "grad_norm": 98.30290222167969, "learning_rate": 6.7721968543886365e-06, "loss": 1.1585, "step": 4277 }, { "epoch": 3.9068493150684933, "grad_norm": 147.87774658203125, "learning_rate": 6.771182141045155e-06, "loss": 0.4331, "step": 4278 }, { "epoch": 3.9077625570776258, "grad_norm": 0.17415672540664673, "learning_rate": 6.770167427701675e-06, "loss": 0.0006, "step": 4279 }, { "epoch": 3.908675799086758, "grad_norm": 24.357751846313477, "learning_rate": 6.769152714358195e-06, "loss": 0.1012, "step": 4280 }, { "epoch": 3.9095890410958907, "grad_norm": 0.5692216157913208, "learning_rate": 6.7681380010147135e-06, "loss": 0.0044, "step": 4281 }, { "epoch": 3.9105022831050227, "grad_norm": 45.773902893066406, "learning_rate": 6.767123287671233e-06, "loss": 0.4909, "step": 4282 }, { "epoch": 3.911415525114155, "grad_norm": 0.11112138628959656, "learning_rate": 6.766108574327753e-06, "loss": 0.0007, "step": 4283 }, { "epoch": 3.9123287671232876, "grad_norm": 22.585405349731445, "learning_rate": 6.765093860984272e-06, "loss": 0.188, "step": 4284 }, { "epoch": 3.91324200913242, "grad_norm": 101.54639434814453, "learning_rate": 6.7640791476407915e-06, "loss": 0.7026, "step": 4285 }, { "epoch": 3.9141552511415525, "grad_norm": 9.810993194580078, "learning_rate": 6.763064434297312e-06, "loss": 0.0628, "step": 4286 }, { "epoch": 3.915068493150685, "grad_norm": 21.865068435668945, "learning_rate": 6.762049720953832e-06, "loss": 0.1411, "step": 4287 }, { "epoch": 3.9159817351598174, "grad_norm": 44.103240966796875, "learning_rate": 6.7610350076103505e-06, "loss": 0.3884, "step": 4288 }, { "epoch": 3.91689497716895, "grad_norm": 102.78223419189453, "learning_rate": 6.76002029426687e-06, "loss": 0.3157, "step": 4289 }, { "epoch": 3.9178082191780823, "grad_norm": 135.3174285888672, "learning_rate": 6.75900558092339e-06, "loss": 1.9992, "step": 4290 }, { "epoch": 3.9187214611872148, "grad_norm": 0.6585187315940857, "learning_rate": 6.757990867579909e-06, "loss": 0.0054, "step": 4291 }, { "epoch": 3.9196347031963468, "grad_norm": 1.0223287343978882, "learning_rate": 6.7569761542364285e-06, "loss": 0.01, "step": 4292 }, { "epoch": 3.9205479452054792, "grad_norm": 1.5726814270019531, "learning_rate": 6.755961440892949e-06, "loss": 0.0116, "step": 4293 }, { "epoch": 3.9214611872146117, "grad_norm": 19.928674697875977, "learning_rate": 6.754946727549467e-06, "loss": 0.1086, "step": 4294 }, { "epoch": 3.922374429223744, "grad_norm": 0.35906630754470825, "learning_rate": 6.7539320142059875e-06, "loss": 0.0019, "step": 4295 }, { "epoch": 3.9232876712328766, "grad_norm": 7.213819980621338, "learning_rate": 6.752917300862507e-06, "loss": 0.0475, "step": 4296 }, { "epoch": 3.924200913242009, "grad_norm": 25.834047317504883, "learning_rate": 6.751902587519027e-06, "loss": 0.1549, "step": 4297 }, { "epoch": 3.9251141552511415, "grad_norm": 1.9588813781738281, "learning_rate": 6.750887874175546e-06, "loss": 0.0122, "step": 4298 }, { "epoch": 3.926027397260274, "grad_norm": 0.8409919738769531, "learning_rate": 6.7498731608320655e-06, "loss": 0.0061, "step": 4299 }, { "epoch": 3.9269406392694064, "grad_norm": 28.482572555541992, "learning_rate": 6.748858447488585e-06, "loss": 0.1867, "step": 4300 }, { "epoch": 3.927853881278539, "grad_norm": 1.2147455215454102, "learning_rate": 6.747843734145104e-06, "loss": 0.0066, "step": 4301 }, { "epoch": 3.9287671232876713, "grad_norm": 6.461462497711182, "learning_rate": 6.746829020801624e-06, "loss": 0.0439, "step": 4302 }, { "epoch": 3.9296803652968038, "grad_norm": 2.9081625938415527, "learning_rate": 6.745814307458144e-06, "loss": 0.0192, "step": 4303 }, { "epoch": 3.930593607305936, "grad_norm": 20.259004592895508, "learning_rate": 6.744799594114663e-06, "loss": 0.1095, "step": 4304 }, { "epoch": 3.9315068493150687, "grad_norm": 6.34426736831665, "learning_rate": 6.743784880771183e-06, "loss": 0.0338, "step": 4305 }, { "epoch": 3.932420091324201, "grad_norm": 231.98623657226562, "learning_rate": 6.7427701674277024e-06, "loss": 0.6133, "step": 4306 }, { "epoch": 3.9333333333333336, "grad_norm": 9.285202026367188, "learning_rate": 6.741755454084222e-06, "loss": 0.0545, "step": 4307 }, { "epoch": 3.934246575342466, "grad_norm": 20.761444091796875, "learning_rate": 6.740740740740741e-06, "loss": 0.1443, "step": 4308 }, { "epoch": 3.935159817351598, "grad_norm": 0.550426185131073, "learning_rate": 6.739726027397261e-06, "loss": 0.0037, "step": 4309 }, { "epoch": 3.9360730593607305, "grad_norm": 2.585886240005493, "learning_rate": 6.73871131405378e-06, "loss": 0.0156, "step": 4310 }, { "epoch": 3.936986301369863, "grad_norm": 14.941396713256836, "learning_rate": 6.737696600710299e-06, "loss": 0.1002, "step": 4311 }, { "epoch": 3.9378995433789954, "grad_norm": 29.644792556762695, "learning_rate": 6.73668188736682e-06, "loss": 0.1937, "step": 4312 }, { "epoch": 3.938812785388128, "grad_norm": 137.5024871826172, "learning_rate": 6.7356671740233394e-06, "loss": 0.9547, "step": 4313 }, { "epoch": 3.9397260273972603, "grad_norm": 61.112022399902344, "learning_rate": 6.734652460679858e-06, "loss": 0.3977, "step": 4314 }, { "epoch": 3.9406392694063928, "grad_norm": 0.013777137733995914, "learning_rate": 6.733637747336378e-06, "loss": 0.0001, "step": 4315 }, { "epoch": 3.941552511415525, "grad_norm": 0.3242696225643158, "learning_rate": 6.732623033992898e-06, "loss": 0.0028, "step": 4316 }, { "epoch": 3.9424657534246577, "grad_norm": 36.624813079833984, "learning_rate": 6.731608320649417e-06, "loss": 0.5111, "step": 4317 }, { "epoch": 3.94337899543379, "grad_norm": 1.2043426036834717, "learning_rate": 6.730593607305936e-06, "loss": 0.0056, "step": 4318 }, { "epoch": 3.944292237442922, "grad_norm": 1.4746760129928589, "learning_rate": 6.729578893962456e-06, "loss": 0.0112, "step": 4319 }, { "epoch": 3.9452054794520546, "grad_norm": 10.213351249694824, "learning_rate": 6.7285641806189764e-06, "loss": 0.0802, "step": 4320 }, { "epoch": 3.946118721461187, "grad_norm": 0.07185765355825424, "learning_rate": 6.7275494672754944e-06, "loss": 0.0005, "step": 4321 }, { "epoch": 3.9470319634703195, "grad_norm": 0.30083978176116943, "learning_rate": 6.726534753932015e-06, "loss": 0.0025, "step": 4322 }, { "epoch": 3.947945205479452, "grad_norm": 2.563673734664917, "learning_rate": 6.725520040588535e-06, "loss": 0.0132, "step": 4323 }, { "epoch": 3.9488584474885844, "grad_norm": 0.5913723707199097, "learning_rate": 6.7245053272450535e-06, "loss": 0.0056, "step": 4324 }, { "epoch": 3.949771689497717, "grad_norm": 68.80867004394531, "learning_rate": 6.723490613901573e-06, "loss": 0.9575, "step": 4325 }, { "epoch": 3.9506849315068493, "grad_norm": 0.17904160916805267, "learning_rate": 6.722475900558093e-06, "loss": 0.001, "step": 4326 }, { "epoch": 3.9515981735159817, "grad_norm": 26.973724365234375, "learning_rate": 6.721461187214613e-06, "loss": 0.1646, "step": 4327 }, { "epoch": 3.952511415525114, "grad_norm": 0.48832452297210693, "learning_rate": 6.7204464738711314e-06, "loss": 0.0036, "step": 4328 }, { "epoch": 3.9534246575342467, "grad_norm": 1.3999547958374023, "learning_rate": 6.719431760527652e-06, "loss": 0.0094, "step": 4329 }, { "epoch": 3.954337899543379, "grad_norm": 161.85487365722656, "learning_rate": 6.718417047184172e-06, "loss": 0.6369, "step": 4330 }, { "epoch": 3.9552511415525116, "grad_norm": 2.1891746520996094, "learning_rate": 6.7174023338406905e-06, "loss": 0.017, "step": 4331 }, { "epoch": 3.956164383561644, "grad_norm": 4.523464679718018, "learning_rate": 6.71638762049721e-06, "loss": 0.0155, "step": 4332 }, { "epoch": 3.9570776255707765, "grad_norm": 2.9119391441345215, "learning_rate": 6.71537290715373e-06, "loss": 0.0162, "step": 4333 }, { "epoch": 3.957990867579909, "grad_norm": 0.5129327178001404, "learning_rate": 6.714358193810249e-06, "loss": 0.0038, "step": 4334 }, { "epoch": 3.958904109589041, "grad_norm": 45.12488555908203, "learning_rate": 6.7133434804667684e-06, "loss": 0.2908, "step": 4335 }, { "epoch": 3.9598173515981734, "grad_norm": 14.420683860778809, "learning_rate": 6.712328767123288e-06, "loss": 0.0635, "step": 4336 }, { "epoch": 3.960730593607306, "grad_norm": 12.745102882385254, "learning_rate": 6.711314053779809e-06, "loss": 0.1266, "step": 4337 }, { "epoch": 3.9616438356164383, "grad_norm": 95.03225708007812, "learning_rate": 6.710299340436327e-06, "loss": 2.3151, "step": 4338 }, { "epoch": 3.9625570776255707, "grad_norm": 20.90781593322754, "learning_rate": 6.709284627092847e-06, "loss": 0.1357, "step": 4339 }, { "epoch": 3.963470319634703, "grad_norm": 2.6729984283447266, "learning_rate": 6.708269913749367e-06, "loss": 0.0139, "step": 4340 }, { "epoch": 3.9643835616438357, "grad_norm": 104.09454345703125, "learning_rate": 6.707255200405886e-06, "loss": 1.6425, "step": 4341 }, { "epoch": 3.965296803652968, "grad_norm": 1.1421806812286377, "learning_rate": 6.7062404870624054e-06, "loss": 0.0063, "step": 4342 }, { "epoch": 3.9662100456621006, "grad_norm": 26.226119995117188, "learning_rate": 6.705225773718925e-06, "loss": 0.1946, "step": 4343 }, { "epoch": 3.967123287671233, "grad_norm": 2.690263271331787, "learning_rate": 6.704211060375444e-06, "loss": 0.0198, "step": 4344 }, { "epoch": 3.968036529680365, "grad_norm": 9.97222900390625, "learning_rate": 6.703196347031964e-06, "loss": 0.0518, "step": 4345 }, { "epoch": 3.9689497716894975, "grad_norm": 3.7010293006896973, "learning_rate": 6.702181633688483e-06, "loss": 0.0148, "step": 4346 }, { "epoch": 3.96986301369863, "grad_norm": 14.080850601196289, "learning_rate": 6.701166920345004e-06, "loss": 0.1303, "step": 4347 }, { "epoch": 3.9707762557077624, "grad_norm": 25.844703674316406, "learning_rate": 6.700152207001523e-06, "loss": 0.2009, "step": 4348 }, { "epoch": 3.971689497716895, "grad_norm": 3.774754524230957, "learning_rate": 6.6991374936580424e-06, "loss": 0.0227, "step": 4349 }, { "epoch": 3.9726027397260273, "grad_norm": 0.4060371518135071, "learning_rate": 6.698122780314562e-06, "loss": 0.0032, "step": 4350 }, { "epoch": 3.9735159817351597, "grad_norm": 1.3810046911239624, "learning_rate": 6.697108066971081e-06, "loss": 0.0051, "step": 4351 }, { "epoch": 3.974429223744292, "grad_norm": 7.369829177856445, "learning_rate": 6.696093353627601e-06, "loss": 0.036, "step": 4352 }, { "epoch": 3.9753424657534246, "grad_norm": 26.473217010498047, "learning_rate": 6.69507864028412e-06, "loss": 0.2047, "step": 4353 }, { "epoch": 3.976255707762557, "grad_norm": 7.185098648071289, "learning_rate": 6.694063926940639e-06, "loss": 0.0433, "step": 4354 }, { "epoch": 3.9771689497716896, "grad_norm": 4.9656758308410645, "learning_rate": 6.693049213597159e-06, "loss": 0.0347, "step": 4355 }, { "epoch": 3.978082191780822, "grad_norm": 3.75113582611084, "learning_rate": 6.6920345002536794e-06, "loss": 0.0197, "step": 4356 }, { "epoch": 3.9789954337899545, "grad_norm": 62.97331237792969, "learning_rate": 6.691019786910199e-06, "loss": 0.801, "step": 4357 }, { "epoch": 3.979908675799087, "grad_norm": 17.946338653564453, "learning_rate": 6.690005073566718e-06, "loss": 0.1229, "step": 4358 }, { "epoch": 3.9808219178082194, "grad_norm": 0.5518220663070679, "learning_rate": 6.688990360223238e-06, "loss": 0.0025, "step": 4359 }, { "epoch": 3.981735159817352, "grad_norm": 0.06094847992062569, "learning_rate": 6.687975646879757e-06, "loss": 0.0003, "step": 4360 }, { "epoch": 3.9826484018264843, "grad_norm": 1.5134568214416504, "learning_rate": 6.686960933536276e-06, "loss": 0.0124, "step": 4361 }, { "epoch": 3.9835616438356163, "grad_norm": 13.719854354858398, "learning_rate": 6.685946220192796e-06, "loss": 0.0636, "step": 4362 }, { "epoch": 3.9844748858447487, "grad_norm": 28.80373764038086, "learning_rate": 6.684931506849316e-06, "loss": 0.1783, "step": 4363 }, { "epoch": 3.985388127853881, "grad_norm": 0.18277834355831146, "learning_rate": 6.683916793505834e-06, "loss": 0.0012, "step": 4364 }, { "epoch": 3.9863013698630136, "grad_norm": 48.456295013427734, "learning_rate": 6.682902080162354e-06, "loss": 0.4692, "step": 4365 }, { "epoch": 3.987214611872146, "grad_norm": 8.320455551147461, "learning_rate": 6.681887366818875e-06, "loss": 0.0694, "step": 4366 }, { "epoch": 3.9881278538812786, "grad_norm": 1.6431998014450073, "learning_rate": 6.680872653475394e-06, "loss": 0.0114, "step": 4367 }, { "epoch": 3.989041095890411, "grad_norm": 7.065260887145996, "learning_rate": 6.679857940131913e-06, "loss": 0.0533, "step": 4368 }, { "epoch": 3.9899543378995435, "grad_norm": 156.12852478027344, "learning_rate": 6.678843226788433e-06, "loss": 0.7346, "step": 4369 }, { "epoch": 3.990867579908676, "grad_norm": 39.5681266784668, "learning_rate": 6.6778285134449526e-06, "loss": 0.3216, "step": 4370 }, { "epoch": 3.9917808219178084, "grad_norm": 0.23215252161026, "learning_rate": 6.676813800101471e-06, "loss": 0.0013, "step": 4371 }, { "epoch": 3.9926940639269404, "grad_norm": 5.698763370513916, "learning_rate": 6.675799086757991e-06, "loss": 0.057, "step": 4372 }, { "epoch": 3.993607305936073, "grad_norm": 23.10250473022461, "learning_rate": 6.674784373414512e-06, "loss": 0.1244, "step": 4373 }, { "epoch": 3.9945205479452053, "grad_norm": 0.02606787718832493, "learning_rate": 6.67376966007103e-06, "loss": 0.0002, "step": 4374 }, { "epoch": 3.9954337899543377, "grad_norm": 34.47496032714844, "learning_rate": 6.67275494672755e-06, "loss": 0.2299, "step": 4375 }, { "epoch": 3.99634703196347, "grad_norm": 5.190269470214844, "learning_rate": 6.67174023338407e-06, "loss": 0.0351, "step": 4376 }, { "epoch": 3.9972602739726026, "grad_norm": 32.93498229980469, "learning_rate": 6.6707255200405896e-06, "loss": 0.2262, "step": 4377 }, { "epoch": 3.998173515981735, "grad_norm": 0.42780500650405884, "learning_rate": 6.669710806697108e-06, "loss": 0.0024, "step": 4378 }, { "epoch": 3.9990867579908675, "grad_norm": 65.05498504638672, "learning_rate": 6.668696093353628e-06, "loss": 0.6528, "step": 4379 }, { "epoch": 4.0, "grad_norm": 0.20320548117160797, "learning_rate": 6.667681380010148e-06, "loss": 0.0014, "step": 4380 }, { "epoch": 4.0009132420091325, "grad_norm": 67.1938705444336, "learning_rate": 6.666666666666667e-06, "loss": 0.3547, "step": 4381 }, { "epoch": 4.001826484018265, "grad_norm": 2.124309539794922, "learning_rate": 6.665651953323186e-06, "loss": 0.0154, "step": 4382 }, { "epoch": 4.002739726027397, "grad_norm": 6.462545394897461, "learning_rate": 6.664637239979707e-06, "loss": 0.0507, "step": 4383 }, { "epoch": 4.00365296803653, "grad_norm": 2.19303035736084, "learning_rate": 6.663622526636226e-06, "loss": 0.0083, "step": 4384 }, { "epoch": 4.004566210045662, "grad_norm": 2.1248762607574463, "learning_rate": 6.662607813292745e-06, "loss": 0.0173, "step": 4385 }, { "epoch": 4.005479452054795, "grad_norm": 29.053329467773438, "learning_rate": 6.661593099949265e-06, "loss": 0.1337, "step": 4386 }, { "epoch": 4.006392694063927, "grad_norm": 0.5232353806495667, "learning_rate": 6.660578386605785e-06, "loss": 0.0039, "step": 4387 }, { "epoch": 4.00730593607306, "grad_norm": 32.7874755859375, "learning_rate": 6.659563673262304e-06, "loss": 0.2964, "step": 4388 }, { "epoch": 4.008219178082192, "grad_norm": 6.5639567375183105, "learning_rate": 6.658548959918823e-06, "loss": 0.0414, "step": 4389 }, { "epoch": 4.0091324200913245, "grad_norm": 2.103320360183716, "learning_rate": 6.657534246575343e-06, "loss": 0.017, "step": 4390 }, { "epoch": 4.010045662100457, "grad_norm": 5.3260884284973145, "learning_rate": 6.656519533231862e-06, "loss": 0.0333, "step": 4391 }, { "epoch": 4.010958904109589, "grad_norm": 10.318753242492676, "learning_rate": 6.655504819888382e-06, "loss": 0.0732, "step": 4392 }, { "epoch": 4.011872146118722, "grad_norm": 6.28458309173584, "learning_rate": 6.654490106544902e-06, "loss": 0.0231, "step": 4393 }, { "epoch": 4.0127853881278535, "grad_norm": 59.08998107910156, "learning_rate": 6.653475393201421e-06, "loss": 0.3075, "step": 4394 }, { "epoch": 4.013698630136986, "grad_norm": 2.5896666049957275, "learning_rate": 6.652460679857941e-06, "loss": 0.0155, "step": 4395 }, { "epoch": 4.014611872146118, "grad_norm": 2.729989767074585, "learning_rate": 6.65144596651446e-06, "loss": 0.019, "step": 4396 }, { "epoch": 4.015525114155251, "grad_norm": 0.9828444719314575, "learning_rate": 6.65043125317098e-06, "loss": 0.0072, "step": 4397 }, { "epoch": 4.016438356164383, "grad_norm": 3.8048899173736572, "learning_rate": 6.649416539827499e-06, "loss": 0.0237, "step": 4398 }, { "epoch": 4.017351598173516, "grad_norm": 20.210111618041992, "learning_rate": 6.6484018264840186e-06, "loss": 0.0854, "step": 4399 }, { "epoch": 4.018264840182648, "grad_norm": 11.666059494018555, "learning_rate": 6.647387113140539e-06, "loss": 0.0724, "step": 4400 }, { "epoch": 4.019178082191781, "grad_norm": 4.321464538574219, "learning_rate": 6.646372399797057e-06, "loss": 0.0257, "step": 4401 }, { "epoch": 4.020091324200913, "grad_norm": 0.34054115414619446, "learning_rate": 6.645357686453578e-06, "loss": 0.0025, "step": 4402 }, { "epoch": 4.0210045662100455, "grad_norm": 14.206340789794922, "learning_rate": 6.644342973110097e-06, "loss": 0.0344, "step": 4403 }, { "epoch": 4.021917808219178, "grad_norm": 1.0285347700119019, "learning_rate": 6.643328259766616e-06, "loss": 0.0051, "step": 4404 }, { "epoch": 4.0228310502283104, "grad_norm": 5.2390666007995605, "learning_rate": 6.642313546423136e-06, "loss": 0.0283, "step": 4405 }, { "epoch": 4.023744292237443, "grad_norm": 0.2101077288389206, "learning_rate": 6.6412988330796556e-06, "loss": 0.0018, "step": 4406 }, { "epoch": 4.024657534246575, "grad_norm": 1.7806907892227173, "learning_rate": 6.640284119736175e-06, "loss": 0.0135, "step": 4407 }, { "epoch": 4.025570776255708, "grad_norm": 3.539686441421509, "learning_rate": 6.639269406392694e-06, "loss": 0.0262, "step": 4408 }, { "epoch": 4.02648401826484, "grad_norm": 0.3399296700954437, "learning_rate": 6.638254693049215e-06, "loss": 0.0029, "step": 4409 }, { "epoch": 4.027397260273973, "grad_norm": 3.3465590476989746, "learning_rate": 6.637239979705734e-06, "loss": 0.0187, "step": 4410 }, { "epoch": 4.028310502283105, "grad_norm": 0.8508517742156982, "learning_rate": 6.636225266362253e-06, "loss": 0.0038, "step": 4411 }, { "epoch": 4.029223744292238, "grad_norm": 0.055785633623600006, "learning_rate": 6.635210553018773e-06, "loss": 0.0003, "step": 4412 }, { "epoch": 4.03013698630137, "grad_norm": 0.5372115969657898, "learning_rate": 6.6341958396752926e-06, "loss": 0.0041, "step": 4413 }, { "epoch": 4.0310502283105025, "grad_norm": 0.0744357630610466, "learning_rate": 6.633181126331811e-06, "loss": 0.0004, "step": 4414 }, { "epoch": 4.031963470319635, "grad_norm": 21.146163940429688, "learning_rate": 6.632166412988331e-06, "loss": 0.143, "step": 4415 }, { "epoch": 4.032876712328767, "grad_norm": 12.556800842285156, "learning_rate": 6.631151699644851e-06, "loss": 0.0961, "step": 4416 }, { "epoch": 4.0337899543379, "grad_norm": 2.647804021835327, "learning_rate": 6.630136986301371e-06, "loss": 0.0155, "step": 4417 }, { "epoch": 4.034703196347032, "grad_norm": 0.22047215700149536, "learning_rate": 6.629122272957889e-06, "loss": 0.0014, "step": 4418 }, { "epoch": 4.035616438356165, "grad_norm": 4.983941555023193, "learning_rate": 6.62810755961441e-06, "loss": 0.0329, "step": 4419 }, { "epoch": 4.036529680365296, "grad_norm": 2.025766611099243, "learning_rate": 6.6270928462709296e-06, "loss": 0.016, "step": 4420 }, { "epoch": 4.037442922374429, "grad_norm": 3.237311363220215, "learning_rate": 6.626078132927448e-06, "loss": 0.0189, "step": 4421 }, { "epoch": 4.038356164383561, "grad_norm": 14.626745223999023, "learning_rate": 6.625063419583968e-06, "loss": 0.1313, "step": 4422 }, { "epoch": 4.039269406392694, "grad_norm": 0.060877878218889236, "learning_rate": 6.624048706240488e-06, "loss": 0.0004, "step": 4423 }, { "epoch": 4.040182648401826, "grad_norm": 3.693392753601074, "learning_rate": 6.623033992897007e-06, "loss": 0.0157, "step": 4424 }, { "epoch": 4.041095890410959, "grad_norm": 14.414261817932129, "learning_rate": 6.622019279553526e-06, "loss": 0.0976, "step": 4425 }, { "epoch": 4.042009132420091, "grad_norm": 7.9362664222717285, "learning_rate": 6.621004566210046e-06, "loss": 0.045, "step": 4426 }, { "epoch": 4.0429223744292235, "grad_norm": 93.67622375488281, "learning_rate": 6.6199898528665666e-06, "loss": 0.5532, "step": 4427 }, { "epoch": 4.043835616438356, "grad_norm": 2.70881986618042, "learning_rate": 6.618975139523085e-06, "loss": 0.0157, "step": 4428 }, { "epoch": 4.044748858447488, "grad_norm": 9.071416854858398, "learning_rate": 6.617960426179605e-06, "loss": 0.0764, "step": 4429 }, { "epoch": 4.045662100456621, "grad_norm": 0.5700066685676575, "learning_rate": 6.616945712836125e-06, "loss": 0.0022, "step": 4430 }, { "epoch": 4.046575342465753, "grad_norm": 122.59686279296875, "learning_rate": 6.615930999492644e-06, "loss": 1.3947, "step": 4431 }, { "epoch": 4.047488584474886, "grad_norm": 4.401022911071777, "learning_rate": 6.614916286149163e-06, "loss": 0.0321, "step": 4432 }, { "epoch": 4.048401826484018, "grad_norm": 2.3718507289886475, "learning_rate": 6.613901572805683e-06, "loss": 0.0097, "step": 4433 }, { "epoch": 4.049315068493151, "grad_norm": 1.2217247486114502, "learning_rate": 6.612886859462202e-06, "loss": 0.0075, "step": 4434 }, { "epoch": 4.050228310502283, "grad_norm": 0.09414996951818466, "learning_rate": 6.6118721461187215e-06, "loss": 0.0005, "step": 4435 }, { "epoch": 4.051141552511416, "grad_norm": 4.95656156539917, "learning_rate": 6.610857432775242e-06, "loss": 0.0281, "step": 4436 }, { "epoch": 4.052054794520548, "grad_norm": 0.7478557825088501, "learning_rate": 6.609842719431762e-06, "loss": 0.0046, "step": 4437 }, { "epoch": 4.0529680365296805, "grad_norm": 2.0112428665161133, "learning_rate": 6.608828006088281e-06, "loss": 0.0129, "step": 4438 }, { "epoch": 4.053881278538813, "grad_norm": 0.22849629819393158, "learning_rate": 6.6078132927448e-06, "loss": 0.0019, "step": 4439 }, { "epoch": 4.054794520547945, "grad_norm": 0.214207723736763, "learning_rate": 6.60679857940132e-06, "loss": 0.0012, "step": 4440 }, { "epoch": 4.055707762557078, "grad_norm": 1.187690258026123, "learning_rate": 6.605783866057839e-06, "loss": 0.0048, "step": 4441 }, { "epoch": 4.05662100456621, "grad_norm": 3.5993077754974365, "learning_rate": 6.6047691527143585e-06, "loss": 0.0181, "step": 4442 }, { "epoch": 4.057534246575343, "grad_norm": 0.13051436841487885, "learning_rate": 6.603754439370878e-06, "loss": 0.0006, "step": 4443 }, { "epoch": 4.058447488584475, "grad_norm": 0.42924001812934875, "learning_rate": 6.602739726027397e-06, "loss": 0.0017, "step": 4444 }, { "epoch": 4.059360730593608, "grad_norm": 24.089256286621094, "learning_rate": 6.601725012683917e-06, "loss": 0.1925, "step": 4445 }, { "epoch": 4.06027397260274, "grad_norm": 3.631683826446533, "learning_rate": 6.600710299340437e-06, "loss": 0.0174, "step": 4446 }, { "epoch": 4.061187214611872, "grad_norm": 0.7793925404548645, "learning_rate": 6.599695585996957e-06, "loss": 0.0034, "step": 4447 }, { "epoch": 4.062100456621004, "grad_norm": 0.940912127494812, "learning_rate": 6.598680872653476e-06, "loss": 0.005, "step": 4448 }, { "epoch": 4.063013698630137, "grad_norm": 7.218814849853516, "learning_rate": 6.5976661593099955e-06, "loss": 0.0277, "step": 4449 }, { "epoch": 4.063926940639269, "grad_norm": 2.3616597652435303, "learning_rate": 6.596651445966515e-06, "loss": 0.02, "step": 4450 }, { "epoch": 4.0648401826484015, "grad_norm": 8.690581321716309, "learning_rate": 6.595636732623034e-06, "loss": 0.041, "step": 4451 }, { "epoch": 4.065753424657534, "grad_norm": 33.55598449707031, "learning_rate": 6.594622019279554e-06, "loss": 0.2544, "step": 4452 }, { "epoch": 4.066666666666666, "grad_norm": 44.04161834716797, "learning_rate": 6.593607305936074e-06, "loss": 0.3168, "step": 4453 }, { "epoch": 4.067579908675799, "grad_norm": 58.22500991821289, "learning_rate": 6.592592592592592e-06, "loss": 0.1476, "step": 4454 }, { "epoch": 4.068493150684931, "grad_norm": 51.63316345214844, "learning_rate": 6.591577879249113e-06, "loss": 0.3639, "step": 4455 }, { "epoch": 4.069406392694064, "grad_norm": 6.097972869873047, "learning_rate": 6.5905631659056325e-06, "loss": 0.0375, "step": 4456 }, { "epoch": 4.070319634703196, "grad_norm": 111.25652313232422, "learning_rate": 6.589548452562152e-06, "loss": 1.3832, "step": 4457 }, { "epoch": 4.071232876712329, "grad_norm": 69.38522338867188, "learning_rate": 6.588533739218671e-06, "loss": 0.5529, "step": 4458 }, { "epoch": 4.072146118721461, "grad_norm": 26.378572463989258, "learning_rate": 6.587519025875191e-06, "loss": 0.192, "step": 4459 }, { "epoch": 4.073059360730594, "grad_norm": 1.7826552391052246, "learning_rate": 6.5865043125317105e-06, "loss": 0.0113, "step": 4460 }, { "epoch": 4.073972602739726, "grad_norm": 0.7407390475273132, "learning_rate": 6.585489599188229e-06, "loss": 0.0038, "step": 4461 }, { "epoch": 4.0748858447488585, "grad_norm": 2.351331949234009, "learning_rate": 6.584474885844749e-06, "loss": 0.0143, "step": 4462 }, { "epoch": 4.075799086757991, "grad_norm": 49.20182800292969, "learning_rate": 6.5834601725012695e-06, "loss": 0.2738, "step": 4463 }, { "epoch": 4.076712328767123, "grad_norm": 85.727783203125, "learning_rate": 6.582445459157788e-06, "loss": 0.7237, "step": 4464 }, { "epoch": 4.077625570776256, "grad_norm": 42.229427337646484, "learning_rate": 6.581430745814308e-06, "loss": 0.3003, "step": 4465 }, { "epoch": 4.078538812785388, "grad_norm": 6.4371256828308105, "learning_rate": 6.580416032470828e-06, "loss": 0.035, "step": 4466 }, { "epoch": 4.079452054794521, "grad_norm": 0.3532881736755371, "learning_rate": 6.5794013191273475e-06, "loss": 0.0022, "step": 4467 }, { "epoch": 4.080365296803653, "grad_norm": 1.8065247535705566, "learning_rate": 6.578386605783866e-06, "loss": 0.0043, "step": 4468 }, { "epoch": 4.081278538812786, "grad_norm": 11.764923095703125, "learning_rate": 6.577371892440386e-06, "loss": 0.072, "step": 4469 }, { "epoch": 4.082191780821918, "grad_norm": 17.777042388916016, "learning_rate": 6.576357179096906e-06, "loss": 0.0959, "step": 4470 }, { "epoch": 4.083105022831051, "grad_norm": 22.62656593322754, "learning_rate": 6.5753424657534245e-06, "loss": 0.166, "step": 4471 }, { "epoch": 4.084018264840183, "grad_norm": 4.5091118812561035, "learning_rate": 6.574327752409945e-06, "loss": 0.0239, "step": 4472 }, { "epoch": 4.0849315068493155, "grad_norm": 114.6173324584961, "learning_rate": 6.573313039066465e-06, "loss": 1.0486, "step": 4473 }, { "epoch": 4.085844748858447, "grad_norm": 4.7129597663879395, "learning_rate": 6.572298325722984e-06, "loss": 0.0308, "step": 4474 }, { "epoch": 4.0867579908675795, "grad_norm": 0.22879396378993988, "learning_rate": 6.571283612379503e-06, "loss": 0.0021, "step": 4475 }, { "epoch": 4.087671232876712, "grad_norm": 13.375734329223633, "learning_rate": 6.570268899036023e-06, "loss": 0.07, "step": 4476 }, { "epoch": 4.088584474885844, "grad_norm": 16.831693649291992, "learning_rate": 6.569254185692543e-06, "loss": 0.0993, "step": 4477 }, { "epoch": 4.089497716894977, "grad_norm": 0.7352384924888611, "learning_rate": 6.5682394723490615e-06, "loss": 0.0041, "step": 4478 }, { "epoch": 4.090410958904109, "grad_norm": 4.1885271072387695, "learning_rate": 6.567224759005581e-06, "loss": 0.0269, "step": 4479 }, { "epoch": 4.091324200913242, "grad_norm": 25.88628387451172, "learning_rate": 6.566210045662102e-06, "loss": 0.1003, "step": 4480 }, { "epoch": 4.092237442922374, "grad_norm": 92.63113403320312, "learning_rate": 6.56519533231862e-06, "loss": 0.695, "step": 4481 }, { "epoch": 4.093150684931507, "grad_norm": 17.943395614624023, "learning_rate": 6.56418061897514e-06, "loss": 0.1287, "step": 4482 }, { "epoch": 4.094063926940639, "grad_norm": 1.9574899673461914, "learning_rate": 6.56316590563166e-06, "loss": 0.0095, "step": 4483 }, { "epoch": 4.094977168949772, "grad_norm": 0.08383053541183472, "learning_rate": 6.562151192288179e-06, "loss": 0.0004, "step": 4484 }, { "epoch": 4.095890410958904, "grad_norm": 4.708549976348877, "learning_rate": 6.5611364789446985e-06, "loss": 0.0291, "step": 4485 }, { "epoch": 4.0968036529680365, "grad_norm": 1.8516619205474854, "learning_rate": 6.560121765601218e-06, "loss": 0.0078, "step": 4486 }, { "epoch": 4.097716894977169, "grad_norm": 0.3485392928123474, "learning_rate": 6.559107052257738e-06, "loss": 0.0024, "step": 4487 }, { "epoch": 4.098630136986301, "grad_norm": 17.09836196899414, "learning_rate": 6.558092338914257e-06, "loss": 0.1212, "step": 4488 }, { "epoch": 4.099543378995434, "grad_norm": 1.129447340965271, "learning_rate": 6.5570776255707765e-06, "loss": 0.0054, "step": 4489 }, { "epoch": 4.100456621004566, "grad_norm": 0.22644303739070892, "learning_rate": 6.556062912227297e-06, "loss": 0.0017, "step": 4490 }, { "epoch": 4.101369863013699, "grad_norm": 1.2848902940750122, "learning_rate": 6.555048198883816e-06, "loss": 0.0064, "step": 4491 }, { "epoch": 4.102283105022831, "grad_norm": 29.757102966308594, "learning_rate": 6.5540334855403355e-06, "loss": 0.1199, "step": 4492 }, { "epoch": 4.103196347031964, "grad_norm": 54.04579544067383, "learning_rate": 6.553018772196855e-06, "loss": 0.3769, "step": 4493 }, { "epoch": 4.104109589041096, "grad_norm": 1.91537606716156, "learning_rate": 6.552004058853374e-06, "loss": 0.0127, "step": 4494 }, { "epoch": 4.105022831050229, "grad_norm": 0.10286834836006165, "learning_rate": 6.550989345509894e-06, "loss": 0.0006, "step": 4495 }, { "epoch": 4.105936073059361, "grad_norm": 10.245086669921875, "learning_rate": 6.5499746321664134e-06, "loss": 0.0505, "step": 4496 }, { "epoch": 4.1068493150684935, "grad_norm": 0.24824093282222748, "learning_rate": 6.548959918822934e-06, "loss": 0.0017, "step": 4497 }, { "epoch": 4.107762557077626, "grad_norm": 1.1911224126815796, "learning_rate": 6.547945205479452e-06, "loss": 0.0092, "step": 4498 }, { "epoch": 4.108675799086758, "grad_norm": 41.75881576538086, "learning_rate": 6.5469304921359725e-06, "loss": 0.707, "step": 4499 }, { "epoch": 4.109589041095891, "grad_norm": 0.056792039424180984, "learning_rate": 6.545915778792492e-06, "loss": 0.0003, "step": 4500 }, { "epoch": 4.110502283105022, "grad_norm": 0.020864389836788177, "learning_rate": 6.544901065449011e-06, "loss": 0.0001, "step": 4501 }, { "epoch": 4.111415525114155, "grad_norm": 176.40065002441406, "learning_rate": 6.543886352105531e-06, "loss": 0.938, "step": 4502 }, { "epoch": 4.112328767123287, "grad_norm": 2.6433112621307373, "learning_rate": 6.5428716387620504e-06, "loss": 0.0192, "step": 4503 }, { "epoch": 4.11324200913242, "grad_norm": 1.3186613321304321, "learning_rate": 6.541856925418569e-06, "loss": 0.0054, "step": 4504 }, { "epoch": 4.114155251141552, "grad_norm": 3.7719435691833496, "learning_rate": 6.540842212075089e-06, "loss": 0.0232, "step": 4505 }, { "epoch": 4.115068493150685, "grad_norm": 3.663876533508301, "learning_rate": 6.539827498731609e-06, "loss": 0.0238, "step": 4506 }, { "epoch": 4.115981735159817, "grad_norm": 34.87196350097656, "learning_rate": 6.538812785388129e-06, "loss": 0.146, "step": 4507 }, { "epoch": 4.11689497716895, "grad_norm": 6.4668989181518555, "learning_rate": 6.537798072044648e-06, "loss": 0.0248, "step": 4508 }, { "epoch": 4.117808219178082, "grad_norm": 30.775405883789062, "learning_rate": 6.536783358701168e-06, "loss": 0.2058, "step": 4509 }, { "epoch": 4.1187214611872145, "grad_norm": 2.799206495285034, "learning_rate": 6.5357686453576874e-06, "loss": 0.0198, "step": 4510 }, { "epoch": 4.119634703196347, "grad_norm": 0.0656730905175209, "learning_rate": 6.534753932014206e-06, "loss": 0.0006, "step": 4511 }, { "epoch": 4.120547945205479, "grad_norm": 1.5165289640426636, "learning_rate": 6.533739218670726e-06, "loss": 0.0058, "step": 4512 }, { "epoch": 4.121461187214612, "grad_norm": 8.244516372680664, "learning_rate": 6.532724505327246e-06, "loss": 0.0641, "step": 4513 }, { "epoch": 4.122374429223744, "grad_norm": 9.126114845275879, "learning_rate": 6.5317097919837645e-06, "loss": 0.0535, "step": 4514 }, { "epoch": 4.123287671232877, "grad_norm": 1.8020493984222412, "learning_rate": 6.530695078640284e-06, "loss": 0.0119, "step": 4515 }, { "epoch": 4.124200913242009, "grad_norm": 1.7712591886520386, "learning_rate": 6.529680365296805e-06, "loss": 0.0114, "step": 4516 }, { "epoch": 4.125114155251142, "grad_norm": 84.93159484863281, "learning_rate": 6.5286656519533244e-06, "loss": 0.959, "step": 4517 }, { "epoch": 4.126027397260274, "grad_norm": 2.7848281860351562, "learning_rate": 6.527650938609843e-06, "loss": 0.0189, "step": 4518 }, { "epoch": 4.126940639269407, "grad_norm": 5.631885051727295, "learning_rate": 6.526636225266363e-06, "loss": 0.0447, "step": 4519 }, { "epoch": 4.127853881278539, "grad_norm": 0.09541705995798111, "learning_rate": 6.525621511922883e-06, "loss": 0.0006, "step": 4520 }, { "epoch": 4.1287671232876715, "grad_norm": 7.311749458312988, "learning_rate": 6.5246067985794015e-06, "loss": 0.0464, "step": 4521 }, { "epoch": 4.129680365296804, "grad_norm": 2.9948019981384277, "learning_rate": 6.523592085235921e-06, "loss": 0.0142, "step": 4522 }, { "epoch": 4.130593607305936, "grad_norm": 1.449087142944336, "learning_rate": 6.522577371892441e-06, "loss": 0.0082, "step": 4523 }, { "epoch": 4.131506849315069, "grad_norm": 1.7073098421096802, "learning_rate": 6.52156265854896e-06, "loss": 0.0058, "step": 4524 }, { "epoch": 4.132420091324201, "grad_norm": 6.777945518493652, "learning_rate": 6.5205479452054794e-06, "loss": 0.046, "step": 4525 }, { "epoch": 4.133333333333334, "grad_norm": 0.613930881023407, "learning_rate": 6.519533231862e-06, "loss": 0.0036, "step": 4526 }, { "epoch": 4.134246575342466, "grad_norm": 0.21248769760131836, "learning_rate": 6.51851851851852e-06, "loss": 0.0017, "step": 4527 }, { "epoch": 4.135159817351598, "grad_norm": 8.26872444152832, "learning_rate": 6.5175038051750385e-06, "loss": 0.042, "step": 4528 }, { "epoch": 4.13607305936073, "grad_norm": 0.47980737686157227, "learning_rate": 6.516489091831558e-06, "loss": 0.0029, "step": 4529 }, { "epoch": 4.136986301369863, "grad_norm": 0.8145886659622192, "learning_rate": 6.515474378488078e-06, "loss": 0.004, "step": 4530 }, { "epoch": 4.137899543378995, "grad_norm": 3.27597975730896, "learning_rate": 6.514459665144597e-06, "loss": 0.0123, "step": 4531 }, { "epoch": 4.138812785388128, "grad_norm": 10.977805137634277, "learning_rate": 6.5134449518011164e-06, "loss": 0.0656, "step": 4532 }, { "epoch": 4.13972602739726, "grad_norm": 0.4532575309276581, "learning_rate": 6.512430238457637e-06, "loss": 0.0028, "step": 4533 }, { "epoch": 4.1406392694063925, "grad_norm": 12.315839767456055, "learning_rate": 6.511415525114155e-06, "loss": 0.0895, "step": 4534 }, { "epoch": 4.141552511415525, "grad_norm": 0.26655009388923645, "learning_rate": 6.5104008117706755e-06, "loss": 0.0013, "step": 4535 }, { "epoch": 4.142465753424657, "grad_norm": 14.536312103271484, "learning_rate": 6.509386098427195e-06, "loss": 0.1224, "step": 4536 }, { "epoch": 4.14337899543379, "grad_norm": 1.0816396474838257, "learning_rate": 6.508371385083715e-06, "loss": 0.005, "step": 4537 }, { "epoch": 4.144292237442922, "grad_norm": 4.336440086364746, "learning_rate": 6.507356671740234e-06, "loss": 0.0314, "step": 4538 }, { "epoch": 4.145205479452055, "grad_norm": 3.815211296081543, "learning_rate": 6.5063419583967534e-06, "loss": 0.0221, "step": 4539 }, { "epoch": 4.146118721461187, "grad_norm": 7.2695159912109375, "learning_rate": 6.505327245053273e-06, "loss": 0.0553, "step": 4540 }, { "epoch": 4.14703196347032, "grad_norm": 2.6263182163238525, "learning_rate": 6.504312531709792e-06, "loss": 0.019, "step": 4541 }, { "epoch": 4.147945205479452, "grad_norm": 43.312782287597656, "learning_rate": 6.503297818366312e-06, "loss": 0.4997, "step": 4542 }, { "epoch": 4.148858447488585, "grad_norm": 0.15541282296180725, "learning_rate": 6.502283105022832e-06, "loss": 0.0008, "step": 4543 }, { "epoch": 4.149771689497717, "grad_norm": 109.60707092285156, "learning_rate": 6.501268391679351e-06, "loss": 0.9714, "step": 4544 }, { "epoch": 4.1506849315068495, "grad_norm": 0.4969988167285919, "learning_rate": 6.500253678335871e-06, "loss": 0.0011, "step": 4545 }, { "epoch": 4.151598173515982, "grad_norm": 0.4407709836959839, "learning_rate": 6.4992389649923904e-06, "loss": 0.0028, "step": 4546 }, { "epoch": 4.152511415525114, "grad_norm": 2.882850408554077, "learning_rate": 6.49822425164891e-06, "loss": 0.015, "step": 4547 }, { "epoch": 4.153424657534247, "grad_norm": 190.9525604248047, "learning_rate": 6.497209538305429e-06, "loss": 1.247, "step": 4548 }, { "epoch": 4.154337899543379, "grad_norm": 0.7349593639373779, "learning_rate": 6.496194824961949e-06, "loss": 0.0034, "step": 4549 }, { "epoch": 4.155251141552512, "grad_norm": 3.2087950706481934, "learning_rate": 6.495180111618468e-06, "loss": 0.0225, "step": 4550 }, { "epoch": 4.156164383561644, "grad_norm": 4.715561389923096, "learning_rate": 6.494165398274987e-06, "loss": 0.031, "step": 4551 }, { "epoch": 4.157077625570777, "grad_norm": 0.40037766098976135, "learning_rate": 6.493150684931508e-06, "loss": 0.0018, "step": 4552 }, { "epoch": 4.157990867579909, "grad_norm": 18.63614273071289, "learning_rate": 6.4921359715880274e-06, "loss": 0.0829, "step": 4553 }, { "epoch": 4.1589041095890416, "grad_norm": 3.1505424976348877, "learning_rate": 6.491121258244546e-06, "loss": 0.0191, "step": 4554 }, { "epoch": 4.159817351598173, "grad_norm": 4.266785621643066, "learning_rate": 6.490106544901066e-06, "loss": 0.0204, "step": 4555 }, { "epoch": 4.160730593607306, "grad_norm": 5.469350337982178, "learning_rate": 6.489091831557586e-06, "loss": 0.0345, "step": 4556 }, { "epoch": 4.161643835616438, "grad_norm": 90.80764770507812, "learning_rate": 6.488077118214105e-06, "loss": 0.8273, "step": 4557 }, { "epoch": 4.1625570776255705, "grad_norm": 16.965885162353516, "learning_rate": 6.487062404870624e-06, "loss": 0.0504, "step": 4558 }, { "epoch": 4.163470319634703, "grad_norm": 64.44674682617188, "learning_rate": 6.486047691527144e-06, "loss": 0.3756, "step": 4559 }, { "epoch": 4.164383561643835, "grad_norm": 6.027727127075195, "learning_rate": 6.485032978183664e-06, "loss": 0.025, "step": 4560 }, { "epoch": 4.165296803652968, "grad_norm": 0.09476053714752197, "learning_rate": 6.484018264840182e-06, "loss": 0.0005, "step": 4561 }, { "epoch": 4.1662100456621, "grad_norm": 32.83087921142578, "learning_rate": 6.483003551496703e-06, "loss": 0.1632, "step": 4562 }, { "epoch": 4.167123287671233, "grad_norm": 1.6239656209945679, "learning_rate": 6.481988838153223e-06, "loss": 0.0103, "step": 4563 }, { "epoch": 4.168036529680365, "grad_norm": 29.57720375061035, "learning_rate": 6.4809741248097415e-06, "loss": 0.1262, "step": 4564 }, { "epoch": 4.168949771689498, "grad_norm": 4.223878860473633, "learning_rate": 6.479959411466261e-06, "loss": 0.0212, "step": 4565 }, { "epoch": 4.16986301369863, "grad_norm": 1.2690258026123047, "learning_rate": 6.478944698122781e-06, "loss": 0.0086, "step": 4566 }, { "epoch": 4.170776255707763, "grad_norm": 3.658879041671753, "learning_rate": 6.4779299847793006e-06, "loss": 0.0155, "step": 4567 }, { "epoch": 4.171689497716895, "grad_norm": 1.8767614364624023, "learning_rate": 6.476915271435819e-06, "loss": 0.0118, "step": 4568 }, { "epoch": 4.1726027397260275, "grad_norm": 0.5956113934516907, "learning_rate": 6.475900558092339e-06, "loss": 0.0034, "step": 4569 }, { "epoch": 4.17351598173516, "grad_norm": 2.4050357341766357, "learning_rate": 6.47488584474886e-06, "loss": 0.0097, "step": 4570 }, { "epoch": 4.174429223744292, "grad_norm": 5.011362552642822, "learning_rate": 6.4738711314053785e-06, "loss": 0.039, "step": 4571 }, { "epoch": 4.175342465753425, "grad_norm": 1.3045700788497925, "learning_rate": 6.472856418061898e-06, "loss": 0.0092, "step": 4572 }, { "epoch": 4.176255707762557, "grad_norm": 0.18155242502689362, "learning_rate": 6.471841704718418e-06, "loss": 0.0013, "step": 4573 }, { "epoch": 4.17716894977169, "grad_norm": 0.138820618391037, "learning_rate": 6.470826991374937e-06, "loss": 0.0005, "step": 4574 }, { "epoch": 4.178082191780822, "grad_norm": 1.5007187128067017, "learning_rate": 6.469812278031456e-06, "loss": 0.0109, "step": 4575 }, { "epoch": 4.178995433789955, "grad_norm": 92.48696899414062, "learning_rate": 6.468797564687976e-06, "loss": 0.5411, "step": 4576 }, { "epoch": 4.179908675799087, "grad_norm": 2.6997792720794678, "learning_rate": 6.467782851344497e-06, "loss": 0.0154, "step": 4577 }, { "epoch": 4.1808219178082195, "grad_norm": 0.06668905168771744, "learning_rate": 6.466768138001015e-06, "loss": 0.0005, "step": 4578 }, { "epoch": 4.181735159817352, "grad_norm": 4.979199409484863, "learning_rate": 6.465753424657535e-06, "loss": 0.0258, "step": 4579 }, { "epoch": 4.182648401826484, "grad_norm": 2.2933619022369385, "learning_rate": 6.464738711314055e-06, "loss": 0.0137, "step": 4580 }, { "epoch": 4.183561643835616, "grad_norm": 3.9736225605010986, "learning_rate": 6.463723997970574e-06, "loss": 0.0251, "step": 4581 }, { "epoch": 4.1844748858447485, "grad_norm": 31.588674545288086, "learning_rate": 6.462709284627093e-06, "loss": 0.1895, "step": 4582 }, { "epoch": 4.185388127853881, "grad_norm": 4.027328014373779, "learning_rate": 6.461694571283613e-06, "loss": 0.0272, "step": 4583 }, { "epoch": 4.186301369863013, "grad_norm": 0.7267832159996033, "learning_rate": 6.460679857940132e-06, "loss": 0.0059, "step": 4584 }, { "epoch": 4.187214611872146, "grad_norm": 44.130374908447266, "learning_rate": 6.459665144596652e-06, "loss": 0.3601, "step": 4585 }, { "epoch": 4.188127853881278, "grad_norm": 0.7941380143165588, "learning_rate": 6.458650431253171e-06, "loss": 0.0056, "step": 4586 }, { "epoch": 4.189041095890411, "grad_norm": 1.0402462482452393, "learning_rate": 6.457635717909692e-06, "loss": 0.0046, "step": 4587 }, { "epoch": 4.189954337899543, "grad_norm": 0.9217687845230103, "learning_rate": 6.456621004566211e-06, "loss": 0.0063, "step": 4588 }, { "epoch": 4.190867579908676, "grad_norm": 0.1894616037607193, "learning_rate": 6.45560629122273e-06, "loss": 0.0011, "step": 4589 }, { "epoch": 4.191780821917808, "grad_norm": 3.954831123352051, "learning_rate": 6.45459157787925e-06, "loss": 0.0198, "step": 4590 }, { "epoch": 4.1926940639269406, "grad_norm": 65.34929656982422, "learning_rate": 6.453576864535769e-06, "loss": 0.4029, "step": 4591 }, { "epoch": 4.193607305936073, "grad_norm": 10.30935001373291, "learning_rate": 6.452562151192289e-06, "loss": 0.0482, "step": 4592 }, { "epoch": 4.1945205479452055, "grad_norm": 14.893054008483887, "learning_rate": 6.451547437848808e-06, "loss": 0.0707, "step": 4593 }, { "epoch": 4.195433789954338, "grad_norm": 32.59239196777344, "learning_rate": 6.450532724505327e-06, "loss": 0.0745, "step": 4594 }, { "epoch": 4.19634703196347, "grad_norm": 5.5294365882873535, "learning_rate": 6.449518011161847e-06, "loss": 0.0197, "step": 4595 }, { "epoch": 4.197260273972603, "grad_norm": 1.8021944761276245, "learning_rate": 6.448503297818367e-06, "loss": 0.0084, "step": 4596 }, { "epoch": 4.198173515981735, "grad_norm": 84.03450775146484, "learning_rate": 6.447488584474887e-06, "loss": 0.358, "step": 4597 }, { "epoch": 4.199086757990868, "grad_norm": 3.3828723430633545, "learning_rate": 6.446473871131406e-06, "loss": 0.0153, "step": 4598 }, { "epoch": 4.2, "grad_norm": 12.424534797668457, "learning_rate": 6.445459157787926e-06, "loss": 0.0656, "step": 4599 }, { "epoch": 4.200913242009133, "grad_norm": 0.42391839623451233, "learning_rate": 6.444444444444445e-06, "loss": 0.0027, "step": 4600 }, { "epoch": 4.201826484018265, "grad_norm": 0.16203348338603973, "learning_rate": 6.443429731100964e-06, "loss": 0.0012, "step": 4601 }, { "epoch": 4.2027397260273975, "grad_norm": 3.7471747398376465, "learning_rate": 6.442415017757484e-06, "loss": 0.0196, "step": 4602 }, { "epoch": 4.20365296803653, "grad_norm": 0.7018501162528992, "learning_rate": 6.4414003044140036e-06, "loss": 0.0047, "step": 4603 }, { "epoch": 4.2045662100456624, "grad_norm": 37.413265228271484, "learning_rate": 6.440385591070522e-06, "loss": 0.4816, "step": 4604 }, { "epoch": 4.205479452054795, "grad_norm": 21.594148635864258, "learning_rate": 6.439370877727042e-06, "loss": 0.1747, "step": 4605 }, { "epoch": 4.206392694063927, "grad_norm": 0.3824171721935272, "learning_rate": 6.438356164383563e-06, "loss": 0.0022, "step": 4606 }, { "epoch": 4.207305936073059, "grad_norm": 88.789306640625, "learning_rate": 6.437341451040082e-06, "loss": 0.8094, "step": 4607 }, { "epoch": 4.208219178082191, "grad_norm": 52.14889144897461, "learning_rate": 6.436326737696601e-06, "loss": 0.2861, "step": 4608 }, { "epoch": 4.209132420091324, "grad_norm": 9.278239250183105, "learning_rate": 6.435312024353121e-06, "loss": 0.0447, "step": 4609 }, { "epoch": 4.210045662100456, "grad_norm": 3.0338127613067627, "learning_rate": 6.4342973110096406e-06, "loss": 0.0238, "step": 4610 }, { "epoch": 4.210958904109589, "grad_norm": 19.9429931640625, "learning_rate": 6.433282597666159e-06, "loss": 0.0851, "step": 4611 }, { "epoch": 4.211872146118721, "grad_norm": 1.6239213943481445, "learning_rate": 6.432267884322679e-06, "loss": 0.0108, "step": 4612 }, { "epoch": 4.212785388127854, "grad_norm": 100.91063690185547, "learning_rate": 6.4312531709792e-06, "loss": 1.31, "step": 4613 }, { "epoch": 4.213698630136986, "grad_norm": 5.04923677444458, "learning_rate": 6.430238457635718e-06, "loss": 0.0274, "step": 4614 }, { "epoch": 4.2146118721461185, "grad_norm": 2.4031786918640137, "learning_rate": 6.429223744292238e-06, "loss": 0.0139, "step": 4615 }, { "epoch": 4.215525114155251, "grad_norm": 30.278398513793945, "learning_rate": 6.428209030948758e-06, "loss": 0.2061, "step": 4616 }, { "epoch": 4.2164383561643834, "grad_norm": 5.974484443664551, "learning_rate": 6.4271943176052776e-06, "loss": 0.0467, "step": 4617 }, { "epoch": 4.217351598173516, "grad_norm": 0.8623780012130737, "learning_rate": 6.426179604261796e-06, "loss": 0.006, "step": 4618 }, { "epoch": 4.218264840182648, "grad_norm": 2.9263017177581787, "learning_rate": 6.425164890918316e-06, "loss": 0.0229, "step": 4619 }, { "epoch": 4.219178082191781, "grad_norm": 4.735133171081543, "learning_rate": 6.424150177574836e-06, "loss": 0.0313, "step": 4620 }, { "epoch": 4.220091324200913, "grad_norm": 16.319480895996094, "learning_rate": 6.423135464231355e-06, "loss": 0.1555, "step": 4621 }, { "epoch": 4.221004566210046, "grad_norm": 3.3114147186279297, "learning_rate": 6.422120750887874e-06, "loss": 0.0161, "step": 4622 }, { "epoch": 4.221917808219178, "grad_norm": 2.189638376235962, "learning_rate": 6.421106037544395e-06, "loss": 0.0132, "step": 4623 }, { "epoch": 4.222831050228311, "grad_norm": 0.35084930062294006, "learning_rate": 6.420091324200914e-06, "loss": 0.0023, "step": 4624 }, { "epoch": 4.223744292237443, "grad_norm": 2.17494797706604, "learning_rate": 6.419076610857433e-06, "loss": 0.0075, "step": 4625 }, { "epoch": 4.2246575342465755, "grad_norm": 2.0014724731445312, "learning_rate": 6.418061897513953e-06, "loss": 0.0096, "step": 4626 }, { "epoch": 4.225570776255708, "grad_norm": 73.44737243652344, "learning_rate": 6.417047184170473e-06, "loss": 0.5196, "step": 4627 }, { "epoch": 4.22648401826484, "grad_norm": 4.610448360443115, "learning_rate": 6.416032470826992e-06, "loss": 0.0353, "step": 4628 }, { "epoch": 4.227397260273973, "grad_norm": 9.60859489440918, "learning_rate": 6.415017757483511e-06, "loss": 0.0566, "step": 4629 }, { "epoch": 4.228310502283105, "grad_norm": 0.11934881657361984, "learning_rate": 6.414003044140031e-06, "loss": 0.0011, "step": 4630 }, { "epoch": 4.229223744292238, "grad_norm": 0.4374723732471466, "learning_rate": 6.41298833079655e-06, "loss": 0.0033, "step": 4631 }, { "epoch": 4.23013698630137, "grad_norm": 1.1698054075241089, "learning_rate": 6.41197361745307e-06, "loss": 0.0101, "step": 4632 }, { "epoch": 4.231050228310503, "grad_norm": 0.398821622133255, "learning_rate": 6.41095890410959e-06, "loss": 0.0022, "step": 4633 }, { "epoch": 4.231963470319634, "grad_norm": 1.0952666997909546, "learning_rate": 6.409944190766109e-06, "loss": 0.0083, "step": 4634 }, { "epoch": 4.232876712328767, "grad_norm": 0.28027579188346863, "learning_rate": 6.408929477422629e-06, "loss": 0.0024, "step": 4635 }, { "epoch": 4.233789954337899, "grad_norm": 49.271644592285156, "learning_rate": 6.407914764079148e-06, "loss": 0.3329, "step": 4636 }, { "epoch": 4.234703196347032, "grad_norm": 5.99301290512085, "learning_rate": 6.406900050735668e-06, "loss": 0.0317, "step": 4637 }, { "epoch": 4.235616438356164, "grad_norm": 0.4820566773414612, "learning_rate": 6.405885337392187e-06, "loss": 0.0023, "step": 4638 }, { "epoch": 4.2365296803652965, "grad_norm": 124.92041778564453, "learning_rate": 6.4048706240487065e-06, "loss": 0.7946, "step": 4639 }, { "epoch": 4.237442922374429, "grad_norm": 0.8789742588996887, "learning_rate": 6.403855910705227e-06, "loss": 0.005, "step": 4640 }, { "epoch": 4.238356164383561, "grad_norm": 22.770198822021484, "learning_rate": 6.402841197361745e-06, "loss": 0.0916, "step": 4641 }, { "epoch": 4.239269406392694, "grad_norm": 4.909086227416992, "learning_rate": 6.401826484018266e-06, "loss": 0.0341, "step": 4642 }, { "epoch": 4.240182648401826, "grad_norm": 18.387290954589844, "learning_rate": 6.400811770674785e-06, "loss": 0.0676, "step": 4643 }, { "epoch": 4.241095890410959, "grad_norm": 2.8325440883636475, "learning_rate": 6.399797057331304e-06, "loss": 0.0129, "step": 4644 }, { "epoch": 4.242009132420091, "grad_norm": 48.34788513183594, "learning_rate": 6.398782343987824e-06, "loss": 0.3931, "step": 4645 }, { "epoch": 4.242922374429224, "grad_norm": 42.082374572753906, "learning_rate": 6.3977676306443435e-06, "loss": 0.2706, "step": 4646 }, { "epoch": 4.243835616438356, "grad_norm": 11.438403129577637, "learning_rate": 6.396752917300863e-06, "loss": 0.0577, "step": 4647 }, { "epoch": 4.244748858447489, "grad_norm": 0.49134692549705505, "learning_rate": 6.395738203957382e-06, "loss": 0.0025, "step": 4648 }, { "epoch": 4.245662100456621, "grad_norm": 0.7454319596290588, "learning_rate": 6.394723490613902e-06, "loss": 0.0049, "step": 4649 }, { "epoch": 4.2465753424657535, "grad_norm": 5.378226280212402, "learning_rate": 6.393708777270422e-06, "loss": 0.0255, "step": 4650 }, { "epoch": 4.247488584474886, "grad_norm": 0.4163541793823242, "learning_rate": 6.392694063926941e-06, "loss": 0.0026, "step": 4651 }, { "epoch": 4.248401826484018, "grad_norm": 0.2671467065811157, "learning_rate": 6.391679350583461e-06, "loss": 0.0018, "step": 4652 }, { "epoch": 4.249315068493151, "grad_norm": 30.676494598388672, "learning_rate": 6.3906646372399805e-06, "loss": 0.1834, "step": 4653 }, { "epoch": 4.250228310502283, "grad_norm": 0.008289494551718235, "learning_rate": 6.389649923896499e-06, "loss": 0.0, "step": 4654 }, { "epoch": 4.251141552511416, "grad_norm": 4.059659004211426, "learning_rate": 6.388635210553019e-06, "loss": 0.0198, "step": 4655 }, { "epoch": 4.252054794520548, "grad_norm": 3.637294292449951, "learning_rate": 6.387620497209539e-06, "loss": 0.0221, "step": 4656 }, { "epoch": 4.252968036529681, "grad_norm": 1.8571178913116455, "learning_rate": 6.386605783866059e-06, "loss": 0.0089, "step": 4657 }, { "epoch": 4.253881278538813, "grad_norm": 1.3454939126968384, "learning_rate": 6.385591070522577e-06, "loss": 0.0084, "step": 4658 }, { "epoch": 4.254794520547946, "grad_norm": 0.2141198068857193, "learning_rate": 6.384576357179098e-06, "loss": 0.0014, "step": 4659 }, { "epoch": 4.255707762557078, "grad_norm": 5.915118217468262, "learning_rate": 6.3835616438356175e-06, "loss": 0.0356, "step": 4660 }, { "epoch": 4.25662100456621, "grad_norm": 10.993197441101074, "learning_rate": 6.382546930492136e-06, "loss": 0.0501, "step": 4661 }, { "epoch": 4.257534246575342, "grad_norm": 2.611056327819824, "learning_rate": 6.381532217148656e-06, "loss": 0.016, "step": 4662 }, { "epoch": 4.2584474885844745, "grad_norm": 19.874900817871094, "learning_rate": 6.380517503805176e-06, "loss": 0.1182, "step": 4663 }, { "epoch": 4.259360730593607, "grad_norm": 0.3223308324813843, "learning_rate": 6.379502790461695e-06, "loss": 0.0016, "step": 4664 }, { "epoch": 4.260273972602739, "grad_norm": 6.554471969604492, "learning_rate": 6.378488077118214e-06, "loss": 0.0369, "step": 4665 }, { "epoch": 4.261187214611872, "grad_norm": 16.499181747436523, "learning_rate": 6.377473363774734e-06, "loss": 0.0754, "step": 4666 }, { "epoch": 4.262100456621004, "grad_norm": 14.525385856628418, "learning_rate": 6.3764586504312545e-06, "loss": 0.0944, "step": 4667 }, { "epoch": 4.263013698630137, "grad_norm": 15.257816314697266, "learning_rate": 6.375443937087773e-06, "loss": 0.0925, "step": 4668 }, { "epoch": 4.263926940639269, "grad_norm": 26.572275161743164, "learning_rate": 6.374429223744293e-06, "loss": 0.1127, "step": 4669 }, { "epoch": 4.264840182648402, "grad_norm": 83.36658477783203, "learning_rate": 6.373414510400813e-06, "loss": 0.5521, "step": 4670 }, { "epoch": 4.265753424657534, "grad_norm": 1.722554087638855, "learning_rate": 6.372399797057332e-06, "loss": 0.007, "step": 4671 }, { "epoch": 4.266666666666667, "grad_norm": 0.5840069055557251, "learning_rate": 6.371385083713851e-06, "loss": 0.0042, "step": 4672 }, { "epoch": 4.267579908675799, "grad_norm": 0.6160171031951904, "learning_rate": 6.370370370370371e-06, "loss": 0.0036, "step": 4673 }, { "epoch": 4.2684931506849315, "grad_norm": 72.42765808105469, "learning_rate": 6.36935565702689e-06, "loss": 0.5524, "step": 4674 }, { "epoch": 4.269406392694064, "grad_norm": 124.70833587646484, "learning_rate": 6.3683409436834095e-06, "loss": 2.654, "step": 4675 }, { "epoch": 4.270319634703196, "grad_norm": 11.892790794372559, "learning_rate": 6.36732623033993e-06, "loss": 0.0375, "step": 4676 }, { "epoch": 4.271232876712329, "grad_norm": 3.295020818710327, "learning_rate": 6.36631151699645e-06, "loss": 0.0174, "step": 4677 }, { "epoch": 4.272146118721461, "grad_norm": 0.025901278480887413, "learning_rate": 6.365296803652969e-06, "loss": 0.0002, "step": 4678 }, { "epoch": 4.273059360730594, "grad_norm": 1.3936550617218018, "learning_rate": 6.364282090309488e-06, "loss": 0.0096, "step": 4679 }, { "epoch": 4.273972602739726, "grad_norm": 78.37689971923828, "learning_rate": 6.363267376966008e-06, "loss": 0.5319, "step": 4680 }, { "epoch": 4.274885844748859, "grad_norm": 4.493451118469238, "learning_rate": 6.362252663622527e-06, "loss": 0.0318, "step": 4681 }, { "epoch": 4.275799086757991, "grad_norm": 2.3232369422912598, "learning_rate": 6.3612379502790465e-06, "loss": 0.0156, "step": 4682 }, { "epoch": 4.276712328767124, "grad_norm": 12.555160522460938, "learning_rate": 6.360223236935566e-06, "loss": 0.0819, "step": 4683 }, { "epoch": 4.277625570776256, "grad_norm": 3.3442540168762207, "learning_rate": 6.359208523592085e-06, "loss": 0.0186, "step": 4684 }, { "epoch": 4.2785388127853885, "grad_norm": 0.11345211416482925, "learning_rate": 6.358193810248605e-06, "loss": 0.0006, "step": 4685 }, { "epoch": 4.279452054794521, "grad_norm": 1.3720687627792358, "learning_rate": 6.357179096905125e-06, "loss": 0.0094, "step": 4686 }, { "epoch": 4.280365296803653, "grad_norm": 4.81957483291626, "learning_rate": 6.356164383561645e-06, "loss": 0.0282, "step": 4687 }, { "epoch": 4.281278538812785, "grad_norm": 0.8542103171348572, "learning_rate": 6.355149670218164e-06, "loss": 0.0073, "step": 4688 }, { "epoch": 4.282191780821917, "grad_norm": 1.5948693752288818, "learning_rate": 6.3541349568746835e-06, "loss": 0.0093, "step": 4689 }, { "epoch": 4.28310502283105, "grad_norm": 2.655834913253784, "learning_rate": 6.353120243531203e-06, "loss": 0.0107, "step": 4690 }, { "epoch": 4.284018264840182, "grad_norm": 0.8614993095397949, "learning_rate": 6.352105530187722e-06, "loss": 0.0039, "step": 4691 }, { "epoch": 4.284931506849315, "grad_norm": 6.790313720703125, "learning_rate": 6.351090816844242e-06, "loss": 0.0505, "step": 4692 }, { "epoch": 4.285844748858447, "grad_norm": 30.868637084960938, "learning_rate": 6.3500761035007614e-06, "loss": 0.0594, "step": 4693 }, { "epoch": 4.28675799086758, "grad_norm": 1.8912278413772583, "learning_rate": 6.34906139015728e-06, "loss": 0.0071, "step": 4694 }, { "epoch": 4.287671232876712, "grad_norm": 29.46641731262207, "learning_rate": 6.348046676813801e-06, "loss": 0.1509, "step": 4695 }, { "epoch": 4.288584474885845, "grad_norm": 2.7309682369232178, "learning_rate": 6.3470319634703205e-06, "loss": 0.0141, "step": 4696 }, { "epoch": 4.289497716894977, "grad_norm": 3.079535961151123, "learning_rate": 6.34601725012684e-06, "loss": 0.0223, "step": 4697 }, { "epoch": 4.2904109589041095, "grad_norm": 1.4517085552215576, "learning_rate": 6.345002536783359e-06, "loss": 0.0092, "step": 4698 }, { "epoch": 4.291324200913242, "grad_norm": 0.27879437804222107, "learning_rate": 6.343987823439879e-06, "loss": 0.0021, "step": 4699 }, { "epoch": 4.292237442922374, "grad_norm": 12.75192642211914, "learning_rate": 6.3429731100963984e-06, "loss": 0.0839, "step": 4700 }, { "epoch": 4.293150684931507, "grad_norm": 1.4975048303604126, "learning_rate": 6.341958396752917e-06, "loss": 0.0113, "step": 4701 }, { "epoch": 4.294063926940639, "grad_norm": 0.26998063921928406, "learning_rate": 6.340943683409437e-06, "loss": 0.0022, "step": 4702 }, { "epoch": 4.294977168949772, "grad_norm": 0.7908628582954407, "learning_rate": 6.3399289700659575e-06, "loss": 0.0036, "step": 4703 }, { "epoch": 4.295890410958904, "grad_norm": 4.202581405639648, "learning_rate": 6.338914256722476e-06, "loss": 0.0181, "step": 4704 }, { "epoch": 4.296803652968037, "grad_norm": 2.9405815601348877, "learning_rate": 6.337899543378996e-06, "loss": 0.0165, "step": 4705 }, { "epoch": 4.297716894977169, "grad_norm": 0.07383091747760773, "learning_rate": 6.336884830035516e-06, "loss": 0.0004, "step": 4706 }, { "epoch": 4.298630136986302, "grad_norm": 0.2545827329158783, "learning_rate": 6.3358701166920354e-06, "loss": 0.0012, "step": 4707 }, { "epoch": 4.299543378995434, "grad_norm": 5.729373455047607, "learning_rate": 6.334855403348554e-06, "loss": 0.0299, "step": 4708 }, { "epoch": 4.3004566210045665, "grad_norm": 0.07209885865449905, "learning_rate": 6.333840690005074e-06, "loss": 0.0004, "step": 4709 }, { "epoch": 4.301369863013699, "grad_norm": 24.320226669311523, "learning_rate": 6.332825976661594e-06, "loss": 0.1448, "step": 4710 }, { "epoch": 4.302283105022831, "grad_norm": 62.146602630615234, "learning_rate": 6.3318112633181125e-06, "loss": 0.5327, "step": 4711 }, { "epoch": 4.303196347031964, "grad_norm": 30.571157455444336, "learning_rate": 6.330796549974633e-06, "loss": 0.2339, "step": 4712 }, { "epoch": 4.304109589041096, "grad_norm": 16.026376724243164, "learning_rate": 6.329781836631153e-06, "loss": 0.0976, "step": 4713 }, { "epoch": 4.305022831050229, "grad_norm": 71.80430603027344, "learning_rate": 6.328767123287672e-06, "loss": 0.6797, "step": 4714 }, { "epoch": 4.30593607305936, "grad_norm": 90.2544937133789, "learning_rate": 6.327752409944191e-06, "loss": 5.0874, "step": 4715 }, { "epoch": 4.306849315068493, "grad_norm": 0.6088566184043884, "learning_rate": 6.326737696600711e-06, "loss": 0.0035, "step": 4716 }, { "epoch": 4.307762557077625, "grad_norm": 48.553367614746094, "learning_rate": 6.325722983257231e-06, "loss": 0.288, "step": 4717 }, { "epoch": 4.308675799086758, "grad_norm": 1.3272967338562012, "learning_rate": 6.3247082699137495e-06, "loss": 0.011, "step": 4718 }, { "epoch": 4.30958904109589, "grad_norm": 2.086341142654419, "learning_rate": 6.323693556570269e-06, "loss": 0.0114, "step": 4719 }, { "epoch": 4.310502283105023, "grad_norm": 5.085206508636475, "learning_rate": 6.32267884322679e-06, "loss": 0.035, "step": 4720 }, { "epoch": 4.311415525114155, "grad_norm": 0.10182248055934906, "learning_rate": 6.321664129883308e-06, "loss": 0.0008, "step": 4721 }, { "epoch": 4.3123287671232875, "grad_norm": 4.556596279144287, "learning_rate": 6.320649416539828e-06, "loss": 0.0234, "step": 4722 }, { "epoch": 4.31324200913242, "grad_norm": 0.05946796014904976, "learning_rate": 6.319634703196348e-06, "loss": 0.0004, "step": 4723 }, { "epoch": 4.314155251141552, "grad_norm": 2.5956132411956787, "learning_rate": 6.318619989852867e-06, "loss": 0.0146, "step": 4724 }, { "epoch": 4.315068493150685, "grad_norm": 6.693475246429443, "learning_rate": 6.3176052765093865e-06, "loss": 0.0383, "step": 4725 }, { "epoch": 4.315981735159817, "grad_norm": 2.4516563415527344, "learning_rate": 6.316590563165906e-06, "loss": 0.0222, "step": 4726 }, { "epoch": 4.31689497716895, "grad_norm": 2.4168028831481934, "learning_rate": 6.315575849822426e-06, "loss": 0.0185, "step": 4727 }, { "epoch": 4.317808219178082, "grad_norm": 5.936742305755615, "learning_rate": 6.314561136478945e-06, "loss": 0.04, "step": 4728 }, { "epoch": 4.318721461187215, "grad_norm": 6.452297687530518, "learning_rate": 6.3135464231354644e-06, "loss": 0.04, "step": 4729 }, { "epoch": 4.319634703196347, "grad_norm": 59.0095329284668, "learning_rate": 6.312531709791985e-06, "loss": 0.5409, "step": 4730 }, { "epoch": 4.32054794520548, "grad_norm": 8.555061340332031, "learning_rate": 6.311516996448504e-06, "loss": 0.0467, "step": 4731 }, { "epoch": 4.321461187214612, "grad_norm": 10.177248001098633, "learning_rate": 6.3105022831050235e-06, "loss": 0.0646, "step": 4732 }, { "epoch": 4.3223744292237445, "grad_norm": 16.31907081604004, "learning_rate": 6.309487569761543e-06, "loss": 0.11, "step": 4733 }, { "epoch": 4.323287671232877, "grad_norm": 3.5589616298675537, "learning_rate": 6.308472856418062e-06, "loss": 0.0213, "step": 4734 }, { "epoch": 4.324200913242009, "grad_norm": 4.834095001220703, "learning_rate": 6.307458143074582e-06, "loss": 0.0358, "step": 4735 }, { "epoch": 4.325114155251142, "grad_norm": 81.3114013671875, "learning_rate": 6.3064434297311014e-06, "loss": 2.538, "step": 4736 }, { "epoch": 4.326027397260274, "grad_norm": 0.1574224978685379, "learning_rate": 6.305428716387622e-06, "loss": 0.0013, "step": 4737 }, { "epoch": 4.326940639269407, "grad_norm": 1.0626842975616455, "learning_rate": 6.30441400304414e-06, "loss": 0.0085, "step": 4738 }, { "epoch": 4.327853881278539, "grad_norm": 30.77861785888672, "learning_rate": 6.3033992897006605e-06, "loss": 0.193, "step": 4739 }, { "epoch": 4.328767123287671, "grad_norm": 5.053157329559326, "learning_rate": 6.30238457635718e-06, "loss": 0.0405, "step": 4740 }, { "epoch": 4.329680365296804, "grad_norm": 0.15045276284217834, "learning_rate": 6.301369863013699e-06, "loss": 0.0009, "step": 4741 }, { "epoch": 4.330593607305936, "grad_norm": 28.221263885498047, "learning_rate": 6.300355149670219e-06, "loss": 0.1978, "step": 4742 }, { "epoch": 4.331506849315068, "grad_norm": 1.6406699419021606, "learning_rate": 6.2993404363267384e-06, "loss": 0.0095, "step": 4743 }, { "epoch": 4.332420091324201, "grad_norm": 0.054080940783023834, "learning_rate": 6.298325722983257e-06, "loss": 0.0004, "step": 4744 }, { "epoch": 4.333333333333333, "grad_norm": 0.1413850486278534, "learning_rate": 6.297311009639777e-06, "loss": 0.001, "step": 4745 }, { "epoch": 4.3342465753424655, "grad_norm": 11.731868743896484, "learning_rate": 6.296296296296297e-06, "loss": 0.0414, "step": 4746 }, { "epoch": 4.335159817351598, "grad_norm": 1.8516675233840942, "learning_rate": 6.295281582952817e-06, "loss": 0.0119, "step": 4747 }, { "epoch": 4.33607305936073, "grad_norm": 3.8013713359832764, "learning_rate": 6.294266869609336e-06, "loss": 0.036, "step": 4748 }, { "epoch": 4.336986301369863, "grad_norm": 5.145938873291016, "learning_rate": 6.293252156265856e-06, "loss": 0.0453, "step": 4749 }, { "epoch": 4.337899543378995, "grad_norm": 1.847504734992981, "learning_rate": 6.292237442922375e-06, "loss": 0.0088, "step": 4750 }, { "epoch": 4.338812785388128, "grad_norm": 19.895496368408203, "learning_rate": 6.291222729578894e-06, "loss": 0.1233, "step": 4751 }, { "epoch": 4.33972602739726, "grad_norm": 8.280673027038574, "learning_rate": 6.290208016235414e-06, "loss": 0.0664, "step": 4752 }, { "epoch": 4.340639269406393, "grad_norm": 1.0181801319122314, "learning_rate": 6.289193302891934e-06, "loss": 0.0067, "step": 4753 }, { "epoch": 4.341552511415525, "grad_norm": 3.105107069015503, "learning_rate": 6.2881785895484525e-06, "loss": 0.0209, "step": 4754 }, { "epoch": 4.342465753424658, "grad_norm": 49.876102447509766, "learning_rate": 6.287163876204972e-06, "loss": 0.6238, "step": 4755 }, { "epoch": 4.34337899543379, "grad_norm": 6.1008477210998535, "learning_rate": 6.286149162861493e-06, "loss": 0.0321, "step": 4756 }, { "epoch": 4.3442922374429225, "grad_norm": 18.222393035888672, "learning_rate": 6.285134449518012e-06, "loss": 0.126, "step": 4757 }, { "epoch": 4.345205479452055, "grad_norm": 20.638614654541016, "learning_rate": 6.284119736174531e-06, "loss": 0.1276, "step": 4758 }, { "epoch": 4.346118721461187, "grad_norm": 5.800159931182861, "learning_rate": 6.283105022831051e-06, "loss": 0.0437, "step": 4759 }, { "epoch": 4.34703196347032, "grad_norm": 5.754451751708984, "learning_rate": 6.282090309487571e-06, "loss": 0.0443, "step": 4760 }, { "epoch": 4.347945205479452, "grad_norm": 2.2861549854278564, "learning_rate": 6.2810755961440895e-06, "loss": 0.0167, "step": 4761 }, { "epoch": 4.348858447488585, "grad_norm": 2.026045799255371, "learning_rate": 6.280060882800609e-06, "loss": 0.0129, "step": 4762 }, { "epoch": 4.349771689497717, "grad_norm": 1.4395785331726074, "learning_rate": 6.279046169457129e-06, "loss": 0.0068, "step": 4763 }, { "epoch": 4.35068493150685, "grad_norm": 26.796907424926758, "learning_rate": 6.278031456113648e-06, "loss": 0.1668, "step": 4764 }, { "epoch": 4.351598173515982, "grad_norm": 8.2876615524292, "learning_rate": 6.277016742770167e-06, "loss": 0.048, "step": 4765 }, { "epoch": 4.352511415525115, "grad_norm": 7.628794193267822, "learning_rate": 6.276002029426688e-06, "loss": 0.0431, "step": 4766 }, { "epoch": 4.353424657534246, "grad_norm": 3.5092272758483887, "learning_rate": 6.274987316083208e-06, "loss": 0.0181, "step": 4767 }, { "epoch": 4.3543378995433795, "grad_norm": 1.9517762660980225, "learning_rate": 6.2739726027397265e-06, "loss": 0.0143, "step": 4768 }, { "epoch": 4.355251141552511, "grad_norm": 0.2508706748485565, "learning_rate": 6.272957889396246e-06, "loss": 0.001, "step": 4769 }, { "epoch": 4.3561643835616435, "grad_norm": 0.2979598641395569, "learning_rate": 6.271943176052766e-06, "loss": 0.0021, "step": 4770 }, { "epoch": 4.357077625570776, "grad_norm": 19.1948184967041, "learning_rate": 6.270928462709285e-06, "loss": 0.1591, "step": 4771 }, { "epoch": 4.357990867579908, "grad_norm": 0.11864650249481201, "learning_rate": 6.269913749365804e-06, "loss": 0.0008, "step": 4772 }, { "epoch": 4.358904109589041, "grad_norm": 4.848877429962158, "learning_rate": 6.268899036022324e-06, "loss": 0.0425, "step": 4773 }, { "epoch": 4.359817351598173, "grad_norm": 0.029851900413632393, "learning_rate": 6.267884322678843e-06, "loss": 0.0003, "step": 4774 }, { "epoch": 4.360730593607306, "grad_norm": 64.17314910888672, "learning_rate": 6.2668696093353635e-06, "loss": 0.6706, "step": 4775 }, { "epoch": 4.361643835616438, "grad_norm": 14.475104331970215, "learning_rate": 6.265854895991883e-06, "loss": 0.0974, "step": 4776 }, { "epoch": 4.362557077625571, "grad_norm": 0.5575304627418518, "learning_rate": 6.264840182648403e-06, "loss": 0.0038, "step": 4777 }, { "epoch": 4.363470319634703, "grad_norm": 5.56693696975708, "learning_rate": 6.263825469304922e-06, "loss": 0.0326, "step": 4778 }, { "epoch": 4.364383561643836, "grad_norm": 1.785805106163025, "learning_rate": 6.262810755961441e-06, "loss": 0.0069, "step": 4779 }, { "epoch": 4.365296803652968, "grad_norm": 16.477645874023438, "learning_rate": 6.261796042617961e-06, "loss": 0.0928, "step": 4780 }, { "epoch": 4.3662100456621005, "grad_norm": 1.884128451347351, "learning_rate": 6.26078132927448e-06, "loss": 0.0105, "step": 4781 }, { "epoch": 4.367123287671233, "grad_norm": 4.252908229827881, "learning_rate": 6.259766615931e-06, "loss": 0.0252, "step": 4782 }, { "epoch": 4.368036529680365, "grad_norm": 2.7733421325683594, "learning_rate": 6.25875190258752e-06, "loss": 0.0201, "step": 4783 }, { "epoch": 4.368949771689498, "grad_norm": 0.72563636302948, "learning_rate": 6.257737189244039e-06, "loss": 0.005, "step": 4784 }, { "epoch": 4.36986301369863, "grad_norm": 36.92793655395508, "learning_rate": 6.256722475900559e-06, "loss": 0.2107, "step": 4785 }, { "epoch": 4.370776255707763, "grad_norm": 8.090476036071777, "learning_rate": 6.255707762557078e-06, "loss": 0.0299, "step": 4786 }, { "epoch": 4.371689497716895, "grad_norm": 0.02150150202214718, "learning_rate": 6.254693049213598e-06, "loss": 0.0002, "step": 4787 }, { "epoch": 4.372602739726028, "grad_norm": 0.08809508383274078, "learning_rate": 6.253678335870117e-06, "loss": 0.0004, "step": 4788 }, { "epoch": 4.37351598173516, "grad_norm": 109.44340515136719, "learning_rate": 6.252663622526637e-06, "loss": 1.7942, "step": 4789 }, { "epoch": 4.3744292237442925, "grad_norm": 0.015440328046679497, "learning_rate": 6.251648909183156e-06, "loss": 0.0001, "step": 4790 }, { "epoch": 4.375342465753425, "grad_norm": 1.496738314628601, "learning_rate": 6.250634195839675e-06, "loss": 0.0076, "step": 4791 }, { "epoch": 4.3762557077625575, "grad_norm": 3.0197081565856934, "learning_rate": 6.249619482496196e-06, "loss": 0.0259, "step": 4792 }, { "epoch": 4.37716894977169, "grad_norm": 36.38610076904297, "learning_rate": 6.248604769152715e-06, "loss": 0.303, "step": 4793 }, { "epoch": 4.3780821917808215, "grad_norm": 49.1194953918457, "learning_rate": 6.247590055809234e-06, "loss": 0.4452, "step": 4794 }, { "epoch": 4.378995433789954, "grad_norm": 5.335875988006592, "learning_rate": 6.246575342465754e-06, "loss": 0.0239, "step": 4795 }, { "epoch": 4.379908675799086, "grad_norm": 0.5684645175933838, "learning_rate": 6.245560629122274e-06, "loss": 0.0056, "step": 4796 }, { "epoch": 4.380821917808219, "grad_norm": 39.96236801147461, "learning_rate": 6.244545915778793e-06, "loss": 0.3351, "step": 4797 }, { "epoch": 4.381735159817351, "grad_norm": 0.9258634448051453, "learning_rate": 6.243531202435312e-06, "loss": 0.0044, "step": 4798 }, { "epoch": 4.382648401826484, "grad_norm": 1.0735142230987549, "learning_rate": 6.242516489091832e-06, "loss": 0.0068, "step": 4799 }, { "epoch": 4.383561643835616, "grad_norm": 24.717947006225586, "learning_rate": 6.241501775748352e-06, "loss": 0.2006, "step": 4800 }, { "epoch": 4.384474885844749, "grad_norm": 88.5384521484375, "learning_rate": 6.24048706240487e-06, "loss": 2.16, "step": 4801 }, { "epoch": 4.385388127853881, "grad_norm": 1.7830485105514526, "learning_rate": 6.239472349061391e-06, "loss": 0.0117, "step": 4802 }, { "epoch": 4.3863013698630136, "grad_norm": 2.308095693588257, "learning_rate": 6.238457635717911e-06, "loss": 0.0136, "step": 4803 }, { "epoch": 4.387214611872146, "grad_norm": 1.8178733587265015, "learning_rate": 6.2374429223744295e-06, "loss": 0.0105, "step": 4804 }, { "epoch": 4.3881278538812785, "grad_norm": 0.6371842622756958, "learning_rate": 6.236428209030949e-06, "loss": 0.0041, "step": 4805 }, { "epoch": 4.389041095890411, "grad_norm": 4.352443218231201, "learning_rate": 6.235413495687469e-06, "loss": 0.0274, "step": 4806 }, { "epoch": 4.389954337899543, "grad_norm": 0.9282264113426208, "learning_rate": 6.2343987823439886e-06, "loss": 0.0056, "step": 4807 }, { "epoch": 4.390867579908676, "grad_norm": 18.360544204711914, "learning_rate": 6.233384069000507e-06, "loss": 0.1093, "step": 4808 }, { "epoch": 4.391780821917808, "grad_norm": 7.65017557144165, "learning_rate": 6.232369355657027e-06, "loss": 0.0413, "step": 4809 }, { "epoch": 4.392694063926941, "grad_norm": 10.971640586853027, "learning_rate": 6.231354642313548e-06, "loss": 0.1293, "step": 4810 }, { "epoch": 4.393607305936073, "grad_norm": 1.300457239151001, "learning_rate": 6.2303399289700665e-06, "loss": 0.0122, "step": 4811 }, { "epoch": 4.394520547945206, "grad_norm": 3.856196641921997, "learning_rate": 6.229325215626586e-06, "loss": 0.0267, "step": 4812 }, { "epoch": 4.395433789954338, "grad_norm": 4.055069923400879, "learning_rate": 6.228310502283106e-06, "loss": 0.0384, "step": 4813 }, { "epoch": 4.3963470319634705, "grad_norm": 1.0028505325317383, "learning_rate": 6.227295788939625e-06, "loss": 0.0062, "step": 4814 }, { "epoch": 4.397260273972603, "grad_norm": 4.129992485046387, "learning_rate": 6.226281075596144e-06, "loss": 0.0223, "step": 4815 }, { "epoch": 4.3981735159817354, "grad_norm": 36.44713592529297, "learning_rate": 6.225266362252664e-06, "loss": 0.2861, "step": 4816 }, { "epoch": 4.399086757990868, "grad_norm": 1.0715899467468262, "learning_rate": 6.224251648909185e-06, "loss": 0.0054, "step": 4817 }, { "epoch": 4.4, "grad_norm": 0.03769668936729431, "learning_rate": 6.223236935565703e-06, "loss": 0.0003, "step": 4818 }, { "epoch": 4.400913242009133, "grad_norm": 7.430515766143799, "learning_rate": 6.222222222222223e-06, "loss": 0.0412, "step": 4819 }, { "epoch": 4.401826484018265, "grad_norm": 0.002110701287165284, "learning_rate": 6.221207508878743e-06, "loss": 0.0, "step": 4820 }, { "epoch": 4.402739726027397, "grad_norm": 10.673552513122559, "learning_rate": 6.220192795535262e-06, "loss": 0.0739, "step": 4821 }, { "epoch": 4.403652968036529, "grad_norm": 0.4329606592655182, "learning_rate": 6.219178082191781e-06, "loss": 0.0023, "step": 4822 }, { "epoch": 4.404566210045662, "grad_norm": 1.3968141078948975, "learning_rate": 6.218163368848301e-06, "loss": 0.0085, "step": 4823 }, { "epoch": 4.405479452054794, "grad_norm": 0.1865750104188919, "learning_rate": 6.21714865550482e-06, "loss": 0.0011, "step": 4824 }, { "epoch": 4.406392694063927, "grad_norm": 2.7046563625335693, "learning_rate": 6.21613394216134e-06, "loss": 0.0193, "step": 4825 }, { "epoch": 4.407305936073059, "grad_norm": 48.44721603393555, "learning_rate": 6.215119228817859e-06, "loss": 0.399, "step": 4826 }, { "epoch": 4.4082191780821915, "grad_norm": 27.678428649902344, "learning_rate": 6.21410451547438e-06, "loss": 0.2621, "step": 4827 }, { "epoch": 4.409132420091324, "grad_norm": 8.339215278625488, "learning_rate": 6.213089802130899e-06, "loss": 0.0581, "step": 4828 }, { "epoch": 4.4100456621004565, "grad_norm": 3.581993818283081, "learning_rate": 6.212075088787418e-06, "loss": 0.0282, "step": 4829 }, { "epoch": 4.410958904109589, "grad_norm": 57.95824432373047, "learning_rate": 6.211060375443938e-06, "loss": 0.4131, "step": 4830 }, { "epoch": 4.411872146118721, "grad_norm": 2.662341356277466, "learning_rate": 6.210045662100457e-06, "loss": 0.0201, "step": 4831 }, { "epoch": 4.412785388127854, "grad_norm": 10.074056625366211, "learning_rate": 6.209030948756977e-06, "loss": 0.0534, "step": 4832 }, { "epoch": 4.413698630136986, "grad_norm": 1.9398702383041382, "learning_rate": 6.208016235413496e-06, "loss": 0.0132, "step": 4833 }, { "epoch": 4.414611872146119, "grad_norm": 103.90211486816406, "learning_rate": 6.207001522070015e-06, "loss": 3.1089, "step": 4834 }, { "epoch": 4.415525114155251, "grad_norm": 0.4613822400569916, "learning_rate": 6.205986808726535e-06, "loss": 0.0037, "step": 4835 }, { "epoch": 4.416438356164384, "grad_norm": 0.7053696513175964, "learning_rate": 6.204972095383055e-06, "loss": 0.0048, "step": 4836 }, { "epoch": 4.417351598173516, "grad_norm": 0.10045462101697922, "learning_rate": 6.203957382039575e-06, "loss": 0.0007, "step": 4837 }, { "epoch": 4.4182648401826485, "grad_norm": 7.425210475921631, "learning_rate": 6.202942668696094e-06, "loss": 0.0496, "step": 4838 }, { "epoch": 4.419178082191781, "grad_norm": 12.180761337280273, "learning_rate": 6.201927955352614e-06, "loss": 0.0635, "step": 4839 }, { "epoch": 4.420091324200913, "grad_norm": 0.6703388690948486, "learning_rate": 6.200913242009133e-06, "loss": 0.0053, "step": 4840 }, { "epoch": 4.421004566210046, "grad_norm": 2.117053747177124, "learning_rate": 6.199898528665652e-06, "loss": 0.0144, "step": 4841 }, { "epoch": 4.421917808219178, "grad_norm": 66.6168441772461, "learning_rate": 6.198883815322172e-06, "loss": 0.4643, "step": 4842 }, { "epoch": 4.422831050228311, "grad_norm": 0.9518606066703796, "learning_rate": 6.1978691019786915e-06, "loss": 0.007, "step": 4843 }, { "epoch": 4.423744292237443, "grad_norm": 1.3637171983718872, "learning_rate": 6.19685438863521e-06, "loss": 0.0057, "step": 4844 }, { "epoch": 4.424657534246576, "grad_norm": 28.010225296020508, "learning_rate": 6.19583967529173e-06, "loss": 0.5221, "step": 4845 }, { "epoch": 4.425570776255708, "grad_norm": 3.5113182067871094, "learning_rate": 6.194824961948251e-06, "loss": 0.0228, "step": 4846 }, { "epoch": 4.426484018264841, "grad_norm": 3.099405288696289, "learning_rate": 6.19381024860477e-06, "loss": 0.0276, "step": 4847 }, { "epoch": 4.427397260273972, "grad_norm": 16.922901153564453, "learning_rate": 6.192795535261289e-06, "loss": 0.0791, "step": 4848 }, { "epoch": 4.428310502283105, "grad_norm": 7.093107223510742, "learning_rate": 6.191780821917809e-06, "loss": 0.0408, "step": 4849 }, { "epoch": 4.429223744292237, "grad_norm": 0.10716923326253891, "learning_rate": 6.1907661085743285e-06, "loss": 0.0006, "step": 4850 }, { "epoch": 4.4301369863013695, "grad_norm": 0.39056485891342163, "learning_rate": 6.189751395230847e-06, "loss": 0.0032, "step": 4851 }, { "epoch": 4.431050228310502, "grad_norm": 0.36427226662635803, "learning_rate": 6.188736681887367e-06, "loss": 0.0029, "step": 4852 }, { "epoch": 4.4319634703196344, "grad_norm": 2.980548143386841, "learning_rate": 6.187721968543887e-06, "loss": 0.0202, "step": 4853 }, { "epoch": 4.432876712328767, "grad_norm": 3.109825372695923, "learning_rate": 6.186707255200406e-06, "loss": 0.0195, "step": 4854 }, { "epoch": 4.433789954337899, "grad_norm": 4.864294052124023, "learning_rate": 6.185692541856926e-06, "loss": 0.0299, "step": 4855 }, { "epoch": 4.434703196347032, "grad_norm": 30.18191146850586, "learning_rate": 6.184677828513446e-06, "loss": 0.2995, "step": 4856 }, { "epoch": 4.435616438356164, "grad_norm": 1.831399917602539, "learning_rate": 6.1836631151699655e-06, "loss": 0.0161, "step": 4857 }, { "epoch": 4.436529680365297, "grad_norm": 0.6876999139785767, "learning_rate": 6.182648401826484e-06, "loss": 0.0044, "step": 4858 }, { "epoch": 4.437442922374429, "grad_norm": 3.2865331172943115, "learning_rate": 6.181633688483004e-06, "loss": 0.0217, "step": 4859 }, { "epoch": 4.438356164383562, "grad_norm": 92.21307373046875, "learning_rate": 6.180618975139524e-06, "loss": 1.5165, "step": 4860 }, { "epoch": 4.439269406392694, "grad_norm": 0.8253594636917114, "learning_rate": 6.179604261796043e-06, "loss": 0.0059, "step": 4861 }, { "epoch": 4.4401826484018265, "grad_norm": 1.7545710802078247, "learning_rate": 6.178589548452562e-06, "loss": 0.0145, "step": 4862 }, { "epoch": 4.441095890410959, "grad_norm": 0.5811802744865417, "learning_rate": 6.177574835109083e-06, "loss": 0.0042, "step": 4863 }, { "epoch": 4.442009132420091, "grad_norm": 2.0756561756134033, "learning_rate": 6.176560121765602e-06, "loss": 0.0137, "step": 4864 }, { "epoch": 4.442922374429224, "grad_norm": 0.24159689247608185, "learning_rate": 6.175545408422121e-06, "loss": 0.0015, "step": 4865 }, { "epoch": 4.443835616438356, "grad_norm": 9.057783126831055, "learning_rate": 6.174530695078641e-06, "loss": 0.0898, "step": 4866 }, { "epoch": 4.444748858447489, "grad_norm": 89.64385223388672, "learning_rate": 6.173515981735161e-06, "loss": 0.5615, "step": 4867 }, { "epoch": 4.445662100456621, "grad_norm": 0.45894595980644226, "learning_rate": 6.17250126839168e-06, "loss": 0.0025, "step": 4868 }, { "epoch": 4.446575342465754, "grad_norm": 3.380303382873535, "learning_rate": 6.171486555048199e-06, "loss": 0.02, "step": 4869 }, { "epoch": 4.447488584474886, "grad_norm": 43.16197204589844, "learning_rate": 6.170471841704719e-06, "loss": 0.5048, "step": 4870 }, { "epoch": 4.448401826484019, "grad_norm": 0.9818916320800781, "learning_rate": 6.169457128361238e-06, "loss": 0.0065, "step": 4871 }, { "epoch": 4.449315068493151, "grad_norm": 4.530580043792725, "learning_rate": 6.168442415017758e-06, "loss": 0.0102, "step": 4872 }, { "epoch": 4.4502283105022835, "grad_norm": 34.209373474121094, "learning_rate": 6.167427701674278e-06, "loss": 0.3152, "step": 4873 }, { "epoch": 4.451141552511416, "grad_norm": 124.02887725830078, "learning_rate": 6.166412988330797e-06, "loss": 2.7027, "step": 4874 }, { "epoch": 4.4520547945205475, "grad_norm": 93.0252685546875, "learning_rate": 6.165398274987317e-06, "loss": 0.2999, "step": 4875 }, { "epoch": 4.45296803652968, "grad_norm": 71.96197509765625, "learning_rate": 6.164383561643836e-06, "loss": 0.4928, "step": 4876 }, { "epoch": 4.453881278538812, "grad_norm": 0.39743247628211975, "learning_rate": 6.163368848300356e-06, "loss": 0.0032, "step": 4877 }, { "epoch": 4.454794520547945, "grad_norm": 0.188593327999115, "learning_rate": 6.162354134956875e-06, "loss": 0.0006, "step": 4878 }, { "epoch": 4.455707762557077, "grad_norm": 0.09245389699935913, "learning_rate": 6.1613394216133945e-06, "loss": 0.0009, "step": 4879 }, { "epoch": 4.45662100456621, "grad_norm": 6.107860565185547, "learning_rate": 6.160324708269915e-06, "loss": 0.0464, "step": 4880 }, { "epoch": 4.457534246575342, "grad_norm": 59.79962158203125, "learning_rate": 6.159309994926433e-06, "loss": 0.2266, "step": 4881 }, { "epoch": 4.458447488584475, "grad_norm": 1.8099546432495117, "learning_rate": 6.158295281582954e-06, "loss": 0.0164, "step": 4882 }, { "epoch": 4.459360730593607, "grad_norm": 33.537567138671875, "learning_rate": 6.157280568239473e-06, "loss": 0.2124, "step": 4883 }, { "epoch": 4.46027397260274, "grad_norm": 2.589165210723877, "learning_rate": 6.156265854895992e-06, "loss": 0.0172, "step": 4884 }, { "epoch": 4.461187214611872, "grad_norm": 1.5176349878311157, "learning_rate": 6.155251141552512e-06, "loss": 0.0098, "step": 4885 }, { "epoch": 4.4621004566210045, "grad_norm": 0.37631624937057495, "learning_rate": 6.1542364282090315e-06, "loss": 0.0021, "step": 4886 }, { "epoch": 4.463013698630137, "grad_norm": 5.32150936126709, "learning_rate": 6.153221714865551e-06, "loss": 0.0493, "step": 4887 }, { "epoch": 4.463926940639269, "grad_norm": 0.8889058828353882, "learning_rate": 6.15220700152207e-06, "loss": 0.0078, "step": 4888 }, { "epoch": 4.464840182648402, "grad_norm": 0.1016450896859169, "learning_rate": 6.15119228817859e-06, "loss": 0.0009, "step": 4889 }, { "epoch": 4.465753424657534, "grad_norm": 1.9299678802490234, "learning_rate": 6.15017757483511e-06, "loss": 0.0136, "step": 4890 }, { "epoch": 4.466666666666667, "grad_norm": 4.696848392486572, "learning_rate": 6.149162861491629e-06, "loss": 0.0366, "step": 4891 }, { "epoch": 4.467579908675799, "grad_norm": 0.9137385487556458, "learning_rate": 6.148148148148149e-06, "loss": 0.0043, "step": 4892 }, { "epoch": 4.468493150684932, "grad_norm": 0.15871617197990417, "learning_rate": 6.1471334348046685e-06, "loss": 0.0007, "step": 4893 }, { "epoch": 4.469406392694064, "grad_norm": 5.459829330444336, "learning_rate": 6.146118721461187e-06, "loss": 0.0261, "step": 4894 }, { "epoch": 4.470319634703197, "grad_norm": 2.27260684967041, "learning_rate": 6.145104008117707e-06, "loss": 0.0138, "step": 4895 }, { "epoch": 4.471232876712329, "grad_norm": 0.9743214845657349, "learning_rate": 6.144089294774227e-06, "loss": 0.007, "step": 4896 }, { "epoch": 4.4721461187214615, "grad_norm": 1.8945168256759644, "learning_rate": 6.143074581430747e-06, "loss": 0.0106, "step": 4897 }, { "epoch": 4.473059360730594, "grad_norm": 3.376997232437134, "learning_rate": 6.142059868087265e-06, "loss": 0.0231, "step": 4898 }, { "epoch": 4.473972602739726, "grad_norm": 135.06698608398438, "learning_rate": 6.141045154743786e-06, "loss": 1.668, "step": 4899 }, { "epoch": 4.474885844748858, "grad_norm": 0.6401609182357788, "learning_rate": 6.1400304414003055e-06, "loss": 0.004, "step": 4900 }, { "epoch": 4.475799086757991, "grad_norm": 0.5185571908950806, "learning_rate": 6.139015728056824e-06, "loss": 0.0037, "step": 4901 }, { "epoch": 4.476712328767123, "grad_norm": 0.5396716594696045, "learning_rate": 6.138001014713344e-06, "loss": 0.0043, "step": 4902 }, { "epoch": 4.477625570776255, "grad_norm": 1.1439300775527954, "learning_rate": 6.136986301369864e-06, "loss": 0.007, "step": 4903 }, { "epoch": 4.478538812785388, "grad_norm": 5.079713821411133, "learning_rate": 6.135971588026383e-06, "loss": 0.0099, "step": 4904 }, { "epoch": 4.47945205479452, "grad_norm": 90.10560607910156, "learning_rate": 6.134956874682902e-06, "loss": 1.3879, "step": 4905 }, { "epoch": 4.480365296803653, "grad_norm": 8.669622421264648, "learning_rate": 6.133942161339422e-06, "loss": 0.0666, "step": 4906 }, { "epoch": 4.481278538812785, "grad_norm": 0.22223928570747375, "learning_rate": 6.1329274479959425e-06, "loss": 0.0016, "step": 4907 }, { "epoch": 4.482191780821918, "grad_norm": 0.06565593183040619, "learning_rate": 6.131912734652461e-06, "loss": 0.0005, "step": 4908 }, { "epoch": 4.48310502283105, "grad_norm": 29.59088706970215, "learning_rate": 6.130898021308981e-06, "loss": 0.2864, "step": 4909 }, { "epoch": 4.4840182648401825, "grad_norm": 0.3723951578140259, "learning_rate": 6.129883307965501e-06, "loss": 0.0022, "step": 4910 }, { "epoch": 4.484931506849315, "grad_norm": 14.72134780883789, "learning_rate": 6.12886859462202e-06, "loss": 0.0995, "step": 4911 }, { "epoch": 4.485844748858447, "grad_norm": 19.285669326782227, "learning_rate": 6.127853881278539e-06, "loss": 0.1521, "step": 4912 }, { "epoch": 4.48675799086758, "grad_norm": 0.07090284675359726, "learning_rate": 6.126839167935059e-06, "loss": 0.0003, "step": 4913 }, { "epoch": 4.487671232876712, "grad_norm": 58.327667236328125, "learning_rate": 6.125824454591578e-06, "loss": 0.335, "step": 4914 }, { "epoch": 4.488584474885845, "grad_norm": 7.054019451141357, "learning_rate": 6.1248097412480975e-06, "loss": 0.067, "step": 4915 }, { "epoch": 4.489497716894977, "grad_norm": 4.301089763641357, "learning_rate": 6.123795027904618e-06, "loss": 0.0274, "step": 4916 }, { "epoch": 4.49041095890411, "grad_norm": 32.29283142089844, "learning_rate": 6.122780314561138e-06, "loss": 0.2531, "step": 4917 }, { "epoch": 4.491324200913242, "grad_norm": 0.9998927116394043, "learning_rate": 6.121765601217657e-06, "loss": 0.0053, "step": 4918 }, { "epoch": 4.492237442922375, "grad_norm": 2.1892669200897217, "learning_rate": 6.120750887874176e-06, "loss": 0.0194, "step": 4919 }, { "epoch": 4.493150684931507, "grad_norm": 15.889001846313477, "learning_rate": 6.119736174530696e-06, "loss": 0.0749, "step": 4920 }, { "epoch": 4.4940639269406395, "grad_norm": 19.915096282958984, "learning_rate": 6.118721461187215e-06, "loss": 0.1185, "step": 4921 }, { "epoch": 4.494977168949772, "grad_norm": 0.4381082355976105, "learning_rate": 6.1177067478437345e-06, "loss": 0.0033, "step": 4922 }, { "epoch": 4.495890410958904, "grad_norm": 6.752540588378906, "learning_rate": 6.116692034500254e-06, "loss": 0.0447, "step": 4923 }, { "epoch": 4.496803652968037, "grad_norm": 26.997901916503906, "learning_rate": 6.115677321156773e-06, "loss": 0.1566, "step": 4924 }, { "epoch": 4.497716894977169, "grad_norm": 11.20703125, "learning_rate": 6.114662607813293e-06, "loss": 0.0951, "step": 4925 }, { "epoch": 4.498630136986302, "grad_norm": 4.170664310455322, "learning_rate": 6.113647894469813e-06, "loss": 0.0287, "step": 4926 }, { "epoch": 4.499543378995433, "grad_norm": 2.48142671585083, "learning_rate": 6.112633181126333e-06, "loss": 0.0182, "step": 4927 }, { "epoch": 4.500456621004567, "grad_norm": 11.869293212890625, "learning_rate": 6.111618467782852e-06, "loss": 0.0909, "step": 4928 }, { "epoch": 4.501369863013698, "grad_norm": 3.6587116718292236, "learning_rate": 6.1106037544393715e-06, "loss": 0.0202, "step": 4929 }, { "epoch": 4.502283105022831, "grad_norm": 0.6726700663566589, "learning_rate": 6.109589041095891e-06, "loss": 0.0047, "step": 4930 }, { "epoch": 4.503196347031963, "grad_norm": 5.361343860626221, "learning_rate": 6.10857432775241e-06, "loss": 0.0336, "step": 4931 }, { "epoch": 4.504109589041096, "grad_norm": 13.306597709655762, "learning_rate": 6.10755961440893e-06, "loss": 0.09, "step": 4932 }, { "epoch": 4.505022831050228, "grad_norm": 0.3544495701789856, "learning_rate": 6.1065449010654494e-06, "loss": 0.0028, "step": 4933 }, { "epoch": 4.5059360730593605, "grad_norm": 0.7471415996551514, "learning_rate": 6.105530187721968e-06, "loss": 0.0032, "step": 4934 }, { "epoch": 4.506849315068493, "grad_norm": 16.682878494262695, "learning_rate": 6.104515474378489e-06, "loss": 0.1128, "step": 4935 }, { "epoch": 4.507762557077625, "grad_norm": 2.0304174423217773, "learning_rate": 6.1035007610350085e-06, "loss": 0.0148, "step": 4936 }, { "epoch": 4.508675799086758, "grad_norm": 0.12413958460092545, "learning_rate": 6.102486047691528e-06, "loss": 0.0009, "step": 4937 }, { "epoch": 4.50958904109589, "grad_norm": 0.14638236165046692, "learning_rate": 6.101471334348047e-06, "loss": 0.0011, "step": 4938 }, { "epoch": 4.510502283105023, "grad_norm": 3.269857406616211, "learning_rate": 6.100456621004567e-06, "loss": 0.028, "step": 4939 }, { "epoch": 4.511415525114155, "grad_norm": 11.036603927612305, "learning_rate": 6.099441907661086e-06, "loss": 0.0883, "step": 4940 }, { "epoch": 4.512328767123288, "grad_norm": 0.32139769196510315, "learning_rate": 6.098427194317605e-06, "loss": 0.0018, "step": 4941 }, { "epoch": 4.51324200913242, "grad_norm": 0.24290640652179718, "learning_rate": 6.097412480974125e-06, "loss": 0.0013, "step": 4942 }, { "epoch": 4.514155251141553, "grad_norm": 75.6833267211914, "learning_rate": 6.0963977676306455e-06, "loss": 0.6304, "step": 4943 }, { "epoch": 4.515068493150685, "grad_norm": 12.81019401550293, "learning_rate": 6.095383054287164e-06, "loss": 0.0472, "step": 4944 }, { "epoch": 4.5159817351598175, "grad_norm": 42.983158111572266, "learning_rate": 6.094368340943684e-06, "loss": 0.2693, "step": 4945 }, { "epoch": 4.51689497716895, "grad_norm": 20.717098236083984, "learning_rate": 6.093353627600204e-06, "loss": 0.1241, "step": 4946 }, { "epoch": 4.517808219178082, "grad_norm": 32.79779815673828, "learning_rate": 6.092338914256723e-06, "loss": 0.2286, "step": 4947 }, { "epoch": 4.518721461187215, "grad_norm": 69.1375961303711, "learning_rate": 6.091324200913242e-06, "loss": 0.7743, "step": 4948 }, { "epoch": 4.519634703196347, "grad_norm": 2.1990113258361816, "learning_rate": 6.090309487569762e-06, "loss": 0.0138, "step": 4949 }, { "epoch": 4.52054794520548, "grad_norm": 89.64912414550781, "learning_rate": 6.089294774226282e-06, "loss": 0.6038, "step": 4950 }, { "epoch": 4.521461187214612, "grad_norm": 0.25340384244918823, "learning_rate": 6.0882800608828005e-06, "loss": 0.0016, "step": 4951 }, { "epoch": 4.522374429223745, "grad_norm": 0.058822307735681534, "learning_rate": 6.087265347539321e-06, "loss": 0.0004, "step": 4952 }, { "epoch": 4.523287671232877, "grad_norm": 1.3872621059417725, "learning_rate": 6.086250634195841e-06, "loss": 0.0094, "step": 4953 }, { "epoch": 4.524200913242009, "grad_norm": 0.14459228515625, "learning_rate": 6.0852359208523596e-06, "loss": 0.0008, "step": 4954 }, { "epoch": 4.525114155251142, "grad_norm": 0.5165494084358215, "learning_rate": 6.084221207508879e-06, "loss": 0.0028, "step": 4955 }, { "epoch": 4.526027397260274, "grad_norm": 2.5077178478240967, "learning_rate": 6.083206494165399e-06, "loss": 0.0136, "step": 4956 }, { "epoch": 4.526940639269406, "grad_norm": 3.633335828781128, "learning_rate": 6.082191780821919e-06, "loss": 0.024, "step": 4957 }, { "epoch": 4.5278538812785385, "grad_norm": 5.349911212921143, "learning_rate": 6.0811770674784375e-06, "loss": 0.0393, "step": 4958 }, { "epoch": 4.528767123287671, "grad_norm": 2.964815855026245, "learning_rate": 6.080162354134957e-06, "loss": 0.0133, "step": 4959 }, { "epoch": 4.529680365296803, "grad_norm": 9.334402084350586, "learning_rate": 6.079147640791478e-06, "loss": 0.066, "step": 4960 }, { "epoch": 4.530593607305936, "grad_norm": 4.712311744689941, "learning_rate": 6.078132927447996e-06, "loss": 0.027, "step": 4961 }, { "epoch": 4.531506849315068, "grad_norm": 0.0901949405670166, "learning_rate": 6.077118214104516e-06, "loss": 0.0005, "step": 4962 }, { "epoch": 4.532420091324201, "grad_norm": 5.671307563781738, "learning_rate": 6.076103500761036e-06, "loss": 0.0235, "step": 4963 }, { "epoch": 4.533333333333333, "grad_norm": 79.31974029541016, "learning_rate": 6.075088787417555e-06, "loss": 0.6116, "step": 4964 }, { "epoch": 4.534246575342466, "grad_norm": 0.9744187593460083, "learning_rate": 6.0740740740740745e-06, "loss": 0.0038, "step": 4965 }, { "epoch": 4.535159817351598, "grad_norm": 0.42387762665748596, "learning_rate": 6.073059360730594e-06, "loss": 0.0018, "step": 4966 }, { "epoch": 4.536073059360731, "grad_norm": 58.83143615722656, "learning_rate": 6.072044647387114e-06, "loss": 0.4003, "step": 4967 }, { "epoch": 4.536986301369863, "grad_norm": 0.8086957931518555, "learning_rate": 6.071029934043633e-06, "loss": 0.0049, "step": 4968 }, { "epoch": 4.5378995433789955, "grad_norm": 3.6293766498565674, "learning_rate": 6.070015220700152e-06, "loss": 0.03, "step": 4969 }, { "epoch": 4.538812785388128, "grad_norm": 6.7703375816345215, "learning_rate": 6.069000507356673e-06, "loss": 0.0407, "step": 4970 }, { "epoch": 4.53972602739726, "grad_norm": 4.8691840171813965, "learning_rate": 6.067985794013192e-06, "loss": 0.0322, "step": 4971 }, { "epoch": 4.540639269406393, "grad_norm": 3.844560384750366, "learning_rate": 6.0669710806697115e-06, "loss": 0.0255, "step": 4972 }, { "epoch": 4.541552511415525, "grad_norm": 20.244836807250977, "learning_rate": 6.065956367326231e-06, "loss": 0.1354, "step": 4973 }, { "epoch": 4.542465753424658, "grad_norm": 34.23277282714844, "learning_rate": 6.06494165398275e-06, "loss": 0.269, "step": 4974 }, { "epoch": 4.54337899543379, "grad_norm": 0.008744382299482822, "learning_rate": 6.06392694063927e-06, "loss": 0.0001, "step": 4975 }, { "epoch": 4.544292237442923, "grad_norm": 14.61913776397705, "learning_rate": 6.062912227295789e-06, "loss": 0.071, "step": 4976 }, { "epoch": 4.545205479452055, "grad_norm": 19.16398048400879, "learning_rate": 6.061897513952309e-06, "loss": 0.1646, "step": 4977 }, { "epoch": 4.546118721461188, "grad_norm": 14.448333740234375, "learning_rate": 6.060882800608828e-06, "loss": 0.0949, "step": 4978 }, { "epoch": 4.54703196347032, "grad_norm": 44.944969177246094, "learning_rate": 6.0598680872653485e-06, "loss": 0.3686, "step": 4979 }, { "epoch": 4.5479452054794525, "grad_norm": 0.23363621532917023, "learning_rate": 6.058853373921868e-06, "loss": 0.0012, "step": 4980 }, { "epoch": 4.548858447488584, "grad_norm": 0.9347777962684631, "learning_rate": 6.057838660578387e-06, "loss": 0.0041, "step": 4981 }, { "epoch": 4.549771689497717, "grad_norm": 117.26302337646484, "learning_rate": 6.056823947234907e-06, "loss": 2.0805, "step": 4982 }, { "epoch": 4.550684931506849, "grad_norm": 0.9691550731658936, "learning_rate": 6.055809233891426e-06, "loss": 0.0061, "step": 4983 }, { "epoch": 4.551598173515981, "grad_norm": 5.6687331199646, "learning_rate": 6.054794520547945e-06, "loss": 0.0255, "step": 4984 }, { "epoch": 4.552511415525114, "grad_norm": 0.4143216609954834, "learning_rate": 6.053779807204465e-06, "loss": 0.0022, "step": 4985 }, { "epoch": 4.553424657534246, "grad_norm": 96.6860580444336, "learning_rate": 6.052765093860985e-06, "loss": 0.9958, "step": 4986 }, { "epoch": 4.554337899543379, "grad_norm": 23.030344009399414, "learning_rate": 6.051750380517505e-06, "loss": 0.206, "step": 4987 }, { "epoch": 4.555251141552511, "grad_norm": 1.1947191953659058, "learning_rate": 6.050735667174024e-06, "loss": 0.0084, "step": 4988 }, { "epoch": 4.556164383561644, "grad_norm": 0.11253992468118668, "learning_rate": 6.049720953830544e-06, "loss": 0.0007, "step": 4989 }, { "epoch": 4.557077625570776, "grad_norm": 18.942148208618164, "learning_rate": 6.048706240487063e-06, "loss": 0.0754, "step": 4990 }, { "epoch": 4.557990867579909, "grad_norm": 0.02740485593676567, "learning_rate": 6.047691527143582e-06, "loss": 0.0002, "step": 4991 }, { "epoch": 4.558904109589041, "grad_norm": 1.4455811977386475, "learning_rate": 6.046676813800102e-06, "loss": 0.0076, "step": 4992 }, { "epoch": 4.5598173515981735, "grad_norm": 0.9273141026496887, "learning_rate": 6.045662100456622e-06, "loss": 0.0057, "step": 4993 }, { "epoch": 4.560730593607306, "grad_norm": 11.614686012268066, "learning_rate": 6.0446473871131405e-06, "loss": 0.1, "step": 4994 }, { "epoch": 4.561643835616438, "grad_norm": 15.064282417297363, "learning_rate": 6.04363267376966e-06, "loss": 0.0998, "step": 4995 }, { "epoch": 4.562557077625571, "grad_norm": 0.19494283199310303, "learning_rate": 6.042617960426181e-06, "loss": 0.001, "step": 4996 }, { "epoch": 4.563470319634703, "grad_norm": 0.37095433473587036, "learning_rate": 6.0416032470827e-06, "loss": 0.0026, "step": 4997 }, { "epoch": 4.564383561643836, "grad_norm": 0.5074388384819031, "learning_rate": 6.040588533739219e-06, "loss": 0.0036, "step": 4998 }, { "epoch": 4.565296803652968, "grad_norm": 1.9445765018463135, "learning_rate": 6.039573820395739e-06, "loss": 0.0135, "step": 4999 }, { "epoch": 4.566210045662101, "grad_norm": 0.6672837138175964, "learning_rate": 6.038559107052259e-06, "loss": 0.0052, "step": 5000 }, { "epoch": 4.567123287671233, "grad_norm": 6.868631839752197, "learning_rate": 6.0375443937087775e-06, "loss": 0.0378, "step": 5001 }, { "epoch": 4.5680365296803656, "grad_norm": 0.33390527963638306, "learning_rate": 6.036529680365297e-06, "loss": 0.002, "step": 5002 }, { "epoch": 4.568949771689498, "grad_norm": 0.13105358183383942, "learning_rate": 6.035514967021817e-06, "loss": 0.0008, "step": 5003 }, { "epoch": 4.5698630136986305, "grad_norm": 2.3817787170410156, "learning_rate": 6.034500253678336e-06, "loss": 0.0113, "step": 5004 }, { "epoch": 4.570776255707763, "grad_norm": 4.8692426681518555, "learning_rate": 6.033485540334855e-06, "loss": 0.0275, "step": 5005 }, { "epoch": 4.5716894977168945, "grad_norm": 6.700533866882324, "learning_rate": 6.032470826991376e-06, "loss": 0.0352, "step": 5006 }, { "epoch": 4.572602739726028, "grad_norm": 13.296224594116211, "learning_rate": 6.031456113647896e-06, "loss": 0.0651, "step": 5007 }, { "epoch": 4.573515981735159, "grad_norm": 0.4671914577484131, "learning_rate": 6.0304414003044145e-06, "loss": 0.003, "step": 5008 }, { "epoch": 4.574429223744293, "grad_norm": 6.1197943687438965, "learning_rate": 6.029426686960934e-06, "loss": 0.043, "step": 5009 }, { "epoch": 4.575342465753424, "grad_norm": 81.25835418701172, "learning_rate": 6.028411973617454e-06, "loss": 0.9258, "step": 5010 }, { "epoch": 4.576255707762557, "grad_norm": 0.43803921341896057, "learning_rate": 6.027397260273973e-06, "loss": 0.0021, "step": 5011 }, { "epoch": 4.577168949771689, "grad_norm": 2.8811416625976562, "learning_rate": 6.026382546930492e-06, "loss": 0.0162, "step": 5012 }, { "epoch": 4.578082191780822, "grad_norm": 4.8809709548950195, "learning_rate": 6.025367833587012e-06, "loss": 0.0251, "step": 5013 }, { "epoch": 4.578995433789954, "grad_norm": 0.30153825879096985, "learning_rate": 6.024353120243531e-06, "loss": 0.0011, "step": 5014 }, { "epoch": 4.579908675799087, "grad_norm": 3.2107322216033936, "learning_rate": 6.0233384069000515e-06, "loss": 0.0153, "step": 5015 }, { "epoch": 4.580821917808219, "grad_norm": 0.05129655450582504, "learning_rate": 6.022323693556571e-06, "loss": 0.0004, "step": 5016 }, { "epoch": 4.5817351598173515, "grad_norm": 2.9860503673553467, "learning_rate": 6.021308980213091e-06, "loss": 0.0188, "step": 5017 }, { "epoch": 4.582648401826484, "grad_norm": 58.067596435546875, "learning_rate": 6.02029426686961e-06, "loss": 0.5438, "step": 5018 }, { "epoch": 4.583561643835616, "grad_norm": 111.07013702392578, "learning_rate": 6.019279553526129e-06, "loss": 5.1097, "step": 5019 }, { "epoch": 4.584474885844749, "grad_norm": 2.012324333190918, "learning_rate": 6.018264840182649e-06, "loss": 0.012, "step": 5020 }, { "epoch": 4.585388127853881, "grad_norm": 9.503437995910645, "learning_rate": 6.017250126839168e-06, "loss": 0.0564, "step": 5021 }, { "epoch": 4.586301369863014, "grad_norm": 0.049713097512722015, "learning_rate": 6.016235413495688e-06, "loss": 0.0004, "step": 5022 }, { "epoch": 4.587214611872146, "grad_norm": 1.1266205310821533, "learning_rate": 6.015220700152208e-06, "loss": 0.0089, "step": 5023 }, { "epoch": 4.588127853881279, "grad_norm": 5.060126781463623, "learning_rate": 6.014205986808727e-06, "loss": 0.0333, "step": 5024 }, { "epoch": 4.589041095890411, "grad_norm": 0.4833078980445862, "learning_rate": 6.013191273465247e-06, "loss": 0.0015, "step": 5025 }, { "epoch": 4.5899543378995435, "grad_norm": 11.989129066467285, "learning_rate": 6.012176560121766e-06, "loss": 0.1345, "step": 5026 }, { "epoch": 4.590867579908676, "grad_norm": 3.7108585834503174, "learning_rate": 6.011161846778286e-06, "loss": 0.0232, "step": 5027 }, { "epoch": 4.5917808219178085, "grad_norm": 1.3378043174743652, "learning_rate": 6.010147133434805e-06, "loss": 0.0069, "step": 5028 }, { "epoch": 4.592694063926941, "grad_norm": 0.1175069734454155, "learning_rate": 6.009132420091325e-06, "loss": 0.0007, "step": 5029 }, { "epoch": 4.593607305936073, "grad_norm": 14.170201301574707, "learning_rate": 6.008117706747844e-06, "loss": 0.0824, "step": 5030 }, { "epoch": 4.594520547945206, "grad_norm": 6.952031135559082, "learning_rate": 6.007102993404363e-06, "loss": 0.0334, "step": 5031 }, { "epoch": 4.595433789954338, "grad_norm": 2.671848773956299, "learning_rate": 6.006088280060884e-06, "loss": 0.0096, "step": 5032 }, { "epoch": 4.59634703196347, "grad_norm": 103.3674545288086, "learning_rate": 6.005073566717403e-06, "loss": 1.3643, "step": 5033 }, { "epoch": 4.597260273972603, "grad_norm": 39.37034606933594, "learning_rate": 6.004058853373922e-06, "loss": 0.1736, "step": 5034 }, { "epoch": 4.598173515981735, "grad_norm": 3.2259726524353027, "learning_rate": 6.003044140030442e-06, "loss": 0.0172, "step": 5035 }, { "epoch": 4.599086757990867, "grad_norm": 0.4694974720478058, "learning_rate": 6.002029426686962e-06, "loss": 0.0033, "step": 5036 }, { "epoch": 4.6, "grad_norm": 3.4071905612945557, "learning_rate": 6.001014713343481e-06, "loss": 0.0132, "step": 5037 }, { "epoch": 4.600913242009132, "grad_norm": 2.293997287750244, "learning_rate": 6e-06, "loss": 0.016, "step": 5038 }, { "epoch": 4.6018264840182646, "grad_norm": 3.840515613555908, "learning_rate": 5.99898528665652e-06, "loss": 0.0262, "step": 5039 }, { "epoch": 4.602739726027397, "grad_norm": 14.799886703491211, "learning_rate": 5.99797057331304e-06, "loss": 0.1538, "step": 5040 }, { "epoch": 4.6036529680365295, "grad_norm": 3.29691219329834, "learning_rate": 5.996955859969558e-06, "loss": 0.0181, "step": 5041 }, { "epoch": 4.604566210045662, "grad_norm": 0.6446681618690491, "learning_rate": 5.995941146626079e-06, "loss": 0.0042, "step": 5042 }, { "epoch": 4.605479452054794, "grad_norm": 0.25184696912765503, "learning_rate": 5.994926433282599e-06, "loss": 0.0015, "step": 5043 }, { "epoch": 4.606392694063927, "grad_norm": 29.80645179748535, "learning_rate": 5.9939117199391175e-06, "loss": 0.2486, "step": 5044 }, { "epoch": 4.607305936073059, "grad_norm": 6.987106800079346, "learning_rate": 5.992897006595637e-06, "loss": 0.0515, "step": 5045 }, { "epoch": 4.608219178082192, "grad_norm": 44.38870620727539, "learning_rate": 5.991882293252157e-06, "loss": 0.3464, "step": 5046 }, { "epoch": 4.609132420091324, "grad_norm": 21.864309310913086, "learning_rate": 5.990867579908676e-06, "loss": 0.1435, "step": 5047 }, { "epoch": 4.610045662100457, "grad_norm": 38.31217575073242, "learning_rate": 5.989852866565195e-06, "loss": 0.3361, "step": 5048 }, { "epoch": 4.610958904109589, "grad_norm": 0.02524600923061371, "learning_rate": 5.988838153221715e-06, "loss": 0.0001, "step": 5049 }, { "epoch": 4.6118721461187215, "grad_norm": 0.4024222195148468, "learning_rate": 5.987823439878236e-06, "loss": 0.0025, "step": 5050 }, { "epoch": 4.612785388127854, "grad_norm": 1.9417051076889038, "learning_rate": 5.9868087265347545e-06, "loss": 0.0103, "step": 5051 }, { "epoch": 4.6136986301369864, "grad_norm": 0.11030411720275879, "learning_rate": 5.985794013191274e-06, "loss": 0.0009, "step": 5052 }, { "epoch": 4.614611872146119, "grad_norm": 0.4261958599090576, "learning_rate": 5.984779299847794e-06, "loss": 0.0031, "step": 5053 }, { "epoch": 4.615525114155251, "grad_norm": 17.524154663085938, "learning_rate": 5.983764586504313e-06, "loss": 0.142, "step": 5054 }, { "epoch": 4.616438356164384, "grad_norm": 0.3620370030403137, "learning_rate": 5.982749873160832e-06, "loss": 0.0018, "step": 5055 }, { "epoch": 4.617351598173516, "grad_norm": 108.12537384033203, "learning_rate": 5.981735159817352e-06, "loss": 0.6368, "step": 5056 }, { "epoch": 4.618264840182649, "grad_norm": 15.243748664855957, "learning_rate": 5.980720446473871e-06, "loss": 0.0797, "step": 5057 }, { "epoch": 4.619178082191781, "grad_norm": 0.21959839761257172, "learning_rate": 5.979705733130391e-06, "loss": 0.0011, "step": 5058 }, { "epoch": 4.620091324200914, "grad_norm": 0.1838013380765915, "learning_rate": 5.978691019786911e-06, "loss": 0.0017, "step": 5059 }, { "epoch": 4.621004566210045, "grad_norm": 3.025536060333252, "learning_rate": 5.977676306443431e-06, "loss": 0.0182, "step": 5060 }, { "epoch": 4.6219178082191785, "grad_norm": 18.121292114257812, "learning_rate": 5.97666159309995e-06, "loss": 0.144, "step": 5061 }, { "epoch": 4.62283105022831, "grad_norm": 18.727718353271484, "learning_rate": 5.975646879756469e-06, "loss": 0.1614, "step": 5062 }, { "epoch": 4.6237442922374425, "grad_norm": 0.08675985038280487, "learning_rate": 5.974632166412989e-06, "loss": 0.0004, "step": 5063 }, { "epoch": 4.624657534246575, "grad_norm": 4.811676979064941, "learning_rate": 5.973617453069508e-06, "loss": 0.0239, "step": 5064 }, { "epoch": 4.6255707762557075, "grad_norm": 30.827436447143555, "learning_rate": 5.972602739726028e-06, "loss": 0.2662, "step": 5065 }, { "epoch": 4.62648401826484, "grad_norm": 1.9449371099472046, "learning_rate": 5.971588026382547e-06, "loss": 0.0121, "step": 5066 }, { "epoch": 4.627397260273972, "grad_norm": 0.6343604326248169, "learning_rate": 5.970573313039066e-06, "loss": 0.0063, "step": 5067 }, { "epoch": 4.628310502283105, "grad_norm": 4.203662872314453, "learning_rate": 5.969558599695587e-06, "loss": 0.0248, "step": 5068 }, { "epoch": 4.629223744292237, "grad_norm": 1.1661360263824463, "learning_rate": 5.968543886352106e-06, "loss": 0.0077, "step": 5069 }, { "epoch": 4.63013698630137, "grad_norm": 13.795806884765625, "learning_rate": 5.967529173008626e-06, "loss": 0.0765, "step": 5070 }, { "epoch": 4.631050228310502, "grad_norm": 0.4724084138870239, "learning_rate": 5.966514459665145e-06, "loss": 0.0038, "step": 5071 }, { "epoch": 4.631963470319635, "grad_norm": 16.802724838256836, "learning_rate": 5.965499746321665e-06, "loss": 0.0669, "step": 5072 }, { "epoch": 4.632876712328767, "grad_norm": 0.1672820746898651, "learning_rate": 5.964485032978184e-06, "loss": 0.001, "step": 5073 }, { "epoch": 4.6337899543378995, "grad_norm": 0.42651525139808655, "learning_rate": 5.963470319634703e-06, "loss": 0.0022, "step": 5074 }, { "epoch": 4.634703196347032, "grad_norm": 1.0204646587371826, "learning_rate": 5.962455606291223e-06, "loss": 0.006, "step": 5075 }, { "epoch": 4.635616438356164, "grad_norm": 64.80485534667969, "learning_rate": 5.961440892947743e-06, "loss": 0.3624, "step": 5076 }, { "epoch": 4.636529680365297, "grad_norm": 10.095235824584961, "learning_rate": 5.960426179604261e-06, "loss": 0.0564, "step": 5077 }, { "epoch": 4.637442922374429, "grad_norm": 1.0918785333633423, "learning_rate": 5.959411466260782e-06, "loss": 0.0061, "step": 5078 }, { "epoch": 4.638356164383562, "grad_norm": 3.814493417739868, "learning_rate": 5.958396752917302e-06, "loss": 0.0152, "step": 5079 }, { "epoch": 4.639269406392694, "grad_norm": 0.08266062289476395, "learning_rate": 5.957382039573821e-06, "loss": 0.0005, "step": 5080 }, { "epoch": 4.640182648401827, "grad_norm": 21.05353546142578, "learning_rate": 5.95636732623034e-06, "loss": 0.0961, "step": 5081 }, { "epoch": 4.641095890410959, "grad_norm": 27.078338623046875, "learning_rate": 5.95535261288686e-06, "loss": 0.1527, "step": 5082 }, { "epoch": 4.642009132420092, "grad_norm": 1.7300140857696533, "learning_rate": 5.9543378995433795e-06, "loss": 0.0106, "step": 5083 }, { "epoch": 4.642922374429224, "grad_norm": 11.416524887084961, "learning_rate": 5.953323186199898e-06, "loss": 0.0708, "step": 5084 }, { "epoch": 4.6438356164383565, "grad_norm": 10.22096061706543, "learning_rate": 5.952308472856418e-06, "loss": 0.0356, "step": 5085 }, { "epoch": 4.644748858447489, "grad_norm": 61.65169143676758, "learning_rate": 5.951293759512939e-06, "loss": 0.3637, "step": 5086 }, { "epoch": 4.6456621004566205, "grad_norm": 0.4424465298652649, "learning_rate": 5.9502790461694574e-06, "loss": 0.0029, "step": 5087 }, { "epoch": 4.646575342465754, "grad_norm": 3.545585870742798, "learning_rate": 5.949264332825977e-06, "loss": 0.017, "step": 5088 }, { "epoch": 4.647488584474885, "grad_norm": 7.820219993591309, "learning_rate": 5.948249619482497e-06, "loss": 0.0422, "step": 5089 }, { "epoch": 4.648401826484018, "grad_norm": 0.009376788511872292, "learning_rate": 5.9472349061390165e-06, "loss": 0.0001, "step": 5090 }, { "epoch": 4.64931506849315, "grad_norm": 0.32082802057266235, "learning_rate": 5.946220192795535e-06, "loss": 0.0016, "step": 5091 }, { "epoch": 4.650228310502283, "grad_norm": 1.5192134380340576, "learning_rate": 5.945205479452055e-06, "loss": 0.009, "step": 5092 }, { "epoch": 4.651141552511415, "grad_norm": 0.7021704316139221, "learning_rate": 5.944190766108575e-06, "loss": 0.004, "step": 5093 }, { "epoch": 4.652054794520548, "grad_norm": 0.07272211462259293, "learning_rate": 5.943176052765094e-06, "loss": 0.0005, "step": 5094 }, { "epoch": 4.65296803652968, "grad_norm": 1.2178397178649902, "learning_rate": 5.942161339421614e-06, "loss": 0.0039, "step": 5095 }, { "epoch": 4.653881278538813, "grad_norm": 0.9690282344818115, "learning_rate": 5.941146626078134e-06, "loss": 0.0068, "step": 5096 }, { "epoch": 4.654794520547945, "grad_norm": 0.6541019678115845, "learning_rate": 5.940131912734653e-06, "loss": 0.0044, "step": 5097 }, { "epoch": 4.6557077625570775, "grad_norm": 8.789979934692383, "learning_rate": 5.939117199391172e-06, "loss": 0.0667, "step": 5098 }, { "epoch": 4.65662100456621, "grad_norm": 2.2469258308410645, "learning_rate": 5.938102486047692e-06, "loss": 0.0137, "step": 5099 }, { "epoch": 4.657534246575342, "grad_norm": 3.5999667644500732, "learning_rate": 5.937087772704212e-06, "loss": 0.0244, "step": 5100 }, { "epoch": 4.658447488584475, "grad_norm": 7.407402038574219, "learning_rate": 5.936073059360731e-06, "loss": 0.0408, "step": 5101 }, { "epoch": 4.659360730593607, "grad_norm": 1.1958683729171753, "learning_rate": 5.93505834601725e-06, "loss": 0.0095, "step": 5102 }, { "epoch": 4.66027397260274, "grad_norm": 0.2500751316547394, "learning_rate": 5.934043632673771e-06, "loss": 0.0018, "step": 5103 }, { "epoch": 4.661187214611872, "grad_norm": 2.098921298980713, "learning_rate": 5.933028919330289e-06, "loss": 0.0116, "step": 5104 }, { "epoch": 4.662100456621005, "grad_norm": 117.4765625, "learning_rate": 5.932014205986809e-06, "loss": 2.4136, "step": 5105 }, { "epoch": 4.663013698630137, "grad_norm": 0.09081800282001495, "learning_rate": 5.930999492643329e-06, "loss": 0.0005, "step": 5106 }, { "epoch": 4.66392694063927, "grad_norm": 4.462942123413086, "learning_rate": 5.929984779299848e-06, "loss": 0.0341, "step": 5107 }, { "epoch": 4.664840182648402, "grad_norm": 2.674417018890381, "learning_rate": 5.928970065956368e-06, "loss": 0.0138, "step": 5108 }, { "epoch": 4.6657534246575345, "grad_norm": 3.2009119987487793, "learning_rate": 5.927955352612887e-06, "loss": 0.0128, "step": 5109 }, { "epoch": 4.666666666666667, "grad_norm": 4.916360855102539, "learning_rate": 5.926940639269407e-06, "loss": 0.0296, "step": 5110 }, { "epoch": 4.667579908675799, "grad_norm": 34.281856536865234, "learning_rate": 5.925925925925926e-06, "loss": 0.3537, "step": 5111 }, { "epoch": 4.668493150684932, "grad_norm": 0.14936687052249908, "learning_rate": 5.924911212582446e-06, "loss": 0.001, "step": 5112 }, { "epoch": 4.669406392694064, "grad_norm": 0.01582307741045952, "learning_rate": 5.923896499238966e-06, "loss": 0.0001, "step": 5113 }, { "epoch": 4.670319634703196, "grad_norm": 0.18301907181739807, "learning_rate": 5.922881785895485e-06, "loss": 0.0009, "step": 5114 }, { "epoch": 4.671232876712329, "grad_norm": 21.214061737060547, "learning_rate": 5.921867072552005e-06, "loss": 0.1324, "step": 5115 }, { "epoch": 4.672146118721461, "grad_norm": 23.01827621459961, "learning_rate": 5.920852359208524e-06, "loss": 0.1324, "step": 5116 }, { "epoch": 4.673059360730593, "grad_norm": 4.200490951538086, "learning_rate": 5.919837645865043e-06, "loss": 0.0278, "step": 5117 }, { "epoch": 4.673972602739726, "grad_norm": 71.87676239013672, "learning_rate": 5.918822932521563e-06, "loss": 0.7316, "step": 5118 }, { "epoch": 4.674885844748858, "grad_norm": 3.3690054416656494, "learning_rate": 5.9178082191780825e-06, "loss": 0.0268, "step": 5119 }, { "epoch": 4.675799086757991, "grad_norm": 99.77314758300781, "learning_rate": 5.916793505834603e-06, "loss": 2.7537, "step": 5120 }, { "epoch": 4.676712328767123, "grad_norm": 0.4253365099430084, "learning_rate": 5.915778792491121e-06, "loss": 0.0037, "step": 5121 }, { "epoch": 4.6776255707762555, "grad_norm": 5.3510823249816895, "learning_rate": 5.914764079147642e-06, "loss": 0.0331, "step": 5122 }, { "epoch": 4.678538812785388, "grad_norm": 9.88636589050293, "learning_rate": 5.913749365804161e-06, "loss": 0.062, "step": 5123 }, { "epoch": 4.67945205479452, "grad_norm": 0.7636845707893372, "learning_rate": 5.91273465246068e-06, "loss": 0.0062, "step": 5124 }, { "epoch": 4.680365296803653, "grad_norm": 28.16612434387207, "learning_rate": 5.9117199391172e-06, "loss": 0.2624, "step": 5125 }, { "epoch": 4.681278538812785, "grad_norm": 10.624711036682129, "learning_rate": 5.9107052257737195e-06, "loss": 0.0536, "step": 5126 }, { "epoch": 4.682191780821918, "grad_norm": 1.0429636240005493, "learning_rate": 5.909690512430238e-06, "loss": 0.0088, "step": 5127 }, { "epoch": 4.68310502283105, "grad_norm": 18.073766708374023, "learning_rate": 5.908675799086758e-06, "loss": 0.0949, "step": 5128 }, { "epoch": 4.684018264840183, "grad_norm": 0.0033678559120744467, "learning_rate": 5.907661085743278e-06, "loss": 0.0, "step": 5129 }, { "epoch": 4.684931506849315, "grad_norm": 51.160247802734375, "learning_rate": 5.906646372399798e-06, "loss": 0.5893, "step": 5130 }, { "epoch": 4.685844748858448, "grad_norm": 22.741500854492188, "learning_rate": 5.905631659056317e-06, "loss": 0.1387, "step": 5131 }, { "epoch": 4.68675799086758, "grad_norm": 18.789859771728516, "learning_rate": 5.904616945712837e-06, "loss": 0.2051, "step": 5132 }, { "epoch": 4.6876712328767125, "grad_norm": 0.9081122279167175, "learning_rate": 5.9036022323693565e-06, "loss": 0.0054, "step": 5133 }, { "epoch": 4.688584474885845, "grad_norm": 0.31715813279151917, "learning_rate": 5.902587519025875e-06, "loss": 0.0021, "step": 5134 }, { "epoch": 4.689497716894977, "grad_norm": 3.089104413986206, "learning_rate": 5.901572805682395e-06, "loss": 0.0234, "step": 5135 }, { "epoch": 4.69041095890411, "grad_norm": 0.9934554696083069, "learning_rate": 5.900558092338915e-06, "loss": 0.0069, "step": 5136 }, { "epoch": 4.691324200913242, "grad_norm": 2.7882792949676514, "learning_rate": 5.8995433789954336e-06, "loss": 0.0189, "step": 5137 }, { "epoch": 4.692237442922375, "grad_norm": 2.3543663024902344, "learning_rate": 5.898528665651953e-06, "loss": 0.0174, "step": 5138 }, { "epoch": 4.693150684931507, "grad_norm": 123.74368286132812, "learning_rate": 5.897513952308474e-06, "loss": 0.5372, "step": 5139 }, { "epoch": 4.69406392694064, "grad_norm": 9.814990043640137, "learning_rate": 5.8964992389649935e-06, "loss": 0.0995, "step": 5140 }, { "epoch": 4.694977168949771, "grad_norm": 0.2887100279331207, "learning_rate": 5.895484525621512e-06, "loss": 0.0022, "step": 5141 }, { "epoch": 4.695890410958905, "grad_norm": 1.354702353477478, "learning_rate": 5.894469812278032e-06, "loss": 0.0063, "step": 5142 }, { "epoch": 4.696803652968036, "grad_norm": 0.04788796603679657, "learning_rate": 5.893455098934552e-06, "loss": 0.0003, "step": 5143 }, { "epoch": 4.697716894977169, "grad_norm": 2.4398295879364014, "learning_rate": 5.8924403855910706e-06, "loss": 0.0197, "step": 5144 }, { "epoch": 4.698630136986301, "grad_norm": 119.86669158935547, "learning_rate": 5.89142567224759e-06, "loss": 3.617, "step": 5145 }, { "epoch": 4.6995433789954335, "grad_norm": 1.2906951904296875, "learning_rate": 5.89041095890411e-06, "loss": 0.008, "step": 5146 }, { "epoch": 4.700456621004566, "grad_norm": 23.65091323852539, "learning_rate": 5.889396245560629e-06, "loss": 0.1807, "step": 5147 }, { "epoch": 4.701369863013698, "grad_norm": 3.2047393321990967, "learning_rate": 5.888381532217149e-06, "loss": 0.0194, "step": 5148 }, { "epoch": 4.702283105022831, "grad_norm": 1.9091784954071045, "learning_rate": 5.887366818873669e-06, "loss": 0.0131, "step": 5149 }, { "epoch": 4.703196347031963, "grad_norm": 3.9423880577087402, "learning_rate": 5.886352105530189e-06, "loss": 0.022, "step": 5150 }, { "epoch": 4.704109589041096, "grad_norm": 10.974541664123535, "learning_rate": 5.8853373921867076e-06, "loss": 0.0505, "step": 5151 }, { "epoch": 4.705022831050228, "grad_norm": 1.4306156635284424, "learning_rate": 5.884322678843227e-06, "loss": 0.0099, "step": 5152 }, { "epoch": 4.705936073059361, "grad_norm": 23.733341217041016, "learning_rate": 5.883307965499747e-06, "loss": 0.175, "step": 5153 }, { "epoch": 4.706849315068493, "grad_norm": 127.90093994140625, "learning_rate": 5.882293252156266e-06, "loss": 6.9694, "step": 5154 }, { "epoch": 4.707762557077626, "grad_norm": 28.203834533691406, "learning_rate": 5.8812785388127855e-06, "loss": 0.186, "step": 5155 }, { "epoch": 4.708675799086758, "grad_norm": 11.89073371887207, "learning_rate": 5.880263825469306e-06, "loss": 0.1006, "step": 5156 }, { "epoch": 4.7095890410958905, "grad_norm": 1.0437425374984741, "learning_rate": 5.879249112125824e-06, "loss": 0.0062, "step": 5157 }, { "epoch": 4.710502283105023, "grad_norm": 0.7967360019683838, "learning_rate": 5.8782343987823446e-06, "loss": 0.0062, "step": 5158 }, { "epoch": 4.711415525114155, "grad_norm": 12.738207817077637, "learning_rate": 5.877219685438864e-06, "loss": 0.0566, "step": 5159 }, { "epoch": 4.712328767123288, "grad_norm": 5.372920513153076, "learning_rate": 5.876204972095384e-06, "loss": 0.0192, "step": 5160 }, { "epoch": 4.71324200913242, "grad_norm": 0.6046936511993408, "learning_rate": 5.875190258751903e-06, "loss": 0.002, "step": 5161 }, { "epoch": 4.714155251141553, "grad_norm": 21.76230812072754, "learning_rate": 5.8741755454084225e-06, "loss": 0.1053, "step": 5162 }, { "epoch": 4.715068493150685, "grad_norm": 28.048677444458008, "learning_rate": 5.873160832064942e-06, "loss": 0.2085, "step": 5163 }, { "epoch": 4.715981735159818, "grad_norm": 6.499228000640869, "learning_rate": 5.872146118721461e-06, "loss": 0.0229, "step": 5164 }, { "epoch": 4.71689497716895, "grad_norm": 2.335533857345581, "learning_rate": 5.871131405377981e-06, "loss": 0.02, "step": 5165 }, { "epoch": 4.717808219178083, "grad_norm": 12.329638481140137, "learning_rate": 5.870116692034501e-06, "loss": 0.0486, "step": 5166 }, { "epoch": 4.718721461187215, "grad_norm": 1.7947322130203247, "learning_rate": 5.86910197869102e-06, "loss": 0.012, "step": 5167 }, { "epoch": 4.719634703196347, "grad_norm": 0.3186401128768921, "learning_rate": 5.86808726534754e-06, "loss": 0.0017, "step": 5168 }, { "epoch": 4.72054794520548, "grad_norm": 1.7729498147964478, "learning_rate": 5.8670725520040595e-06, "loss": 0.0095, "step": 5169 }, { "epoch": 4.7214611872146115, "grad_norm": 3.4334774017333984, "learning_rate": 5.866057838660579e-06, "loss": 0.0227, "step": 5170 }, { "epoch": 4.722374429223744, "grad_norm": 0.19946150481700897, "learning_rate": 5.865043125317098e-06, "loss": 0.0016, "step": 5171 }, { "epoch": 4.723287671232876, "grad_norm": 0.4413634240627289, "learning_rate": 5.864028411973618e-06, "loss": 0.0023, "step": 5172 }, { "epoch": 4.724200913242009, "grad_norm": 107.57755279541016, "learning_rate": 5.863013698630137e-06, "loss": 3.2798, "step": 5173 }, { "epoch": 4.725114155251141, "grad_norm": 3.513650894165039, "learning_rate": 5.861998985286656e-06, "loss": 0.027, "step": 5174 }, { "epoch": 4.726027397260274, "grad_norm": 3.786301374435425, "learning_rate": 5.860984271943177e-06, "loss": 0.0281, "step": 5175 }, { "epoch": 4.726940639269406, "grad_norm": 25.149009704589844, "learning_rate": 5.8599695585996965e-06, "loss": 0.2006, "step": 5176 }, { "epoch": 4.727853881278539, "grad_norm": 17.079126358032227, "learning_rate": 5.858954845256215e-06, "loss": 0.0405, "step": 5177 }, { "epoch": 4.728767123287671, "grad_norm": 19.832813262939453, "learning_rate": 5.857940131912735e-06, "loss": 0.1456, "step": 5178 }, { "epoch": 4.729680365296804, "grad_norm": 5.32176399230957, "learning_rate": 5.856925418569255e-06, "loss": 0.0489, "step": 5179 }, { "epoch": 4.730593607305936, "grad_norm": 0.6868489384651184, "learning_rate": 5.855910705225774e-06, "loss": 0.0041, "step": 5180 }, { "epoch": 4.7315068493150685, "grad_norm": 5.057949066162109, "learning_rate": 5.854895991882293e-06, "loss": 0.0322, "step": 5181 }, { "epoch": 4.732420091324201, "grad_norm": 26.305082321166992, "learning_rate": 5.853881278538813e-06, "loss": 0.2707, "step": 5182 }, { "epoch": 4.733333333333333, "grad_norm": 18.250490188598633, "learning_rate": 5.8528665651953335e-06, "loss": 0.1549, "step": 5183 }, { "epoch": 4.734246575342466, "grad_norm": 7.063227653503418, "learning_rate": 5.8518518518518515e-06, "loss": 0.0391, "step": 5184 }, { "epoch": 4.735159817351598, "grad_norm": 50.237220764160156, "learning_rate": 5.850837138508372e-06, "loss": 0.4553, "step": 5185 }, { "epoch": 4.736073059360731, "grad_norm": 0.5327274799346924, "learning_rate": 5.849822425164892e-06, "loss": 0.0038, "step": 5186 }, { "epoch": 4.736986301369863, "grad_norm": 3.011155605316162, "learning_rate": 5.8488077118214106e-06, "loss": 0.0165, "step": 5187 }, { "epoch": 4.737899543378996, "grad_norm": 92.8238754272461, "learning_rate": 5.84779299847793e-06, "loss": 4.6552, "step": 5188 }, { "epoch": 4.738812785388128, "grad_norm": 5.041258335113525, "learning_rate": 5.84677828513445e-06, "loss": 0.0312, "step": 5189 }, { "epoch": 4.739726027397261, "grad_norm": 26.013721466064453, "learning_rate": 5.84576357179097e-06, "loss": 0.2331, "step": 5190 }, { "epoch": 4.740639269406393, "grad_norm": 2.2389914989471436, "learning_rate": 5.8447488584474885e-06, "loss": 0.0197, "step": 5191 }, { "epoch": 4.7415525114155255, "grad_norm": 19.13033676147461, "learning_rate": 5.843734145104009e-06, "loss": 0.1141, "step": 5192 }, { "epoch": 4.742465753424657, "grad_norm": 0.6802579760551453, "learning_rate": 5.842719431760529e-06, "loss": 0.0062, "step": 5193 }, { "epoch": 4.74337899543379, "grad_norm": 18.729490280151367, "learning_rate": 5.8417047184170476e-06, "loss": 0.1879, "step": 5194 }, { "epoch": 4.744292237442922, "grad_norm": 80.72889709472656, "learning_rate": 5.840690005073567e-06, "loss": 1.0558, "step": 5195 }, { "epoch": 4.745205479452055, "grad_norm": 5.282419204711914, "learning_rate": 5.839675291730087e-06, "loss": 0.039, "step": 5196 }, { "epoch": 4.746118721461187, "grad_norm": 0.8670524954795837, "learning_rate": 5.838660578386606e-06, "loss": 0.0057, "step": 5197 }, { "epoch": 4.747031963470319, "grad_norm": 10.455726623535156, "learning_rate": 5.8376458650431255e-06, "loss": 0.0757, "step": 5198 }, { "epoch": 4.747945205479452, "grad_norm": 17.1789493560791, "learning_rate": 5.836631151699645e-06, "loss": 0.1464, "step": 5199 }, { "epoch": 4.748858447488584, "grad_norm": 91.71302032470703, "learning_rate": 5.835616438356166e-06, "loss": 1.93, "step": 5200 }, { "epoch": 4.749771689497717, "grad_norm": 6.1435346603393555, "learning_rate": 5.834601725012684e-06, "loss": 0.046, "step": 5201 }, { "epoch": 4.750684931506849, "grad_norm": 19.56318473815918, "learning_rate": 5.833587011669204e-06, "loss": 0.1466, "step": 5202 }, { "epoch": 4.751598173515982, "grad_norm": 0.6915968656539917, "learning_rate": 5.832572298325724e-06, "loss": 0.0053, "step": 5203 }, { "epoch": 4.752511415525114, "grad_norm": 22.07624053955078, "learning_rate": 5.831557584982243e-06, "loss": 0.1191, "step": 5204 }, { "epoch": 4.7534246575342465, "grad_norm": 16.978872299194336, "learning_rate": 5.8305428716387625e-06, "loss": 0.1025, "step": 5205 }, { "epoch": 4.754337899543379, "grad_norm": 42.55967712402344, "learning_rate": 5.829528158295282e-06, "loss": 0.724, "step": 5206 }, { "epoch": 4.755251141552511, "grad_norm": 12.226722717285156, "learning_rate": 5.828513444951801e-06, "loss": 0.0743, "step": 5207 }, { "epoch": 4.756164383561644, "grad_norm": 24.717514038085938, "learning_rate": 5.827498731608321e-06, "loss": 0.1141, "step": 5208 }, { "epoch": 4.757077625570776, "grad_norm": 5.222588062286377, "learning_rate": 5.82648401826484e-06, "loss": 0.0429, "step": 5209 }, { "epoch": 4.757990867579909, "grad_norm": 3.4128708839416504, "learning_rate": 5.825469304921361e-06, "loss": 0.0228, "step": 5210 }, { "epoch": 4.758904109589041, "grad_norm": 0.27480611205101013, "learning_rate": 5.82445459157788e-06, "loss": 0.0024, "step": 5211 }, { "epoch": 4.759817351598174, "grad_norm": 0.44429251551628113, "learning_rate": 5.8234398782343995e-06, "loss": 0.003, "step": 5212 }, { "epoch": 4.760730593607306, "grad_norm": 16.65463638305664, "learning_rate": 5.822425164890919e-06, "loss": 0.1234, "step": 5213 }, { "epoch": 4.761643835616439, "grad_norm": 1.05391263961792, "learning_rate": 5.821410451547438e-06, "loss": 0.0048, "step": 5214 }, { "epoch": 4.762557077625571, "grad_norm": 0.10136956721544266, "learning_rate": 5.820395738203958e-06, "loss": 0.0003, "step": 5215 }, { "epoch": 4.7634703196347035, "grad_norm": 3.449831485748291, "learning_rate": 5.819381024860477e-06, "loss": 0.0306, "step": 5216 }, { "epoch": 4.764383561643836, "grad_norm": 0.48797228932380676, "learning_rate": 5.818366311516996e-06, "loss": 0.0041, "step": 5217 }, { "epoch": 4.765296803652968, "grad_norm": 2.150912046432495, "learning_rate": 5.817351598173516e-06, "loss": 0.0175, "step": 5218 }, { "epoch": 4.766210045662101, "grad_norm": 72.61621856689453, "learning_rate": 5.8163368848300365e-06, "loss": 0.8269, "step": 5219 }, { "epoch": 4.767123287671232, "grad_norm": 6.25045919418335, "learning_rate": 5.815322171486556e-06, "loss": 0.0431, "step": 5220 }, { "epoch": 4.768036529680366, "grad_norm": 2.4804134368896484, "learning_rate": 5.814307458143075e-06, "loss": 0.0165, "step": 5221 }, { "epoch": 4.768949771689497, "grad_norm": 37.246341705322266, "learning_rate": 5.813292744799595e-06, "loss": 0.2168, "step": 5222 }, { "epoch": 4.76986301369863, "grad_norm": 0.7272422909736633, "learning_rate": 5.812278031456114e-06, "loss": 0.0043, "step": 5223 }, { "epoch": 4.770776255707762, "grad_norm": 2.5788044929504395, "learning_rate": 5.811263318112633e-06, "loss": 0.0106, "step": 5224 }, { "epoch": 4.771689497716895, "grad_norm": 7.972287654876709, "learning_rate": 5.810248604769153e-06, "loss": 0.0533, "step": 5225 }, { "epoch": 4.772602739726027, "grad_norm": 1.0484752655029297, "learning_rate": 5.809233891425673e-06, "loss": 0.0085, "step": 5226 }, { "epoch": 4.77351598173516, "grad_norm": 0.9723784327507019, "learning_rate": 5.8082191780821915e-06, "loss": 0.0079, "step": 5227 }, { "epoch": 4.774429223744292, "grad_norm": 37.79200744628906, "learning_rate": 5.807204464738712e-06, "loss": 0.3476, "step": 5228 }, { "epoch": 4.7753424657534245, "grad_norm": 48.10444259643555, "learning_rate": 5.806189751395232e-06, "loss": 0.4706, "step": 5229 }, { "epoch": 4.776255707762557, "grad_norm": 12.133745193481445, "learning_rate": 5.805175038051751e-06, "loss": 0.057, "step": 5230 }, { "epoch": 4.777168949771689, "grad_norm": 95.88712310791016, "learning_rate": 5.80416032470827e-06, "loss": 1.3194, "step": 5231 }, { "epoch": 4.778082191780822, "grad_norm": 15.228135108947754, "learning_rate": 5.80314561136479e-06, "loss": 0.108, "step": 5232 }, { "epoch": 4.778995433789954, "grad_norm": 3.9755265712738037, "learning_rate": 5.80213089802131e-06, "loss": 0.0287, "step": 5233 }, { "epoch": 4.779908675799087, "grad_norm": 12.975773811340332, "learning_rate": 5.8011161846778285e-06, "loss": 0.0836, "step": 5234 }, { "epoch": 4.780821917808219, "grad_norm": 13.3253812789917, "learning_rate": 5.800101471334348e-06, "loss": 0.0807, "step": 5235 }, { "epoch": 4.781735159817352, "grad_norm": 0.29831182956695557, "learning_rate": 5.799086757990869e-06, "loss": 0.0021, "step": 5236 }, { "epoch": 4.782648401826484, "grad_norm": 21.817264556884766, "learning_rate": 5.798072044647387e-06, "loss": 0.134, "step": 5237 }, { "epoch": 4.7835616438356166, "grad_norm": 31.701276779174805, "learning_rate": 5.797057331303907e-06, "loss": 0.7232, "step": 5238 }, { "epoch": 4.784474885844749, "grad_norm": 0.6532248854637146, "learning_rate": 5.796042617960427e-06, "loss": 0.0039, "step": 5239 }, { "epoch": 4.7853881278538815, "grad_norm": 1.0936447381973267, "learning_rate": 5.795027904616947e-06, "loss": 0.0076, "step": 5240 }, { "epoch": 4.786301369863014, "grad_norm": 11.502208709716797, "learning_rate": 5.7940131912734655e-06, "loss": 0.0341, "step": 5241 }, { "epoch": 4.787214611872146, "grad_norm": 0.015040822327136993, "learning_rate": 5.792998477929985e-06, "loss": 0.0001, "step": 5242 }, { "epoch": 4.788127853881279, "grad_norm": 0.5679131746292114, "learning_rate": 5.791983764586505e-06, "loss": 0.005, "step": 5243 }, { "epoch": 4.789041095890411, "grad_norm": 6.362490177154541, "learning_rate": 5.790969051243024e-06, "loss": 0.0416, "step": 5244 }, { "epoch": 4.789954337899544, "grad_norm": 0.5940694808959961, "learning_rate": 5.789954337899543e-06, "loss": 0.002, "step": 5245 }, { "epoch": 4.790867579908676, "grad_norm": 160.51390075683594, "learning_rate": 5.788939624556064e-06, "loss": 0.3747, "step": 5246 }, { "epoch": 4.791780821917808, "grad_norm": 14.907342910766602, "learning_rate": 5.787924911212583e-06, "loss": 0.0904, "step": 5247 }, { "epoch": 4.792694063926941, "grad_norm": 4.432779312133789, "learning_rate": 5.7869101978691025e-06, "loss": 0.0297, "step": 5248 }, { "epoch": 4.793607305936073, "grad_norm": 7.479038238525391, "learning_rate": 5.785895484525622e-06, "loss": 0.0554, "step": 5249 }, { "epoch": 4.794520547945205, "grad_norm": 1.7072820663452148, "learning_rate": 5.784880771182142e-06, "loss": 0.0113, "step": 5250 }, { "epoch": 4.7954337899543376, "grad_norm": 66.86388397216797, "learning_rate": 5.783866057838661e-06, "loss": 0.4668, "step": 5251 }, { "epoch": 4.79634703196347, "grad_norm": 5.331115245819092, "learning_rate": 5.78285134449518e-06, "loss": 0.0244, "step": 5252 }, { "epoch": 4.7972602739726025, "grad_norm": 74.09292602539062, "learning_rate": 5.7818366311517e-06, "loss": 1.0676, "step": 5253 }, { "epoch": 4.798173515981735, "grad_norm": 1.561504602432251, "learning_rate": 5.780821917808219e-06, "loss": 0.008, "step": 5254 }, { "epoch": 4.799086757990867, "grad_norm": 0.14688003063201904, "learning_rate": 5.7798072044647394e-06, "loss": 0.0013, "step": 5255 }, { "epoch": 4.8, "grad_norm": 6.694660186767578, "learning_rate": 5.778792491121259e-06, "loss": 0.0525, "step": 5256 }, { "epoch": 4.800913242009132, "grad_norm": 85.7085952758789, "learning_rate": 5.777777777777778e-06, "loss": 0.8827, "step": 5257 }, { "epoch": 4.801826484018265, "grad_norm": 5.863836288452148, "learning_rate": 5.776763064434298e-06, "loss": 0.0447, "step": 5258 }, { "epoch": 4.802739726027397, "grad_norm": 0.6920908689498901, "learning_rate": 5.775748351090817e-06, "loss": 0.0054, "step": 5259 }, { "epoch": 4.80365296803653, "grad_norm": 1.5520323514938354, "learning_rate": 5.774733637747337e-06, "loss": 0.0108, "step": 5260 }, { "epoch": 4.804566210045662, "grad_norm": 12.904488563537598, "learning_rate": 5.773718924403856e-06, "loss": 0.1043, "step": 5261 }, { "epoch": 4.8054794520547945, "grad_norm": 0.5994160771369934, "learning_rate": 5.772704211060376e-06, "loss": 0.0046, "step": 5262 }, { "epoch": 4.806392694063927, "grad_norm": 177.38621520996094, "learning_rate": 5.771689497716896e-06, "loss": 0.5753, "step": 5263 }, { "epoch": 4.8073059360730594, "grad_norm": 14.36812973022461, "learning_rate": 5.770674784373414e-06, "loss": 0.0515, "step": 5264 }, { "epoch": 4.808219178082192, "grad_norm": 1.9265586137771606, "learning_rate": 5.769660071029935e-06, "loss": 0.0102, "step": 5265 }, { "epoch": 4.809132420091324, "grad_norm": 19.24673843383789, "learning_rate": 5.768645357686454e-06, "loss": 0.0756, "step": 5266 }, { "epoch": 4.810045662100457, "grad_norm": 1.526038408279419, "learning_rate": 5.767630644342973e-06, "loss": 0.0112, "step": 5267 }, { "epoch": 4.810958904109589, "grad_norm": 20.056291580200195, "learning_rate": 5.766615930999493e-06, "loss": 0.125, "step": 5268 }, { "epoch": 4.811872146118722, "grad_norm": 1.1979866027832031, "learning_rate": 5.765601217656013e-06, "loss": 0.0076, "step": 5269 }, { "epoch": 4.812785388127854, "grad_norm": 24.252016067504883, "learning_rate": 5.764586504312532e-06, "loss": 0.1773, "step": 5270 }, { "epoch": 4.813698630136987, "grad_norm": 50.46253204345703, "learning_rate": 5.763571790969051e-06, "loss": 0.4107, "step": 5271 }, { "epoch": 4.814611872146119, "grad_norm": 2.2567434310913086, "learning_rate": 5.762557077625572e-06, "loss": 0.0121, "step": 5272 }, { "epoch": 4.8155251141552515, "grad_norm": 0.3013279139995575, "learning_rate": 5.761542364282091e-06, "loss": 0.0019, "step": 5273 }, { "epoch": 4.816438356164383, "grad_norm": 1.8717716932296753, "learning_rate": 5.76052765093861e-06, "loss": 0.0118, "step": 5274 }, { "epoch": 4.817351598173516, "grad_norm": 13.829297065734863, "learning_rate": 5.75951293759513e-06, "loss": 0.0791, "step": 5275 }, { "epoch": 4.818264840182648, "grad_norm": 4.181802749633789, "learning_rate": 5.75849822425165e-06, "loss": 0.0377, "step": 5276 }, { "epoch": 4.8191780821917805, "grad_norm": 16.9544677734375, "learning_rate": 5.7574835109081684e-06, "loss": 0.2031, "step": 5277 }, { "epoch": 4.820091324200913, "grad_norm": 21.043203353881836, "learning_rate": 5.756468797564688e-06, "loss": 0.1229, "step": 5278 }, { "epoch": 4.821004566210045, "grad_norm": 24.356199264526367, "learning_rate": 5.755454084221208e-06, "loss": 0.2621, "step": 5279 }, { "epoch": 4.821917808219178, "grad_norm": 26.58235740661621, "learning_rate": 5.754439370877728e-06, "loss": 0.1623, "step": 5280 }, { "epoch": 4.82283105022831, "grad_norm": 3.612083911895752, "learning_rate": 5.753424657534246e-06, "loss": 0.0171, "step": 5281 }, { "epoch": 4.823744292237443, "grad_norm": 0.7983404994010925, "learning_rate": 5.752409944190767e-06, "loss": 0.0049, "step": 5282 }, { "epoch": 4.824657534246575, "grad_norm": 0.08829519152641296, "learning_rate": 5.751395230847287e-06, "loss": 0.0004, "step": 5283 }, { "epoch": 4.825570776255708, "grad_norm": 5.49267053604126, "learning_rate": 5.7503805175038054e-06, "loss": 0.0353, "step": 5284 }, { "epoch": 4.82648401826484, "grad_norm": 15.311284065246582, "learning_rate": 5.749365804160325e-06, "loss": 0.0753, "step": 5285 }, { "epoch": 4.8273972602739725, "grad_norm": 3.1396474838256836, "learning_rate": 5.748351090816845e-06, "loss": 0.0132, "step": 5286 }, { "epoch": 4.828310502283105, "grad_norm": 34.44754409790039, "learning_rate": 5.747336377473364e-06, "loss": 0.2899, "step": 5287 }, { "epoch": 4.829223744292237, "grad_norm": 11.045700073242188, "learning_rate": 5.746321664129883e-06, "loss": 0.0772, "step": 5288 }, { "epoch": 4.83013698630137, "grad_norm": 1.5454390048980713, "learning_rate": 5.745306950786403e-06, "loss": 0.0114, "step": 5289 }, { "epoch": 4.831050228310502, "grad_norm": 3.634521007537842, "learning_rate": 5.744292237442924e-06, "loss": 0.0277, "step": 5290 }, { "epoch": 4.831963470319635, "grad_norm": 5.302465438842773, "learning_rate": 5.7432775240994424e-06, "loss": 0.0302, "step": 5291 }, { "epoch": 4.832876712328767, "grad_norm": 0.05178002640604973, "learning_rate": 5.742262810755962e-06, "loss": 0.0003, "step": 5292 }, { "epoch": 4.8337899543379, "grad_norm": 10.25014877319336, "learning_rate": 5.741248097412482e-06, "loss": 0.0642, "step": 5293 }, { "epoch": 4.834703196347032, "grad_norm": 1.056504726409912, "learning_rate": 5.740233384069001e-06, "loss": 0.0068, "step": 5294 }, { "epoch": 4.835616438356165, "grad_norm": 0.6091381907463074, "learning_rate": 5.73921867072552e-06, "loss": 0.0041, "step": 5295 }, { "epoch": 4.836529680365297, "grad_norm": 0.7019872069358826, "learning_rate": 5.73820395738204e-06, "loss": 0.0039, "step": 5296 }, { "epoch": 4.8374429223744295, "grad_norm": 21.5441837310791, "learning_rate": 5.737189244038559e-06, "loss": 0.1303, "step": 5297 }, { "epoch": 4.838356164383562, "grad_norm": 103.48848724365234, "learning_rate": 5.736174530695079e-06, "loss": 2.898, "step": 5298 }, { "epoch": 4.839269406392694, "grad_norm": 4.756032466888428, "learning_rate": 5.735159817351599e-06, "loss": 0.0297, "step": 5299 }, { "epoch": 4.840182648401827, "grad_norm": 11.679990768432617, "learning_rate": 5.734145104008119e-06, "loss": 0.0723, "step": 5300 }, { "epoch": 4.8410958904109584, "grad_norm": 0.4178980886936188, "learning_rate": 5.733130390664638e-06, "loss": 0.0027, "step": 5301 }, { "epoch": 4.842009132420092, "grad_norm": 1.6533414125442505, "learning_rate": 5.732115677321157e-06, "loss": 0.0069, "step": 5302 }, { "epoch": 4.842922374429223, "grad_norm": 35.565616607666016, "learning_rate": 5.731100963977677e-06, "loss": 0.2136, "step": 5303 }, { "epoch": 4.843835616438356, "grad_norm": 15.911946296691895, "learning_rate": 5.730086250634196e-06, "loss": 0.1011, "step": 5304 }, { "epoch": 4.844748858447488, "grad_norm": 1.0788774490356445, "learning_rate": 5.729071537290716e-06, "loss": 0.0067, "step": 5305 }, { "epoch": 4.845662100456621, "grad_norm": 34.2581787109375, "learning_rate": 5.728056823947235e-06, "loss": 0.1361, "step": 5306 }, { "epoch": 4.846575342465753, "grad_norm": 0.06479249149560928, "learning_rate": 5.727042110603754e-06, "loss": 0.0003, "step": 5307 }, { "epoch": 4.847488584474886, "grad_norm": 7.839534759521484, "learning_rate": 5.726027397260274e-06, "loss": 0.0363, "step": 5308 }, { "epoch": 4.848401826484018, "grad_norm": 5.0446553230285645, "learning_rate": 5.725012683916794e-06, "loss": 0.022, "step": 5309 }, { "epoch": 4.8493150684931505, "grad_norm": 0.8902245759963989, "learning_rate": 5.723997970573314e-06, "loss": 0.0039, "step": 5310 }, { "epoch": 4.850228310502283, "grad_norm": 0.2590930163860321, "learning_rate": 5.722983257229833e-06, "loss": 0.0011, "step": 5311 }, { "epoch": 4.851141552511415, "grad_norm": 15.7620210647583, "learning_rate": 5.721968543886353e-06, "loss": 0.0747, "step": 5312 }, { "epoch": 4.852054794520548, "grad_norm": 1.7323048114776611, "learning_rate": 5.720953830542872e-06, "loss": 0.0129, "step": 5313 }, { "epoch": 4.85296803652968, "grad_norm": 1.5583781003952026, "learning_rate": 5.719939117199391e-06, "loss": 0.0122, "step": 5314 }, { "epoch": 4.853881278538813, "grad_norm": 15.359121322631836, "learning_rate": 5.718924403855911e-06, "loss": 0.0929, "step": 5315 }, { "epoch": 4.854794520547945, "grad_norm": 1.4957679510116577, "learning_rate": 5.717909690512431e-06, "loss": 0.0093, "step": 5316 }, { "epoch": 4.855707762557078, "grad_norm": 0.8437188267707825, "learning_rate": 5.716894977168949e-06, "loss": 0.005, "step": 5317 }, { "epoch": 4.85662100456621, "grad_norm": 12.879734992980957, "learning_rate": 5.71588026382547e-06, "loss": 0.0474, "step": 5318 }, { "epoch": 4.857534246575343, "grad_norm": 4.790459156036377, "learning_rate": 5.71486555048199e-06, "loss": 0.0387, "step": 5319 }, { "epoch": 4.858447488584475, "grad_norm": 0.06876649707555771, "learning_rate": 5.713850837138509e-06, "loss": 0.0004, "step": 5320 }, { "epoch": 4.8593607305936075, "grad_norm": 79.55477905273438, "learning_rate": 5.712836123795028e-06, "loss": 0.5555, "step": 5321 }, { "epoch": 4.86027397260274, "grad_norm": 0.23956269025802612, "learning_rate": 5.711821410451548e-06, "loss": 0.0016, "step": 5322 }, { "epoch": 4.861187214611872, "grad_norm": 0.912663459777832, "learning_rate": 5.7108066971080675e-06, "loss": 0.0046, "step": 5323 }, { "epoch": 4.862100456621005, "grad_norm": 0.47644856572151184, "learning_rate": 5.709791983764586e-06, "loss": 0.0029, "step": 5324 }, { "epoch": 4.863013698630137, "grad_norm": 0.3302465081214905, "learning_rate": 5.708777270421106e-06, "loss": 0.0024, "step": 5325 }, { "epoch": 4.86392694063927, "grad_norm": 241.13937377929688, "learning_rate": 5.7077625570776266e-06, "loss": 1.2061, "step": 5326 }, { "epoch": 4.864840182648402, "grad_norm": 7.794719696044922, "learning_rate": 5.706747843734145e-06, "loss": 0.0558, "step": 5327 }, { "epoch": 4.865753424657534, "grad_norm": 21.965469360351562, "learning_rate": 5.705733130390665e-06, "loss": 0.0854, "step": 5328 }, { "epoch": 4.866666666666667, "grad_norm": 28.0684757232666, "learning_rate": 5.704718417047185e-06, "loss": 0.1545, "step": 5329 }, { "epoch": 4.867579908675799, "grad_norm": 0.5134126543998718, "learning_rate": 5.7037037037037045e-06, "loss": 0.0026, "step": 5330 }, { "epoch": 4.868493150684931, "grad_norm": 2.6117939949035645, "learning_rate": 5.702688990360223e-06, "loss": 0.0109, "step": 5331 }, { "epoch": 4.869406392694064, "grad_norm": 1.986464500427246, "learning_rate": 5.701674277016743e-06, "loss": 0.0081, "step": 5332 }, { "epoch": 4.870319634703196, "grad_norm": 4.595193862915039, "learning_rate": 5.700659563673263e-06, "loss": 0.0138, "step": 5333 }, { "epoch": 4.8712328767123285, "grad_norm": 0.3864593505859375, "learning_rate": 5.6996448503297816e-06, "loss": 0.0023, "step": 5334 }, { "epoch": 4.872146118721461, "grad_norm": 1.27817702293396, "learning_rate": 5.698630136986302e-06, "loss": 0.0066, "step": 5335 }, { "epoch": 4.873059360730593, "grad_norm": 38.41493225097656, "learning_rate": 5.697615423642822e-06, "loss": 0.2072, "step": 5336 }, { "epoch": 4.873972602739726, "grad_norm": 49.2209358215332, "learning_rate": 5.696600710299341e-06, "loss": 0.2825, "step": 5337 }, { "epoch": 4.874885844748858, "grad_norm": 4.708596706390381, "learning_rate": 5.69558599695586e-06, "loss": 0.0212, "step": 5338 }, { "epoch": 4.875799086757991, "grad_norm": 7.986703872680664, "learning_rate": 5.69457128361238e-06, "loss": 0.0508, "step": 5339 }, { "epoch": 4.876712328767123, "grad_norm": 11.402205467224121, "learning_rate": 5.6935565702689e-06, "loss": 0.1134, "step": 5340 }, { "epoch": 4.877625570776256, "grad_norm": 0.22110377252101898, "learning_rate": 5.6925418569254186e-06, "loss": 0.001, "step": 5341 }, { "epoch": 4.878538812785388, "grad_norm": 5.32871150970459, "learning_rate": 5.691527143581938e-06, "loss": 0.0225, "step": 5342 }, { "epoch": 4.879452054794521, "grad_norm": 0.9396352171897888, "learning_rate": 5.690512430238459e-06, "loss": 0.0069, "step": 5343 }, { "epoch": 4.880365296803653, "grad_norm": 1.3424382209777832, "learning_rate": 5.689497716894977e-06, "loss": 0.0085, "step": 5344 }, { "epoch": 4.8812785388127855, "grad_norm": 14.205056190490723, "learning_rate": 5.688483003551497e-06, "loss": 0.0921, "step": 5345 }, { "epoch": 4.882191780821918, "grad_norm": 2.2756309509277344, "learning_rate": 5.687468290208017e-06, "loss": 0.0098, "step": 5346 }, { "epoch": 4.88310502283105, "grad_norm": 10.232769966125488, "learning_rate": 5.686453576864536e-06, "loss": 0.0672, "step": 5347 }, { "epoch": 4.884018264840183, "grad_norm": 2.938960075378418, "learning_rate": 5.6854388635210556e-06, "loss": 0.0205, "step": 5348 }, { "epoch": 4.884931506849315, "grad_norm": 3.203045129776001, "learning_rate": 5.684424150177575e-06, "loss": 0.0111, "step": 5349 }, { "epoch": 4.885844748858448, "grad_norm": 198.29617309570312, "learning_rate": 5.683409436834095e-06, "loss": 6.887, "step": 5350 }, { "epoch": 4.88675799086758, "grad_norm": 0.7502985000610352, "learning_rate": 5.682394723490614e-06, "loss": 0.0027, "step": 5351 }, { "epoch": 4.887671232876713, "grad_norm": 6.179208755493164, "learning_rate": 5.681380010147134e-06, "loss": 0.031, "step": 5352 }, { "epoch": 4.888584474885845, "grad_norm": 5.1571269035339355, "learning_rate": 5.680365296803654e-06, "loss": 0.0335, "step": 5353 }, { "epoch": 4.889497716894978, "grad_norm": 0.3082234859466553, "learning_rate": 5.679350583460173e-06, "loss": 0.0019, "step": 5354 }, { "epoch": 4.890410958904109, "grad_norm": 5.696011543273926, "learning_rate": 5.6783358701166926e-06, "loss": 0.0307, "step": 5355 }, { "epoch": 4.8913242009132425, "grad_norm": 75.4659423828125, "learning_rate": 5.677321156773212e-06, "loss": 0.796, "step": 5356 }, { "epoch": 4.892237442922374, "grad_norm": 0.37050414085388184, "learning_rate": 5.676306443429731e-06, "loss": 0.0025, "step": 5357 }, { "epoch": 4.8931506849315065, "grad_norm": 0.3384896218776703, "learning_rate": 5.675291730086251e-06, "loss": 0.0021, "step": 5358 }, { "epoch": 4.894063926940639, "grad_norm": 8.97120189666748, "learning_rate": 5.6742770167427705e-06, "loss": 0.0301, "step": 5359 }, { "epoch": 4.894977168949771, "grad_norm": 1.7509510517120361, "learning_rate": 5.673262303399291e-06, "loss": 0.0103, "step": 5360 }, { "epoch": 4.895890410958904, "grad_norm": 27.265594482421875, "learning_rate": 5.672247590055809e-06, "loss": 0.1592, "step": 5361 }, { "epoch": 4.896803652968036, "grad_norm": 4.6964111328125, "learning_rate": 5.6712328767123296e-06, "loss": 0.0234, "step": 5362 }, { "epoch": 4.897716894977169, "grad_norm": 87.75729370117188, "learning_rate": 5.670218163368849e-06, "loss": 0.4996, "step": 5363 }, { "epoch": 4.898630136986301, "grad_norm": 1.385829210281372, "learning_rate": 5.669203450025368e-06, "loss": 0.0069, "step": 5364 }, { "epoch": 4.899543378995434, "grad_norm": 16.093191146850586, "learning_rate": 5.668188736681888e-06, "loss": 0.0838, "step": 5365 }, { "epoch": 4.900456621004566, "grad_norm": 20.487804412841797, "learning_rate": 5.6671740233384075e-06, "loss": 0.1266, "step": 5366 }, { "epoch": 4.901369863013699, "grad_norm": 1.3714022636413574, "learning_rate": 5.666159309994926e-06, "loss": 0.0062, "step": 5367 }, { "epoch": 4.902283105022831, "grad_norm": 2.094073534011841, "learning_rate": 5.665144596651446e-06, "loss": 0.0145, "step": 5368 }, { "epoch": 4.9031963470319635, "grad_norm": 0.7645642161369324, "learning_rate": 5.664129883307966e-06, "loss": 0.0039, "step": 5369 }, { "epoch": 4.904109589041096, "grad_norm": 79.2953109741211, "learning_rate": 5.663115169964486e-06, "loss": 0.5156, "step": 5370 }, { "epoch": 4.905022831050228, "grad_norm": 14.072821617126465, "learning_rate": 5.662100456621005e-06, "loss": 0.0728, "step": 5371 }, { "epoch": 4.905936073059361, "grad_norm": 1.9930394887924194, "learning_rate": 5.661085743277525e-06, "loss": 0.0118, "step": 5372 }, { "epoch": 4.906849315068493, "grad_norm": 3.4806113243103027, "learning_rate": 5.6600710299340445e-06, "loss": 0.0277, "step": 5373 }, { "epoch": 4.907762557077626, "grad_norm": 3.2744369506835938, "learning_rate": 5.659056316590563e-06, "loss": 0.0251, "step": 5374 }, { "epoch": 4.908675799086758, "grad_norm": 17.026123046875, "learning_rate": 5.658041603247083e-06, "loss": 0.12, "step": 5375 }, { "epoch": 4.909589041095891, "grad_norm": 1.7791988849639893, "learning_rate": 5.657026889903603e-06, "loss": 0.0169, "step": 5376 }, { "epoch": 4.910502283105023, "grad_norm": 18.013805389404297, "learning_rate": 5.6560121765601216e-06, "loss": 0.1233, "step": 5377 }, { "epoch": 4.911415525114156, "grad_norm": 17.616823196411133, "learning_rate": 5.654997463216641e-06, "loss": 0.1264, "step": 5378 }, { "epoch": 4.912328767123288, "grad_norm": 1.6395812034606934, "learning_rate": 5.653982749873162e-06, "loss": 0.0078, "step": 5379 }, { "epoch": 4.91324200913242, "grad_norm": 4.866463661193848, "learning_rate": 5.6529680365296815e-06, "loss": 0.0346, "step": 5380 }, { "epoch": 4.914155251141553, "grad_norm": 63.30501937866211, "learning_rate": 5.6519533231862e-06, "loss": 0.3406, "step": 5381 }, { "epoch": 4.9150684931506845, "grad_norm": 2.064986228942871, "learning_rate": 5.65093860984272e-06, "loss": 0.0158, "step": 5382 }, { "epoch": 4.915981735159818, "grad_norm": 12.710531234741211, "learning_rate": 5.64992389649924e-06, "loss": 0.0606, "step": 5383 }, { "epoch": 4.916894977168949, "grad_norm": 2.8969886302948, "learning_rate": 5.6489091831557586e-06, "loss": 0.0172, "step": 5384 }, { "epoch": 4.917808219178082, "grad_norm": 0.27814367413520813, "learning_rate": 5.647894469812278e-06, "loss": 0.0016, "step": 5385 }, { "epoch": 4.918721461187214, "grad_norm": 21.060951232910156, "learning_rate": 5.646879756468798e-06, "loss": 0.1368, "step": 5386 }, { "epoch": 4.919634703196347, "grad_norm": 3.930816173553467, "learning_rate": 5.645865043125317e-06, "loss": 0.0367, "step": 5387 }, { "epoch": 4.920547945205479, "grad_norm": 1.0025886297225952, "learning_rate": 5.6448503297818365e-06, "loss": 0.0062, "step": 5388 }, { "epoch": 4.921461187214612, "grad_norm": 2.951523780822754, "learning_rate": 5.643835616438357e-06, "loss": 0.0167, "step": 5389 }, { "epoch": 4.922374429223744, "grad_norm": 0.10090874135494232, "learning_rate": 5.642820903094877e-06, "loss": 0.0006, "step": 5390 }, { "epoch": 4.923287671232877, "grad_norm": 0.19361674785614014, "learning_rate": 5.6418061897513955e-06, "loss": 0.0013, "step": 5391 }, { "epoch": 4.924200913242009, "grad_norm": 1.3701666593551636, "learning_rate": 5.640791476407915e-06, "loss": 0.0106, "step": 5392 }, { "epoch": 4.9251141552511415, "grad_norm": 2.674537181854248, "learning_rate": 5.639776763064435e-06, "loss": 0.0165, "step": 5393 }, { "epoch": 4.926027397260274, "grad_norm": 5.213809013366699, "learning_rate": 5.638762049720954e-06, "loss": 0.0284, "step": 5394 }, { "epoch": 4.926940639269406, "grad_norm": 0.21657918393611908, "learning_rate": 5.6377473363774735e-06, "loss": 0.0015, "step": 5395 }, { "epoch": 4.927853881278539, "grad_norm": 55.48455810546875, "learning_rate": 5.636732623033994e-06, "loss": 0.4058, "step": 5396 }, { "epoch": 4.928767123287671, "grad_norm": 5.632187843322754, "learning_rate": 5.635717909690512e-06, "loss": 0.0333, "step": 5397 }, { "epoch": 4.929680365296804, "grad_norm": 7.191441059112549, "learning_rate": 5.6347031963470325e-06, "loss": 0.0492, "step": 5398 }, { "epoch": 4.930593607305936, "grad_norm": 1.0178680419921875, "learning_rate": 5.633688483003552e-06, "loss": 0.0077, "step": 5399 }, { "epoch": 4.931506849315069, "grad_norm": 0.8090068697929382, "learning_rate": 5.632673769660072e-06, "loss": 0.0049, "step": 5400 }, { "epoch": 4.932420091324201, "grad_norm": 20.55438232421875, "learning_rate": 5.631659056316591e-06, "loss": 0.0969, "step": 5401 }, { "epoch": 4.933333333333334, "grad_norm": 1.1764312982559204, "learning_rate": 5.6306443429731105e-06, "loss": 0.0066, "step": 5402 }, { "epoch": 4.934246575342466, "grad_norm": 2.371250867843628, "learning_rate": 5.62962962962963e-06, "loss": 0.018, "step": 5403 }, { "epoch": 4.9351598173515985, "grad_norm": 24.41947364807129, "learning_rate": 5.628614916286149e-06, "loss": 0.1528, "step": 5404 }, { "epoch": 4.936073059360731, "grad_norm": 0.6217753291130066, "learning_rate": 5.627600202942669e-06, "loss": 0.0038, "step": 5405 }, { "epoch": 4.936986301369863, "grad_norm": 0.7860606908798218, "learning_rate": 5.626585489599189e-06, "loss": 0.0048, "step": 5406 }, { "epoch": 4.937899543378995, "grad_norm": 2.131850481033325, "learning_rate": 5.625570776255708e-06, "loss": 0.0075, "step": 5407 }, { "epoch": 4.938812785388128, "grad_norm": 0.2820683717727661, "learning_rate": 5.624556062912228e-06, "loss": 0.0024, "step": 5408 }, { "epoch": 4.93972602739726, "grad_norm": 82.45884704589844, "learning_rate": 5.6235413495687475e-06, "loss": 0.717, "step": 5409 }, { "epoch": 4.940639269406392, "grad_norm": 8.322848320007324, "learning_rate": 5.622526636225267e-06, "loss": 0.0519, "step": 5410 }, { "epoch": 4.941552511415525, "grad_norm": 3.9273650646209717, "learning_rate": 5.621511922881786e-06, "loss": 0.0191, "step": 5411 }, { "epoch": 4.942465753424657, "grad_norm": 20.11125373840332, "learning_rate": 5.620497209538306e-06, "loss": 0.1254, "step": 5412 }, { "epoch": 4.94337899543379, "grad_norm": 8.720454216003418, "learning_rate": 5.619482496194825e-06, "loss": 0.0467, "step": 5413 }, { "epoch": 4.944292237442922, "grad_norm": 0.05911979824304581, "learning_rate": 5.618467782851344e-06, "loss": 0.0005, "step": 5414 }, { "epoch": 4.945205479452055, "grad_norm": 2.4482312202453613, "learning_rate": 5.617453069507865e-06, "loss": 0.0134, "step": 5415 }, { "epoch": 4.946118721461187, "grad_norm": 3.655613660812378, "learning_rate": 5.6164383561643845e-06, "loss": 0.0231, "step": 5416 }, { "epoch": 4.9470319634703195, "grad_norm": 95.15876770019531, "learning_rate": 5.615423642820903e-06, "loss": 3.3572, "step": 5417 }, { "epoch": 4.947945205479452, "grad_norm": 12.414278030395508, "learning_rate": 5.614408929477423e-06, "loss": 0.0653, "step": 5418 }, { "epoch": 4.948858447488584, "grad_norm": 7.363667011260986, "learning_rate": 5.613394216133943e-06, "loss": 0.0143, "step": 5419 }, { "epoch": 4.949771689497717, "grad_norm": 1.392022967338562, "learning_rate": 5.612379502790462e-06, "loss": 0.011, "step": 5420 }, { "epoch": 4.950684931506849, "grad_norm": 22.255067825317383, "learning_rate": 5.611364789446981e-06, "loss": 0.1482, "step": 5421 }, { "epoch": 4.951598173515982, "grad_norm": 61.29143524169922, "learning_rate": 5.610350076103501e-06, "loss": 0.4929, "step": 5422 }, { "epoch": 4.952511415525114, "grad_norm": 0.23285971581935883, "learning_rate": 5.6093353627600215e-06, "loss": 0.002, "step": 5423 }, { "epoch": 4.953424657534247, "grad_norm": 0.17782175540924072, "learning_rate": 5.6083206494165395e-06, "loss": 0.0008, "step": 5424 }, { "epoch": 4.954337899543379, "grad_norm": 144.34487915039062, "learning_rate": 5.60730593607306e-06, "loss": 4.5287, "step": 5425 }, { "epoch": 4.955251141552512, "grad_norm": 4.00949239730835, "learning_rate": 5.60629122272958e-06, "loss": 0.0256, "step": 5426 }, { "epoch": 4.956164383561644, "grad_norm": 0.23481565713882446, "learning_rate": 5.6052765093860985e-06, "loss": 0.0017, "step": 5427 }, { "epoch": 4.9570776255707765, "grad_norm": 4.3316144943237305, "learning_rate": 5.604261796042618e-06, "loss": 0.0255, "step": 5428 }, { "epoch": 4.957990867579909, "grad_norm": 34.3664665222168, "learning_rate": 5.603247082699138e-06, "loss": 0.2201, "step": 5429 }, { "epoch": 4.958904109589041, "grad_norm": 7.537479400634766, "learning_rate": 5.602232369355658e-06, "loss": 0.0338, "step": 5430 }, { "epoch": 4.959817351598174, "grad_norm": 24.623971939086914, "learning_rate": 5.6012176560121765e-06, "loss": 0.1097, "step": 5431 }, { "epoch": 4.960730593607306, "grad_norm": 0.48994943499565125, "learning_rate": 5.600202942668697e-06, "loss": 0.0022, "step": 5432 }, { "epoch": 4.961643835616439, "grad_norm": 2.2211320400238037, "learning_rate": 5.599188229325217e-06, "loss": 0.0147, "step": 5433 }, { "epoch": 4.96255707762557, "grad_norm": 15.115062713623047, "learning_rate": 5.5981735159817355e-06, "loss": 0.0928, "step": 5434 }, { "epoch": 4.963470319634704, "grad_norm": 0.06220464035868645, "learning_rate": 5.597158802638255e-06, "loss": 0.0005, "step": 5435 }, { "epoch": 4.964383561643835, "grad_norm": 3.8174054622650146, "learning_rate": 5.596144089294775e-06, "loss": 0.0162, "step": 5436 }, { "epoch": 4.965296803652968, "grad_norm": 5.0422539710998535, "learning_rate": 5.595129375951294e-06, "loss": 0.0242, "step": 5437 }, { "epoch": 4.9662100456621, "grad_norm": 0.2660387456417084, "learning_rate": 5.5941146626078135e-06, "loss": 0.0018, "step": 5438 }, { "epoch": 4.967123287671233, "grad_norm": 7.509033203125, "learning_rate": 5.593099949264333e-06, "loss": 0.0405, "step": 5439 }, { "epoch": 4.968036529680365, "grad_norm": 0.06877650320529938, "learning_rate": 5.592085235920854e-06, "loss": 0.0004, "step": 5440 }, { "epoch": 4.9689497716894975, "grad_norm": 17.43589210510254, "learning_rate": 5.591070522577372e-06, "loss": 0.0995, "step": 5441 }, { "epoch": 4.96986301369863, "grad_norm": 0.2860652804374695, "learning_rate": 5.590055809233892e-06, "loss": 0.0011, "step": 5442 }, { "epoch": 4.970776255707762, "grad_norm": 0.4451531767845154, "learning_rate": 5.589041095890412e-06, "loss": 0.0026, "step": 5443 }, { "epoch": 4.971689497716895, "grad_norm": 112.40308380126953, "learning_rate": 5.588026382546931e-06, "loss": 3.0756, "step": 5444 }, { "epoch": 4.972602739726027, "grad_norm": 5.858927249908447, "learning_rate": 5.5870116692034504e-06, "loss": 0.0465, "step": 5445 }, { "epoch": 4.97351598173516, "grad_norm": 2.8087520599365234, "learning_rate": 5.58599695585997e-06, "loss": 0.0178, "step": 5446 }, { "epoch": 4.974429223744292, "grad_norm": 3.645634889602661, "learning_rate": 5.584982242516489e-06, "loss": 0.0201, "step": 5447 }, { "epoch": 4.975342465753425, "grad_norm": 1.7516191005706787, "learning_rate": 5.583967529173009e-06, "loss": 0.0092, "step": 5448 }, { "epoch": 4.976255707762557, "grad_norm": 19.128339767456055, "learning_rate": 5.582952815829528e-06, "loss": 0.096, "step": 5449 }, { "epoch": 4.9771689497716896, "grad_norm": 0.14991839230060577, "learning_rate": 5.581938102486049e-06, "loss": 0.001, "step": 5450 }, { "epoch": 4.978082191780822, "grad_norm": 0.6311704516410828, "learning_rate": 5.580923389142568e-06, "loss": 0.005, "step": 5451 }, { "epoch": 4.9789954337899545, "grad_norm": 7.219148635864258, "learning_rate": 5.5799086757990874e-06, "loss": 0.0486, "step": 5452 }, { "epoch": 4.979908675799087, "grad_norm": 0.05010334774851799, "learning_rate": 5.578893962455607e-06, "loss": 0.0004, "step": 5453 }, { "epoch": 4.980821917808219, "grad_norm": 0.32073405385017395, "learning_rate": 5.577879249112126e-06, "loss": 0.0023, "step": 5454 }, { "epoch": 4.981735159817352, "grad_norm": 0.32418715953826904, "learning_rate": 5.576864535768646e-06, "loss": 0.0023, "step": 5455 }, { "epoch": 4.982648401826484, "grad_norm": 1.0275617837905884, "learning_rate": 5.575849822425165e-06, "loss": 0.0046, "step": 5456 }, { "epoch": 4.983561643835617, "grad_norm": 9.482038497924805, "learning_rate": 5.574835109081684e-06, "loss": 0.0572, "step": 5457 }, { "epoch": 4.984474885844749, "grad_norm": 3.313321590423584, "learning_rate": 5.573820395738204e-06, "loss": 0.0194, "step": 5458 }, { "epoch": 4.985388127853882, "grad_norm": 8.37830924987793, "learning_rate": 5.5728056823947244e-06, "loss": 0.0572, "step": 5459 }, { "epoch": 4.986301369863014, "grad_norm": 11.794483184814453, "learning_rate": 5.571790969051244e-06, "loss": 0.0645, "step": 5460 }, { "epoch": 4.987214611872146, "grad_norm": 0.2005540430545807, "learning_rate": 5.570776255707763e-06, "loss": 0.0013, "step": 5461 }, { "epoch": 4.988127853881279, "grad_norm": 2.9759116172790527, "learning_rate": 5.569761542364283e-06, "loss": 0.0212, "step": 5462 }, { "epoch": 4.989041095890411, "grad_norm": 20.281248092651367, "learning_rate": 5.568746829020802e-06, "loss": 0.167, "step": 5463 }, { "epoch": 4.989954337899543, "grad_norm": 29.51638412475586, "learning_rate": 5.567732115677321e-06, "loss": 0.2209, "step": 5464 }, { "epoch": 4.9908675799086755, "grad_norm": 0.5209759473800659, "learning_rate": 5.566717402333841e-06, "loss": 0.0036, "step": 5465 }, { "epoch": 4.991780821917808, "grad_norm": 0.9470652341842651, "learning_rate": 5.565702688990361e-06, "loss": 0.0063, "step": 5466 }, { "epoch": 4.99269406392694, "grad_norm": 0.37484684586524963, "learning_rate": 5.5646879756468794e-06, "loss": 0.0026, "step": 5467 }, { "epoch": 4.993607305936073, "grad_norm": 76.919921875, "learning_rate": 5.563673262303399e-06, "loss": 0.8178, "step": 5468 }, { "epoch": 4.994520547945205, "grad_norm": 20.91408920288086, "learning_rate": 5.56265854895992e-06, "loss": 0.1216, "step": 5469 }, { "epoch": 4.995433789954338, "grad_norm": 5.201568603515625, "learning_rate": 5.561643835616439e-06, "loss": 0.0294, "step": 5470 }, { "epoch": 4.99634703196347, "grad_norm": 1.0674047470092773, "learning_rate": 5.560629122272958e-06, "loss": 0.0057, "step": 5471 }, { "epoch": 4.997260273972603, "grad_norm": 0.25743433833122253, "learning_rate": 5.559614408929478e-06, "loss": 0.0018, "step": 5472 }, { "epoch": 4.998173515981735, "grad_norm": 0.06422736495733261, "learning_rate": 5.558599695585998e-06, "loss": 0.0006, "step": 5473 }, { "epoch": 4.9990867579908675, "grad_norm": 59.80506134033203, "learning_rate": 5.5575849822425164e-06, "loss": 0.796, "step": 5474 }, { "epoch": 5.0, "grad_norm": 137.36029052734375, "learning_rate": 5.556570268899036e-06, "loss": 1.0621, "step": 5475 }, { "epoch": 5.0009132420091325, "grad_norm": 1.8592010736465454, "learning_rate": 5.555555555555557e-06, "loss": 0.0137, "step": 5476 }, { "epoch": 5.001826484018265, "grad_norm": 0.8554001450538635, "learning_rate": 5.554540842212075e-06, "loss": 0.006, "step": 5477 }, { "epoch": 5.002739726027397, "grad_norm": 0.219779834151268, "learning_rate": 5.553526128868595e-06, "loss": 0.0013, "step": 5478 }, { "epoch": 5.00365296803653, "grad_norm": 1.6103100776672363, "learning_rate": 5.552511415525115e-06, "loss": 0.012, "step": 5479 }, { "epoch": 5.004566210045662, "grad_norm": 0.6246341466903687, "learning_rate": 5.551496702181635e-06, "loss": 0.0043, "step": 5480 }, { "epoch": 5.005479452054795, "grad_norm": 9.436360359191895, "learning_rate": 5.5504819888381534e-06, "loss": 0.046, "step": 5481 }, { "epoch": 5.006392694063927, "grad_norm": 2.203310012817383, "learning_rate": 5.549467275494673e-06, "loss": 0.0178, "step": 5482 }, { "epoch": 5.00730593607306, "grad_norm": 5.617350101470947, "learning_rate": 5.548452562151193e-06, "loss": 0.0261, "step": 5483 }, { "epoch": 5.008219178082192, "grad_norm": 0.5143392086029053, "learning_rate": 5.547437848807712e-06, "loss": 0.0047, "step": 5484 }, { "epoch": 5.0091324200913245, "grad_norm": 2.661870241165161, "learning_rate": 5.546423135464231e-06, "loss": 0.0153, "step": 5485 }, { "epoch": 5.010045662100457, "grad_norm": 0.05782362073659897, "learning_rate": 5.545408422120752e-06, "loss": 0.0004, "step": 5486 }, { "epoch": 5.010958904109589, "grad_norm": 53.996028900146484, "learning_rate": 5.544393708777271e-06, "loss": 0.3706, "step": 5487 }, { "epoch": 5.011872146118722, "grad_norm": 0.8575087785720825, "learning_rate": 5.5433789954337904e-06, "loss": 0.0042, "step": 5488 }, { "epoch": 5.0127853881278535, "grad_norm": 5.770202159881592, "learning_rate": 5.54236428209031e-06, "loss": 0.0278, "step": 5489 }, { "epoch": 5.013698630136986, "grad_norm": 4.791797637939453, "learning_rate": 5.54134956874683e-06, "loss": 0.0295, "step": 5490 }, { "epoch": 5.014611872146118, "grad_norm": 10.715629577636719, "learning_rate": 5.540334855403349e-06, "loss": 0.0706, "step": 5491 }, { "epoch": 5.015525114155251, "grad_norm": 0.46019452810287476, "learning_rate": 5.539320142059868e-06, "loss": 0.0027, "step": 5492 }, { "epoch": 5.016438356164383, "grad_norm": 0.07505905628204346, "learning_rate": 5.538305428716388e-06, "loss": 0.0004, "step": 5493 }, { "epoch": 5.017351598173516, "grad_norm": 0.4682457149028778, "learning_rate": 5.537290715372907e-06, "loss": 0.0033, "step": 5494 }, { "epoch": 5.018264840182648, "grad_norm": 6.901314735412598, "learning_rate": 5.5362760020294274e-06, "loss": 0.0418, "step": 5495 }, { "epoch": 5.019178082191781, "grad_norm": 13.804346084594727, "learning_rate": 5.535261288685947e-06, "loss": 0.0895, "step": 5496 }, { "epoch": 5.020091324200913, "grad_norm": 2.301741361618042, "learning_rate": 5.534246575342466e-06, "loss": 0.0159, "step": 5497 }, { "epoch": 5.0210045662100455, "grad_norm": 0.017937971279025078, "learning_rate": 5.533231861998986e-06, "loss": 0.0001, "step": 5498 }, { "epoch": 5.021917808219178, "grad_norm": 7.548657417297363, "learning_rate": 5.532217148655505e-06, "loss": 0.0413, "step": 5499 }, { "epoch": 5.0228310502283104, "grad_norm": 5.691102504730225, "learning_rate": 5.531202435312025e-06, "loss": 0.0442, "step": 5500 }, { "epoch": 5.023744292237443, "grad_norm": 2.9965341091156006, "learning_rate": 5.530187721968544e-06, "loss": 0.0257, "step": 5501 }, { "epoch": 5.024657534246575, "grad_norm": 82.42205047607422, "learning_rate": 5.529173008625064e-06, "loss": 2.076, "step": 5502 }, { "epoch": 5.025570776255708, "grad_norm": 11.193077087402344, "learning_rate": 5.528158295281584e-06, "loss": 0.0652, "step": 5503 }, { "epoch": 5.02648401826484, "grad_norm": 12.638575553894043, "learning_rate": 5.527143581938102e-06, "loss": 0.053, "step": 5504 }, { "epoch": 5.027397260273973, "grad_norm": 2.327241897583008, "learning_rate": 5.526128868594623e-06, "loss": 0.0122, "step": 5505 }, { "epoch": 5.028310502283105, "grad_norm": 24.429811477661133, "learning_rate": 5.525114155251142e-06, "loss": 0.1781, "step": 5506 }, { "epoch": 5.029223744292238, "grad_norm": 1.19810950756073, "learning_rate": 5.524099441907661e-06, "loss": 0.0058, "step": 5507 }, { "epoch": 5.03013698630137, "grad_norm": 10.753724098205566, "learning_rate": 5.523084728564181e-06, "loss": 0.0778, "step": 5508 }, { "epoch": 5.0310502283105025, "grad_norm": 2.026860237121582, "learning_rate": 5.522070015220701e-06, "loss": 0.0133, "step": 5509 }, { "epoch": 5.031963470319635, "grad_norm": 0.5967985391616821, "learning_rate": 5.52105530187722e-06, "loss": 0.0032, "step": 5510 }, { "epoch": 5.032876712328767, "grad_norm": 20.501588821411133, "learning_rate": 5.520040588533739e-06, "loss": 0.1065, "step": 5511 }, { "epoch": 5.0337899543379, "grad_norm": 2.4714951515197754, "learning_rate": 5.519025875190259e-06, "loss": 0.0135, "step": 5512 }, { "epoch": 5.034703196347032, "grad_norm": 0.4338850975036621, "learning_rate": 5.518011161846779e-06, "loss": 0.0031, "step": 5513 }, { "epoch": 5.035616438356165, "grad_norm": 3.417201042175293, "learning_rate": 5.516996448503298e-06, "loss": 0.0182, "step": 5514 }, { "epoch": 5.036529680365296, "grad_norm": 0.3120671510696411, "learning_rate": 5.515981735159818e-06, "loss": 0.002, "step": 5515 }, { "epoch": 5.037442922374429, "grad_norm": 1.9949616193771362, "learning_rate": 5.5149670218163376e-06, "loss": 0.0116, "step": 5516 }, { "epoch": 5.038356164383561, "grad_norm": 19.720722198486328, "learning_rate": 5.513952308472856e-06, "loss": 0.1478, "step": 5517 }, { "epoch": 5.039269406392694, "grad_norm": 5.872110843658447, "learning_rate": 5.512937595129376e-06, "loss": 0.0294, "step": 5518 }, { "epoch": 5.040182648401826, "grad_norm": 0.7720698714256287, "learning_rate": 5.511922881785896e-06, "loss": 0.0059, "step": 5519 }, { "epoch": 5.041095890410959, "grad_norm": 1.4146559238433838, "learning_rate": 5.510908168442416e-06, "loss": 0.0084, "step": 5520 }, { "epoch": 5.042009132420091, "grad_norm": 0.5466907620429993, "learning_rate": 5.509893455098934e-06, "loss": 0.0035, "step": 5521 }, { "epoch": 5.0429223744292235, "grad_norm": 14.005912780761719, "learning_rate": 5.508878741755455e-06, "loss": 0.0838, "step": 5522 }, { "epoch": 5.043835616438356, "grad_norm": 0.039984676986932755, "learning_rate": 5.5078640284119746e-06, "loss": 0.0003, "step": 5523 }, { "epoch": 5.044748858447488, "grad_norm": 0.2411375492811203, "learning_rate": 5.506849315068493e-06, "loss": 0.0017, "step": 5524 }, { "epoch": 5.045662100456621, "grad_norm": 2.230583667755127, "learning_rate": 5.505834601725013e-06, "loss": 0.0142, "step": 5525 }, { "epoch": 5.046575342465753, "grad_norm": 1.8301682472229004, "learning_rate": 5.504819888381533e-06, "loss": 0.0114, "step": 5526 }, { "epoch": 5.047488584474886, "grad_norm": 0.44080543518066406, "learning_rate": 5.503805175038052e-06, "loss": 0.0034, "step": 5527 }, { "epoch": 5.048401826484018, "grad_norm": 0.927176833152771, "learning_rate": 5.502790461694571e-06, "loss": 0.0052, "step": 5528 }, { "epoch": 5.049315068493151, "grad_norm": 8.323750495910645, "learning_rate": 5.501775748351091e-06, "loss": 0.0438, "step": 5529 }, { "epoch": 5.050228310502283, "grad_norm": 0.6184713840484619, "learning_rate": 5.5007610350076116e-06, "loss": 0.0037, "step": 5530 }, { "epoch": 5.051141552511416, "grad_norm": 8.130250930786133, "learning_rate": 5.49974632166413e-06, "loss": 0.0372, "step": 5531 }, { "epoch": 5.052054794520548, "grad_norm": 0.3819190263748169, "learning_rate": 5.49873160832065e-06, "loss": 0.0016, "step": 5532 }, { "epoch": 5.0529680365296805, "grad_norm": 0.1593330055475235, "learning_rate": 5.49771689497717e-06, "loss": 0.0012, "step": 5533 }, { "epoch": 5.053881278538813, "grad_norm": 82.85529327392578, "learning_rate": 5.496702181633689e-06, "loss": 0.7785, "step": 5534 }, { "epoch": 5.054794520547945, "grad_norm": 22.60142707824707, "learning_rate": 5.495687468290208e-06, "loss": 0.1294, "step": 5535 }, { "epoch": 5.055707762557078, "grad_norm": 11.57682991027832, "learning_rate": 5.494672754946728e-06, "loss": 0.0943, "step": 5536 }, { "epoch": 5.05662100456621, "grad_norm": 6.3450164794921875, "learning_rate": 5.493658041603247e-06, "loss": 0.0189, "step": 5537 }, { "epoch": 5.057534246575343, "grad_norm": 0.23607227206230164, "learning_rate": 5.4926433282597666e-06, "loss": 0.0016, "step": 5538 }, { "epoch": 5.058447488584475, "grad_norm": 9.173283576965332, "learning_rate": 5.491628614916287e-06, "loss": 0.0614, "step": 5539 }, { "epoch": 5.059360730593608, "grad_norm": 14.821418762207031, "learning_rate": 5.490613901572807e-06, "loss": 0.0366, "step": 5540 }, { "epoch": 5.06027397260274, "grad_norm": 0.28129780292510986, "learning_rate": 5.489599188229326e-06, "loss": 0.0018, "step": 5541 }, { "epoch": 5.061187214611872, "grad_norm": 1.9172580242156982, "learning_rate": 5.488584474885845e-06, "loss": 0.0114, "step": 5542 }, { "epoch": 5.062100456621004, "grad_norm": 10.581048011779785, "learning_rate": 5.487569761542365e-06, "loss": 0.0484, "step": 5543 }, { "epoch": 5.063013698630137, "grad_norm": 14.80334758758545, "learning_rate": 5.486555048198884e-06, "loss": 0.1033, "step": 5544 }, { "epoch": 5.063926940639269, "grad_norm": 0.49257892370224, "learning_rate": 5.4855403348554036e-06, "loss": 0.0034, "step": 5545 }, { "epoch": 5.0648401826484015, "grad_norm": 52.15130615234375, "learning_rate": 5.484525621511923e-06, "loss": 0.7505, "step": 5546 }, { "epoch": 5.065753424657534, "grad_norm": 0.8353469967842102, "learning_rate": 5.483510908168442e-06, "loss": 0.0064, "step": 5547 }, { "epoch": 5.066666666666666, "grad_norm": 1.3413138389587402, "learning_rate": 5.482496194824962e-06, "loss": 0.0075, "step": 5548 }, { "epoch": 5.067579908675799, "grad_norm": 0.42972975969314575, "learning_rate": 5.481481481481482e-06, "loss": 0.0024, "step": 5549 }, { "epoch": 5.068493150684931, "grad_norm": 6.691566467285156, "learning_rate": 5.480466768138002e-06, "loss": 0.0438, "step": 5550 }, { "epoch": 5.069406392694064, "grad_norm": 2.791764736175537, "learning_rate": 5.479452054794521e-06, "loss": 0.0137, "step": 5551 }, { "epoch": 5.070319634703196, "grad_norm": 2.0490944385528564, "learning_rate": 5.4784373414510406e-06, "loss": 0.0139, "step": 5552 }, { "epoch": 5.071232876712329, "grad_norm": 1.965303897857666, "learning_rate": 5.47742262810756e-06, "loss": 0.0169, "step": 5553 }, { "epoch": 5.072146118721461, "grad_norm": 7.066697597503662, "learning_rate": 5.476407914764079e-06, "loss": 0.0511, "step": 5554 }, { "epoch": 5.073059360730594, "grad_norm": 40.38325881958008, "learning_rate": 5.475393201420599e-06, "loss": 0.2699, "step": 5555 }, { "epoch": 5.073972602739726, "grad_norm": 0.07962317764759064, "learning_rate": 5.474378488077119e-06, "loss": 0.0006, "step": 5556 }, { "epoch": 5.0748858447488585, "grad_norm": 2.3774025440216064, "learning_rate": 5.473363774733637e-06, "loss": 0.0113, "step": 5557 }, { "epoch": 5.075799086757991, "grad_norm": 12.822922706604004, "learning_rate": 5.472349061390158e-06, "loss": 0.0873, "step": 5558 }, { "epoch": 5.076712328767123, "grad_norm": 39.13115692138672, "learning_rate": 5.4713343480466776e-06, "loss": 0.3076, "step": 5559 }, { "epoch": 5.077625570776256, "grad_norm": 8.14199447631836, "learning_rate": 5.470319634703197e-06, "loss": 0.0496, "step": 5560 }, { "epoch": 5.078538812785388, "grad_norm": 0.02541305683553219, "learning_rate": 5.469304921359716e-06, "loss": 0.0001, "step": 5561 }, { "epoch": 5.079452054794521, "grad_norm": 17.97801399230957, "learning_rate": 5.468290208016236e-06, "loss": 0.0839, "step": 5562 }, { "epoch": 5.080365296803653, "grad_norm": 0.74183189868927, "learning_rate": 5.4672754946727555e-06, "loss": 0.0044, "step": 5563 }, { "epoch": 5.081278538812786, "grad_norm": 9.618182182312012, "learning_rate": 5.466260781329274e-06, "loss": 0.0579, "step": 5564 }, { "epoch": 5.082191780821918, "grad_norm": 0.6592544317245483, "learning_rate": 5.465246067985794e-06, "loss": 0.0039, "step": 5565 }, { "epoch": 5.083105022831051, "grad_norm": 0.6674944162368774, "learning_rate": 5.4642313546423146e-06, "loss": 0.0033, "step": 5566 }, { "epoch": 5.084018264840183, "grad_norm": 12.05002498626709, "learning_rate": 5.463216641298833e-06, "loss": 0.0708, "step": 5567 }, { "epoch": 5.0849315068493155, "grad_norm": 14.746060371398926, "learning_rate": 5.462201927955353e-06, "loss": 0.1144, "step": 5568 }, { "epoch": 5.085844748858447, "grad_norm": 0.5686160922050476, "learning_rate": 5.461187214611873e-06, "loss": 0.0029, "step": 5569 }, { "epoch": 5.0867579908675795, "grad_norm": 0.015150058083236217, "learning_rate": 5.4601725012683925e-06, "loss": 0.0001, "step": 5570 }, { "epoch": 5.087671232876712, "grad_norm": 17.434734344482422, "learning_rate": 5.459157787924911e-06, "loss": 0.0853, "step": 5571 }, { "epoch": 5.088584474885844, "grad_norm": 1.9562788009643555, "learning_rate": 5.458143074581431e-06, "loss": 0.0149, "step": 5572 }, { "epoch": 5.089497716894977, "grad_norm": 0.34068456292152405, "learning_rate": 5.457128361237951e-06, "loss": 0.0017, "step": 5573 }, { "epoch": 5.090410958904109, "grad_norm": 30.247621536254883, "learning_rate": 5.4561136478944696e-06, "loss": 0.2006, "step": 5574 }, { "epoch": 5.091324200913242, "grad_norm": 5.925263404846191, "learning_rate": 5.45509893455099e-06, "loss": 0.029, "step": 5575 }, { "epoch": 5.092237442922374, "grad_norm": 2.2210562229156494, "learning_rate": 5.45408422120751e-06, "loss": 0.0107, "step": 5576 }, { "epoch": 5.093150684931507, "grad_norm": 0.23944641649723053, "learning_rate": 5.453069507864029e-06, "loss": 0.0012, "step": 5577 }, { "epoch": 5.094063926940639, "grad_norm": 0.3658835291862488, "learning_rate": 5.452054794520548e-06, "loss": 0.0025, "step": 5578 }, { "epoch": 5.094977168949772, "grad_norm": 98.62445831298828, "learning_rate": 5.451040081177068e-06, "loss": 0.9803, "step": 5579 }, { "epoch": 5.095890410958904, "grad_norm": 13.58236312866211, "learning_rate": 5.450025367833588e-06, "loss": 0.0619, "step": 5580 }, { "epoch": 5.0968036529680365, "grad_norm": 0.21532201766967773, "learning_rate": 5.4490106544901065e-06, "loss": 0.0014, "step": 5581 }, { "epoch": 5.097716894977169, "grad_norm": 3.879037857055664, "learning_rate": 5.447995941146626e-06, "loss": 0.0131, "step": 5582 }, { "epoch": 5.098630136986301, "grad_norm": 0.6483006477355957, "learning_rate": 5.446981227803147e-06, "loss": 0.0051, "step": 5583 }, { "epoch": 5.099543378995434, "grad_norm": 1.226011872291565, "learning_rate": 5.445966514459665e-06, "loss": 0.0068, "step": 5584 }, { "epoch": 5.100456621004566, "grad_norm": 9.126286506652832, "learning_rate": 5.444951801116185e-06, "loss": 0.0533, "step": 5585 }, { "epoch": 5.101369863013699, "grad_norm": 125.2040786743164, "learning_rate": 5.443937087772705e-06, "loss": 2.4987, "step": 5586 }, { "epoch": 5.102283105022831, "grad_norm": 23.350248336791992, "learning_rate": 5.442922374429224e-06, "loss": 0.1738, "step": 5587 }, { "epoch": 5.103196347031964, "grad_norm": 0.3449239134788513, "learning_rate": 5.4419076610857435e-06, "loss": 0.0027, "step": 5588 }, { "epoch": 5.104109589041096, "grad_norm": 1.0423671007156372, "learning_rate": 5.440892947742263e-06, "loss": 0.0094, "step": 5589 }, { "epoch": 5.105022831050229, "grad_norm": 5.13532018661499, "learning_rate": 5.439878234398783e-06, "loss": 0.045, "step": 5590 }, { "epoch": 5.105936073059361, "grad_norm": 0.35462597012519836, "learning_rate": 5.438863521055302e-06, "loss": 0.0027, "step": 5591 }, { "epoch": 5.1068493150684935, "grad_norm": 0.885878324508667, "learning_rate": 5.4378488077118215e-06, "loss": 0.0046, "step": 5592 }, { "epoch": 5.107762557077626, "grad_norm": 2.014937162399292, "learning_rate": 5.436834094368342e-06, "loss": 0.0122, "step": 5593 }, { "epoch": 5.108675799086758, "grad_norm": 0.7569983005523682, "learning_rate": 5.435819381024861e-06, "loss": 0.0044, "step": 5594 }, { "epoch": 5.109589041095891, "grad_norm": 17.319129943847656, "learning_rate": 5.4348046676813805e-06, "loss": 0.1313, "step": 5595 }, { "epoch": 5.110502283105022, "grad_norm": 5.568841934204102, "learning_rate": 5.4337899543379e-06, "loss": 0.0416, "step": 5596 }, { "epoch": 5.111415525114155, "grad_norm": 7.659235000610352, "learning_rate": 5.432775240994419e-06, "loss": 0.0462, "step": 5597 }, { "epoch": 5.112328767123287, "grad_norm": 0.4821767807006836, "learning_rate": 5.431760527650939e-06, "loss": 0.0044, "step": 5598 }, { "epoch": 5.11324200913242, "grad_norm": 12.694847106933594, "learning_rate": 5.4307458143074585e-06, "loss": 0.0831, "step": 5599 }, { "epoch": 5.114155251141552, "grad_norm": 5.8571062088012695, "learning_rate": 5.429731100963979e-06, "loss": 0.0349, "step": 5600 }, { "epoch": 5.115068493150685, "grad_norm": 0.4342265725135803, "learning_rate": 5.428716387620497e-06, "loss": 0.0027, "step": 5601 }, { "epoch": 5.115981735159817, "grad_norm": 1.4592543840408325, "learning_rate": 5.4277016742770175e-06, "loss": 0.0113, "step": 5602 }, { "epoch": 5.11689497716895, "grad_norm": 9.08432674407959, "learning_rate": 5.426686960933537e-06, "loss": 0.0442, "step": 5603 }, { "epoch": 5.117808219178082, "grad_norm": 0.6572741270065308, "learning_rate": 5.425672247590056e-06, "loss": 0.0029, "step": 5604 }, { "epoch": 5.1187214611872145, "grad_norm": 5.284173488616943, "learning_rate": 5.424657534246576e-06, "loss": 0.0359, "step": 5605 }, { "epoch": 5.119634703196347, "grad_norm": 10.63306713104248, "learning_rate": 5.4236428209030955e-06, "loss": 0.0697, "step": 5606 }, { "epoch": 5.120547945205479, "grad_norm": 0.09304304420948029, "learning_rate": 5.422628107559614e-06, "loss": 0.0006, "step": 5607 }, { "epoch": 5.121461187214612, "grad_norm": 40.61052703857422, "learning_rate": 5.421613394216134e-06, "loss": 0.2615, "step": 5608 }, { "epoch": 5.122374429223744, "grad_norm": 0.12328722327947617, "learning_rate": 5.420598680872654e-06, "loss": 0.0008, "step": 5609 }, { "epoch": 5.123287671232877, "grad_norm": 0.11123239994049072, "learning_rate": 5.419583967529174e-06, "loss": 0.0005, "step": 5610 }, { "epoch": 5.124200913242009, "grad_norm": 1.0946921110153198, "learning_rate": 5.418569254185693e-06, "loss": 0.0031, "step": 5611 }, { "epoch": 5.125114155251142, "grad_norm": 0.27616602182388306, "learning_rate": 5.417554540842213e-06, "loss": 0.0019, "step": 5612 }, { "epoch": 5.126027397260274, "grad_norm": 5.884853839874268, "learning_rate": 5.4165398274987325e-06, "loss": 0.0364, "step": 5613 }, { "epoch": 5.126940639269407, "grad_norm": 1.0119253396987915, "learning_rate": 5.415525114155251e-06, "loss": 0.005, "step": 5614 }, { "epoch": 5.127853881278539, "grad_norm": 11.031585693359375, "learning_rate": 5.414510400811771e-06, "loss": 0.0576, "step": 5615 }, { "epoch": 5.1287671232876715, "grad_norm": 0.5618900656700134, "learning_rate": 5.413495687468291e-06, "loss": 0.0029, "step": 5616 }, { "epoch": 5.129680365296804, "grad_norm": 10.756733894348145, "learning_rate": 5.4124809741248095e-06, "loss": 0.0468, "step": 5617 }, { "epoch": 5.130593607305936, "grad_norm": 34.866302490234375, "learning_rate": 5.411466260781329e-06, "loss": 0.3335, "step": 5618 }, { "epoch": 5.131506849315069, "grad_norm": 1.346643090248108, "learning_rate": 5.41045154743785e-06, "loss": 0.0086, "step": 5619 }, { "epoch": 5.132420091324201, "grad_norm": 1.7759383916854858, "learning_rate": 5.4094368340943695e-06, "loss": 0.0137, "step": 5620 }, { "epoch": 5.133333333333334, "grad_norm": 3.4204108715057373, "learning_rate": 5.408422120750888e-06, "loss": 0.0234, "step": 5621 }, { "epoch": 5.134246575342466, "grad_norm": 4.90950870513916, "learning_rate": 5.407407407407408e-06, "loss": 0.0226, "step": 5622 }, { "epoch": 5.135159817351598, "grad_norm": 5.964973449707031, "learning_rate": 5.406392694063928e-06, "loss": 0.0266, "step": 5623 }, { "epoch": 5.13607305936073, "grad_norm": 4.206118583679199, "learning_rate": 5.4053779807204465e-06, "loss": 0.0354, "step": 5624 }, { "epoch": 5.136986301369863, "grad_norm": 28.85500717163086, "learning_rate": 5.404363267376966e-06, "loss": 0.011, "step": 5625 }, { "epoch": 5.137899543378995, "grad_norm": 0.8943412899971008, "learning_rate": 5.403348554033486e-06, "loss": 0.0064, "step": 5626 }, { "epoch": 5.138812785388128, "grad_norm": 0.2628285586833954, "learning_rate": 5.402333840690005e-06, "loss": 0.0011, "step": 5627 }, { "epoch": 5.13972602739726, "grad_norm": 3.430936336517334, "learning_rate": 5.4013191273465245e-06, "loss": 0.0218, "step": 5628 }, { "epoch": 5.1406392694063925, "grad_norm": 2.0890860557556152, "learning_rate": 5.400304414003045e-06, "loss": 0.0154, "step": 5629 }, { "epoch": 5.141552511415525, "grad_norm": 0.8027562499046326, "learning_rate": 5.399289700659565e-06, "loss": 0.0033, "step": 5630 }, { "epoch": 5.142465753424657, "grad_norm": 0.10565441101789474, "learning_rate": 5.3982749873160835e-06, "loss": 0.0008, "step": 5631 }, { "epoch": 5.14337899543379, "grad_norm": 1.2325191497802734, "learning_rate": 5.397260273972603e-06, "loss": 0.0063, "step": 5632 }, { "epoch": 5.144292237442922, "grad_norm": 2.0925254821777344, "learning_rate": 5.396245560629123e-06, "loss": 0.0104, "step": 5633 }, { "epoch": 5.145205479452055, "grad_norm": 0.9362099766731262, "learning_rate": 5.395230847285642e-06, "loss": 0.0071, "step": 5634 }, { "epoch": 5.146118721461187, "grad_norm": 19.86600112915039, "learning_rate": 5.3942161339421614e-06, "loss": 0.0749, "step": 5635 }, { "epoch": 5.14703196347032, "grad_norm": 0.031410474330186844, "learning_rate": 5.393201420598682e-06, "loss": 0.0002, "step": 5636 }, { "epoch": 5.147945205479452, "grad_norm": 3.082075834274292, "learning_rate": 5.3921867072552e-06, "loss": 0.0134, "step": 5637 }, { "epoch": 5.148858447488585, "grad_norm": 8.352127075195312, "learning_rate": 5.3911719939117205e-06, "loss": 0.0607, "step": 5638 }, { "epoch": 5.149771689497717, "grad_norm": 19.5617733001709, "learning_rate": 5.39015728056824e-06, "loss": 0.0822, "step": 5639 }, { "epoch": 5.1506849315068495, "grad_norm": 9.941794395446777, "learning_rate": 5.38914256722476e-06, "loss": 0.0512, "step": 5640 }, { "epoch": 5.151598173515982, "grad_norm": 0.3928012251853943, "learning_rate": 5.388127853881279e-06, "loss": 0.0021, "step": 5641 }, { "epoch": 5.152511415525114, "grad_norm": 1.7927541732788086, "learning_rate": 5.3871131405377984e-06, "loss": 0.0075, "step": 5642 }, { "epoch": 5.153424657534247, "grad_norm": 1.19281804561615, "learning_rate": 5.386098427194318e-06, "loss": 0.0061, "step": 5643 }, { "epoch": 5.154337899543379, "grad_norm": 0.0660158172249794, "learning_rate": 5.385083713850837e-06, "loss": 0.0005, "step": 5644 }, { "epoch": 5.155251141552512, "grad_norm": 0.9148076772689819, "learning_rate": 5.384069000507357e-06, "loss": 0.0072, "step": 5645 }, { "epoch": 5.156164383561644, "grad_norm": 2.2701563835144043, "learning_rate": 5.383054287163877e-06, "loss": 0.0114, "step": 5646 }, { "epoch": 5.157077625570777, "grad_norm": 5.054596424102783, "learning_rate": 5.382039573820396e-06, "loss": 0.0365, "step": 5647 }, { "epoch": 5.157990867579909, "grad_norm": 2.473222255706787, "learning_rate": 5.381024860476916e-06, "loss": 0.0209, "step": 5648 }, { "epoch": 5.1589041095890416, "grad_norm": 1.6558570861816406, "learning_rate": 5.3800101471334354e-06, "loss": 0.0106, "step": 5649 }, { "epoch": 5.159817351598173, "grad_norm": 1.1688822507858276, "learning_rate": 5.378995433789955e-06, "loss": 0.0053, "step": 5650 }, { "epoch": 5.160730593607306, "grad_norm": 0.22779174149036407, "learning_rate": 5.377980720446474e-06, "loss": 0.0016, "step": 5651 }, { "epoch": 5.161643835616438, "grad_norm": 0.014785866253077984, "learning_rate": 5.376966007102994e-06, "loss": 0.0001, "step": 5652 }, { "epoch": 5.1625570776255705, "grad_norm": 64.45268249511719, "learning_rate": 5.375951293759513e-06, "loss": 0.7307, "step": 5653 }, { "epoch": 5.163470319634703, "grad_norm": 6.057159423828125, "learning_rate": 5.374936580416032e-06, "loss": 0.0348, "step": 5654 }, { "epoch": 5.164383561643835, "grad_norm": 41.146034240722656, "learning_rate": 5.373921867072553e-06, "loss": 0.2474, "step": 5655 }, { "epoch": 5.165296803652968, "grad_norm": 0.023750007152557373, "learning_rate": 5.3729071537290724e-06, "loss": 0.0002, "step": 5656 }, { "epoch": 5.1662100456621, "grad_norm": 5.242729187011719, "learning_rate": 5.371892440385591e-06, "loss": 0.0303, "step": 5657 }, { "epoch": 5.167123287671233, "grad_norm": 101.02428436279297, "learning_rate": 5.370877727042111e-06, "loss": 1.0448, "step": 5658 }, { "epoch": 5.168036529680365, "grad_norm": 1.861724615097046, "learning_rate": 5.369863013698631e-06, "loss": 0.0079, "step": 5659 }, { "epoch": 5.168949771689498, "grad_norm": 0.909873366355896, "learning_rate": 5.36884830035515e-06, "loss": 0.0025, "step": 5660 }, { "epoch": 5.16986301369863, "grad_norm": 2.405837297439575, "learning_rate": 5.367833587011669e-06, "loss": 0.0198, "step": 5661 }, { "epoch": 5.170776255707763, "grad_norm": 17.23872947692871, "learning_rate": 5.366818873668189e-06, "loss": 0.1119, "step": 5662 }, { "epoch": 5.171689497716895, "grad_norm": 2.236060380935669, "learning_rate": 5.3658041603247094e-06, "loss": 0.0097, "step": 5663 }, { "epoch": 5.1726027397260275, "grad_norm": 1.1918220520019531, "learning_rate": 5.3647894469812274e-06, "loss": 0.0075, "step": 5664 }, { "epoch": 5.17351598173516, "grad_norm": 30.446752548217773, "learning_rate": 5.363774733637748e-06, "loss": 0.315, "step": 5665 }, { "epoch": 5.174429223744292, "grad_norm": 0.4804019629955292, "learning_rate": 5.362760020294268e-06, "loss": 0.0028, "step": 5666 }, { "epoch": 5.175342465753425, "grad_norm": 0.053160276263952255, "learning_rate": 5.3617453069507865e-06, "loss": 0.0004, "step": 5667 }, { "epoch": 5.176255707762557, "grad_norm": 0.020769724622368813, "learning_rate": 5.360730593607306e-06, "loss": 0.0001, "step": 5668 }, { "epoch": 5.17716894977169, "grad_norm": 18.2865047454834, "learning_rate": 5.359715880263826e-06, "loss": 0.0916, "step": 5669 }, { "epoch": 5.178082191780822, "grad_norm": 0.2852633595466614, "learning_rate": 5.358701166920346e-06, "loss": 0.0014, "step": 5670 }, { "epoch": 5.178995433789955, "grad_norm": 22.483362197875977, "learning_rate": 5.3576864535768644e-06, "loss": 0.1639, "step": 5671 }, { "epoch": 5.179908675799087, "grad_norm": 8.732645988464355, "learning_rate": 5.356671740233384e-06, "loss": 0.0477, "step": 5672 }, { "epoch": 5.1808219178082195, "grad_norm": 0.4667666256427765, "learning_rate": 5.355657026889905e-06, "loss": 0.0025, "step": 5673 }, { "epoch": 5.181735159817352, "grad_norm": 8.466375350952148, "learning_rate": 5.3546423135464235e-06, "loss": 0.0695, "step": 5674 }, { "epoch": 5.182648401826484, "grad_norm": 93.75572204589844, "learning_rate": 5.353627600202943e-06, "loss": 0.8467, "step": 5675 }, { "epoch": 5.183561643835616, "grad_norm": 3.7304763793945312, "learning_rate": 5.352612886859463e-06, "loss": 0.026, "step": 5676 }, { "epoch": 5.1844748858447485, "grad_norm": 0.6673040390014648, "learning_rate": 5.351598173515982e-06, "loss": 0.0052, "step": 5677 }, { "epoch": 5.185388127853881, "grad_norm": 13.76888656616211, "learning_rate": 5.3505834601725014e-06, "loss": 0.1168, "step": 5678 }, { "epoch": 5.186301369863013, "grad_norm": 0.06103532016277313, "learning_rate": 5.349568746829021e-06, "loss": 0.0005, "step": 5679 }, { "epoch": 5.187214611872146, "grad_norm": 105.75712585449219, "learning_rate": 5.348554033485542e-06, "loss": 0.6385, "step": 5680 }, { "epoch": 5.188127853881278, "grad_norm": 3.6918599605560303, "learning_rate": 5.34753932014206e-06, "loss": 0.0232, "step": 5681 }, { "epoch": 5.189041095890411, "grad_norm": 7.011072158813477, "learning_rate": 5.34652460679858e-06, "loss": 0.0371, "step": 5682 }, { "epoch": 5.189954337899543, "grad_norm": 2.948396921157837, "learning_rate": 5.3455098934551e-06, "loss": 0.0154, "step": 5683 }, { "epoch": 5.190867579908676, "grad_norm": 0.5893531441688538, "learning_rate": 5.344495180111619e-06, "loss": 0.0024, "step": 5684 }, { "epoch": 5.191780821917808, "grad_norm": 0.08250275999307632, "learning_rate": 5.3434804667681384e-06, "loss": 0.0004, "step": 5685 }, { "epoch": 5.1926940639269406, "grad_norm": 0.38197267055511475, "learning_rate": 5.342465753424658e-06, "loss": 0.0016, "step": 5686 }, { "epoch": 5.193607305936073, "grad_norm": 1.9484604597091675, "learning_rate": 5.341451040081177e-06, "loss": 0.0109, "step": 5687 }, { "epoch": 5.1945205479452055, "grad_norm": 0.35712748765945435, "learning_rate": 5.340436326737697e-06, "loss": 0.0031, "step": 5688 }, { "epoch": 5.195433789954338, "grad_norm": 12.265913963317871, "learning_rate": 5.339421613394216e-06, "loss": 0.073, "step": 5689 }, { "epoch": 5.19634703196347, "grad_norm": 44.151981353759766, "learning_rate": 5.338406900050737e-06, "loss": 0.1577, "step": 5690 }, { "epoch": 5.197260273972603, "grad_norm": 4.420377731323242, "learning_rate": 5.337392186707256e-06, "loss": 0.0201, "step": 5691 }, { "epoch": 5.198173515981735, "grad_norm": 3.155104637145996, "learning_rate": 5.3363774733637754e-06, "loss": 0.0157, "step": 5692 }, { "epoch": 5.199086757990868, "grad_norm": 34.26845932006836, "learning_rate": 5.335362760020295e-06, "loss": 0.2454, "step": 5693 }, { "epoch": 5.2, "grad_norm": 0.18134483695030212, "learning_rate": 5.334348046676814e-06, "loss": 0.0007, "step": 5694 }, { "epoch": 5.200913242009133, "grad_norm": 0.7865970134735107, "learning_rate": 5.333333333333334e-06, "loss": 0.0059, "step": 5695 }, { "epoch": 5.201826484018265, "grad_norm": 3.5014848709106445, "learning_rate": 5.332318619989853e-06, "loss": 0.0136, "step": 5696 }, { "epoch": 5.2027397260273975, "grad_norm": 1.4623805284500122, "learning_rate": 5.331303906646372e-06, "loss": 0.0068, "step": 5697 }, { "epoch": 5.20365296803653, "grad_norm": 0.12580353021621704, "learning_rate": 5.330289193302892e-06, "loss": 0.0009, "step": 5698 }, { "epoch": 5.2045662100456624, "grad_norm": 19.21773338317871, "learning_rate": 5.3292744799594124e-06, "loss": 0.0367, "step": 5699 }, { "epoch": 5.205479452054795, "grad_norm": 0.5333134531974792, "learning_rate": 5.328259766615932e-06, "loss": 0.0011, "step": 5700 }, { "epoch": 5.206392694063927, "grad_norm": 0.47013023495674133, "learning_rate": 5.327245053272451e-06, "loss": 0.003, "step": 5701 }, { "epoch": 5.207305936073059, "grad_norm": 12.297525405883789, "learning_rate": 5.326230339928971e-06, "loss": 0.081, "step": 5702 }, { "epoch": 5.208219178082191, "grad_norm": 0.1723143458366394, "learning_rate": 5.32521562658549e-06, "loss": 0.0012, "step": 5703 }, { "epoch": 5.209132420091324, "grad_norm": 10.507189750671387, "learning_rate": 5.324200913242009e-06, "loss": 0.0618, "step": 5704 }, { "epoch": 5.210045662100456, "grad_norm": 0.2518446743488312, "learning_rate": 5.323186199898529e-06, "loss": 0.0022, "step": 5705 }, { "epoch": 5.210958904109589, "grad_norm": 1.0526363849639893, "learning_rate": 5.3221714865550486e-06, "loss": 0.0041, "step": 5706 }, { "epoch": 5.211872146118721, "grad_norm": 6.5579986572265625, "learning_rate": 5.321156773211567e-06, "loss": 0.0303, "step": 5707 }, { "epoch": 5.212785388127854, "grad_norm": 0.3808903098106384, "learning_rate": 5.320142059868087e-06, "loss": 0.003, "step": 5708 }, { "epoch": 5.213698630136986, "grad_norm": 10.645429611206055, "learning_rate": 5.319127346524608e-06, "loss": 0.0656, "step": 5709 }, { "epoch": 5.2146118721461185, "grad_norm": 0.044228892773389816, "learning_rate": 5.318112633181127e-06, "loss": 0.0003, "step": 5710 }, { "epoch": 5.215525114155251, "grad_norm": 0.013210241682827473, "learning_rate": 5.317097919837646e-06, "loss": 0.0001, "step": 5711 }, { "epoch": 5.2164383561643834, "grad_norm": 1.529034972190857, "learning_rate": 5.316083206494166e-06, "loss": 0.0076, "step": 5712 }, { "epoch": 5.217351598173516, "grad_norm": 2.068504810333252, "learning_rate": 5.3150684931506856e-06, "loss": 0.0125, "step": 5713 }, { "epoch": 5.218264840182648, "grad_norm": 6.995090007781982, "learning_rate": 5.314053779807204e-06, "loss": 0.0461, "step": 5714 }, { "epoch": 5.219178082191781, "grad_norm": 1.7238881587982178, "learning_rate": 5.313039066463724e-06, "loss": 0.0074, "step": 5715 }, { "epoch": 5.220091324200913, "grad_norm": 1.459373116493225, "learning_rate": 5.312024353120245e-06, "loss": 0.0113, "step": 5716 }, { "epoch": 5.221004566210046, "grad_norm": 5.50946569442749, "learning_rate": 5.311009639776763e-06, "loss": 0.0344, "step": 5717 }, { "epoch": 5.221917808219178, "grad_norm": 2.3266823291778564, "learning_rate": 5.309994926433283e-06, "loss": 0.0131, "step": 5718 }, { "epoch": 5.222831050228311, "grad_norm": 1.9298144578933716, "learning_rate": 5.308980213089803e-06, "loss": 0.0119, "step": 5719 }, { "epoch": 5.223744292237443, "grad_norm": 3.9709815979003906, "learning_rate": 5.3079654997463226e-06, "loss": 0.0144, "step": 5720 }, { "epoch": 5.2246575342465755, "grad_norm": 0.014601275324821472, "learning_rate": 5.306950786402841e-06, "loss": 0.0001, "step": 5721 }, { "epoch": 5.225570776255708, "grad_norm": 18.808412551879883, "learning_rate": 5.305936073059361e-06, "loss": 0.0452, "step": 5722 }, { "epoch": 5.22648401826484, "grad_norm": 10.381620407104492, "learning_rate": 5.304921359715881e-06, "loss": 0.0899, "step": 5723 }, { "epoch": 5.227397260273973, "grad_norm": 0.3164418935775757, "learning_rate": 5.3039066463724e-06, "loss": 0.0016, "step": 5724 }, { "epoch": 5.228310502283105, "grad_norm": 17.835905075073242, "learning_rate": 5.302891933028919e-06, "loss": 0.096, "step": 5725 }, { "epoch": 5.229223744292238, "grad_norm": 0.01298306230455637, "learning_rate": 5.30187721968544e-06, "loss": 0.0001, "step": 5726 }, { "epoch": 5.23013698630137, "grad_norm": 5.246553897857666, "learning_rate": 5.300862506341959e-06, "loss": 0.023, "step": 5727 }, { "epoch": 5.231050228310503, "grad_norm": 0.04653479903936386, "learning_rate": 5.299847792998478e-06, "loss": 0.0003, "step": 5728 }, { "epoch": 5.231963470319634, "grad_norm": 7.56089448928833, "learning_rate": 5.298833079654998e-06, "loss": 0.03, "step": 5729 }, { "epoch": 5.232876712328767, "grad_norm": 0.44066405296325684, "learning_rate": 5.297818366311518e-06, "loss": 0.0024, "step": 5730 }, { "epoch": 5.233789954337899, "grad_norm": 0.02703052945435047, "learning_rate": 5.296803652968037e-06, "loss": 0.0002, "step": 5731 }, { "epoch": 5.234703196347032, "grad_norm": 3.183286190032959, "learning_rate": 5.295788939624556e-06, "loss": 0.0187, "step": 5732 }, { "epoch": 5.235616438356164, "grad_norm": 0.42866066098213196, "learning_rate": 5.294774226281076e-06, "loss": 0.0029, "step": 5733 }, { "epoch": 5.2365296803652965, "grad_norm": 4.198871612548828, "learning_rate": 5.293759512937595e-06, "loss": 0.0241, "step": 5734 }, { "epoch": 5.237442922374429, "grad_norm": 12.243379592895508, "learning_rate": 5.292744799594115e-06, "loss": 0.1005, "step": 5735 }, { "epoch": 5.238356164383561, "grad_norm": 0.5022281408309937, "learning_rate": 5.291730086250635e-06, "loss": 0.0038, "step": 5736 }, { "epoch": 5.239269406392694, "grad_norm": 8.311105728149414, "learning_rate": 5.290715372907154e-06, "loss": 0.0373, "step": 5737 }, { "epoch": 5.240182648401826, "grad_norm": 22.744216918945312, "learning_rate": 5.289700659563674e-06, "loss": 0.1084, "step": 5738 }, { "epoch": 5.241095890410959, "grad_norm": 4.063996315002441, "learning_rate": 5.288685946220193e-06, "loss": 0.0214, "step": 5739 }, { "epoch": 5.242009132420091, "grad_norm": 0.7755069136619568, "learning_rate": 5.287671232876713e-06, "loss": 0.0045, "step": 5740 }, { "epoch": 5.242922374429224, "grad_norm": 20.93000602722168, "learning_rate": 5.286656519533232e-06, "loss": 0.0595, "step": 5741 }, { "epoch": 5.243835616438356, "grad_norm": 0.8061555027961731, "learning_rate": 5.2856418061897516e-06, "loss": 0.0041, "step": 5742 }, { "epoch": 5.244748858447489, "grad_norm": 84.66116333007812, "learning_rate": 5.284627092846272e-06, "loss": 1.2689, "step": 5743 }, { "epoch": 5.245662100456621, "grad_norm": 5.065352916717529, "learning_rate": 5.28361237950279e-06, "loss": 0.0424, "step": 5744 }, { "epoch": 5.2465753424657535, "grad_norm": 3.3872017860412598, "learning_rate": 5.282597666159311e-06, "loss": 0.0066, "step": 5745 }, { "epoch": 5.247488584474886, "grad_norm": 111.42041015625, "learning_rate": 5.28158295281583e-06, "loss": 0.9914, "step": 5746 }, { "epoch": 5.248401826484018, "grad_norm": 0.49919435381889343, "learning_rate": 5.280568239472349e-06, "loss": 0.003, "step": 5747 }, { "epoch": 5.249315068493151, "grad_norm": 0.17758573591709137, "learning_rate": 5.279553526128869e-06, "loss": 0.0013, "step": 5748 }, { "epoch": 5.250228310502283, "grad_norm": 0.032668258994817734, "learning_rate": 5.2785388127853886e-06, "loss": 0.0001, "step": 5749 }, { "epoch": 5.251141552511416, "grad_norm": 21.159149169921875, "learning_rate": 5.277524099441908e-06, "loss": 0.1296, "step": 5750 }, { "epoch": 5.252054794520548, "grad_norm": 102.46183013916016, "learning_rate": 5.276509386098427e-06, "loss": 2.8281, "step": 5751 }, { "epoch": 5.252968036529681, "grad_norm": 4.7813873291015625, "learning_rate": 5.275494672754947e-06, "loss": 0.0211, "step": 5752 }, { "epoch": 5.253881278538813, "grad_norm": 21.28205680847168, "learning_rate": 5.274479959411467e-06, "loss": 0.1834, "step": 5753 }, { "epoch": 5.254794520547946, "grad_norm": 1.6739245653152466, "learning_rate": 5.273465246067986e-06, "loss": 0.011, "step": 5754 }, { "epoch": 5.255707762557078, "grad_norm": 7.2223381996154785, "learning_rate": 5.272450532724506e-06, "loss": 0.0386, "step": 5755 }, { "epoch": 5.25662100456621, "grad_norm": 5.886251926422119, "learning_rate": 5.2714358193810256e-06, "loss": 0.0263, "step": 5756 }, { "epoch": 5.257534246575342, "grad_norm": 137.6173095703125, "learning_rate": 5.270421106037544e-06, "loss": 5.1829, "step": 5757 }, { "epoch": 5.2584474885844745, "grad_norm": 0.01822701469063759, "learning_rate": 5.269406392694064e-06, "loss": 0.0001, "step": 5758 }, { "epoch": 5.259360730593607, "grad_norm": 0.7051283121109009, "learning_rate": 5.268391679350584e-06, "loss": 0.0013, "step": 5759 }, { "epoch": 5.260273972602739, "grad_norm": 2.1665778160095215, "learning_rate": 5.267376966007104e-06, "loss": 0.0184, "step": 5760 }, { "epoch": 5.261187214611872, "grad_norm": 0.7518437504768372, "learning_rate": 5.266362252663622e-06, "loss": 0.0032, "step": 5761 }, { "epoch": 5.262100456621004, "grad_norm": 0.9840978980064392, "learning_rate": 5.265347539320143e-06, "loss": 0.0076, "step": 5762 }, { "epoch": 5.263013698630137, "grad_norm": 35.14602279663086, "learning_rate": 5.2643328259766626e-06, "loss": 0.2232, "step": 5763 }, { "epoch": 5.263926940639269, "grad_norm": 3.586090087890625, "learning_rate": 5.263318112633181e-06, "loss": 0.0327, "step": 5764 }, { "epoch": 5.264840182648402, "grad_norm": 3.2650630474090576, "learning_rate": 5.262303399289701e-06, "loss": 0.0179, "step": 5765 }, { "epoch": 5.265753424657534, "grad_norm": 1.1997102499008179, "learning_rate": 5.261288685946221e-06, "loss": 0.0081, "step": 5766 }, { "epoch": 5.266666666666667, "grad_norm": 1.8190598487854004, "learning_rate": 5.26027397260274e-06, "loss": 0.0125, "step": 5767 }, { "epoch": 5.267579908675799, "grad_norm": 0.6779801249504089, "learning_rate": 5.259259259259259e-06, "loss": 0.0041, "step": 5768 }, { "epoch": 5.2684931506849315, "grad_norm": 1.0950429439544678, "learning_rate": 5.258244545915779e-06, "loss": 0.0036, "step": 5769 }, { "epoch": 5.269406392694064, "grad_norm": 1.299506664276123, "learning_rate": 5.2572298325722996e-06, "loss": 0.0061, "step": 5770 }, { "epoch": 5.270319634703196, "grad_norm": 7.063385009765625, "learning_rate": 5.256215119228818e-06, "loss": 0.0454, "step": 5771 }, { "epoch": 5.271232876712329, "grad_norm": 43.86025619506836, "learning_rate": 5.255200405885338e-06, "loss": 0.2158, "step": 5772 }, { "epoch": 5.272146118721461, "grad_norm": 0.6006897687911987, "learning_rate": 5.254185692541858e-06, "loss": 0.0037, "step": 5773 }, { "epoch": 5.273059360730594, "grad_norm": 0.28282272815704346, "learning_rate": 5.253170979198377e-06, "loss": 0.0013, "step": 5774 }, { "epoch": 5.273972602739726, "grad_norm": 1.5471607446670532, "learning_rate": 5.252156265854896e-06, "loss": 0.0121, "step": 5775 }, { "epoch": 5.274885844748859, "grad_norm": 52.52340316772461, "learning_rate": 5.251141552511416e-06, "loss": 0.4286, "step": 5776 }, { "epoch": 5.275799086757991, "grad_norm": 1.5325647592544556, "learning_rate": 5.250126839167935e-06, "loss": 0.0101, "step": 5777 }, { "epoch": 5.276712328767124, "grad_norm": 2.33686900138855, "learning_rate": 5.2491121258244545e-06, "loss": 0.0126, "step": 5778 }, { "epoch": 5.277625570776256, "grad_norm": 16.052209854125977, "learning_rate": 5.248097412480975e-06, "loss": 0.0877, "step": 5779 }, { "epoch": 5.2785388127853885, "grad_norm": 20.84886360168457, "learning_rate": 5.247082699137495e-06, "loss": 0.179, "step": 5780 }, { "epoch": 5.279452054794521, "grad_norm": 31.18094253540039, "learning_rate": 5.246067985794014e-06, "loss": 0.2773, "step": 5781 }, { "epoch": 5.280365296803653, "grad_norm": 0.3573618233203888, "learning_rate": 5.245053272450533e-06, "loss": 0.0026, "step": 5782 }, { "epoch": 5.281278538812785, "grad_norm": 1.4892220497131348, "learning_rate": 5.244038559107053e-06, "loss": 0.0094, "step": 5783 }, { "epoch": 5.282191780821917, "grad_norm": 31.210798263549805, "learning_rate": 5.243023845763572e-06, "loss": 0.2103, "step": 5784 }, { "epoch": 5.28310502283105, "grad_norm": 0.003562498139217496, "learning_rate": 5.2420091324200915e-06, "loss": 0.0, "step": 5785 }, { "epoch": 5.284018264840182, "grad_norm": 0.6931656002998352, "learning_rate": 5.240994419076611e-06, "loss": 0.0032, "step": 5786 }, { "epoch": 5.284931506849315, "grad_norm": 6.872745513916016, "learning_rate": 5.23997970573313e-06, "loss": 0.0302, "step": 5787 }, { "epoch": 5.285844748858447, "grad_norm": 20.41712188720703, "learning_rate": 5.23896499238965e-06, "loss": 0.0985, "step": 5788 }, { "epoch": 5.28675799086758, "grad_norm": 8.7637357711792, "learning_rate": 5.23795027904617e-06, "loss": 0.0549, "step": 5789 }, { "epoch": 5.287671232876712, "grad_norm": 84.42449951171875, "learning_rate": 5.23693556570269e-06, "loss": 0.4756, "step": 5790 }, { "epoch": 5.288584474885845, "grad_norm": 2.094240427017212, "learning_rate": 5.235920852359209e-06, "loss": 0.0163, "step": 5791 }, { "epoch": 5.289497716894977, "grad_norm": 3.1047098636627197, "learning_rate": 5.2349061390157285e-06, "loss": 0.0162, "step": 5792 }, { "epoch": 5.2904109589041095, "grad_norm": 0.02997712418437004, "learning_rate": 5.233891425672248e-06, "loss": 0.0002, "step": 5793 }, { "epoch": 5.291324200913242, "grad_norm": 6.053950786590576, "learning_rate": 5.232876712328767e-06, "loss": 0.03, "step": 5794 }, { "epoch": 5.292237442922374, "grad_norm": 2.673412799835205, "learning_rate": 5.231861998985287e-06, "loss": 0.0095, "step": 5795 }, { "epoch": 5.293150684931507, "grad_norm": 2.9054946899414062, "learning_rate": 5.2308472856418065e-06, "loss": 0.0132, "step": 5796 }, { "epoch": 5.294063926940639, "grad_norm": 0.327476441860199, "learning_rate": 5.229832572298325e-06, "loss": 0.0021, "step": 5797 }, { "epoch": 5.294977168949772, "grad_norm": 9.341288566589355, "learning_rate": 5.228817858954846e-06, "loss": 0.0546, "step": 5798 }, { "epoch": 5.295890410958904, "grad_norm": 0.3939177095890045, "learning_rate": 5.2278031456113655e-06, "loss": 0.0028, "step": 5799 }, { "epoch": 5.296803652968037, "grad_norm": 0.0456240139901638, "learning_rate": 5.226788432267885e-06, "loss": 0.0003, "step": 5800 }, { "epoch": 5.297716894977169, "grad_norm": 5.996258735656738, "learning_rate": 5.225773718924404e-06, "loss": 0.0414, "step": 5801 }, { "epoch": 5.298630136986302, "grad_norm": 0.28803759813308716, "learning_rate": 5.224759005580924e-06, "loss": 0.0016, "step": 5802 }, { "epoch": 5.299543378995434, "grad_norm": 0.02656475454568863, "learning_rate": 5.2237442922374435e-06, "loss": 0.0002, "step": 5803 }, { "epoch": 5.3004566210045665, "grad_norm": 0.5418006777763367, "learning_rate": 5.222729578893962e-06, "loss": 0.0036, "step": 5804 }, { "epoch": 5.301369863013699, "grad_norm": 5.89894437789917, "learning_rate": 5.221714865550482e-06, "loss": 0.0362, "step": 5805 }, { "epoch": 5.302283105022831, "grad_norm": 0.49978193640708923, "learning_rate": 5.2207001522070025e-06, "loss": 0.0033, "step": 5806 }, { "epoch": 5.303196347031964, "grad_norm": 1.4029134511947632, "learning_rate": 5.219685438863521e-06, "loss": 0.0081, "step": 5807 }, { "epoch": 5.304109589041096, "grad_norm": 21.003210067749023, "learning_rate": 5.218670725520041e-06, "loss": 0.1498, "step": 5808 }, { "epoch": 5.305022831050229, "grad_norm": 0.3789713382720947, "learning_rate": 5.217656012176561e-06, "loss": 0.0035, "step": 5809 }, { "epoch": 5.30593607305936, "grad_norm": 2.270077705383301, "learning_rate": 5.2166412988330805e-06, "loss": 0.0151, "step": 5810 }, { "epoch": 5.306849315068493, "grad_norm": 2.214977979660034, "learning_rate": 5.215626585489599e-06, "loss": 0.0027, "step": 5811 }, { "epoch": 5.307762557077625, "grad_norm": 8.283802032470703, "learning_rate": 5.214611872146119e-06, "loss": 0.0509, "step": 5812 }, { "epoch": 5.308675799086758, "grad_norm": 61.50189208984375, "learning_rate": 5.213597158802639e-06, "loss": 0.7846, "step": 5813 }, { "epoch": 5.30958904109589, "grad_norm": 6.613113880157471, "learning_rate": 5.2125824454591575e-06, "loss": 0.0387, "step": 5814 }, { "epoch": 5.310502283105023, "grad_norm": 0.03617119416594505, "learning_rate": 5.211567732115678e-06, "loss": 0.0002, "step": 5815 }, { "epoch": 5.311415525114155, "grad_norm": 0.6463091373443604, "learning_rate": 5.210553018772198e-06, "loss": 0.0033, "step": 5816 }, { "epoch": 5.3123287671232875, "grad_norm": 18.876358032226562, "learning_rate": 5.209538305428717e-06, "loss": 0.1316, "step": 5817 }, { "epoch": 5.31324200913242, "grad_norm": 0.5099788308143616, "learning_rate": 5.208523592085236e-06, "loss": 0.0027, "step": 5818 }, { "epoch": 5.314155251141552, "grad_norm": 0.9448224902153015, "learning_rate": 5.207508878741756e-06, "loss": 0.0072, "step": 5819 }, { "epoch": 5.315068493150685, "grad_norm": 3.203990936279297, "learning_rate": 5.206494165398276e-06, "loss": 0.0205, "step": 5820 }, { "epoch": 5.315981735159817, "grad_norm": 3.496354818344116, "learning_rate": 5.2054794520547945e-06, "loss": 0.023, "step": 5821 }, { "epoch": 5.31689497716895, "grad_norm": 0.46180087327957153, "learning_rate": 5.204464738711314e-06, "loss": 0.0028, "step": 5822 }, { "epoch": 5.317808219178082, "grad_norm": 0.9346237182617188, "learning_rate": 5.203450025367835e-06, "loss": 0.0053, "step": 5823 }, { "epoch": 5.318721461187215, "grad_norm": 0.3427952826023102, "learning_rate": 5.202435312024353e-06, "loss": 0.0026, "step": 5824 }, { "epoch": 5.319634703196347, "grad_norm": 0.6891804337501526, "learning_rate": 5.201420598680873e-06, "loss": 0.0034, "step": 5825 }, { "epoch": 5.32054794520548, "grad_norm": 0.012793781235814095, "learning_rate": 5.200405885337393e-06, "loss": 0.0001, "step": 5826 }, { "epoch": 5.321461187214612, "grad_norm": 4.840476989746094, "learning_rate": 5.199391171993912e-06, "loss": 0.0417, "step": 5827 }, { "epoch": 5.3223744292237445, "grad_norm": 5.313849449157715, "learning_rate": 5.1983764586504315e-06, "loss": 0.0211, "step": 5828 }, { "epoch": 5.323287671232877, "grad_norm": 0.21153806149959564, "learning_rate": 5.197361745306951e-06, "loss": 0.0014, "step": 5829 }, { "epoch": 5.324200913242009, "grad_norm": 120.00868225097656, "learning_rate": 5.196347031963471e-06, "loss": 3.5623, "step": 5830 }, { "epoch": 5.325114155251142, "grad_norm": 6.86880350112915, "learning_rate": 5.19533231861999e-06, "loss": 0.0415, "step": 5831 }, { "epoch": 5.326027397260274, "grad_norm": 1.4869847297668457, "learning_rate": 5.1943176052765094e-06, "loss": 0.0073, "step": 5832 }, { "epoch": 5.326940639269407, "grad_norm": 1.772440791130066, "learning_rate": 5.19330289193303e-06, "loss": 0.0124, "step": 5833 }, { "epoch": 5.327853881278539, "grad_norm": 0.16471371054649353, "learning_rate": 5.192288178589549e-06, "loss": 0.0008, "step": 5834 }, { "epoch": 5.328767123287671, "grad_norm": 8.432331085205078, "learning_rate": 5.1912734652460685e-06, "loss": 0.0466, "step": 5835 }, { "epoch": 5.329680365296804, "grad_norm": 86.20440673828125, "learning_rate": 5.190258751902588e-06, "loss": 2.3739, "step": 5836 }, { "epoch": 5.330593607305936, "grad_norm": 46.22989273071289, "learning_rate": 5.189244038559107e-06, "loss": 0.3632, "step": 5837 }, { "epoch": 5.331506849315068, "grad_norm": 0.3986038863658905, "learning_rate": 5.188229325215627e-06, "loss": 0.0028, "step": 5838 }, { "epoch": 5.332420091324201, "grad_norm": 23.36009979248047, "learning_rate": 5.1872146118721464e-06, "loss": 0.1466, "step": 5839 }, { "epoch": 5.333333333333333, "grad_norm": 2.8077378273010254, "learning_rate": 5.186199898528667e-06, "loss": 0.0199, "step": 5840 }, { "epoch": 5.3342465753424655, "grad_norm": 11.574509620666504, "learning_rate": 5.185185185185185e-06, "loss": 0.0608, "step": 5841 }, { "epoch": 5.335159817351598, "grad_norm": 0.15484727919101715, "learning_rate": 5.1841704718417055e-06, "loss": 0.0009, "step": 5842 }, { "epoch": 5.33607305936073, "grad_norm": 29.63031768798828, "learning_rate": 5.183155758498225e-06, "loss": 0.1373, "step": 5843 }, { "epoch": 5.336986301369863, "grad_norm": 46.5118293762207, "learning_rate": 5.182141045154744e-06, "loss": 0.2266, "step": 5844 }, { "epoch": 5.337899543378995, "grad_norm": 1.1077585220336914, "learning_rate": 5.181126331811264e-06, "loss": 0.0104, "step": 5845 }, { "epoch": 5.338812785388128, "grad_norm": 1.926727533340454, "learning_rate": 5.1801116184677834e-06, "loss": 0.0133, "step": 5846 }, { "epoch": 5.33972602739726, "grad_norm": 0.05538669601082802, "learning_rate": 5.179096905124302e-06, "loss": 0.0003, "step": 5847 }, { "epoch": 5.340639269406393, "grad_norm": 4.786743640899658, "learning_rate": 5.178082191780822e-06, "loss": 0.0275, "step": 5848 }, { "epoch": 5.341552511415525, "grad_norm": 0.4418053925037384, "learning_rate": 5.177067478437342e-06, "loss": 0.003, "step": 5849 }, { "epoch": 5.342465753424658, "grad_norm": 1.4201725721359253, "learning_rate": 5.176052765093862e-06, "loss": 0.008, "step": 5850 }, { "epoch": 5.34337899543379, "grad_norm": 5.656551361083984, "learning_rate": 5.175038051750381e-06, "loss": 0.0361, "step": 5851 }, { "epoch": 5.3442922374429225, "grad_norm": 5.255163669586182, "learning_rate": 5.174023338406901e-06, "loss": 0.0381, "step": 5852 }, { "epoch": 5.345205479452055, "grad_norm": 87.56156921386719, "learning_rate": 5.1730086250634204e-06, "loss": 1.0869, "step": 5853 }, { "epoch": 5.346118721461187, "grad_norm": 0.07793895900249481, "learning_rate": 5.171993911719939e-06, "loss": 0.0006, "step": 5854 }, { "epoch": 5.34703196347032, "grad_norm": 1.897475004196167, "learning_rate": 5.170979198376459e-06, "loss": 0.0124, "step": 5855 }, { "epoch": 5.347945205479452, "grad_norm": 19.82221794128418, "learning_rate": 5.169964485032979e-06, "loss": 0.1157, "step": 5856 }, { "epoch": 5.348858447488585, "grad_norm": 11.613313674926758, "learning_rate": 5.1689497716894975e-06, "loss": 0.0851, "step": 5857 }, { "epoch": 5.349771689497717, "grad_norm": 33.69391632080078, "learning_rate": 5.167935058346017e-06, "loss": 0.3028, "step": 5858 }, { "epoch": 5.35068493150685, "grad_norm": 2.2430155277252197, "learning_rate": 5.166920345002538e-06, "loss": 0.0149, "step": 5859 }, { "epoch": 5.351598173515982, "grad_norm": 7.860225200653076, "learning_rate": 5.1659056316590574e-06, "loss": 0.0398, "step": 5860 }, { "epoch": 5.352511415525115, "grad_norm": 24.373889923095703, "learning_rate": 5.164890918315576e-06, "loss": 0.1908, "step": 5861 }, { "epoch": 5.353424657534246, "grad_norm": 5.3928399085998535, "learning_rate": 5.163876204972096e-06, "loss": 0.0332, "step": 5862 }, { "epoch": 5.3543378995433795, "grad_norm": 2.5052218437194824, "learning_rate": 5.162861491628616e-06, "loss": 0.0109, "step": 5863 }, { "epoch": 5.355251141552511, "grad_norm": 9.026878356933594, "learning_rate": 5.1618467782851345e-06, "loss": 0.044, "step": 5864 }, { "epoch": 5.3561643835616435, "grad_norm": 0.6733435392379761, "learning_rate": 5.160832064941654e-06, "loss": 0.0042, "step": 5865 }, { "epoch": 5.357077625570776, "grad_norm": 4.185092449188232, "learning_rate": 5.159817351598174e-06, "loss": 0.0225, "step": 5866 }, { "epoch": 5.357990867579908, "grad_norm": 0.9745388627052307, "learning_rate": 5.158802638254693e-06, "loss": 0.0056, "step": 5867 }, { "epoch": 5.358904109589041, "grad_norm": 7.269984245300293, "learning_rate": 5.1577879249112124e-06, "loss": 0.0469, "step": 5868 }, { "epoch": 5.359817351598173, "grad_norm": 1.4463798999786377, "learning_rate": 5.156773211567733e-06, "loss": 0.0093, "step": 5869 }, { "epoch": 5.360730593607306, "grad_norm": 104.38318634033203, "learning_rate": 5.155758498224253e-06, "loss": 2.1804, "step": 5870 }, { "epoch": 5.361643835616438, "grad_norm": 101.97356414794922, "learning_rate": 5.1547437848807715e-06, "loss": 0.4064, "step": 5871 }, { "epoch": 5.362557077625571, "grad_norm": 4.21293830871582, "learning_rate": 5.153729071537291e-06, "loss": 0.025, "step": 5872 }, { "epoch": 5.363470319634703, "grad_norm": 0.7516523003578186, "learning_rate": 5.152714358193811e-06, "loss": 0.0036, "step": 5873 }, { "epoch": 5.364383561643836, "grad_norm": 1.55061674118042, "learning_rate": 5.15169964485033e-06, "loss": 0.0109, "step": 5874 }, { "epoch": 5.365296803652968, "grad_norm": 8.379528999328613, "learning_rate": 5.1506849315068494e-06, "loss": 0.0658, "step": 5875 }, { "epoch": 5.3662100456621005, "grad_norm": 5.43128776550293, "learning_rate": 5.149670218163369e-06, "loss": 0.0346, "step": 5876 }, { "epoch": 5.367123287671233, "grad_norm": 3.9311861991882324, "learning_rate": 5.148655504819888e-06, "loss": 0.0115, "step": 5877 }, { "epoch": 5.368036529680365, "grad_norm": 2.630565643310547, "learning_rate": 5.1476407914764085e-06, "loss": 0.0232, "step": 5878 }, { "epoch": 5.368949771689498, "grad_norm": 0.05689365044236183, "learning_rate": 5.146626078132928e-06, "loss": 0.0003, "step": 5879 }, { "epoch": 5.36986301369863, "grad_norm": 15.112096786499023, "learning_rate": 5.145611364789448e-06, "loss": 0.076, "step": 5880 }, { "epoch": 5.370776255707763, "grad_norm": 0.24475085735321045, "learning_rate": 5.144596651445967e-06, "loss": 0.0025, "step": 5881 }, { "epoch": 5.371689497716895, "grad_norm": 3.483593702316284, "learning_rate": 5.1435819381024864e-06, "loss": 0.0286, "step": 5882 }, { "epoch": 5.372602739726028, "grad_norm": 0.18854258954524994, "learning_rate": 5.142567224759006e-06, "loss": 0.0014, "step": 5883 }, { "epoch": 5.37351598173516, "grad_norm": 1.6502556800842285, "learning_rate": 5.141552511415525e-06, "loss": 0.0082, "step": 5884 }, { "epoch": 5.3744292237442925, "grad_norm": 0.8383001089096069, "learning_rate": 5.140537798072045e-06, "loss": 0.0039, "step": 5885 }, { "epoch": 5.375342465753425, "grad_norm": 0.5114734172821045, "learning_rate": 5.139523084728565e-06, "loss": 0.0033, "step": 5886 }, { "epoch": 5.3762557077625575, "grad_norm": 7.74973726272583, "learning_rate": 5.138508371385084e-06, "loss": 0.0578, "step": 5887 }, { "epoch": 5.37716894977169, "grad_norm": 0.7731032967567444, "learning_rate": 5.137493658041604e-06, "loss": 0.0064, "step": 5888 }, { "epoch": 5.3780821917808215, "grad_norm": 1.3638747930526733, "learning_rate": 5.1364789446981234e-06, "loss": 0.0083, "step": 5889 }, { "epoch": 5.378995433789954, "grad_norm": 14.802696228027344, "learning_rate": 5.135464231354643e-06, "loss": 0.0764, "step": 5890 }, { "epoch": 5.379908675799086, "grad_norm": 1.7126351594924927, "learning_rate": 5.134449518011162e-06, "loss": 0.0071, "step": 5891 }, { "epoch": 5.380821917808219, "grad_norm": 0.8102322220802307, "learning_rate": 5.133434804667682e-06, "loss": 0.0044, "step": 5892 }, { "epoch": 5.381735159817351, "grad_norm": 2.5759665966033936, "learning_rate": 5.132420091324201e-06, "loss": 0.0179, "step": 5893 }, { "epoch": 5.382648401826484, "grad_norm": 5.470436096191406, "learning_rate": 5.13140537798072e-06, "loss": 0.0328, "step": 5894 }, { "epoch": 5.383561643835616, "grad_norm": 2.447904586791992, "learning_rate": 5.130390664637241e-06, "loss": 0.0116, "step": 5895 }, { "epoch": 5.384474885844749, "grad_norm": 2.116823434829712, "learning_rate": 5.12937595129376e-06, "loss": 0.0119, "step": 5896 }, { "epoch": 5.385388127853881, "grad_norm": 1.0508049726486206, "learning_rate": 5.128361237950279e-06, "loss": 0.0067, "step": 5897 }, { "epoch": 5.3863013698630136, "grad_norm": 1.3972772359848022, "learning_rate": 5.127346524606799e-06, "loss": 0.0089, "step": 5898 }, { "epoch": 5.387214611872146, "grad_norm": 8.502371788024902, "learning_rate": 5.126331811263319e-06, "loss": 0.0733, "step": 5899 }, { "epoch": 5.3881278538812785, "grad_norm": 49.232303619384766, "learning_rate": 5.125317097919838e-06, "loss": 0.4723, "step": 5900 }, { "epoch": 5.389041095890411, "grad_norm": 2.010878562927246, "learning_rate": 5.124302384576357e-06, "loss": 0.0119, "step": 5901 }, { "epoch": 5.389954337899543, "grad_norm": 0.09356872737407684, "learning_rate": 5.123287671232877e-06, "loss": 0.0006, "step": 5902 }, { "epoch": 5.390867579908676, "grad_norm": 16.037656784057617, "learning_rate": 5.122272957889397e-06, "loss": 0.0763, "step": 5903 }, { "epoch": 5.391780821917808, "grad_norm": 35.18043899536133, "learning_rate": 5.121258244545915e-06, "loss": 0.1947, "step": 5904 }, { "epoch": 5.392694063926941, "grad_norm": 1.0459622144699097, "learning_rate": 5.120243531202436e-06, "loss": 0.0047, "step": 5905 }, { "epoch": 5.393607305936073, "grad_norm": 0.5682546496391296, "learning_rate": 5.119228817858956e-06, "loss": 0.0037, "step": 5906 }, { "epoch": 5.394520547945206, "grad_norm": 46.95804977416992, "learning_rate": 5.1182141045154745e-06, "loss": 0.167, "step": 5907 }, { "epoch": 5.395433789954338, "grad_norm": 26.37067222595215, "learning_rate": 5.117199391171994e-06, "loss": 0.2339, "step": 5908 }, { "epoch": 5.3963470319634705, "grad_norm": 110.75962829589844, "learning_rate": 5.116184677828514e-06, "loss": 1.6851, "step": 5909 }, { "epoch": 5.397260273972603, "grad_norm": 16.411449432373047, "learning_rate": 5.1151699644850336e-06, "loss": 0.0967, "step": 5910 }, { "epoch": 5.3981735159817354, "grad_norm": 62.8715934753418, "learning_rate": 5.114155251141552e-06, "loss": 0.4624, "step": 5911 }, { "epoch": 5.399086757990868, "grad_norm": 0.3801456391811371, "learning_rate": 5.113140537798072e-06, "loss": 0.0019, "step": 5912 }, { "epoch": 5.4, "grad_norm": 0.3146836459636688, "learning_rate": 5.112125824454593e-06, "loss": 0.0023, "step": 5913 }, { "epoch": 5.400913242009133, "grad_norm": 0.7271426320075989, "learning_rate": 5.1111111111111115e-06, "loss": 0.0053, "step": 5914 }, { "epoch": 5.401826484018265, "grad_norm": 5.241911888122559, "learning_rate": 5.110096397767631e-06, "loss": 0.0314, "step": 5915 }, { "epoch": 5.402739726027397, "grad_norm": 2.4766845703125, "learning_rate": 5.109081684424151e-06, "loss": 0.0107, "step": 5916 }, { "epoch": 5.403652968036529, "grad_norm": 33.92884826660156, "learning_rate": 5.10806697108067e-06, "loss": 0.2779, "step": 5917 }, { "epoch": 5.404566210045662, "grad_norm": 5.309550762176514, "learning_rate": 5.107052257737189e-06, "loss": 0.0253, "step": 5918 }, { "epoch": 5.405479452054794, "grad_norm": 0.6420443058013916, "learning_rate": 5.106037544393709e-06, "loss": 0.0036, "step": 5919 }, { "epoch": 5.406392694063927, "grad_norm": 20.526084899902344, "learning_rate": 5.10502283105023e-06, "loss": 0.1584, "step": 5920 }, { "epoch": 5.407305936073059, "grad_norm": 0.20885249972343445, "learning_rate": 5.104008117706748e-06, "loss": 0.0011, "step": 5921 }, { "epoch": 5.4082191780821915, "grad_norm": 3.2418394088745117, "learning_rate": 5.102993404363268e-06, "loss": 0.0139, "step": 5922 }, { "epoch": 5.409132420091324, "grad_norm": 2.707726240158081, "learning_rate": 5.101978691019788e-06, "loss": 0.0146, "step": 5923 }, { "epoch": 5.4100456621004565, "grad_norm": 2.038029432296753, "learning_rate": 5.100963977676307e-06, "loss": 0.0105, "step": 5924 }, { "epoch": 5.410958904109589, "grad_norm": 0.7083870768547058, "learning_rate": 5.099949264332826e-06, "loss": 0.0035, "step": 5925 }, { "epoch": 5.411872146118721, "grad_norm": 0.293759822845459, "learning_rate": 5.098934550989346e-06, "loss": 0.0015, "step": 5926 }, { "epoch": 5.412785388127854, "grad_norm": 0.13339076936244965, "learning_rate": 5.097919837645865e-06, "loss": 0.0007, "step": 5927 }, { "epoch": 5.413698630136986, "grad_norm": 1.0488191843032837, "learning_rate": 5.096905124302385e-06, "loss": 0.0047, "step": 5928 }, { "epoch": 5.414611872146119, "grad_norm": 0.049997664988040924, "learning_rate": 5.095890410958904e-06, "loss": 0.0003, "step": 5929 }, { "epoch": 5.415525114155251, "grad_norm": 0.6920482516288757, "learning_rate": 5.094875697615425e-06, "loss": 0.0049, "step": 5930 }, { "epoch": 5.416438356164384, "grad_norm": 3.367055892944336, "learning_rate": 5.093860984271944e-06, "loss": 0.0223, "step": 5931 }, { "epoch": 5.417351598173516, "grad_norm": 36.62857437133789, "learning_rate": 5.092846270928463e-06, "loss": 0.2901, "step": 5932 }, { "epoch": 5.4182648401826485, "grad_norm": 42.78873062133789, "learning_rate": 5.091831557584983e-06, "loss": 0.1644, "step": 5933 }, { "epoch": 5.419178082191781, "grad_norm": 0.3641909658908844, "learning_rate": 5.090816844241502e-06, "loss": 0.0015, "step": 5934 }, { "epoch": 5.420091324200913, "grad_norm": 0.5399484634399414, "learning_rate": 5.089802130898022e-06, "loss": 0.0035, "step": 5935 }, { "epoch": 5.421004566210046, "grad_norm": 1.1698939800262451, "learning_rate": 5.088787417554541e-06, "loss": 0.0079, "step": 5936 }, { "epoch": 5.421917808219178, "grad_norm": 6.764132022857666, "learning_rate": 5.08777270421106e-06, "loss": 0.0293, "step": 5937 }, { "epoch": 5.422831050228311, "grad_norm": 30.247671127319336, "learning_rate": 5.08675799086758e-06, "loss": 0.2214, "step": 5938 }, { "epoch": 5.423744292237443, "grad_norm": 0.04448872432112694, "learning_rate": 5.0857432775241e-06, "loss": 0.0002, "step": 5939 }, { "epoch": 5.424657534246576, "grad_norm": 0.918610155582428, "learning_rate": 5.08472856418062e-06, "loss": 0.0071, "step": 5940 }, { "epoch": 5.425570776255708, "grad_norm": 0.9199041128158569, "learning_rate": 5.083713850837139e-06, "loss": 0.0046, "step": 5941 }, { "epoch": 5.426484018264841, "grad_norm": 7.984180450439453, "learning_rate": 5.082699137493659e-06, "loss": 0.0457, "step": 5942 }, { "epoch": 5.427397260273972, "grad_norm": 1.1209005117416382, "learning_rate": 5.081684424150178e-06, "loss": 0.006, "step": 5943 }, { "epoch": 5.428310502283105, "grad_norm": 2.5092620849609375, "learning_rate": 5.080669710806697e-06, "loss": 0.0176, "step": 5944 }, { "epoch": 5.429223744292237, "grad_norm": 2.274136543273926, "learning_rate": 5.079654997463217e-06, "loss": 0.0128, "step": 5945 }, { "epoch": 5.4301369863013695, "grad_norm": 0.5717371702194214, "learning_rate": 5.0786402841197366e-06, "loss": 0.0035, "step": 5946 }, { "epoch": 5.431050228310502, "grad_norm": 0.9632689952850342, "learning_rate": 5.077625570776255e-06, "loss": 0.0055, "step": 5947 }, { "epoch": 5.4319634703196344, "grad_norm": 13.486738204956055, "learning_rate": 5.076610857432775e-06, "loss": 0.0952, "step": 5948 }, { "epoch": 5.432876712328767, "grad_norm": 0.5618943572044373, "learning_rate": 5.075596144089296e-06, "loss": 0.0005, "step": 5949 }, { "epoch": 5.433789954337899, "grad_norm": 4.101158618927002, "learning_rate": 5.074581430745815e-06, "loss": 0.0224, "step": 5950 }, { "epoch": 5.434703196347032, "grad_norm": 138.82615661621094, "learning_rate": 5.073566717402334e-06, "loss": 0.2199, "step": 5951 }, { "epoch": 5.435616438356164, "grad_norm": 1.37431001663208, "learning_rate": 5.072552004058854e-06, "loss": 0.0088, "step": 5952 }, { "epoch": 5.436529680365297, "grad_norm": 0.20363546907901764, "learning_rate": 5.0715372907153736e-06, "loss": 0.0013, "step": 5953 }, { "epoch": 5.437442922374429, "grad_norm": 23.282691955566406, "learning_rate": 5.070522577371892e-06, "loss": 0.1845, "step": 5954 }, { "epoch": 5.438356164383562, "grad_norm": 6.861678600311279, "learning_rate": 5.069507864028412e-06, "loss": 0.0301, "step": 5955 }, { "epoch": 5.439269406392694, "grad_norm": 0.9638235569000244, "learning_rate": 5.068493150684932e-06, "loss": 0.0049, "step": 5956 }, { "epoch": 5.4401826484018265, "grad_norm": 0.21636821329593658, "learning_rate": 5.067478437341451e-06, "loss": 0.0014, "step": 5957 }, { "epoch": 5.441095890410959, "grad_norm": 1.0499457120895386, "learning_rate": 5.066463723997971e-06, "loss": 0.0051, "step": 5958 }, { "epoch": 5.442009132420091, "grad_norm": 67.30091094970703, "learning_rate": 5.065449010654491e-06, "loss": 1.0238, "step": 5959 }, { "epoch": 5.442922374429224, "grad_norm": 2.9569365978240967, "learning_rate": 5.0644342973110106e-06, "loss": 0.0169, "step": 5960 }, { "epoch": 5.443835616438356, "grad_norm": 0.08202164620161057, "learning_rate": 5.063419583967529e-06, "loss": 0.0006, "step": 5961 }, { "epoch": 5.444748858447489, "grad_norm": 0.14667938649654388, "learning_rate": 5.062404870624049e-06, "loss": 0.001, "step": 5962 }, { "epoch": 5.445662100456621, "grad_norm": 0.010651436634361744, "learning_rate": 5.061390157280569e-06, "loss": 0.0001, "step": 5963 }, { "epoch": 5.446575342465754, "grad_norm": 0.23612505197525024, "learning_rate": 5.060375443937088e-06, "loss": 0.0016, "step": 5964 }, { "epoch": 5.447488584474886, "grad_norm": 0.028012549504637718, "learning_rate": 5.059360730593607e-06, "loss": 0.0002, "step": 5965 }, { "epoch": 5.448401826484019, "grad_norm": 3.582767963409424, "learning_rate": 5.058346017250128e-06, "loss": 0.0247, "step": 5966 }, { "epoch": 5.449315068493151, "grad_norm": 0.46758824586868286, "learning_rate": 5.057331303906647e-06, "loss": 0.0035, "step": 5967 }, { "epoch": 5.4502283105022835, "grad_norm": 5.003401756286621, "learning_rate": 5.056316590563166e-06, "loss": 0.0289, "step": 5968 }, { "epoch": 5.451141552511416, "grad_norm": 0.28198668360710144, "learning_rate": 5.055301877219686e-06, "loss": 0.0013, "step": 5969 }, { "epoch": 5.4520547945205475, "grad_norm": 2.8337321281433105, "learning_rate": 5.054287163876206e-06, "loss": 0.0124, "step": 5970 }, { "epoch": 5.45296803652968, "grad_norm": 0.2018682360649109, "learning_rate": 5.053272450532725e-06, "loss": 0.0016, "step": 5971 }, { "epoch": 5.453881278538812, "grad_norm": 4.131338596343994, "learning_rate": 5.052257737189244e-06, "loss": 0.0128, "step": 5972 }, { "epoch": 5.454794520547945, "grad_norm": 0.152289479970932, "learning_rate": 5.051243023845764e-06, "loss": 0.0011, "step": 5973 }, { "epoch": 5.455707762557077, "grad_norm": 0.021888891234993935, "learning_rate": 5.050228310502283e-06, "loss": 0.0001, "step": 5974 }, { "epoch": 5.45662100456621, "grad_norm": 1.5548450946807861, "learning_rate": 5.049213597158803e-06, "loss": 0.0109, "step": 5975 }, { "epoch": 5.457534246575342, "grad_norm": 2.3261637687683105, "learning_rate": 5.048198883815323e-06, "loss": 0.0121, "step": 5976 }, { "epoch": 5.458447488584475, "grad_norm": 1.2905728816986084, "learning_rate": 5.047184170471842e-06, "loss": 0.0078, "step": 5977 }, { "epoch": 5.459360730593607, "grad_norm": 1.9841725826263428, "learning_rate": 5.046169457128362e-06, "loss": 0.0119, "step": 5978 }, { "epoch": 5.46027397260274, "grad_norm": 3.9688823223114014, "learning_rate": 5.045154743784881e-06, "loss": 0.0307, "step": 5979 }, { "epoch": 5.461187214611872, "grad_norm": 10.206511497497559, "learning_rate": 5.044140030441401e-06, "loss": 0.055, "step": 5980 }, { "epoch": 5.4621004566210045, "grad_norm": 1.7787631750106812, "learning_rate": 5.04312531709792e-06, "loss": 0.0102, "step": 5981 }, { "epoch": 5.463013698630137, "grad_norm": 0.031211812049150467, "learning_rate": 5.0421106037544395e-06, "loss": 0.0001, "step": 5982 }, { "epoch": 5.463926940639269, "grad_norm": 52.1816291809082, "learning_rate": 5.04109589041096e-06, "loss": 0.3394, "step": 5983 }, { "epoch": 5.464840182648402, "grad_norm": 1.6314682960510254, "learning_rate": 5.040081177067478e-06, "loss": 0.0067, "step": 5984 }, { "epoch": 5.465753424657534, "grad_norm": 7.31341552734375, "learning_rate": 5.039066463723999e-06, "loss": 0.0425, "step": 5985 }, { "epoch": 5.466666666666667, "grad_norm": 3.303750514984131, "learning_rate": 5.038051750380518e-06, "loss": 0.0252, "step": 5986 }, { "epoch": 5.467579908675799, "grad_norm": 1.3042510747909546, "learning_rate": 5.037037037037037e-06, "loss": 0.0112, "step": 5987 }, { "epoch": 5.468493150684932, "grad_norm": 1.1620818376541138, "learning_rate": 5.036022323693557e-06, "loss": 0.008, "step": 5988 }, { "epoch": 5.469406392694064, "grad_norm": 1.9783384799957275, "learning_rate": 5.0350076103500765e-06, "loss": 0.0087, "step": 5989 }, { "epoch": 5.470319634703197, "grad_norm": 4.644320964813232, "learning_rate": 5.033992897006596e-06, "loss": 0.0385, "step": 5990 }, { "epoch": 5.471232876712329, "grad_norm": 2.2815968990325928, "learning_rate": 5.032978183663115e-06, "loss": 0.0129, "step": 5991 }, { "epoch": 5.4721461187214615, "grad_norm": 24.565502166748047, "learning_rate": 5.031963470319635e-06, "loss": 0.1704, "step": 5992 }, { "epoch": 5.473059360730594, "grad_norm": 8.532377243041992, "learning_rate": 5.030948756976155e-06, "loss": 0.0407, "step": 5993 }, { "epoch": 5.473972602739726, "grad_norm": 1.3245998620986938, "learning_rate": 5.029934043632674e-06, "loss": 0.0071, "step": 5994 }, { "epoch": 5.474885844748858, "grad_norm": 42.224613189697266, "learning_rate": 5.028919330289194e-06, "loss": 0.212, "step": 5995 }, { "epoch": 5.475799086757991, "grad_norm": 7.365599632263184, "learning_rate": 5.0279046169457135e-06, "loss": 0.0443, "step": 5996 }, { "epoch": 5.476712328767123, "grad_norm": 25.50279998779297, "learning_rate": 5.026889903602232e-06, "loss": 0.0975, "step": 5997 }, { "epoch": 5.477625570776255, "grad_norm": 5.31293249130249, "learning_rate": 5.025875190258752e-06, "loss": 0.0332, "step": 5998 }, { "epoch": 5.478538812785388, "grad_norm": 0.15593428909778595, "learning_rate": 5.024860476915272e-06, "loss": 0.0012, "step": 5999 }, { "epoch": 5.47945205479452, "grad_norm": 0.21383559703826904, "learning_rate": 5.0238457635717915e-06, "loss": 0.0012, "step": 6000 }, { "epoch": 5.480365296803653, "grad_norm": 0.0052279080264270306, "learning_rate": 5.02283105022831e-06, "loss": 0.0, "step": 6001 }, { "epoch": 5.481278538812785, "grad_norm": 20.003873825073242, "learning_rate": 5.021816336884831e-06, "loss": 0.0823, "step": 6002 }, { "epoch": 5.482191780821918, "grad_norm": 0.9121565818786621, "learning_rate": 5.0208016235413505e-06, "loss": 0.0055, "step": 6003 }, { "epoch": 5.48310502283105, "grad_norm": 0.9471112489700317, "learning_rate": 5.019786910197869e-06, "loss": 0.0054, "step": 6004 }, { "epoch": 5.4840182648401825, "grad_norm": 3.8396947383880615, "learning_rate": 5.018772196854389e-06, "loss": 0.0228, "step": 6005 }, { "epoch": 5.484931506849315, "grad_norm": 9.920799255371094, "learning_rate": 5.017757483510909e-06, "loss": 0.0463, "step": 6006 }, { "epoch": 5.485844748858447, "grad_norm": 51.817867279052734, "learning_rate": 5.016742770167428e-06, "loss": 0.5167, "step": 6007 }, { "epoch": 5.48675799086758, "grad_norm": 12.427271842956543, "learning_rate": 5.015728056823947e-06, "loss": 0.0785, "step": 6008 }, { "epoch": 5.487671232876712, "grad_norm": 5.3483099937438965, "learning_rate": 5.014713343480467e-06, "loss": 0.0326, "step": 6009 }, { "epoch": 5.488584474885845, "grad_norm": 0.7945491075515747, "learning_rate": 5.0136986301369875e-06, "loss": 0.0048, "step": 6010 }, { "epoch": 5.489497716894977, "grad_norm": 5.7434492111206055, "learning_rate": 5.012683916793506e-06, "loss": 0.0199, "step": 6011 }, { "epoch": 5.49041095890411, "grad_norm": 0.33361783623695374, "learning_rate": 5.011669203450026e-06, "loss": 0.002, "step": 6012 }, { "epoch": 5.491324200913242, "grad_norm": 2.031273126602173, "learning_rate": 5.010654490106546e-06, "loss": 0.0097, "step": 6013 }, { "epoch": 5.492237442922375, "grad_norm": 217.09149169921875, "learning_rate": 5.009639776763065e-06, "loss": 4.0547, "step": 6014 }, { "epoch": 5.493150684931507, "grad_norm": 0.9579596519470215, "learning_rate": 5.008625063419584e-06, "loss": 0.005, "step": 6015 }, { "epoch": 5.4940639269406395, "grad_norm": 14.368438720703125, "learning_rate": 5.007610350076104e-06, "loss": 0.1041, "step": 6016 }, { "epoch": 5.494977168949772, "grad_norm": 11.186957359313965, "learning_rate": 5.006595636732623e-06, "loss": 0.0616, "step": 6017 }, { "epoch": 5.495890410958904, "grad_norm": 3.686694383621216, "learning_rate": 5.0055809233891425e-06, "loss": 0.0202, "step": 6018 }, { "epoch": 5.496803652968037, "grad_norm": 5.008241176605225, "learning_rate": 5.004566210045663e-06, "loss": 0.0284, "step": 6019 }, { "epoch": 5.497716894977169, "grad_norm": 4.096617221832275, "learning_rate": 5.003551496702183e-06, "loss": 0.034, "step": 6020 }, { "epoch": 5.498630136986302, "grad_norm": 0.026719363406300545, "learning_rate": 5.002536783358702e-06, "loss": 0.0002, "step": 6021 }, { "epoch": 5.499543378995433, "grad_norm": 0.5917346477508545, "learning_rate": 5.001522070015221e-06, "loss": 0.0048, "step": 6022 }, { "epoch": 5.500456621004567, "grad_norm": 0.5987708568572998, "learning_rate": 5.000507356671741e-06, "loss": 0.0033, "step": 6023 }, { "epoch": 5.501369863013698, "grad_norm": 1.9631105661392212, "learning_rate": 4.999492643328261e-06, "loss": 0.0146, "step": 6024 }, { "epoch": 5.502283105022831, "grad_norm": 0.3385750651359558, "learning_rate": 4.9984779299847795e-06, "loss": 0.0022, "step": 6025 }, { "epoch": 5.503196347031963, "grad_norm": 35.75171661376953, "learning_rate": 4.997463216641299e-06, "loss": 0.2267, "step": 6026 }, { "epoch": 5.504109589041096, "grad_norm": 0.41143450140953064, "learning_rate": 4.996448503297819e-06, "loss": 0.0025, "step": 6027 }, { "epoch": 5.505022831050228, "grad_norm": 0.3413510024547577, "learning_rate": 4.995433789954338e-06, "loss": 0.0019, "step": 6028 }, { "epoch": 5.5059360730593605, "grad_norm": 8.524327278137207, "learning_rate": 4.994419076610858e-06, "loss": 0.0482, "step": 6029 }, { "epoch": 5.506849315068493, "grad_norm": 0.14519503712654114, "learning_rate": 4.993404363267377e-06, "loss": 0.0009, "step": 6030 }, { "epoch": 5.507762557077625, "grad_norm": 63.506195068359375, "learning_rate": 4.992389649923897e-06, "loss": 0.4742, "step": 6031 }, { "epoch": 5.508675799086758, "grad_norm": 3.08359694480896, "learning_rate": 4.9913749365804165e-06, "loss": 0.0167, "step": 6032 }, { "epoch": 5.50958904109589, "grad_norm": 1.9651119709014893, "learning_rate": 4.990360223236936e-06, "loss": 0.0145, "step": 6033 }, { "epoch": 5.510502283105023, "grad_norm": 95.97836303710938, "learning_rate": 4.989345509893456e-06, "loss": 0.6665, "step": 6034 }, { "epoch": 5.511415525114155, "grad_norm": 6.668697357177734, "learning_rate": 4.988330796549975e-06, "loss": 0.0307, "step": 6035 }, { "epoch": 5.512328767123288, "grad_norm": 0.23131932318210602, "learning_rate": 4.9873160832064944e-06, "loss": 0.0013, "step": 6036 }, { "epoch": 5.51324200913242, "grad_norm": 3.6406102180480957, "learning_rate": 4.986301369863014e-06, "loss": 0.0203, "step": 6037 }, { "epoch": 5.514155251141553, "grad_norm": 2.5947978496551514, "learning_rate": 4.985286656519534e-06, "loss": 0.0151, "step": 6038 }, { "epoch": 5.515068493150685, "grad_norm": 0.41330236196517944, "learning_rate": 4.9842719431760535e-06, "loss": 0.0023, "step": 6039 }, { "epoch": 5.5159817351598175, "grad_norm": 0.9853816032409668, "learning_rate": 4.983257229832572e-06, "loss": 0.0052, "step": 6040 }, { "epoch": 5.51689497716895, "grad_norm": 2.6174049377441406, "learning_rate": 4.982242516489092e-06, "loss": 0.014, "step": 6041 }, { "epoch": 5.517808219178082, "grad_norm": 0.3446061313152313, "learning_rate": 4.981227803145612e-06, "loss": 0.0017, "step": 6042 }, { "epoch": 5.518721461187215, "grad_norm": 9.94777774810791, "learning_rate": 4.9802130898021314e-06, "loss": 0.0573, "step": 6043 }, { "epoch": 5.519634703196347, "grad_norm": 0.35152459144592285, "learning_rate": 4.979198376458651e-06, "loss": 0.0025, "step": 6044 }, { "epoch": 5.52054794520548, "grad_norm": 0.3122032880783081, "learning_rate": 4.97818366311517e-06, "loss": 0.0012, "step": 6045 }, { "epoch": 5.521461187214612, "grad_norm": 0.685538649559021, "learning_rate": 4.97716894977169e-06, "loss": 0.0024, "step": 6046 }, { "epoch": 5.522374429223745, "grad_norm": 7.05784797668457, "learning_rate": 4.976154236428209e-06, "loss": 0.0547, "step": 6047 }, { "epoch": 5.523287671232877, "grad_norm": 20.839466094970703, "learning_rate": 4.975139523084729e-06, "loss": 0.0856, "step": 6048 }, { "epoch": 5.524200913242009, "grad_norm": 3.7842037677764893, "learning_rate": 4.974124809741249e-06, "loss": 0.0263, "step": 6049 }, { "epoch": 5.525114155251142, "grad_norm": 84.28697204589844, "learning_rate": 4.973110096397768e-06, "loss": 0.8805, "step": 6050 }, { "epoch": 5.526027397260274, "grad_norm": 4.22861909866333, "learning_rate": 4.972095383054287e-06, "loss": 0.025, "step": 6051 }, { "epoch": 5.526940639269406, "grad_norm": 29.694198608398438, "learning_rate": 4.971080669710807e-06, "loss": 0.2456, "step": 6052 }, { "epoch": 5.5278538812785385, "grad_norm": 2.46893048286438, "learning_rate": 4.970065956367327e-06, "loss": 0.0116, "step": 6053 }, { "epoch": 5.528767123287671, "grad_norm": 5.582386016845703, "learning_rate": 4.969051243023846e-06, "loss": 0.0441, "step": 6054 }, { "epoch": 5.529680365296803, "grad_norm": 4.598155498504639, "learning_rate": 4.968036529680366e-06, "loss": 0.026, "step": 6055 }, { "epoch": 5.530593607305936, "grad_norm": 101.64908599853516, "learning_rate": 4.967021816336885e-06, "loss": 1.3945, "step": 6056 }, { "epoch": 5.531506849315068, "grad_norm": 0.8933621644973755, "learning_rate": 4.966007102993405e-06, "loss": 0.0047, "step": 6057 }, { "epoch": 5.532420091324201, "grad_norm": 0.18385787308216095, "learning_rate": 4.964992389649924e-06, "loss": 0.0013, "step": 6058 }, { "epoch": 5.533333333333333, "grad_norm": 0.5982921719551086, "learning_rate": 4.963977676306444e-06, "loss": 0.0039, "step": 6059 }, { "epoch": 5.534246575342466, "grad_norm": 0.38147759437561035, "learning_rate": 4.962962962962964e-06, "loss": 0.0013, "step": 6060 }, { "epoch": 5.535159817351598, "grad_norm": 10.757244110107422, "learning_rate": 4.9619482496194825e-06, "loss": 0.0532, "step": 6061 }, { "epoch": 5.536073059360731, "grad_norm": 0.41210439801216125, "learning_rate": 4.960933536276002e-06, "loss": 0.0022, "step": 6062 }, { "epoch": 5.536986301369863, "grad_norm": 9.484064102172852, "learning_rate": 4.959918822932522e-06, "loss": 0.0598, "step": 6063 }, { "epoch": 5.5378995433789955, "grad_norm": 2.010976552963257, "learning_rate": 4.958904109589042e-06, "loss": 0.0105, "step": 6064 }, { "epoch": 5.538812785388128, "grad_norm": 0.09200466424226761, "learning_rate": 4.957889396245561e-06, "loss": 0.0006, "step": 6065 }, { "epoch": 5.53972602739726, "grad_norm": 14.857340812683105, "learning_rate": 4.95687468290208e-06, "loss": 0.0765, "step": 6066 }, { "epoch": 5.540639269406393, "grad_norm": 0.07945464551448822, "learning_rate": 4.9558599695586e-06, "loss": 0.0005, "step": 6067 }, { "epoch": 5.541552511415525, "grad_norm": 6.873791217803955, "learning_rate": 4.9548452562151195e-06, "loss": 0.0396, "step": 6068 }, { "epoch": 5.542465753424658, "grad_norm": 0.2561158239841461, "learning_rate": 4.953830542871639e-06, "loss": 0.0014, "step": 6069 }, { "epoch": 5.54337899543379, "grad_norm": 0.6617558598518372, "learning_rate": 4.952815829528159e-06, "loss": 0.0062, "step": 6070 }, { "epoch": 5.544292237442923, "grad_norm": 0.618354082107544, "learning_rate": 4.951801116184678e-06, "loss": 0.0041, "step": 6071 }, { "epoch": 5.545205479452055, "grad_norm": 0.7856453657150269, "learning_rate": 4.9507864028411974e-06, "loss": 0.0044, "step": 6072 }, { "epoch": 5.546118721461188, "grad_norm": 0.028732355684041977, "learning_rate": 4.949771689497717e-06, "loss": 0.0001, "step": 6073 }, { "epoch": 5.54703196347032, "grad_norm": 0.6587498784065247, "learning_rate": 4.948756976154237e-06, "loss": 0.0038, "step": 6074 }, { "epoch": 5.5479452054794525, "grad_norm": 1.256264090538025, "learning_rate": 4.9477422628107565e-06, "loss": 0.0074, "step": 6075 }, { "epoch": 5.548858447488584, "grad_norm": 7.670838356018066, "learning_rate": 4.946727549467275e-06, "loss": 0.0292, "step": 6076 }, { "epoch": 5.549771689497717, "grad_norm": 0.02675800770521164, "learning_rate": 4.945712836123796e-06, "loss": 0.0002, "step": 6077 }, { "epoch": 5.550684931506849, "grad_norm": 7.153369903564453, "learning_rate": 4.944698122780315e-06, "loss": 0.022, "step": 6078 }, { "epoch": 5.551598173515981, "grad_norm": 20.043472290039062, "learning_rate": 4.9436834094368344e-06, "loss": 0.1296, "step": 6079 }, { "epoch": 5.552511415525114, "grad_norm": 3.8007893562316895, "learning_rate": 4.942668696093354e-06, "loss": 0.0145, "step": 6080 }, { "epoch": 5.553424657534246, "grad_norm": 50.14711380004883, "learning_rate": 4.941653982749873e-06, "loss": 0.2902, "step": 6081 }, { "epoch": 5.554337899543379, "grad_norm": 12.959783554077148, "learning_rate": 4.9406392694063935e-06, "loss": 0.0793, "step": 6082 }, { "epoch": 5.555251141552511, "grad_norm": 0.5840331315994263, "learning_rate": 4.939624556062912e-06, "loss": 0.0037, "step": 6083 }, { "epoch": 5.556164383561644, "grad_norm": 17.135440826416016, "learning_rate": 4.938609842719432e-06, "loss": 0.0944, "step": 6084 }, { "epoch": 5.557077625570776, "grad_norm": 1.350685715675354, "learning_rate": 4.937595129375952e-06, "loss": 0.0096, "step": 6085 }, { "epoch": 5.557990867579909, "grad_norm": 0.2466716170310974, "learning_rate": 4.9365804160324706e-06, "loss": 0.0014, "step": 6086 }, { "epoch": 5.558904109589041, "grad_norm": 6.604089736938477, "learning_rate": 4.935565702688991e-06, "loss": 0.0372, "step": 6087 }, { "epoch": 5.5598173515981735, "grad_norm": 0.19453898072242737, "learning_rate": 4.93455098934551e-06, "loss": 0.0012, "step": 6088 }, { "epoch": 5.560730593607306, "grad_norm": 7.354029178619385, "learning_rate": 4.93353627600203e-06, "loss": 0.0569, "step": 6089 }, { "epoch": 5.561643835616438, "grad_norm": 35.52647399902344, "learning_rate": 4.932521562658549e-06, "loss": 0.2668, "step": 6090 }, { "epoch": 5.562557077625571, "grad_norm": 0.18080563843250275, "learning_rate": 4.931506849315069e-06, "loss": 0.0011, "step": 6091 }, { "epoch": 5.563470319634703, "grad_norm": 2.5130841732025146, "learning_rate": 4.930492135971589e-06, "loss": 0.0178, "step": 6092 }, { "epoch": 5.564383561643836, "grad_norm": 42.69486618041992, "learning_rate": 4.9294774226281076e-06, "loss": 0.4065, "step": 6093 }, { "epoch": 5.565296803652968, "grad_norm": 0.7254257202148438, "learning_rate": 4.928462709284627e-06, "loss": 0.0043, "step": 6094 }, { "epoch": 5.566210045662101, "grad_norm": 2.260483741760254, "learning_rate": 4.927447995941147e-06, "loss": 0.0157, "step": 6095 }, { "epoch": 5.567123287671233, "grad_norm": 2.5464377403259277, "learning_rate": 4.926433282597667e-06, "loss": 0.0152, "step": 6096 }, { "epoch": 5.5680365296803656, "grad_norm": 2.4220449924468994, "learning_rate": 4.925418569254186e-06, "loss": 0.0168, "step": 6097 }, { "epoch": 5.568949771689498, "grad_norm": 1.1373919248580933, "learning_rate": 4.924403855910705e-06, "loss": 0.0056, "step": 6098 }, { "epoch": 5.5698630136986305, "grad_norm": 10.39106273651123, "learning_rate": 4.923389142567226e-06, "loss": 0.1152, "step": 6099 }, { "epoch": 5.570776255707763, "grad_norm": 4.580247402191162, "learning_rate": 4.9223744292237446e-06, "loss": 0.0213, "step": 6100 }, { "epoch": 5.5716894977168945, "grad_norm": 2.172813892364502, "learning_rate": 4.921359715880264e-06, "loss": 0.01, "step": 6101 }, { "epoch": 5.572602739726028, "grad_norm": 0.0029901592060923576, "learning_rate": 4.920345002536784e-06, "loss": 0.0, "step": 6102 }, { "epoch": 5.573515981735159, "grad_norm": 1.6811548471450806, "learning_rate": 4.919330289193303e-06, "loss": 0.0115, "step": 6103 }, { "epoch": 5.574429223744293, "grad_norm": 2.8103115558624268, "learning_rate": 4.918315575849823e-06, "loss": 0.0129, "step": 6104 }, { "epoch": 5.575342465753424, "grad_norm": 0.951974630355835, "learning_rate": 4.917300862506342e-06, "loss": 0.0074, "step": 6105 }, { "epoch": 5.576255707762557, "grad_norm": 6.928492069244385, "learning_rate": 4.916286149162862e-06, "loss": 0.0311, "step": 6106 }, { "epoch": 5.577168949771689, "grad_norm": 15.297346115112305, "learning_rate": 4.9152714358193816e-06, "loss": 0.1196, "step": 6107 }, { "epoch": 5.578082191780822, "grad_norm": 0.3687571883201599, "learning_rate": 4.9142567224759e-06, "loss": 0.0024, "step": 6108 }, { "epoch": 5.578995433789954, "grad_norm": 1.6310675144195557, "learning_rate": 4.913242009132421e-06, "loss": 0.0115, "step": 6109 }, { "epoch": 5.579908675799087, "grad_norm": 0.1376333236694336, "learning_rate": 4.91222729578894e-06, "loss": 0.0012, "step": 6110 }, { "epoch": 5.580821917808219, "grad_norm": 3.5958971977233887, "learning_rate": 4.9112125824454595e-06, "loss": 0.0218, "step": 6111 }, { "epoch": 5.5817351598173515, "grad_norm": 0.8676282167434692, "learning_rate": 4.910197869101979e-06, "loss": 0.0056, "step": 6112 }, { "epoch": 5.582648401826484, "grad_norm": 0.055713001638650894, "learning_rate": 4.909183155758499e-06, "loss": 0.0003, "step": 6113 }, { "epoch": 5.583561643835616, "grad_norm": 0.2782297134399414, "learning_rate": 4.9081684424150186e-06, "loss": 0.002, "step": 6114 }, { "epoch": 5.584474885844749, "grad_norm": 1.2485228776931763, "learning_rate": 4.907153729071537e-06, "loss": 0.0069, "step": 6115 }, { "epoch": 5.585388127853881, "grad_norm": 1.7225834131240845, "learning_rate": 4.906139015728057e-06, "loss": 0.0102, "step": 6116 }, { "epoch": 5.586301369863014, "grad_norm": 3.1220850944519043, "learning_rate": 4.905124302384577e-06, "loss": 0.0209, "step": 6117 }, { "epoch": 5.587214611872146, "grad_norm": 0.4145631790161133, "learning_rate": 4.9041095890410965e-06, "loss": 0.0022, "step": 6118 }, { "epoch": 5.588127853881279, "grad_norm": 0.8038853406906128, "learning_rate": 4.903094875697616e-06, "loss": 0.005, "step": 6119 }, { "epoch": 5.589041095890411, "grad_norm": 3.2765917778015137, "learning_rate": 4.902080162354135e-06, "loss": 0.0149, "step": 6120 }, { "epoch": 5.5899543378995435, "grad_norm": 77.67150115966797, "learning_rate": 4.901065449010655e-06, "loss": 0.3532, "step": 6121 }, { "epoch": 5.590867579908676, "grad_norm": 92.50324249267578, "learning_rate": 4.900050735667174e-06, "loss": 1.2867, "step": 6122 }, { "epoch": 5.5917808219178085, "grad_norm": 6.210866451263428, "learning_rate": 4.899036022323694e-06, "loss": 0.011, "step": 6123 }, { "epoch": 5.592694063926941, "grad_norm": 104.68810272216797, "learning_rate": 4.898021308980214e-06, "loss": 1.4288, "step": 6124 }, { "epoch": 5.593607305936073, "grad_norm": 1.3776658773422241, "learning_rate": 4.897006595636733e-06, "loss": 0.0067, "step": 6125 }, { "epoch": 5.594520547945206, "grad_norm": 0.3216652572154999, "learning_rate": 4.895991882293252e-06, "loss": 0.0017, "step": 6126 }, { "epoch": 5.595433789954338, "grad_norm": 0.5116814970970154, "learning_rate": 4.894977168949772e-06, "loss": 0.0023, "step": 6127 }, { "epoch": 5.59634703196347, "grad_norm": 1.963927984237671, "learning_rate": 4.893962455606292e-06, "loss": 0.015, "step": 6128 }, { "epoch": 5.597260273972603, "grad_norm": 1.935160517692566, "learning_rate": 4.892947742262811e-06, "loss": 0.0108, "step": 6129 }, { "epoch": 5.598173515981735, "grad_norm": 4.212774276733398, "learning_rate": 4.89193302891933e-06, "loss": 0.0323, "step": 6130 }, { "epoch": 5.599086757990867, "grad_norm": 1.4726039171218872, "learning_rate": 4.89091831557585e-06, "loss": 0.0083, "step": 6131 }, { "epoch": 5.6, "grad_norm": 0.7858176827430725, "learning_rate": 4.88990360223237e-06, "loss": 0.0034, "step": 6132 }, { "epoch": 5.600913242009132, "grad_norm": 1.8986626863479614, "learning_rate": 4.888888888888889e-06, "loss": 0.014, "step": 6133 }, { "epoch": 5.6018264840182646, "grad_norm": 0.7526654005050659, "learning_rate": 4.887874175545409e-06, "loss": 0.0043, "step": 6134 }, { "epoch": 5.602739726027397, "grad_norm": 0.2075674682855606, "learning_rate": 4.886859462201929e-06, "loss": 0.0015, "step": 6135 }, { "epoch": 5.6036529680365295, "grad_norm": 0.7174786329269409, "learning_rate": 4.8858447488584476e-06, "loss": 0.0032, "step": 6136 }, { "epoch": 5.604566210045662, "grad_norm": 0.05975215509533882, "learning_rate": 4.884830035514967e-06, "loss": 0.0004, "step": 6137 }, { "epoch": 5.605479452054794, "grad_norm": 0.06937610357999802, "learning_rate": 4.883815322171487e-06, "loss": 0.0006, "step": 6138 }, { "epoch": 5.606392694063927, "grad_norm": 0.8531103730201721, "learning_rate": 4.882800608828007e-06, "loss": 0.0038, "step": 6139 }, { "epoch": 5.607305936073059, "grad_norm": 0.12605181336402893, "learning_rate": 4.881785895484526e-06, "loss": 0.0004, "step": 6140 }, { "epoch": 5.608219178082192, "grad_norm": 0.020141705870628357, "learning_rate": 4.880771182141045e-06, "loss": 0.0001, "step": 6141 }, { "epoch": 5.609132420091324, "grad_norm": 109.27742767333984, "learning_rate": 4.879756468797565e-06, "loss": 3.7577, "step": 6142 }, { "epoch": 5.610045662100457, "grad_norm": 0.07367727905511856, "learning_rate": 4.8787417554540846e-06, "loss": 0.0004, "step": 6143 }, { "epoch": 5.610958904109589, "grad_norm": 3.257004499435425, "learning_rate": 4.877727042110604e-06, "loss": 0.0186, "step": 6144 }, { "epoch": 5.6118721461187215, "grad_norm": 0.5485288500785828, "learning_rate": 4.876712328767124e-06, "loss": 0.0023, "step": 6145 }, { "epoch": 5.612785388127854, "grad_norm": 0.8767184615135193, "learning_rate": 4.875697615423643e-06, "loss": 0.0047, "step": 6146 }, { "epoch": 5.6136986301369864, "grad_norm": 0.24076512455940247, "learning_rate": 4.8746829020801625e-06, "loss": 0.0014, "step": 6147 }, { "epoch": 5.614611872146119, "grad_norm": 0.13872762024402618, "learning_rate": 4.873668188736682e-06, "loss": 0.0009, "step": 6148 }, { "epoch": 5.615525114155251, "grad_norm": 35.34217071533203, "learning_rate": 4.872653475393202e-06, "loss": 0.2601, "step": 6149 }, { "epoch": 5.616438356164384, "grad_norm": 16.82062530517578, "learning_rate": 4.8716387620497216e-06, "loss": 0.1056, "step": 6150 }, { "epoch": 5.617351598173516, "grad_norm": 11.065567970275879, "learning_rate": 4.87062404870624e-06, "loss": 0.0677, "step": 6151 }, { "epoch": 5.618264840182649, "grad_norm": 0.3554242253303528, "learning_rate": 4.86960933536276e-06, "loss": 0.0023, "step": 6152 }, { "epoch": 5.619178082191781, "grad_norm": 50.08404541015625, "learning_rate": 4.86859462201928e-06, "loss": 0.3309, "step": 6153 }, { "epoch": 5.620091324200914, "grad_norm": 0.813753604888916, "learning_rate": 4.8675799086757995e-06, "loss": 0.0032, "step": 6154 }, { "epoch": 5.621004566210045, "grad_norm": 4.135814189910889, "learning_rate": 4.866565195332319e-06, "loss": 0.0134, "step": 6155 }, { "epoch": 5.6219178082191785, "grad_norm": 0.14140942692756653, "learning_rate": 4.865550481988838e-06, "loss": 0.0009, "step": 6156 }, { "epoch": 5.62283105022831, "grad_norm": 0.4618859589099884, "learning_rate": 4.8645357686453585e-06, "loss": 0.0026, "step": 6157 }, { "epoch": 5.6237442922374425, "grad_norm": 2.8089828491210938, "learning_rate": 4.863521055301877e-06, "loss": 0.018, "step": 6158 }, { "epoch": 5.624657534246575, "grad_norm": 16.923946380615234, "learning_rate": 4.862506341958397e-06, "loss": 0.0795, "step": 6159 }, { "epoch": 5.6255707762557075, "grad_norm": 102.10325622558594, "learning_rate": 4.861491628614917e-06, "loss": 2.1001, "step": 6160 }, { "epoch": 5.62648401826484, "grad_norm": 2.691350221633911, "learning_rate": 4.860476915271436e-06, "loss": 0.0118, "step": 6161 }, { "epoch": 5.627397260273972, "grad_norm": 4.104260444641113, "learning_rate": 4.859462201927956e-06, "loss": 0.0256, "step": 6162 }, { "epoch": 5.628310502283105, "grad_norm": 0.47548025846481323, "learning_rate": 4.858447488584475e-06, "loss": 0.0028, "step": 6163 }, { "epoch": 5.629223744292237, "grad_norm": 25.637245178222656, "learning_rate": 4.857432775240995e-06, "loss": 0.095, "step": 6164 }, { "epoch": 5.63013698630137, "grad_norm": 0.4121074080467224, "learning_rate": 4.856418061897514e-06, "loss": 0.0027, "step": 6165 }, { "epoch": 5.631050228310502, "grad_norm": 0.5800380706787109, "learning_rate": 4.855403348554033e-06, "loss": 0.0028, "step": 6166 }, { "epoch": 5.631963470319635, "grad_norm": 0.21650485694408417, "learning_rate": 4.854388635210554e-06, "loss": 0.0013, "step": 6167 }, { "epoch": 5.632876712328767, "grad_norm": 0.927836537361145, "learning_rate": 4.853373921867073e-06, "loss": 0.0066, "step": 6168 }, { "epoch": 5.6337899543378995, "grad_norm": 0.903235673904419, "learning_rate": 4.852359208523592e-06, "loss": 0.0045, "step": 6169 }, { "epoch": 5.634703196347032, "grad_norm": 5.055257320404053, "learning_rate": 4.851344495180112e-06, "loss": 0.027, "step": 6170 }, { "epoch": 5.635616438356164, "grad_norm": 3.457442283630371, "learning_rate": 4.850329781836632e-06, "loss": 0.025, "step": 6171 }, { "epoch": 5.636529680365297, "grad_norm": 0.7583674788475037, "learning_rate": 4.849315068493151e-06, "loss": 0.0043, "step": 6172 }, { "epoch": 5.637442922374429, "grad_norm": 3.0032131671905518, "learning_rate": 4.84830035514967e-06, "loss": 0.0146, "step": 6173 }, { "epoch": 5.638356164383562, "grad_norm": 21.649873733520508, "learning_rate": 4.84728564180619e-06, "loss": 0.1837, "step": 6174 }, { "epoch": 5.639269406392694, "grad_norm": 1.033684253692627, "learning_rate": 4.84627092846271e-06, "loss": 0.0105, "step": 6175 }, { "epoch": 5.640182648401827, "grad_norm": 1.110152006149292, "learning_rate": 4.845256215119229e-06, "loss": 0.0014, "step": 6176 }, { "epoch": 5.641095890410959, "grad_norm": 1.2725520133972168, "learning_rate": 4.844241501775749e-06, "loss": 0.0055, "step": 6177 }, { "epoch": 5.642009132420092, "grad_norm": 42.964290618896484, "learning_rate": 4.843226788432268e-06, "loss": 0.4583, "step": 6178 }, { "epoch": 5.642922374429224, "grad_norm": 1.3662059307098389, "learning_rate": 4.842212075088788e-06, "loss": 0.008, "step": 6179 }, { "epoch": 5.6438356164383565, "grad_norm": 10.87923812866211, "learning_rate": 4.841197361745307e-06, "loss": 0.088, "step": 6180 }, { "epoch": 5.644748858447489, "grad_norm": 11.260472297668457, "learning_rate": 4.840182648401827e-06, "loss": 0.0816, "step": 6181 }, { "epoch": 5.6456621004566205, "grad_norm": 4.187982082366943, "learning_rate": 4.839167935058347e-06, "loss": 0.0245, "step": 6182 }, { "epoch": 5.646575342465754, "grad_norm": 0.11607375741004944, "learning_rate": 4.8381532217148655e-06, "loss": 0.0008, "step": 6183 }, { "epoch": 5.647488584474885, "grad_norm": 2.155867338180542, "learning_rate": 4.837138508371386e-06, "loss": 0.0104, "step": 6184 }, { "epoch": 5.648401826484018, "grad_norm": 5.364386081695557, "learning_rate": 4.836123795027905e-06, "loss": 0.02, "step": 6185 }, { "epoch": 5.64931506849315, "grad_norm": 0.16326814889907837, "learning_rate": 4.8351090816844245e-06, "loss": 0.0008, "step": 6186 }, { "epoch": 5.650228310502283, "grad_norm": 104.10813903808594, "learning_rate": 4.834094368340944e-06, "loss": 3.3451, "step": 6187 }, { "epoch": 5.651141552511415, "grad_norm": 41.531227111816406, "learning_rate": 4.833079654997463e-06, "loss": 0.1541, "step": 6188 }, { "epoch": 5.652054794520548, "grad_norm": 1.3526393175125122, "learning_rate": 4.832064941653984e-06, "loss": 0.0082, "step": 6189 }, { "epoch": 5.65296803652968, "grad_norm": 2.504042148590088, "learning_rate": 4.8310502283105025e-06, "loss": 0.0169, "step": 6190 }, { "epoch": 5.653881278538813, "grad_norm": 6.088899612426758, "learning_rate": 4.830035514967022e-06, "loss": 0.0293, "step": 6191 }, { "epoch": 5.654794520547945, "grad_norm": 22.959470748901367, "learning_rate": 4.829020801623542e-06, "loss": 0.1275, "step": 6192 }, { "epoch": 5.6557077625570775, "grad_norm": 2.9957406520843506, "learning_rate": 4.8280060882800615e-06, "loss": 0.0154, "step": 6193 }, { "epoch": 5.65662100456621, "grad_norm": 0.8031895756721497, "learning_rate": 4.826991374936581e-06, "loss": 0.0051, "step": 6194 }, { "epoch": 5.657534246575342, "grad_norm": 3.1586432456970215, "learning_rate": 4.8259766615931e-06, "loss": 0.0253, "step": 6195 }, { "epoch": 5.658447488584475, "grad_norm": 0.7300640940666199, "learning_rate": 4.82496194824962e-06, "loss": 0.0033, "step": 6196 }, { "epoch": 5.659360730593607, "grad_norm": 1.1526777744293213, "learning_rate": 4.8239472349061395e-06, "loss": 0.0069, "step": 6197 }, { "epoch": 5.66027397260274, "grad_norm": 0.9369051456451416, "learning_rate": 4.822932521562659e-06, "loss": 0.0075, "step": 6198 }, { "epoch": 5.661187214611872, "grad_norm": 0.03301450237631798, "learning_rate": 4.821917808219179e-06, "loss": 0.0003, "step": 6199 }, { "epoch": 5.662100456621005, "grad_norm": 4.000277042388916, "learning_rate": 4.820903094875698e-06, "loss": 0.0141, "step": 6200 }, { "epoch": 5.663013698630137, "grad_norm": 4.145960807800293, "learning_rate": 4.819888381532217e-06, "loss": 0.0203, "step": 6201 }, { "epoch": 5.66392694063927, "grad_norm": 1.6888585090637207, "learning_rate": 4.818873668188737e-06, "loss": 0.0121, "step": 6202 }, { "epoch": 5.664840182648402, "grad_norm": 28.332599639892578, "learning_rate": 4.817858954845257e-06, "loss": 0.1644, "step": 6203 }, { "epoch": 5.6657534246575345, "grad_norm": 7.910858154296875, "learning_rate": 4.8168442415017765e-06, "loss": 0.0574, "step": 6204 }, { "epoch": 5.666666666666667, "grad_norm": 3.5520517826080322, "learning_rate": 4.815829528158295e-06, "loss": 0.0215, "step": 6205 }, { "epoch": 5.667579908675799, "grad_norm": 1.4198368787765503, "learning_rate": 4.814814814814815e-06, "loss": 0.008, "step": 6206 }, { "epoch": 5.668493150684932, "grad_norm": 24.98106575012207, "learning_rate": 4.813800101471335e-06, "loss": 0.1132, "step": 6207 }, { "epoch": 5.669406392694064, "grad_norm": 4.296924114227295, "learning_rate": 4.812785388127854e-06, "loss": 0.0197, "step": 6208 }, { "epoch": 5.670319634703196, "grad_norm": 1.0110794305801392, "learning_rate": 4.811770674784374e-06, "loss": 0.0057, "step": 6209 }, { "epoch": 5.671232876712329, "grad_norm": 4.112667560577393, "learning_rate": 4.810755961440893e-06, "loss": 0.0233, "step": 6210 }, { "epoch": 5.672146118721461, "grad_norm": 4.8405280113220215, "learning_rate": 4.809741248097413e-06, "loss": 0.0345, "step": 6211 }, { "epoch": 5.673059360730593, "grad_norm": 0.009853698313236237, "learning_rate": 4.808726534753932e-06, "loss": 0.0001, "step": 6212 }, { "epoch": 5.673972602739726, "grad_norm": 0.11209976673126221, "learning_rate": 4.807711821410452e-06, "loss": 0.0008, "step": 6213 }, { "epoch": 5.674885844748858, "grad_norm": 0.9165375232696533, "learning_rate": 4.806697108066972e-06, "loss": 0.0063, "step": 6214 }, { "epoch": 5.675799086757991, "grad_norm": 2.6582391262054443, "learning_rate": 4.805682394723491e-06, "loss": 0.013, "step": 6215 }, { "epoch": 5.676712328767123, "grad_norm": 22.8302001953125, "learning_rate": 4.80466768138001e-06, "loss": 0.1617, "step": 6216 }, { "epoch": 5.6776255707762555, "grad_norm": 0.20037326216697693, "learning_rate": 4.80365296803653e-06, "loss": 0.0013, "step": 6217 }, { "epoch": 5.678538812785388, "grad_norm": 0.025379525497555733, "learning_rate": 4.80263825469305e-06, "loss": 0.0002, "step": 6218 }, { "epoch": 5.67945205479452, "grad_norm": 1.092864751815796, "learning_rate": 4.801623541349569e-06, "loss": 0.0046, "step": 6219 }, { "epoch": 5.680365296803653, "grad_norm": 3.693505048751831, "learning_rate": 4.800608828006089e-06, "loss": 0.0084, "step": 6220 }, { "epoch": 5.681278538812785, "grad_norm": 0.47139477729797363, "learning_rate": 4.799594114662608e-06, "loss": 0.0029, "step": 6221 }, { "epoch": 5.682191780821918, "grad_norm": 2.187790870666504, "learning_rate": 4.7985794013191275e-06, "loss": 0.0137, "step": 6222 }, { "epoch": 5.68310502283105, "grad_norm": 2.5987701416015625, "learning_rate": 4.797564687975647e-06, "loss": 0.0275, "step": 6223 }, { "epoch": 5.684018264840183, "grad_norm": 0.9644069075584412, "learning_rate": 4.796549974632167e-06, "loss": 0.0045, "step": 6224 }, { "epoch": 5.684931506849315, "grad_norm": 0.7076928019523621, "learning_rate": 4.795535261288687e-06, "loss": 0.0035, "step": 6225 }, { "epoch": 5.685844748858448, "grad_norm": 29.91012954711914, "learning_rate": 4.7945205479452054e-06, "loss": 0.3628, "step": 6226 }, { "epoch": 5.68675799086758, "grad_norm": 16.89063835144043, "learning_rate": 4.793505834601725e-06, "loss": 0.0698, "step": 6227 }, { "epoch": 5.6876712328767125, "grad_norm": 0.08751390129327774, "learning_rate": 4.792491121258245e-06, "loss": 0.0004, "step": 6228 }, { "epoch": 5.688584474885845, "grad_norm": 0.34830015897750854, "learning_rate": 4.7914764079147645e-06, "loss": 0.0021, "step": 6229 }, { "epoch": 5.689497716894977, "grad_norm": 1.267885446548462, "learning_rate": 4.790461694571284e-06, "loss": 0.008, "step": 6230 }, { "epoch": 5.69041095890411, "grad_norm": 0.466118186712265, "learning_rate": 4.789446981227803e-06, "loss": 0.0025, "step": 6231 }, { "epoch": 5.691324200913242, "grad_norm": 2.2260994911193848, "learning_rate": 4.788432267884323e-06, "loss": 0.0111, "step": 6232 }, { "epoch": 5.692237442922375, "grad_norm": 0.9636062979698181, "learning_rate": 4.7874175545408424e-06, "loss": 0.0058, "step": 6233 }, { "epoch": 5.693150684931507, "grad_norm": 0.4402628540992737, "learning_rate": 4.786402841197362e-06, "loss": 0.0027, "step": 6234 }, { "epoch": 5.69406392694064, "grad_norm": 0.5103573203086853, "learning_rate": 4.785388127853882e-06, "loss": 0.0023, "step": 6235 }, { "epoch": 5.694977168949771, "grad_norm": 1.8838998079299927, "learning_rate": 4.784373414510401e-06, "loss": 0.0121, "step": 6236 }, { "epoch": 5.695890410958905, "grad_norm": 4.568314075469971, "learning_rate": 4.783358701166921e-06, "loss": 0.0108, "step": 6237 }, { "epoch": 5.696803652968036, "grad_norm": 2.316103935241699, "learning_rate": 4.78234398782344e-06, "loss": 0.0122, "step": 6238 }, { "epoch": 5.697716894977169, "grad_norm": 1.5877212285995483, "learning_rate": 4.78132927447996e-06, "loss": 0.008, "step": 6239 }, { "epoch": 5.698630136986301, "grad_norm": 0.17084528505802155, "learning_rate": 4.7803145611364794e-06, "loss": 0.0009, "step": 6240 }, { "epoch": 5.6995433789954335, "grad_norm": 0.1953868269920349, "learning_rate": 4.779299847792998e-06, "loss": 0.0017, "step": 6241 }, { "epoch": 5.700456621004566, "grad_norm": 0.08087906241416931, "learning_rate": 4.778285134449519e-06, "loss": 0.0006, "step": 6242 }, { "epoch": 5.701369863013698, "grad_norm": 0.10758601874113083, "learning_rate": 4.777270421106038e-06, "loss": 0.0007, "step": 6243 }, { "epoch": 5.702283105022831, "grad_norm": 1.404019832611084, "learning_rate": 4.776255707762557e-06, "loss": 0.0079, "step": 6244 }, { "epoch": 5.703196347031963, "grad_norm": 0.11240104585886002, "learning_rate": 4.775240994419077e-06, "loss": 0.0006, "step": 6245 }, { "epoch": 5.704109589041096, "grad_norm": 0.5881790518760681, "learning_rate": 4.774226281075596e-06, "loss": 0.0036, "step": 6246 }, { "epoch": 5.705022831050228, "grad_norm": 0.27645936608314514, "learning_rate": 4.7732115677321164e-06, "loss": 0.0017, "step": 6247 }, { "epoch": 5.705936073059361, "grad_norm": 0.8882924914360046, "learning_rate": 4.772196854388635e-06, "loss": 0.0064, "step": 6248 }, { "epoch": 5.706849315068493, "grad_norm": 34.6588249206543, "learning_rate": 4.771182141045155e-06, "loss": 0.2393, "step": 6249 }, { "epoch": 5.707762557077626, "grad_norm": 0.48298683762550354, "learning_rate": 4.770167427701675e-06, "loss": 0.0023, "step": 6250 }, { "epoch": 5.708675799086758, "grad_norm": 0.16966567933559418, "learning_rate": 4.769152714358194e-06, "loss": 0.0013, "step": 6251 }, { "epoch": 5.7095890410958905, "grad_norm": 0.4011862576007843, "learning_rate": 4.768138001014714e-06, "loss": 0.0004, "step": 6252 }, { "epoch": 5.710502283105023, "grad_norm": 0.1455932855606079, "learning_rate": 4.767123287671233e-06, "loss": 0.0012, "step": 6253 }, { "epoch": 5.711415525114155, "grad_norm": 0.5875170230865479, "learning_rate": 4.766108574327753e-06, "loss": 0.0055, "step": 6254 }, { "epoch": 5.712328767123288, "grad_norm": 2.1085102558135986, "learning_rate": 4.765093860984272e-06, "loss": 0.0086, "step": 6255 }, { "epoch": 5.71324200913242, "grad_norm": 0.5850653052330017, "learning_rate": 4.764079147640792e-06, "loss": 0.003, "step": 6256 }, { "epoch": 5.714155251141553, "grad_norm": 0.8544503450393677, "learning_rate": 4.763064434297312e-06, "loss": 0.0043, "step": 6257 }, { "epoch": 5.715068493150685, "grad_norm": 38.30911636352539, "learning_rate": 4.7620497209538305e-06, "loss": 0.1817, "step": 6258 }, { "epoch": 5.715981735159818, "grad_norm": 2.945082664489746, "learning_rate": 4.761035007610351e-06, "loss": 0.0206, "step": 6259 }, { "epoch": 5.71689497716895, "grad_norm": 5.02589225769043, "learning_rate": 4.76002029426687e-06, "loss": 0.0129, "step": 6260 }, { "epoch": 5.717808219178083, "grad_norm": 16.89525604248047, "learning_rate": 4.75900558092339e-06, "loss": 0.0834, "step": 6261 }, { "epoch": 5.718721461187215, "grad_norm": 0.7973673343658447, "learning_rate": 4.757990867579909e-06, "loss": 0.0037, "step": 6262 }, { "epoch": 5.719634703196347, "grad_norm": 0.3176236152648926, "learning_rate": 4.756976154236428e-06, "loss": 0.0021, "step": 6263 }, { "epoch": 5.72054794520548, "grad_norm": 0.6244144439697266, "learning_rate": 4.755961440892949e-06, "loss": 0.0041, "step": 6264 }, { "epoch": 5.7214611872146115, "grad_norm": 0.11139512807130814, "learning_rate": 4.7549467275494675e-06, "loss": 0.0008, "step": 6265 }, { "epoch": 5.722374429223744, "grad_norm": 23.85251808166504, "learning_rate": 4.753932014205987e-06, "loss": 0.2053, "step": 6266 }, { "epoch": 5.723287671232876, "grad_norm": 1.2851462364196777, "learning_rate": 4.752917300862507e-06, "loss": 0.0082, "step": 6267 }, { "epoch": 5.724200913242009, "grad_norm": 0.09720870107412338, "learning_rate": 4.751902587519026e-06, "loss": 0.0006, "step": 6268 }, { "epoch": 5.725114155251141, "grad_norm": 2.3184127807617188, "learning_rate": 4.750887874175546e-06, "loss": 0.0129, "step": 6269 }, { "epoch": 5.726027397260274, "grad_norm": 0.2783288359642029, "learning_rate": 4.749873160832065e-06, "loss": 0.0018, "step": 6270 }, { "epoch": 5.726940639269406, "grad_norm": 2.8722519874572754, "learning_rate": 4.748858447488585e-06, "loss": 0.011, "step": 6271 }, { "epoch": 5.727853881278539, "grad_norm": 88.31060028076172, "learning_rate": 4.7478437341451045e-06, "loss": 0.7126, "step": 6272 }, { "epoch": 5.728767123287671, "grad_norm": 0.14702296257019043, "learning_rate": 4.746829020801624e-06, "loss": 0.0007, "step": 6273 }, { "epoch": 5.729680365296804, "grad_norm": 5.3906779289245605, "learning_rate": 4.745814307458144e-06, "loss": 0.0361, "step": 6274 }, { "epoch": 5.730593607305936, "grad_norm": 6.2851691246032715, "learning_rate": 4.744799594114663e-06, "loss": 0.037, "step": 6275 }, { "epoch": 5.7315068493150685, "grad_norm": 7.106982707977295, "learning_rate": 4.743784880771182e-06, "loss": 0.0421, "step": 6276 }, { "epoch": 5.732420091324201, "grad_norm": 0.44596144556999207, "learning_rate": 4.742770167427702e-06, "loss": 0.0025, "step": 6277 }, { "epoch": 5.733333333333333, "grad_norm": 5.276826858520508, "learning_rate": 4.741755454084222e-06, "loss": 0.0219, "step": 6278 }, { "epoch": 5.734246575342466, "grad_norm": 17.2066707611084, "learning_rate": 4.7407407407407415e-06, "loss": 0.1007, "step": 6279 }, { "epoch": 5.735159817351598, "grad_norm": 0.07500888407230377, "learning_rate": 4.73972602739726e-06, "loss": 0.0004, "step": 6280 }, { "epoch": 5.736073059360731, "grad_norm": 0.04350031167268753, "learning_rate": 4.73871131405378e-06, "loss": 0.0003, "step": 6281 }, { "epoch": 5.736986301369863, "grad_norm": 0.9837618470191956, "learning_rate": 4.7376966007103e-06, "loss": 0.0066, "step": 6282 }, { "epoch": 5.737899543378996, "grad_norm": 3.345592737197876, "learning_rate": 4.736681887366819e-06, "loss": 0.0022, "step": 6283 }, { "epoch": 5.738812785388128, "grad_norm": 0.45819321274757385, "learning_rate": 4.735667174023339e-06, "loss": 0.003, "step": 6284 }, { "epoch": 5.739726027397261, "grad_norm": 0.9919724464416504, "learning_rate": 4.734652460679858e-06, "loss": 0.0039, "step": 6285 }, { "epoch": 5.740639269406393, "grad_norm": 6.1581830978393555, "learning_rate": 4.733637747336378e-06, "loss": 0.0304, "step": 6286 }, { "epoch": 5.7415525114155255, "grad_norm": 2.010908842086792, "learning_rate": 4.732623033992897e-06, "loss": 0.0118, "step": 6287 }, { "epoch": 5.742465753424657, "grad_norm": 0.15976494550704956, "learning_rate": 4.731608320649417e-06, "loss": 0.0012, "step": 6288 }, { "epoch": 5.74337899543379, "grad_norm": 0.05082892253994942, "learning_rate": 4.730593607305937e-06, "loss": 0.0003, "step": 6289 }, { "epoch": 5.744292237442922, "grad_norm": 5.935503005981445, "learning_rate": 4.7295788939624556e-06, "loss": 0.0273, "step": 6290 }, { "epoch": 5.745205479452055, "grad_norm": 11.693597793579102, "learning_rate": 4.728564180618975e-06, "loss": 0.0513, "step": 6291 }, { "epoch": 5.746118721461187, "grad_norm": 0.43922796845436096, "learning_rate": 4.727549467275495e-06, "loss": 0.0022, "step": 6292 }, { "epoch": 5.747031963470319, "grad_norm": 8.90762710571289, "learning_rate": 4.726534753932015e-06, "loss": 0.0333, "step": 6293 }, { "epoch": 5.747945205479452, "grad_norm": 0.16739574074745178, "learning_rate": 4.725520040588534e-06, "loss": 0.0008, "step": 6294 }, { "epoch": 5.748858447488584, "grad_norm": 5.674805641174316, "learning_rate": 4.724505327245054e-06, "loss": 0.0262, "step": 6295 }, { "epoch": 5.749771689497717, "grad_norm": 1.1571812629699707, "learning_rate": 4.723490613901573e-06, "loss": 0.0025, "step": 6296 }, { "epoch": 5.750684931506849, "grad_norm": 28.1802978515625, "learning_rate": 4.7224759005580926e-06, "loss": 0.1538, "step": 6297 }, { "epoch": 5.751598173515982, "grad_norm": 0.5180697441101074, "learning_rate": 4.721461187214612e-06, "loss": 0.0042, "step": 6298 }, { "epoch": 5.752511415525114, "grad_norm": 2.398237705230713, "learning_rate": 4.720446473871132e-06, "loss": 0.0114, "step": 6299 }, { "epoch": 5.7534246575342465, "grad_norm": 1.3492953777313232, "learning_rate": 4.719431760527652e-06, "loss": 0.0062, "step": 6300 }, { "epoch": 5.754337899543379, "grad_norm": 45.645362854003906, "learning_rate": 4.7184170471841705e-06, "loss": 0.2603, "step": 6301 }, { "epoch": 5.755251141552511, "grad_norm": 0.031323231756687164, "learning_rate": 4.71740233384069e-06, "loss": 0.0002, "step": 6302 }, { "epoch": 5.756164383561644, "grad_norm": 0.09658905863761902, "learning_rate": 4.71638762049721e-06, "loss": 0.0005, "step": 6303 }, { "epoch": 5.757077625570776, "grad_norm": 0.7024836540222168, "learning_rate": 4.7153729071537296e-06, "loss": 0.0046, "step": 6304 }, { "epoch": 5.757990867579909, "grad_norm": 0.13303473591804504, "learning_rate": 4.714358193810249e-06, "loss": 0.0006, "step": 6305 }, { "epoch": 5.758904109589041, "grad_norm": 0.20984682440757751, "learning_rate": 4.713343480466768e-06, "loss": 0.0014, "step": 6306 }, { "epoch": 5.759817351598174, "grad_norm": 0.5370704531669617, "learning_rate": 4.712328767123288e-06, "loss": 0.0036, "step": 6307 }, { "epoch": 5.760730593607306, "grad_norm": 0.024560363963246346, "learning_rate": 4.7113140537798075e-06, "loss": 0.0002, "step": 6308 }, { "epoch": 5.761643835616439, "grad_norm": 14.995269775390625, "learning_rate": 4.710299340436327e-06, "loss": 0.1042, "step": 6309 }, { "epoch": 5.762557077625571, "grad_norm": 7.971420764923096, "learning_rate": 4.709284627092847e-06, "loss": 0.0408, "step": 6310 }, { "epoch": 5.7634703196347035, "grad_norm": 0.9049755334854126, "learning_rate": 4.708269913749366e-06, "loss": 0.0034, "step": 6311 }, { "epoch": 5.764383561643836, "grad_norm": 0.022655265405774117, "learning_rate": 4.707255200405885e-06, "loss": 0.0001, "step": 6312 }, { "epoch": 5.765296803652968, "grad_norm": 12.7859525680542, "learning_rate": 4.706240487062405e-06, "loss": 0.0667, "step": 6313 }, { "epoch": 5.766210045662101, "grad_norm": 2.4191956520080566, "learning_rate": 4.705225773718925e-06, "loss": 0.0181, "step": 6314 }, { "epoch": 5.767123287671232, "grad_norm": 0.058614496141672134, "learning_rate": 4.7042110603754445e-06, "loss": 0.0004, "step": 6315 }, { "epoch": 5.768036529680366, "grad_norm": 48.429710388183594, "learning_rate": 4.703196347031963e-06, "loss": 0.1083, "step": 6316 }, { "epoch": 5.768949771689497, "grad_norm": 0.347827672958374, "learning_rate": 4.702181633688484e-06, "loss": 0.002, "step": 6317 }, { "epoch": 5.76986301369863, "grad_norm": 1.9570585489273071, "learning_rate": 4.701166920345003e-06, "loss": 0.0115, "step": 6318 }, { "epoch": 5.770776255707762, "grad_norm": 0.2587395906448364, "learning_rate": 4.700152207001522e-06, "loss": 0.0012, "step": 6319 }, { "epoch": 5.771689497716895, "grad_norm": 0.7158928513526917, "learning_rate": 4.699137493658042e-06, "loss": 0.0031, "step": 6320 }, { "epoch": 5.772602739726027, "grad_norm": 0.00786140188574791, "learning_rate": 4.698122780314561e-06, "loss": 0.0001, "step": 6321 }, { "epoch": 5.77351598173516, "grad_norm": 2.032644510269165, "learning_rate": 4.6971080669710815e-06, "loss": 0.0123, "step": 6322 }, { "epoch": 5.774429223744292, "grad_norm": 0.16541504859924316, "learning_rate": 4.6960933536276e-06, "loss": 0.0012, "step": 6323 }, { "epoch": 5.7753424657534245, "grad_norm": 0.8928655982017517, "learning_rate": 4.69507864028412e-06, "loss": 0.0025, "step": 6324 }, { "epoch": 5.776255707762557, "grad_norm": 10.385451316833496, "learning_rate": 4.69406392694064e-06, "loss": 0.0584, "step": 6325 }, { "epoch": 5.777168949771689, "grad_norm": 1.208068609237671, "learning_rate": 4.6930492135971586e-06, "loss": 0.0064, "step": 6326 }, { "epoch": 5.778082191780822, "grad_norm": 1.8668946027755737, "learning_rate": 4.692034500253679e-06, "loss": 0.0163, "step": 6327 }, { "epoch": 5.778995433789954, "grad_norm": 40.79283905029297, "learning_rate": 4.691019786910198e-06, "loss": 0.3468, "step": 6328 }, { "epoch": 5.779908675799087, "grad_norm": 31.34278678894043, "learning_rate": 4.690005073566718e-06, "loss": 0.1675, "step": 6329 }, { "epoch": 5.780821917808219, "grad_norm": 3.207226276397705, "learning_rate": 4.688990360223237e-06, "loss": 0.019, "step": 6330 }, { "epoch": 5.781735159817352, "grad_norm": 0.2226523756980896, "learning_rate": 4.687975646879756e-06, "loss": 0.0013, "step": 6331 }, { "epoch": 5.782648401826484, "grad_norm": 105.18053436279297, "learning_rate": 4.686960933536277e-06, "loss": 1.0267, "step": 6332 }, { "epoch": 5.7835616438356166, "grad_norm": 6.630535125732422, "learning_rate": 4.6859462201927956e-06, "loss": 0.055, "step": 6333 }, { "epoch": 5.784474885844749, "grad_norm": 2.4936561584472656, "learning_rate": 4.684931506849315e-06, "loss": 0.015, "step": 6334 }, { "epoch": 5.7853881278538815, "grad_norm": 0.044303543865680695, "learning_rate": 4.683916793505835e-06, "loss": 0.0002, "step": 6335 }, { "epoch": 5.786301369863014, "grad_norm": 6.220585823059082, "learning_rate": 4.682902080162355e-06, "loss": 0.0514, "step": 6336 }, { "epoch": 5.787214611872146, "grad_norm": 0.44606295228004456, "learning_rate": 4.681887366818874e-06, "loss": 0.0026, "step": 6337 }, { "epoch": 5.788127853881279, "grad_norm": 1.8334856033325195, "learning_rate": 4.680872653475393e-06, "loss": 0.0093, "step": 6338 }, { "epoch": 5.789041095890411, "grad_norm": 0.2066878229379654, "learning_rate": 4.679857940131914e-06, "loss": 0.0016, "step": 6339 }, { "epoch": 5.789954337899544, "grad_norm": 86.4130630493164, "learning_rate": 4.6788432267884326e-06, "loss": 0.5672, "step": 6340 }, { "epoch": 5.790867579908676, "grad_norm": 6.471952438354492, "learning_rate": 4.677828513444952e-06, "loss": 0.0344, "step": 6341 }, { "epoch": 5.791780821917808, "grad_norm": 3.9239580631256104, "learning_rate": 4.676813800101472e-06, "loss": 0.0157, "step": 6342 }, { "epoch": 5.792694063926941, "grad_norm": 1.3044757843017578, "learning_rate": 4.675799086757991e-06, "loss": 0.0068, "step": 6343 }, { "epoch": 5.793607305936073, "grad_norm": 1.0059318542480469, "learning_rate": 4.674784373414511e-06, "loss": 0.0079, "step": 6344 }, { "epoch": 5.794520547945205, "grad_norm": 0.20521774888038635, "learning_rate": 4.67376966007103e-06, "loss": 0.0011, "step": 6345 }, { "epoch": 5.7954337899543376, "grad_norm": 0.2760332524776459, "learning_rate": 4.67275494672755e-06, "loss": 0.002, "step": 6346 }, { "epoch": 5.79634703196347, "grad_norm": 1.5777721405029297, "learning_rate": 4.6717402333840695e-06, "loss": 0.0128, "step": 6347 }, { "epoch": 5.7972602739726025, "grad_norm": 0.6432254910469055, "learning_rate": 4.670725520040588e-06, "loss": 0.0031, "step": 6348 }, { "epoch": 5.798173515981735, "grad_norm": 0.03676020726561546, "learning_rate": 4.669710806697109e-06, "loss": 0.0002, "step": 6349 }, { "epoch": 5.799086757990867, "grad_norm": 0.10407796502113342, "learning_rate": 4.668696093353628e-06, "loss": 0.0006, "step": 6350 }, { "epoch": 5.8, "grad_norm": 3.697113513946533, "learning_rate": 4.6676813800101475e-06, "loss": 0.0242, "step": 6351 }, { "epoch": 5.800913242009132, "grad_norm": 0.9099736213684082, "learning_rate": 4.666666666666667e-06, "loss": 0.0053, "step": 6352 }, { "epoch": 5.801826484018265, "grad_norm": 8.32662582397461, "learning_rate": 4.665651953323187e-06, "loss": 0.0464, "step": 6353 }, { "epoch": 5.802739726027397, "grad_norm": 0.10941244661808014, "learning_rate": 4.6646372399797065e-06, "loss": 0.0008, "step": 6354 }, { "epoch": 5.80365296803653, "grad_norm": 0.28298577666282654, "learning_rate": 4.663622526636225e-06, "loss": 0.0017, "step": 6355 }, { "epoch": 5.804566210045662, "grad_norm": 0.6005351543426514, "learning_rate": 4.662607813292745e-06, "loss": 0.003, "step": 6356 }, { "epoch": 5.8054794520547945, "grad_norm": 55.72584915161133, "learning_rate": 4.661593099949265e-06, "loss": 0.2825, "step": 6357 }, { "epoch": 5.806392694063927, "grad_norm": 0.5280293226242065, "learning_rate": 4.6605783866057845e-06, "loss": 0.0027, "step": 6358 }, { "epoch": 5.8073059360730594, "grad_norm": 2.3847436904907227, "learning_rate": 4.659563673262304e-06, "loss": 0.0143, "step": 6359 }, { "epoch": 5.808219178082192, "grad_norm": 4.838251113891602, "learning_rate": 4.658548959918823e-06, "loss": 0.0207, "step": 6360 }, { "epoch": 5.809132420091324, "grad_norm": 20.24001693725586, "learning_rate": 4.657534246575343e-06, "loss": 0.0979, "step": 6361 }, { "epoch": 5.810045662100457, "grad_norm": 0.09619516134262085, "learning_rate": 4.656519533231862e-06, "loss": 0.0003, "step": 6362 }, { "epoch": 5.810958904109589, "grad_norm": 0.25040948390960693, "learning_rate": 4.655504819888382e-06, "loss": 0.0014, "step": 6363 }, { "epoch": 5.811872146118722, "grad_norm": 9.994551658630371, "learning_rate": 4.654490106544902e-06, "loss": 0.0671, "step": 6364 }, { "epoch": 5.812785388127854, "grad_norm": 0.10367390513420105, "learning_rate": 4.653475393201421e-06, "loss": 0.0006, "step": 6365 }, { "epoch": 5.813698630136987, "grad_norm": 14.85915470123291, "learning_rate": 4.65246067985794e-06, "loss": 0.0935, "step": 6366 }, { "epoch": 5.814611872146119, "grad_norm": 7.535752296447754, "learning_rate": 4.65144596651446e-06, "loss": 0.0398, "step": 6367 }, { "epoch": 5.8155251141552515, "grad_norm": 13.266636848449707, "learning_rate": 4.65043125317098e-06, "loss": 0.0488, "step": 6368 }, { "epoch": 5.816438356164383, "grad_norm": 8.63316822052002, "learning_rate": 4.649416539827499e-06, "loss": 0.0502, "step": 6369 }, { "epoch": 5.817351598173516, "grad_norm": 0.05517164617776871, "learning_rate": 4.648401826484018e-06, "loss": 0.0003, "step": 6370 }, { "epoch": 5.818264840182648, "grad_norm": 11.156465530395508, "learning_rate": 4.647387113140538e-06, "loss": 0.0356, "step": 6371 }, { "epoch": 5.8191780821917805, "grad_norm": 0.18736110627651215, "learning_rate": 4.646372399797058e-06, "loss": 0.0008, "step": 6372 }, { "epoch": 5.820091324200913, "grad_norm": 9.928979873657227, "learning_rate": 4.645357686453577e-06, "loss": 0.0444, "step": 6373 }, { "epoch": 5.821004566210045, "grad_norm": 0.07225890457630157, "learning_rate": 4.644342973110097e-06, "loss": 0.0004, "step": 6374 }, { "epoch": 5.821917808219178, "grad_norm": 0.06721023470163345, "learning_rate": 4.643328259766617e-06, "loss": 0.0005, "step": 6375 }, { "epoch": 5.82283105022831, "grad_norm": 3.909416437149048, "learning_rate": 4.6423135464231355e-06, "loss": 0.0188, "step": 6376 }, { "epoch": 5.823744292237443, "grad_norm": 0.4491003751754761, "learning_rate": 4.641298833079655e-06, "loss": 0.0025, "step": 6377 }, { "epoch": 5.824657534246575, "grad_norm": 0.07212662696838379, "learning_rate": 4.640284119736175e-06, "loss": 0.0005, "step": 6378 }, { "epoch": 5.825570776255708, "grad_norm": 0.0855688825249672, "learning_rate": 4.639269406392695e-06, "loss": 0.0006, "step": 6379 }, { "epoch": 5.82648401826484, "grad_norm": 1.8510897159576416, "learning_rate": 4.638254693049214e-06, "loss": 0.0117, "step": 6380 }, { "epoch": 5.8273972602739725, "grad_norm": 121.9374771118164, "learning_rate": 4.637239979705733e-06, "loss": 5.1358, "step": 6381 }, { "epoch": 5.828310502283105, "grad_norm": 0.5212311148643494, "learning_rate": 4.636225266362253e-06, "loss": 0.0042, "step": 6382 }, { "epoch": 5.829223744292237, "grad_norm": 0.3063257336616516, "learning_rate": 4.6352105530187725e-06, "loss": 0.0016, "step": 6383 }, { "epoch": 5.83013698630137, "grad_norm": 12.900991439819336, "learning_rate": 4.634195839675292e-06, "loss": 0.0725, "step": 6384 }, { "epoch": 5.831050228310502, "grad_norm": 1.146670937538147, "learning_rate": 4.633181126331812e-06, "loss": 0.0072, "step": 6385 }, { "epoch": 5.831963470319635, "grad_norm": 10.789520263671875, "learning_rate": 4.632166412988331e-06, "loss": 0.0641, "step": 6386 }, { "epoch": 5.832876712328767, "grad_norm": 0.25551220774650574, "learning_rate": 4.6311516996448505e-06, "loss": 0.0014, "step": 6387 }, { "epoch": 5.8337899543379, "grad_norm": 19.3532657623291, "learning_rate": 4.63013698630137e-06, "loss": 0.0788, "step": 6388 }, { "epoch": 5.834703196347032, "grad_norm": 0.10334388166666031, "learning_rate": 4.62912227295789e-06, "loss": 0.0006, "step": 6389 }, { "epoch": 5.835616438356165, "grad_norm": 0.030760791152715683, "learning_rate": 4.6281075596144095e-06, "loss": 0.0002, "step": 6390 }, { "epoch": 5.836529680365297, "grad_norm": 6.644675254821777, "learning_rate": 4.627092846270928e-06, "loss": 0.0364, "step": 6391 }, { "epoch": 5.8374429223744295, "grad_norm": 1.3225669860839844, "learning_rate": 4.626078132927448e-06, "loss": 0.0058, "step": 6392 }, { "epoch": 5.838356164383562, "grad_norm": 24.723718643188477, "learning_rate": 4.625063419583968e-06, "loss": 0.1536, "step": 6393 }, { "epoch": 5.839269406392694, "grad_norm": 0.09665362536907196, "learning_rate": 4.6240487062404875e-06, "loss": 0.0005, "step": 6394 }, { "epoch": 5.840182648401827, "grad_norm": 6.276851654052734, "learning_rate": 4.623033992897007e-06, "loss": 0.0393, "step": 6395 }, { "epoch": 5.8410958904109584, "grad_norm": 31.097000122070312, "learning_rate": 4.622019279553526e-06, "loss": 0.08, "step": 6396 }, { "epoch": 5.842009132420092, "grad_norm": 0.6885431408882141, "learning_rate": 4.6210045662100465e-06, "loss": 0.0035, "step": 6397 }, { "epoch": 5.842922374429223, "grad_norm": 1.0139696598052979, "learning_rate": 4.619989852866565e-06, "loss": 0.0061, "step": 6398 }, { "epoch": 5.843835616438356, "grad_norm": 4.681410789489746, "learning_rate": 4.618975139523085e-06, "loss": 0.0207, "step": 6399 }, { "epoch": 5.844748858447488, "grad_norm": 121.90554809570312, "learning_rate": 4.617960426179605e-06, "loss": 1.5806, "step": 6400 }, { "epoch": 5.845662100456621, "grad_norm": 1.2892944812774658, "learning_rate": 4.616945712836124e-06, "loss": 0.0056, "step": 6401 }, { "epoch": 5.846575342465753, "grad_norm": 1.4551563262939453, "learning_rate": 4.615930999492644e-06, "loss": 0.0077, "step": 6402 }, { "epoch": 5.847488584474886, "grad_norm": 91.76730346679688, "learning_rate": 4.614916286149163e-06, "loss": 2.3243, "step": 6403 }, { "epoch": 5.848401826484018, "grad_norm": 7.71757698059082, "learning_rate": 4.613901572805683e-06, "loss": 0.0233, "step": 6404 }, { "epoch": 5.8493150684931505, "grad_norm": 4.129533767700195, "learning_rate": 4.612886859462202e-06, "loss": 0.0382, "step": 6405 }, { "epoch": 5.850228310502283, "grad_norm": 2.466641426086426, "learning_rate": 4.611872146118721e-06, "loss": 0.0141, "step": 6406 }, { "epoch": 5.851141552511415, "grad_norm": 2.966705322265625, "learning_rate": 4.610857432775242e-06, "loss": 0.0148, "step": 6407 }, { "epoch": 5.852054794520548, "grad_norm": 0.30074167251586914, "learning_rate": 4.609842719431761e-06, "loss": 0.0021, "step": 6408 }, { "epoch": 5.85296803652968, "grad_norm": 0.1791064441204071, "learning_rate": 4.60882800608828e-06, "loss": 0.001, "step": 6409 }, { "epoch": 5.853881278538813, "grad_norm": 74.14971160888672, "learning_rate": 4.6078132927448e-06, "loss": 0.9607, "step": 6410 }, { "epoch": 5.854794520547945, "grad_norm": 1.52005136013031, "learning_rate": 4.606798579401319e-06, "loss": 0.0084, "step": 6411 }, { "epoch": 5.855707762557078, "grad_norm": 0.3071831464767456, "learning_rate": 4.605783866057839e-06, "loss": 0.0017, "step": 6412 }, { "epoch": 5.85662100456621, "grad_norm": 82.3860092163086, "learning_rate": 4.604769152714358e-06, "loss": 0.4766, "step": 6413 }, { "epoch": 5.857534246575343, "grad_norm": 1.726475477218628, "learning_rate": 4.603754439370878e-06, "loss": 0.0109, "step": 6414 }, { "epoch": 5.858447488584475, "grad_norm": 1.0025073289871216, "learning_rate": 4.602739726027398e-06, "loss": 0.0062, "step": 6415 }, { "epoch": 5.8593607305936075, "grad_norm": 9.051809310913086, "learning_rate": 4.601725012683917e-06, "loss": 0.0534, "step": 6416 }, { "epoch": 5.86027397260274, "grad_norm": 1.3196114301681519, "learning_rate": 4.600710299340437e-06, "loss": 0.0092, "step": 6417 }, { "epoch": 5.861187214611872, "grad_norm": 7.317388534545898, "learning_rate": 4.599695585996956e-06, "loss": 0.0418, "step": 6418 }, { "epoch": 5.862100456621005, "grad_norm": 2.5089032649993896, "learning_rate": 4.598680872653476e-06, "loss": 0.0173, "step": 6419 }, { "epoch": 5.863013698630137, "grad_norm": 0.13485532999038696, "learning_rate": 4.597666159309995e-06, "loss": 0.0011, "step": 6420 }, { "epoch": 5.86392694063927, "grad_norm": 1.2967809438705444, "learning_rate": 4.596651445966515e-06, "loss": 0.0065, "step": 6421 }, { "epoch": 5.864840182648402, "grad_norm": 7.094124794006348, "learning_rate": 4.595636732623035e-06, "loss": 0.0419, "step": 6422 }, { "epoch": 5.865753424657534, "grad_norm": 0.1435021162033081, "learning_rate": 4.5946220192795534e-06, "loss": 0.0008, "step": 6423 }, { "epoch": 5.866666666666667, "grad_norm": 0.597109854221344, "learning_rate": 4.593607305936074e-06, "loss": 0.0039, "step": 6424 }, { "epoch": 5.867579908675799, "grad_norm": 6.532402992248535, "learning_rate": 4.592592592592593e-06, "loss": 0.0368, "step": 6425 }, { "epoch": 5.868493150684931, "grad_norm": 106.93161010742188, "learning_rate": 4.5915778792491125e-06, "loss": 1.6276, "step": 6426 }, { "epoch": 5.869406392694064, "grad_norm": 0.49421319365501404, "learning_rate": 4.590563165905632e-06, "loss": 0.0027, "step": 6427 }, { "epoch": 5.870319634703196, "grad_norm": 20.957942962646484, "learning_rate": 4.589548452562151e-06, "loss": 0.1301, "step": 6428 }, { "epoch": 5.8712328767123285, "grad_norm": 1.1439770460128784, "learning_rate": 4.588533739218672e-06, "loss": 0.0094, "step": 6429 }, { "epoch": 5.872146118721461, "grad_norm": 20.432992935180664, "learning_rate": 4.5875190258751904e-06, "loss": 0.1446, "step": 6430 }, { "epoch": 5.873059360730593, "grad_norm": 5.861547470092773, "learning_rate": 4.58650431253171e-06, "loss": 0.0337, "step": 6431 }, { "epoch": 5.873972602739726, "grad_norm": 117.13226318359375, "learning_rate": 4.58548959918823e-06, "loss": 5.5893, "step": 6432 }, { "epoch": 5.874885844748858, "grad_norm": 2.0376081466674805, "learning_rate": 4.5844748858447495e-06, "loss": 0.012, "step": 6433 }, { "epoch": 5.875799086757991, "grad_norm": 1.1359972953796387, "learning_rate": 4.583460172501269e-06, "loss": 0.0087, "step": 6434 }, { "epoch": 5.876712328767123, "grad_norm": 30.735179901123047, "learning_rate": 4.582445459157788e-06, "loss": 0.1438, "step": 6435 }, { "epoch": 5.877625570776256, "grad_norm": 7.453762054443359, "learning_rate": 4.581430745814308e-06, "loss": 0.0238, "step": 6436 }, { "epoch": 5.878538812785388, "grad_norm": 0.5025057196617126, "learning_rate": 4.5804160324708274e-06, "loss": 0.0029, "step": 6437 }, { "epoch": 5.879452054794521, "grad_norm": 20.740955352783203, "learning_rate": 4.579401319127347e-06, "loss": 0.1141, "step": 6438 }, { "epoch": 5.880365296803653, "grad_norm": 27.50944709777832, "learning_rate": 4.578386605783867e-06, "loss": 0.1732, "step": 6439 }, { "epoch": 5.8812785388127855, "grad_norm": 3.8242061138153076, "learning_rate": 4.577371892440386e-06, "loss": 0.0334, "step": 6440 }, { "epoch": 5.882191780821918, "grad_norm": 0.08280996233224869, "learning_rate": 4.576357179096905e-06, "loss": 0.0007, "step": 6441 }, { "epoch": 5.88310502283105, "grad_norm": 0.7284174561500549, "learning_rate": 4.575342465753425e-06, "loss": 0.0054, "step": 6442 }, { "epoch": 5.884018264840183, "grad_norm": 1.7786836624145508, "learning_rate": 4.574327752409945e-06, "loss": 0.0101, "step": 6443 }, { "epoch": 5.884931506849315, "grad_norm": 29.17136573791504, "learning_rate": 4.5733130390664644e-06, "loss": 0.1937, "step": 6444 }, { "epoch": 5.885844748858448, "grad_norm": 6.364663124084473, "learning_rate": 4.572298325722983e-06, "loss": 0.0307, "step": 6445 }, { "epoch": 5.88675799086758, "grad_norm": 35.75521469116211, "learning_rate": 4.571283612379503e-06, "loss": 0.2739, "step": 6446 }, { "epoch": 5.887671232876713, "grad_norm": 18.574689865112305, "learning_rate": 4.570268899036023e-06, "loss": 0.123, "step": 6447 }, { "epoch": 5.888584474885845, "grad_norm": 0.09402740001678467, "learning_rate": 4.569254185692542e-06, "loss": 0.0006, "step": 6448 }, { "epoch": 5.889497716894978, "grad_norm": 119.59957122802734, "learning_rate": 4.568239472349062e-06, "loss": 0.8503, "step": 6449 }, { "epoch": 5.890410958904109, "grad_norm": 1.7058062553405762, "learning_rate": 4.567224759005581e-06, "loss": 0.0125, "step": 6450 }, { "epoch": 5.8913242009132425, "grad_norm": 12.584043502807617, "learning_rate": 4.566210045662101e-06, "loss": 0.0765, "step": 6451 }, { "epoch": 5.892237442922374, "grad_norm": 1.354610800743103, "learning_rate": 4.56519533231862e-06, "loss": 0.0103, "step": 6452 }, { "epoch": 5.8931506849315065, "grad_norm": 39.60746383666992, "learning_rate": 4.56418061897514e-06, "loss": 0.263, "step": 6453 }, { "epoch": 5.894063926940639, "grad_norm": 2.7299563884735107, "learning_rate": 4.56316590563166e-06, "loss": 0.0222, "step": 6454 }, { "epoch": 5.894977168949771, "grad_norm": 114.65876007080078, "learning_rate": 4.562151192288179e-06, "loss": 1.8057, "step": 6455 }, { "epoch": 5.895890410958904, "grad_norm": 0.7279145121574402, "learning_rate": 4.561136478944698e-06, "loss": 0.0042, "step": 6456 }, { "epoch": 5.896803652968036, "grad_norm": 6.927371025085449, "learning_rate": 4.560121765601218e-06, "loss": 0.0404, "step": 6457 }, { "epoch": 5.897716894977169, "grad_norm": 20.486244201660156, "learning_rate": 4.559107052257738e-06, "loss": 0.1397, "step": 6458 }, { "epoch": 5.898630136986301, "grad_norm": 17.07538414001465, "learning_rate": 4.558092338914257e-06, "loss": 0.0874, "step": 6459 }, { "epoch": 5.899543378995434, "grad_norm": 1.395563006401062, "learning_rate": 4.557077625570777e-06, "loss": 0.0053, "step": 6460 }, { "epoch": 5.900456621004566, "grad_norm": 0.25660109519958496, "learning_rate": 4.556062912227296e-06, "loss": 0.0018, "step": 6461 }, { "epoch": 5.901369863013699, "grad_norm": 7.3709564208984375, "learning_rate": 4.5550481988838155e-06, "loss": 0.0382, "step": 6462 }, { "epoch": 5.902283105022831, "grad_norm": 0.11416549235582352, "learning_rate": 4.554033485540335e-06, "loss": 0.0007, "step": 6463 }, { "epoch": 5.9031963470319635, "grad_norm": 1.4529342651367188, "learning_rate": 4.553018772196855e-06, "loss": 0.0094, "step": 6464 }, { "epoch": 5.904109589041096, "grad_norm": 0.1112225204706192, "learning_rate": 4.552004058853375e-06, "loss": 0.0008, "step": 6465 }, { "epoch": 5.905022831050228, "grad_norm": 18.212955474853516, "learning_rate": 4.550989345509893e-06, "loss": 0.0994, "step": 6466 }, { "epoch": 5.905936073059361, "grad_norm": 4.1775946617126465, "learning_rate": 4.549974632166413e-06, "loss": 0.0208, "step": 6467 }, { "epoch": 5.906849315068493, "grad_norm": 20.388351440429688, "learning_rate": 4.548959918822933e-06, "loss": 0.0668, "step": 6468 }, { "epoch": 5.907762557077626, "grad_norm": 0.31561487913131714, "learning_rate": 4.5479452054794525e-06, "loss": 0.0015, "step": 6469 }, { "epoch": 5.908675799086758, "grad_norm": 31.552227020263672, "learning_rate": 4.546930492135972e-06, "loss": 0.1778, "step": 6470 }, { "epoch": 5.909589041095891, "grad_norm": 88.46366119384766, "learning_rate": 4.545915778792491e-06, "loss": 0.5768, "step": 6471 }, { "epoch": 5.910502283105023, "grad_norm": 3.2137627601623535, "learning_rate": 4.544901065449011e-06, "loss": 0.019, "step": 6472 }, { "epoch": 5.911415525114156, "grad_norm": 0.6747101545333862, "learning_rate": 4.54388635210553e-06, "loss": 0.0037, "step": 6473 }, { "epoch": 5.912328767123288, "grad_norm": 148.01072692871094, "learning_rate": 4.54287163876205e-06, "loss": 2.1821, "step": 6474 }, { "epoch": 5.91324200913242, "grad_norm": 20.90620994567871, "learning_rate": 4.54185692541857e-06, "loss": 0.085, "step": 6475 }, { "epoch": 5.914155251141553, "grad_norm": 3.2375237941741943, "learning_rate": 4.540842212075089e-06, "loss": 0.0169, "step": 6476 }, { "epoch": 5.9150684931506845, "grad_norm": 3.8048553466796875, "learning_rate": 4.539827498731609e-06, "loss": 0.0205, "step": 6477 }, { "epoch": 5.915981735159818, "grad_norm": 0.4053637385368347, "learning_rate": 4.538812785388128e-06, "loss": 0.0018, "step": 6478 }, { "epoch": 5.916894977168949, "grad_norm": 20.049665451049805, "learning_rate": 4.537798072044648e-06, "loss": 0.1333, "step": 6479 }, { "epoch": 5.917808219178082, "grad_norm": 35.89512634277344, "learning_rate": 4.536783358701167e-06, "loss": 0.2012, "step": 6480 }, { "epoch": 5.918721461187214, "grad_norm": 1.042816400527954, "learning_rate": 4.535768645357686e-06, "loss": 0.0069, "step": 6481 }, { "epoch": 5.919634703196347, "grad_norm": 0.27215924859046936, "learning_rate": 4.534753932014207e-06, "loss": 0.0008, "step": 6482 }, { "epoch": 5.920547945205479, "grad_norm": 21.255014419555664, "learning_rate": 4.533739218670726e-06, "loss": 0.183, "step": 6483 }, { "epoch": 5.921461187214612, "grad_norm": 16.57000160217285, "learning_rate": 4.532724505327245e-06, "loss": 0.1301, "step": 6484 }, { "epoch": 5.922374429223744, "grad_norm": 0.2190479189157486, "learning_rate": 4.531709791983765e-06, "loss": 0.0013, "step": 6485 }, { "epoch": 5.923287671232877, "grad_norm": 0.7085447907447815, "learning_rate": 4.530695078640284e-06, "loss": 0.0013, "step": 6486 }, { "epoch": 5.924200913242009, "grad_norm": 11.82103157043457, "learning_rate": 4.529680365296804e-06, "loss": 0.093, "step": 6487 }, { "epoch": 5.9251141552511415, "grad_norm": 0.36960458755493164, "learning_rate": 4.528665651953323e-06, "loss": 0.0017, "step": 6488 }, { "epoch": 5.926027397260274, "grad_norm": 0.08504331111907959, "learning_rate": 4.527650938609843e-06, "loss": 0.0006, "step": 6489 }, { "epoch": 5.926940639269406, "grad_norm": 0.877192497253418, "learning_rate": 4.526636225266363e-06, "loss": 0.0054, "step": 6490 }, { "epoch": 5.927853881278539, "grad_norm": 11.263606071472168, "learning_rate": 4.5256215119228815e-06, "loss": 0.0799, "step": 6491 }, { "epoch": 5.928767123287671, "grad_norm": 0.5292152166366577, "learning_rate": 4.524606798579402e-06, "loss": 0.0032, "step": 6492 }, { "epoch": 5.929680365296804, "grad_norm": 87.9476547241211, "learning_rate": 4.523592085235921e-06, "loss": 1.0638, "step": 6493 }, { "epoch": 5.930593607305936, "grad_norm": 0.11675896495580673, "learning_rate": 4.5225773718924406e-06, "loss": 0.0007, "step": 6494 }, { "epoch": 5.931506849315069, "grad_norm": 0.3005017936229706, "learning_rate": 4.52156265854896e-06, "loss": 0.002, "step": 6495 }, { "epoch": 5.932420091324201, "grad_norm": 19.671419143676758, "learning_rate": 4.52054794520548e-06, "loss": 0.1239, "step": 6496 }, { "epoch": 5.933333333333334, "grad_norm": 2.5661191940307617, "learning_rate": 4.519533231862e-06, "loss": 0.0171, "step": 6497 }, { "epoch": 5.934246575342466, "grad_norm": 2.2926700115203857, "learning_rate": 4.5185185185185185e-06, "loss": 0.0137, "step": 6498 }, { "epoch": 5.9351598173515985, "grad_norm": 8.218537330627441, "learning_rate": 4.517503805175039e-06, "loss": 0.0621, "step": 6499 }, { "epoch": 5.936073059360731, "grad_norm": 20.99772071838379, "learning_rate": 4.516489091831558e-06, "loss": 0.1037, "step": 6500 }, { "epoch": 5.936986301369863, "grad_norm": 102.24706268310547, "learning_rate": 4.5154743784880776e-06, "loss": 4.1889, "step": 6501 }, { "epoch": 5.937899543378995, "grad_norm": 1.9575610160827637, "learning_rate": 4.514459665144597e-06, "loss": 0.0158, "step": 6502 }, { "epoch": 5.938812785388128, "grad_norm": 5.7835822105407715, "learning_rate": 4.513444951801116e-06, "loss": 0.0409, "step": 6503 }, { "epoch": 5.93972602739726, "grad_norm": 20.753101348876953, "learning_rate": 4.512430238457637e-06, "loss": 0.1131, "step": 6504 }, { "epoch": 5.940639269406392, "grad_norm": 16.76190757751465, "learning_rate": 4.5114155251141555e-06, "loss": 0.1083, "step": 6505 }, { "epoch": 5.941552511415525, "grad_norm": 1.6215077638626099, "learning_rate": 4.510400811770675e-06, "loss": 0.0132, "step": 6506 }, { "epoch": 5.942465753424657, "grad_norm": 2.233760356903076, "learning_rate": 4.509386098427195e-06, "loss": 0.0073, "step": 6507 }, { "epoch": 5.94337899543379, "grad_norm": 18.60235023498535, "learning_rate": 4.508371385083714e-06, "loss": 0.0898, "step": 6508 }, { "epoch": 5.944292237442922, "grad_norm": 2.671957015991211, "learning_rate": 4.507356671740234e-06, "loss": 0.0138, "step": 6509 }, { "epoch": 5.945205479452055, "grad_norm": 3.8075971603393555, "learning_rate": 4.506341958396753e-06, "loss": 0.0272, "step": 6510 }, { "epoch": 5.946118721461187, "grad_norm": 0.22865061461925507, "learning_rate": 4.505327245053273e-06, "loss": 0.0015, "step": 6511 }, { "epoch": 5.9470319634703195, "grad_norm": 0.05727062001824379, "learning_rate": 4.5043125317097925e-06, "loss": 0.0004, "step": 6512 }, { "epoch": 5.947945205479452, "grad_norm": 0.200349822640419, "learning_rate": 4.503297818366311e-06, "loss": 0.0014, "step": 6513 }, { "epoch": 5.948858447488584, "grad_norm": 4.04185676574707, "learning_rate": 4.502283105022832e-06, "loss": 0.0178, "step": 6514 }, { "epoch": 5.949771689497717, "grad_norm": 1.1474733352661133, "learning_rate": 4.501268391679351e-06, "loss": 0.0072, "step": 6515 }, { "epoch": 5.950684931506849, "grad_norm": 235.62799072265625, "learning_rate": 4.50025367833587e-06, "loss": 0.8644, "step": 6516 }, { "epoch": 5.951598173515982, "grad_norm": 36.84966278076172, "learning_rate": 4.49923896499239e-06, "loss": 0.3298, "step": 6517 }, { "epoch": 5.952511415525114, "grad_norm": 16.43213653564453, "learning_rate": 4.49822425164891e-06, "loss": 0.0735, "step": 6518 }, { "epoch": 5.953424657534247, "grad_norm": 1.6199307441711426, "learning_rate": 4.4972095383054295e-06, "loss": 0.0116, "step": 6519 }, { "epoch": 5.954337899543379, "grad_norm": 0.4022383391857147, "learning_rate": 4.496194824961948e-06, "loss": 0.0022, "step": 6520 }, { "epoch": 5.955251141552512, "grad_norm": 3.7361950874328613, "learning_rate": 4.495180111618468e-06, "loss": 0.0335, "step": 6521 }, { "epoch": 5.956164383561644, "grad_norm": 30.368127822875977, "learning_rate": 4.494165398274988e-06, "loss": 0.2418, "step": 6522 }, { "epoch": 5.9570776255707765, "grad_norm": 4.368144989013672, "learning_rate": 4.493150684931507e-06, "loss": 0.0262, "step": 6523 }, { "epoch": 5.957990867579909, "grad_norm": 32.09543228149414, "learning_rate": 4.492135971588027e-06, "loss": 0.2769, "step": 6524 }, { "epoch": 5.958904109589041, "grad_norm": 1.055103063583374, "learning_rate": 4.491121258244546e-06, "loss": 0.0064, "step": 6525 }, { "epoch": 5.959817351598174, "grad_norm": 0.7140148878097534, "learning_rate": 4.490106544901066e-06, "loss": 0.004, "step": 6526 }, { "epoch": 5.960730593607306, "grad_norm": 8.93826961517334, "learning_rate": 4.489091831557585e-06, "loss": 0.0761, "step": 6527 }, { "epoch": 5.961643835616439, "grad_norm": 0.44712314009666443, "learning_rate": 4.488077118214105e-06, "loss": 0.0034, "step": 6528 }, { "epoch": 5.96255707762557, "grad_norm": 1.4007045030593872, "learning_rate": 4.487062404870625e-06, "loss": 0.0093, "step": 6529 }, { "epoch": 5.963470319634704, "grad_norm": 140.3307647705078, "learning_rate": 4.4860476915271436e-06, "loss": 3.5037, "step": 6530 }, { "epoch": 5.964383561643835, "grad_norm": 0.9643915891647339, "learning_rate": 4.485032978183663e-06, "loss": 0.007, "step": 6531 }, { "epoch": 5.965296803652968, "grad_norm": 0.02909122034907341, "learning_rate": 4.484018264840183e-06, "loss": 0.0002, "step": 6532 }, { "epoch": 5.9662100456621, "grad_norm": 0.24276195466518402, "learning_rate": 4.483003551496703e-06, "loss": 0.0023, "step": 6533 }, { "epoch": 5.967123287671233, "grad_norm": 2.6407523155212402, "learning_rate": 4.481988838153222e-06, "loss": 0.0186, "step": 6534 }, { "epoch": 5.968036529680365, "grad_norm": 4.805961608886719, "learning_rate": 4.480974124809742e-06, "loss": 0.0369, "step": 6535 }, { "epoch": 5.9689497716894975, "grad_norm": 0.07461165636777878, "learning_rate": 4.479959411466261e-06, "loss": 0.0004, "step": 6536 }, { "epoch": 5.96986301369863, "grad_norm": 2.0380215644836426, "learning_rate": 4.4789446981227805e-06, "loss": 0.0149, "step": 6537 }, { "epoch": 5.970776255707762, "grad_norm": 1.0597126483917236, "learning_rate": 4.4779299847793e-06, "loss": 0.0072, "step": 6538 }, { "epoch": 5.971689497716895, "grad_norm": 0.46905481815338135, "learning_rate": 4.47691527143582e-06, "loss": 0.0027, "step": 6539 }, { "epoch": 5.972602739726027, "grad_norm": 51.18366622924805, "learning_rate": 4.47590055809234e-06, "loss": 0.2826, "step": 6540 }, { "epoch": 5.97351598173516, "grad_norm": 0.3174706995487213, "learning_rate": 4.4748858447488585e-06, "loss": 0.0021, "step": 6541 }, { "epoch": 5.974429223744292, "grad_norm": 0.4388105571269989, "learning_rate": 4.473871131405378e-06, "loss": 0.0032, "step": 6542 }, { "epoch": 5.975342465753425, "grad_norm": 0.7681016325950623, "learning_rate": 4.472856418061898e-06, "loss": 0.0032, "step": 6543 }, { "epoch": 5.976255707762557, "grad_norm": 6.685067176818848, "learning_rate": 4.4718417047184175e-06, "loss": 0.035, "step": 6544 }, { "epoch": 5.9771689497716896, "grad_norm": 0.6064339876174927, "learning_rate": 4.470826991374937e-06, "loss": 0.0041, "step": 6545 }, { "epoch": 5.978082191780822, "grad_norm": 0.0040087103843688965, "learning_rate": 4.469812278031456e-06, "loss": 0.0, "step": 6546 }, { "epoch": 5.9789954337899545, "grad_norm": 2.2914085388183594, "learning_rate": 4.468797564687976e-06, "loss": 0.0161, "step": 6547 }, { "epoch": 5.979908675799087, "grad_norm": 45.011592864990234, "learning_rate": 4.4677828513444955e-06, "loss": 0.2553, "step": 6548 }, { "epoch": 5.980821917808219, "grad_norm": 29.86031150817871, "learning_rate": 4.466768138001015e-06, "loss": 0.2573, "step": 6549 }, { "epoch": 5.981735159817352, "grad_norm": 0.8088852167129517, "learning_rate": 4.465753424657535e-06, "loss": 0.0031, "step": 6550 }, { "epoch": 5.982648401826484, "grad_norm": 0.8973986506462097, "learning_rate": 4.464738711314054e-06, "loss": 0.005, "step": 6551 }, { "epoch": 5.983561643835617, "grad_norm": 0.64470374584198, "learning_rate": 4.463723997970573e-06, "loss": 0.0047, "step": 6552 }, { "epoch": 5.984474885844749, "grad_norm": 12.150053024291992, "learning_rate": 4.462709284627093e-06, "loss": 0.0989, "step": 6553 }, { "epoch": 5.985388127853882, "grad_norm": 109.73738861083984, "learning_rate": 4.461694571283613e-06, "loss": 2.0237, "step": 6554 }, { "epoch": 5.986301369863014, "grad_norm": 0.14478234946727753, "learning_rate": 4.4606798579401325e-06, "loss": 0.0011, "step": 6555 }, { "epoch": 5.987214611872146, "grad_norm": 1.3511677980422974, "learning_rate": 4.459665144596651e-06, "loss": 0.0089, "step": 6556 }, { "epoch": 5.988127853881279, "grad_norm": 10.489396095275879, "learning_rate": 4.458650431253172e-06, "loss": 0.06, "step": 6557 }, { "epoch": 5.989041095890411, "grad_norm": 14.13818645477295, "learning_rate": 4.457635717909691e-06, "loss": 0.0993, "step": 6558 }, { "epoch": 5.989954337899543, "grad_norm": 6.743479251861572, "learning_rate": 4.45662100456621e-06, "loss": 0.0329, "step": 6559 }, { "epoch": 5.9908675799086755, "grad_norm": 11.439584732055664, "learning_rate": 4.45560629122273e-06, "loss": 0.0833, "step": 6560 }, { "epoch": 5.991780821917808, "grad_norm": 4.3752946853637695, "learning_rate": 4.454591577879249e-06, "loss": 0.0337, "step": 6561 }, { "epoch": 5.99269406392694, "grad_norm": 13.691838264465332, "learning_rate": 4.4535768645357695e-06, "loss": 0.0831, "step": 6562 }, { "epoch": 5.993607305936073, "grad_norm": 5.767090797424316, "learning_rate": 4.452562151192288e-06, "loss": 0.0373, "step": 6563 }, { "epoch": 5.994520547945205, "grad_norm": 0.06139592453837395, "learning_rate": 4.451547437848808e-06, "loss": 0.0005, "step": 6564 }, { "epoch": 5.995433789954338, "grad_norm": 11.122858047485352, "learning_rate": 4.450532724505328e-06, "loss": 0.0772, "step": 6565 }, { "epoch": 5.99634703196347, "grad_norm": 51.46703338623047, "learning_rate": 4.4495180111618465e-06, "loss": 0.3401, "step": 6566 }, { "epoch": 5.997260273972603, "grad_norm": 0.07130299508571625, "learning_rate": 4.448503297818367e-06, "loss": 0.0006, "step": 6567 }, { "epoch": 5.998173515981735, "grad_norm": 0.6645475625991821, "learning_rate": 4.447488584474886e-06, "loss": 0.0045, "step": 6568 }, { "epoch": 5.9990867579908675, "grad_norm": 57.213218688964844, "learning_rate": 4.446473871131406e-06, "loss": 0.566, "step": 6569 }, { "epoch": 6.0, "grad_norm": 2.9532551765441895, "learning_rate": 4.445459157787925e-06, "loss": 0.018, "step": 6570 }, { "epoch": 6.0009132420091325, "grad_norm": 26.283374786376953, "learning_rate": 4.444444444444444e-06, "loss": 0.1666, "step": 6571 }, { "epoch": 6.001826484018265, "grad_norm": 2.824615240097046, "learning_rate": 4.443429731100965e-06, "loss": 0.0236, "step": 6572 }, { "epoch": 6.002739726027397, "grad_norm": 5.307722091674805, "learning_rate": 4.4424150177574835e-06, "loss": 0.025, "step": 6573 }, { "epoch": 6.00365296803653, "grad_norm": 1.8313086032867432, "learning_rate": 4.441400304414003e-06, "loss": 0.01, "step": 6574 }, { "epoch": 6.004566210045662, "grad_norm": 9.244955062866211, "learning_rate": 4.440385591070523e-06, "loss": 0.0715, "step": 6575 }, { "epoch": 6.005479452054795, "grad_norm": 1.0828777551651, "learning_rate": 4.439370877727043e-06, "loss": 0.0059, "step": 6576 }, { "epoch": 6.006392694063927, "grad_norm": 1.0135515928268433, "learning_rate": 4.438356164383562e-06, "loss": 0.0072, "step": 6577 }, { "epoch": 6.00730593607306, "grad_norm": 2.4660873413085938, "learning_rate": 4.437341451040081e-06, "loss": 0.0183, "step": 6578 }, { "epoch": 6.008219178082192, "grad_norm": 0.24178484082221985, "learning_rate": 4.436326737696602e-06, "loss": 0.0012, "step": 6579 }, { "epoch": 6.0091324200913245, "grad_norm": 2.6682584285736084, "learning_rate": 4.4353120243531205e-06, "loss": 0.0133, "step": 6580 }, { "epoch": 6.010045662100457, "grad_norm": 5.155358791351318, "learning_rate": 4.43429731100964e-06, "loss": 0.0306, "step": 6581 }, { "epoch": 6.010958904109589, "grad_norm": 3.5598485469818115, "learning_rate": 4.43328259766616e-06, "loss": 0.0168, "step": 6582 }, { "epoch": 6.011872146118722, "grad_norm": 5.305447101593018, "learning_rate": 4.432267884322679e-06, "loss": 0.0298, "step": 6583 }, { "epoch": 6.0127853881278535, "grad_norm": 1.8091835975646973, "learning_rate": 4.431253170979199e-06, "loss": 0.0086, "step": 6584 }, { "epoch": 6.013698630136986, "grad_norm": 1.1114920377731323, "learning_rate": 4.430238457635718e-06, "loss": 0.0068, "step": 6585 }, { "epoch": 6.014611872146118, "grad_norm": 67.0782470703125, "learning_rate": 4.429223744292238e-06, "loss": 0.7195, "step": 6586 }, { "epoch": 6.015525114155251, "grad_norm": 18.363100051879883, "learning_rate": 4.4282090309487575e-06, "loss": 0.1586, "step": 6587 }, { "epoch": 6.016438356164383, "grad_norm": 5.232313632965088, "learning_rate": 4.427194317605276e-06, "loss": 0.0306, "step": 6588 }, { "epoch": 6.017351598173516, "grad_norm": 51.80820846557617, "learning_rate": 4.426179604261797e-06, "loss": 0.3915, "step": 6589 }, { "epoch": 6.018264840182648, "grad_norm": 9.906829833984375, "learning_rate": 4.425164890918316e-06, "loss": 0.0652, "step": 6590 }, { "epoch": 6.019178082191781, "grad_norm": 0.23385067284107208, "learning_rate": 4.4241501775748354e-06, "loss": 0.0016, "step": 6591 }, { "epoch": 6.020091324200913, "grad_norm": 32.386817932128906, "learning_rate": 4.423135464231355e-06, "loss": 0.106, "step": 6592 }, { "epoch": 6.0210045662100455, "grad_norm": 1.0662596225738525, "learning_rate": 4.422120750887874e-06, "loss": 0.0068, "step": 6593 }, { "epoch": 6.021917808219178, "grad_norm": 0.360030859708786, "learning_rate": 4.4211060375443945e-06, "loss": 0.0022, "step": 6594 }, { "epoch": 6.0228310502283104, "grad_norm": 38.481834411621094, "learning_rate": 4.420091324200913e-06, "loss": 0.2461, "step": 6595 }, { "epoch": 6.023744292237443, "grad_norm": 0.7546553611755371, "learning_rate": 4.419076610857433e-06, "loss": 0.0046, "step": 6596 }, { "epoch": 6.024657534246575, "grad_norm": 2.9828927516937256, "learning_rate": 4.418061897513953e-06, "loss": 0.0168, "step": 6597 }, { "epoch": 6.025570776255708, "grad_norm": 0.6764053106307983, "learning_rate": 4.4170471841704724e-06, "loss": 0.005, "step": 6598 }, { "epoch": 6.02648401826484, "grad_norm": 45.64820098876953, "learning_rate": 4.416032470826992e-06, "loss": 0.2981, "step": 6599 }, { "epoch": 6.027397260273973, "grad_norm": 0.6802021861076355, "learning_rate": 4.415017757483511e-06, "loss": 0.0042, "step": 6600 }, { "epoch": 6.028310502283105, "grad_norm": 0.7379242777824402, "learning_rate": 4.414003044140031e-06, "loss": 0.0049, "step": 6601 }, { "epoch": 6.029223744292238, "grad_norm": 46.45997619628906, "learning_rate": 4.41298833079655e-06, "loss": 0.2684, "step": 6602 }, { "epoch": 6.03013698630137, "grad_norm": 1.2380081415176392, "learning_rate": 4.41197361745307e-06, "loss": 0.0092, "step": 6603 }, { "epoch": 6.0310502283105025, "grad_norm": 28.371749877929688, "learning_rate": 4.41095890410959e-06, "loss": 0.1305, "step": 6604 }, { "epoch": 6.031963470319635, "grad_norm": 3.1435937881469727, "learning_rate": 4.409944190766109e-06, "loss": 0.0244, "step": 6605 }, { "epoch": 6.032876712328767, "grad_norm": 1.339061975479126, "learning_rate": 4.408929477422628e-06, "loss": 0.009, "step": 6606 }, { "epoch": 6.0337899543379, "grad_norm": 1.4389634132385254, "learning_rate": 4.407914764079148e-06, "loss": 0.0103, "step": 6607 }, { "epoch": 6.034703196347032, "grad_norm": 0.7915146946907043, "learning_rate": 4.406900050735668e-06, "loss": 0.0056, "step": 6608 }, { "epoch": 6.035616438356165, "grad_norm": 26.583847045898438, "learning_rate": 4.405885337392187e-06, "loss": 0.1884, "step": 6609 }, { "epoch": 6.036529680365296, "grad_norm": 0.38138338923454285, "learning_rate": 4.404870624048706e-06, "loss": 0.0021, "step": 6610 }, { "epoch": 6.037442922374429, "grad_norm": 22.323139190673828, "learning_rate": 4.403855910705226e-06, "loss": 0.1331, "step": 6611 }, { "epoch": 6.038356164383561, "grad_norm": 0.30728575587272644, "learning_rate": 4.402841197361746e-06, "loss": 0.0021, "step": 6612 }, { "epoch": 6.039269406392694, "grad_norm": 1.839176058769226, "learning_rate": 4.401826484018265e-06, "loss": 0.0139, "step": 6613 }, { "epoch": 6.040182648401826, "grad_norm": 3.213701009750366, "learning_rate": 4.400811770674785e-06, "loss": 0.0158, "step": 6614 }, { "epoch": 6.041095890410959, "grad_norm": 0.986869752407074, "learning_rate": 4.399797057331304e-06, "loss": 0.0064, "step": 6615 }, { "epoch": 6.042009132420091, "grad_norm": 2.2452073097229004, "learning_rate": 4.3987823439878235e-06, "loss": 0.0102, "step": 6616 }, { "epoch": 6.0429223744292235, "grad_norm": 0.6621577739715576, "learning_rate": 4.397767630644343e-06, "loss": 0.0048, "step": 6617 }, { "epoch": 6.043835616438356, "grad_norm": 2.2728333473205566, "learning_rate": 4.396752917300863e-06, "loss": 0.0132, "step": 6618 }, { "epoch": 6.044748858447488, "grad_norm": 0.8377732038497925, "learning_rate": 4.395738203957383e-06, "loss": 0.0055, "step": 6619 }, { "epoch": 6.045662100456621, "grad_norm": 10.72381591796875, "learning_rate": 4.394723490613902e-06, "loss": 0.0545, "step": 6620 }, { "epoch": 6.046575342465753, "grad_norm": 1.019239902496338, "learning_rate": 4.393708777270421e-06, "loss": 0.0068, "step": 6621 }, { "epoch": 6.047488584474886, "grad_norm": 5.9402923583984375, "learning_rate": 4.392694063926941e-06, "loss": 0.0392, "step": 6622 }, { "epoch": 6.048401826484018, "grad_norm": 3.116574287414551, "learning_rate": 4.3916793505834605e-06, "loss": 0.0157, "step": 6623 }, { "epoch": 6.049315068493151, "grad_norm": 3.3353583812713623, "learning_rate": 4.39066463723998e-06, "loss": 0.0176, "step": 6624 }, { "epoch": 6.050228310502283, "grad_norm": 0.6350598931312561, "learning_rate": 4.3896499238965e-06, "loss": 0.0029, "step": 6625 }, { "epoch": 6.051141552511416, "grad_norm": 2.9486496448516846, "learning_rate": 4.388635210553019e-06, "loss": 0.0199, "step": 6626 }, { "epoch": 6.052054794520548, "grad_norm": 2.0350868701934814, "learning_rate": 4.3876204972095384e-06, "loss": 0.0111, "step": 6627 }, { "epoch": 6.0529680365296805, "grad_norm": 22.77886199951172, "learning_rate": 4.386605783866058e-06, "loss": 0.1774, "step": 6628 }, { "epoch": 6.053881278538813, "grad_norm": 1.29752779006958, "learning_rate": 4.385591070522578e-06, "loss": 0.0066, "step": 6629 }, { "epoch": 6.054794520547945, "grad_norm": 2.6737074851989746, "learning_rate": 4.3845763571790975e-06, "loss": 0.0136, "step": 6630 }, { "epoch": 6.055707762557078, "grad_norm": 0.10919444262981415, "learning_rate": 4.383561643835616e-06, "loss": 0.0007, "step": 6631 }, { "epoch": 6.05662100456621, "grad_norm": 0.16229163110256195, "learning_rate": 4.382546930492136e-06, "loss": 0.001, "step": 6632 }, { "epoch": 6.057534246575343, "grad_norm": 1.8931325674057007, "learning_rate": 4.381532217148656e-06, "loss": 0.0166, "step": 6633 }, { "epoch": 6.058447488584475, "grad_norm": 2.354658603668213, "learning_rate": 4.3805175038051754e-06, "loss": 0.0141, "step": 6634 }, { "epoch": 6.059360730593608, "grad_norm": 101.37854766845703, "learning_rate": 4.379502790461695e-06, "loss": 2.2136, "step": 6635 }, { "epoch": 6.06027397260274, "grad_norm": 0.6073288917541504, "learning_rate": 4.378488077118214e-06, "loss": 0.0021, "step": 6636 }, { "epoch": 6.061187214611872, "grad_norm": 0.4652382731437683, "learning_rate": 4.3774733637747345e-06, "loss": 0.0021, "step": 6637 }, { "epoch": 6.062100456621004, "grad_norm": 1.3227918148040771, "learning_rate": 4.376458650431253e-06, "loss": 0.0076, "step": 6638 }, { "epoch": 6.063013698630137, "grad_norm": 1.12711501121521, "learning_rate": 4.375443937087773e-06, "loss": 0.0106, "step": 6639 }, { "epoch": 6.063926940639269, "grad_norm": 20.61489486694336, "learning_rate": 4.374429223744293e-06, "loss": 0.0968, "step": 6640 }, { "epoch": 6.0648401826484015, "grad_norm": 22.834850311279297, "learning_rate": 4.373414510400812e-06, "loss": 0.1303, "step": 6641 }, { "epoch": 6.065753424657534, "grad_norm": 25.707630157470703, "learning_rate": 4.372399797057332e-06, "loss": 0.2025, "step": 6642 }, { "epoch": 6.066666666666666, "grad_norm": 2.430800676345825, "learning_rate": 4.371385083713851e-06, "loss": 0.0157, "step": 6643 }, { "epoch": 6.067579908675799, "grad_norm": 0.5284340977668762, "learning_rate": 4.370370370370371e-06, "loss": 0.003, "step": 6644 }, { "epoch": 6.068493150684931, "grad_norm": 0.795525074005127, "learning_rate": 4.36935565702689e-06, "loss": 0.007, "step": 6645 }, { "epoch": 6.069406392694064, "grad_norm": 55.94416809082031, "learning_rate": 4.368340943683409e-06, "loss": 0.3437, "step": 6646 }, { "epoch": 6.070319634703196, "grad_norm": 12.46480655670166, "learning_rate": 4.36732623033993e-06, "loss": 0.0806, "step": 6647 }, { "epoch": 6.071232876712329, "grad_norm": 11.441919326782227, "learning_rate": 4.366311516996449e-06, "loss": 0.0295, "step": 6648 }, { "epoch": 6.072146118721461, "grad_norm": 0.5298640727996826, "learning_rate": 4.365296803652968e-06, "loss": 0.0038, "step": 6649 }, { "epoch": 6.073059360730594, "grad_norm": 0.8801708817481995, "learning_rate": 4.364282090309488e-06, "loss": 0.0055, "step": 6650 }, { "epoch": 6.073972602739726, "grad_norm": 3.903979539871216, "learning_rate": 4.363267376966007e-06, "loss": 0.0193, "step": 6651 }, { "epoch": 6.0748858447488585, "grad_norm": 6.527265548706055, "learning_rate": 4.362252663622527e-06, "loss": 0.0457, "step": 6652 }, { "epoch": 6.075799086757991, "grad_norm": 0.1741308867931366, "learning_rate": 4.361237950279046e-06, "loss": 0.0013, "step": 6653 }, { "epoch": 6.076712328767123, "grad_norm": 80.18061828613281, "learning_rate": 4.360223236935566e-06, "loss": 0.6659, "step": 6654 }, { "epoch": 6.077625570776256, "grad_norm": 6.316592693328857, "learning_rate": 4.359208523592086e-06, "loss": 0.0448, "step": 6655 }, { "epoch": 6.078538812785388, "grad_norm": 2.339505195617676, "learning_rate": 4.358193810248605e-06, "loss": 0.0193, "step": 6656 }, { "epoch": 6.079452054794521, "grad_norm": 0.09399528801441193, "learning_rate": 4.357179096905125e-06, "loss": 0.0007, "step": 6657 }, { "epoch": 6.080365296803653, "grad_norm": 3.424396514892578, "learning_rate": 4.356164383561644e-06, "loss": 0.0222, "step": 6658 }, { "epoch": 6.081278538812786, "grad_norm": 8.841208457946777, "learning_rate": 4.355149670218164e-06, "loss": 0.0577, "step": 6659 }, { "epoch": 6.082191780821918, "grad_norm": 35.276153564453125, "learning_rate": 4.354134956874683e-06, "loss": 0.2907, "step": 6660 }, { "epoch": 6.083105022831051, "grad_norm": 3.2099263668060303, "learning_rate": 4.353120243531203e-06, "loss": 0.0135, "step": 6661 }, { "epoch": 6.084018264840183, "grad_norm": 40.49818420410156, "learning_rate": 4.3521055301877226e-06, "loss": 0.3212, "step": 6662 }, { "epoch": 6.0849315068493155, "grad_norm": 15.870345115661621, "learning_rate": 4.351090816844241e-06, "loss": 0.1057, "step": 6663 }, { "epoch": 6.085844748858447, "grad_norm": 19.000959396362305, "learning_rate": 4.350076103500762e-06, "loss": 0.1423, "step": 6664 }, { "epoch": 6.0867579908675795, "grad_norm": 4.575653553009033, "learning_rate": 4.349061390157281e-06, "loss": 0.0286, "step": 6665 }, { "epoch": 6.087671232876712, "grad_norm": 1.917357325553894, "learning_rate": 4.3480466768138005e-06, "loss": 0.0103, "step": 6666 }, { "epoch": 6.088584474885844, "grad_norm": 1.9112204313278198, "learning_rate": 4.34703196347032e-06, "loss": 0.0105, "step": 6667 }, { "epoch": 6.089497716894977, "grad_norm": 3.742091655731201, "learning_rate": 4.346017250126839e-06, "loss": 0.0162, "step": 6668 }, { "epoch": 6.090410958904109, "grad_norm": 2.7132601737976074, "learning_rate": 4.3450025367833596e-06, "loss": 0.0145, "step": 6669 }, { "epoch": 6.091324200913242, "grad_norm": 1.6894084215164185, "learning_rate": 4.343987823439878e-06, "loss": 0.0113, "step": 6670 }, { "epoch": 6.092237442922374, "grad_norm": 0.3283894658088684, "learning_rate": 4.342973110096398e-06, "loss": 0.0023, "step": 6671 }, { "epoch": 6.093150684931507, "grad_norm": 18.427080154418945, "learning_rate": 4.341958396752918e-06, "loss": 0.1323, "step": 6672 }, { "epoch": 6.094063926940639, "grad_norm": 3.2627978324890137, "learning_rate": 4.340943683409437e-06, "loss": 0.0254, "step": 6673 }, { "epoch": 6.094977168949772, "grad_norm": 0.1046561598777771, "learning_rate": 4.339928970065957e-06, "loss": 0.0008, "step": 6674 }, { "epoch": 6.095890410958904, "grad_norm": 1.286077618598938, "learning_rate": 4.338914256722476e-06, "loss": 0.0056, "step": 6675 }, { "epoch": 6.0968036529680365, "grad_norm": 3.8649468421936035, "learning_rate": 4.337899543378996e-06, "loss": 0.0355, "step": 6676 }, { "epoch": 6.097716894977169, "grad_norm": 2.588270425796509, "learning_rate": 4.336884830035515e-06, "loss": 0.0121, "step": 6677 }, { "epoch": 6.098630136986301, "grad_norm": 0.23498369753360748, "learning_rate": 4.335870116692035e-06, "loss": 0.0022, "step": 6678 }, { "epoch": 6.099543378995434, "grad_norm": 0.053998783230781555, "learning_rate": 4.334855403348555e-06, "loss": 0.0004, "step": 6679 }, { "epoch": 6.100456621004566, "grad_norm": 67.70557403564453, "learning_rate": 4.333840690005074e-06, "loss": 1.0906, "step": 6680 }, { "epoch": 6.101369863013699, "grad_norm": 4.311858654022217, "learning_rate": 4.332825976661593e-06, "loss": 0.0299, "step": 6681 }, { "epoch": 6.102283105022831, "grad_norm": 3.4012792110443115, "learning_rate": 4.331811263318113e-06, "loss": 0.0169, "step": 6682 }, { "epoch": 6.103196347031964, "grad_norm": 37.53477478027344, "learning_rate": 4.330796549974633e-06, "loss": 0.2655, "step": 6683 }, { "epoch": 6.104109589041096, "grad_norm": 2.0021309852600098, "learning_rate": 4.329781836631152e-06, "loss": 0.0114, "step": 6684 }, { "epoch": 6.105022831050229, "grad_norm": 0.8066991567611694, "learning_rate": 4.328767123287671e-06, "loss": 0.0053, "step": 6685 }, { "epoch": 6.105936073059361, "grad_norm": 0.34763628244400024, "learning_rate": 4.327752409944191e-06, "loss": 0.0027, "step": 6686 }, { "epoch": 6.1068493150684935, "grad_norm": 2.2156872749328613, "learning_rate": 4.326737696600711e-06, "loss": 0.0132, "step": 6687 }, { "epoch": 6.107762557077626, "grad_norm": 32.78785705566406, "learning_rate": 4.32572298325723e-06, "loss": 0.1841, "step": 6688 }, { "epoch": 6.108675799086758, "grad_norm": 0.28326505422592163, "learning_rate": 4.32470826991375e-06, "loss": 0.0016, "step": 6689 }, { "epoch": 6.109589041095891, "grad_norm": 1.723311424255371, "learning_rate": 4.323693556570269e-06, "loss": 0.0067, "step": 6690 }, { "epoch": 6.110502283105022, "grad_norm": 0.6726482510566711, "learning_rate": 4.3226788432267886e-06, "loss": 0.004, "step": 6691 }, { "epoch": 6.111415525114155, "grad_norm": 0.22550226747989655, "learning_rate": 4.321664129883308e-06, "loss": 0.0017, "step": 6692 }, { "epoch": 6.112328767123287, "grad_norm": 0.36477789282798767, "learning_rate": 4.320649416539828e-06, "loss": 0.0026, "step": 6693 }, { "epoch": 6.11324200913242, "grad_norm": 27.48912811279297, "learning_rate": 4.319634703196348e-06, "loss": 0.1913, "step": 6694 }, { "epoch": 6.114155251141552, "grad_norm": 0.40782278776168823, "learning_rate": 4.3186199898528665e-06, "loss": 0.0041, "step": 6695 }, { "epoch": 6.115068493150685, "grad_norm": 19.471267700195312, "learning_rate": 4.317605276509386e-06, "loss": 0.1037, "step": 6696 }, { "epoch": 6.115981735159817, "grad_norm": 0.20667114853858948, "learning_rate": 4.316590563165906e-06, "loss": 0.002, "step": 6697 }, { "epoch": 6.11689497716895, "grad_norm": 2.0933589935302734, "learning_rate": 4.3155758498224256e-06, "loss": 0.0227, "step": 6698 }, { "epoch": 6.117808219178082, "grad_norm": 0.1397828310728073, "learning_rate": 4.314561136478945e-06, "loss": 0.0011, "step": 6699 }, { "epoch": 6.1187214611872145, "grad_norm": 2.57784366607666, "learning_rate": 4.313546423135465e-06, "loss": 0.0165, "step": 6700 }, { "epoch": 6.119634703196347, "grad_norm": 75.72175598144531, "learning_rate": 4.312531709791984e-06, "loss": 0.5123, "step": 6701 }, { "epoch": 6.120547945205479, "grad_norm": 4.788306713104248, "learning_rate": 4.3115169964485035e-06, "loss": 0.0286, "step": 6702 }, { "epoch": 6.121461187214612, "grad_norm": 0.5069275498390198, "learning_rate": 4.310502283105023e-06, "loss": 0.0033, "step": 6703 }, { "epoch": 6.122374429223744, "grad_norm": 1.9078235626220703, "learning_rate": 4.309487569761543e-06, "loss": 0.0115, "step": 6704 }, { "epoch": 6.123287671232877, "grad_norm": 0.23542143404483795, "learning_rate": 4.3084728564180626e-06, "loss": 0.0014, "step": 6705 }, { "epoch": 6.124200913242009, "grad_norm": 0.016216956079006195, "learning_rate": 4.307458143074581e-06, "loss": 0.0001, "step": 6706 }, { "epoch": 6.125114155251142, "grad_norm": 1.3471695184707642, "learning_rate": 4.306443429731101e-06, "loss": 0.0116, "step": 6707 }, { "epoch": 6.126027397260274, "grad_norm": 0.5716215968132019, "learning_rate": 4.305428716387621e-06, "loss": 0.0036, "step": 6708 }, { "epoch": 6.126940639269407, "grad_norm": 2.652728319168091, "learning_rate": 4.3044140030441405e-06, "loss": 0.0166, "step": 6709 }, { "epoch": 6.127853881278539, "grad_norm": 25.8566837310791, "learning_rate": 4.30339928970066e-06, "loss": 0.1071, "step": 6710 }, { "epoch": 6.1287671232876715, "grad_norm": 7.562348365783691, "learning_rate": 4.302384576357179e-06, "loss": 0.0266, "step": 6711 }, { "epoch": 6.129680365296804, "grad_norm": 66.09180450439453, "learning_rate": 4.301369863013699e-06, "loss": 0.6623, "step": 6712 }, { "epoch": 6.130593607305936, "grad_norm": 1.3805487155914307, "learning_rate": 4.300355149670218e-06, "loss": 0.011, "step": 6713 }, { "epoch": 6.131506849315069, "grad_norm": 0.01804078184068203, "learning_rate": 4.299340436326738e-06, "loss": 0.0001, "step": 6714 }, { "epoch": 6.132420091324201, "grad_norm": 1.638343095779419, "learning_rate": 4.298325722983258e-06, "loss": 0.0096, "step": 6715 }, { "epoch": 6.133333333333334, "grad_norm": 18.150367736816406, "learning_rate": 4.297311009639777e-06, "loss": 0.1067, "step": 6716 }, { "epoch": 6.134246575342466, "grad_norm": 0.08000960946083069, "learning_rate": 4.296296296296296e-06, "loss": 0.0004, "step": 6717 }, { "epoch": 6.135159817351598, "grad_norm": 11.223896980285645, "learning_rate": 4.295281582952816e-06, "loss": 0.0557, "step": 6718 }, { "epoch": 6.13607305936073, "grad_norm": 0.06990036368370056, "learning_rate": 4.294266869609336e-06, "loss": 0.0005, "step": 6719 }, { "epoch": 6.136986301369863, "grad_norm": 12.522074699401855, "learning_rate": 4.293252156265855e-06, "loss": 0.0837, "step": 6720 }, { "epoch": 6.137899543378995, "grad_norm": 1.013057827949524, "learning_rate": 4.292237442922374e-06, "loss": 0.0044, "step": 6721 }, { "epoch": 6.138812785388128, "grad_norm": 18.45586395263672, "learning_rate": 4.291222729578895e-06, "loss": 0.1095, "step": 6722 }, { "epoch": 6.13972602739726, "grad_norm": 0.24180343747138977, "learning_rate": 4.290208016235414e-06, "loss": 0.0018, "step": 6723 }, { "epoch": 6.1406392694063925, "grad_norm": 3.085430860519409, "learning_rate": 4.289193302891933e-06, "loss": 0.0198, "step": 6724 }, { "epoch": 6.141552511415525, "grad_norm": 5.121091365814209, "learning_rate": 4.288178589548453e-06, "loss": 0.0272, "step": 6725 }, { "epoch": 6.142465753424657, "grad_norm": 6.370488166809082, "learning_rate": 4.287163876204972e-06, "loss": 0.0448, "step": 6726 }, { "epoch": 6.14337899543379, "grad_norm": 0.19263708591461182, "learning_rate": 4.286149162861492e-06, "loss": 0.0015, "step": 6727 }, { "epoch": 6.144292237442922, "grad_norm": 0.45614948868751526, "learning_rate": 4.285134449518011e-06, "loss": 0.0021, "step": 6728 }, { "epoch": 6.145205479452055, "grad_norm": 1.5044524669647217, "learning_rate": 4.284119736174531e-06, "loss": 0.011, "step": 6729 }, { "epoch": 6.146118721461187, "grad_norm": 11.114073753356934, "learning_rate": 4.283105022831051e-06, "loss": 0.054, "step": 6730 }, { "epoch": 6.14703196347032, "grad_norm": 0.7339162230491638, "learning_rate": 4.2820903094875695e-06, "loss": 0.0048, "step": 6731 }, { "epoch": 6.147945205479452, "grad_norm": 1.3143880367279053, "learning_rate": 4.28107559614409e-06, "loss": 0.008, "step": 6732 }, { "epoch": 6.148858447488585, "grad_norm": 1.297751545906067, "learning_rate": 4.280060882800609e-06, "loss": 0.0058, "step": 6733 }, { "epoch": 6.149771689497717, "grad_norm": 0.8131924867630005, "learning_rate": 4.2790461694571285e-06, "loss": 0.0055, "step": 6734 }, { "epoch": 6.1506849315068495, "grad_norm": 0.27117469906806946, "learning_rate": 4.278031456113648e-06, "loss": 0.0018, "step": 6735 }, { "epoch": 6.151598173515982, "grad_norm": 20.289579391479492, "learning_rate": 4.277016742770168e-06, "loss": 0.1428, "step": 6736 }, { "epoch": 6.152511415525114, "grad_norm": 120.41574096679688, "learning_rate": 4.276002029426688e-06, "loss": 0.0713, "step": 6737 }, { "epoch": 6.153424657534247, "grad_norm": 0.6114298105239868, "learning_rate": 4.2749873160832065e-06, "loss": 0.0045, "step": 6738 }, { "epoch": 6.154337899543379, "grad_norm": 0.6526806950569153, "learning_rate": 4.273972602739727e-06, "loss": 0.0047, "step": 6739 }, { "epoch": 6.155251141552512, "grad_norm": 1.1017237901687622, "learning_rate": 4.272957889396246e-06, "loss": 0.0063, "step": 6740 }, { "epoch": 6.156164383561644, "grad_norm": 9.780041694641113, "learning_rate": 4.2719431760527655e-06, "loss": 0.0383, "step": 6741 }, { "epoch": 6.157077625570777, "grad_norm": 5.900444507598877, "learning_rate": 4.270928462709285e-06, "loss": 0.0365, "step": 6742 }, { "epoch": 6.157990867579909, "grad_norm": 3.963320016860962, "learning_rate": 4.269913749365804e-06, "loss": 0.0302, "step": 6743 }, { "epoch": 6.1589041095890416, "grad_norm": 20.0787296295166, "learning_rate": 4.268899036022325e-06, "loss": 0.2288, "step": 6744 }, { "epoch": 6.159817351598173, "grad_norm": 2.631141424179077, "learning_rate": 4.2678843226788435e-06, "loss": 0.0178, "step": 6745 }, { "epoch": 6.160730593607306, "grad_norm": 11.776433944702148, "learning_rate": 4.266869609335363e-06, "loss": 0.089, "step": 6746 }, { "epoch": 6.161643835616438, "grad_norm": 5.413474082946777, "learning_rate": 4.265854895991883e-06, "loss": 0.0187, "step": 6747 }, { "epoch": 6.1625570776255705, "grad_norm": 3.8531060218811035, "learning_rate": 4.264840182648402e-06, "loss": 0.0226, "step": 6748 }, { "epoch": 6.163470319634703, "grad_norm": 0.09569501131772995, "learning_rate": 4.263825469304922e-06, "loss": 0.0006, "step": 6749 }, { "epoch": 6.164383561643835, "grad_norm": 42.05181121826172, "learning_rate": 4.262810755961441e-06, "loss": 0.3266, "step": 6750 }, { "epoch": 6.165296803652968, "grad_norm": 1.9713647365570068, "learning_rate": 4.261796042617961e-06, "loss": 0.0165, "step": 6751 }, { "epoch": 6.1662100456621, "grad_norm": 4.902543544769287, "learning_rate": 4.2607813292744805e-06, "loss": 0.0251, "step": 6752 }, { "epoch": 6.167123287671233, "grad_norm": 0.6517849564552307, "learning_rate": 4.259766615930999e-06, "loss": 0.0039, "step": 6753 }, { "epoch": 6.168036529680365, "grad_norm": 0.10845935344696045, "learning_rate": 4.25875190258752e-06, "loss": 0.0008, "step": 6754 }, { "epoch": 6.168949771689498, "grad_norm": 3.601686716079712, "learning_rate": 4.257737189244039e-06, "loss": 0.0285, "step": 6755 }, { "epoch": 6.16986301369863, "grad_norm": 1.9998220205307007, "learning_rate": 4.256722475900558e-06, "loss": 0.0122, "step": 6756 }, { "epoch": 6.170776255707763, "grad_norm": 0.08422388881444931, "learning_rate": 4.255707762557078e-06, "loss": 0.0007, "step": 6757 }, { "epoch": 6.171689497716895, "grad_norm": 1.8304743766784668, "learning_rate": 4.254693049213598e-06, "loss": 0.0134, "step": 6758 }, { "epoch": 6.1726027397260275, "grad_norm": 0.2625288963317871, "learning_rate": 4.2536783358701175e-06, "loss": 0.0018, "step": 6759 }, { "epoch": 6.17351598173516, "grad_norm": 1.0943458080291748, "learning_rate": 4.252663622526636e-06, "loss": 0.009, "step": 6760 }, { "epoch": 6.174429223744292, "grad_norm": 13.919066429138184, "learning_rate": 4.251648909183156e-06, "loss": 0.0651, "step": 6761 }, { "epoch": 6.175342465753425, "grad_norm": 11.04566764831543, "learning_rate": 4.250634195839676e-06, "loss": 0.0638, "step": 6762 }, { "epoch": 6.176255707762557, "grad_norm": 1.874843955039978, "learning_rate": 4.249619482496195e-06, "loss": 0.0105, "step": 6763 }, { "epoch": 6.17716894977169, "grad_norm": 9.05394172668457, "learning_rate": 4.248604769152715e-06, "loss": 0.0644, "step": 6764 }, { "epoch": 6.178082191780822, "grad_norm": 0.6410936117172241, "learning_rate": 4.247590055809234e-06, "loss": 0.005, "step": 6765 }, { "epoch": 6.178995433789955, "grad_norm": 2.1817102432250977, "learning_rate": 4.246575342465754e-06, "loss": 0.0162, "step": 6766 }, { "epoch": 6.179908675799087, "grad_norm": 4.377900123596191, "learning_rate": 4.245560629122273e-06, "loss": 0.0111, "step": 6767 }, { "epoch": 6.1808219178082195, "grad_norm": 8.115923881530762, "learning_rate": 4.244545915778793e-06, "loss": 0.0656, "step": 6768 }, { "epoch": 6.181735159817352, "grad_norm": 0.5255022644996643, "learning_rate": 4.243531202435313e-06, "loss": 0.0032, "step": 6769 }, { "epoch": 6.182648401826484, "grad_norm": 0.27667495608329773, "learning_rate": 4.2425164890918315e-06, "loss": 0.0025, "step": 6770 }, { "epoch": 6.183561643835616, "grad_norm": 2.2845890522003174, "learning_rate": 4.241501775748351e-06, "loss": 0.0118, "step": 6771 }, { "epoch": 6.1844748858447485, "grad_norm": 1.5352692604064941, "learning_rate": 4.240487062404871e-06, "loss": 0.0083, "step": 6772 }, { "epoch": 6.185388127853881, "grad_norm": 0.9221170544624329, "learning_rate": 4.239472349061391e-06, "loss": 0.005, "step": 6773 }, { "epoch": 6.186301369863013, "grad_norm": 0.32310834527015686, "learning_rate": 4.23845763571791e-06, "loss": 0.0019, "step": 6774 }, { "epoch": 6.187214611872146, "grad_norm": 6.1528730392456055, "learning_rate": 4.237442922374429e-06, "loss": 0.0435, "step": 6775 }, { "epoch": 6.188127853881278, "grad_norm": 10.22537612915039, "learning_rate": 4.236428209030949e-06, "loss": 0.0732, "step": 6776 }, { "epoch": 6.189041095890411, "grad_norm": 1.1491656303405762, "learning_rate": 4.2354134956874685e-06, "loss": 0.0096, "step": 6777 }, { "epoch": 6.189954337899543, "grad_norm": 3.8150155544281006, "learning_rate": 4.234398782343988e-06, "loss": 0.0273, "step": 6778 }, { "epoch": 6.190867579908676, "grad_norm": 7.561823844909668, "learning_rate": 4.233384069000508e-06, "loss": 0.0394, "step": 6779 }, { "epoch": 6.191780821917808, "grad_norm": 1.9700465202331543, "learning_rate": 4.232369355657028e-06, "loss": 0.0153, "step": 6780 }, { "epoch": 6.1926940639269406, "grad_norm": 0.47760000824928284, "learning_rate": 4.2313546423135464e-06, "loss": 0.0021, "step": 6781 }, { "epoch": 6.193607305936073, "grad_norm": 0.015400651842355728, "learning_rate": 4.230339928970066e-06, "loss": 0.0001, "step": 6782 }, { "epoch": 6.1945205479452055, "grad_norm": 0.2830146253108978, "learning_rate": 4.229325215626586e-06, "loss": 0.0023, "step": 6783 }, { "epoch": 6.195433789954338, "grad_norm": 1.702457308769226, "learning_rate": 4.2283105022831055e-06, "loss": 0.01, "step": 6784 }, { "epoch": 6.19634703196347, "grad_norm": 6.731873989105225, "learning_rate": 4.227295788939625e-06, "loss": 0.0451, "step": 6785 }, { "epoch": 6.197260273972603, "grad_norm": 6.469781398773193, "learning_rate": 4.226281075596144e-06, "loss": 0.0402, "step": 6786 }, { "epoch": 6.198173515981735, "grad_norm": 11.086577415466309, "learning_rate": 4.225266362252664e-06, "loss": 0.0615, "step": 6787 }, { "epoch": 6.199086757990868, "grad_norm": 0.4773727357387543, "learning_rate": 4.2242516489091834e-06, "loss": 0.0035, "step": 6788 }, { "epoch": 6.2, "grad_norm": 0.693611204624176, "learning_rate": 4.223236935565703e-06, "loss": 0.0042, "step": 6789 }, { "epoch": 6.200913242009133, "grad_norm": 0.06558989733457565, "learning_rate": 4.222222222222223e-06, "loss": 0.0003, "step": 6790 }, { "epoch": 6.201826484018265, "grad_norm": 0.07359948754310608, "learning_rate": 4.221207508878742e-06, "loss": 0.0005, "step": 6791 }, { "epoch": 6.2027397260273975, "grad_norm": 3.894676446914673, "learning_rate": 4.220192795535261e-06, "loss": 0.0178, "step": 6792 }, { "epoch": 6.20365296803653, "grad_norm": 0.09555403143167496, "learning_rate": 4.219178082191781e-06, "loss": 0.0005, "step": 6793 }, { "epoch": 6.2045662100456624, "grad_norm": 1.4613821506500244, "learning_rate": 4.218163368848301e-06, "loss": 0.0079, "step": 6794 }, { "epoch": 6.205479452054795, "grad_norm": 0.08552147448062897, "learning_rate": 4.2171486555048204e-06, "loss": 0.0006, "step": 6795 }, { "epoch": 6.206392694063927, "grad_norm": 5.755458354949951, "learning_rate": 4.216133942161339e-06, "loss": 0.0431, "step": 6796 }, { "epoch": 6.207305936073059, "grad_norm": 88.20272827148438, "learning_rate": 4.215119228817859e-06, "loss": 0.1848, "step": 6797 }, { "epoch": 6.208219178082191, "grad_norm": 0.19376303255558014, "learning_rate": 4.214104515474379e-06, "loss": 0.0017, "step": 6798 }, { "epoch": 6.209132420091324, "grad_norm": 4.126441955566406, "learning_rate": 4.213089802130898e-06, "loss": 0.0287, "step": 6799 }, { "epoch": 6.210045662100456, "grad_norm": 4.596734046936035, "learning_rate": 4.212075088787418e-06, "loss": 0.0331, "step": 6800 }, { "epoch": 6.210958904109589, "grad_norm": 6.757684707641602, "learning_rate": 4.211060375443937e-06, "loss": 0.057, "step": 6801 }, { "epoch": 6.211872146118721, "grad_norm": 1.0009469985961914, "learning_rate": 4.2100456621004574e-06, "loss": 0.0056, "step": 6802 }, { "epoch": 6.212785388127854, "grad_norm": 0.995919406414032, "learning_rate": 4.209030948756976e-06, "loss": 0.0069, "step": 6803 }, { "epoch": 6.213698630136986, "grad_norm": 4.762990474700928, "learning_rate": 4.208016235413496e-06, "loss": 0.0242, "step": 6804 }, { "epoch": 6.2146118721461185, "grad_norm": 5.254402160644531, "learning_rate": 4.207001522070016e-06, "loss": 0.0506, "step": 6805 }, { "epoch": 6.215525114155251, "grad_norm": 1.3303180932998657, "learning_rate": 4.2059868087265345e-06, "loss": 0.0093, "step": 6806 }, { "epoch": 6.2164383561643834, "grad_norm": 0.7027679681777954, "learning_rate": 4.204972095383055e-06, "loss": 0.0048, "step": 6807 }, { "epoch": 6.217351598173516, "grad_norm": 0.378203809261322, "learning_rate": 4.203957382039574e-06, "loss": 0.0026, "step": 6808 }, { "epoch": 6.218264840182648, "grad_norm": 24.76675033569336, "learning_rate": 4.202942668696094e-06, "loss": 0.1199, "step": 6809 }, { "epoch": 6.219178082191781, "grad_norm": 50.69792556762695, "learning_rate": 4.201927955352613e-06, "loss": 0.2935, "step": 6810 }, { "epoch": 6.220091324200913, "grad_norm": 0.905124843120575, "learning_rate": 4.200913242009132e-06, "loss": 0.005, "step": 6811 }, { "epoch": 6.221004566210046, "grad_norm": 31.599998474121094, "learning_rate": 4.199898528665653e-06, "loss": 0.1446, "step": 6812 }, { "epoch": 6.221917808219178, "grad_norm": 0.17856910824775696, "learning_rate": 4.1988838153221715e-06, "loss": 0.0013, "step": 6813 }, { "epoch": 6.222831050228311, "grad_norm": 5.8702592849731445, "learning_rate": 4.197869101978691e-06, "loss": 0.0409, "step": 6814 }, { "epoch": 6.223744292237443, "grad_norm": 31.27927017211914, "learning_rate": 4.196854388635211e-06, "loss": 0.1384, "step": 6815 }, { "epoch": 6.2246575342465755, "grad_norm": 0.5319919586181641, "learning_rate": 4.195839675291731e-06, "loss": 0.0035, "step": 6816 }, { "epoch": 6.225570776255708, "grad_norm": 0.7209212779998779, "learning_rate": 4.19482496194825e-06, "loss": 0.0045, "step": 6817 }, { "epoch": 6.22648401826484, "grad_norm": 0.054890722036361694, "learning_rate": 4.193810248604769e-06, "loss": 0.0003, "step": 6818 }, { "epoch": 6.227397260273973, "grad_norm": 3.387166976928711, "learning_rate": 4.192795535261289e-06, "loss": 0.0138, "step": 6819 }, { "epoch": 6.228310502283105, "grad_norm": 1.2290842533111572, "learning_rate": 4.1917808219178085e-06, "loss": 0.0071, "step": 6820 }, { "epoch": 6.229223744292238, "grad_norm": 9.553555488586426, "learning_rate": 4.190766108574328e-06, "loss": 0.0503, "step": 6821 }, { "epoch": 6.23013698630137, "grad_norm": 2.534789800643921, "learning_rate": 4.189751395230848e-06, "loss": 0.0141, "step": 6822 }, { "epoch": 6.231050228310503, "grad_norm": 0.9429089426994324, "learning_rate": 4.188736681887367e-06, "loss": 0.0054, "step": 6823 }, { "epoch": 6.231963470319634, "grad_norm": 1.3798002004623413, "learning_rate": 4.187721968543887e-06, "loss": 0.0073, "step": 6824 }, { "epoch": 6.232876712328767, "grad_norm": 9.449101448059082, "learning_rate": 4.186707255200406e-06, "loss": 0.0749, "step": 6825 }, { "epoch": 6.233789954337899, "grad_norm": 1.1923993825912476, "learning_rate": 4.185692541856926e-06, "loss": 0.0071, "step": 6826 }, { "epoch": 6.234703196347032, "grad_norm": 3.9156134128570557, "learning_rate": 4.1846778285134455e-06, "loss": 0.0267, "step": 6827 }, { "epoch": 6.235616438356164, "grad_norm": 0.21118050813674927, "learning_rate": 4.183663115169964e-06, "loss": 0.0014, "step": 6828 }, { "epoch": 6.2365296803652965, "grad_norm": 0.7241155505180359, "learning_rate": 4.182648401826485e-06, "loss": 0.0038, "step": 6829 }, { "epoch": 6.237442922374429, "grad_norm": 0.5088173747062683, "learning_rate": 4.181633688483004e-06, "loss": 0.0033, "step": 6830 }, { "epoch": 6.238356164383561, "grad_norm": 1.0520215034484863, "learning_rate": 4.1806189751395234e-06, "loss": 0.0043, "step": 6831 }, { "epoch": 6.239269406392694, "grad_norm": 2.8140220642089844, "learning_rate": 4.179604261796043e-06, "loss": 0.0255, "step": 6832 }, { "epoch": 6.240182648401826, "grad_norm": 2.3580496311187744, "learning_rate": 4.178589548452562e-06, "loss": 0.0149, "step": 6833 }, { "epoch": 6.241095890410959, "grad_norm": 0.02440466731786728, "learning_rate": 4.1775748351090825e-06, "loss": 0.0002, "step": 6834 }, { "epoch": 6.242009132420091, "grad_norm": 0.23516830801963806, "learning_rate": 4.176560121765601e-06, "loss": 0.0016, "step": 6835 }, { "epoch": 6.242922374429224, "grad_norm": 0.19738870859146118, "learning_rate": 4.175545408422121e-06, "loss": 0.0011, "step": 6836 }, { "epoch": 6.243835616438356, "grad_norm": 0.002146003069356084, "learning_rate": 4.174530695078641e-06, "loss": 0.0, "step": 6837 }, { "epoch": 6.244748858447489, "grad_norm": 0.8813510537147522, "learning_rate": 4.1735159817351604e-06, "loss": 0.0048, "step": 6838 }, { "epoch": 6.245662100456621, "grad_norm": 0.08303648978471756, "learning_rate": 4.17250126839168e-06, "loss": 0.0005, "step": 6839 }, { "epoch": 6.2465753424657535, "grad_norm": 3.682274341583252, "learning_rate": 4.171486555048199e-06, "loss": 0.0142, "step": 6840 }, { "epoch": 6.247488584474886, "grad_norm": 5.727811336517334, "learning_rate": 4.170471841704719e-06, "loss": 0.0369, "step": 6841 }, { "epoch": 6.248401826484018, "grad_norm": 1.5457684993743896, "learning_rate": 4.169457128361238e-06, "loss": 0.0107, "step": 6842 }, { "epoch": 6.249315068493151, "grad_norm": 0.11563374847173691, "learning_rate": 4.168442415017758e-06, "loss": 0.0007, "step": 6843 }, { "epoch": 6.250228310502283, "grad_norm": 4.574007987976074, "learning_rate": 4.167427701674278e-06, "loss": 0.0226, "step": 6844 }, { "epoch": 6.251141552511416, "grad_norm": 0.4711828827857971, "learning_rate": 4.166412988330797e-06, "loss": 0.0035, "step": 6845 }, { "epoch": 6.252054794520548, "grad_norm": 3.992419719696045, "learning_rate": 4.165398274987316e-06, "loss": 0.0236, "step": 6846 }, { "epoch": 6.252968036529681, "grad_norm": 0.314698725938797, "learning_rate": 4.164383561643836e-06, "loss": 0.0021, "step": 6847 }, { "epoch": 6.253881278538813, "grad_norm": 32.063594818115234, "learning_rate": 4.163368848300356e-06, "loss": 0.2227, "step": 6848 }, { "epoch": 6.254794520547946, "grad_norm": 0.23530679941177368, "learning_rate": 4.162354134956875e-06, "loss": 0.0016, "step": 6849 }, { "epoch": 6.255707762557078, "grad_norm": 0.5140172243118286, "learning_rate": 4.161339421613394e-06, "loss": 0.0039, "step": 6850 }, { "epoch": 6.25662100456621, "grad_norm": 3.4896841049194336, "learning_rate": 4.160324708269914e-06, "loss": 0.031, "step": 6851 }, { "epoch": 6.257534246575342, "grad_norm": 13.906542778015137, "learning_rate": 4.1593099949264336e-06, "loss": 0.0912, "step": 6852 }, { "epoch": 6.2584474885844745, "grad_norm": 0.033591438084840775, "learning_rate": 4.158295281582953e-06, "loss": 0.0002, "step": 6853 }, { "epoch": 6.259360730593607, "grad_norm": 27.465139389038086, "learning_rate": 4.157280568239473e-06, "loss": 0.1775, "step": 6854 }, { "epoch": 6.260273972602739, "grad_norm": 1.1116390228271484, "learning_rate": 4.156265854895992e-06, "loss": 0.005, "step": 6855 }, { "epoch": 6.261187214611872, "grad_norm": 1.14852774143219, "learning_rate": 4.1552511415525115e-06, "loss": 0.0076, "step": 6856 }, { "epoch": 6.262100456621004, "grad_norm": 0.4255444407463074, "learning_rate": 4.154236428209031e-06, "loss": 0.0022, "step": 6857 }, { "epoch": 6.263013698630137, "grad_norm": 0.019059348851442337, "learning_rate": 4.153221714865551e-06, "loss": 0.0001, "step": 6858 }, { "epoch": 6.263926940639269, "grad_norm": 4.696826457977295, "learning_rate": 4.1522070015220706e-06, "loss": 0.0244, "step": 6859 }, { "epoch": 6.264840182648402, "grad_norm": 0.15584518015384674, "learning_rate": 4.15119228817859e-06, "loss": 0.0009, "step": 6860 }, { "epoch": 6.265753424657534, "grad_norm": 0.27248692512512207, "learning_rate": 4.150177574835109e-06, "loss": 0.0014, "step": 6861 }, { "epoch": 6.266666666666667, "grad_norm": 0.10569839179515839, "learning_rate": 4.149162861491629e-06, "loss": 0.0008, "step": 6862 }, { "epoch": 6.267579908675799, "grad_norm": 0.2406303882598877, "learning_rate": 4.1481481481481485e-06, "loss": 0.0019, "step": 6863 }, { "epoch": 6.2684931506849315, "grad_norm": 0.24549958109855652, "learning_rate": 4.147133434804668e-06, "loss": 0.0013, "step": 6864 }, { "epoch": 6.269406392694064, "grad_norm": 10.518689155578613, "learning_rate": 4.146118721461188e-06, "loss": 0.0841, "step": 6865 }, { "epoch": 6.270319634703196, "grad_norm": 3.756420612335205, "learning_rate": 4.145104008117707e-06, "loss": 0.0277, "step": 6866 }, { "epoch": 6.271232876712329, "grad_norm": 0.7563664317131042, "learning_rate": 4.144089294774226e-06, "loss": 0.0053, "step": 6867 }, { "epoch": 6.272146118721461, "grad_norm": 1.5247397422790527, "learning_rate": 4.143074581430746e-06, "loss": 0.0094, "step": 6868 }, { "epoch": 6.273059360730594, "grad_norm": 0.2873117923736572, "learning_rate": 4.142059868087266e-06, "loss": 0.0019, "step": 6869 }, { "epoch": 6.273972602739726, "grad_norm": 0.10716048628091812, "learning_rate": 4.1410451547437855e-06, "loss": 0.0007, "step": 6870 }, { "epoch": 6.274885844748859, "grad_norm": 0.093154676258564, "learning_rate": 4.140030441400304e-06, "loss": 0.0007, "step": 6871 }, { "epoch": 6.275799086757991, "grad_norm": 3.9340598583221436, "learning_rate": 4.139015728056824e-06, "loss": 0.0189, "step": 6872 }, { "epoch": 6.276712328767124, "grad_norm": 2.4916086196899414, "learning_rate": 4.138001014713344e-06, "loss": 0.0108, "step": 6873 }, { "epoch": 6.277625570776256, "grad_norm": 0.5219421982765198, "learning_rate": 4.136986301369863e-06, "loss": 0.0032, "step": 6874 }, { "epoch": 6.2785388127853885, "grad_norm": 5.85858678817749, "learning_rate": 4.135971588026383e-06, "loss": 0.0345, "step": 6875 }, { "epoch": 6.279452054794521, "grad_norm": 1.6015446186065674, "learning_rate": 4.134956874682902e-06, "loss": 0.0073, "step": 6876 }, { "epoch": 6.280365296803653, "grad_norm": 4.440736293792725, "learning_rate": 4.133942161339422e-06, "loss": 0.0308, "step": 6877 }, { "epoch": 6.281278538812785, "grad_norm": 0.24405984580516815, "learning_rate": 4.132927447995941e-06, "loss": 0.0017, "step": 6878 }, { "epoch": 6.282191780821917, "grad_norm": 2.3858978748321533, "learning_rate": 4.131912734652461e-06, "loss": 0.0092, "step": 6879 }, { "epoch": 6.28310502283105, "grad_norm": 1.1896746158599854, "learning_rate": 4.130898021308981e-06, "loss": 0.0072, "step": 6880 }, { "epoch": 6.284018264840182, "grad_norm": 2.189816951751709, "learning_rate": 4.1298833079654996e-06, "loss": 0.0112, "step": 6881 }, { "epoch": 6.284931506849315, "grad_norm": 1.8657976388931274, "learning_rate": 4.12886859462202e-06, "loss": 0.0131, "step": 6882 }, { "epoch": 6.285844748858447, "grad_norm": 0.7329529523849487, "learning_rate": 4.127853881278539e-06, "loss": 0.0049, "step": 6883 }, { "epoch": 6.28675799086758, "grad_norm": 1.1498953104019165, "learning_rate": 4.126839167935059e-06, "loss": 0.0085, "step": 6884 }, { "epoch": 6.287671232876712, "grad_norm": 19.263713836669922, "learning_rate": 4.125824454591578e-06, "loss": 0.0923, "step": 6885 }, { "epoch": 6.288584474885845, "grad_norm": 3.7987637519836426, "learning_rate": 4.124809741248097e-06, "loss": 0.0284, "step": 6886 }, { "epoch": 6.289497716894977, "grad_norm": 10.595112800598145, "learning_rate": 4.123795027904618e-06, "loss": 0.0739, "step": 6887 }, { "epoch": 6.2904109589041095, "grad_norm": 0.024388771504163742, "learning_rate": 4.1227803145611366e-06, "loss": 0.0001, "step": 6888 }, { "epoch": 6.291324200913242, "grad_norm": 0.24699735641479492, "learning_rate": 4.121765601217656e-06, "loss": 0.0018, "step": 6889 }, { "epoch": 6.292237442922374, "grad_norm": 0.39845705032348633, "learning_rate": 4.120750887874176e-06, "loss": 0.0011, "step": 6890 }, { "epoch": 6.293150684931507, "grad_norm": 18.53983497619629, "learning_rate": 4.119736174530695e-06, "loss": 0.132, "step": 6891 }, { "epoch": 6.294063926940639, "grad_norm": 0.7509304285049438, "learning_rate": 4.118721461187215e-06, "loss": 0.0038, "step": 6892 }, { "epoch": 6.294977168949772, "grad_norm": 0.7295032143592834, "learning_rate": 4.117706747843734e-06, "loss": 0.0049, "step": 6893 }, { "epoch": 6.295890410958904, "grad_norm": 11.88966178894043, "learning_rate": 4.116692034500254e-06, "loss": 0.072, "step": 6894 }, { "epoch": 6.296803652968037, "grad_norm": 1.2629812955856323, "learning_rate": 4.1156773211567736e-06, "loss": 0.0074, "step": 6895 }, { "epoch": 6.297716894977169, "grad_norm": 0.07687364518642426, "learning_rate": 4.114662607813293e-06, "loss": 0.0005, "step": 6896 }, { "epoch": 6.298630136986302, "grad_norm": 5.105025291442871, "learning_rate": 4.113647894469813e-06, "loss": 0.037, "step": 6897 }, { "epoch": 6.299543378995434, "grad_norm": 2.050854444503784, "learning_rate": 4.112633181126332e-06, "loss": 0.0121, "step": 6898 }, { "epoch": 6.3004566210045665, "grad_norm": 0.24067328870296478, "learning_rate": 4.1116184677828515e-06, "loss": 0.0013, "step": 6899 }, { "epoch": 6.301369863013699, "grad_norm": 5.805943012237549, "learning_rate": 4.110603754439371e-06, "loss": 0.0326, "step": 6900 }, { "epoch": 6.302283105022831, "grad_norm": 3.6188406944274902, "learning_rate": 4.109589041095891e-06, "loss": 0.0198, "step": 6901 }, { "epoch": 6.303196347031964, "grad_norm": 4.875394344329834, "learning_rate": 4.1085743277524106e-06, "loss": 0.0149, "step": 6902 }, { "epoch": 6.304109589041096, "grad_norm": 0.4673748314380646, "learning_rate": 4.107559614408929e-06, "loss": 0.0033, "step": 6903 }, { "epoch": 6.305022831050229, "grad_norm": 0.27980589866638184, "learning_rate": 4.10654490106545e-06, "loss": 0.0016, "step": 6904 }, { "epoch": 6.30593607305936, "grad_norm": 2.966465711593628, "learning_rate": 4.105530187721969e-06, "loss": 0.0223, "step": 6905 }, { "epoch": 6.306849315068493, "grad_norm": 0.48050227761268616, "learning_rate": 4.1045154743784885e-06, "loss": 0.0031, "step": 6906 }, { "epoch": 6.307762557077625, "grad_norm": 14.805272102355957, "learning_rate": 4.103500761035008e-06, "loss": 0.0823, "step": 6907 }, { "epoch": 6.308675799086758, "grad_norm": 0.6908344626426697, "learning_rate": 4.102486047691527e-06, "loss": 0.0044, "step": 6908 }, { "epoch": 6.30958904109589, "grad_norm": 13.449264526367188, "learning_rate": 4.1014713343480476e-06, "loss": 0.0681, "step": 6909 }, { "epoch": 6.310502283105023, "grad_norm": 3.1683664321899414, "learning_rate": 4.100456621004566e-06, "loss": 0.0228, "step": 6910 }, { "epoch": 6.311415525114155, "grad_norm": 3.1751458644866943, "learning_rate": 4.099441907661086e-06, "loss": 0.0187, "step": 6911 }, { "epoch": 6.3123287671232875, "grad_norm": 5.655699253082275, "learning_rate": 4.098427194317606e-06, "loss": 0.0279, "step": 6912 }, { "epoch": 6.31324200913242, "grad_norm": 0.9031633734703064, "learning_rate": 4.097412480974125e-06, "loss": 0.0037, "step": 6913 }, { "epoch": 6.314155251141552, "grad_norm": 0.06055277958512306, "learning_rate": 4.096397767630645e-06, "loss": 0.0004, "step": 6914 }, { "epoch": 6.315068493150685, "grad_norm": 0.2813037931919098, "learning_rate": 4.095383054287164e-06, "loss": 0.002, "step": 6915 }, { "epoch": 6.315981735159817, "grad_norm": 1.968657374382019, "learning_rate": 4.094368340943684e-06, "loss": 0.015, "step": 6916 }, { "epoch": 6.31689497716895, "grad_norm": 1.0426979064941406, "learning_rate": 4.093353627600203e-06, "loss": 0.0038, "step": 6917 }, { "epoch": 6.317808219178082, "grad_norm": 0.05499563366174698, "learning_rate": 4.092338914256723e-06, "loss": 0.0004, "step": 6918 }, { "epoch": 6.318721461187215, "grad_norm": 0.4272569715976715, "learning_rate": 4.091324200913243e-06, "loss": 0.0023, "step": 6919 }, { "epoch": 6.319634703196347, "grad_norm": 13.944257736206055, "learning_rate": 4.090309487569762e-06, "loss": 0.0804, "step": 6920 }, { "epoch": 6.32054794520548, "grad_norm": 0.9417294859886169, "learning_rate": 4.089294774226281e-06, "loss": 0.0036, "step": 6921 }, { "epoch": 6.321461187214612, "grad_norm": 2.787529468536377, "learning_rate": 4.088280060882801e-06, "loss": 0.0149, "step": 6922 }, { "epoch": 6.3223744292237445, "grad_norm": 0.8351179957389832, "learning_rate": 4.087265347539321e-06, "loss": 0.0052, "step": 6923 }, { "epoch": 6.323287671232877, "grad_norm": 0.30979084968566895, "learning_rate": 4.08625063419584e-06, "loss": 0.0016, "step": 6924 }, { "epoch": 6.324200913242009, "grad_norm": 9.95527172088623, "learning_rate": 4.085235920852359e-06, "loss": 0.0321, "step": 6925 }, { "epoch": 6.325114155251142, "grad_norm": 0.04992785304784775, "learning_rate": 4.084221207508879e-06, "loss": 0.0003, "step": 6926 }, { "epoch": 6.326027397260274, "grad_norm": 0.15238311886787415, "learning_rate": 4.083206494165399e-06, "loss": 0.0009, "step": 6927 }, { "epoch": 6.326940639269407, "grad_norm": 0.5168843269348145, "learning_rate": 4.082191780821918e-06, "loss": 0.0028, "step": 6928 }, { "epoch": 6.327853881278539, "grad_norm": 0.5306276679039001, "learning_rate": 4.081177067478438e-06, "loss": 0.0041, "step": 6929 }, { "epoch": 6.328767123287671, "grad_norm": 1.0053575038909912, "learning_rate": 4.080162354134957e-06, "loss": 0.0069, "step": 6930 }, { "epoch": 6.329680365296804, "grad_norm": 4.012270927429199, "learning_rate": 4.0791476407914765e-06, "loss": 0.0187, "step": 6931 }, { "epoch": 6.330593607305936, "grad_norm": 1.2154620885849, "learning_rate": 4.078132927447996e-06, "loss": 0.0067, "step": 6932 }, { "epoch": 6.331506849315068, "grad_norm": 58.367191314697266, "learning_rate": 4.077118214104516e-06, "loss": 0.4382, "step": 6933 }, { "epoch": 6.332420091324201, "grad_norm": 9.272238731384277, "learning_rate": 4.076103500761036e-06, "loss": 0.0569, "step": 6934 }, { "epoch": 6.333333333333333, "grad_norm": 8.835707664489746, "learning_rate": 4.0750887874175545e-06, "loss": 0.0382, "step": 6935 }, { "epoch": 6.3342465753424655, "grad_norm": 1.3037166595458984, "learning_rate": 4.074074074074074e-06, "loss": 0.0071, "step": 6936 }, { "epoch": 6.335159817351598, "grad_norm": 35.22101974487305, "learning_rate": 4.073059360730594e-06, "loss": 0.1946, "step": 6937 }, { "epoch": 6.33607305936073, "grad_norm": 1.5241445302963257, "learning_rate": 4.0720446473871135e-06, "loss": 0.0083, "step": 6938 }, { "epoch": 6.336986301369863, "grad_norm": 1.007462978363037, "learning_rate": 4.071029934043633e-06, "loss": 0.0067, "step": 6939 }, { "epoch": 6.337899543378995, "grad_norm": 0.24292446672916412, "learning_rate": 4.070015220700153e-06, "loss": 0.0011, "step": 6940 }, { "epoch": 6.338812785388128, "grad_norm": 45.846920013427734, "learning_rate": 4.069000507356672e-06, "loss": 0.3543, "step": 6941 }, { "epoch": 6.33972602739726, "grad_norm": 1.5756056308746338, "learning_rate": 4.0679857940131915e-06, "loss": 0.0085, "step": 6942 }, { "epoch": 6.340639269406393, "grad_norm": 29.641084671020508, "learning_rate": 4.066971080669711e-06, "loss": 0.0964, "step": 6943 }, { "epoch": 6.341552511415525, "grad_norm": 0.5608273148536682, "learning_rate": 4.065956367326231e-06, "loss": 0.0043, "step": 6944 }, { "epoch": 6.342465753424658, "grad_norm": 3.232574224472046, "learning_rate": 4.0649416539827505e-06, "loss": 0.0134, "step": 6945 }, { "epoch": 6.34337899543379, "grad_norm": 0.03192431479692459, "learning_rate": 4.063926940639269e-06, "loss": 0.0002, "step": 6946 }, { "epoch": 6.3442922374429225, "grad_norm": 0.4342397153377533, "learning_rate": 4.062912227295789e-06, "loss": 0.003, "step": 6947 }, { "epoch": 6.345205479452055, "grad_norm": 2.2297606468200684, "learning_rate": 4.061897513952309e-06, "loss": 0.0086, "step": 6948 }, { "epoch": 6.346118721461187, "grad_norm": 2.409973621368408, "learning_rate": 4.0608828006088285e-06, "loss": 0.013, "step": 6949 }, { "epoch": 6.34703196347032, "grad_norm": 96.89573669433594, "learning_rate": 4.059868087265348e-06, "loss": 0.7296, "step": 6950 }, { "epoch": 6.347945205479452, "grad_norm": 0.006951743736863136, "learning_rate": 4.058853373921867e-06, "loss": 0.0001, "step": 6951 }, { "epoch": 6.348858447488585, "grad_norm": 17.920806884765625, "learning_rate": 4.057838660578387e-06, "loss": 0.0647, "step": 6952 }, { "epoch": 6.349771689497717, "grad_norm": 1.4777129888534546, "learning_rate": 4.056823947234906e-06, "loss": 0.008, "step": 6953 }, { "epoch": 6.35068493150685, "grad_norm": 0.3478490710258484, "learning_rate": 4.055809233891426e-06, "loss": 0.0012, "step": 6954 }, { "epoch": 6.351598173515982, "grad_norm": 2.8330349922180176, "learning_rate": 4.054794520547946e-06, "loss": 0.0177, "step": 6955 }, { "epoch": 6.352511415525115, "grad_norm": 1.0035580396652222, "learning_rate": 4.053779807204465e-06, "loss": 0.0066, "step": 6956 }, { "epoch": 6.353424657534246, "grad_norm": 22.757213592529297, "learning_rate": 4.052765093860984e-06, "loss": 0.1034, "step": 6957 }, { "epoch": 6.3543378995433795, "grad_norm": 0.8665732741355896, "learning_rate": 4.051750380517504e-06, "loss": 0.004, "step": 6958 }, { "epoch": 6.355251141552511, "grad_norm": 54.91313552856445, "learning_rate": 4.050735667174024e-06, "loss": 0.3885, "step": 6959 }, { "epoch": 6.3561643835616435, "grad_norm": 6.113328456878662, "learning_rate": 4.049720953830543e-06, "loss": 0.0313, "step": 6960 }, { "epoch": 6.357077625570776, "grad_norm": 9.877979278564453, "learning_rate": 4.048706240487062e-06, "loss": 0.0584, "step": 6961 }, { "epoch": 6.357990867579908, "grad_norm": 1.3660794496536255, "learning_rate": 4.047691527143583e-06, "loss": 0.0095, "step": 6962 }, { "epoch": 6.358904109589041, "grad_norm": 1.8282767534255981, "learning_rate": 4.046676813800102e-06, "loss": 0.0106, "step": 6963 }, { "epoch": 6.359817351598173, "grad_norm": 11.285603523254395, "learning_rate": 4.045662100456621e-06, "loss": 0.0692, "step": 6964 }, { "epoch": 6.360730593607306, "grad_norm": 1.7713717222213745, "learning_rate": 4.044647387113141e-06, "loss": 0.0073, "step": 6965 }, { "epoch": 6.361643835616438, "grad_norm": 0.3767909109592438, "learning_rate": 4.04363267376966e-06, "loss": 0.0029, "step": 6966 }, { "epoch": 6.362557077625571, "grad_norm": 0.48863673210144043, "learning_rate": 4.04261796042618e-06, "loss": 0.004, "step": 6967 }, { "epoch": 6.363470319634703, "grad_norm": 0.13794094324111938, "learning_rate": 4.041603247082699e-06, "loss": 0.0009, "step": 6968 }, { "epoch": 6.364383561643836, "grad_norm": 5.231997966766357, "learning_rate": 4.040588533739219e-06, "loss": 0.0395, "step": 6969 }, { "epoch": 6.365296803652968, "grad_norm": 0.4628351032733917, "learning_rate": 4.039573820395739e-06, "loss": 0.0033, "step": 6970 }, { "epoch": 6.3662100456621005, "grad_norm": 0.11253214627504349, "learning_rate": 4.0385591070522574e-06, "loss": 0.0008, "step": 6971 }, { "epoch": 6.367123287671233, "grad_norm": 1.1282309293746948, "learning_rate": 4.037544393708778e-06, "loss": 0.0103, "step": 6972 }, { "epoch": 6.368036529680365, "grad_norm": 0.4475432336330414, "learning_rate": 4.036529680365297e-06, "loss": 0.0032, "step": 6973 }, { "epoch": 6.368949771689498, "grad_norm": 0.4712230861186981, "learning_rate": 4.0355149670218165e-06, "loss": 0.003, "step": 6974 }, { "epoch": 6.36986301369863, "grad_norm": 0.030544178560376167, "learning_rate": 4.034500253678336e-06, "loss": 0.0002, "step": 6975 }, { "epoch": 6.370776255707763, "grad_norm": 19.260398864746094, "learning_rate": 4.033485540334856e-06, "loss": 0.1702, "step": 6976 }, { "epoch": 6.371689497716895, "grad_norm": 9.19736099243164, "learning_rate": 4.032470826991376e-06, "loss": 0.0349, "step": 6977 }, { "epoch": 6.372602739726028, "grad_norm": 0.5086756944656372, "learning_rate": 4.0314561136478944e-06, "loss": 0.0038, "step": 6978 }, { "epoch": 6.37351598173516, "grad_norm": 5.011247158050537, "learning_rate": 4.030441400304414e-06, "loss": 0.0266, "step": 6979 }, { "epoch": 6.3744292237442925, "grad_norm": 4.183140754699707, "learning_rate": 4.029426686960934e-06, "loss": 0.0259, "step": 6980 }, { "epoch": 6.375342465753425, "grad_norm": 9.8602933883667, "learning_rate": 4.0284119736174535e-06, "loss": 0.049, "step": 6981 }, { "epoch": 6.3762557077625575, "grad_norm": 0.49213606119155884, "learning_rate": 4.027397260273973e-06, "loss": 0.0018, "step": 6982 }, { "epoch": 6.37716894977169, "grad_norm": 0.050000615417957306, "learning_rate": 4.026382546930492e-06, "loss": 0.0003, "step": 6983 }, { "epoch": 6.3780821917808215, "grad_norm": 40.05522537231445, "learning_rate": 4.025367833587013e-06, "loss": 0.2743, "step": 6984 }, { "epoch": 6.378995433789954, "grad_norm": 45.65336608886719, "learning_rate": 4.0243531202435314e-06, "loss": 0.3846, "step": 6985 }, { "epoch": 6.379908675799086, "grad_norm": 2.325242519378662, "learning_rate": 4.023338406900051e-06, "loss": 0.0191, "step": 6986 }, { "epoch": 6.380821917808219, "grad_norm": 53.97871017456055, "learning_rate": 4.022323693556571e-06, "loss": 0.3416, "step": 6987 }, { "epoch": 6.381735159817351, "grad_norm": 3.10971999168396, "learning_rate": 4.02130898021309e-06, "loss": 0.0136, "step": 6988 }, { "epoch": 6.382648401826484, "grad_norm": 0.12184051424264908, "learning_rate": 4.02029426686961e-06, "loss": 0.0009, "step": 6989 }, { "epoch": 6.383561643835616, "grad_norm": 0.46952319145202637, "learning_rate": 4.019279553526129e-06, "loss": 0.002, "step": 6990 }, { "epoch": 6.384474885844749, "grad_norm": 0.13305631279945374, "learning_rate": 4.018264840182649e-06, "loss": 0.0007, "step": 6991 }, { "epoch": 6.385388127853881, "grad_norm": 0.055190395563840866, "learning_rate": 4.0172501268391684e-06, "loss": 0.0003, "step": 6992 }, { "epoch": 6.3863013698630136, "grad_norm": 9.205158233642578, "learning_rate": 4.016235413495687e-06, "loss": 0.0587, "step": 6993 }, { "epoch": 6.387214611872146, "grad_norm": 1.930761456489563, "learning_rate": 4.015220700152208e-06, "loss": 0.0101, "step": 6994 }, { "epoch": 6.3881278538812785, "grad_norm": 0.08373164385557175, "learning_rate": 4.014205986808727e-06, "loss": 0.0004, "step": 6995 }, { "epoch": 6.389041095890411, "grad_norm": 0.6401422619819641, "learning_rate": 4.013191273465246e-06, "loss": 0.0036, "step": 6996 }, { "epoch": 6.389954337899543, "grad_norm": 7.651622772216797, "learning_rate": 4.012176560121766e-06, "loss": 0.0601, "step": 6997 }, { "epoch": 6.390867579908676, "grad_norm": 2.2409353256225586, "learning_rate": 4.011161846778286e-06, "loss": 0.0115, "step": 6998 }, { "epoch": 6.391780821917808, "grad_norm": 0.815099835395813, "learning_rate": 4.0101471334348054e-06, "loss": 0.0038, "step": 6999 }, { "epoch": 6.392694063926941, "grad_norm": 0.3037410378456116, "learning_rate": 4.009132420091324e-06, "loss": 0.002, "step": 7000 }, { "epoch": 6.393607305936073, "grad_norm": 4.683300495147705, "learning_rate": 4.008117706747844e-06, "loss": 0.0192, "step": 7001 }, { "epoch": 6.394520547945206, "grad_norm": 0.33685511350631714, "learning_rate": 4.007102993404364e-06, "loss": 0.0027, "step": 7002 }, { "epoch": 6.395433789954338, "grad_norm": 6.000749588012695, "learning_rate": 4.006088280060883e-06, "loss": 0.0263, "step": 7003 }, { "epoch": 6.3963470319634705, "grad_norm": 0.07804026454687119, "learning_rate": 4.005073566717403e-06, "loss": 0.0005, "step": 7004 }, { "epoch": 6.397260273972603, "grad_norm": 0.5091132521629333, "learning_rate": 4.004058853373922e-06, "loss": 0.0041, "step": 7005 }, { "epoch": 6.3981735159817354, "grad_norm": 0.019965920597314835, "learning_rate": 4.003044140030442e-06, "loss": 0.0001, "step": 7006 }, { "epoch": 6.399086757990868, "grad_norm": 0.8122708201408386, "learning_rate": 4.002029426686961e-06, "loss": 0.004, "step": 7007 }, { "epoch": 6.4, "grad_norm": 0.03967738151550293, "learning_rate": 4.001014713343481e-06, "loss": 0.0002, "step": 7008 }, { "epoch": 6.400913242009133, "grad_norm": 4.690293312072754, "learning_rate": 4.000000000000001e-06, "loss": 0.0307, "step": 7009 }, { "epoch": 6.401826484018265, "grad_norm": 0.27205604314804077, "learning_rate": 3.9989852866565195e-06, "loss": 0.0016, "step": 7010 }, { "epoch": 6.402739726027397, "grad_norm": 10.191697120666504, "learning_rate": 3.997970573313039e-06, "loss": 0.0807, "step": 7011 }, { "epoch": 6.403652968036529, "grad_norm": 1.426012396812439, "learning_rate": 3.996955859969559e-06, "loss": 0.012, "step": 7012 }, { "epoch": 6.404566210045662, "grad_norm": 1.0769213438034058, "learning_rate": 3.995941146626079e-06, "loss": 0.0066, "step": 7013 }, { "epoch": 6.405479452054794, "grad_norm": 1.3674036264419556, "learning_rate": 3.994926433282598e-06, "loss": 0.0081, "step": 7014 }, { "epoch": 6.406392694063927, "grad_norm": 1.5428158044815063, "learning_rate": 3.993911719939117e-06, "loss": 0.0062, "step": 7015 }, { "epoch": 6.407305936073059, "grad_norm": 0.1032850369811058, "learning_rate": 3.992897006595637e-06, "loss": 0.0005, "step": 7016 }, { "epoch": 6.4082191780821915, "grad_norm": 0.3086729049682617, "learning_rate": 3.9918822932521565e-06, "loss": 0.0014, "step": 7017 }, { "epoch": 6.409132420091324, "grad_norm": 3.8766286373138428, "learning_rate": 3.990867579908676e-06, "loss": 0.0178, "step": 7018 }, { "epoch": 6.4100456621004565, "grad_norm": 0.17737382650375366, "learning_rate": 3.989852866565196e-06, "loss": 0.0011, "step": 7019 }, { "epoch": 6.410958904109589, "grad_norm": 0.7059241533279419, "learning_rate": 3.988838153221716e-06, "loss": 0.005, "step": 7020 }, { "epoch": 6.411872146118721, "grad_norm": 37.36383819580078, "learning_rate": 3.9878234398782344e-06, "loss": 0.1757, "step": 7021 }, { "epoch": 6.412785388127854, "grad_norm": 16.583541870117188, "learning_rate": 3.986808726534754e-06, "loss": 0.0925, "step": 7022 }, { "epoch": 6.413698630136986, "grad_norm": 9.330998420715332, "learning_rate": 3.985794013191274e-06, "loss": 0.0567, "step": 7023 }, { "epoch": 6.414611872146119, "grad_norm": 0.08374840766191483, "learning_rate": 3.9847792998477935e-06, "loss": 0.0005, "step": 7024 }, { "epoch": 6.415525114155251, "grad_norm": 2.1202774047851562, "learning_rate": 3.983764586504313e-06, "loss": 0.009, "step": 7025 }, { "epoch": 6.416438356164384, "grad_norm": 1.386938452720642, "learning_rate": 3.982749873160832e-06, "loss": 0.007, "step": 7026 }, { "epoch": 6.417351598173516, "grad_norm": 1.7892001867294312, "learning_rate": 3.981735159817352e-06, "loss": 0.0128, "step": 7027 }, { "epoch": 6.4182648401826485, "grad_norm": 0.4002157151699066, "learning_rate": 3.9807204464738714e-06, "loss": 0.0022, "step": 7028 }, { "epoch": 6.419178082191781, "grad_norm": 6.899604797363281, "learning_rate": 3.979705733130391e-06, "loss": 0.038, "step": 7029 }, { "epoch": 6.420091324200913, "grad_norm": 0.3333738148212433, "learning_rate": 3.978691019786911e-06, "loss": 0.0017, "step": 7030 }, { "epoch": 6.421004566210046, "grad_norm": 1.3791248798370361, "learning_rate": 3.97767630644343e-06, "loss": 0.0084, "step": 7031 }, { "epoch": 6.421917808219178, "grad_norm": 0.04678258299827576, "learning_rate": 3.976661593099949e-06, "loss": 0.0004, "step": 7032 }, { "epoch": 6.422831050228311, "grad_norm": 0.031173978000879288, "learning_rate": 3.975646879756469e-06, "loss": 0.0001, "step": 7033 }, { "epoch": 6.423744292237443, "grad_norm": 0.3869044780731201, "learning_rate": 3.974632166412989e-06, "loss": 0.0012, "step": 7034 }, { "epoch": 6.424657534246576, "grad_norm": 0.11611422151327133, "learning_rate": 3.9736174530695084e-06, "loss": 0.0007, "step": 7035 }, { "epoch": 6.425570776255708, "grad_norm": 10.057016372680664, "learning_rate": 3.972602739726027e-06, "loss": 0.0571, "step": 7036 }, { "epoch": 6.426484018264841, "grad_norm": 0.6094098687171936, "learning_rate": 3.971588026382547e-06, "loss": 0.0036, "step": 7037 }, { "epoch": 6.427397260273972, "grad_norm": 1.3210655450820923, "learning_rate": 3.970573313039067e-06, "loss": 0.0056, "step": 7038 }, { "epoch": 6.428310502283105, "grad_norm": 0.06771565228700638, "learning_rate": 3.969558599695586e-06, "loss": 0.0005, "step": 7039 }, { "epoch": 6.429223744292237, "grad_norm": 98.63188171386719, "learning_rate": 3.968543886352106e-06, "loss": 0.917, "step": 7040 }, { "epoch": 6.4301369863013695, "grad_norm": 26.488710403442383, "learning_rate": 3.967529173008625e-06, "loss": 0.1644, "step": 7041 }, { "epoch": 6.431050228310502, "grad_norm": 0.02698899619281292, "learning_rate": 3.966514459665145e-06, "loss": 0.0002, "step": 7042 }, { "epoch": 6.4319634703196344, "grad_norm": 1.4399385452270508, "learning_rate": 3.965499746321664e-06, "loss": 0.009, "step": 7043 }, { "epoch": 6.432876712328767, "grad_norm": 32.979339599609375, "learning_rate": 3.964485032978184e-06, "loss": 0.3656, "step": 7044 }, { "epoch": 6.433789954337899, "grad_norm": 46.24826431274414, "learning_rate": 3.963470319634704e-06, "loss": 0.3804, "step": 7045 }, { "epoch": 6.434703196347032, "grad_norm": 29.19890785217285, "learning_rate": 3.9624556062912225e-06, "loss": 0.1998, "step": 7046 }, { "epoch": 6.435616438356164, "grad_norm": 75.17194366455078, "learning_rate": 3.961440892947743e-06, "loss": 0.6243, "step": 7047 }, { "epoch": 6.436529680365297, "grad_norm": 1.8983585834503174, "learning_rate": 3.960426179604262e-06, "loss": 0.0109, "step": 7048 }, { "epoch": 6.437442922374429, "grad_norm": 1.2415626049041748, "learning_rate": 3.9594114662607816e-06, "loss": 0.0066, "step": 7049 }, { "epoch": 6.438356164383562, "grad_norm": 7.6341471672058105, "learning_rate": 3.958396752917301e-06, "loss": 0.0312, "step": 7050 }, { "epoch": 6.439269406392694, "grad_norm": 0.0682014524936676, "learning_rate": 3.95738203957382e-06, "loss": 0.0003, "step": 7051 }, { "epoch": 6.4401826484018265, "grad_norm": 4.610410213470459, "learning_rate": 3.956367326230341e-06, "loss": 0.0249, "step": 7052 }, { "epoch": 6.441095890410959, "grad_norm": 0.22017459571361542, "learning_rate": 3.9553526128868595e-06, "loss": 0.0015, "step": 7053 }, { "epoch": 6.442009132420091, "grad_norm": 0.3722647726535797, "learning_rate": 3.954337899543379e-06, "loss": 0.0026, "step": 7054 }, { "epoch": 6.442922374429224, "grad_norm": 0.9126455187797546, "learning_rate": 3.953323186199899e-06, "loss": 0.0056, "step": 7055 }, { "epoch": 6.443835616438356, "grad_norm": 1.2561419010162354, "learning_rate": 3.9523084728564186e-06, "loss": 0.009, "step": 7056 }, { "epoch": 6.444748858447489, "grad_norm": 6.260626792907715, "learning_rate": 3.951293759512938e-06, "loss": 0.0375, "step": 7057 }, { "epoch": 6.445662100456621, "grad_norm": 8.621179580688477, "learning_rate": 3.950279046169457e-06, "loss": 0.0411, "step": 7058 }, { "epoch": 6.446575342465754, "grad_norm": 0.20015457272529602, "learning_rate": 3.949264332825977e-06, "loss": 0.0013, "step": 7059 }, { "epoch": 6.447488584474886, "grad_norm": 1.5928760766983032, "learning_rate": 3.9482496194824965e-06, "loss": 0.0087, "step": 7060 }, { "epoch": 6.448401826484019, "grad_norm": 1.259572982788086, "learning_rate": 3.947234906139016e-06, "loss": 0.0059, "step": 7061 }, { "epoch": 6.449315068493151, "grad_norm": 0.08747650682926178, "learning_rate": 3.946220192795536e-06, "loss": 0.0003, "step": 7062 }, { "epoch": 6.4502283105022835, "grad_norm": 42.63669967651367, "learning_rate": 3.945205479452055e-06, "loss": 0.2492, "step": 7063 }, { "epoch": 6.451141552511416, "grad_norm": 48.20588684082031, "learning_rate": 3.944190766108575e-06, "loss": 0.3322, "step": 7064 }, { "epoch": 6.4520547945205475, "grad_norm": 1.091501235961914, "learning_rate": 3.943176052765094e-06, "loss": 0.008, "step": 7065 }, { "epoch": 6.45296803652968, "grad_norm": 88.45280456542969, "learning_rate": 3.942161339421614e-06, "loss": 2.0385, "step": 7066 }, { "epoch": 6.453881278538812, "grad_norm": 0.6602297425270081, "learning_rate": 3.9411466260781335e-06, "loss": 0.0045, "step": 7067 }, { "epoch": 6.454794520547945, "grad_norm": 1.449748158454895, "learning_rate": 3.940131912734652e-06, "loss": 0.0073, "step": 7068 }, { "epoch": 6.455707762557077, "grad_norm": 0.3026949167251587, "learning_rate": 3.939117199391173e-06, "loss": 0.002, "step": 7069 }, { "epoch": 6.45662100456621, "grad_norm": 1.7475240230560303, "learning_rate": 3.938102486047692e-06, "loss": 0.0101, "step": 7070 }, { "epoch": 6.457534246575342, "grad_norm": 0.025911645963788033, "learning_rate": 3.937087772704211e-06, "loss": 0.0001, "step": 7071 }, { "epoch": 6.458447488584475, "grad_norm": 1.2152973413467407, "learning_rate": 3.936073059360731e-06, "loss": 0.0067, "step": 7072 }, { "epoch": 6.459360730593607, "grad_norm": 0.8743308186531067, "learning_rate": 3.93505834601725e-06, "loss": 0.0055, "step": 7073 }, { "epoch": 6.46027397260274, "grad_norm": 0.4226841330528259, "learning_rate": 3.9340436326737705e-06, "loss": 0.0016, "step": 7074 }, { "epoch": 6.461187214611872, "grad_norm": 1.4616467952728271, "learning_rate": 3.933028919330289e-06, "loss": 0.0104, "step": 7075 }, { "epoch": 6.4621004566210045, "grad_norm": 0.23765185475349426, "learning_rate": 3.932014205986809e-06, "loss": 0.0013, "step": 7076 }, { "epoch": 6.463013698630137, "grad_norm": 0.3833717703819275, "learning_rate": 3.930999492643329e-06, "loss": 0.0021, "step": 7077 }, { "epoch": 6.463926940639269, "grad_norm": 0.07729683071374893, "learning_rate": 3.929984779299848e-06, "loss": 0.0005, "step": 7078 }, { "epoch": 6.464840182648402, "grad_norm": 11.856734275817871, "learning_rate": 3.928970065956368e-06, "loss": 0.03, "step": 7079 }, { "epoch": 6.465753424657534, "grad_norm": 0.8446851968765259, "learning_rate": 3.927955352612887e-06, "loss": 0.0052, "step": 7080 }, { "epoch": 6.466666666666667, "grad_norm": 9.894909858703613, "learning_rate": 3.926940639269407e-06, "loss": 0.0422, "step": 7081 }, { "epoch": 6.467579908675799, "grad_norm": 0.3027314841747284, "learning_rate": 3.925925925925926e-06, "loss": 0.0016, "step": 7082 }, { "epoch": 6.468493150684932, "grad_norm": 2.239250898361206, "learning_rate": 3.924911212582446e-06, "loss": 0.0094, "step": 7083 }, { "epoch": 6.469406392694064, "grad_norm": 9.50523853302002, "learning_rate": 3.923896499238966e-06, "loss": 0.0504, "step": 7084 }, { "epoch": 6.470319634703197, "grad_norm": 0.0791153535246849, "learning_rate": 3.9228817858954846e-06, "loss": 0.0004, "step": 7085 }, { "epoch": 6.471232876712329, "grad_norm": 0.26923057436943054, "learning_rate": 3.921867072552004e-06, "loss": 0.0022, "step": 7086 }, { "epoch": 6.4721461187214615, "grad_norm": 16.224361419677734, "learning_rate": 3.920852359208524e-06, "loss": 0.0943, "step": 7087 }, { "epoch": 6.473059360730594, "grad_norm": 7.811688423156738, "learning_rate": 3.919837645865044e-06, "loss": 0.0374, "step": 7088 }, { "epoch": 6.473972602739726, "grad_norm": 1.5233296155929565, "learning_rate": 3.918822932521563e-06, "loss": 0.007, "step": 7089 }, { "epoch": 6.474885844748858, "grad_norm": 0.7824435830116272, "learning_rate": 3.917808219178082e-06, "loss": 0.0049, "step": 7090 }, { "epoch": 6.475799086757991, "grad_norm": 10.990852355957031, "learning_rate": 3.916793505834602e-06, "loss": 0.0757, "step": 7091 }, { "epoch": 6.476712328767123, "grad_norm": 0.03012070246040821, "learning_rate": 3.9157787924911216e-06, "loss": 0.0001, "step": 7092 }, { "epoch": 6.477625570776255, "grad_norm": 0.2814811170101166, "learning_rate": 3.914764079147641e-06, "loss": 0.0012, "step": 7093 }, { "epoch": 6.478538812785388, "grad_norm": 9.661975860595703, "learning_rate": 3.913749365804161e-06, "loss": 0.0661, "step": 7094 }, { "epoch": 6.47945205479452, "grad_norm": 9.088496208190918, "learning_rate": 3.91273465246068e-06, "loss": 0.0477, "step": 7095 }, { "epoch": 6.480365296803653, "grad_norm": 0.09484075754880905, "learning_rate": 3.9117199391171995e-06, "loss": 0.0006, "step": 7096 }, { "epoch": 6.481278538812785, "grad_norm": 0.04087071120738983, "learning_rate": 3.910705225773719e-06, "loss": 0.0001, "step": 7097 }, { "epoch": 6.482191780821918, "grad_norm": 20.954652786254883, "learning_rate": 3.909690512430239e-06, "loss": 0.1384, "step": 7098 }, { "epoch": 6.48310502283105, "grad_norm": 9.836226463317871, "learning_rate": 3.9086757990867586e-06, "loss": 0.049, "step": 7099 }, { "epoch": 6.4840182648401825, "grad_norm": 0.40439146757125854, "learning_rate": 3.907661085743278e-06, "loss": 0.0036, "step": 7100 }, { "epoch": 6.484931506849315, "grad_norm": 1.9473819732666016, "learning_rate": 3.906646372399797e-06, "loss": 0.0087, "step": 7101 }, { "epoch": 6.485844748858447, "grad_norm": 2.6643564701080322, "learning_rate": 3.905631659056317e-06, "loss": 0.0137, "step": 7102 }, { "epoch": 6.48675799086758, "grad_norm": 1.078153371810913, "learning_rate": 3.9046169457128365e-06, "loss": 0.0089, "step": 7103 }, { "epoch": 6.487671232876712, "grad_norm": 0.5307403206825256, "learning_rate": 3.903602232369356e-06, "loss": 0.0035, "step": 7104 }, { "epoch": 6.488584474885845, "grad_norm": 0.8506274223327637, "learning_rate": 3.902587519025876e-06, "loss": 0.0066, "step": 7105 }, { "epoch": 6.489497716894977, "grad_norm": 0.06270382553339005, "learning_rate": 3.901572805682395e-06, "loss": 0.0003, "step": 7106 }, { "epoch": 6.49041095890411, "grad_norm": 3.6483471393585205, "learning_rate": 3.900558092338914e-06, "loss": 0.0195, "step": 7107 }, { "epoch": 6.491324200913242, "grad_norm": 45.344234466552734, "learning_rate": 3.899543378995434e-06, "loss": 0.4737, "step": 7108 }, { "epoch": 6.492237442922375, "grad_norm": 37.514373779296875, "learning_rate": 3.898528665651954e-06, "loss": 0.1681, "step": 7109 }, { "epoch": 6.493150684931507, "grad_norm": 0.2699054479598999, "learning_rate": 3.8975139523084735e-06, "loss": 0.0016, "step": 7110 }, { "epoch": 6.4940639269406395, "grad_norm": 12.052483558654785, "learning_rate": 3.896499238964992e-06, "loss": 0.0841, "step": 7111 }, { "epoch": 6.494977168949772, "grad_norm": 6.48898458480835, "learning_rate": 3.895484525621512e-06, "loss": 0.0317, "step": 7112 }, { "epoch": 6.495890410958904, "grad_norm": 0.19769786298274994, "learning_rate": 3.894469812278032e-06, "loss": 0.0011, "step": 7113 }, { "epoch": 6.496803652968037, "grad_norm": 1.9858958721160889, "learning_rate": 3.893455098934551e-06, "loss": 0.0076, "step": 7114 }, { "epoch": 6.497716894977169, "grad_norm": 1.9900627136230469, "learning_rate": 3.892440385591071e-06, "loss": 0.0045, "step": 7115 }, { "epoch": 6.498630136986302, "grad_norm": 8.99107551574707, "learning_rate": 3.89142567224759e-06, "loss": 0.0482, "step": 7116 }, { "epoch": 6.499543378995433, "grad_norm": 1.0873782634735107, "learning_rate": 3.89041095890411e-06, "loss": 0.0077, "step": 7117 }, { "epoch": 6.500456621004567, "grad_norm": 0.1441439986228943, "learning_rate": 3.889396245560629e-06, "loss": 0.0008, "step": 7118 }, { "epoch": 6.501369863013698, "grad_norm": 1.9117728471755981, "learning_rate": 3.888381532217149e-06, "loss": 0.0172, "step": 7119 }, { "epoch": 6.502283105022831, "grad_norm": 0.07935217022895813, "learning_rate": 3.887366818873669e-06, "loss": 0.0005, "step": 7120 }, { "epoch": 6.503196347031963, "grad_norm": 1.7744520902633667, "learning_rate": 3.8863521055301875e-06, "loss": 0.0121, "step": 7121 }, { "epoch": 6.504109589041096, "grad_norm": 9.605530738830566, "learning_rate": 3.885337392186708e-06, "loss": 0.0506, "step": 7122 }, { "epoch": 6.505022831050228, "grad_norm": 0.21071505546569824, "learning_rate": 3.884322678843227e-06, "loss": 0.0009, "step": 7123 }, { "epoch": 6.5059360730593605, "grad_norm": 0.25533342361450195, "learning_rate": 3.883307965499747e-06, "loss": 0.0011, "step": 7124 }, { "epoch": 6.506849315068493, "grad_norm": 1.0561197996139526, "learning_rate": 3.882293252156266e-06, "loss": 0.0061, "step": 7125 }, { "epoch": 6.507762557077625, "grad_norm": 2.1806039810180664, "learning_rate": 3.881278538812785e-06, "loss": 0.0133, "step": 7126 }, { "epoch": 6.508675799086758, "grad_norm": 0.006173489615321159, "learning_rate": 3.880263825469306e-06, "loss": 0.0, "step": 7127 }, { "epoch": 6.50958904109589, "grad_norm": 0.043680042028427124, "learning_rate": 3.8792491121258245e-06, "loss": 0.0003, "step": 7128 }, { "epoch": 6.510502283105023, "grad_norm": 0.00494828587397933, "learning_rate": 3.878234398782344e-06, "loss": 0.0, "step": 7129 }, { "epoch": 6.511415525114155, "grad_norm": 2.823417901992798, "learning_rate": 3.877219685438864e-06, "loss": 0.019, "step": 7130 }, { "epoch": 6.512328767123288, "grad_norm": 22.986900329589844, "learning_rate": 3.876204972095383e-06, "loss": 0.1158, "step": 7131 }, { "epoch": 6.51324200913242, "grad_norm": 1.1860477924346924, "learning_rate": 3.875190258751903e-06, "loss": 0.0063, "step": 7132 }, { "epoch": 6.514155251141553, "grad_norm": 0.06355036795139313, "learning_rate": 3.874175545408422e-06, "loss": 0.0004, "step": 7133 }, { "epoch": 6.515068493150685, "grad_norm": 13.355416297912598, "learning_rate": 3.873160832064942e-06, "loss": 0.064, "step": 7134 }, { "epoch": 6.5159817351598175, "grad_norm": 2.7092583179473877, "learning_rate": 3.8721461187214615e-06, "loss": 0.0154, "step": 7135 }, { "epoch": 6.51689497716895, "grad_norm": 0.040471892803907394, "learning_rate": 3.871131405377981e-06, "loss": 0.0003, "step": 7136 }, { "epoch": 6.517808219178082, "grad_norm": 1.4351277351379395, "learning_rate": 3.870116692034501e-06, "loss": 0.0076, "step": 7137 }, { "epoch": 6.518721461187215, "grad_norm": 10.342509269714355, "learning_rate": 3.86910197869102e-06, "loss": 0.0664, "step": 7138 }, { "epoch": 6.519634703196347, "grad_norm": 6.667107105255127, "learning_rate": 3.8680872653475395e-06, "loss": 0.0297, "step": 7139 }, { "epoch": 6.52054794520548, "grad_norm": 0.14845524728298187, "learning_rate": 3.867072552004059e-06, "loss": 0.0008, "step": 7140 }, { "epoch": 6.521461187214612, "grad_norm": 1.4299770593643188, "learning_rate": 3.866057838660579e-06, "loss": 0.0076, "step": 7141 }, { "epoch": 6.522374429223745, "grad_norm": 45.6163215637207, "learning_rate": 3.8650431253170985e-06, "loss": 0.0709, "step": 7142 }, { "epoch": 6.523287671232877, "grad_norm": 1.7372384071350098, "learning_rate": 3.864028411973617e-06, "loss": 0.0128, "step": 7143 }, { "epoch": 6.524200913242009, "grad_norm": 0.5712347030639648, "learning_rate": 3.863013698630138e-06, "loss": 0.0036, "step": 7144 }, { "epoch": 6.525114155251142, "grad_norm": 3.9648826122283936, "learning_rate": 3.861998985286657e-06, "loss": 0.0229, "step": 7145 }, { "epoch": 6.526027397260274, "grad_norm": 52.541202545166016, "learning_rate": 3.8609842719431765e-06, "loss": 0.3358, "step": 7146 }, { "epoch": 6.526940639269406, "grad_norm": 0.2259409874677658, "learning_rate": 3.859969558599696e-06, "loss": 0.0013, "step": 7147 }, { "epoch": 6.5278538812785385, "grad_norm": 0.041895218193531036, "learning_rate": 3.858954845256215e-06, "loss": 0.0002, "step": 7148 }, { "epoch": 6.528767123287671, "grad_norm": 40.228538513183594, "learning_rate": 3.8579401319127355e-06, "loss": 0.2842, "step": 7149 }, { "epoch": 6.529680365296803, "grad_norm": 4.856868267059326, "learning_rate": 3.856925418569254e-06, "loss": 0.0276, "step": 7150 }, { "epoch": 6.530593607305936, "grad_norm": 52.335391998291016, "learning_rate": 3.855910705225774e-06, "loss": 0.4289, "step": 7151 }, { "epoch": 6.531506849315068, "grad_norm": 0.4036138951778412, "learning_rate": 3.854895991882294e-06, "loss": 0.0021, "step": 7152 }, { "epoch": 6.532420091324201, "grad_norm": 0.162563756108284, "learning_rate": 3.853881278538813e-06, "loss": 0.0006, "step": 7153 }, { "epoch": 6.533333333333333, "grad_norm": 0.06068911403417587, "learning_rate": 3.852866565195333e-06, "loss": 0.0002, "step": 7154 }, { "epoch": 6.534246575342466, "grad_norm": 2.4161903858184814, "learning_rate": 3.851851851851852e-06, "loss": 0.0165, "step": 7155 }, { "epoch": 6.535159817351598, "grad_norm": 0.8230735659599304, "learning_rate": 3.850837138508372e-06, "loss": 0.0045, "step": 7156 }, { "epoch": 6.536073059360731, "grad_norm": 0.03478415310382843, "learning_rate": 3.849822425164891e-06, "loss": 0.0002, "step": 7157 }, { "epoch": 6.536986301369863, "grad_norm": 0.057889897376298904, "learning_rate": 3.848807711821411e-06, "loss": 0.0003, "step": 7158 }, { "epoch": 6.5378995433789955, "grad_norm": 0.022773796692490578, "learning_rate": 3.847792998477931e-06, "loss": 0.0002, "step": 7159 }, { "epoch": 6.538812785388128, "grad_norm": 0.5042672753334045, "learning_rate": 3.84677828513445e-06, "loss": 0.0029, "step": 7160 }, { "epoch": 6.53972602739726, "grad_norm": 0.7310384511947632, "learning_rate": 3.845763571790969e-06, "loss": 0.0032, "step": 7161 }, { "epoch": 6.540639269406393, "grad_norm": 18.376317977905273, "learning_rate": 3.844748858447489e-06, "loss": 0.1395, "step": 7162 }, { "epoch": 6.541552511415525, "grad_norm": 0.8383298516273499, "learning_rate": 3.843734145104009e-06, "loss": 0.0065, "step": 7163 }, { "epoch": 6.542465753424658, "grad_norm": 40.741981506347656, "learning_rate": 3.842719431760528e-06, "loss": 0.2308, "step": 7164 }, { "epoch": 6.54337899543379, "grad_norm": 1.8446357250213623, "learning_rate": 3.841704718417047e-06, "loss": 0.0112, "step": 7165 }, { "epoch": 6.544292237442923, "grad_norm": 1.3904505968093872, "learning_rate": 3.840690005073567e-06, "loss": 0.0105, "step": 7166 }, { "epoch": 6.545205479452055, "grad_norm": 0.08827429264783859, "learning_rate": 3.839675291730087e-06, "loss": 0.0005, "step": 7167 }, { "epoch": 6.546118721461188, "grad_norm": 0.11546173691749573, "learning_rate": 3.838660578386606e-06, "loss": 0.0007, "step": 7168 }, { "epoch": 6.54703196347032, "grad_norm": 1.1349800825119019, "learning_rate": 3.837645865043126e-06, "loss": 0.0077, "step": 7169 }, { "epoch": 6.5479452054794525, "grad_norm": 0.6953991651535034, "learning_rate": 3.836631151699645e-06, "loss": 0.0049, "step": 7170 }, { "epoch": 6.548858447488584, "grad_norm": 7.975856304168701, "learning_rate": 3.8356164383561645e-06, "loss": 0.0389, "step": 7171 }, { "epoch": 6.549771689497717, "grad_norm": 1.3393478393554688, "learning_rate": 3.834601725012684e-06, "loss": 0.0075, "step": 7172 }, { "epoch": 6.550684931506849, "grad_norm": 4.881003379821777, "learning_rate": 3.833587011669204e-06, "loss": 0.0187, "step": 7173 }, { "epoch": 6.551598173515981, "grad_norm": 5.3867506980896, "learning_rate": 3.832572298325724e-06, "loss": 0.0244, "step": 7174 }, { "epoch": 6.552511415525114, "grad_norm": 0.38873276114463806, "learning_rate": 3.8315575849822424e-06, "loss": 0.0019, "step": 7175 }, { "epoch": 6.553424657534246, "grad_norm": 0.21412840485572815, "learning_rate": 3.830542871638762e-06, "loss": 0.0013, "step": 7176 }, { "epoch": 6.554337899543379, "grad_norm": 0.07017775624990463, "learning_rate": 3.829528158295282e-06, "loss": 0.0005, "step": 7177 }, { "epoch": 6.555251141552511, "grad_norm": 17.561758041381836, "learning_rate": 3.8285134449518015e-06, "loss": 0.1087, "step": 7178 }, { "epoch": 6.556164383561644, "grad_norm": 1.3096987009048462, "learning_rate": 3.827498731608321e-06, "loss": 0.0093, "step": 7179 }, { "epoch": 6.557077625570776, "grad_norm": 0.24784287810325623, "learning_rate": 3.826484018264841e-06, "loss": 0.002, "step": 7180 }, { "epoch": 6.557990867579909, "grad_norm": 2.710510492324829, "learning_rate": 3.82546930492136e-06, "loss": 0.0174, "step": 7181 }, { "epoch": 6.558904109589041, "grad_norm": 0.04496974125504494, "learning_rate": 3.8244545915778794e-06, "loss": 0.0002, "step": 7182 }, { "epoch": 6.5598173515981735, "grad_norm": 0.26859989762306213, "learning_rate": 3.823439878234399e-06, "loss": 0.0003, "step": 7183 }, { "epoch": 6.560730593607306, "grad_norm": 22.06685447692871, "learning_rate": 3.822425164890919e-06, "loss": 0.1179, "step": 7184 }, { "epoch": 6.561643835616438, "grad_norm": 5.229318141937256, "learning_rate": 3.8214104515474385e-06, "loss": 0.0236, "step": 7185 }, { "epoch": 6.562557077625571, "grad_norm": 0.14872480928897858, "learning_rate": 3.820395738203957e-06, "loss": 0.0011, "step": 7186 }, { "epoch": 6.563470319634703, "grad_norm": 11.064696311950684, "learning_rate": 3.819381024860477e-06, "loss": 0.0643, "step": 7187 }, { "epoch": 6.564383561643836, "grad_norm": 5.139577388763428, "learning_rate": 3.818366311516997e-06, "loss": 0.017, "step": 7188 }, { "epoch": 6.565296803652968, "grad_norm": 0.24714554846286774, "learning_rate": 3.8173515981735164e-06, "loss": 0.0012, "step": 7189 }, { "epoch": 6.566210045662101, "grad_norm": 2.9159607887268066, "learning_rate": 3.816336884830036e-06, "loss": 0.0171, "step": 7190 }, { "epoch": 6.567123287671233, "grad_norm": 0.005215018521994352, "learning_rate": 3.815322171486555e-06, "loss": 0.0, "step": 7191 }, { "epoch": 6.5680365296803656, "grad_norm": 0.4133615493774414, "learning_rate": 3.814307458143075e-06, "loss": 0.0018, "step": 7192 }, { "epoch": 6.568949771689498, "grad_norm": 2.8180623054504395, "learning_rate": 3.8132927447995944e-06, "loss": 0.0159, "step": 7193 }, { "epoch": 6.5698630136986305, "grad_norm": 23.11424446105957, "learning_rate": 3.812278031456114e-06, "loss": 0.1645, "step": 7194 }, { "epoch": 6.570776255707763, "grad_norm": 138.56939697265625, "learning_rate": 3.8112633181126333e-06, "loss": 2.954, "step": 7195 }, { "epoch": 6.5716894977168945, "grad_norm": 4.436259746551514, "learning_rate": 3.810248604769153e-06, "loss": 0.0279, "step": 7196 }, { "epoch": 6.572602739726028, "grad_norm": 0.6074199080467224, "learning_rate": 3.8092338914256727e-06, "loss": 0.0035, "step": 7197 }, { "epoch": 6.573515981735159, "grad_norm": 0.2810215651988983, "learning_rate": 3.808219178082192e-06, "loss": 0.0021, "step": 7198 }, { "epoch": 6.574429223744293, "grad_norm": 0.005800288636237383, "learning_rate": 3.8072044647387117e-06, "loss": 0.0, "step": 7199 }, { "epoch": 6.575342465753424, "grad_norm": 2.0412235260009766, "learning_rate": 3.8061897513952314e-06, "loss": 0.0106, "step": 7200 }, { "epoch": 6.576255707762557, "grad_norm": 8.138055801391602, "learning_rate": 3.8051750380517506e-06, "loss": 0.0406, "step": 7201 }, { "epoch": 6.577168949771689, "grad_norm": 4.667263031005859, "learning_rate": 3.8041603247082703e-06, "loss": 0.0226, "step": 7202 }, { "epoch": 6.578082191780822, "grad_norm": 1.1538605690002441, "learning_rate": 3.8031456113647896e-06, "loss": 0.0081, "step": 7203 }, { "epoch": 6.578995433789954, "grad_norm": 2.1536567211151123, "learning_rate": 3.8021308980213097e-06, "loss": 0.0153, "step": 7204 }, { "epoch": 6.579908675799087, "grad_norm": 0.12350404262542725, "learning_rate": 3.801116184677829e-06, "loss": 0.0012, "step": 7205 }, { "epoch": 6.580821917808219, "grad_norm": 11.50711727142334, "learning_rate": 3.8001014713343482e-06, "loss": 0.0603, "step": 7206 }, { "epoch": 6.5817351598173515, "grad_norm": 35.05998611450195, "learning_rate": 3.799086757990868e-06, "loss": 0.1876, "step": 7207 }, { "epoch": 6.582648401826484, "grad_norm": 0.558422327041626, "learning_rate": 3.798072044647387e-06, "loss": 0.0022, "step": 7208 }, { "epoch": 6.583561643835616, "grad_norm": 2.0262033939361572, "learning_rate": 3.7970573313039073e-06, "loss": 0.0106, "step": 7209 }, { "epoch": 6.584474885844749, "grad_norm": 0.04207303002476692, "learning_rate": 3.7960426179604266e-06, "loss": 0.0002, "step": 7210 }, { "epoch": 6.585388127853881, "grad_norm": 106.9377212524414, "learning_rate": 3.795027904616946e-06, "loss": 0.9293, "step": 7211 }, { "epoch": 6.586301369863014, "grad_norm": 0.01191187184303999, "learning_rate": 3.7940131912734655e-06, "loss": 0.0001, "step": 7212 }, { "epoch": 6.587214611872146, "grad_norm": 3.9795308113098145, "learning_rate": 3.792998477929985e-06, "loss": 0.0169, "step": 7213 }, { "epoch": 6.588127853881279, "grad_norm": 0.07184790074825287, "learning_rate": 3.791983764586505e-06, "loss": 0.0004, "step": 7214 }, { "epoch": 6.589041095890411, "grad_norm": 0.08832802623510361, "learning_rate": 3.790969051243024e-06, "loss": 0.0006, "step": 7215 }, { "epoch": 6.5899543378995435, "grad_norm": 0.7468781471252441, "learning_rate": 3.7899543378995435e-06, "loss": 0.005, "step": 7216 }, { "epoch": 6.590867579908676, "grad_norm": 0.05447092652320862, "learning_rate": 3.788939624556063e-06, "loss": 0.0003, "step": 7217 }, { "epoch": 6.5917808219178085, "grad_norm": 5.348626613616943, "learning_rate": 3.787924911212583e-06, "loss": 0.0373, "step": 7218 }, { "epoch": 6.592694063926941, "grad_norm": 0.13156114518642426, "learning_rate": 3.7869101978691025e-06, "loss": 0.0007, "step": 7219 }, { "epoch": 6.593607305936073, "grad_norm": 1.184951663017273, "learning_rate": 3.785895484525622e-06, "loss": 0.0059, "step": 7220 }, { "epoch": 6.594520547945206, "grad_norm": 0.10071169584989548, "learning_rate": 3.784880771182141e-06, "loss": 0.0005, "step": 7221 }, { "epoch": 6.595433789954338, "grad_norm": 14.331090927124023, "learning_rate": 3.783866057838661e-06, "loss": 0.0858, "step": 7222 }, { "epoch": 6.59634703196347, "grad_norm": 1.0007133483886719, "learning_rate": 3.7828513444951805e-06, "loss": 0.0064, "step": 7223 }, { "epoch": 6.597260273972603, "grad_norm": 131.4528045654297, "learning_rate": 3.7818366311517e-06, "loss": 1.5363, "step": 7224 }, { "epoch": 6.598173515981735, "grad_norm": 0.05251727253198624, "learning_rate": 3.7808219178082194e-06, "loss": 0.0004, "step": 7225 }, { "epoch": 6.599086757990867, "grad_norm": 5.382068634033203, "learning_rate": 3.7798072044647387e-06, "loss": 0.0274, "step": 7226 }, { "epoch": 6.6, "grad_norm": 0.5156203508377075, "learning_rate": 3.778792491121259e-06, "loss": 0.003, "step": 7227 }, { "epoch": 6.600913242009132, "grad_norm": 0.9517858028411865, "learning_rate": 3.777777777777778e-06, "loss": 0.004, "step": 7228 }, { "epoch": 6.6018264840182646, "grad_norm": 27.82901382446289, "learning_rate": 3.7767630644342978e-06, "loss": 0.0982, "step": 7229 }, { "epoch": 6.602739726027397, "grad_norm": 0.9301067590713501, "learning_rate": 3.775748351090817e-06, "loss": 0.0071, "step": 7230 }, { "epoch": 6.6036529680365295, "grad_norm": 0.34693387150764465, "learning_rate": 3.7747336377473363e-06, "loss": 0.0022, "step": 7231 }, { "epoch": 6.604566210045662, "grad_norm": 0.6703037619590759, "learning_rate": 3.7737189244038564e-06, "loss": 0.0045, "step": 7232 }, { "epoch": 6.605479452054794, "grad_norm": 11.836631774902344, "learning_rate": 3.7727042110603757e-06, "loss": 0.0412, "step": 7233 }, { "epoch": 6.606392694063927, "grad_norm": 0.6886309385299683, "learning_rate": 3.7716894977168954e-06, "loss": 0.0037, "step": 7234 }, { "epoch": 6.607305936073059, "grad_norm": 0.20622099936008453, "learning_rate": 3.7706747843734147e-06, "loss": 0.0018, "step": 7235 }, { "epoch": 6.608219178082192, "grad_norm": 0.0071575697511434555, "learning_rate": 3.7696600710299343e-06, "loss": 0.0, "step": 7236 }, { "epoch": 6.609132420091324, "grad_norm": 0.12551364302635193, "learning_rate": 3.768645357686454e-06, "loss": 0.0008, "step": 7237 }, { "epoch": 6.610045662100457, "grad_norm": 0.9220938086509705, "learning_rate": 3.7676306443429733e-06, "loss": 0.0045, "step": 7238 }, { "epoch": 6.610958904109589, "grad_norm": 22.243732452392578, "learning_rate": 3.766615930999493e-06, "loss": 0.1136, "step": 7239 }, { "epoch": 6.6118721461187215, "grad_norm": 21.800207138061523, "learning_rate": 3.7656012176560127e-06, "loss": 0.1225, "step": 7240 }, { "epoch": 6.612785388127854, "grad_norm": 3.0827815532684326, "learning_rate": 3.764586504312532e-06, "loss": 0.0181, "step": 7241 }, { "epoch": 6.6136986301369864, "grad_norm": 1.6689170598983765, "learning_rate": 3.7635717909690516e-06, "loss": 0.0104, "step": 7242 }, { "epoch": 6.614611872146119, "grad_norm": 0.9150111675262451, "learning_rate": 3.762557077625571e-06, "loss": 0.0065, "step": 7243 }, { "epoch": 6.615525114155251, "grad_norm": 0.450905442237854, "learning_rate": 3.761542364282091e-06, "loss": 0.0023, "step": 7244 }, { "epoch": 6.616438356164384, "grad_norm": 0.4664265811443329, "learning_rate": 3.7605276509386103e-06, "loss": 0.003, "step": 7245 }, { "epoch": 6.617351598173516, "grad_norm": 0.7511101365089417, "learning_rate": 3.7595129375951296e-06, "loss": 0.0038, "step": 7246 }, { "epoch": 6.618264840182649, "grad_norm": 0.06538668274879456, "learning_rate": 3.7584982242516493e-06, "loss": 0.0004, "step": 7247 }, { "epoch": 6.619178082191781, "grad_norm": 0.2498660832643509, "learning_rate": 3.7574835109081685e-06, "loss": 0.0015, "step": 7248 }, { "epoch": 6.620091324200914, "grad_norm": 0.0407596081495285, "learning_rate": 3.7564687975646886e-06, "loss": 0.0003, "step": 7249 }, { "epoch": 6.621004566210045, "grad_norm": 0.02400992624461651, "learning_rate": 3.755454084221208e-06, "loss": 0.0001, "step": 7250 }, { "epoch": 6.6219178082191785, "grad_norm": 0.4724554121494293, "learning_rate": 3.754439370877727e-06, "loss": 0.003, "step": 7251 }, { "epoch": 6.62283105022831, "grad_norm": 0.1543528288602829, "learning_rate": 3.753424657534247e-06, "loss": 0.001, "step": 7252 }, { "epoch": 6.6237442922374425, "grad_norm": 8.002191543579102, "learning_rate": 3.752409944190766e-06, "loss": 0.018, "step": 7253 }, { "epoch": 6.624657534246575, "grad_norm": 0.9587880969047546, "learning_rate": 3.7513952308472863e-06, "loss": 0.0061, "step": 7254 }, { "epoch": 6.6255707762557075, "grad_norm": 44.92153549194336, "learning_rate": 3.7503805175038055e-06, "loss": 0.4765, "step": 7255 }, { "epoch": 6.62648401826484, "grad_norm": 1.5300045013427734, "learning_rate": 3.749365804160325e-06, "loss": 0.0068, "step": 7256 }, { "epoch": 6.627397260273972, "grad_norm": 1.1895970106124878, "learning_rate": 3.7483510908168445e-06, "loss": 0.0077, "step": 7257 }, { "epoch": 6.628310502283105, "grad_norm": 0.5201801657676697, "learning_rate": 3.747336377473364e-06, "loss": 0.0035, "step": 7258 }, { "epoch": 6.629223744292237, "grad_norm": 0.007207568734884262, "learning_rate": 3.746321664129884e-06, "loss": 0.0001, "step": 7259 }, { "epoch": 6.63013698630137, "grad_norm": 0.44560131430625916, "learning_rate": 3.745306950786403e-06, "loss": 0.0029, "step": 7260 }, { "epoch": 6.631050228310502, "grad_norm": 0.07504212856292725, "learning_rate": 3.7442922374429224e-06, "loss": 0.0004, "step": 7261 }, { "epoch": 6.631963470319635, "grad_norm": 1.1735247373580933, "learning_rate": 3.7432775240994425e-06, "loss": 0.0058, "step": 7262 }, { "epoch": 6.632876712328767, "grad_norm": 0.6375855803489685, "learning_rate": 3.742262810755962e-06, "loss": 0.0032, "step": 7263 }, { "epoch": 6.6337899543378995, "grad_norm": 21.918956756591797, "learning_rate": 3.7412480974124815e-06, "loss": 0.1184, "step": 7264 }, { "epoch": 6.634703196347032, "grad_norm": 0.28631970286369324, "learning_rate": 3.7402333840690008e-06, "loss": 0.0017, "step": 7265 }, { "epoch": 6.635616438356164, "grad_norm": 49.34362030029297, "learning_rate": 3.73921867072552e-06, "loss": 0.2533, "step": 7266 }, { "epoch": 6.636529680365297, "grad_norm": 0.6079251170158386, "learning_rate": 3.73820395738204e-06, "loss": 0.0034, "step": 7267 }, { "epoch": 6.637442922374429, "grad_norm": 0.06797311455011368, "learning_rate": 3.7371892440385594e-06, "loss": 0.0003, "step": 7268 }, { "epoch": 6.638356164383562, "grad_norm": 1.3156461715698242, "learning_rate": 3.736174530695079e-06, "loss": 0.0081, "step": 7269 }, { "epoch": 6.639269406392694, "grad_norm": 1.6976840496063232, "learning_rate": 3.7351598173515984e-06, "loss": 0.0106, "step": 7270 }, { "epoch": 6.640182648401827, "grad_norm": 0.08195198327302933, "learning_rate": 3.7341451040081176e-06, "loss": 0.0006, "step": 7271 }, { "epoch": 6.641095890410959, "grad_norm": 5.39387845993042, "learning_rate": 3.7331303906646378e-06, "loss": 0.0365, "step": 7272 }, { "epoch": 6.642009132420092, "grad_norm": 71.0926513671875, "learning_rate": 3.732115677321157e-06, "loss": 0.6134, "step": 7273 }, { "epoch": 6.642922374429224, "grad_norm": 2.420964241027832, "learning_rate": 3.7311009639776767e-06, "loss": 0.0104, "step": 7274 }, { "epoch": 6.6438356164383565, "grad_norm": 0.04069848731160164, "learning_rate": 3.730086250634196e-06, "loss": 0.0003, "step": 7275 }, { "epoch": 6.644748858447489, "grad_norm": 0.07741536945104599, "learning_rate": 3.7290715372907157e-06, "loss": 0.0005, "step": 7276 }, { "epoch": 6.6456621004566205, "grad_norm": 0.25952064990997314, "learning_rate": 3.7280568239472354e-06, "loss": 0.0018, "step": 7277 }, { "epoch": 6.646575342465754, "grad_norm": 4.813154697418213, "learning_rate": 3.7270421106037546e-06, "loss": 0.0375, "step": 7278 }, { "epoch": 6.647488584474885, "grad_norm": 55.78232955932617, "learning_rate": 3.7260273972602743e-06, "loss": 0.3298, "step": 7279 }, { "epoch": 6.648401826484018, "grad_norm": 0.40422096848487854, "learning_rate": 3.725012683916794e-06, "loss": 0.0023, "step": 7280 }, { "epoch": 6.64931506849315, "grad_norm": 1.1520347595214844, "learning_rate": 3.7239979705733133e-06, "loss": 0.0089, "step": 7281 }, { "epoch": 6.650228310502283, "grad_norm": 0.42261603474617004, "learning_rate": 3.722983257229833e-06, "loss": 0.0024, "step": 7282 }, { "epoch": 6.651141552511415, "grad_norm": 7.983979225158691, "learning_rate": 3.7219685438863522e-06, "loss": 0.0512, "step": 7283 }, { "epoch": 6.652054794520548, "grad_norm": 1.363065242767334, "learning_rate": 3.7209538305428724e-06, "loss": 0.0057, "step": 7284 }, { "epoch": 6.65296803652968, "grad_norm": 60.83469009399414, "learning_rate": 3.7199391171993916e-06, "loss": 0.4718, "step": 7285 }, { "epoch": 6.653881278538813, "grad_norm": 0.31645679473876953, "learning_rate": 3.718924403855911e-06, "loss": 0.0009, "step": 7286 }, { "epoch": 6.654794520547945, "grad_norm": 0.07244506478309631, "learning_rate": 3.7179096905124306e-06, "loss": 0.0005, "step": 7287 }, { "epoch": 6.6557077625570775, "grad_norm": 0.7428094148635864, "learning_rate": 3.71689497716895e-06, "loss": 0.0029, "step": 7288 }, { "epoch": 6.65662100456621, "grad_norm": 12.98289966583252, "learning_rate": 3.71588026382547e-06, "loss": 0.0889, "step": 7289 }, { "epoch": 6.657534246575342, "grad_norm": 0.27823150157928467, "learning_rate": 3.7148655504819892e-06, "loss": 0.0015, "step": 7290 }, { "epoch": 6.658447488584475, "grad_norm": 0.053046755492687225, "learning_rate": 3.7138508371385085e-06, "loss": 0.0003, "step": 7291 }, { "epoch": 6.659360730593607, "grad_norm": 0.1888420730829239, "learning_rate": 3.712836123795028e-06, "loss": 0.0015, "step": 7292 }, { "epoch": 6.66027397260274, "grad_norm": 4.366737365722656, "learning_rate": 3.7118214104515475e-06, "loss": 0.0207, "step": 7293 }, { "epoch": 6.661187214611872, "grad_norm": 6.308735370635986, "learning_rate": 3.7108066971080676e-06, "loss": 0.0429, "step": 7294 }, { "epoch": 6.662100456621005, "grad_norm": 0.2603813111782074, "learning_rate": 3.709791983764587e-06, "loss": 0.0014, "step": 7295 }, { "epoch": 6.663013698630137, "grad_norm": 0.016027461737394333, "learning_rate": 3.708777270421106e-06, "loss": 0.0001, "step": 7296 }, { "epoch": 6.66392694063927, "grad_norm": 0.016840549185872078, "learning_rate": 3.707762557077626e-06, "loss": 0.0001, "step": 7297 }, { "epoch": 6.664840182648402, "grad_norm": 0.01565323956310749, "learning_rate": 3.7067478437341455e-06, "loss": 0.0001, "step": 7298 }, { "epoch": 6.6657534246575345, "grad_norm": 0.7285018563270569, "learning_rate": 3.705733130390665e-06, "loss": 0.0038, "step": 7299 }, { "epoch": 6.666666666666667, "grad_norm": 2.881178140640259, "learning_rate": 3.7047184170471845e-06, "loss": 0.0139, "step": 7300 }, { "epoch": 6.667579908675799, "grad_norm": 6.554874897003174, "learning_rate": 3.7037037037037037e-06, "loss": 0.0151, "step": 7301 }, { "epoch": 6.668493150684932, "grad_norm": 4.891843318939209, "learning_rate": 3.702688990360224e-06, "loss": 0.0228, "step": 7302 }, { "epoch": 6.669406392694064, "grad_norm": 1.0030412673950195, "learning_rate": 3.701674277016743e-06, "loss": 0.0069, "step": 7303 }, { "epoch": 6.670319634703196, "grad_norm": 4.250096321105957, "learning_rate": 3.700659563673263e-06, "loss": 0.0155, "step": 7304 }, { "epoch": 6.671232876712329, "grad_norm": 0.5529012680053711, "learning_rate": 3.699644850329782e-06, "loss": 0.004, "step": 7305 }, { "epoch": 6.672146118721461, "grad_norm": 12.153618812561035, "learning_rate": 3.6986301369863014e-06, "loss": 0.1272, "step": 7306 }, { "epoch": 6.673059360730593, "grad_norm": 1.8455381393432617, "learning_rate": 3.6976154236428215e-06, "loss": 0.0131, "step": 7307 }, { "epoch": 6.673972602739726, "grad_norm": 12.354226112365723, "learning_rate": 3.6966007102993407e-06, "loss": 0.0691, "step": 7308 }, { "epoch": 6.674885844748858, "grad_norm": 23.732744216918945, "learning_rate": 3.6955859969558604e-06, "loss": 0.0702, "step": 7309 }, { "epoch": 6.675799086757991, "grad_norm": 1.0622702836990356, "learning_rate": 3.6945712836123797e-06, "loss": 0.0063, "step": 7310 }, { "epoch": 6.676712328767123, "grad_norm": 0.35807305574417114, "learning_rate": 3.693556570268899e-06, "loss": 0.0025, "step": 7311 }, { "epoch": 6.6776255707762555, "grad_norm": 1.0533483028411865, "learning_rate": 3.692541856925419e-06, "loss": 0.0043, "step": 7312 }, { "epoch": 6.678538812785388, "grad_norm": 2.5539395809173584, "learning_rate": 3.6915271435819384e-06, "loss": 0.0078, "step": 7313 }, { "epoch": 6.67945205479452, "grad_norm": 0.3078922629356384, "learning_rate": 3.690512430238458e-06, "loss": 0.0016, "step": 7314 }, { "epoch": 6.680365296803653, "grad_norm": 0.09767916053533554, "learning_rate": 3.6894977168949773e-06, "loss": 0.0005, "step": 7315 }, { "epoch": 6.681278538812785, "grad_norm": 19.25412940979004, "learning_rate": 3.688483003551497e-06, "loss": 0.078, "step": 7316 }, { "epoch": 6.682191780821918, "grad_norm": 8.614945411682129, "learning_rate": 3.6874682902080167e-06, "loss": 0.0402, "step": 7317 }, { "epoch": 6.68310502283105, "grad_norm": 1.1503702402114868, "learning_rate": 3.686453576864536e-06, "loss": 0.0078, "step": 7318 }, { "epoch": 6.684018264840183, "grad_norm": 0.4619215726852417, "learning_rate": 3.6854388635210557e-06, "loss": 0.0023, "step": 7319 }, { "epoch": 6.684931506849315, "grad_norm": 0.013971311040222645, "learning_rate": 3.6844241501775753e-06, "loss": 0.0001, "step": 7320 }, { "epoch": 6.685844748858448, "grad_norm": 19.108779907226562, "learning_rate": 3.6834094368340946e-06, "loss": 0.2139, "step": 7321 }, { "epoch": 6.68675799086758, "grad_norm": 143.82861328125, "learning_rate": 3.6823947234906143e-06, "loss": 3.5051, "step": 7322 }, { "epoch": 6.6876712328767125, "grad_norm": 0.6773048639297485, "learning_rate": 3.6813800101471336e-06, "loss": 0.0039, "step": 7323 }, { "epoch": 6.688584474885845, "grad_norm": 0.007915637455880642, "learning_rate": 3.6803652968036537e-06, "loss": 0.0, "step": 7324 }, { "epoch": 6.689497716894977, "grad_norm": 7.798130035400391, "learning_rate": 3.679350583460173e-06, "loss": 0.0404, "step": 7325 }, { "epoch": 6.69041095890411, "grad_norm": 0.7754072546958923, "learning_rate": 3.6783358701166922e-06, "loss": 0.0044, "step": 7326 }, { "epoch": 6.691324200913242, "grad_norm": 6.67032527923584, "learning_rate": 3.677321156773212e-06, "loss": 0.0372, "step": 7327 }, { "epoch": 6.692237442922375, "grad_norm": 0.9930150508880615, "learning_rate": 3.676306443429731e-06, "loss": 0.0048, "step": 7328 }, { "epoch": 6.693150684931507, "grad_norm": 14.753436088562012, "learning_rate": 3.6752917300862513e-06, "loss": 0.0545, "step": 7329 }, { "epoch": 6.69406392694064, "grad_norm": 3.646777391433716, "learning_rate": 3.6742770167427706e-06, "loss": 0.0196, "step": 7330 }, { "epoch": 6.694977168949771, "grad_norm": 6.412881374359131, "learning_rate": 3.67326230339929e-06, "loss": 0.0593, "step": 7331 }, { "epoch": 6.695890410958905, "grad_norm": 2.2915518283843994, "learning_rate": 3.6722475900558095e-06, "loss": 0.0099, "step": 7332 }, { "epoch": 6.696803652968036, "grad_norm": 3.038912773132324, "learning_rate": 3.671232876712329e-06, "loss": 0.0169, "step": 7333 }, { "epoch": 6.697716894977169, "grad_norm": 3.371504306793213, "learning_rate": 3.670218163368849e-06, "loss": 0.0177, "step": 7334 }, { "epoch": 6.698630136986301, "grad_norm": 0.32291510701179504, "learning_rate": 3.669203450025368e-06, "loss": 0.0014, "step": 7335 }, { "epoch": 6.6995433789954335, "grad_norm": 1.2147094011306763, "learning_rate": 3.6681887366818875e-06, "loss": 0.0078, "step": 7336 }, { "epoch": 6.700456621004566, "grad_norm": 1.4792121648788452, "learning_rate": 3.667174023338407e-06, "loss": 0.0103, "step": 7337 }, { "epoch": 6.701369863013698, "grad_norm": 0.5866826176643372, "learning_rate": 3.666159309994927e-06, "loss": 0.0024, "step": 7338 }, { "epoch": 6.702283105022831, "grad_norm": 8.82824420928955, "learning_rate": 3.6651445966514465e-06, "loss": 0.0727, "step": 7339 }, { "epoch": 6.703196347031963, "grad_norm": 3.127485513687134, "learning_rate": 3.664129883307966e-06, "loss": 0.0149, "step": 7340 }, { "epoch": 6.704109589041096, "grad_norm": 7.154298782348633, "learning_rate": 3.663115169964485e-06, "loss": 0.0291, "step": 7341 }, { "epoch": 6.705022831050228, "grad_norm": 3.234656810760498, "learning_rate": 3.662100456621005e-06, "loss": 0.0198, "step": 7342 }, { "epoch": 6.705936073059361, "grad_norm": 0.19717276096343994, "learning_rate": 3.6610857432775245e-06, "loss": 0.0009, "step": 7343 }, { "epoch": 6.706849315068493, "grad_norm": 0.5707151293754578, "learning_rate": 3.660071029934044e-06, "loss": 0.0034, "step": 7344 }, { "epoch": 6.707762557077626, "grad_norm": 125.17185974121094, "learning_rate": 3.6590563165905634e-06, "loss": 3.7746, "step": 7345 }, { "epoch": 6.708675799086758, "grad_norm": 9.607266426086426, "learning_rate": 3.6580416032470827e-06, "loss": 0.0444, "step": 7346 }, { "epoch": 6.7095890410958905, "grad_norm": 0.08557312935590744, "learning_rate": 3.657026889903603e-06, "loss": 0.0005, "step": 7347 }, { "epoch": 6.710502283105023, "grad_norm": 1.609096884727478, "learning_rate": 3.656012176560122e-06, "loss": 0.007, "step": 7348 }, { "epoch": 6.711415525114155, "grad_norm": 4.339432239532471, "learning_rate": 3.6549974632166418e-06, "loss": 0.0127, "step": 7349 }, { "epoch": 6.712328767123288, "grad_norm": 8.63837718963623, "learning_rate": 3.653982749873161e-06, "loss": 0.0578, "step": 7350 }, { "epoch": 6.71324200913242, "grad_norm": 6.351861000061035, "learning_rate": 3.6529680365296803e-06, "loss": 0.0352, "step": 7351 }, { "epoch": 6.714155251141553, "grad_norm": 3.0395424365997314, "learning_rate": 3.6519533231862004e-06, "loss": 0.0212, "step": 7352 }, { "epoch": 6.715068493150685, "grad_norm": 2.4988863468170166, "learning_rate": 3.6509386098427197e-06, "loss": 0.013, "step": 7353 }, { "epoch": 6.715981735159818, "grad_norm": 3.280569076538086, "learning_rate": 3.6499238964992394e-06, "loss": 0.0182, "step": 7354 }, { "epoch": 6.71689497716895, "grad_norm": 8.77706241607666, "learning_rate": 3.6489091831557586e-06, "loss": 0.0284, "step": 7355 }, { "epoch": 6.717808219178083, "grad_norm": 0.9706282615661621, "learning_rate": 3.6478944698122783e-06, "loss": 0.0054, "step": 7356 }, { "epoch": 6.718721461187215, "grad_norm": 51.956424713134766, "learning_rate": 3.646879756468798e-06, "loss": 0.4089, "step": 7357 }, { "epoch": 6.719634703196347, "grad_norm": 17.73948097229004, "learning_rate": 3.6458650431253173e-06, "loss": 0.1114, "step": 7358 }, { "epoch": 6.72054794520548, "grad_norm": 0.9172928333282471, "learning_rate": 3.644850329781837e-06, "loss": 0.0047, "step": 7359 }, { "epoch": 6.7214611872146115, "grad_norm": 1.0316872596740723, "learning_rate": 3.6438356164383567e-06, "loss": 0.0062, "step": 7360 }, { "epoch": 6.722374429223744, "grad_norm": 7.8400044441223145, "learning_rate": 3.642820903094876e-06, "loss": 0.0358, "step": 7361 }, { "epoch": 6.723287671232876, "grad_norm": 78.34282684326172, "learning_rate": 3.6418061897513956e-06, "loss": 1.0938, "step": 7362 }, { "epoch": 6.724200913242009, "grad_norm": 0.3968789279460907, "learning_rate": 3.640791476407915e-06, "loss": 0.0027, "step": 7363 }, { "epoch": 6.725114155251141, "grad_norm": 8.115240097045898, "learning_rate": 3.639776763064435e-06, "loss": 0.0497, "step": 7364 }, { "epoch": 6.726027397260274, "grad_norm": 41.07030487060547, "learning_rate": 3.6387620497209543e-06, "loss": 0.2585, "step": 7365 }, { "epoch": 6.726940639269406, "grad_norm": 0.02044363133609295, "learning_rate": 3.6377473363774736e-06, "loss": 0.0001, "step": 7366 }, { "epoch": 6.727853881278539, "grad_norm": 0.3352217376232147, "learning_rate": 3.6367326230339933e-06, "loss": 0.0019, "step": 7367 }, { "epoch": 6.728767123287671, "grad_norm": 11.198905944824219, "learning_rate": 3.6357179096905125e-06, "loss": 0.0734, "step": 7368 }, { "epoch": 6.729680365296804, "grad_norm": 1.9148664474487305, "learning_rate": 3.6347031963470326e-06, "loss": 0.0092, "step": 7369 }, { "epoch": 6.730593607305936, "grad_norm": 0.10223326086997986, "learning_rate": 3.633688483003552e-06, "loss": 0.0006, "step": 7370 }, { "epoch": 6.7315068493150685, "grad_norm": 1.1333012580871582, "learning_rate": 3.632673769660071e-06, "loss": 0.0082, "step": 7371 }, { "epoch": 6.732420091324201, "grad_norm": 2.1628355979919434, "learning_rate": 3.631659056316591e-06, "loss": 0.0081, "step": 7372 }, { "epoch": 6.733333333333333, "grad_norm": 0.8299251794815063, "learning_rate": 3.63064434297311e-06, "loss": 0.0042, "step": 7373 }, { "epoch": 6.734246575342466, "grad_norm": 0.14272668957710266, "learning_rate": 3.6296296296296302e-06, "loss": 0.0009, "step": 7374 }, { "epoch": 6.735159817351598, "grad_norm": 1.4574626684188843, "learning_rate": 3.6286149162861495e-06, "loss": 0.0078, "step": 7375 }, { "epoch": 6.736073059360731, "grad_norm": 0.6737328767776489, "learning_rate": 3.6276002029426688e-06, "loss": 0.004, "step": 7376 }, { "epoch": 6.736986301369863, "grad_norm": 2.464012622833252, "learning_rate": 3.6265854895991885e-06, "loss": 0.0164, "step": 7377 }, { "epoch": 6.737899543378996, "grad_norm": 2.8849682807922363, "learning_rate": 3.625570776255708e-06, "loss": 0.0181, "step": 7378 }, { "epoch": 6.738812785388128, "grad_norm": 0.29416364431381226, "learning_rate": 3.624556062912228e-06, "loss": 0.0022, "step": 7379 }, { "epoch": 6.739726027397261, "grad_norm": 0.5379319787025452, "learning_rate": 3.623541349568747e-06, "loss": 0.0029, "step": 7380 }, { "epoch": 6.740639269406393, "grad_norm": 0.49209922552108765, "learning_rate": 3.6225266362252664e-06, "loss": 0.003, "step": 7381 }, { "epoch": 6.7415525114155255, "grad_norm": 0.8509747385978699, "learning_rate": 3.6215119228817865e-06, "loss": 0.0046, "step": 7382 }, { "epoch": 6.742465753424657, "grad_norm": 8.612709045410156, "learning_rate": 3.6204972095383058e-06, "loss": 0.0467, "step": 7383 }, { "epoch": 6.74337899543379, "grad_norm": 0.4673932194709778, "learning_rate": 3.6194824961948255e-06, "loss": 0.0023, "step": 7384 }, { "epoch": 6.744292237442922, "grad_norm": 47.80570983886719, "learning_rate": 3.6184677828513447e-06, "loss": 0.2527, "step": 7385 }, { "epoch": 6.745205479452055, "grad_norm": 0.2411186844110489, "learning_rate": 3.617453069507864e-06, "loss": 0.0016, "step": 7386 }, { "epoch": 6.746118721461187, "grad_norm": 1.0589730739593506, "learning_rate": 3.616438356164384e-06, "loss": 0.0067, "step": 7387 }, { "epoch": 6.747031963470319, "grad_norm": 1.3077383041381836, "learning_rate": 3.6154236428209034e-06, "loss": 0.0058, "step": 7388 }, { "epoch": 6.747945205479452, "grad_norm": 3.2454190254211426, "learning_rate": 3.614408929477423e-06, "loss": 0.0098, "step": 7389 }, { "epoch": 6.748858447488584, "grad_norm": 28.022184371948242, "learning_rate": 3.6133942161339424e-06, "loss": 0.1334, "step": 7390 }, { "epoch": 6.749771689497717, "grad_norm": 0.06782416254281998, "learning_rate": 3.6123795027904616e-06, "loss": 0.0005, "step": 7391 }, { "epoch": 6.750684931506849, "grad_norm": 5.626470565795898, "learning_rate": 3.6113647894469817e-06, "loss": 0.0336, "step": 7392 }, { "epoch": 6.751598173515982, "grad_norm": 1.9243624210357666, "learning_rate": 3.610350076103501e-06, "loss": 0.0148, "step": 7393 }, { "epoch": 6.752511415525114, "grad_norm": 1.3149137496948242, "learning_rate": 3.6093353627600207e-06, "loss": 0.0077, "step": 7394 }, { "epoch": 6.7534246575342465, "grad_norm": 0.8462814092636108, "learning_rate": 3.60832064941654e-06, "loss": 0.0064, "step": 7395 }, { "epoch": 6.754337899543379, "grad_norm": 2.010267734527588, "learning_rate": 3.6073059360730597e-06, "loss": 0.0064, "step": 7396 }, { "epoch": 6.755251141552511, "grad_norm": 0.9778579473495483, "learning_rate": 3.6062912227295794e-06, "loss": 0.0065, "step": 7397 }, { "epoch": 6.756164383561644, "grad_norm": 0.232249915599823, "learning_rate": 3.6052765093860986e-06, "loss": 0.0016, "step": 7398 }, { "epoch": 6.757077625570776, "grad_norm": 3.680197238922119, "learning_rate": 3.6042617960426183e-06, "loss": 0.0267, "step": 7399 }, { "epoch": 6.757990867579909, "grad_norm": 96.38363647460938, "learning_rate": 3.603247082699138e-06, "loss": 0.5535, "step": 7400 }, { "epoch": 6.758904109589041, "grad_norm": 0.16730797290802002, "learning_rate": 3.6022323693556573e-06, "loss": 0.0009, "step": 7401 }, { "epoch": 6.759817351598174, "grad_norm": 15.156058311462402, "learning_rate": 3.601217656012177e-06, "loss": 0.0558, "step": 7402 }, { "epoch": 6.760730593607306, "grad_norm": 102.78216552734375, "learning_rate": 3.6002029426686962e-06, "loss": 0.7765, "step": 7403 }, { "epoch": 6.761643835616439, "grad_norm": 0.8494530320167542, "learning_rate": 3.5991882293252164e-06, "loss": 0.0043, "step": 7404 }, { "epoch": 6.762557077625571, "grad_norm": 0.17507272958755493, "learning_rate": 3.5981735159817356e-06, "loss": 0.0011, "step": 7405 }, { "epoch": 6.7634703196347035, "grad_norm": 7.38298225402832, "learning_rate": 3.597158802638255e-06, "loss": 0.0404, "step": 7406 }, { "epoch": 6.764383561643836, "grad_norm": 0.41194307804107666, "learning_rate": 3.5961440892947746e-06, "loss": 0.0025, "step": 7407 }, { "epoch": 6.765296803652968, "grad_norm": 3.4096662998199463, "learning_rate": 3.595129375951294e-06, "loss": 0.0204, "step": 7408 }, { "epoch": 6.766210045662101, "grad_norm": 141.25430297851562, "learning_rate": 3.594114662607814e-06, "loss": 3.0494, "step": 7409 }, { "epoch": 6.767123287671232, "grad_norm": 1.142777442932129, "learning_rate": 3.5930999492643332e-06, "loss": 0.0052, "step": 7410 }, { "epoch": 6.768036529680366, "grad_norm": 14.335504531860352, "learning_rate": 3.5920852359208525e-06, "loss": 0.0675, "step": 7411 }, { "epoch": 6.768949771689497, "grad_norm": 1.706896424293518, "learning_rate": 3.591070522577372e-06, "loss": 0.0107, "step": 7412 }, { "epoch": 6.76986301369863, "grad_norm": 33.044410705566406, "learning_rate": 3.5900558092338915e-06, "loss": 0.2494, "step": 7413 }, { "epoch": 6.770776255707762, "grad_norm": 0.29887694120407104, "learning_rate": 3.5890410958904116e-06, "loss": 0.0019, "step": 7414 }, { "epoch": 6.771689497716895, "grad_norm": 0.6508060693740845, "learning_rate": 3.588026382546931e-06, "loss": 0.0044, "step": 7415 }, { "epoch": 6.772602739726027, "grad_norm": 0.7739503383636475, "learning_rate": 3.58701166920345e-06, "loss": 0.0042, "step": 7416 }, { "epoch": 6.77351598173516, "grad_norm": 0.505235493183136, "learning_rate": 3.58599695585997e-06, "loss": 0.0023, "step": 7417 }, { "epoch": 6.774429223744292, "grad_norm": 0.7062453627586365, "learning_rate": 3.5849822425164895e-06, "loss": 0.0041, "step": 7418 }, { "epoch": 6.7753424657534245, "grad_norm": 25.56875991821289, "learning_rate": 3.583967529173009e-06, "loss": 0.1419, "step": 7419 }, { "epoch": 6.776255707762557, "grad_norm": 0.0027416208758950233, "learning_rate": 3.5829528158295285e-06, "loss": 0.0, "step": 7420 }, { "epoch": 6.777168949771689, "grad_norm": 1.8394581079483032, "learning_rate": 3.5819381024860477e-06, "loss": 0.009, "step": 7421 }, { "epoch": 6.778082191780822, "grad_norm": 11.561882972717285, "learning_rate": 3.580923389142568e-06, "loss": 0.0239, "step": 7422 }, { "epoch": 6.778995433789954, "grad_norm": 6.739688873291016, "learning_rate": 3.579908675799087e-06, "loss": 0.0315, "step": 7423 }, { "epoch": 6.779908675799087, "grad_norm": 2.759856700897217, "learning_rate": 3.578893962455607e-06, "loss": 0.0128, "step": 7424 }, { "epoch": 6.780821917808219, "grad_norm": 2.4993374347686768, "learning_rate": 3.577879249112126e-06, "loss": 0.0155, "step": 7425 }, { "epoch": 6.781735159817352, "grad_norm": 0.21444663405418396, "learning_rate": 3.5768645357686453e-06, "loss": 0.0016, "step": 7426 }, { "epoch": 6.782648401826484, "grad_norm": 6.5382161140441895, "learning_rate": 3.5758498224251655e-06, "loss": 0.0316, "step": 7427 }, { "epoch": 6.7835616438356166, "grad_norm": 0.24089325964450836, "learning_rate": 3.5748351090816847e-06, "loss": 0.0011, "step": 7428 }, { "epoch": 6.784474885844749, "grad_norm": 3.209911584854126, "learning_rate": 3.5738203957382044e-06, "loss": 0.0138, "step": 7429 }, { "epoch": 6.7853881278538815, "grad_norm": 2.934823989868164, "learning_rate": 3.5728056823947237e-06, "loss": 0.0103, "step": 7430 }, { "epoch": 6.786301369863014, "grad_norm": 1.0821038484573364, "learning_rate": 3.571790969051243e-06, "loss": 0.0059, "step": 7431 }, { "epoch": 6.787214611872146, "grad_norm": 7.077751159667969, "learning_rate": 3.570776255707763e-06, "loss": 0.0404, "step": 7432 }, { "epoch": 6.788127853881279, "grad_norm": 1.9840545654296875, "learning_rate": 3.5697615423642823e-06, "loss": 0.0104, "step": 7433 }, { "epoch": 6.789041095890411, "grad_norm": 1.8409819602966309, "learning_rate": 3.568746829020802e-06, "loss": 0.0103, "step": 7434 }, { "epoch": 6.789954337899544, "grad_norm": 13.373554229736328, "learning_rate": 3.5677321156773213e-06, "loss": 0.0559, "step": 7435 }, { "epoch": 6.790867579908676, "grad_norm": 31.328798294067383, "learning_rate": 3.566717402333841e-06, "loss": 0.1872, "step": 7436 }, { "epoch": 6.791780821917808, "grad_norm": 0.013234677724540234, "learning_rate": 3.5657026889903607e-06, "loss": 0.0001, "step": 7437 }, { "epoch": 6.792694063926941, "grad_norm": 0.12387020140886307, "learning_rate": 3.56468797564688e-06, "loss": 0.0007, "step": 7438 }, { "epoch": 6.793607305936073, "grad_norm": 3.357346296310425, "learning_rate": 3.5636732623033996e-06, "loss": 0.0128, "step": 7439 }, { "epoch": 6.794520547945205, "grad_norm": 0.8872144818305969, "learning_rate": 3.5626585489599193e-06, "loss": 0.0033, "step": 7440 }, { "epoch": 6.7954337899543376, "grad_norm": 0.17434701323509216, "learning_rate": 3.5616438356164386e-06, "loss": 0.0011, "step": 7441 }, { "epoch": 6.79634703196347, "grad_norm": 5.257420063018799, "learning_rate": 3.5606291222729583e-06, "loss": 0.0194, "step": 7442 }, { "epoch": 6.7972602739726025, "grad_norm": 6.362753868103027, "learning_rate": 3.5596144089294776e-06, "loss": 0.0424, "step": 7443 }, { "epoch": 6.798173515981735, "grad_norm": 0.03927413001656532, "learning_rate": 3.5585996955859977e-06, "loss": 0.0002, "step": 7444 }, { "epoch": 6.799086757990867, "grad_norm": 0.5169460773468018, "learning_rate": 3.557584982242517e-06, "loss": 0.0035, "step": 7445 }, { "epoch": 6.8, "grad_norm": 16.745098114013672, "learning_rate": 3.5565702688990362e-06, "loss": 0.0223, "step": 7446 }, { "epoch": 6.800913242009132, "grad_norm": 1.6765120029449463, "learning_rate": 3.555555555555556e-06, "loss": 0.0073, "step": 7447 }, { "epoch": 6.801826484018265, "grad_norm": 7.684715270996094, "learning_rate": 3.554540842212075e-06, "loss": 0.0104, "step": 7448 }, { "epoch": 6.802739726027397, "grad_norm": 6.039050102233887, "learning_rate": 3.5535261288685953e-06, "loss": 0.0343, "step": 7449 }, { "epoch": 6.80365296803653, "grad_norm": 0.1417326182126999, "learning_rate": 3.5525114155251146e-06, "loss": 0.0009, "step": 7450 }, { "epoch": 6.804566210045662, "grad_norm": 0.3165838122367859, "learning_rate": 3.551496702181634e-06, "loss": 0.0025, "step": 7451 }, { "epoch": 6.8054794520547945, "grad_norm": 0.043040014803409576, "learning_rate": 3.5504819888381535e-06, "loss": 0.0003, "step": 7452 }, { "epoch": 6.806392694063927, "grad_norm": 50.462642669677734, "learning_rate": 3.549467275494673e-06, "loss": 0.3446, "step": 7453 }, { "epoch": 6.8073059360730594, "grad_norm": 8.775182723999023, "learning_rate": 3.548452562151193e-06, "loss": 0.037, "step": 7454 }, { "epoch": 6.808219178082192, "grad_norm": 23.368846893310547, "learning_rate": 3.547437848807712e-06, "loss": 0.2134, "step": 7455 }, { "epoch": 6.809132420091324, "grad_norm": 0.05232606828212738, "learning_rate": 3.5464231354642314e-06, "loss": 0.0003, "step": 7456 }, { "epoch": 6.810045662100457, "grad_norm": 6.666518688201904, "learning_rate": 3.545408422120751e-06, "loss": 0.0472, "step": 7457 }, { "epoch": 6.810958904109589, "grad_norm": 0.6382545828819275, "learning_rate": 3.544393708777271e-06, "loss": 0.0049, "step": 7458 }, { "epoch": 6.811872146118722, "grad_norm": 1.3800283670425415, "learning_rate": 3.5433789954337905e-06, "loss": 0.0059, "step": 7459 }, { "epoch": 6.812785388127854, "grad_norm": 2.389847993850708, "learning_rate": 3.54236428209031e-06, "loss": 0.016, "step": 7460 }, { "epoch": 6.813698630136987, "grad_norm": 0.26433539390563965, "learning_rate": 3.541349568746829e-06, "loss": 0.0016, "step": 7461 }, { "epoch": 6.814611872146119, "grad_norm": 0.37768644094467163, "learning_rate": 3.540334855403349e-06, "loss": 0.0018, "step": 7462 }, { "epoch": 6.8155251141552515, "grad_norm": 1.059356927871704, "learning_rate": 3.5393201420598684e-06, "loss": 0.0075, "step": 7463 }, { "epoch": 6.816438356164383, "grad_norm": 5.939528942108154, "learning_rate": 3.538305428716388e-06, "loss": 0.0271, "step": 7464 }, { "epoch": 6.817351598173516, "grad_norm": 0.5971142649650574, "learning_rate": 3.5372907153729074e-06, "loss": 0.0048, "step": 7465 }, { "epoch": 6.818264840182648, "grad_norm": 0.6682262420654297, "learning_rate": 3.5362760020294267e-06, "loss": 0.0053, "step": 7466 }, { "epoch": 6.8191780821917805, "grad_norm": 2.0492136478424072, "learning_rate": 3.535261288685947e-06, "loss": 0.0113, "step": 7467 }, { "epoch": 6.820091324200913, "grad_norm": 96.50022888183594, "learning_rate": 3.534246575342466e-06, "loss": 1.6055, "step": 7468 }, { "epoch": 6.821004566210045, "grad_norm": 9.231163024902344, "learning_rate": 3.5332318619989857e-06, "loss": 0.0505, "step": 7469 }, { "epoch": 6.821917808219178, "grad_norm": 0.9815731644630432, "learning_rate": 3.532217148655505e-06, "loss": 0.0034, "step": 7470 }, { "epoch": 6.82283105022831, "grad_norm": 0.03633257374167442, "learning_rate": 3.5312024353120243e-06, "loss": 0.0002, "step": 7471 }, { "epoch": 6.823744292237443, "grad_norm": 1.9843522310256958, "learning_rate": 3.5301877219685444e-06, "loss": 0.0122, "step": 7472 }, { "epoch": 6.824657534246575, "grad_norm": 1.9166548252105713, "learning_rate": 3.5291730086250637e-06, "loss": 0.0127, "step": 7473 }, { "epoch": 6.825570776255708, "grad_norm": 1.5312330722808838, "learning_rate": 3.5281582952815834e-06, "loss": 0.0089, "step": 7474 }, { "epoch": 6.82648401826484, "grad_norm": 0.32243654131889343, "learning_rate": 3.5271435819381026e-06, "loss": 0.001, "step": 7475 }, { "epoch": 6.8273972602739725, "grad_norm": 0.015718335285782814, "learning_rate": 3.5261288685946223e-06, "loss": 0.0001, "step": 7476 }, { "epoch": 6.828310502283105, "grad_norm": 0.13776004314422607, "learning_rate": 3.525114155251142e-06, "loss": 0.0002, "step": 7477 }, { "epoch": 6.829223744292237, "grad_norm": 43.67076110839844, "learning_rate": 3.5240994419076613e-06, "loss": 0.2473, "step": 7478 }, { "epoch": 6.83013698630137, "grad_norm": 12.253403663635254, "learning_rate": 3.523084728564181e-06, "loss": 0.0596, "step": 7479 }, { "epoch": 6.831050228310502, "grad_norm": 1.492653489112854, "learning_rate": 3.5220700152207007e-06, "loss": 0.0078, "step": 7480 }, { "epoch": 6.831963470319635, "grad_norm": 0.6792794466018677, "learning_rate": 3.52105530187722e-06, "loss": 0.0045, "step": 7481 }, { "epoch": 6.832876712328767, "grad_norm": 0.11057163774967194, "learning_rate": 3.5200405885337396e-06, "loss": 0.0006, "step": 7482 }, { "epoch": 6.8337899543379, "grad_norm": 11.868062019348145, "learning_rate": 3.519025875190259e-06, "loss": 0.0597, "step": 7483 }, { "epoch": 6.834703196347032, "grad_norm": 4.8381524085998535, "learning_rate": 3.518011161846779e-06, "loss": 0.0244, "step": 7484 }, { "epoch": 6.835616438356165, "grad_norm": 0.27490368485450745, "learning_rate": 3.5169964485032983e-06, "loss": 0.0023, "step": 7485 }, { "epoch": 6.836529680365297, "grad_norm": 0.0432824045419693, "learning_rate": 3.5159817351598176e-06, "loss": 0.0003, "step": 7486 }, { "epoch": 6.8374429223744295, "grad_norm": 1.0940370559692383, "learning_rate": 3.5149670218163372e-06, "loss": 0.003, "step": 7487 }, { "epoch": 6.838356164383562, "grad_norm": 0.08234036713838577, "learning_rate": 3.5139523084728565e-06, "loss": 0.0006, "step": 7488 }, { "epoch": 6.839269406392694, "grad_norm": 3.0301036834716797, "learning_rate": 3.5129375951293766e-06, "loss": 0.0143, "step": 7489 }, { "epoch": 6.840182648401827, "grad_norm": 29.32625961303711, "learning_rate": 3.511922881785896e-06, "loss": 0.1329, "step": 7490 }, { "epoch": 6.8410958904109584, "grad_norm": 0.3914133608341217, "learning_rate": 3.510908168442415e-06, "loss": 0.003, "step": 7491 }, { "epoch": 6.842009132420092, "grad_norm": 3.851684093475342, "learning_rate": 3.509893455098935e-06, "loss": 0.0231, "step": 7492 }, { "epoch": 6.842922374429223, "grad_norm": 0.07502901554107666, "learning_rate": 3.508878741755454e-06, "loss": 0.0005, "step": 7493 }, { "epoch": 6.843835616438356, "grad_norm": 2.745105266571045, "learning_rate": 3.5078640284119742e-06, "loss": 0.0162, "step": 7494 }, { "epoch": 6.844748858447488, "grad_norm": 0.06247915327548981, "learning_rate": 3.5068493150684935e-06, "loss": 0.0004, "step": 7495 }, { "epoch": 6.845662100456621, "grad_norm": 1.198692798614502, "learning_rate": 3.5058346017250128e-06, "loss": 0.009, "step": 7496 }, { "epoch": 6.846575342465753, "grad_norm": 0.4539145529270172, "learning_rate": 3.5048198883815325e-06, "loss": 0.0021, "step": 7497 }, { "epoch": 6.847488584474886, "grad_norm": 91.36091613769531, "learning_rate": 3.503805175038052e-06, "loss": 0.9577, "step": 7498 }, { "epoch": 6.848401826484018, "grad_norm": 0.6454083323478699, "learning_rate": 3.502790461694572e-06, "loss": 0.0035, "step": 7499 }, { "epoch": 6.8493150684931505, "grad_norm": 3.0236611366271973, "learning_rate": 3.501775748351091e-06, "loss": 0.0157, "step": 7500 }, { "epoch": 6.850228310502283, "grad_norm": 0.41022124886512756, "learning_rate": 3.5007610350076104e-06, "loss": 0.0019, "step": 7501 }, { "epoch": 6.851141552511415, "grad_norm": 5.885432720184326, "learning_rate": 3.4997463216641305e-06, "loss": 0.0302, "step": 7502 }, { "epoch": 6.852054794520548, "grad_norm": 2.1616649627685547, "learning_rate": 3.4987316083206498e-06, "loss": 0.0102, "step": 7503 }, { "epoch": 6.85296803652968, "grad_norm": 0.5151586532592773, "learning_rate": 3.4977168949771695e-06, "loss": 0.0031, "step": 7504 }, { "epoch": 6.853881278538813, "grad_norm": 4.851841926574707, "learning_rate": 3.4967021816336887e-06, "loss": 0.023, "step": 7505 }, { "epoch": 6.854794520547945, "grad_norm": 0.5110295414924622, "learning_rate": 3.495687468290208e-06, "loss": 0.004, "step": 7506 }, { "epoch": 6.855707762557078, "grad_norm": 0.28369027376174927, "learning_rate": 3.494672754946728e-06, "loss": 0.0007, "step": 7507 }, { "epoch": 6.85662100456621, "grad_norm": 0.24754738807678223, "learning_rate": 3.4936580416032474e-06, "loss": 0.0012, "step": 7508 }, { "epoch": 6.857534246575343, "grad_norm": 33.592201232910156, "learning_rate": 3.492643328259767e-06, "loss": 0.2606, "step": 7509 }, { "epoch": 6.858447488584475, "grad_norm": 0.05766378715634346, "learning_rate": 3.4916286149162863e-06, "loss": 0.0004, "step": 7510 }, { "epoch": 6.8593607305936075, "grad_norm": 7.806212425231934, "learning_rate": 3.4906139015728056e-06, "loss": 0.0549, "step": 7511 }, { "epoch": 6.86027397260274, "grad_norm": 2.871094226837158, "learning_rate": 3.4895991882293257e-06, "loss": 0.0189, "step": 7512 }, { "epoch": 6.861187214611872, "grad_norm": 2.054044008255005, "learning_rate": 3.488584474885845e-06, "loss": 0.0091, "step": 7513 }, { "epoch": 6.862100456621005, "grad_norm": 0.5119280219078064, "learning_rate": 3.4875697615423647e-06, "loss": 0.0033, "step": 7514 }, { "epoch": 6.863013698630137, "grad_norm": 2.245877742767334, "learning_rate": 3.486555048198884e-06, "loss": 0.009, "step": 7515 }, { "epoch": 6.86392694063927, "grad_norm": 23.49797248840332, "learning_rate": 3.4855403348554032e-06, "loss": 0.0938, "step": 7516 }, { "epoch": 6.864840182648402, "grad_norm": 5.719963073730469, "learning_rate": 3.4845256215119233e-06, "loss": 0.0361, "step": 7517 }, { "epoch": 6.865753424657534, "grad_norm": 0.04734189808368683, "learning_rate": 3.4835109081684426e-06, "loss": 0.0003, "step": 7518 }, { "epoch": 6.866666666666667, "grad_norm": 10.642051696777344, "learning_rate": 3.4824961948249623e-06, "loss": 0.0577, "step": 7519 }, { "epoch": 6.867579908675799, "grad_norm": 53.818450927734375, "learning_rate": 3.481481481481482e-06, "loss": 0.1184, "step": 7520 }, { "epoch": 6.868493150684931, "grad_norm": 1.6551326513290405, "learning_rate": 3.4804667681380013e-06, "loss": 0.0116, "step": 7521 }, { "epoch": 6.869406392694064, "grad_norm": 1.6739753484725952, "learning_rate": 3.479452054794521e-06, "loss": 0.0081, "step": 7522 }, { "epoch": 6.870319634703196, "grad_norm": 0.9156282544136047, "learning_rate": 3.4784373414510402e-06, "loss": 0.0026, "step": 7523 }, { "epoch": 6.8712328767123285, "grad_norm": 0.25357604026794434, "learning_rate": 3.4774226281075603e-06, "loss": 0.0019, "step": 7524 }, { "epoch": 6.872146118721461, "grad_norm": 1.5113515853881836, "learning_rate": 3.4764079147640796e-06, "loss": 0.0112, "step": 7525 }, { "epoch": 6.873059360730593, "grad_norm": 22.487071990966797, "learning_rate": 3.475393201420599e-06, "loss": 0.1128, "step": 7526 }, { "epoch": 6.873972602739726, "grad_norm": 1.3826789855957031, "learning_rate": 3.4743784880771186e-06, "loss": 0.0069, "step": 7527 }, { "epoch": 6.874885844748858, "grad_norm": 4.385870933532715, "learning_rate": 3.473363774733638e-06, "loss": 0.0228, "step": 7528 }, { "epoch": 6.875799086757991, "grad_norm": 4.662971496582031, "learning_rate": 3.472349061390158e-06, "loss": 0.0307, "step": 7529 }, { "epoch": 6.876712328767123, "grad_norm": 0.1497696191072464, "learning_rate": 3.4713343480466772e-06, "loss": 0.001, "step": 7530 }, { "epoch": 6.877625570776256, "grad_norm": 0.30111682415008545, "learning_rate": 3.4703196347031965e-06, "loss": 0.0018, "step": 7531 }, { "epoch": 6.878538812785388, "grad_norm": 0.02978815697133541, "learning_rate": 3.469304921359716e-06, "loss": 0.0001, "step": 7532 }, { "epoch": 6.879452054794521, "grad_norm": 6.042610168457031, "learning_rate": 3.4682902080162355e-06, "loss": 0.0272, "step": 7533 }, { "epoch": 6.880365296803653, "grad_norm": 0.11931940168142319, "learning_rate": 3.4672754946727556e-06, "loss": 0.0005, "step": 7534 }, { "epoch": 6.8812785388127855, "grad_norm": 2.377423048019409, "learning_rate": 3.466260781329275e-06, "loss": 0.0121, "step": 7535 }, { "epoch": 6.882191780821918, "grad_norm": 0.3231486678123474, "learning_rate": 3.465246067985794e-06, "loss": 0.0015, "step": 7536 }, { "epoch": 6.88310502283105, "grad_norm": 0.20282071828842163, "learning_rate": 3.464231354642314e-06, "loss": 0.001, "step": 7537 }, { "epoch": 6.884018264840183, "grad_norm": 1.2926900386810303, "learning_rate": 3.4632166412988335e-06, "loss": 0.0033, "step": 7538 }, { "epoch": 6.884931506849315, "grad_norm": 0.607154130935669, "learning_rate": 3.462201927955353e-06, "loss": 0.0032, "step": 7539 }, { "epoch": 6.885844748858448, "grad_norm": 1.6978498697280884, "learning_rate": 3.4611872146118725e-06, "loss": 0.0104, "step": 7540 }, { "epoch": 6.88675799086758, "grad_norm": 3.910205602645874, "learning_rate": 3.4601725012683917e-06, "loss": 0.0209, "step": 7541 }, { "epoch": 6.887671232876713, "grad_norm": 1.1897149085998535, "learning_rate": 3.459157787924912e-06, "loss": 0.0026, "step": 7542 }, { "epoch": 6.888584474885845, "grad_norm": 1.8419036865234375, "learning_rate": 3.458143074581431e-06, "loss": 0.0093, "step": 7543 }, { "epoch": 6.889497716894978, "grad_norm": 141.08950805664062, "learning_rate": 3.457128361237951e-06, "loss": 2.6626, "step": 7544 }, { "epoch": 6.890410958904109, "grad_norm": 2.452921152114868, "learning_rate": 3.45611364789447e-06, "loss": 0.0139, "step": 7545 }, { "epoch": 6.8913242009132425, "grad_norm": 3.332308769226074, "learning_rate": 3.4550989345509893e-06, "loss": 0.0188, "step": 7546 }, { "epoch": 6.892237442922374, "grad_norm": 5.263540744781494, "learning_rate": 3.4540842212075094e-06, "loss": 0.0251, "step": 7547 }, { "epoch": 6.8931506849315065, "grad_norm": 3.6590070724487305, "learning_rate": 3.4530695078640287e-06, "loss": 0.022, "step": 7548 }, { "epoch": 6.894063926940639, "grad_norm": 0.3768511116504669, "learning_rate": 3.4520547945205484e-06, "loss": 0.0027, "step": 7549 }, { "epoch": 6.894977168949771, "grad_norm": 6.980589866638184, "learning_rate": 3.4510400811770677e-06, "loss": 0.0631, "step": 7550 }, { "epoch": 6.895890410958904, "grad_norm": 1.1553455591201782, "learning_rate": 3.450025367833587e-06, "loss": 0.0078, "step": 7551 }, { "epoch": 6.896803652968036, "grad_norm": 1.3667359352111816, "learning_rate": 3.449010654490107e-06, "loss": 0.0066, "step": 7552 }, { "epoch": 6.897716894977169, "grad_norm": 0.3854064345359802, "learning_rate": 3.4479959411466263e-06, "loss": 0.0018, "step": 7553 }, { "epoch": 6.898630136986301, "grad_norm": 2.70040225982666, "learning_rate": 3.446981227803146e-06, "loss": 0.0195, "step": 7554 }, { "epoch": 6.899543378995434, "grad_norm": 0.35248279571533203, "learning_rate": 3.4459665144596653e-06, "loss": 0.0011, "step": 7555 }, { "epoch": 6.900456621004566, "grad_norm": 0.2743012011051178, "learning_rate": 3.4449518011161846e-06, "loss": 0.0017, "step": 7556 }, { "epoch": 6.901369863013699, "grad_norm": 3.5966334342956543, "learning_rate": 3.4439370877727047e-06, "loss": 0.0172, "step": 7557 }, { "epoch": 6.902283105022831, "grad_norm": 35.52797317504883, "learning_rate": 3.442922374429224e-06, "loss": 0.1515, "step": 7558 }, { "epoch": 6.9031963470319635, "grad_norm": 0.38764673471450806, "learning_rate": 3.4419076610857436e-06, "loss": 0.0028, "step": 7559 }, { "epoch": 6.904109589041096, "grad_norm": 1.3450329303741455, "learning_rate": 3.4408929477422633e-06, "loss": 0.0092, "step": 7560 }, { "epoch": 6.905022831050228, "grad_norm": 62.35615921020508, "learning_rate": 3.4398782343987826e-06, "loss": 0.4, "step": 7561 }, { "epoch": 6.905936073059361, "grad_norm": 0.16024000942707062, "learning_rate": 3.4388635210553023e-06, "loss": 0.0011, "step": 7562 }, { "epoch": 6.906849315068493, "grad_norm": 5.200483322143555, "learning_rate": 3.4378488077118216e-06, "loss": 0.0187, "step": 7563 }, { "epoch": 6.907762557077626, "grad_norm": 0.3034200966358185, "learning_rate": 3.4368340943683417e-06, "loss": 0.0021, "step": 7564 }, { "epoch": 6.908675799086758, "grad_norm": 0.20084168016910553, "learning_rate": 3.435819381024861e-06, "loss": 0.0012, "step": 7565 }, { "epoch": 6.909589041095891, "grad_norm": 2.620232582092285, "learning_rate": 3.43480466768138e-06, "loss": 0.0083, "step": 7566 }, { "epoch": 6.910502283105023, "grad_norm": 0.01667380891740322, "learning_rate": 3.4337899543379e-06, "loss": 0.0001, "step": 7567 }, { "epoch": 6.911415525114156, "grad_norm": 5.399437427520752, "learning_rate": 3.432775240994419e-06, "loss": 0.0371, "step": 7568 }, { "epoch": 6.912328767123288, "grad_norm": 0.23661763966083527, "learning_rate": 3.4317605276509393e-06, "loss": 0.001, "step": 7569 }, { "epoch": 6.91324200913242, "grad_norm": 27.590511322021484, "learning_rate": 3.4307458143074586e-06, "loss": 0.1481, "step": 7570 }, { "epoch": 6.914155251141553, "grad_norm": 74.10935974121094, "learning_rate": 3.429731100963978e-06, "loss": 0.415, "step": 7571 }, { "epoch": 6.9150684931506845, "grad_norm": 1.1465901136398315, "learning_rate": 3.4287163876204975e-06, "loss": 0.0075, "step": 7572 }, { "epoch": 6.915981735159818, "grad_norm": 4.753774166107178, "learning_rate": 3.4277016742770168e-06, "loss": 0.0359, "step": 7573 }, { "epoch": 6.916894977168949, "grad_norm": 2.554961681365967, "learning_rate": 3.426686960933537e-06, "loss": 0.0101, "step": 7574 }, { "epoch": 6.917808219178082, "grad_norm": 0.7739976048469543, "learning_rate": 3.425672247590056e-06, "loss": 0.0029, "step": 7575 }, { "epoch": 6.918721461187214, "grad_norm": 11.247488975524902, "learning_rate": 3.4246575342465754e-06, "loss": 0.0736, "step": 7576 }, { "epoch": 6.919634703196347, "grad_norm": 3.667797327041626, "learning_rate": 3.423642820903095e-06, "loss": 0.0184, "step": 7577 }, { "epoch": 6.920547945205479, "grad_norm": 0.3579111397266388, "learning_rate": 3.422628107559615e-06, "loss": 0.0025, "step": 7578 }, { "epoch": 6.921461187214612, "grad_norm": 9.599677085876465, "learning_rate": 3.4216133942161345e-06, "loss": 0.0407, "step": 7579 }, { "epoch": 6.922374429223744, "grad_norm": 97.22900390625, "learning_rate": 3.4205986808726538e-06, "loss": 0.5406, "step": 7580 }, { "epoch": 6.923287671232877, "grad_norm": 2.0282540321350098, "learning_rate": 3.419583967529173e-06, "loss": 0.0106, "step": 7581 }, { "epoch": 6.924200913242009, "grad_norm": 24.06604766845703, "learning_rate": 3.418569254185693e-06, "loss": 0.1545, "step": 7582 }, { "epoch": 6.9251141552511415, "grad_norm": 0.20575928688049316, "learning_rate": 3.4175545408422124e-06, "loss": 0.0011, "step": 7583 }, { "epoch": 6.926027397260274, "grad_norm": 12.099422454833984, "learning_rate": 3.416539827498732e-06, "loss": 0.0177, "step": 7584 }, { "epoch": 6.926940639269406, "grad_norm": 2.459359884262085, "learning_rate": 3.4155251141552514e-06, "loss": 0.0111, "step": 7585 }, { "epoch": 6.927853881278539, "grad_norm": 0.15172456204891205, "learning_rate": 3.4145104008117707e-06, "loss": 0.0005, "step": 7586 }, { "epoch": 6.928767123287671, "grad_norm": 0.7400720119476318, "learning_rate": 3.4134956874682908e-06, "loss": 0.0029, "step": 7587 }, { "epoch": 6.929680365296804, "grad_norm": 6.492416858673096, "learning_rate": 3.41248097412481e-06, "loss": 0.0363, "step": 7588 }, { "epoch": 6.930593607305936, "grad_norm": 16.897212982177734, "learning_rate": 3.4114662607813297e-06, "loss": 0.0932, "step": 7589 }, { "epoch": 6.931506849315069, "grad_norm": 0.1170971617102623, "learning_rate": 3.410451547437849e-06, "loss": 0.0005, "step": 7590 }, { "epoch": 6.932420091324201, "grad_norm": 0.14564816653728485, "learning_rate": 3.4094368340943683e-06, "loss": 0.001, "step": 7591 }, { "epoch": 6.933333333333334, "grad_norm": 0.1419193297624588, "learning_rate": 3.4084221207508884e-06, "loss": 0.0009, "step": 7592 }, { "epoch": 6.934246575342466, "grad_norm": 0.0251627117395401, "learning_rate": 3.4074074074074077e-06, "loss": 0.0001, "step": 7593 }, { "epoch": 6.9351598173515985, "grad_norm": 38.75290298461914, "learning_rate": 3.4063926940639274e-06, "loss": 0.1844, "step": 7594 }, { "epoch": 6.936073059360731, "grad_norm": 0.7766101956367493, "learning_rate": 3.4053779807204466e-06, "loss": 0.0035, "step": 7595 }, { "epoch": 6.936986301369863, "grad_norm": 3.4448773860931396, "learning_rate": 3.404363267376966e-06, "loss": 0.0182, "step": 7596 }, { "epoch": 6.937899543378995, "grad_norm": 0.42776742577552795, "learning_rate": 3.403348554033486e-06, "loss": 0.0018, "step": 7597 }, { "epoch": 6.938812785388128, "grad_norm": 116.36837005615234, "learning_rate": 3.4023338406900053e-06, "loss": 1.3519, "step": 7598 }, { "epoch": 6.93972602739726, "grad_norm": 10.42591667175293, "learning_rate": 3.401319127346525e-06, "loss": 0.0383, "step": 7599 }, { "epoch": 6.940639269406392, "grad_norm": 15.258594512939453, "learning_rate": 3.4003044140030447e-06, "loss": 0.0693, "step": 7600 }, { "epoch": 6.941552511415525, "grad_norm": 0.11191866546869278, "learning_rate": 3.399289700659564e-06, "loss": 0.0008, "step": 7601 }, { "epoch": 6.942465753424657, "grad_norm": 0.5549638867378235, "learning_rate": 3.3982749873160836e-06, "loss": 0.0031, "step": 7602 }, { "epoch": 6.94337899543379, "grad_norm": 3.1362013816833496, "learning_rate": 3.397260273972603e-06, "loss": 0.0196, "step": 7603 }, { "epoch": 6.944292237442922, "grad_norm": 6.129737854003906, "learning_rate": 3.396245560629123e-06, "loss": 0.0317, "step": 7604 }, { "epoch": 6.945205479452055, "grad_norm": 0.021135205402970314, "learning_rate": 3.3952308472856423e-06, "loss": 0.0002, "step": 7605 }, { "epoch": 6.946118721461187, "grad_norm": 0.510373592376709, "learning_rate": 3.3942161339421615e-06, "loss": 0.0032, "step": 7606 }, { "epoch": 6.9470319634703195, "grad_norm": 10.83133316040039, "learning_rate": 3.3932014205986812e-06, "loss": 0.0468, "step": 7607 }, { "epoch": 6.947945205479452, "grad_norm": 0.32843276858329773, "learning_rate": 3.3921867072552005e-06, "loss": 0.0013, "step": 7608 }, { "epoch": 6.948858447488584, "grad_norm": 1.0540916919708252, "learning_rate": 3.3911719939117206e-06, "loss": 0.0053, "step": 7609 }, { "epoch": 6.949771689497717, "grad_norm": 3.4974427223205566, "learning_rate": 3.39015728056824e-06, "loss": 0.0218, "step": 7610 }, { "epoch": 6.950684931506849, "grad_norm": 16.153409957885742, "learning_rate": 3.389142567224759e-06, "loss": 0.0977, "step": 7611 }, { "epoch": 6.951598173515982, "grad_norm": 5.868129253387451, "learning_rate": 3.388127853881279e-06, "loss": 0.0404, "step": 7612 }, { "epoch": 6.952511415525114, "grad_norm": 0.42975783348083496, "learning_rate": 3.387113140537798e-06, "loss": 0.0017, "step": 7613 }, { "epoch": 6.953424657534247, "grad_norm": 3.803457736968994, "learning_rate": 3.3860984271943182e-06, "loss": 0.0142, "step": 7614 }, { "epoch": 6.954337899543379, "grad_norm": 8.843151092529297, "learning_rate": 3.3850837138508375e-06, "loss": 0.0423, "step": 7615 }, { "epoch": 6.955251141552512, "grad_norm": 1.376433253288269, "learning_rate": 3.3840690005073568e-06, "loss": 0.0035, "step": 7616 }, { "epoch": 6.956164383561644, "grad_norm": 1.073502540588379, "learning_rate": 3.3830542871638765e-06, "loss": 0.003, "step": 7617 }, { "epoch": 6.9570776255707765, "grad_norm": 1.2867143154144287, "learning_rate": 3.3820395738203957e-06, "loss": 0.008, "step": 7618 }, { "epoch": 6.957990867579909, "grad_norm": 56.96725082397461, "learning_rate": 3.381024860476916e-06, "loss": 0.2862, "step": 7619 }, { "epoch": 6.958904109589041, "grad_norm": 0.8439745903015137, "learning_rate": 3.380010147133435e-06, "loss": 0.0039, "step": 7620 }, { "epoch": 6.959817351598174, "grad_norm": 0.04796759411692619, "learning_rate": 3.3789954337899544e-06, "loss": 0.0002, "step": 7621 }, { "epoch": 6.960730593607306, "grad_norm": 25.997522354125977, "learning_rate": 3.3779807204464745e-06, "loss": 0.0032, "step": 7622 }, { "epoch": 6.961643835616439, "grad_norm": 76.82015991210938, "learning_rate": 3.3769660071029938e-06, "loss": 0.5894, "step": 7623 }, { "epoch": 6.96255707762557, "grad_norm": 0.3227991461753845, "learning_rate": 3.3759512937595135e-06, "loss": 0.0021, "step": 7624 }, { "epoch": 6.963470319634704, "grad_norm": 2.0195319652557373, "learning_rate": 3.3749365804160327e-06, "loss": 0.015, "step": 7625 }, { "epoch": 6.964383561643835, "grad_norm": 2.414588689804077, "learning_rate": 3.373921867072552e-06, "loss": 0.0126, "step": 7626 }, { "epoch": 6.965296803652968, "grad_norm": 0.69728684425354, "learning_rate": 3.372907153729072e-06, "loss": 0.0044, "step": 7627 }, { "epoch": 6.9662100456621, "grad_norm": 7.572569847106934, "learning_rate": 3.3718924403855914e-06, "loss": 0.0432, "step": 7628 }, { "epoch": 6.967123287671233, "grad_norm": 57.771392822265625, "learning_rate": 3.370877727042111e-06, "loss": 0.3228, "step": 7629 }, { "epoch": 6.968036529680365, "grad_norm": 2.057054042816162, "learning_rate": 3.3698630136986303e-06, "loss": 0.0112, "step": 7630 }, { "epoch": 6.9689497716894975, "grad_norm": 0.2016545534133911, "learning_rate": 3.3688483003551496e-06, "loss": 0.0014, "step": 7631 }, { "epoch": 6.96986301369863, "grad_norm": 4.774658679962158, "learning_rate": 3.3678335870116697e-06, "loss": 0.026, "step": 7632 }, { "epoch": 6.970776255707762, "grad_norm": 0.27823424339294434, "learning_rate": 3.366818873668189e-06, "loss": 0.0016, "step": 7633 }, { "epoch": 6.971689497716895, "grad_norm": 0.24477656185626984, "learning_rate": 3.3658041603247087e-06, "loss": 0.0013, "step": 7634 }, { "epoch": 6.972602739726027, "grad_norm": 0.7644034028053284, "learning_rate": 3.364789446981228e-06, "loss": 0.0032, "step": 7635 }, { "epoch": 6.97351598173516, "grad_norm": 92.98004150390625, "learning_rate": 3.3637747336377472e-06, "loss": 0.7009, "step": 7636 }, { "epoch": 6.974429223744292, "grad_norm": 0.12369338423013687, "learning_rate": 3.3627600202942673e-06, "loss": 0.0009, "step": 7637 }, { "epoch": 6.975342465753425, "grad_norm": 39.53184127807617, "learning_rate": 3.3617453069507866e-06, "loss": 0.4959, "step": 7638 }, { "epoch": 6.976255707762557, "grad_norm": 0.36848875880241394, "learning_rate": 3.3607305936073063e-06, "loss": 0.0024, "step": 7639 }, { "epoch": 6.9771689497716896, "grad_norm": 2.8306779861450195, "learning_rate": 3.359715880263826e-06, "loss": 0.0173, "step": 7640 }, { "epoch": 6.978082191780822, "grad_norm": 0.9668132662773132, "learning_rate": 3.3587011669203453e-06, "loss": 0.0053, "step": 7641 }, { "epoch": 6.9789954337899545, "grad_norm": 2.051666736602783, "learning_rate": 3.357686453576865e-06, "loss": 0.0123, "step": 7642 }, { "epoch": 6.979908675799087, "grad_norm": 0.9817409515380859, "learning_rate": 3.3566717402333842e-06, "loss": 0.0065, "step": 7643 }, { "epoch": 6.980821917808219, "grad_norm": 16.27569580078125, "learning_rate": 3.3556570268899043e-06, "loss": 0.1385, "step": 7644 }, { "epoch": 6.981735159817352, "grad_norm": 0.08520273119211197, "learning_rate": 3.3546423135464236e-06, "loss": 0.0005, "step": 7645 }, { "epoch": 6.982648401826484, "grad_norm": 0.14793436229228973, "learning_rate": 3.353627600202943e-06, "loss": 0.0007, "step": 7646 }, { "epoch": 6.983561643835617, "grad_norm": 0.8989901542663574, "learning_rate": 3.3526128868594626e-06, "loss": 0.0034, "step": 7647 }, { "epoch": 6.984474885844749, "grad_norm": 2.478342294692993, "learning_rate": 3.351598173515982e-06, "loss": 0.0123, "step": 7648 }, { "epoch": 6.985388127853882, "grad_norm": 0.0018124161288142204, "learning_rate": 3.350583460172502e-06, "loss": 0.0, "step": 7649 }, { "epoch": 6.986301369863014, "grad_norm": 1.3298286199569702, "learning_rate": 3.3495687468290212e-06, "loss": 0.0101, "step": 7650 }, { "epoch": 6.987214611872146, "grad_norm": 0.5563557744026184, "learning_rate": 3.3485540334855405e-06, "loss": 0.0038, "step": 7651 }, { "epoch": 6.988127853881279, "grad_norm": 24.288724899291992, "learning_rate": 3.34753932014206e-06, "loss": 0.1643, "step": 7652 }, { "epoch": 6.989041095890411, "grad_norm": 0.04455043748021126, "learning_rate": 3.3465246067985794e-06, "loss": 0.0003, "step": 7653 }, { "epoch": 6.989954337899543, "grad_norm": 85.96282196044922, "learning_rate": 3.3455098934550996e-06, "loss": 1.4935, "step": 7654 }, { "epoch": 6.9908675799086755, "grad_norm": 7.752036094665527, "learning_rate": 3.344495180111619e-06, "loss": 0.0455, "step": 7655 }, { "epoch": 6.991780821917808, "grad_norm": 0.09329694509506226, "learning_rate": 3.343480466768138e-06, "loss": 0.0003, "step": 7656 }, { "epoch": 6.99269406392694, "grad_norm": 0.03310934081673622, "learning_rate": 3.342465753424658e-06, "loss": 0.0002, "step": 7657 }, { "epoch": 6.993607305936073, "grad_norm": 0.18725571036338806, "learning_rate": 3.341451040081177e-06, "loss": 0.0012, "step": 7658 }, { "epoch": 6.994520547945205, "grad_norm": 3.327787399291992, "learning_rate": 3.340436326737697e-06, "loss": 0.0181, "step": 7659 }, { "epoch": 6.995433789954338, "grad_norm": 10.432146072387695, "learning_rate": 3.3394216133942164e-06, "loss": 0.056, "step": 7660 }, { "epoch": 6.99634703196347, "grad_norm": 0.26583367586135864, "learning_rate": 3.3384069000507357e-06, "loss": 0.0016, "step": 7661 }, { "epoch": 6.997260273972603, "grad_norm": 0.342514306306839, "learning_rate": 3.337392186707256e-06, "loss": 0.0007, "step": 7662 }, { "epoch": 6.998173515981735, "grad_norm": 2.4885952472686768, "learning_rate": 3.336377473363775e-06, "loss": 0.0123, "step": 7663 }, { "epoch": 6.9990867579908675, "grad_norm": 0.5902714133262634, "learning_rate": 3.3353627600202948e-06, "loss": 0.0041, "step": 7664 }, { "epoch": 7.0, "grad_norm": 1.489237904548645, "learning_rate": 3.334348046676814e-06, "loss": 0.0078, "step": 7665 }, { "epoch": 7.0009132420091325, "grad_norm": 0.5460017919540405, "learning_rate": 3.3333333333333333e-06, "loss": 0.004, "step": 7666 }, { "epoch": 7.001826484018265, "grad_norm": 0.06588437408208847, "learning_rate": 3.3323186199898534e-06, "loss": 0.0004, "step": 7667 }, { "epoch": 7.002739726027397, "grad_norm": 3.1527559757232666, "learning_rate": 3.3313039066463727e-06, "loss": 0.0143, "step": 7668 }, { "epoch": 7.00365296803653, "grad_norm": 0.19783760607242584, "learning_rate": 3.3302891933028924e-06, "loss": 0.0011, "step": 7669 }, { "epoch": 7.004566210045662, "grad_norm": 0.8331958055496216, "learning_rate": 3.3292744799594117e-06, "loss": 0.0047, "step": 7670 }, { "epoch": 7.005479452054795, "grad_norm": 0.07504352182149887, "learning_rate": 3.328259766615931e-06, "loss": 0.0005, "step": 7671 }, { "epoch": 7.006392694063927, "grad_norm": 0.10520844161510468, "learning_rate": 3.327245053272451e-06, "loss": 0.0004, "step": 7672 }, { "epoch": 7.00730593607306, "grad_norm": 0.0615665428340435, "learning_rate": 3.3262303399289703e-06, "loss": 0.0004, "step": 7673 }, { "epoch": 7.008219178082192, "grad_norm": 3.448099136352539, "learning_rate": 3.32521562658549e-06, "loss": 0.0204, "step": 7674 }, { "epoch": 7.0091324200913245, "grad_norm": 0.2266063690185547, "learning_rate": 3.3242009132420093e-06, "loss": 0.0011, "step": 7675 }, { "epoch": 7.010045662100457, "grad_norm": 0.1219540387392044, "learning_rate": 3.3231861998985286e-06, "loss": 0.0006, "step": 7676 }, { "epoch": 7.010958904109589, "grad_norm": 18.402204513549805, "learning_rate": 3.3221714865550487e-06, "loss": 0.1111, "step": 7677 }, { "epoch": 7.011872146118722, "grad_norm": 13.927295684814453, "learning_rate": 3.321156773211568e-06, "loss": 0.0477, "step": 7678 }, { "epoch": 7.0127853881278535, "grad_norm": 0.14388644695281982, "learning_rate": 3.3201420598680876e-06, "loss": 0.0009, "step": 7679 }, { "epoch": 7.013698630136986, "grad_norm": 0.2240479588508606, "learning_rate": 3.3191273465246073e-06, "loss": 0.0017, "step": 7680 }, { "epoch": 7.014611872146118, "grad_norm": 0.2767016291618347, "learning_rate": 3.3181126331811266e-06, "loss": 0.0016, "step": 7681 }, { "epoch": 7.015525114155251, "grad_norm": 2.276733636856079, "learning_rate": 3.3170979198376463e-06, "loss": 0.0119, "step": 7682 }, { "epoch": 7.016438356164383, "grad_norm": 0.8343714475631714, "learning_rate": 3.3160832064941655e-06, "loss": 0.0015, "step": 7683 }, { "epoch": 7.017351598173516, "grad_norm": 1.6386744976043701, "learning_rate": 3.3150684931506857e-06, "loss": 0.0115, "step": 7684 }, { "epoch": 7.018264840182648, "grad_norm": 2.440600633621216, "learning_rate": 3.314053779807205e-06, "loss": 0.0097, "step": 7685 }, { "epoch": 7.019178082191781, "grad_norm": 0.02546999603509903, "learning_rate": 3.313039066463724e-06, "loss": 0.0002, "step": 7686 }, { "epoch": 7.020091324200913, "grad_norm": 3.6446101665496826, "learning_rate": 3.312024353120244e-06, "loss": 0.0245, "step": 7687 }, { "epoch": 7.0210045662100455, "grad_norm": 1.6756178140640259, "learning_rate": 3.311009639776763e-06, "loss": 0.0086, "step": 7688 }, { "epoch": 7.021917808219178, "grad_norm": 0.07817308604717255, "learning_rate": 3.3099949264332833e-06, "loss": 0.0005, "step": 7689 }, { "epoch": 7.0228310502283104, "grad_norm": 1.098936915397644, "learning_rate": 3.3089802130898025e-06, "loss": 0.0057, "step": 7690 }, { "epoch": 7.023744292237443, "grad_norm": 0.025177357718348503, "learning_rate": 3.307965499746322e-06, "loss": 0.0001, "step": 7691 }, { "epoch": 7.024657534246575, "grad_norm": 119.93157196044922, "learning_rate": 3.3069507864028415e-06, "loss": 4.817, "step": 7692 }, { "epoch": 7.025570776255708, "grad_norm": 0.10163506120443344, "learning_rate": 3.3059360730593608e-06, "loss": 0.0006, "step": 7693 }, { "epoch": 7.02648401826484, "grad_norm": 0.7554910778999329, "learning_rate": 3.304921359715881e-06, "loss": 0.005, "step": 7694 }, { "epoch": 7.027397260273973, "grad_norm": 1.0003299713134766, "learning_rate": 3.3039066463724e-06, "loss": 0.0052, "step": 7695 }, { "epoch": 7.028310502283105, "grad_norm": 0.04874051734805107, "learning_rate": 3.3028919330289194e-06, "loss": 0.0002, "step": 7696 }, { "epoch": 7.029223744292238, "grad_norm": 0.8073399662971497, "learning_rate": 3.301877219685439e-06, "loss": 0.0066, "step": 7697 }, { "epoch": 7.03013698630137, "grad_norm": 3.00544810295105, "learning_rate": 3.3008625063419584e-06, "loss": 0.0271, "step": 7698 }, { "epoch": 7.0310502283105025, "grad_norm": 0.10884788632392883, "learning_rate": 3.2998477929984785e-06, "loss": 0.0004, "step": 7699 }, { "epoch": 7.031963470319635, "grad_norm": 0.19993111491203308, "learning_rate": 3.2988330796549978e-06, "loss": 0.0011, "step": 7700 }, { "epoch": 7.032876712328767, "grad_norm": 0.41218066215515137, "learning_rate": 3.297818366311517e-06, "loss": 0.0024, "step": 7701 }, { "epoch": 7.0337899543379, "grad_norm": 0.44727563858032227, "learning_rate": 3.296803652968037e-06, "loss": 0.0028, "step": 7702 }, { "epoch": 7.034703196347032, "grad_norm": 0.021851949393749237, "learning_rate": 3.2957889396245564e-06, "loss": 0.0001, "step": 7703 }, { "epoch": 7.035616438356165, "grad_norm": 0.04061959311366081, "learning_rate": 3.294774226281076e-06, "loss": 0.0004, "step": 7704 }, { "epoch": 7.036529680365296, "grad_norm": 3.7145676612854004, "learning_rate": 3.2937595129375954e-06, "loss": 0.0159, "step": 7705 }, { "epoch": 7.037442922374429, "grad_norm": 0.6664483547210693, "learning_rate": 3.2927447995941147e-06, "loss": 0.0032, "step": 7706 }, { "epoch": 7.038356164383561, "grad_norm": 2.8740122318267822, "learning_rate": 3.2917300862506348e-06, "loss": 0.0151, "step": 7707 }, { "epoch": 7.039269406392694, "grad_norm": 0.43510061502456665, "learning_rate": 3.290715372907154e-06, "loss": 0.0029, "step": 7708 }, { "epoch": 7.040182648401826, "grad_norm": 0.7652726769447327, "learning_rate": 3.2897006595636737e-06, "loss": 0.0031, "step": 7709 }, { "epoch": 7.041095890410959, "grad_norm": 7.7801127433776855, "learning_rate": 3.288685946220193e-06, "loss": 0.0114, "step": 7710 }, { "epoch": 7.042009132420091, "grad_norm": 0.13940037786960602, "learning_rate": 3.2876712328767123e-06, "loss": 0.0008, "step": 7711 }, { "epoch": 7.0429223744292235, "grad_norm": 0.8076603412628174, "learning_rate": 3.2866565195332324e-06, "loss": 0.005, "step": 7712 }, { "epoch": 7.043835616438356, "grad_norm": 1.1927562952041626, "learning_rate": 3.2856418061897517e-06, "loss": 0.0097, "step": 7713 }, { "epoch": 7.044748858447488, "grad_norm": 2.3227059841156006, "learning_rate": 3.2846270928462713e-06, "loss": 0.017, "step": 7714 }, { "epoch": 7.045662100456621, "grad_norm": 0.34989455342292786, "learning_rate": 3.2836123795027906e-06, "loss": 0.0021, "step": 7715 }, { "epoch": 7.046575342465753, "grad_norm": 0.601175844669342, "learning_rate": 3.28259766615931e-06, "loss": 0.0032, "step": 7716 }, { "epoch": 7.047488584474886, "grad_norm": 2.5396416187286377, "learning_rate": 3.28158295281583e-06, "loss": 0.0133, "step": 7717 }, { "epoch": 7.048401826484018, "grad_norm": 0.21490998566150665, "learning_rate": 3.2805682394723493e-06, "loss": 0.0012, "step": 7718 }, { "epoch": 7.049315068493151, "grad_norm": 1.0528504848480225, "learning_rate": 3.279553526128869e-06, "loss": 0.0033, "step": 7719 }, { "epoch": 7.050228310502283, "grad_norm": 0.003405224531888962, "learning_rate": 3.2785388127853882e-06, "loss": 0.0, "step": 7720 }, { "epoch": 7.051141552511416, "grad_norm": 15.195552825927734, "learning_rate": 3.277524099441908e-06, "loss": 0.0853, "step": 7721 }, { "epoch": 7.052054794520548, "grad_norm": 55.193294525146484, "learning_rate": 3.2765093860984276e-06, "loss": 0.2451, "step": 7722 }, { "epoch": 7.0529680365296805, "grad_norm": 1.138539433479309, "learning_rate": 3.275494672754947e-06, "loss": 0.0074, "step": 7723 }, { "epoch": 7.053881278538813, "grad_norm": 0.544029951095581, "learning_rate": 3.274479959411467e-06, "loss": 0.0028, "step": 7724 }, { "epoch": 7.054794520547945, "grad_norm": 2.688023805618286, "learning_rate": 3.2734652460679863e-06, "loss": 0.0129, "step": 7725 }, { "epoch": 7.055707762557078, "grad_norm": 0.11431541293859482, "learning_rate": 3.2724505327245055e-06, "loss": 0.0007, "step": 7726 }, { "epoch": 7.05662100456621, "grad_norm": 1.305359125137329, "learning_rate": 3.2714358193810252e-06, "loss": 0.0052, "step": 7727 }, { "epoch": 7.057534246575343, "grad_norm": 2.879885673522949, "learning_rate": 3.2704211060375445e-06, "loss": 0.0178, "step": 7728 }, { "epoch": 7.058447488584475, "grad_norm": 40.04127883911133, "learning_rate": 3.2694063926940646e-06, "loss": 0.2038, "step": 7729 }, { "epoch": 7.059360730593608, "grad_norm": 21.498149871826172, "learning_rate": 3.268391679350584e-06, "loss": 0.1343, "step": 7730 }, { "epoch": 7.06027397260274, "grad_norm": 9.735217094421387, "learning_rate": 3.267376966007103e-06, "loss": 0.045, "step": 7731 }, { "epoch": 7.061187214611872, "grad_norm": 1.8302284479141235, "learning_rate": 3.266362252663623e-06, "loss": 0.0095, "step": 7732 }, { "epoch": 7.062100456621004, "grad_norm": 8.841538429260254, "learning_rate": 3.265347539320142e-06, "loss": 0.0395, "step": 7733 }, { "epoch": 7.063013698630137, "grad_norm": 0.331386536359787, "learning_rate": 3.2643328259766622e-06, "loss": 0.002, "step": 7734 }, { "epoch": 7.063926940639269, "grad_norm": 0.3759746849536896, "learning_rate": 3.2633181126331815e-06, "loss": 0.0022, "step": 7735 }, { "epoch": 7.0648401826484015, "grad_norm": 0.9243175387382507, "learning_rate": 3.2623033992897008e-06, "loss": 0.0054, "step": 7736 }, { "epoch": 7.065753424657534, "grad_norm": 0.15023574233055115, "learning_rate": 3.2612886859462204e-06, "loss": 0.001, "step": 7737 }, { "epoch": 7.066666666666666, "grad_norm": 0.04003911465406418, "learning_rate": 3.2602739726027397e-06, "loss": 0.0002, "step": 7738 }, { "epoch": 7.067579908675799, "grad_norm": 37.28948211669922, "learning_rate": 3.25925925925926e-06, "loss": 0.1847, "step": 7739 }, { "epoch": 7.068493150684931, "grad_norm": 5.23380184173584, "learning_rate": 3.258244545915779e-06, "loss": 0.0295, "step": 7740 }, { "epoch": 7.069406392694064, "grad_norm": 0.019517159089446068, "learning_rate": 3.2572298325722984e-06, "loss": 0.0002, "step": 7741 }, { "epoch": 7.070319634703196, "grad_norm": 0.6951574087142944, "learning_rate": 3.2562151192288185e-06, "loss": 0.0041, "step": 7742 }, { "epoch": 7.071232876712329, "grad_norm": 0.032881755381822586, "learning_rate": 3.2552004058853378e-06, "loss": 0.0002, "step": 7743 }, { "epoch": 7.072146118721461, "grad_norm": 1.1175099611282349, "learning_rate": 3.2541856925418574e-06, "loss": 0.001, "step": 7744 }, { "epoch": 7.073059360730594, "grad_norm": 1.266300916671753, "learning_rate": 3.2531709791983767e-06, "loss": 0.0089, "step": 7745 }, { "epoch": 7.073972602739726, "grad_norm": 0.3321775197982788, "learning_rate": 3.252156265854896e-06, "loss": 0.0017, "step": 7746 }, { "epoch": 7.0748858447488585, "grad_norm": 101.77709197998047, "learning_rate": 3.251141552511416e-06, "loss": 0.596, "step": 7747 }, { "epoch": 7.075799086757991, "grad_norm": 0.06925583630800247, "learning_rate": 3.2501268391679354e-06, "loss": 0.0004, "step": 7748 }, { "epoch": 7.076712328767123, "grad_norm": 2.1112775802612305, "learning_rate": 3.249112125824455e-06, "loss": 0.0156, "step": 7749 }, { "epoch": 7.077625570776256, "grad_norm": 2.216092586517334, "learning_rate": 3.2480974124809743e-06, "loss": 0.0137, "step": 7750 }, { "epoch": 7.078538812785388, "grad_norm": 15.612412452697754, "learning_rate": 3.2470826991374936e-06, "loss": 0.0562, "step": 7751 }, { "epoch": 7.079452054794521, "grad_norm": 21.120485305786133, "learning_rate": 3.2460679857940137e-06, "loss": 0.1191, "step": 7752 }, { "epoch": 7.080365296803653, "grad_norm": 1.708339810371399, "learning_rate": 3.245053272450533e-06, "loss": 0.0092, "step": 7753 }, { "epoch": 7.081278538812786, "grad_norm": 32.14799499511719, "learning_rate": 3.2440385591070527e-06, "loss": 0.2459, "step": 7754 }, { "epoch": 7.082191780821918, "grad_norm": 0.36812469363212585, "learning_rate": 3.243023845763572e-06, "loss": 0.0027, "step": 7755 }, { "epoch": 7.083105022831051, "grad_norm": 1.935863971710205, "learning_rate": 3.242009132420091e-06, "loss": 0.0125, "step": 7756 }, { "epoch": 7.084018264840183, "grad_norm": 0.5099879503250122, "learning_rate": 3.2409944190766113e-06, "loss": 0.0029, "step": 7757 }, { "epoch": 7.0849315068493155, "grad_norm": 4.357783794403076, "learning_rate": 3.2399797057331306e-06, "loss": 0.0236, "step": 7758 }, { "epoch": 7.085844748858447, "grad_norm": 0.36407142877578735, "learning_rate": 3.2389649923896503e-06, "loss": 0.0024, "step": 7759 }, { "epoch": 7.0867579908675795, "grad_norm": 0.019907204434275627, "learning_rate": 3.2379502790461696e-06, "loss": 0.0001, "step": 7760 }, { "epoch": 7.087671232876712, "grad_norm": 173.52549743652344, "learning_rate": 3.2369355657026892e-06, "loss": 0.3687, "step": 7761 }, { "epoch": 7.088584474885844, "grad_norm": 0.8272457718849182, "learning_rate": 3.235920852359209e-06, "loss": 0.0044, "step": 7762 }, { "epoch": 7.089497716894977, "grad_norm": 2.0485377311706543, "learning_rate": 3.234906139015728e-06, "loss": 0.0098, "step": 7763 }, { "epoch": 7.090410958904109, "grad_norm": 10.566425323486328, "learning_rate": 3.2338914256722483e-06, "loss": 0.064, "step": 7764 }, { "epoch": 7.091324200913242, "grad_norm": 23.413774490356445, "learning_rate": 3.2328767123287676e-06, "loss": 0.136, "step": 7765 }, { "epoch": 7.092237442922374, "grad_norm": 2.2023894786834717, "learning_rate": 3.231861998985287e-06, "loss": 0.0132, "step": 7766 }, { "epoch": 7.093150684931507, "grad_norm": 0.07451120764017105, "learning_rate": 3.2308472856418066e-06, "loss": 0.0004, "step": 7767 }, { "epoch": 7.094063926940639, "grad_norm": 7.91347599029541, "learning_rate": 3.229832572298326e-06, "loss": 0.0537, "step": 7768 }, { "epoch": 7.094977168949772, "grad_norm": 0.07722634077072144, "learning_rate": 3.228817858954846e-06, "loss": 0.0004, "step": 7769 }, { "epoch": 7.095890410958904, "grad_norm": 0.26011955738067627, "learning_rate": 3.227803145611365e-06, "loss": 0.0017, "step": 7770 }, { "epoch": 7.0968036529680365, "grad_norm": 1.0661427974700928, "learning_rate": 3.2267884322678845e-06, "loss": 0.0058, "step": 7771 }, { "epoch": 7.097716894977169, "grad_norm": 0.7194774746894836, "learning_rate": 3.225773718924404e-06, "loss": 0.0039, "step": 7772 }, { "epoch": 7.098630136986301, "grad_norm": 0.2636057734489441, "learning_rate": 3.2247590055809234e-06, "loss": 0.0013, "step": 7773 }, { "epoch": 7.099543378995434, "grad_norm": 83.41822052001953, "learning_rate": 3.2237442922374436e-06, "loss": 0.1802, "step": 7774 }, { "epoch": 7.100456621004566, "grad_norm": 0.9104375243186951, "learning_rate": 3.222729578893963e-06, "loss": 0.002, "step": 7775 }, { "epoch": 7.101369863013699, "grad_norm": 12.542502403259277, "learning_rate": 3.221714865550482e-06, "loss": 0.0872, "step": 7776 }, { "epoch": 7.102283105022831, "grad_norm": 75.84458923339844, "learning_rate": 3.2207001522070018e-06, "loss": 0.7207, "step": 7777 }, { "epoch": 7.103196347031964, "grad_norm": 0.09989835321903229, "learning_rate": 3.219685438863521e-06, "loss": 0.0007, "step": 7778 }, { "epoch": 7.104109589041096, "grad_norm": 7.889945030212402, "learning_rate": 3.218670725520041e-06, "loss": 0.0413, "step": 7779 }, { "epoch": 7.105022831050229, "grad_norm": 0.6053268313407898, "learning_rate": 3.2176560121765604e-06, "loss": 0.0045, "step": 7780 }, { "epoch": 7.105936073059361, "grad_norm": 1.1340001821517944, "learning_rate": 3.2166412988330797e-06, "loss": 0.0067, "step": 7781 }, { "epoch": 7.1068493150684935, "grad_norm": 0.36681386828422546, "learning_rate": 3.2156265854896e-06, "loss": 0.0018, "step": 7782 }, { "epoch": 7.107762557077626, "grad_norm": 0.0670321136713028, "learning_rate": 3.214611872146119e-06, "loss": 0.0004, "step": 7783 }, { "epoch": 7.108675799086758, "grad_norm": 1.8457398414611816, "learning_rate": 3.2135971588026388e-06, "loss": 0.0109, "step": 7784 }, { "epoch": 7.109589041095891, "grad_norm": 1.7627936601638794, "learning_rate": 3.212582445459158e-06, "loss": 0.0124, "step": 7785 }, { "epoch": 7.110502283105022, "grad_norm": 0.040096886456012726, "learning_rate": 3.2115677321156773e-06, "loss": 0.0003, "step": 7786 }, { "epoch": 7.111415525114155, "grad_norm": 1.312545657157898, "learning_rate": 3.2105530187721974e-06, "loss": 0.0066, "step": 7787 }, { "epoch": 7.112328767123287, "grad_norm": 0.04001160338521004, "learning_rate": 3.2095383054287167e-06, "loss": 0.0003, "step": 7788 }, { "epoch": 7.11324200913242, "grad_norm": 0.28133833408355713, "learning_rate": 3.2085235920852364e-06, "loss": 0.0015, "step": 7789 }, { "epoch": 7.114155251141552, "grad_norm": 0.6905208826065063, "learning_rate": 3.2075088787417557e-06, "loss": 0.004, "step": 7790 }, { "epoch": 7.115068493150685, "grad_norm": 2.3044941425323486, "learning_rate": 3.206494165398275e-06, "loss": 0.0105, "step": 7791 }, { "epoch": 7.115981735159817, "grad_norm": 0.11004932224750519, "learning_rate": 3.205479452054795e-06, "loss": 0.0006, "step": 7792 }, { "epoch": 7.11689497716895, "grad_norm": 0.06759826093912125, "learning_rate": 3.2044647387113143e-06, "loss": 0.0005, "step": 7793 }, { "epoch": 7.117808219178082, "grad_norm": 0.089854396879673, "learning_rate": 3.203450025367834e-06, "loss": 0.0006, "step": 7794 }, { "epoch": 7.1187214611872145, "grad_norm": 0.3776227533817291, "learning_rate": 3.2024353120243533e-06, "loss": 0.0026, "step": 7795 }, { "epoch": 7.119634703196347, "grad_norm": 0.09172698855400085, "learning_rate": 3.2014205986808725e-06, "loss": 0.0006, "step": 7796 }, { "epoch": 7.120547945205479, "grad_norm": 4.067783832550049, "learning_rate": 3.2004058853373927e-06, "loss": 0.0245, "step": 7797 }, { "epoch": 7.121461187214612, "grad_norm": 1.315737009048462, "learning_rate": 3.199391171993912e-06, "loss": 0.0082, "step": 7798 }, { "epoch": 7.122374429223744, "grad_norm": 10.09503173828125, "learning_rate": 3.1983764586504316e-06, "loss": 0.0701, "step": 7799 }, { "epoch": 7.123287671232877, "grad_norm": 2.1838505268096924, "learning_rate": 3.197361745306951e-06, "loss": 0.0115, "step": 7800 }, { "epoch": 7.124200913242009, "grad_norm": 1.1326005458831787, "learning_rate": 3.1963470319634706e-06, "loss": 0.0083, "step": 7801 }, { "epoch": 7.125114155251142, "grad_norm": 1.3777551651000977, "learning_rate": 3.1953323186199903e-06, "loss": 0.0091, "step": 7802 }, { "epoch": 7.126027397260274, "grad_norm": 0.03631759434938431, "learning_rate": 3.1943176052765095e-06, "loss": 0.0001, "step": 7803 }, { "epoch": 7.126940639269407, "grad_norm": 0.3590395450592041, "learning_rate": 3.1933028919330297e-06, "loss": 0.0022, "step": 7804 }, { "epoch": 7.127853881278539, "grad_norm": 22.27787208557129, "learning_rate": 3.192288178589549e-06, "loss": 0.1468, "step": 7805 }, { "epoch": 7.1287671232876715, "grad_norm": 0.38399538397789, "learning_rate": 3.191273465246068e-06, "loss": 0.0019, "step": 7806 }, { "epoch": 7.129680365296804, "grad_norm": 0.2780485153198242, "learning_rate": 3.190258751902588e-06, "loss": 0.0015, "step": 7807 }, { "epoch": 7.130593607305936, "grad_norm": 0.5634766221046448, "learning_rate": 3.189244038559107e-06, "loss": 0.0027, "step": 7808 }, { "epoch": 7.131506849315069, "grad_norm": 0.007540862075984478, "learning_rate": 3.1882293252156273e-06, "loss": 0.0, "step": 7809 }, { "epoch": 7.132420091324201, "grad_norm": 61.724037170410156, "learning_rate": 3.1872146118721465e-06, "loss": 0.6367, "step": 7810 }, { "epoch": 7.133333333333334, "grad_norm": 1.097636342048645, "learning_rate": 3.186199898528666e-06, "loss": 0.0074, "step": 7811 }, { "epoch": 7.134246575342466, "grad_norm": 0.711571455001831, "learning_rate": 3.1851851851851855e-06, "loss": 0.0036, "step": 7812 }, { "epoch": 7.135159817351598, "grad_norm": 0.14790664613246918, "learning_rate": 3.1841704718417048e-06, "loss": 0.0011, "step": 7813 }, { "epoch": 7.13607305936073, "grad_norm": 2.326829433441162, "learning_rate": 3.183155758498225e-06, "loss": 0.0169, "step": 7814 }, { "epoch": 7.136986301369863, "grad_norm": 0.07120069116353989, "learning_rate": 3.182141045154744e-06, "loss": 0.0003, "step": 7815 }, { "epoch": 7.137899543378995, "grad_norm": 0.05070219933986664, "learning_rate": 3.1811263318112634e-06, "loss": 0.0003, "step": 7816 }, { "epoch": 7.138812785388128, "grad_norm": 0.7185972929000854, "learning_rate": 3.180111618467783e-06, "loss": 0.0045, "step": 7817 }, { "epoch": 7.13972602739726, "grad_norm": 4.390379428863525, "learning_rate": 3.1790969051243024e-06, "loss": 0.0214, "step": 7818 }, { "epoch": 7.1406392694063925, "grad_norm": 4.488894462585449, "learning_rate": 3.1780821917808225e-06, "loss": 0.029, "step": 7819 }, { "epoch": 7.141552511415525, "grad_norm": 1.9772578477859497, "learning_rate": 3.1770674784373418e-06, "loss": 0.0063, "step": 7820 }, { "epoch": 7.142465753424657, "grad_norm": 1.339927315711975, "learning_rate": 3.176052765093861e-06, "loss": 0.006, "step": 7821 }, { "epoch": 7.14337899543379, "grad_norm": 2.967865228652954, "learning_rate": 3.1750380517503807e-06, "loss": 0.0217, "step": 7822 }, { "epoch": 7.144292237442922, "grad_norm": 40.059139251708984, "learning_rate": 3.1740233384069004e-06, "loss": 0.2032, "step": 7823 }, { "epoch": 7.145205479452055, "grad_norm": 0.2454233020544052, "learning_rate": 3.17300862506342e-06, "loss": 0.0013, "step": 7824 }, { "epoch": 7.146118721461187, "grad_norm": 32.06253433227539, "learning_rate": 3.1719939117199394e-06, "loss": 0.4222, "step": 7825 }, { "epoch": 7.14703196347032, "grad_norm": 1.1897542476654053, "learning_rate": 3.1709791983764586e-06, "loss": 0.0051, "step": 7826 }, { "epoch": 7.147945205479452, "grad_norm": 0.1163562685251236, "learning_rate": 3.1699644850329788e-06, "loss": 0.0007, "step": 7827 }, { "epoch": 7.148858447488585, "grad_norm": 6.429090976715088, "learning_rate": 3.168949771689498e-06, "loss": 0.0539, "step": 7828 }, { "epoch": 7.149771689497717, "grad_norm": 0.14239811897277832, "learning_rate": 3.1679350583460177e-06, "loss": 0.0006, "step": 7829 }, { "epoch": 7.1506849315068495, "grad_norm": 0.13157251477241516, "learning_rate": 3.166920345002537e-06, "loss": 0.0008, "step": 7830 }, { "epoch": 7.151598173515982, "grad_norm": 0.1871662586927414, "learning_rate": 3.1659056316590563e-06, "loss": 0.0012, "step": 7831 }, { "epoch": 7.152511415525114, "grad_norm": 28.212587356567383, "learning_rate": 3.1648909183155764e-06, "loss": 0.1483, "step": 7832 }, { "epoch": 7.153424657534247, "grad_norm": 0.014249633066356182, "learning_rate": 3.1638762049720956e-06, "loss": 0.0001, "step": 7833 }, { "epoch": 7.154337899543379, "grad_norm": 5.946959495544434, "learning_rate": 3.1628614916286153e-06, "loss": 0.0264, "step": 7834 }, { "epoch": 7.155251141552512, "grad_norm": 0.02814057655632496, "learning_rate": 3.1618467782851346e-06, "loss": 0.0002, "step": 7835 }, { "epoch": 7.156164383561644, "grad_norm": 2.3467299938201904, "learning_rate": 3.160832064941654e-06, "loss": 0.0119, "step": 7836 }, { "epoch": 7.157077625570777, "grad_norm": 0.5322665572166443, "learning_rate": 3.159817351598174e-06, "loss": 0.003, "step": 7837 }, { "epoch": 7.157990867579909, "grad_norm": 0.5152804255485535, "learning_rate": 3.1588026382546933e-06, "loss": 0.0033, "step": 7838 }, { "epoch": 7.1589041095890416, "grad_norm": 3.1090829372406006, "learning_rate": 3.157787924911213e-06, "loss": 0.0178, "step": 7839 }, { "epoch": 7.159817351598173, "grad_norm": 8.749892234802246, "learning_rate": 3.1567732115677322e-06, "loss": 0.0324, "step": 7840 }, { "epoch": 7.160730593607306, "grad_norm": 0.27707672119140625, "learning_rate": 3.155758498224252e-06, "loss": 0.0015, "step": 7841 }, { "epoch": 7.161643835616438, "grad_norm": 0.14400988817214966, "learning_rate": 3.1547437848807716e-06, "loss": 0.0009, "step": 7842 }, { "epoch": 7.1625570776255705, "grad_norm": 0.7530466914176941, "learning_rate": 3.153729071537291e-06, "loss": 0.0047, "step": 7843 }, { "epoch": 7.163470319634703, "grad_norm": 2.9999358654022217, "learning_rate": 3.152714358193811e-06, "loss": 0.0202, "step": 7844 }, { "epoch": 7.164383561643835, "grad_norm": 13.788439750671387, "learning_rate": 3.1516996448503303e-06, "loss": 0.0383, "step": 7845 }, { "epoch": 7.165296803652968, "grad_norm": 1.8527202606201172, "learning_rate": 3.1506849315068495e-06, "loss": 0.0082, "step": 7846 }, { "epoch": 7.1662100456621, "grad_norm": 0.0890679582953453, "learning_rate": 3.1496702181633692e-06, "loss": 0.0005, "step": 7847 }, { "epoch": 7.167123287671233, "grad_norm": 8.041220664978027, "learning_rate": 3.1486555048198885e-06, "loss": 0.0503, "step": 7848 }, { "epoch": 7.168036529680365, "grad_norm": 0.023511452600359917, "learning_rate": 3.1476407914764086e-06, "loss": 0.0001, "step": 7849 }, { "epoch": 7.168949771689498, "grad_norm": 3.7428460121154785, "learning_rate": 3.146626078132928e-06, "loss": 0.0214, "step": 7850 }, { "epoch": 7.16986301369863, "grad_norm": 0.5267532467842102, "learning_rate": 3.145611364789447e-06, "loss": 0.0032, "step": 7851 }, { "epoch": 7.170776255707763, "grad_norm": 13.229331970214844, "learning_rate": 3.144596651445967e-06, "loss": 0.0668, "step": 7852 }, { "epoch": 7.171689497716895, "grad_norm": 1.423063039779663, "learning_rate": 3.143581938102486e-06, "loss": 0.0061, "step": 7853 }, { "epoch": 7.1726027397260275, "grad_norm": 8.92182445526123, "learning_rate": 3.142567224759006e-06, "loss": 0.0609, "step": 7854 }, { "epoch": 7.17351598173516, "grad_norm": 5.204710483551025, "learning_rate": 3.1415525114155255e-06, "loss": 0.0291, "step": 7855 }, { "epoch": 7.174429223744292, "grad_norm": 0.06719030439853668, "learning_rate": 3.1405377980720447e-06, "loss": 0.0004, "step": 7856 }, { "epoch": 7.175342465753425, "grad_norm": 0.150113046169281, "learning_rate": 3.1395230847285644e-06, "loss": 0.0009, "step": 7857 }, { "epoch": 7.176255707762557, "grad_norm": 1.1795997619628906, "learning_rate": 3.1385083713850837e-06, "loss": 0.0082, "step": 7858 }, { "epoch": 7.17716894977169, "grad_norm": 0.009001713246107101, "learning_rate": 3.137493658041604e-06, "loss": 0.0001, "step": 7859 }, { "epoch": 7.178082191780822, "grad_norm": 0.3157157897949219, "learning_rate": 3.136478944698123e-06, "loss": 0.0019, "step": 7860 }, { "epoch": 7.178995433789955, "grad_norm": 0.020408928394317627, "learning_rate": 3.1354642313546424e-06, "loss": 0.0001, "step": 7861 }, { "epoch": 7.179908675799087, "grad_norm": 0.014710056595504284, "learning_rate": 3.134449518011162e-06, "loss": 0.0001, "step": 7862 }, { "epoch": 7.1808219178082195, "grad_norm": 0.7805894613265991, "learning_rate": 3.1334348046676817e-06, "loss": 0.0027, "step": 7863 }, { "epoch": 7.181735159817352, "grad_norm": 0.6305264234542847, "learning_rate": 3.1324200913242014e-06, "loss": 0.0042, "step": 7864 }, { "epoch": 7.182648401826484, "grad_norm": 0.8578463792800903, "learning_rate": 3.1314053779807207e-06, "loss": 0.005, "step": 7865 }, { "epoch": 7.183561643835616, "grad_norm": 0.8079563975334167, "learning_rate": 3.13039066463724e-06, "loss": 0.0077, "step": 7866 }, { "epoch": 7.1844748858447485, "grad_norm": 8.831713676452637, "learning_rate": 3.12937595129376e-06, "loss": 0.0481, "step": 7867 }, { "epoch": 7.185388127853881, "grad_norm": 0.35480740666389465, "learning_rate": 3.1283612379502794e-06, "loss": 0.0031, "step": 7868 }, { "epoch": 7.186301369863013, "grad_norm": 0.9896635413169861, "learning_rate": 3.127346524606799e-06, "loss": 0.0048, "step": 7869 }, { "epoch": 7.187214611872146, "grad_norm": 0.17974618077278137, "learning_rate": 3.1263318112633183e-06, "loss": 0.0012, "step": 7870 }, { "epoch": 7.188127853881278, "grad_norm": 29.128662109375, "learning_rate": 3.1253170979198376e-06, "loss": 0.1249, "step": 7871 }, { "epoch": 7.189041095890411, "grad_norm": 6.9378533363342285, "learning_rate": 3.1243023845763577e-06, "loss": 0.0515, "step": 7872 }, { "epoch": 7.189954337899543, "grad_norm": 0.5245470404624939, "learning_rate": 3.123287671232877e-06, "loss": 0.003, "step": 7873 }, { "epoch": 7.190867579908676, "grad_norm": 0.949887752532959, "learning_rate": 3.1222729578893967e-06, "loss": 0.0049, "step": 7874 }, { "epoch": 7.191780821917808, "grad_norm": 0.5885679721832275, "learning_rate": 3.121258244545916e-06, "loss": 0.0041, "step": 7875 }, { "epoch": 7.1926940639269406, "grad_norm": 0.004066267516463995, "learning_rate": 3.120243531202435e-06, "loss": 0.0, "step": 7876 }, { "epoch": 7.193607305936073, "grad_norm": 5.440922737121582, "learning_rate": 3.1192288178589553e-06, "loss": 0.0448, "step": 7877 }, { "epoch": 7.1945205479452055, "grad_norm": 3.051544189453125, "learning_rate": 3.1182141045154746e-06, "loss": 0.0171, "step": 7878 }, { "epoch": 7.195433789954338, "grad_norm": 0.6688332557678223, "learning_rate": 3.1171993911719943e-06, "loss": 0.0034, "step": 7879 }, { "epoch": 7.19634703196347, "grad_norm": 1.6255269050598145, "learning_rate": 3.1161846778285135e-06, "loss": 0.0101, "step": 7880 }, { "epoch": 7.197260273972603, "grad_norm": 0.3331950008869171, "learning_rate": 3.1151699644850332e-06, "loss": 0.0007, "step": 7881 }, { "epoch": 7.198173515981735, "grad_norm": 5.988378047943115, "learning_rate": 3.114155251141553e-06, "loss": 0.0343, "step": 7882 }, { "epoch": 7.199086757990868, "grad_norm": 0.04933895543217659, "learning_rate": 3.113140537798072e-06, "loss": 0.0004, "step": 7883 }, { "epoch": 7.2, "grad_norm": 1.1474553346633911, "learning_rate": 3.1121258244545923e-06, "loss": 0.0068, "step": 7884 }, { "epoch": 7.200913242009133, "grad_norm": 51.97291564941406, "learning_rate": 3.1111111111111116e-06, "loss": 0.3226, "step": 7885 }, { "epoch": 7.201826484018265, "grad_norm": 0.4060530662536621, "learning_rate": 3.110096397767631e-06, "loss": 0.0026, "step": 7886 }, { "epoch": 7.2027397260273975, "grad_norm": 5.482873439788818, "learning_rate": 3.1090816844241505e-06, "loss": 0.0298, "step": 7887 }, { "epoch": 7.20365296803653, "grad_norm": 1.8398722410202026, "learning_rate": 3.10806697108067e-06, "loss": 0.0081, "step": 7888 }, { "epoch": 7.2045662100456624, "grad_norm": 3.329193592071533, "learning_rate": 3.10705225773719e-06, "loss": 0.0147, "step": 7889 }, { "epoch": 7.205479452054795, "grad_norm": 0.037664711475372314, "learning_rate": 3.106037544393709e-06, "loss": 0.0002, "step": 7890 }, { "epoch": 7.206392694063927, "grad_norm": 2.7440083026885986, "learning_rate": 3.1050228310502285e-06, "loss": 0.0125, "step": 7891 }, { "epoch": 7.207305936073059, "grad_norm": 2.4196643829345703, "learning_rate": 3.104008117706748e-06, "loss": 0.0084, "step": 7892 }, { "epoch": 7.208219178082191, "grad_norm": 4.873528003692627, "learning_rate": 3.1029934043632674e-06, "loss": 0.0279, "step": 7893 }, { "epoch": 7.209132420091324, "grad_norm": 1.655281901359558, "learning_rate": 3.1019786910197875e-06, "loss": 0.0045, "step": 7894 }, { "epoch": 7.210045662100456, "grad_norm": 1.3270591497421265, "learning_rate": 3.100963977676307e-06, "loss": 0.0087, "step": 7895 }, { "epoch": 7.210958904109589, "grad_norm": 1.1357605457305908, "learning_rate": 3.099949264332826e-06, "loss": 0.0057, "step": 7896 }, { "epoch": 7.211872146118721, "grad_norm": 3.7086997032165527, "learning_rate": 3.0989345509893458e-06, "loss": 0.0207, "step": 7897 }, { "epoch": 7.212785388127854, "grad_norm": 1.5430318117141724, "learning_rate": 3.097919837645865e-06, "loss": 0.0079, "step": 7898 }, { "epoch": 7.213698630136986, "grad_norm": 0.18690258264541626, "learning_rate": 3.096905124302385e-06, "loss": 0.0011, "step": 7899 }, { "epoch": 7.2146118721461185, "grad_norm": 0.28840556740760803, "learning_rate": 3.0958904109589044e-06, "loss": 0.0017, "step": 7900 }, { "epoch": 7.215525114155251, "grad_norm": 0.12974227964878082, "learning_rate": 3.0948756976154237e-06, "loss": 0.0008, "step": 7901 }, { "epoch": 7.2164383561643834, "grad_norm": 0.41090676188468933, "learning_rate": 3.0938609842719434e-06, "loss": 0.003, "step": 7902 }, { "epoch": 7.217351598173516, "grad_norm": 4.633270740509033, "learning_rate": 3.092846270928463e-06, "loss": 0.0295, "step": 7903 }, { "epoch": 7.218264840182648, "grad_norm": 0.041080277413129807, "learning_rate": 3.0918315575849828e-06, "loss": 0.0003, "step": 7904 }, { "epoch": 7.219178082191781, "grad_norm": 0.008255505934357643, "learning_rate": 3.090816844241502e-06, "loss": 0.0, "step": 7905 }, { "epoch": 7.220091324200913, "grad_norm": 2.3750553131103516, "learning_rate": 3.0898021308980213e-06, "loss": 0.0089, "step": 7906 }, { "epoch": 7.221004566210046, "grad_norm": 7.907370567321777, "learning_rate": 3.0887874175545414e-06, "loss": 0.0531, "step": 7907 }, { "epoch": 7.221917808219178, "grad_norm": 0.10773328691720963, "learning_rate": 3.0877727042110607e-06, "loss": 0.0006, "step": 7908 }, { "epoch": 7.222831050228311, "grad_norm": 17.64082908630371, "learning_rate": 3.0867579908675804e-06, "loss": 0.0992, "step": 7909 }, { "epoch": 7.223744292237443, "grad_norm": 0.9925012588500977, "learning_rate": 3.0857432775240996e-06, "loss": 0.0073, "step": 7910 }, { "epoch": 7.2246575342465755, "grad_norm": 10.713794708251953, "learning_rate": 3.084728564180619e-06, "loss": 0.0466, "step": 7911 }, { "epoch": 7.225570776255708, "grad_norm": 0.03424527868628502, "learning_rate": 3.083713850837139e-06, "loss": 0.0002, "step": 7912 }, { "epoch": 7.22648401826484, "grad_norm": 0.5579819679260254, "learning_rate": 3.0826991374936583e-06, "loss": 0.0039, "step": 7913 }, { "epoch": 7.227397260273973, "grad_norm": 0.06469880044460297, "learning_rate": 3.081684424150178e-06, "loss": 0.0004, "step": 7914 }, { "epoch": 7.228310502283105, "grad_norm": 0.11091339588165283, "learning_rate": 3.0806697108066973e-06, "loss": 0.0009, "step": 7915 }, { "epoch": 7.229223744292238, "grad_norm": 0.26949238777160645, "learning_rate": 3.0796549974632165e-06, "loss": 0.0012, "step": 7916 }, { "epoch": 7.23013698630137, "grad_norm": 2.6466801166534424, "learning_rate": 3.0786402841197366e-06, "loss": 0.0157, "step": 7917 }, { "epoch": 7.231050228310503, "grad_norm": 0.036566250026226044, "learning_rate": 3.077625570776256e-06, "loss": 0.0002, "step": 7918 }, { "epoch": 7.231963470319634, "grad_norm": 0.29484760761260986, "learning_rate": 3.0766108574327756e-06, "loss": 0.0021, "step": 7919 }, { "epoch": 7.232876712328767, "grad_norm": 0.05529797822237015, "learning_rate": 3.075596144089295e-06, "loss": 0.0005, "step": 7920 }, { "epoch": 7.233789954337899, "grad_norm": 0.06186117231845856, "learning_rate": 3.0745814307458146e-06, "loss": 0.0004, "step": 7921 }, { "epoch": 7.234703196347032, "grad_norm": 5.993383407592773, "learning_rate": 3.0735667174023343e-06, "loss": 0.0322, "step": 7922 }, { "epoch": 7.235616438356164, "grad_norm": 2.5825557708740234, "learning_rate": 3.0725520040588535e-06, "loss": 0.0124, "step": 7923 }, { "epoch": 7.2365296803652965, "grad_norm": 0.24321484565734863, "learning_rate": 3.0715372907153736e-06, "loss": 0.0018, "step": 7924 }, { "epoch": 7.237442922374429, "grad_norm": 3.4876835346221924, "learning_rate": 3.070522577371893e-06, "loss": 0.0187, "step": 7925 }, { "epoch": 7.238356164383561, "grad_norm": 1.601223349571228, "learning_rate": 3.069507864028412e-06, "loss": 0.0083, "step": 7926 }, { "epoch": 7.239269406392694, "grad_norm": 14.591485977172852, "learning_rate": 3.068493150684932e-06, "loss": 0.0841, "step": 7927 }, { "epoch": 7.240182648401826, "grad_norm": 0.9370444416999817, "learning_rate": 3.067478437341451e-06, "loss": 0.0044, "step": 7928 }, { "epoch": 7.241095890410959, "grad_norm": 1.594084620475769, "learning_rate": 3.0664637239979713e-06, "loss": 0.0074, "step": 7929 }, { "epoch": 7.242009132420091, "grad_norm": 0.43389394879341125, "learning_rate": 3.0654490106544905e-06, "loss": 0.0027, "step": 7930 }, { "epoch": 7.242922374429224, "grad_norm": 53.29328918457031, "learning_rate": 3.06443429731101e-06, "loss": 0.5095, "step": 7931 }, { "epoch": 7.243835616438356, "grad_norm": 0.2630879878997803, "learning_rate": 3.0634195839675295e-06, "loss": 0.0011, "step": 7932 }, { "epoch": 7.244748858447489, "grad_norm": 1.401027798652649, "learning_rate": 3.0624048706240488e-06, "loss": 0.0098, "step": 7933 }, { "epoch": 7.245662100456621, "grad_norm": 0.5525004267692566, "learning_rate": 3.061390157280569e-06, "loss": 0.0046, "step": 7934 }, { "epoch": 7.2465753424657535, "grad_norm": 6.7672810554504395, "learning_rate": 3.060375443937088e-06, "loss": 0.0264, "step": 7935 }, { "epoch": 7.247488584474886, "grad_norm": 0.8205968141555786, "learning_rate": 3.0593607305936074e-06, "loss": 0.0032, "step": 7936 }, { "epoch": 7.248401826484018, "grad_norm": 0.15902645885944366, "learning_rate": 3.058346017250127e-06, "loss": 0.0008, "step": 7937 }, { "epoch": 7.249315068493151, "grad_norm": 0.03954535350203514, "learning_rate": 3.0573313039066464e-06, "loss": 0.0003, "step": 7938 }, { "epoch": 7.250228310502283, "grad_norm": 8.471330642700195, "learning_rate": 3.0563165905631665e-06, "loss": 0.0439, "step": 7939 }, { "epoch": 7.251141552511416, "grad_norm": 0.13763338327407837, "learning_rate": 3.0553018772196858e-06, "loss": 0.0009, "step": 7940 }, { "epoch": 7.252054794520548, "grad_norm": 0.8431074619293213, "learning_rate": 3.054287163876205e-06, "loss": 0.0054, "step": 7941 }, { "epoch": 7.252968036529681, "grad_norm": 2.6708199977874756, "learning_rate": 3.0532724505327247e-06, "loss": 0.0175, "step": 7942 }, { "epoch": 7.253881278538813, "grad_norm": 1.3976123332977295, "learning_rate": 3.0522577371892444e-06, "loss": 0.0087, "step": 7943 }, { "epoch": 7.254794520547946, "grad_norm": 4.150523662567139, "learning_rate": 3.051243023845764e-06, "loss": 0.0203, "step": 7944 }, { "epoch": 7.255707762557078, "grad_norm": 0.5631294250488281, "learning_rate": 3.0502283105022834e-06, "loss": 0.0025, "step": 7945 }, { "epoch": 7.25662100456621, "grad_norm": 0.2916569411754608, "learning_rate": 3.0492135971588026e-06, "loss": 0.0019, "step": 7946 }, { "epoch": 7.257534246575342, "grad_norm": 0.06096472218632698, "learning_rate": 3.0481988838153227e-06, "loss": 0.0004, "step": 7947 }, { "epoch": 7.2584474885844745, "grad_norm": 0.11440154910087585, "learning_rate": 3.047184170471842e-06, "loss": 0.0007, "step": 7948 }, { "epoch": 7.259360730593607, "grad_norm": 0.26470157504081726, "learning_rate": 3.0461694571283617e-06, "loss": 0.0018, "step": 7949 }, { "epoch": 7.260273972602739, "grad_norm": 5.022436141967773, "learning_rate": 3.045154743784881e-06, "loss": 0.0286, "step": 7950 }, { "epoch": 7.261187214611872, "grad_norm": 0.8090062737464905, "learning_rate": 3.0441400304414002e-06, "loss": 0.0038, "step": 7951 }, { "epoch": 7.262100456621004, "grad_norm": 3.292011022567749, "learning_rate": 3.0431253170979204e-06, "loss": 0.0151, "step": 7952 }, { "epoch": 7.263013698630137, "grad_norm": 0.24218198657035828, "learning_rate": 3.0421106037544396e-06, "loss": 0.0016, "step": 7953 }, { "epoch": 7.263926940639269, "grad_norm": 101.4675064086914, "learning_rate": 3.0410958904109593e-06, "loss": 0.74, "step": 7954 }, { "epoch": 7.264840182648402, "grad_norm": 0.5608952045440674, "learning_rate": 3.0400811770674786e-06, "loss": 0.0026, "step": 7955 }, { "epoch": 7.265753424657534, "grad_norm": 36.2454833984375, "learning_rate": 3.039066463723998e-06, "loss": 0.1716, "step": 7956 }, { "epoch": 7.266666666666667, "grad_norm": 0.9157142639160156, "learning_rate": 3.038051750380518e-06, "loss": 0.0038, "step": 7957 }, { "epoch": 7.267579908675799, "grad_norm": 0.4996253252029419, "learning_rate": 3.0370370370370372e-06, "loss": 0.0031, "step": 7958 }, { "epoch": 7.2684931506849315, "grad_norm": 0.4889223277568817, "learning_rate": 3.036022323693557e-06, "loss": 0.0019, "step": 7959 }, { "epoch": 7.269406392694064, "grad_norm": 0.6047768592834473, "learning_rate": 3.035007610350076e-06, "loss": 0.0032, "step": 7960 }, { "epoch": 7.270319634703196, "grad_norm": 0.3608936667442322, "learning_rate": 3.033992897006596e-06, "loss": 0.0013, "step": 7961 }, { "epoch": 7.271232876712329, "grad_norm": 0.315207302570343, "learning_rate": 3.0329781836631156e-06, "loss": 0.0016, "step": 7962 }, { "epoch": 7.272146118721461, "grad_norm": 0.9979442954063416, "learning_rate": 3.031963470319635e-06, "loss": 0.0067, "step": 7963 }, { "epoch": 7.273059360730594, "grad_norm": 1.6649342775344849, "learning_rate": 3.0309487569761546e-06, "loss": 0.0072, "step": 7964 }, { "epoch": 7.273972602739726, "grad_norm": 0.14988400042057037, "learning_rate": 3.0299340436326742e-06, "loss": 0.0007, "step": 7965 }, { "epoch": 7.274885844748859, "grad_norm": 0.06699997186660767, "learning_rate": 3.0289193302891935e-06, "loss": 0.0003, "step": 7966 }, { "epoch": 7.275799086757991, "grad_norm": 1.1787874698638916, "learning_rate": 3.027904616945713e-06, "loss": 0.0089, "step": 7967 }, { "epoch": 7.276712328767124, "grad_norm": 110.73027038574219, "learning_rate": 3.0268899036022325e-06, "loss": 1.3053, "step": 7968 }, { "epoch": 7.277625570776256, "grad_norm": 1.2778644561767578, "learning_rate": 3.0258751902587526e-06, "loss": 0.0064, "step": 7969 }, { "epoch": 7.2785388127853885, "grad_norm": 0.40913552045822144, "learning_rate": 3.024860476915272e-06, "loss": 0.0013, "step": 7970 }, { "epoch": 7.279452054794521, "grad_norm": 0.7130905985832214, "learning_rate": 3.023845763571791e-06, "loss": 0.0037, "step": 7971 }, { "epoch": 7.280365296803653, "grad_norm": 53.090675354003906, "learning_rate": 3.022831050228311e-06, "loss": 0.2835, "step": 7972 }, { "epoch": 7.281278538812785, "grad_norm": 8.868134498596191, "learning_rate": 3.02181633688483e-06, "loss": 0.0418, "step": 7973 }, { "epoch": 7.282191780821917, "grad_norm": 0.6544371247291565, "learning_rate": 3.02080162354135e-06, "loss": 0.0056, "step": 7974 }, { "epoch": 7.28310502283105, "grad_norm": 0.05948341637849808, "learning_rate": 3.0197869101978695e-06, "loss": 0.0002, "step": 7975 }, { "epoch": 7.284018264840182, "grad_norm": 13.06666374206543, "learning_rate": 3.0187721968543887e-06, "loss": 0.0841, "step": 7976 }, { "epoch": 7.284931506849315, "grad_norm": 1.1744849681854248, "learning_rate": 3.0177574835109084e-06, "loss": 0.0078, "step": 7977 }, { "epoch": 7.285844748858447, "grad_norm": 7.424348831176758, "learning_rate": 3.0167427701674277e-06, "loss": 0.043, "step": 7978 }, { "epoch": 7.28675799086758, "grad_norm": 0.04138213023543358, "learning_rate": 3.015728056823948e-06, "loss": 0.0003, "step": 7979 }, { "epoch": 7.287671232876712, "grad_norm": 1.9753427505493164, "learning_rate": 3.014713343480467e-06, "loss": 0.0091, "step": 7980 }, { "epoch": 7.288584474885845, "grad_norm": 1.3813902139663696, "learning_rate": 3.0136986301369864e-06, "loss": 0.009, "step": 7981 }, { "epoch": 7.289497716894977, "grad_norm": 2.91634464263916, "learning_rate": 3.012683916793506e-06, "loss": 0.0171, "step": 7982 }, { "epoch": 7.2904109589041095, "grad_norm": 10.135564804077148, "learning_rate": 3.0116692034500257e-06, "loss": 0.0564, "step": 7983 }, { "epoch": 7.291324200913242, "grad_norm": 2.128919839859009, "learning_rate": 3.0106544901065454e-06, "loss": 0.0146, "step": 7984 }, { "epoch": 7.292237442922374, "grad_norm": 1.2351304292678833, "learning_rate": 3.0096397767630647e-06, "loss": 0.0068, "step": 7985 }, { "epoch": 7.293150684931507, "grad_norm": 0.3191891610622406, "learning_rate": 3.008625063419584e-06, "loss": 0.0016, "step": 7986 }, { "epoch": 7.294063926940639, "grad_norm": 66.51930236816406, "learning_rate": 3.007610350076104e-06, "loss": 0.6184, "step": 7987 }, { "epoch": 7.294977168949772, "grad_norm": 0.7321865558624268, "learning_rate": 3.0065956367326233e-06, "loss": 0.0023, "step": 7988 }, { "epoch": 7.295890410958904, "grad_norm": 46.2956428527832, "learning_rate": 3.005580923389143e-06, "loss": 0.188, "step": 7989 }, { "epoch": 7.296803652968037, "grad_norm": 0.17113345861434937, "learning_rate": 3.0045662100456623e-06, "loss": 0.0009, "step": 7990 }, { "epoch": 7.297716894977169, "grad_norm": 18.458032608032227, "learning_rate": 3.0035514967021816e-06, "loss": 0.1214, "step": 7991 }, { "epoch": 7.298630136986302, "grad_norm": 120.55004119873047, "learning_rate": 3.0025367833587017e-06, "loss": 1.524, "step": 7992 }, { "epoch": 7.299543378995434, "grad_norm": 3.089782238006592, "learning_rate": 3.001522070015221e-06, "loss": 0.0257, "step": 7993 }, { "epoch": 7.3004566210045665, "grad_norm": 0.5189289450645447, "learning_rate": 3.0005073566717407e-06, "loss": 0.0024, "step": 7994 }, { "epoch": 7.301369863013699, "grad_norm": 0.742706835269928, "learning_rate": 2.99949264332826e-06, "loss": 0.0043, "step": 7995 }, { "epoch": 7.302283105022831, "grad_norm": 0.17779026925563812, "learning_rate": 2.998477929984779e-06, "loss": 0.001, "step": 7996 }, { "epoch": 7.303196347031964, "grad_norm": 0.13086922466754913, "learning_rate": 2.9974632166412993e-06, "loss": 0.0008, "step": 7997 }, { "epoch": 7.304109589041096, "grad_norm": 2.632913589477539, "learning_rate": 2.9964485032978186e-06, "loss": 0.0138, "step": 7998 }, { "epoch": 7.305022831050229, "grad_norm": 0.05234808474779129, "learning_rate": 2.995433789954338e-06, "loss": 0.0003, "step": 7999 }, { "epoch": 7.30593607305936, "grad_norm": 0.27338457107543945, "learning_rate": 2.9944190766108575e-06, "loss": 0.0014, "step": 8000 }, { "epoch": 7.306849315068493, "grad_norm": 2.678518056869507, "learning_rate": 2.9934043632673772e-06, "loss": 0.0132, "step": 8001 }, { "epoch": 7.307762557077625, "grad_norm": 7.3148980140686035, "learning_rate": 2.992389649923897e-06, "loss": 0.0514, "step": 8002 }, { "epoch": 7.308675799086758, "grad_norm": 1.5228519439697266, "learning_rate": 2.991374936580416e-06, "loss": 0.0067, "step": 8003 }, { "epoch": 7.30958904109589, "grad_norm": 0.040116798132658005, "learning_rate": 2.9903602232369355e-06, "loss": 0.0002, "step": 8004 }, { "epoch": 7.310502283105023, "grad_norm": 0.30389994382858276, "learning_rate": 2.9893455098934556e-06, "loss": 0.0019, "step": 8005 }, { "epoch": 7.311415525114155, "grad_norm": 0.32783740758895874, "learning_rate": 2.988330796549975e-06, "loss": 0.0017, "step": 8006 }, { "epoch": 7.3123287671232875, "grad_norm": 11.783720016479492, "learning_rate": 2.9873160832064945e-06, "loss": 0.091, "step": 8007 }, { "epoch": 7.31324200913242, "grad_norm": 0.4516523778438568, "learning_rate": 2.986301369863014e-06, "loss": 0.0031, "step": 8008 }, { "epoch": 7.314155251141552, "grad_norm": 3.1629414558410645, "learning_rate": 2.985286656519533e-06, "loss": 0.02, "step": 8009 }, { "epoch": 7.315068493150685, "grad_norm": 0.863415002822876, "learning_rate": 2.984271943176053e-06, "loss": 0.0068, "step": 8010 }, { "epoch": 7.315981735159817, "grad_norm": 1.613629937171936, "learning_rate": 2.9832572298325725e-06, "loss": 0.0069, "step": 8011 }, { "epoch": 7.31689497716895, "grad_norm": 5.4456257820129395, "learning_rate": 2.982242516489092e-06, "loss": 0.0197, "step": 8012 }, { "epoch": 7.317808219178082, "grad_norm": 0.7588803172111511, "learning_rate": 2.9812278031456114e-06, "loss": 0.0044, "step": 8013 }, { "epoch": 7.318721461187215, "grad_norm": 7.771955490112305, "learning_rate": 2.9802130898021307e-06, "loss": 0.0413, "step": 8014 }, { "epoch": 7.319634703196347, "grad_norm": 1.077540636062622, "learning_rate": 2.979198376458651e-06, "loss": 0.0063, "step": 8015 }, { "epoch": 7.32054794520548, "grad_norm": 0.030666330829262733, "learning_rate": 2.97818366311517e-06, "loss": 0.0002, "step": 8016 }, { "epoch": 7.321461187214612, "grad_norm": 2.9590325355529785, "learning_rate": 2.9771689497716898e-06, "loss": 0.0121, "step": 8017 }, { "epoch": 7.3223744292237445, "grad_norm": 0.009747479110956192, "learning_rate": 2.976154236428209e-06, "loss": 0.0001, "step": 8018 }, { "epoch": 7.323287671232877, "grad_norm": 0.5552433133125305, "learning_rate": 2.9751395230847287e-06, "loss": 0.0019, "step": 8019 }, { "epoch": 7.324200913242009, "grad_norm": 0.26886168122291565, "learning_rate": 2.9741248097412484e-06, "loss": 0.0024, "step": 8020 }, { "epoch": 7.325114155251142, "grad_norm": 0.4869210124015808, "learning_rate": 2.9731100963977677e-06, "loss": 0.0027, "step": 8021 }, { "epoch": 7.326027397260274, "grad_norm": 4.5645036697387695, "learning_rate": 2.9720953830542874e-06, "loss": 0.0297, "step": 8022 }, { "epoch": 7.326940639269407, "grad_norm": 0.043147649616003036, "learning_rate": 2.971080669710807e-06, "loss": 0.0003, "step": 8023 }, { "epoch": 7.327853881278539, "grad_norm": 0.18680605292320251, "learning_rate": 2.9700659563673263e-06, "loss": 0.0012, "step": 8024 }, { "epoch": 7.328767123287671, "grad_norm": 2.7118608951568604, "learning_rate": 2.969051243023846e-06, "loss": 0.0134, "step": 8025 }, { "epoch": 7.329680365296804, "grad_norm": 0.5393028259277344, "learning_rate": 2.9680365296803653e-06, "loss": 0.0032, "step": 8026 }, { "epoch": 7.330593607305936, "grad_norm": 0.2312234789133072, "learning_rate": 2.9670218163368854e-06, "loss": 0.0015, "step": 8027 }, { "epoch": 7.331506849315068, "grad_norm": 1.513522744178772, "learning_rate": 2.9660071029934047e-06, "loss": 0.0071, "step": 8028 }, { "epoch": 7.332420091324201, "grad_norm": 12.84854507446289, "learning_rate": 2.964992389649924e-06, "loss": 0.017, "step": 8029 }, { "epoch": 7.333333333333333, "grad_norm": 0.15803581476211548, "learning_rate": 2.9639776763064436e-06, "loss": 0.001, "step": 8030 }, { "epoch": 7.3342465753424655, "grad_norm": 4.392039775848389, "learning_rate": 2.962962962962963e-06, "loss": 0.0222, "step": 8031 }, { "epoch": 7.335159817351598, "grad_norm": 0.39330682158470154, "learning_rate": 2.961948249619483e-06, "loss": 0.0026, "step": 8032 }, { "epoch": 7.33607305936073, "grad_norm": 0.31941425800323486, "learning_rate": 2.9609335362760023e-06, "loss": 0.0016, "step": 8033 }, { "epoch": 7.336986301369863, "grad_norm": 1.6298927068710327, "learning_rate": 2.9599188229325216e-06, "loss": 0.0079, "step": 8034 }, { "epoch": 7.337899543378995, "grad_norm": 1.0550808906555176, "learning_rate": 2.9589041095890413e-06, "loss": 0.0059, "step": 8035 }, { "epoch": 7.338812785388128, "grad_norm": 0.20852378010749817, "learning_rate": 2.9578893962455605e-06, "loss": 0.0015, "step": 8036 }, { "epoch": 7.33972602739726, "grad_norm": 0.5866184234619141, "learning_rate": 2.9568746829020806e-06, "loss": 0.0038, "step": 8037 }, { "epoch": 7.340639269406393, "grad_norm": 46.31513977050781, "learning_rate": 2.9558599695586e-06, "loss": 0.4107, "step": 8038 }, { "epoch": 7.341552511415525, "grad_norm": 7.011202335357666, "learning_rate": 2.954845256215119e-06, "loss": 0.0276, "step": 8039 }, { "epoch": 7.342465753424658, "grad_norm": 1.810250997543335, "learning_rate": 2.953830542871639e-06, "loss": 0.0132, "step": 8040 }, { "epoch": 7.34337899543379, "grad_norm": 0.051802024245262146, "learning_rate": 2.9528158295281586e-06, "loss": 0.0004, "step": 8041 }, { "epoch": 7.3442922374429225, "grad_norm": 0.182891845703125, "learning_rate": 2.9518011161846782e-06, "loss": 0.0007, "step": 8042 }, { "epoch": 7.345205479452055, "grad_norm": 4.194972038269043, "learning_rate": 2.9507864028411975e-06, "loss": 0.0115, "step": 8043 }, { "epoch": 7.346118721461187, "grad_norm": 10.398356437683105, "learning_rate": 2.9497716894977168e-06, "loss": 0.0516, "step": 8044 }, { "epoch": 7.34703196347032, "grad_norm": 0.0038354434072971344, "learning_rate": 2.948756976154237e-06, "loss": 0.0, "step": 8045 }, { "epoch": 7.347945205479452, "grad_norm": 2.6765294075012207, "learning_rate": 2.947742262810756e-06, "loss": 0.0199, "step": 8046 }, { "epoch": 7.348858447488585, "grad_norm": 0.3111697733402252, "learning_rate": 2.946727549467276e-06, "loss": 0.0015, "step": 8047 }, { "epoch": 7.349771689497717, "grad_norm": 5.707574367523193, "learning_rate": 2.945712836123795e-06, "loss": 0.0241, "step": 8048 }, { "epoch": 7.35068493150685, "grad_norm": 0.017977794632315636, "learning_rate": 2.9446981227803144e-06, "loss": 0.0001, "step": 8049 }, { "epoch": 7.351598173515982, "grad_norm": 0.08396879583597183, "learning_rate": 2.9436834094368345e-06, "loss": 0.0005, "step": 8050 }, { "epoch": 7.352511415525115, "grad_norm": 9.151899337768555, "learning_rate": 2.9426686960933538e-06, "loss": 0.0627, "step": 8051 }, { "epoch": 7.353424657534246, "grad_norm": 0.3247464597225189, "learning_rate": 2.9416539827498735e-06, "loss": 0.0019, "step": 8052 }, { "epoch": 7.3543378995433795, "grad_norm": 105.31588745117188, "learning_rate": 2.9406392694063927e-06, "loss": 2.6827, "step": 8053 }, { "epoch": 7.355251141552511, "grad_norm": 3.723283290863037, "learning_rate": 2.939624556062912e-06, "loss": 0.0245, "step": 8054 }, { "epoch": 7.3561643835616435, "grad_norm": 0.4553930461406708, "learning_rate": 2.938609842719432e-06, "loss": 0.0024, "step": 8055 }, { "epoch": 7.357077625570776, "grad_norm": 1.1673277616500854, "learning_rate": 2.9375951293759514e-06, "loss": 0.0065, "step": 8056 }, { "epoch": 7.357990867579908, "grad_norm": 0.007826291024684906, "learning_rate": 2.936580416032471e-06, "loss": 0.0, "step": 8057 }, { "epoch": 7.358904109589041, "grad_norm": 0.01988253928720951, "learning_rate": 2.9355657026889904e-06, "loss": 0.0001, "step": 8058 }, { "epoch": 7.359817351598173, "grad_norm": 2.6857070922851562, "learning_rate": 2.93455098934551e-06, "loss": 0.013, "step": 8059 }, { "epoch": 7.360730593607306, "grad_norm": 0.952797532081604, "learning_rate": 2.9335362760020297e-06, "loss": 0.0059, "step": 8060 }, { "epoch": 7.361643835616438, "grad_norm": 21.228532791137695, "learning_rate": 2.932521562658549e-06, "loss": 0.1376, "step": 8061 }, { "epoch": 7.362557077625571, "grad_norm": 25.23097801208496, "learning_rate": 2.9315068493150687e-06, "loss": 0.1414, "step": 8062 }, { "epoch": 7.363470319634703, "grad_norm": 0.5773845314979553, "learning_rate": 2.9304921359715884e-06, "loss": 0.0033, "step": 8063 }, { "epoch": 7.364383561643836, "grad_norm": 1.1020382642745972, "learning_rate": 2.9294774226281077e-06, "loss": 0.0071, "step": 8064 }, { "epoch": 7.365296803652968, "grad_norm": 2.760152816772461, "learning_rate": 2.9284627092846274e-06, "loss": 0.0233, "step": 8065 }, { "epoch": 7.3662100456621005, "grad_norm": 1.170390248298645, "learning_rate": 2.9274479959411466e-06, "loss": 0.0066, "step": 8066 }, { "epoch": 7.367123287671233, "grad_norm": 0.019584788009524345, "learning_rate": 2.9264332825976667e-06, "loss": 0.0001, "step": 8067 }, { "epoch": 7.368036529680365, "grad_norm": 10.067975997924805, "learning_rate": 2.925418569254186e-06, "loss": 0.0475, "step": 8068 }, { "epoch": 7.368949771689498, "grad_norm": 1.356114149093628, "learning_rate": 2.9244038559107053e-06, "loss": 0.0064, "step": 8069 }, { "epoch": 7.36986301369863, "grad_norm": 3.7943289279937744, "learning_rate": 2.923389142567225e-06, "loss": 0.018, "step": 8070 }, { "epoch": 7.370776255707763, "grad_norm": 5.393095016479492, "learning_rate": 2.9223744292237442e-06, "loss": 0.0217, "step": 8071 }, { "epoch": 7.371689497716895, "grad_norm": 2.355302333831787, "learning_rate": 2.9213597158802644e-06, "loss": 0.0125, "step": 8072 }, { "epoch": 7.372602739726028, "grad_norm": 65.18145751953125, "learning_rate": 2.9203450025367836e-06, "loss": 0.4538, "step": 8073 }, { "epoch": 7.37351598173516, "grad_norm": 19.04876136779785, "learning_rate": 2.919330289193303e-06, "loss": 0.0673, "step": 8074 }, { "epoch": 7.3744292237442925, "grad_norm": 0.4721851944923401, "learning_rate": 2.9183155758498226e-06, "loss": 0.0021, "step": 8075 }, { "epoch": 7.375342465753425, "grad_norm": 0.9076579809188843, "learning_rate": 2.917300862506342e-06, "loss": 0.0056, "step": 8076 }, { "epoch": 7.3762557077625575, "grad_norm": 0.30471375584602356, "learning_rate": 2.916286149162862e-06, "loss": 0.0022, "step": 8077 }, { "epoch": 7.37716894977169, "grad_norm": 7.818599700927734, "learning_rate": 2.9152714358193812e-06, "loss": 0.0309, "step": 8078 }, { "epoch": 7.3780821917808215, "grad_norm": 0.06561930477619171, "learning_rate": 2.9142567224759005e-06, "loss": 0.0004, "step": 8079 }, { "epoch": 7.378995433789954, "grad_norm": 2.287273645401001, "learning_rate": 2.91324200913242e-06, "loss": 0.0083, "step": 8080 }, { "epoch": 7.379908675799086, "grad_norm": 2.4773802757263184, "learning_rate": 2.91222729578894e-06, "loss": 0.0087, "step": 8081 }, { "epoch": 7.380821917808219, "grad_norm": 0.00783489178866148, "learning_rate": 2.9112125824454596e-06, "loss": 0.0, "step": 8082 }, { "epoch": 7.381735159817351, "grad_norm": 0.8570103645324707, "learning_rate": 2.910197869101979e-06, "loss": 0.0048, "step": 8083 }, { "epoch": 7.382648401826484, "grad_norm": 0.5219835042953491, "learning_rate": 2.909183155758498e-06, "loss": 0.0036, "step": 8084 }, { "epoch": 7.383561643835616, "grad_norm": 0.33629700541496277, "learning_rate": 2.9081684424150182e-06, "loss": 0.0005, "step": 8085 }, { "epoch": 7.384474885844749, "grad_norm": 46.81624221801758, "learning_rate": 2.9071537290715375e-06, "loss": 0.2833, "step": 8086 }, { "epoch": 7.385388127853881, "grad_norm": 4.566618919372559, "learning_rate": 2.906139015728057e-06, "loss": 0.0344, "step": 8087 }, { "epoch": 7.3863013698630136, "grad_norm": 0.6665778756141663, "learning_rate": 2.9051243023845765e-06, "loss": 0.0041, "step": 8088 }, { "epoch": 7.387214611872146, "grad_norm": 2.490886926651001, "learning_rate": 2.9041095890410957e-06, "loss": 0.0154, "step": 8089 }, { "epoch": 7.3881278538812785, "grad_norm": 10.950526237487793, "learning_rate": 2.903094875697616e-06, "loss": 0.0364, "step": 8090 }, { "epoch": 7.389041095890411, "grad_norm": 2.398367404937744, "learning_rate": 2.902080162354135e-06, "loss": 0.0125, "step": 8091 }, { "epoch": 7.389954337899543, "grad_norm": 8.240381240844727, "learning_rate": 2.901065449010655e-06, "loss": 0.0573, "step": 8092 }, { "epoch": 7.390867579908676, "grad_norm": 0.09852850437164307, "learning_rate": 2.900050735667174e-06, "loss": 0.0005, "step": 8093 }, { "epoch": 7.391780821917808, "grad_norm": 0.12070957571268082, "learning_rate": 2.8990360223236933e-06, "loss": 0.001, "step": 8094 }, { "epoch": 7.392694063926941, "grad_norm": 1.284860610961914, "learning_rate": 2.8980213089802135e-06, "loss": 0.0069, "step": 8095 }, { "epoch": 7.393607305936073, "grad_norm": 0.27946794033050537, "learning_rate": 2.8970065956367327e-06, "loss": 0.0019, "step": 8096 }, { "epoch": 7.394520547945206, "grad_norm": 2.321554660797119, "learning_rate": 2.8959918822932524e-06, "loss": 0.0147, "step": 8097 }, { "epoch": 7.395433789954338, "grad_norm": 0.6309124827384949, "learning_rate": 2.8949771689497717e-06, "loss": 0.003, "step": 8098 }, { "epoch": 7.3963470319634705, "grad_norm": 0.4622802436351776, "learning_rate": 2.8939624556062914e-06, "loss": 0.0025, "step": 8099 }, { "epoch": 7.397260273972603, "grad_norm": 0.026014892384409904, "learning_rate": 2.892947742262811e-06, "loss": 0.0001, "step": 8100 }, { "epoch": 7.3981735159817354, "grad_norm": 2.059298276901245, "learning_rate": 2.8919330289193303e-06, "loss": 0.0117, "step": 8101 }, { "epoch": 7.399086757990868, "grad_norm": 1.3369803428649902, "learning_rate": 2.89091831557585e-06, "loss": 0.0077, "step": 8102 }, { "epoch": 7.4, "grad_norm": 0.043525636196136475, "learning_rate": 2.8899036022323697e-06, "loss": 0.0003, "step": 8103 }, { "epoch": 7.400913242009133, "grad_norm": 0.2986733913421631, "learning_rate": 2.888888888888889e-06, "loss": 0.0021, "step": 8104 }, { "epoch": 7.401826484018265, "grad_norm": 1.6418901681900024, "learning_rate": 2.8878741755454087e-06, "loss": 0.014, "step": 8105 }, { "epoch": 7.402739726027397, "grad_norm": 0.08121638000011444, "learning_rate": 2.886859462201928e-06, "loss": 0.0006, "step": 8106 }, { "epoch": 7.403652968036529, "grad_norm": 41.03401565551758, "learning_rate": 2.885844748858448e-06, "loss": 0.2416, "step": 8107 }, { "epoch": 7.404566210045662, "grad_norm": 1.037777304649353, "learning_rate": 2.8848300355149673e-06, "loss": 0.004, "step": 8108 }, { "epoch": 7.405479452054794, "grad_norm": 0.08430422842502594, "learning_rate": 2.8838153221714866e-06, "loss": 0.0004, "step": 8109 }, { "epoch": 7.406392694063927, "grad_norm": 1.0904184579849243, "learning_rate": 2.8828006088280063e-06, "loss": 0.005, "step": 8110 }, { "epoch": 7.407305936073059, "grad_norm": 0.07012082636356354, "learning_rate": 2.8817858954845256e-06, "loss": 0.0003, "step": 8111 }, { "epoch": 7.4082191780821915, "grad_norm": 25.71303367614746, "learning_rate": 2.8807711821410457e-06, "loss": 0.2775, "step": 8112 }, { "epoch": 7.409132420091324, "grad_norm": 0.17773617804050446, "learning_rate": 2.879756468797565e-06, "loss": 0.001, "step": 8113 }, { "epoch": 7.4100456621004565, "grad_norm": 10.041451454162598, "learning_rate": 2.8787417554540842e-06, "loss": 0.1007, "step": 8114 }, { "epoch": 7.410958904109589, "grad_norm": 0.20465417206287384, "learning_rate": 2.877727042110604e-06, "loss": 0.0016, "step": 8115 }, { "epoch": 7.411872146118721, "grad_norm": 2.9724295139312744, "learning_rate": 2.876712328767123e-06, "loss": 0.008, "step": 8116 }, { "epoch": 7.412785388127854, "grad_norm": 0.19705836474895477, "learning_rate": 2.8756976154236433e-06, "loss": 0.0011, "step": 8117 }, { "epoch": 7.413698630136986, "grad_norm": 16.67189598083496, "learning_rate": 2.8746829020801626e-06, "loss": 0.1001, "step": 8118 }, { "epoch": 7.414611872146119, "grad_norm": 0.6218945384025574, "learning_rate": 2.873668188736682e-06, "loss": 0.0026, "step": 8119 }, { "epoch": 7.415525114155251, "grad_norm": 0.17731280624866486, "learning_rate": 2.8726534753932015e-06, "loss": 0.0011, "step": 8120 }, { "epoch": 7.416438356164384, "grad_norm": 0.04579748213291168, "learning_rate": 2.8716387620497212e-06, "loss": 0.0003, "step": 8121 }, { "epoch": 7.417351598173516, "grad_norm": 1.5878647565841675, "learning_rate": 2.870624048706241e-06, "loss": 0.0099, "step": 8122 }, { "epoch": 7.4182648401826485, "grad_norm": 0.11545510590076447, "learning_rate": 2.86960933536276e-06, "loss": 0.0008, "step": 8123 }, { "epoch": 7.419178082191781, "grad_norm": 0.035056255757808685, "learning_rate": 2.8685946220192794e-06, "loss": 0.0002, "step": 8124 }, { "epoch": 7.420091324200913, "grad_norm": 1.1370129585266113, "learning_rate": 2.8675799086757996e-06, "loss": 0.0051, "step": 8125 }, { "epoch": 7.421004566210046, "grad_norm": 0.06129039078950882, "learning_rate": 2.866565195332319e-06, "loss": 0.0005, "step": 8126 }, { "epoch": 7.421917808219178, "grad_norm": 0.0899893268942833, "learning_rate": 2.8655504819888385e-06, "loss": 0.0006, "step": 8127 }, { "epoch": 7.422831050228311, "grad_norm": 3.554486036300659, "learning_rate": 2.864535768645358e-06, "loss": 0.0173, "step": 8128 }, { "epoch": 7.423744292237443, "grad_norm": 1.102553129196167, "learning_rate": 2.863521055301877e-06, "loss": 0.0051, "step": 8129 }, { "epoch": 7.424657534246576, "grad_norm": 13.582088470458984, "learning_rate": 2.862506341958397e-06, "loss": 0.0786, "step": 8130 }, { "epoch": 7.425570776255708, "grad_norm": 0.21480314433574677, "learning_rate": 2.8614916286149164e-06, "loss": 0.0013, "step": 8131 }, { "epoch": 7.426484018264841, "grad_norm": 1.052895426750183, "learning_rate": 2.860476915271436e-06, "loss": 0.005, "step": 8132 }, { "epoch": 7.427397260273972, "grad_norm": 0.04843326658010483, "learning_rate": 2.8594622019279554e-06, "loss": 0.0004, "step": 8133 }, { "epoch": 7.428310502283105, "grad_norm": 0.1331028938293457, "learning_rate": 2.8584474885844747e-06, "loss": 0.0009, "step": 8134 }, { "epoch": 7.429223744292237, "grad_norm": 0.1941842883825302, "learning_rate": 2.857432775240995e-06, "loss": 0.0013, "step": 8135 }, { "epoch": 7.4301369863013695, "grad_norm": 0.039931513369083405, "learning_rate": 2.856418061897514e-06, "loss": 0.0003, "step": 8136 }, { "epoch": 7.431050228310502, "grad_norm": 0.6883305907249451, "learning_rate": 2.8554033485540337e-06, "loss": 0.0041, "step": 8137 }, { "epoch": 7.4319634703196344, "grad_norm": 2.4222402572631836, "learning_rate": 2.854388635210553e-06, "loss": 0.02, "step": 8138 }, { "epoch": 7.432876712328767, "grad_norm": 0.7260491251945496, "learning_rate": 2.8533739218670727e-06, "loss": 0.0027, "step": 8139 }, { "epoch": 7.433789954337899, "grad_norm": 0.3816387355327606, "learning_rate": 2.8523592085235924e-06, "loss": 0.0021, "step": 8140 }, { "epoch": 7.434703196347032, "grad_norm": 1.8038171529769897, "learning_rate": 2.8513444951801117e-06, "loss": 0.0085, "step": 8141 }, { "epoch": 7.435616438356164, "grad_norm": 1.1644870042800903, "learning_rate": 2.8503297818366314e-06, "loss": 0.0068, "step": 8142 }, { "epoch": 7.436529680365297, "grad_norm": 0.8504095077514648, "learning_rate": 2.849315068493151e-06, "loss": 0.0048, "step": 8143 }, { "epoch": 7.437442922374429, "grad_norm": 0.11194299906492233, "learning_rate": 2.8483003551496703e-06, "loss": 0.0005, "step": 8144 }, { "epoch": 7.438356164383562, "grad_norm": 0.3594039976596832, "learning_rate": 2.84728564180619e-06, "loss": 0.0028, "step": 8145 }, { "epoch": 7.439269406392694, "grad_norm": 2.9129581451416016, "learning_rate": 2.8462709284627093e-06, "loss": 0.0166, "step": 8146 }, { "epoch": 7.4401826484018265, "grad_norm": 6.219748497009277, "learning_rate": 2.8452562151192294e-06, "loss": 0.0469, "step": 8147 }, { "epoch": 7.441095890410959, "grad_norm": 7.1306681632995605, "learning_rate": 2.8442415017757487e-06, "loss": 0.0355, "step": 8148 }, { "epoch": 7.442009132420091, "grad_norm": 2.1389331817626953, "learning_rate": 2.843226788432268e-06, "loss": 0.011, "step": 8149 }, { "epoch": 7.442922374429224, "grad_norm": 0.6857337951660156, "learning_rate": 2.8422120750887876e-06, "loss": 0.0032, "step": 8150 }, { "epoch": 7.443835616438356, "grad_norm": 1.7966432571411133, "learning_rate": 2.841197361745307e-06, "loss": 0.0113, "step": 8151 }, { "epoch": 7.444748858447489, "grad_norm": 0.24445456266403198, "learning_rate": 2.840182648401827e-06, "loss": 0.0016, "step": 8152 }, { "epoch": 7.445662100456621, "grad_norm": 3.319450616836548, "learning_rate": 2.8391679350583463e-06, "loss": 0.0234, "step": 8153 }, { "epoch": 7.446575342465754, "grad_norm": 1.0755951404571533, "learning_rate": 2.8381532217148656e-06, "loss": 0.0061, "step": 8154 }, { "epoch": 7.447488584474886, "grad_norm": 1.8615394830703735, "learning_rate": 2.8371385083713852e-06, "loss": 0.0137, "step": 8155 }, { "epoch": 7.448401826484019, "grad_norm": 4.637876987457275, "learning_rate": 2.8361237950279045e-06, "loss": 0.0219, "step": 8156 }, { "epoch": 7.449315068493151, "grad_norm": 0.06710455566644669, "learning_rate": 2.8351090816844246e-06, "loss": 0.0004, "step": 8157 }, { "epoch": 7.4502283105022835, "grad_norm": 0.3284943699836731, "learning_rate": 2.834094368340944e-06, "loss": 0.0019, "step": 8158 }, { "epoch": 7.451141552511416, "grad_norm": 0.3370639681816101, "learning_rate": 2.833079654997463e-06, "loss": 0.0015, "step": 8159 }, { "epoch": 7.4520547945205475, "grad_norm": 0.041742272675037384, "learning_rate": 2.832064941653983e-06, "loss": 0.0003, "step": 8160 }, { "epoch": 7.45296803652968, "grad_norm": 2.307626485824585, "learning_rate": 2.8310502283105025e-06, "loss": 0.0094, "step": 8161 }, { "epoch": 7.453881278538812, "grad_norm": 8.67637825012207, "learning_rate": 2.8300355149670222e-06, "loss": 0.0656, "step": 8162 }, { "epoch": 7.454794520547945, "grad_norm": 12.990189552307129, "learning_rate": 2.8290208016235415e-06, "loss": 0.0556, "step": 8163 }, { "epoch": 7.455707762557077, "grad_norm": 0.41418012976646423, "learning_rate": 2.8280060882800608e-06, "loss": 0.0013, "step": 8164 }, { "epoch": 7.45662100456621, "grad_norm": 0.9407349228858948, "learning_rate": 2.826991374936581e-06, "loss": 0.005, "step": 8165 }, { "epoch": 7.457534246575342, "grad_norm": 0.8216632008552551, "learning_rate": 2.8259766615931e-06, "loss": 0.005, "step": 8166 }, { "epoch": 7.458447488584475, "grad_norm": 0.07189760357141495, "learning_rate": 2.82496194824962e-06, "loss": 0.0004, "step": 8167 }, { "epoch": 7.459360730593607, "grad_norm": 0.0477876141667366, "learning_rate": 2.823947234906139e-06, "loss": 0.0002, "step": 8168 }, { "epoch": 7.46027397260274, "grad_norm": 11.468174934387207, "learning_rate": 2.8229325215626584e-06, "loss": 0.0454, "step": 8169 }, { "epoch": 7.461187214611872, "grad_norm": 0.18783609569072723, "learning_rate": 2.8219178082191785e-06, "loss": 0.0009, "step": 8170 }, { "epoch": 7.4621004566210045, "grad_norm": 1.470978856086731, "learning_rate": 2.8209030948756978e-06, "loss": 0.0092, "step": 8171 }, { "epoch": 7.463013698630137, "grad_norm": 18.680593490600586, "learning_rate": 2.8198883815322175e-06, "loss": 0.1219, "step": 8172 }, { "epoch": 7.463926940639269, "grad_norm": 0.22622305154800415, "learning_rate": 2.8188736681887367e-06, "loss": 0.0013, "step": 8173 }, { "epoch": 7.464840182648402, "grad_norm": 0.5601993203163147, "learning_rate": 2.817858954845256e-06, "loss": 0.0039, "step": 8174 }, { "epoch": 7.465753424657534, "grad_norm": 49.21709060668945, "learning_rate": 2.816844241501776e-06, "loss": 0.2325, "step": 8175 }, { "epoch": 7.466666666666667, "grad_norm": 6.890961170196533, "learning_rate": 2.8158295281582954e-06, "loss": 0.0279, "step": 8176 }, { "epoch": 7.467579908675799, "grad_norm": 0.09046092629432678, "learning_rate": 2.814814814814815e-06, "loss": 0.0007, "step": 8177 }, { "epoch": 7.468493150684932, "grad_norm": 5.973878383636475, "learning_rate": 2.8138001014713343e-06, "loss": 0.033, "step": 8178 }, { "epoch": 7.469406392694064, "grad_norm": 14.187816619873047, "learning_rate": 2.812785388127854e-06, "loss": 0.0871, "step": 8179 }, { "epoch": 7.470319634703197, "grad_norm": 0.12775321304798126, "learning_rate": 2.8117706747843737e-06, "loss": 0.0007, "step": 8180 }, { "epoch": 7.471232876712329, "grad_norm": 66.03463745117188, "learning_rate": 2.810755961440893e-06, "loss": 0.62, "step": 8181 }, { "epoch": 7.4721461187214615, "grad_norm": 0.28302714228630066, "learning_rate": 2.8097412480974127e-06, "loss": 0.002, "step": 8182 }, { "epoch": 7.473059360730594, "grad_norm": 2.635448694229126, "learning_rate": 2.8087265347539324e-06, "loss": 0.0185, "step": 8183 }, { "epoch": 7.473972602739726, "grad_norm": 1.5983771085739136, "learning_rate": 2.8077118214104517e-06, "loss": 0.0088, "step": 8184 }, { "epoch": 7.474885844748858, "grad_norm": 0.016354979947209358, "learning_rate": 2.8066971080669713e-06, "loss": 0.0001, "step": 8185 }, { "epoch": 7.475799086757991, "grad_norm": 0.4979627728462219, "learning_rate": 2.8056823947234906e-06, "loss": 0.0035, "step": 8186 }, { "epoch": 7.476712328767123, "grad_norm": 8.294683456420898, "learning_rate": 2.8046676813800107e-06, "loss": 0.0617, "step": 8187 }, { "epoch": 7.477625570776255, "grad_norm": 0.25484463572502136, "learning_rate": 2.80365296803653e-06, "loss": 0.0015, "step": 8188 }, { "epoch": 7.478538812785388, "grad_norm": 0.19111870229244232, "learning_rate": 2.8026382546930493e-06, "loss": 0.0015, "step": 8189 }, { "epoch": 7.47945205479452, "grad_norm": 15.376707077026367, "learning_rate": 2.801623541349569e-06, "loss": 0.1012, "step": 8190 }, { "epoch": 7.480365296803653, "grad_norm": 12.451958656311035, "learning_rate": 2.8006088280060882e-06, "loss": 0.077, "step": 8191 }, { "epoch": 7.481278538812785, "grad_norm": 9.560721397399902, "learning_rate": 2.7995941146626083e-06, "loss": 0.0284, "step": 8192 }, { "epoch": 7.482191780821918, "grad_norm": 4.656206130981445, "learning_rate": 2.7985794013191276e-06, "loss": 0.0205, "step": 8193 }, { "epoch": 7.48310502283105, "grad_norm": 0.03939894586801529, "learning_rate": 2.797564687975647e-06, "loss": 0.0002, "step": 8194 }, { "epoch": 7.4840182648401825, "grad_norm": 0.05133925750851631, "learning_rate": 2.7965499746321666e-06, "loss": 0.0004, "step": 8195 }, { "epoch": 7.484931506849315, "grad_norm": 1.5203324556350708, "learning_rate": 2.795535261288686e-06, "loss": 0.0115, "step": 8196 }, { "epoch": 7.485844748858447, "grad_norm": 1.4827643632888794, "learning_rate": 2.794520547945206e-06, "loss": 0.0054, "step": 8197 }, { "epoch": 7.48675799086758, "grad_norm": 10.638294219970703, "learning_rate": 2.7935058346017252e-06, "loss": 0.0586, "step": 8198 }, { "epoch": 7.487671232876712, "grad_norm": 1.1190845966339111, "learning_rate": 2.7924911212582445e-06, "loss": 0.0071, "step": 8199 }, { "epoch": 7.488584474885845, "grad_norm": 0.47362008690834045, "learning_rate": 2.791476407914764e-06, "loss": 0.0019, "step": 8200 }, { "epoch": 7.489497716894977, "grad_norm": 1.6115986108779907, "learning_rate": 2.790461694571284e-06, "loss": 0.0091, "step": 8201 }, { "epoch": 7.49041095890411, "grad_norm": 0.11775849759578705, "learning_rate": 2.7894469812278036e-06, "loss": 0.0006, "step": 8202 }, { "epoch": 7.491324200913242, "grad_norm": 1.6195987462997437, "learning_rate": 2.788432267884323e-06, "loss": 0.0104, "step": 8203 }, { "epoch": 7.492237442922375, "grad_norm": 0.16935795545578003, "learning_rate": 2.787417554540842e-06, "loss": 0.0007, "step": 8204 }, { "epoch": 7.493150684931507, "grad_norm": 3.6299736499786377, "learning_rate": 2.7864028411973622e-06, "loss": 0.0157, "step": 8205 }, { "epoch": 7.4940639269406395, "grad_norm": 0.6133343577384949, "learning_rate": 2.7853881278538815e-06, "loss": 0.0036, "step": 8206 }, { "epoch": 7.494977168949772, "grad_norm": 6.040513038635254, "learning_rate": 2.784373414510401e-06, "loss": 0.0249, "step": 8207 }, { "epoch": 7.495890410958904, "grad_norm": 4.176270008087158, "learning_rate": 2.7833587011669205e-06, "loss": 0.0194, "step": 8208 }, { "epoch": 7.496803652968037, "grad_norm": 1.3404656648635864, "learning_rate": 2.7823439878234397e-06, "loss": 0.0059, "step": 8209 }, { "epoch": 7.497716894977169, "grad_norm": 32.29702377319336, "learning_rate": 2.78132927447996e-06, "loss": 0.2422, "step": 8210 }, { "epoch": 7.498630136986302, "grad_norm": 29.828325271606445, "learning_rate": 2.780314561136479e-06, "loss": 0.0433, "step": 8211 }, { "epoch": 7.499543378995433, "grad_norm": 27.12196159362793, "learning_rate": 2.779299847792999e-06, "loss": 0.0455, "step": 8212 }, { "epoch": 7.500456621004567, "grad_norm": 4.303749084472656, "learning_rate": 2.778285134449518e-06, "loss": 0.0298, "step": 8213 }, { "epoch": 7.501369863013698, "grad_norm": 1.1985366344451904, "learning_rate": 2.7772704211060373e-06, "loss": 0.0045, "step": 8214 }, { "epoch": 7.502283105022831, "grad_norm": 0.7403112053871155, "learning_rate": 2.7762557077625574e-06, "loss": 0.0046, "step": 8215 }, { "epoch": 7.503196347031963, "grad_norm": 10.837761878967285, "learning_rate": 2.7752409944190767e-06, "loss": 0.0597, "step": 8216 }, { "epoch": 7.504109589041096, "grad_norm": 3.2999520301818848, "learning_rate": 2.7742262810755964e-06, "loss": 0.0188, "step": 8217 }, { "epoch": 7.505022831050228, "grad_norm": 0.1561010479927063, "learning_rate": 2.7732115677321157e-06, "loss": 0.0008, "step": 8218 }, { "epoch": 7.5059360730593605, "grad_norm": 0.06129974126815796, "learning_rate": 2.7721968543886354e-06, "loss": 0.0003, "step": 8219 }, { "epoch": 7.506849315068493, "grad_norm": 0.28173312544822693, "learning_rate": 2.771182141045155e-06, "loss": 0.0012, "step": 8220 }, { "epoch": 7.507762557077625, "grad_norm": 11.330406188964844, "learning_rate": 2.7701674277016743e-06, "loss": 0.0711, "step": 8221 }, { "epoch": 7.508675799086758, "grad_norm": 42.07572937011719, "learning_rate": 2.769152714358194e-06, "loss": 0.3184, "step": 8222 }, { "epoch": 7.50958904109589, "grad_norm": 4.962165355682373, "learning_rate": 2.7681380010147137e-06, "loss": 0.0133, "step": 8223 }, { "epoch": 7.510502283105023, "grad_norm": 0.014529122970998287, "learning_rate": 2.767123287671233e-06, "loss": 0.0001, "step": 8224 }, { "epoch": 7.511415525114155, "grad_norm": 5.6781697273254395, "learning_rate": 2.7661085743277527e-06, "loss": 0.0335, "step": 8225 }, { "epoch": 7.512328767123288, "grad_norm": 0.8327934145927429, "learning_rate": 2.765093860984272e-06, "loss": 0.0047, "step": 8226 }, { "epoch": 7.51324200913242, "grad_norm": 1.563253402709961, "learning_rate": 2.764079147640792e-06, "loss": 0.0073, "step": 8227 }, { "epoch": 7.514155251141553, "grad_norm": 0.47053322196006775, "learning_rate": 2.7630644342973113e-06, "loss": 0.0035, "step": 8228 }, { "epoch": 7.515068493150685, "grad_norm": 1.3451818227767944, "learning_rate": 2.7620497209538306e-06, "loss": 0.0072, "step": 8229 }, { "epoch": 7.5159817351598175, "grad_norm": 0.4531514644622803, "learning_rate": 2.7610350076103503e-06, "loss": 0.0019, "step": 8230 }, { "epoch": 7.51689497716895, "grad_norm": 0.17666040360927582, "learning_rate": 2.7600202942668696e-06, "loss": 0.0007, "step": 8231 }, { "epoch": 7.517808219178082, "grad_norm": 15.782849311828613, "learning_rate": 2.7590055809233897e-06, "loss": 0.1097, "step": 8232 }, { "epoch": 7.518721461187215, "grad_norm": 0.15044033527374268, "learning_rate": 2.757990867579909e-06, "loss": 0.0006, "step": 8233 }, { "epoch": 7.519634703196347, "grad_norm": 3.171365261077881, "learning_rate": 2.756976154236428e-06, "loss": 0.0264, "step": 8234 }, { "epoch": 7.52054794520548, "grad_norm": 0.14201700687408447, "learning_rate": 2.755961440892948e-06, "loss": 0.0009, "step": 8235 }, { "epoch": 7.521461187214612, "grad_norm": 0.11955142021179199, "learning_rate": 2.754946727549467e-06, "loss": 0.0006, "step": 8236 }, { "epoch": 7.522374429223745, "grad_norm": 13.223183631896973, "learning_rate": 2.7539320142059873e-06, "loss": 0.0654, "step": 8237 }, { "epoch": 7.523287671232877, "grad_norm": 0.11290610581636429, "learning_rate": 2.7529173008625066e-06, "loss": 0.0009, "step": 8238 }, { "epoch": 7.524200913242009, "grad_norm": 0.6575518846511841, "learning_rate": 2.751902587519026e-06, "loss": 0.0053, "step": 8239 }, { "epoch": 7.525114155251142, "grad_norm": 0.3379969596862793, "learning_rate": 2.7508878741755455e-06, "loss": 0.0015, "step": 8240 }, { "epoch": 7.526027397260274, "grad_norm": 2.6685569286346436, "learning_rate": 2.749873160832065e-06, "loss": 0.0108, "step": 8241 }, { "epoch": 7.526940639269406, "grad_norm": 0.40291401743888855, "learning_rate": 2.748858447488585e-06, "loss": 0.0022, "step": 8242 }, { "epoch": 7.5278538812785385, "grad_norm": 11.02840805053711, "learning_rate": 2.747843734145104e-06, "loss": 0.0464, "step": 8243 }, { "epoch": 7.528767123287671, "grad_norm": 0.34471118450164795, "learning_rate": 2.7468290208016234e-06, "loss": 0.0024, "step": 8244 }, { "epoch": 7.529680365296803, "grad_norm": 0.4231560230255127, "learning_rate": 2.7458143074581436e-06, "loss": 0.0024, "step": 8245 }, { "epoch": 7.530593607305936, "grad_norm": 0.5374273061752319, "learning_rate": 2.744799594114663e-06, "loss": 0.0033, "step": 8246 }, { "epoch": 7.531506849315068, "grad_norm": 0.012694845907390118, "learning_rate": 2.7437848807711825e-06, "loss": 0.0001, "step": 8247 }, { "epoch": 7.532420091324201, "grad_norm": 0.43221747875213623, "learning_rate": 2.7427701674277018e-06, "loss": 0.0016, "step": 8248 }, { "epoch": 7.533333333333333, "grad_norm": 2.400636911392212, "learning_rate": 2.741755454084221e-06, "loss": 0.0159, "step": 8249 }, { "epoch": 7.534246575342466, "grad_norm": 0.12704220414161682, "learning_rate": 2.740740740740741e-06, "loss": 0.001, "step": 8250 }, { "epoch": 7.535159817351598, "grad_norm": 1.3353397846221924, "learning_rate": 2.7397260273972604e-06, "loss": 0.0087, "step": 8251 }, { "epoch": 7.536073059360731, "grad_norm": 0.0922151729464531, "learning_rate": 2.73871131405378e-06, "loss": 0.0005, "step": 8252 }, { "epoch": 7.536986301369863, "grad_norm": 0.22175182402133942, "learning_rate": 2.7376966007102994e-06, "loss": 0.0014, "step": 8253 }, { "epoch": 7.5378995433789955, "grad_norm": 5.5140533447265625, "learning_rate": 2.7366818873668187e-06, "loss": 0.0224, "step": 8254 }, { "epoch": 7.538812785388128, "grad_norm": 0.06852851808071136, "learning_rate": 2.7356671740233388e-06, "loss": 0.0003, "step": 8255 }, { "epoch": 7.53972602739726, "grad_norm": 0.49215370416641235, "learning_rate": 2.734652460679858e-06, "loss": 0.0018, "step": 8256 }, { "epoch": 7.540639269406393, "grad_norm": 0.003774341195821762, "learning_rate": 2.7336377473363777e-06, "loss": 0.0, "step": 8257 }, { "epoch": 7.541552511415525, "grad_norm": 2.4930312633514404, "learning_rate": 2.732623033992897e-06, "loss": 0.0166, "step": 8258 }, { "epoch": 7.542465753424658, "grad_norm": 0.07245662808418274, "learning_rate": 2.7316083206494167e-06, "loss": 0.0004, "step": 8259 }, { "epoch": 7.54337899543379, "grad_norm": 7.3391218185424805, "learning_rate": 2.7305936073059364e-06, "loss": 0.041, "step": 8260 }, { "epoch": 7.544292237442923, "grad_norm": 14.163814544677734, "learning_rate": 2.7295788939624557e-06, "loss": 0.0634, "step": 8261 }, { "epoch": 7.545205479452055, "grad_norm": 0.4210100471973419, "learning_rate": 2.7285641806189754e-06, "loss": 0.0022, "step": 8262 }, { "epoch": 7.546118721461188, "grad_norm": 0.14439545571804047, "learning_rate": 2.727549467275495e-06, "loss": 0.0009, "step": 8263 }, { "epoch": 7.54703196347032, "grad_norm": 1.1087387800216675, "learning_rate": 2.7265347539320143e-06, "loss": 0.0026, "step": 8264 }, { "epoch": 7.5479452054794525, "grad_norm": 0.3412896692752838, "learning_rate": 2.725520040588534e-06, "loss": 0.0018, "step": 8265 }, { "epoch": 7.548858447488584, "grad_norm": 5.274751663208008, "learning_rate": 2.7245053272450533e-06, "loss": 0.0214, "step": 8266 }, { "epoch": 7.549771689497717, "grad_norm": 101.43697357177734, "learning_rate": 2.7234906139015734e-06, "loss": 1.456, "step": 8267 }, { "epoch": 7.550684931506849, "grad_norm": 0.03636487200856209, "learning_rate": 2.7224759005580927e-06, "loss": 0.0002, "step": 8268 }, { "epoch": 7.551598173515981, "grad_norm": 3.217784881591797, "learning_rate": 2.721461187214612e-06, "loss": 0.0233, "step": 8269 }, { "epoch": 7.552511415525114, "grad_norm": 0.11900679767131805, "learning_rate": 2.7204464738711316e-06, "loss": 0.0005, "step": 8270 }, { "epoch": 7.553424657534246, "grad_norm": 1.210487961769104, "learning_rate": 2.719431760527651e-06, "loss": 0.0071, "step": 8271 }, { "epoch": 7.554337899543379, "grad_norm": 0.9827436804771423, "learning_rate": 2.718417047184171e-06, "loss": 0.0045, "step": 8272 }, { "epoch": 7.555251141552511, "grad_norm": 1.9906054735183716, "learning_rate": 2.7174023338406903e-06, "loss": 0.011, "step": 8273 }, { "epoch": 7.556164383561644, "grad_norm": 57.58992385864258, "learning_rate": 2.7163876204972095e-06, "loss": 0.5008, "step": 8274 }, { "epoch": 7.557077625570776, "grad_norm": 0.05802302435040474, "learning_rate": 2.7153729071537292e-06, "loss": 0.0003, "step": 8275 }, { "epoch": 7.557990867579909, "grad_norm": 0.3747788667678833, "learning_rate": 2.7143581938102485e-06, "loss": 0.0021, "step": 8276 }, { "epoch": 7.558904109589041, "grad_norm": 0.007253160700201988, "learning_rate": 2.7133434804667686e-06, "loss": 0.0001, "step": 8277 }, { "epoch": 7.5598173515981735, "grad_norm": 39.60477828979492, "learning_rate": 2.712328767123288e-06, "loss": 0.2276, "step": 8278 }, { "epoch": 7.560730593607306, "grad_norm": 1.3027029037475586, "learning_rate": 2.711314053779807e-06, "loss": 0.0096, "step": 8279 }, { "epoch": 7.561643835616438, "grad_norm": 30.95079803466797, "learning_rate": 2.710299340436327e-06, "loss": 0.1526, "step": 8280 }, { "epoch": 7.562557077625571, "grad_norm": 0.4620283842086792, "learning_rate": 2.7092846270928465e-06, "loss": 0.0033, "step": 8281 }, { "epoch": 7.563470319634703, "grad_norm": 0.1179104670882225, "learning_rate": 2.7082699137493662e-06, "loss": 0.0006, "step": 8282 }, { "epoch": 7.564383561643836, "grad_norm": 0.0528164878487587, "learning_rate": 2.7072552004058855e-06, "loss": 0.0004, "step": 8283 }, { "epoch": 7.565296803652968, "grad_norm": 0.0020384700037539005, "learning_rate": 2.7062404870624048e-06, "loss": 0.0, "step": 8284 }, { "epoch": 7.566210045662101, "grad_norm": 16.99085807800293, "learning_rate": 2.705225773718925e-06, "loss": 0.1414, "step": 8285 }, { "epoch": 7.567123287671233, "grad_norm": 0.2904977798461914, "learning_rate": 2.704211060375444e-06, "loss": 0.0019, "step": 8286 }, { "epoch": 7.5680365296803656, "grad_norm": 86.36994171142578, "learning_rate": 2.703196347031964e-06, "loss": 1.1189, "step": 8287 }, { "epoch": 7.568949771689498, "grad_norm": 8.307104110717773, "learning_rate": 2.702181633688483e-06, "loss": 0.0677, "step": 8288 }, { "epoch": 7.5698630136986305, "grad_norm": 0.2018166333436966, "learning_rate": 2.7011669203450024e-06, "loss": 0.0009, "step": 8289 }, { "epoch": 7.570776255707763, "grad_norm": 2.6043269634246826, "learning_rate": 2.7001522070015225e-06, "loss": 0.015, "step": 8290 }, { "epoch": 7.5716894977168945, "grad_norm": 1.5373021364212036, "learning_rate": 2.6991374936580418e-06, "loss": 0.0128, "step": 8291 }, { "epoch": 7.572602739726028, "grad_norm": 0.4276696741580963, "learning_rate": 2.6981227803145615e-06, "loss": 0.0022, "step": 8292 }, { "epoch": 7.573515981735159, "grad_norm": 0.714107096195221, "learning_rate": 2.6971080669710807e-06, "loss": 0.0039, "step": 8293 }, { "epoch": 7.574429223744293, "grad_norm": 2.0463783740997314, "learning_rate": 2.6960933536276e-06, "loss": 0.0106, "step": 8294 }, { "epoch": 7.575342465753424, "grad_norm": 0.08953824639320374, "learning_rate": 2.69507864028412e-06, "loss": 0.0007, "step": 8295 }, { "epoch": 7.576255707762557, "grad_norm": 0.09626312553882599, "learning_rate": 2.6940639269406394e-06, "loss": 0.0005, "step": 8296 }, { "epoch": 7.577168949771689, "grad_norm": 0.8605483770370483, "learning_rate": 2.693049213597159e-06, "loss": 0.0031, "step": 8297 }, { "epoch": 7.578082191780822, "grad_norm": 1.4891153573989868, "learning_rate": 2.6920345002536783e-06, "loss": 0.005, "step": 8298 }, { "epoch": 7.578995433789954, "grad_norm": 0.2754838466644287, "learning_rate": 2.691019786910198e-06, "loss": 0.0015, "step": 8299 }, { "epoch": 7.579908675799087, "grad_norm": 0.09332713484764099, "learning_rate": 2.6900050735667177e-06, "loss": 0.0005, "step": 8300 }, { "epoch": 7.580821917808219, "grad_norm": 0.5823605060577393, "learning_rate": 2.688990360223237e-06, "loss": 0.0037, "step": 8301 }, { "epoch": 7.5817351598173515, "grad_norm": 2.654937505722046, "learning_rate": 2.6879756468797567e-06, "loss": 0.0147, "step": 8302 }, { "epoch": 7.582648401826484, "grad_norm": 16.191370010375977, "learning_rate": 2.6869609335362764e-06, "loss": 0.0855, "step": 8303 }, { "epoch": 7.583561643835616, "grad_norm": 0.8262388110160828, "learning_rate": 2.6859462201927956e-06, "loss": 0.0038, "step": 8304 }, { "epoch": 7.584474885844749, "grad_norm": 0.730154275894165, "learning_rate": 2.6849315068493153e-06, "loss": 0.0047, "step": 8305 }, { "epoch": 7.585388127853881, "grad_norm": 4.928421974182129, "learning_rate": 2.6839167935058346e-06, "loss": 0.0076, "step": 8306 }, { "epoch": 7.586301369863014, "grad_norm": 0.30961501598358154, "learning_rate": 2.6829020801623547e-06, "loss": 0.0014, "step": 8307 }, { "epoch": 7.587214611872146, "grad_norm": 0.03062196634709835, "learning_rate": 2.681887366818874e-06, "loss": 0.0001, "step": 8308 }, { "epoch": 7.588127853881279, "grad_norm": 0.7554131746292114, "learning_rate": 2.6808726534753933e-06, "loss": 0.0057, "step": 8309 }, { "epoch": 7.589041095890411, "grad_norm": 8.379036903381348, "learning_rate": 2.679857940131913e-06, "loss": 0.039, "step": 8310 }, { "epoch": 7.5899543378995435, "grad_norm": 0.1422448456287384, "learning_rate": 2.6788432267884322e-06, "loss": 0.0008, "step": 8311 }, { "epoch": 7.590867579908676, "grad_norm": 0.13167433440685272, "learning_rate": 2.6778285134449523e-06, "loss": 0.0007, "step": 8312 }, { "epoch": 7.5917808219178085, "grad_norm": 6.781430721282959, "learning_rate": 2.6768138001014716e-06, "loss": 0.0336, "step": 8313 }, { "epoch": 7.592694063926941, "grad_norm": 0.8441151976585388, "learning_rate": 2.675799086757991e-06, "loss": 0.0046, "step": 8314 }, { "epoch": 7.593607305936073, "grad_norm": 0.08916211128234863, "learning_rate": 2.6747843734145106e-06, "loss": 0.0005, "step": 8315 }, { "epoch": 7.594520547945206, "grad_norm": 0.08482096344232559, "learning_rate": 2.67376966007103e-06, "loss": 0.0006, "step": 8316 }, { "epoch": 7.595433789954338, "grad_norm": 0.08044560998678207, "learning_rate": 2.67275494672755e-06, "loss": 0.0004, "step": 8317 }, { "epoch": 7.59634703196347, "grad_norm": 1.189170479774475, "learning_rate": 2.6717402333840692e-06, "loss": 0.0056, "step": 8318 }, { "epoch": 7.597260273972603, "grad_norm": 2.2428481578826904, "learning_rate": 2.6707255200405885e-06, "loss": 0.0143, "step": 8319 }, { "epoch": 7.598173515981735, "grad_norm": 0.26242586970329285, "learning_rate": 2.669710806697108e-06, "loss": 0.0012, "step": 8320 }, { "epoch": 7.599086757990867, "grad_norm": 0.10343587398529053, "learning_rate": 2.668696093353628e-06, "loss": 0.0004, "step": 8321 }, { "epoch": 7.6, "grad_norm": 0.2887439727783203, "learning_rate": 2.6676813800101476e-06, "loss": 0.0018, "step": 8322 }, { "epoch": 7.600913242009132, "grad_norm": 6.008877277374268, "learning_rate": 2.666666666666667e-06, "loss": 0.041, "step": 8323 }, { "epoch": 7.6018264840182646, "grad_norm": 0.5102372765541077, "learning_rate": 2.665651953323186e-06, "loss": 0.0025, "step": 8324 }, { "epoch": 7.602739726027397, "grad_norm": 17.445425033569336, "learning_rate": 2.6646372399797062e-06, "loss": 0.1808, "step": 8325 }, { "epoch": 7.6036529680365295, "grad_norm": 0.9414976239204407, "learning_rate": 2.6636225266362255e-06, "loss": 0.0079, "step": 8326 }, { "epoch": 7.604566210045662, "grad_norm": 0.7819756865501404, "learning_rate": 2.662607813292745e-06, "loss": 0.0062, "step": 8327 }, { "epoch": 7.605479452054794, "grad_norm": 0.022247204557061195, "learning_rate": 2.6615930999492644e-06, "loss": 0.0001, "step": 8328 }, { "epoch": 7.606392694063927, "grad_norm": 40.993621826171875, "learning_rate": 2.6605783866057837e-06, "loss": 0.2461, "step": 8329 }, { "epoch": 7.607305936073059, "grad_norm": 2.9548838138580322, "learning_rate": 2.659563673262304e-06, "loss": 0.0143, "step": 8330 }, { "epoch": 7.608219178082192, "grad_norm": 0.0369681790471077, "learning_rate": 2.658548959918823e-06, "loss": 0.0002, "step": 8331 }, { "epoch": 7.609132420091324, "grad_norm": 6.766128063201904, "learning_rate": 2.6575342465753428e-06, "loss": 0.0484, "step": 8332 }, { "epoch": 7.610045662100457, "grad_norm": 1.6721915006637573, "learning_rate": 2.656519533231862e-06, "loss": 0.0079, "step": 8333 }, { "epoch": 7.610958904109589, "grad_norm": 70.0914535522461, "learning_rate": 2.6555048198883813e-06, "loss": 0.3497, "step": 8334 }, { "epoch": 7.6118721461187215, "grad_norm": 1.8954081535339355, "learning_rate": 2.6544901065449014e-06, "loss": 0.0182, "step": 8335 }, { "epoch": 7.612785388127854, "grad_norm": 9.492293357849121, "learning_rate": 2.6534753932014207e-06, "loss": 0.0553, "step": 8336 }, { "epoch": 7.6136986301369864, "grad_norm": 0.17853184044361115, "learning_rate": 2.6524606798579404e-06, "loss": 0.0009, "step": 8337 }, { "epoch": 7.614611872146119, "grad_norm": 0.5271773338317871, "learning_rate": 2.6514459665144597e-06, "loss": 0.0027, "step": 8338 }, { "epoch": 7.615525114155251, "grad_norm": 11.896711349487305, "learning_rate": 2.6504312531709794e-06, "loss": 0.0611, "step": 8339 }, { "epoch": 7.616438356164384, "grad_norm": 2.3663384914398193, "learning_rate": 2.649416539827499e-06, "loss": 0.0113, "step": 8340 }, { "epoch": 7.617351598173516, "grad_norm": 6.786787033081055, "learning_rate": 2.6484018264840183e-06, "loss": 0.0361, "step": 8341 }, { "epoch": 7.618264840182649, "grad_norm": 0.01276128925383091, "learning_rate": 2.647387113140538e-06, "loss": 0.0001, "step": 8342 }, { "epoch": 7.619178082191781, "grad_norm": 0.2582683265209198, "learning_rate": 2.6463723997970577e-06, "loss": 0.0014, "step": 8343 }, { "epoch": 7.620091324200914, "grad_norm": 0.8750784993171692, "learning_rate": 2.645357686453577e-06, "loss": 0.0039, "step": 8344 }, { "epoch": 7.621004566210045, "grad_norm": 7.782978534698486, "learning_rate": 2.6443429731100967e-06, "loss": 0.0425, "step": 8345 }, { "epoch": 7.6219178082191785, "grad_norm": 26.36996841430664, "learning_rate": 2.643328259766616e-06, "loss": 0.1312, "step": 8346 }, { "epoch": 7.62283105022831, "grad_norm": 0.1319555938243866, "learning_rate": 2.642313546423136e-06, "loss": 0.0012, "step": 8347 }, { "epoch": 7.6237442922374425, "grad_norm": 5.7501220703125, "learning_rate": 2.6412988330796553e-06, "loss": 0.0435, "step": 8348 }, { "epoch": 7.624657534246575, "grad_norm": 2.710063934326172, "learning_rate": 2.6402841197361746e-06, "loss": 0.0154, "step": 8349 }, { "epoch": 7.6255707762557075, "grad_norm": 0.04603943973779678, "learning_rate": 2.6392694063926943e-06, "loss": 0.0002, "step": 8350 }, { "epoch": 7.62648401826484, "grad_norm": 0.20548194646835327, "learning_rate": 2.6382546930492135e-06, "loss": 0.0011, "step": 8351 }, { "epoch": 7.627397260273972, "grad_norm": 15.88722038269043, "learning_rate": 2.6372399797057337e-06, "loss": 0.013, "step": 8352 }, { "epoch": 7.628310502283105, "grad_norm": 1.9391990900039673, "learning_rate": 2.636225266362253e-06, "loss": 0.0131, "step": 8353 }, { "epoch": 7.629223744292237, "grad_norm": 113.84705352783203, "learning_rate": 2.635210553018772e-06, "loss": 1.2671, "step": 8354 }, { "epoch": 7.63013698630137, "grad_norm": 0.046496760100126266, "learning_rate": 2.634195839675292e-06, "loss": 0.0004, "step": 8355 }, { "epoch": 7.631050228310502, "grad_norm": 1.625762701034546, "learning_rate": 2.633181126331811e-06, "loss": 0.0121, "step": 8356 }, { "epoch": 7.631963470319635, "grad_norm": 0.8361044526100159, "learning_rate": 2.6321664129883313e-06, "loss": 0.0051, "step": 8357 }, { "epoch": 7.632876712328767, "grad_norm": 1.0161070823669434, "learning_rate": 2.6311516996448505e-06, "loss": 0.0049, "step": 8358 }, { "epoch": 7.6337899543378995, "grad_norm": 0.33394670486450195, "learning_rate": 2.63013698630137e-06, "loss": 0.0022, "step": 8359 }, { "epoch": 7.634703196347032, "grad_norm": 2.284257650375366, "learning_rate": 2.6291222729578895e-06, "loss": 0.0095, "step": 8360 }, { "epoch": 7.635616438356164, "grad_norm": 4.661179065704346, "learning_rate": 2.628107559614409e-06, "loss": 0.0271, "step": 8361 }, { "epoch": 7.636529680365297, "grad_norm": 0.12714523077011108, "learning_rate": 2.627092846270929e-06, "loss": 0.0009, "step": 8362 }, { "epoch": 7.637442922374429, "grad_norm": 0.292420893907547, "learning_rate": 2.626078132927448e-06, "loss": 0.0025, "step": 8363 }, { "epoch": 7.638356164383562, "grad_norm": 0.5019382834434509, "learning_rate": 2.6250634195839674e-06, "loss": 0.0035, "step": 8364 }, { "epoch": 7.639269406392694, "grad_norm": 0.9692977666854858, "learning_rate": 2.6240487062404875e-06, "loss": 0.0048, "step": 8365 }, { "epoch": 7.640182648401827, "grad_norm": 0.14559167623519897, "learning_rate": 2.623033992897007e-06, "loss": 0.001, "step": 8366 }, { "epoch": 7.641095890410959, "grad_norm": 0.04000614956021309, "learning_rate": 2.6220192795535265e-06, "loss": 0.0002, "step": 8367 }, { "epoch": 7.642009132420092, "grad_norm": 0.12189406901597977, "learning_rate": 2.6210045662100458e-06, "loss": 0.0007, "step": 8368 }, { "epoch": 7.642922374429224, "grad_norm": 0.21503448486328125, "learning_rate": 2.619989852866565e-06, "loss": 0.001, "step": 8369 }, { "epoch": 7.6438356164383565, "grad_norm": 0.7151636481285095, "learning_rate": 2.618975139523085e-06, "loss": 0.0036, "step": 8370 }, { "epoch": 7.644748858447489, "grad_norm": 0.2612222135066986, "learning_rate": 2.6179604261796044e-06, "loss": 0.0016, "step": 8371 }, { "epoch": 7.6456621004566205, "grad_norm": 0.5188589692115784, "learning_rate": 2.616945712836124e-06, "loss": 0.0037, "step": 8372 }, { "epoch": 7.646575342465754, "grad_norm": 0.009950070641934872, "learning_rate": 2.6159309994926434e-06, "loss": 0.0001, "step": 8373 }, { "epoch": 7.647488584474885, "grad_norm": 3.426220178604126, "learning_rate": 2.6149162861491627e-06, "loss": 0.0175, "step": 8374 }, { "epoch": 7.648401826484018, "grad_norm": 1.1813197135925293, "learning_rate": 2.6139015728056828e-06, "loss": 0.0059, "step": 8375 }, { "epoch": 7.64931506849315, "grad_norm": 0.23394174873828888, "learning_rate": 2.612886859462202e-06, "loss": 0.0018, "step": 8376 }, { "epoch": 7.650228310502283, "grad_norm": 0.2539435625076294, "learning_rate": 2.6118721461187217e-06, "loss": 0.001, "step": 8377 }, { "epoch": 7.651141552511415, "grad_norm": 0.22533415257930756, "learning_rate": 2.610857432775241e-06, "loss": 0.0013, "step": 8378 }, { "epoch": 7.652054794520548, "grad_norm": 9.22590446472168, "learning_rate": 2.6098427194317607e-06, "loss": 0.0474, "step": 8379 }, { "epoch": 7.65296803652968, "grad_norm": 1.191144585609436, "learning_rate": 2.6088280060882804e-06, "loss": 0.0092, "step": 8380 }, { "epoch": 7.653881278538813, "grad_norm": 78.500732421875, "learning_rate": 2.6078132927447997e-06, "loss": 0.5742, "step": 8381 }, { "epoch": 7.654794520547945, "grad_norm": 0.7322798371315002, "learning_rate": 2.6067985794013193e-06, "loss": 0.0045, "step": 8382 }, { "epoch": 7.6557077625570775, "grad_norm": 2.757389545440674, "learning_rate": 2.605783866057839e-06, "loss": 0.0122, "step": 8383 }, { "epoch": 7.65662100456621, "grad_norm": 0.318854421377182, "learning_rate": 2.6047691527143583e-06, "loss": 0.0009, "step": 8384 }, { "epoch": 7.657534246575342, "grad_norm": 3.2615535259246826, "learning_rate": 2.603754439370878e-06, "loss": 0.0176, "step": 8385 }, { "epoch": 7.658447488584475, "grad_norm": 4.666698455810547, "learning_rate": 2.6027397260273973e-06, "loss": 0.0211, "step": 8386 }, { "epoch": 7.659360730593607, "grad_norm": 11.848359107971191, "learning_rate": 2.6017250126839174e-06, "loss": 0.0604, "step": 8387 }, { "epoch": 7.66027397260274, "grad_norm": 4.157145977020264, "learning_rate": 2.6007102993404366e-06, "loss": 0.0198, "step": 8388 }, { "epoch": 7.661187214611872, "grad_norm": 5.9463419914245605, "learning_rate": 2.599695585996956e-06, "loss": 0.0285, "step": 8389 }, { "epoch": 7.662100456621005, "grad_norm": 0.5666002631187439, "learning_rate": 2.5986808726534756e-06, "loss": 0.0028, "step": 8390 }, { "epoch": 7.663013698630137, "grad_norm": 10.227089881896973, "learning_rate": 2.597666159309995e-06, "loss": 0.0491, "step": 8391 }, { "epoch": 7.66392694063927, "grad_norm": 0.04699371010065079, "learning_rate": 2.596651445966515e-06, "loss": 0.0004, "step": 8392 }, { "epoch": 7.664840182648402, "grad_norm": 21.15669059753418, "learning_rate": 2.5956367326230343e-06, "loss": 0.0987, "step": 8393 }, { "epoch": 7.6657534246575345, "grad_norm": 0.08056778460741043, "learning_rate": 2.5946220192795535e-06, "loss": 0.0006, "step": 8394 }, { "epoch": 7.666666666666667, "grad_norm": 0.5439457297325134, "learning_rate": 2.5936073059360732e-06, "loss": 0.0028, "step": 8395 }, { "epoch": 7.667579908675799, "grad_norm": 4.965622425079346, "learning_rate": 2.5925925925925925e-06, "loss": 0.0187, "step": 8396 }, { "epoch": 7.668493150684932, "grad_norm": 0.1288502961397171, "learning_rate": 2.5915778792491126e-06, "loss": 0.0008, "step": 8397 }, { "epoch": 7.669406392694064, "grad_norm": 0.14925949275493622, "learning_rate": 2.590563165905632e-06, "loss": 0.001, "step": 8398 }, { "epoch": 7.670319634703196, "grad_norm": 0.3413943350315094, "learning_rate": 2.589548452562151e-06, "loss": 0.0019, "step": 8399 }, { "epoch": 7.671232876712329, "grad_norm": 18.34480857849121, "learning_rate": 2.588533739218671e-06, "loss": 0.0339, "step": 8400 }, { "epoch": 7.672146118721461, "grad_norm": 0.07800988107919693, "learning_rate": 2.5875190258751905e-06, "loss": 0.0006, "step": 8401 }, { "epoch": 7.673059360730593, "grad_norm": 0.5203704237937927, "learning_rate": 2.5865043125317102e-06, "loss": 0.0033, "step": 8402 }, { "epoch": 7.673972602739726, "grad_norm": 0.3550419807434082, "learning_rate": 2.5854895991882295e-06, "loss": 0.0019, "step": 8403 }, { "epoch": 7.674885844748858, "grad_norm": 0.795004665851593, "learning_rate": 2.5844748858447488e-06, "loss": 0.0046, "step": 8404 }, { "epoch": 7.675799086757991, "grad_norm": 0.07125133275985718, "learning_rate": 2.583460172501269e-06, "loss": 0.0004, "step": 8405 }, { "epoch": 7.676712328767123, "grad_norm": 0.8695796728134155, "learning_rate": 2.582445459157788e-06, "loss": 0.006, "step": 8406 }, { "epoch": 7.6776255707762555, "grad_norm": 4.9680609703063965, "learning_rate": 2.581430745814308e-06, "loss": 0.0213, "step": 8407 }, { "epoch": 7.678538812785388, "grad_norm": 0.226698100566864, "learning_rate": 2.580416032470827e-06, "loss": 0.001, "step": 8408 }, { "epoch": 7.67945205479452, "grad_norm": 0.7331643104553223, "learning_rate": 2.5794013191273464e-06, "loss": 0.0048, "step": 8409 }, { "epoch": 7.680365296803653, "grad_norm": 14.540608406066895, "learning_rate": 2.5783866057838665e-06, "loss": 0.0717, "step": 8410 }, { "epoch": 7.681278538812785, "grad_norm": 5.7540459632873535, "learning_rate": 2.5773718924403858e-06, "loss": 0.0215, "step": 8411 }, { "epoch": 7.682191780821918, "grad_norm": 0.15012158453464508, "learning_rate": 2.5763571790969054e-06, "loss": 0.001, "step": 8412 }, { "epoch": 7.68310502283105, "grad_norm": 8.834001541137695, "learning_rate": 2.5753424657534247e-06, "loss": 0.0499, "step": 8413 }, { "epoch": 7.684018264840183, "grad_norm": 22.806833267211914, "learning_rate": 2.574327752409944e-06, "loss": 0.0699, "step": 8414 }, { "epoch": 7.684931506849315, "grad_norm": 10.569896697998047, "learning_rate": 2.573313039066464e-06, "loss": 0.0606, "step": 8415 }, { "epoch": 7.685844748858448, "grad_norm": 2.1004436016082764, "learning_rate": 2.5722983257229834e-06, "loss": 0.0158, "step": 8416 }, { "epoch": 7.68675799086758, "grad_norm": 0.22247549891471863, "learning_rate": 2.571283612379503e-06, "loss": 0.0011, "step": 8417 }, { "epoch": 7.6876712328767125, "grad_norm": 0.2514466345310211, "learning_rate": 2.5702688990360223e-06, "loss": 0.0021, "step": 8418 }, { "epoch": 7.688584474885845, "grad_norm": 4.844552040100098, "learning_rate": 2.569254185692542e-06, "loss": 0.0249, "step": 8419 }, { "epoch": 7.689497716894977, "grad_norm": 2.7757344245910645, "learning_rate": 2.5682394723490617e-06, "loss": 0.0158, "step": 8420 }, { "epoch": 7.69041095890411, "grad_norm": 0.1594405174255371, "learning_rate": 2.567224759005581e-06, "loss": 0.001, "step": 8421 }, { "epoch": 7.691324200913242, "grad_norm": 0.6578715443611145, "learning_rate": 2.5662100456621007e-06, "loss": 0.0037, "step": 8422 }, { "epoch": 7.692237442922375, "grad_norm": 0.21113601326942444, "learning_rate": 2.5651953323186204e-06, "loss": 0.001, "step": 8423 }, { "epoch": 7.693150684931507, "grad_norm": 0.3370320200920105, "learning_rate": 2.5641806189751396e-06, "loss": 0.002, "step": 8424 }, { "epoch": 7.69406392694064, "grad_norm": 0.2773151099681854, "learning_rate": 2.5631659056316593e-06, "loss": 0.002, "step": 8425 }, { "epoch": 7.694977168949771, "grad_norm": 0.7277477979660034, "learning_rate": 2.5621511922881786e-06, "loss": 0.0034, "step": 8426 }, { "epoch": 7.695890410958905, "grad_norm": 2.288574457168579, "learning_rate": 2.5611364789446987e-06, "loss": 0.0107, "step": 8427 }, { "epoch": 7.696803652968036, "grad_norm": 67.52792358398438, "learning_rate": 2.560121765601218e-06, "loss": 0.4152, "step": 8428 }, { "epoch": 7.697716894977169, "grad_norm": 0.1553686559200287, "learning_rate": 2.5591070522577372e-06, "loss": 0.0012, "step": 8429 }, { "epoch": 7.698630136986301, "grad_norm": 0.3256461024284363, "learning_rate": 2.558092338914257e-06, "loss": 0.0019, "step": 8430 }, { "epoch": 7.6995433789954335, "grad_norm": 51.55555725097656, "learning_rate": 2.557077625570776e-06, "loss": 0.3518, "step": 8431 }, { "epoch": 7.700456621004566, "grad_norm": 10.238226890563965, "learning_rate": 2.5560629122272963e-06, "loss": 0.0732, "step": 8432 }, { "epoch": 7.701369863013698, "grad_norm": 0.8339212536811829, "learning_rate": 2.5550481988838156e-06, "loss": 0.0049, "step": 8433 }, { "epoch": 7.702283105022831, "grad_norm": 5.594587802886963, "learning_rate": 2.554033485540335e-06, "loss": 0.0228, "step": 8434 }, { "epoch": 7.703196347031963, "grad_norm": 15.430371284484863, "learning_rate": 2.5530187721968546e-06, "loss": 0.1273, "step": 8435 }, { "epoch": 7.704109589041096, "grad_norm": 7.224018573760986, "learning_rate": 2.552004058853374e-06, "loss": 0.0258, "step": 8436 }, { "epoch": 7.705022831050228, "grad_norm": 19.764108657836914, "learning_rate": 2.550989345509894e-06, "loss": 0.0755, "step": 8437 }, { "epoch": 7.705936073059361, "grad_norm": 2.986060380935669, "learning_rate": 2.549974632166413e-06, "loss": 0.0084, "step": 8438 }, { "epoch": 7.706849315068493, "grad_norm": 1.3649523258209229, "learning_rate": 2.5489599188229325e-06, "loss": 0.0034, "step": 8439 }, { "epoch": 7.707762557077626, "grad_norm": 0.1169598400592804, "learning_rate": 2.547945205479452e-06, "loss": 0.0009, "step": 8440 }, { "epoch": 7.708675799086758, "grad_norm": 0.02696208469569683, "learning_rate": 2.546930492135972e-06, "loss": 0.0002, "step": 8441 }, { "epoch": 7.7095890410958905, "grad_norm": 0.16515782475471497, "learning_rate": 2.5459157787924915e-06, "loss": 0.0009, "step": 8442 }, { "epoch": 7.710502283105023, "grad_norm": 16.051626205444336, "learning_rate": 2.544901065449011e-06, "loss": 0.0801, "step": 8443 }, { "epoch": 7.711415525114155, "grad_norm": 0.2201981544494629, "learning_rate": 2.54388635210553e-06, "loss": 0.0012, "step": 8444 }, { "epoch": 7.712328767123288, "grad_norm": 0.04764382913708687, "learning_rate": 2.54287163876205e-06, "loss": 0.0002, "step": 8445 }, { "epoch": 7.71324200913242, "grad_norm": 139.74932861328125, "learning_rate": 2.5418569254185695e-06, "loss": 1.9036, "step": 8446 }, { "epoch": 7.714155251141553, "grad_norm": 0.051678046584129333, "learning_rate": 2.540842212075089e-06, "loss": 0.0003, "step": 8447 }, { "epoch": 7.715068493150685, "grad_norm": 0.6341903805732727, "learning_rate": 2.5398274987316084e-06, "loss": 0.0035, "step": 8448 }, { "epoch": 7.715981735159818, "grad_norm": 4.747585296630859, "learning_rate": 2.5388127853881277e-06, "loss": 0.0245, "step": 8449 }, { "epoch": 7.71689497716895, "grad_norm": 3.481029510498047, "learning_rate": 2.537798072044648e-06, "loss": 0.0181, "step": 8450 }, { "epoch": 7.717808219178083, "grad_norm": 5.069117069244385, "learning_rate": 2.536783358701167e-06, "loss": 0.0394, "step": 8451 }, { "epoch": 7.718721461187215, "grad_norm": 24.18891716003418, "learning_rate": 2.5357686453576868e-06, "loss": 0.1489, "step": 8452 }, { "epoch": 7.719634703196347, "grad_norm": 0.7428197264671326, "learning_rate": 2.534753932014206e-06, "loss": 0.0024, "step": 8453 }, { "epoch": 7.72054794520548, "grad_norm": 0.18653056025505066, "learning_rate": 2.5337392186707253e-06, "loss": 0.0014, "step": 8454 }, { "epoch": 7.7214611872146115, "grad_norm": 7.607517719268799, "learning_rate": 2.5327245053272454e-06, "loss": 0.0579, "step": 8455 }, { "epoch": 7.722374429223744, "grad_norm": 2.1277709007263184, "learning_rate": 2.5317097919837647e-06, "loss": 0.0138, "step": 8456 }, { "epoch": 7.723287671232876, "grad_norm": 3.522078514099121, "learning_rate": 2.5306950786402844e-06, "loss": 0.0151, "step": 8457 }, { "epoch": 7.724200913242009, "grad_norm": 0.305464506149292, "learning_rate": 2.5296803652968037e-06, "loss": 0.0017, "step": 8458 }, { "epoch": 7.725114155251141, "grad_norm": 0.3847109079360962, "learning_rate": 2.5286656519533234e-06, "loss": 0.0025, "step": 8459 }, { "epoch": 7.726027397260274, "grad_norm": 2.8873825073242188, "learning_rate": 2.527650938609843e-06, "loss": 0.0142, "step": 8460 }, { "epoch": 7.726940639269406, "grad_norm": 5.152431011199951, "learning_rate": 2.5266362252663623e-06, "loss": 0.0282, "step": 8461 }, { "epoch": 7.727853881278539, "grad_norm": 0.09205029159784317, "learning_rate": 2.525621511922882e-06, "loss": 0.0006, "step": 8462 }, { "epoch": 7.728767123287671, "grad_norm": 1.9539331197738647, "learning_rate": 2.5246067985794017e-06, "loss": 0.0094, "step": 8463 }, { "epoch": 7.729680365296804, "grad_norm": 0.883890688419342, "learning_rate": 2.523592085235921e-06, "loss": 0.0043, "step": 8464 }, { "epoch": 7.730593607305936, "grad_norm": 10.182547569274902, "learning_rate": 2.5225773718924407e-06, "loss": 0.0632, "step": 8465 }, { "epoch": 7.7315068493150685, "grad_norm": 4.71343994140625, "learning_rate": 2.52156265854896e-06, "loss": 0.0173, "step": 8466 }, { "epoch": 7.732420091324201, "grad_norm": 3.6208043098449707, "learning_rate": 2.52054794520548e-06, "loss": 0.0272, "step": 8467 }, { "epoch": 7.733333333333333, "grad_norm": 0.012074265629053116, "learning_rate": 2.5195332318619993e-06, "loss": 0.0001, "step": 8468 }, { "epoch": 7.734246575342466, "grad_norm": 9.015890121459961, "learning_rate": 2.5185185185185186e-06, "loss": 0.0456, "step": 8469 }, { "epoch": 7.735159817351598, "grad_norm": 60.18162155151367, "learning_rate": 2.5175038051750383e-06, "loss": 0.2924, "step": 8470 }, { "epoch": 7.736073059360731, "grad_norm": 1.2751342058181763, "learning_rate": 2.5164890918315575e-06, "loss": 0.0092, "step": 8471 }, { "epoch": 7.736986301369863, "grad_norm": 0.10970580577850342, "learning_rate": 2.5154743784880777e-06, "loss": 0.0007, "step": 8472 }, { "epoch": 7.737899543378996, "grad_norm": 0.3287184536457062, "learning_rate": 2.514459665144597e-06, "loss": 0.0023, "step": 8473 }, { "epoch": 7.738812785388128, "grad_norm": 3.0900442600250244, "learning_rate": 2.513444951801116e-06, "loss": 0.0213, "step": 8474 }, { "epoch": 7.739726027397261, "grad_norm": 12.750085830688477, "learning_rate": 2.512430238457636e-06, "loss": 0.0611, "step": 8475 }, { "epoch": 7.740639269406393, "grad_norm": 5.133533954620361, "learning_rate": 2.511415525114155e-06, "loss": 0.0256, "step": 8476 }, { "epoch": 7.7415525114155255, "grad_norm": 0.2583521604537964, "learning_rate": 2.5104008117706753e-06, "loss": 0.0014, "step": 8477 }, { "epoch": 7.742465753424657, "grad_norm": 0.06437887996435165, "learning_rate": 2.5093860984271945e-06, "loss": 0.0003, "step": 8478 }, { "epoch": 7.74337899543379, "grad_norm": 0.007185101509094238, "learning_rate": 2.508371385083714e-06, "loss": 0.0, "step": 8479 }, { "epoch": 7.744292237442922, "grad_norm": 0.04021313413977623, "learning_rate": 2.5073566717402335e-06, "loss": 0.0002, "step": 8480 }, { "epoch": 7.745205479452055, "grad_norm": 0.5942056775093079, "learning_rate": 2.506341958396753e-06, "loss": 0.0025, "step": 8481 }, { "epoch": 7.746118721461187, "grad_norm": 0.0566185787320137, "learning_rate": 2.505327245053273e-06, "loss": 0.0004, "step": 8482 }, { "epoch": 7.747031963470319, "grad_norm": 0.18528717756271362, "learning_rate": 2.504312531709792e-06, "loss": 0.0009, "step": 8483 }, { "epoch": 7.747945205479452, "grad_norm": 5.183837413787842, "learning_rate": 2.5032978183663114e-06, "loss": 0.027, "step": 8484 }, { "epoch": 7.748858447488584, "grad_norm": 66.2806396484375, "learning_rate": 2.5022831050228315e-06, "loss": 0.5516, "step": 8485 }, { "epoch": 7.749771689497717, "grad_norm": 0.2076462060213089, "learning_rate": 2.501268391679351e-06, "loss": 0.0011, "step": 8486 }, { "epoch": 7.750684931506849, "grad_norm": 1.2120089530944824, "learning_rate": 2.5002536783358705e-06, "loss": 0.0075, "step": 8487 }, { "epoch": 7.751598173515982, "grad_norm": 3.922614336013794, "learning_rate": 2.4992389649923898e-06, "loss": 0.0142, "step": 8488 }, { "epoch": 7.752511415525114, "grad_norm": 2.1122453212738037, "learning_rate": 2.4982242516489095e-06, "loss": 0.0098, "step": 8489 }, { "epoch": 7.7534246575342465, "grad_norm": 0.36176371574401855, "learning_rate": 2.497209538305429e-06, "loss": 0.0025, "step": 8490 }, { "epoch": 7.754337899543379, "grad_norm": 0.8190577626228333, "learning_rate": 2.4961948249619484e-06, "loss": 0.0049, "step": 8491 }, { "epoch": 7.755251141552511, "grad_norm": 1.5005244016647339, "learning_rate": 2.495180111618468e-06, "loss": 0.0112, "step": 8492 }, { "epoch": 7.756164383561644, "grad_norm": 0.2811049520969391, "learning_rate": 2.4941653982749874e-06, "loss": 0.0016, "step": 8493 }, { "epoch": 7.757077625570776, "grad_norm": 0.46648094058036804, "learning_rate": 2.493150684931507e-06, "loss": 0.0034, "step": 8494 }, { "epoch": 7.757990867579909, "grad_norm": 4.955049514770508, "learning_rate": 2.4921359715880268e-06, "loss": 0.0313, "step": 8495 }, { "epoch": 7.758904109589041, "grad_norm": 0.2854286730289459, "learning_rate": 2.491121258244546e-06, "loss": 0.0015, "step": 8496 }, { "epoch": 7.759817351598174, "grad_norm": 0.015767119824886322, "learning_rate": 2.4901065449010657e-06, "loss": 0.0001, "step": 8497 }, { "epoch": 7.760730593607306, "grad_norm": 10.315397262573242, "learning_rate": 2.489091831557585e-06, "loss": 0.0208, "step": 8498 }, { "epoch": 7.761643835616439, "grad_norm": 0.019565122202038765, "learning_rate": 2.4880771182141047e-06, "loss": 0.0001, "step": 8499 }, { "epoch": 7.762557077625571, "grad_norm": 0.1995522826910019, "learning_rate": 2.4870624048706244e-06, "loss": 0.0011, "step": 8500 }, { "epoch": 7.7634703196347035, "grad_norm": 4.1153669357299805, "learning_rate": 2.4860476915271436e-06, "loss": 0.0225, "step": 8501 }, { "epoch": 7.764383561643836, "grad_norm": 5.611885070800781, "learning_rate": 2.4850329781836633e-06, "loss": 0.0301, "step": 8502 }, { "epoch": 7.765296803652968, "grad_norm": 9.419595718383789, "learning_rate": 2.484018264840183e-06, "loss": 0.0488, "step": 8503 }, { "epoch": 7.766210045662101, "grad_norm": 0.42643824219703674, "learning_rate": 2.4830035514967023e-06, "loss": 0.0018, "step": 8504 }, { "epoch": 7.767123287671232, "grad_norm": 1.081204891204834, "learning_rate": 2.481988838153222e-06, "loss": 0.0069, "step": 8505 }, { "epoch": 7.768036529680366, "grad_norm": 2.80594539642334, "learning_rate": 2.4809741248097413e-06, "loss": 0.0171, "step": 8506 }, { "epoch": 7.768949771689497, "grad_norm": 149.44754028320312, "learning_rate": 2.479959411466261e-06, "loss": 6.0513, "step": 8507 }, { "epoch": 7.76986301369863, "grad_norm": 0.36081573367118835, "learning_rate": 2.4789446981227806e-06, "loss": 0.0024, "step": 8508 }, { "epoch": 7.770776255707762, "grad_norm": 0.22995656728744507, "learning_rate": 2.4779299847793e-06, "loss": 0.0014, "step": 8509 }, { "epoch": 7.771689497716895, "grad_norm": 0.267781525850296, "learning_rate": 2.4769152714358196e-06, "loss": 0.0016, "step": 8510 }, { "epoch": 7.772602739726027, "grad_norm": 61.822452545166016, "learning_rate": 2.475900558092339e-06, "loss": 0.3256, "step": 8511 }, { "epoch": 7.77351598173516, "grad_norm": 8.205767631530762, "learning_rate": 2.4748858447488586e-06, "loss": 0.0432, "step": 8512 }, { "epoch": 7.774429223744292, "grad_norm": 0.07434377819299698, "learning_rate": 2.4738711314053783e-06, "loss": 0.0003, "step": 8513 }, { "epoch": 7.7753424657534245, "grad_norm": 0.7066711783409119, "learning_rate": 2.472856418061898e-06, "loss": 0.0038, "step": 8514 }, { "epoch": 7.776255707762557, "grad_norm": 1.5691875219345093, "learning_rate": 2.4718417047184172e-06, "loss": 0.0096, "step": 8515 }, { "epoch": 7.777168949771689, "grad_norm": 1.4139987230300903, "learning_rate": 2.4708269913749365e-06, "loss": 0.0079, "step": 8516 }, { "epoch": 7.778082191780822, "grad_norm": 0.04853545501828194, "learning_rate": 2.469812278031456e-06, "loss": 0.0003, "step": 8517 }, { "epoch": 7.778995433789954, "grad_norm": 0.7633110284805298, "learning_rate": 2.468797564687976e-06, "loss": 0.0035, "step": 8518 }, { "epoch": 7.779908675799087, "grad_norm": 1.6762653589248657, "learning_rate": 2.4677828513444956e-06, "loss": 0.012, "step": 8519 }, { "epoch": 7.780821917808219, "grad_norm": 21.97261619567871, "learning_rate": 2.466768138001015e-06, "loss": 0.1343, "step": 8520 }, { "epoch": 7.781735159817352, "grad_norm": 0.029751453548669815, "learning_rate": 2.4657534246575345e-06, "loss": 0.0002, "step": 8521 }, { "epoch": 7.782648401826484, "grad_norm": 0.48336800932884216, "learning_rate": 2.4647387113140538e-06, "loss": 0.0023, "step": 8522 }, { "epoch": 7.7835616438356166, "grad_norm": 0.08586824685335159, "learning_rate": 2.4637239979705735e-06, "loss": 0.0004, "step": 8523 }, { "epoch": 7.784474885844749, "grad_norm": 2.567878246307373, "learning_rate": 2.462709284627093e-06, "loss": 0.0086, "step": 8524 }, { "epoch": 7.7853881278538815, "grad_norm": 0.6028957962989807, "learning_rate": 2.461694571283613e-06, "loss": 0.0025, "step": 8525 }, { "epoch": 7.786301369863014, "grad_norm": 1.1017076969146729, "learning_rate": 2.460679857940132e-06, "loss": 0.0057, "step": 8526 }, { "epoch": 7.787214611872146, "grad_norm": 19.810333251953125, "learning_rate": 2.4596651445966514e-06, "loss": 0.1003, "step": 8527 }, { "epoch": 7.788127853881279, "grad_norm": 0.15195408463478088, "learning_rate": 2.458650431253171e-06, "loss": 0.0009, "step": 8528 }, { "epoch": 7.789041095890411, "grad_norm": 6.325621128082275, "learning_rate": 2.4576357179096908e-06, "loss": 0.0402, "step": 8529 }, { "epoch": 7.789954337899544, "grad_norm": 0.4989190101623535, "learning_rate": 2.4566210045662105e-06, "loss": 0.0023, "step": 8530 }, { "epoch": 7.790867579908676, "grad_norm": 0.17869503796100616, "learning_rate": 2.4556062912227297e-06, "loss": 0.0011, "step": 8531 }, { "epoch": 7.791780821917808, "grad_norm": 2.6452999114990234, "learning_rate": 2.4545915778792494e-06, "loss": 0.0151, "step": 8532 }, { "epoch": 7.792694063926941, "grad_norm": 0.2868099808692932, "learning_rate": 2.4535768645357687e-06, "loss": 0.0015, "step": 8533 }, { "epoch": 7.793607305936073, "grad_norm": 68.56053161621094, "learning_rate": 2.4525621511922884e-06, "loss": 0.7063, "step": 8534 }, { "epoch": 7.794520547945205, "grad_norm": 1.1853443384170532, "learning_rate": 2.451547437848808e-06, "loss": 0.006, "step": 8535 }, { "epoch": 7.7954337899543376, "grad_norm": 16.813373565673828, "learning_rate": 2.4505327245053274e-06, "loss": 0.0742, "step": 8536 }, { "epoch": 7.79634703196347, "grad_norm": 77.65644836425781, "learning_rate": 2.449518011161847e-06, "loss": 0.5425, "step": 8537 }, { "epoch": 7.7972602739726025, "grad_norm": 0.47649675607681274, "learning_rate": 2.4485032978183663e-06, "loss": 0.0023, "step": 8538 }, { "epoch": 7.798173515981735, "grad_norm": 5.71759033203125, "learning_rate": 2.447488584474886e-06, "loss": 0.0246, "step": 8539 }, { "epoch": 7.799086757990867, "grad_norm": 3.171576499938965, "learning_rate": 2.4464738711314057e-06, "loss": 0.0164, "step": 8540 }, { "epoch": 7.8, "grad_norm": 2.1979799270629883, "learning_rate": 2.445459157787925e-06, "loss": 0.0109, "step": 8541 }, { "epoch": 7.800913242009132, "grad_norm": 0.4882452189922333, "learning_rate": 2.4444444444444447e-06, "loss": 0.0023, "step": 8542 }, { "epoch": 7.801826484018265, "grad_norm": 28.707136154174805, "learning_rate": 2.4434297311009644e-06, "loss": 0.1678, "step": 8543 }, { "epoch": 7.802739726027397, "grad_norm": 4.4097394943237305, "learning_rate": 2.4424150177574836e-06, "loss": 0.028, "step": 8544 }, { "epoch": 7.80365296803653, "grad_norm": 0.41438713669776917, "learning_rate": 2.4414003044140033e-06, "loss": 0.0036, "step": 8545 }, { "epoch": 7.804566210045662, "grad_norm": 24.521276473999023, "learning_rate": 2.4403855910705226e-06, "loss": 0.1924, "step": 8546 }, { "epoch": 7.8054794520547945, "grad_norm": 0.3707563579082489, "learning_rate": 2.4393708777270423e-06, "loss": 0.0026, "step": 8547 }, { "epoch": 7.806392694063927, "grad_norm": 0.7601795196533203, "learning_rate": 2.438356164383562e-06, "loss": 0.0042, "step": 8548 }, { "epoch": 7.8073059360730594, "grad_norm": 1.5056278705596924, "learning_rate": 2.4373414510400812e-06, "loss": 0.0086, "step": 8549 }, { "epoch": 7.808219178082192, "grad_norm": 0.2987646758556366, "learning_rate": 2.436326737696601e-06, "loss": 0.0026, "step": 8550 }, { "epoch": 7.809132420091324, "grad_norm": 1.7222774028778076, "learning_rate": 2.43531202435312e-06, "loss": 0.0076, "step": 8551 }, { "epoch": 7.810045662100457, "grad_norm": 1.679386019706726, "learning_rate": 2.43429731100964e-06, "loss": 0.0077, "step": 8552 }, { "epoch": 7.810958904109589, "grad_norm": 0.5536607503890991, "learning_rate": 2.4332825976661596e-06, "loss": 0.004, "step": 8553 }, { "epoch": 7.811872146118722, "grad_norm": 0.7961170077323914, "learning_rate": 2.4322678843226793e-06, "loss": 0.0047, "step": 8554 }, { "epoch": 7.812785388127854, "grad_norm": 1.0297402143478394, "learning_rate": 2.4312531709791985e-06, "loss": 0.0086, "step": 8555 }, { "epoch": 7.813698630136987, "grad_norm": 3.12801456451416, "learning_rate": 2.430238457635718e-06, "loss": 0.0154, "step": 8556 }, { "epoch": 7.814611872146119, "grad_norm": 1.2137601375579834, "learning_rate": 2.4292237442922375e-06, "loss": 0.008, "step": 8557 }, { "epoch": 7.8155251141552515, "grad_norm": 2.0854711532592773, "learning_rate": 2.428209030948757e-06, "loss": 0.011, "step": 8558 }, { "epoch": 7.816438356164383, "grad_norm": 39.652503967285156, "learning_rate": 2.427194317605277e-06, "loss": 0.2008, "step": 8559 }, { "epoch": 7.817351598173516, "grad_norm": 35.86441421508789, "learning_rate": 2.426179604261796e-06, "loss": 0.2156, "step": 8560 }, { "epoch": 7.818264840182648, "grad_norm": 0.5444911122322083, "learning_rate": 2.425164890918316e-06, "loss": 0.0033, "step": 8561 }, { "epoch": 7.8191780821917805, "grad_norm": 21.168132781982422, "learning_rate": 2.424150177574835e-06, "loss": 0.1439, "step": 8562 }, { "epoch": 7.820091324200913, "grad_norm": 17.410621643066406, "learning_rate": 2.423135464231355e-06, "loss": 0.1042, "step": 8563 }, { "epoch": 7.821004566210045, "grad_norm": 0.480133056640625, "learning_rate": 2.4221207508878745e-06, "loss": 0.0023, "step": 8564 }, { "epoch": 7.821917808219178, "grad_norm": 1.9197238683700562, "learning_rate": 2.421106037544394e-06, "loss": 0.007, "step": 8565 }, { "epoch": 7.82283105022831, "grad_norm": 0.042634062469005585, "learning_rate": 2.4200913242009135e-06, "loss": 0.0003, "step": 8566 }, { "epoch": 7.823744292237443, "grad_norm": 6.833116054534912, "learning_rate": 2.4190766108574327e-06, "loss": 0.0445, "step": 8567 }, { "epoch": 7.824657534246575, "grad_norm": 0.6224887371063232, "learning_rate": 2.4180618975139524e-06, "loss": 0.0031, "step": 8568 }, { "epoch": 7.825570776255708, "grad_norm": 0.30309563875198364, "learning_rate": 2.417047184170472e-06, "loss": 0.002, "step": 8569 }, { "epoch": 7.82648401826484, "grad_norm": 1.8004990816116333, "learning_rate": 2.416032470826992e-06, "loss": 0.0079, "step": 8570 }, { "epoch": 7.8273972602739725, "grad_norm": 0.027071576565504074, "learning_rate": 2.415017757483511e-06, "loss": 0.0002, "step": 8571 }, { "epoch": 7.828310502283105, "grad_norm": 13.093255996704102, "learning_rate": 2.4140030441400308e-06, "loss": 0.0639, "step": 8572 }, { "epoch": 7.829223744292237, "grad_norm": 0.0804869756102562, "learning_rate": 2.41298833079655e-06, "loss": 0.0006, "step": 8573 }, { "epoch": 7.83013698630137, "grad_norm": 0.9274415373802185, "learning_rate": 2.4119736174530697e-06, "loss": 0.0045, "step": 8574 }, { "epoch": 7.831050228310502, "grad_norm": 0.06839564442634583, "learning_rate": 2.4109589041095894e-06, "loss": 0.0004, "step": 8575 }, { "epoch": 7.831963470319635, "grad_norm": 0.3501833975315094, "learning_rate": 2.4099441907661087e-06, "loss": 0.0026, "step": 8576 }, { "epoch": 7.832876712328767, "grad_norm": 0.09305919706821442, "learning_rate": 2.4089294774226284e-06, "loss": 0.0004, "step": 8577 }, { "epoch": 7.8337899543379, "grad_norm": 4.302114009857178, "learning_rate": 2.4079147640791476e-06, "loss": 0.0303, "step": 8578 }, { "epoch": 7.834703196347032, "grad_norm": 0.23463605344295502, "learning_rate": 2.4069000507356673e-06, "loss": 0.002, "step": 8579 }, { "epoch": 7.835616438356165, "grad_norm": 1.3947502374649048, "learning_rate": 2.405885337392187e-06, "loss": 0.0126, "step": 8580 }, { "epoch": 7.836529680365297, "grad_norm": 0.004735069815069437, "learning_rate": 2.4048706240487063e-06, "loss": 0.0, "step": 8581 }, { "epoch": 7.8374429223744295, "grad_norm": 0.6025089025497437, "learning_rate": 2.403855910705226e-06, "loss": 0.004, "step": 8582 }, { "epoch": 7.838356164383562, "grad_norm": 57.85795974731445, "learning_rate": 2.4028411973617457e-06, "loss": 0.4483, "step": 8583 }, { "epoch": 7.839269406392694, "grad_norm": 19.158016204833984, "learning_rate": 2.401826484018265e-06, "loss": 0.1199, "step": 8584 }, { "epoch": 7.840182648401827, "grad_norm": 1.9917467832565308, "learning_rate": 2.4008117706747846e-06, "loss": 0.0094, "step": 8585 }, { "epoch": 7.8410958904109584, "grad_norm": 6.616394996643066, "learning_rate": 2.399797057331304e-06, "loss": 0.0417, "step": 8586 }, { "epoch": 7.842009132420092, "grad_norm": 0.4884119927883148, "learning_rate": 2.3987823439878236e-06, "loss": 0.0027, "step": 8587 }, { "epoch": 7.842922374429223, "grad_norm": 0.10209114104509354, "learning_rate": 2.3977676306443433e-06, "loss": 0.0007, "step": 8588 }, { "epoch": 7.843835616438356, "grad_norm": 0.09727859497070312, "learning_rate": 2.3967529173008626e-06, "loss": 0.0005, "step": 8589 }, { "epoch": 7.844748858447488, "grad_norm": 1.1001609563827515, "learning_rate": 2.3957382039573823e-06, "loss": 0.0062, "step": 8590 }, { "epoch": 7.845662100456621, "grad_norm": 2.6741039752960205, "learning_rate": 2.3947234906139015e-06, "loss": 0.0111, "step": 8591 }, { "epoch": 7.846575342465753, "grad_norm": 1.250282645225525, "learning_rate": 2.3937087772704212e-06, "loss": 0.0063, "step": 8592 }, { "epoch": 7.847488584474886, "grad_norm": 0.2085396647453308, "learning_rate": 2.392694063926941e-06, "loss": 0.0012, "step": 8593 }, { "epoch": 7.848401826484018, "grad_norm": 1.757813572883606, "learning_rate": 2.3916793505834606e-06, "loss": 0.0124, "step": 8594 }, { "epoch": 7.8493150684931505, "grad_norm": 21.607744216918945, "learning_rate": 2.39066463723998e-06, "loss": 0.2379, "step": 8595 }, { "epoch": 7.850228310502283, "grad_norm": 0.18599310517311096, "learning_rate": 2.389649923896499e-06, "loss": 0.0012, "step": 8596 }, { "epoch": 7.851141552511415, "grad_norm": 0.15644806623458862, "learning_rate": 2.388635210553019e-06, "loss": 0.0009, "step": 8597 }, { "epoch": 7.852054794520548, "grad_norm": 14.248482704162598, "learning_rate": 2.3876204972095385e-06, "loss": 0.0768, "step": 8598 }, { "epoch": 7.85296803652968, "grad_norm": 2.248025417327881, "learning_rate": 2.3866057838660582e-06, "loss": 0.0049, "step": 8599 }, { "epoch": 7.853881278538813, "grad_norm": 0.14031760394573212, "learning_rate": 2.3855910705225775e-06, "loss": 0.001, "step": 8600 }, { "epoch": 7.854794520547945, "grad_norm": 0.6231405138969421, "learning_rate": 2.384576357179097e-06, "loss": 0.0041, "step": 8601 }, { "epoch": 7.855707762557078, "grad_norm": 1.7864984273910522, "learning_rate": 2.3835616438356164e-06, "loss": 0.0081, "step": 8602 }, { "epoch": 7.85662100456621, "grad_norm": 0.8049283623695374, "learning_rate": 2.382546930492136e-06, "loss": 0.0044, "step": 8603 }, { "epoch": 7.857534246575343, "grad_norm": 0.45018577575683594, "learning_rate": 2.381532217148656e-06, "loss": 0.0025, "step": 8604 }, { "epoch": 7.858447488584475, "grad_norm": 3.7729945182800293, "learning_rate": 2.3805175038051755e-06, "loss": 0.0182, "step": 8605 }, { "epoch": 7.8593607305936075, "grad_norm": 0.11737528443336487, "learning_rate": 2.379502790461695e-06, "loss": 0.0007, "step": 8606 }, { "epoch": 7.86027397260274, "grad_norm": 14.902261734008789, "learning_rate": 2.378488077118214e-06, "loss": 0.1002, "step": 8607 }, { "epoch": 7.861187214611872, "grad_norm": 0.12886297702789307, "learning_rate": 2.3774733637747338e-06, "loss": 0.0008, "step": 8608 }, { "epoch": 7.862100456621005, "grad_norm": 1.2882157564163208, "learning_rate": 2.3764586504312534e-06, "loss": 0.0084, "step": 8609 }, { "epoch": 7.863013698630137, "grad_norm": 1.9093855619430542, "learning_rate": 2.375443937087773e-06, "loss": 0.0096, "step": 8610 }, { "epoch": 7.86392694063927, "grad_norm": 0.19366833567619324, "learning_rate": 2.3744292237442924e-06, "loss": 0.0013, "step": 8611 }, { "epoch": 7.864840182648402, "grad_norm": 0.24874430894851685, "learning_rate": 2.373414510400812e-06, "loss": 0.0014, "step": 8612 }, { "epoch": 7.865753424657534, "grad_norm": 0.3686520457267761, "learning_rate": 2.3723997970573314e-06, "loss": 0.0027, "step": 8613 }, { "epoch": 7.866666666666667, "grad_norm": 1.4487229585647583, "learning_rate": 2.371385083713851e-06, "loss": 0.0125, "step": 8614 }, { "epoch": 7.867579908675799, "grad_norm": 6.987934112548828, "learning_rate": 2.3703703703703707e-06, "loss": 0.0467, "step": 8615 }, { "epoch": 7.868493150684931, "grad_norm": 1.0132896900177002, "learning_rate": 2.36935565702689e-06, "loss": 0.0062, "step": 8616 }, { "epoch": 7.869406392694064, "grad_norm": 0.3744325637817383, "learning_rate": 2.3683409436834097e-06, "loss": 0.0025, "step": 8617 }, { "epoch": 7.870319634703196, "grad_norm": 0.7948901653289795, "learning_rate": 2.367326230339929e-06, "loss": 0.0056, "step": 8618 }, { "epoch": 7.8712328767123285, "grad_norm": 1.0745128393173218, "learning_rate": 2.3663115169964487e-06, "loss": 0.0055, "step": 8619 }, { "epoch": 7.872146118721461, "grad_norm": 3.2417335510253906, "learning_rate": 2.3652968036529684e-06, "loss": 0.0223, "step": 8620 }, { "epoch": 7.873059360730593, "grad_norm": 18.018705368041992, "learning_rate": 2.3642820903094876e-06, "loss": 0.0868, "step": 8621 }, { "epoch": 7.873972602739726, "grad_norm": 0.04386872798204422, "learning_rate": 2.3632673769660073e-06, "loss": 0.0002, "step": 8622 }, { "epoch": 7.874885844748858, "grad_norm": 0.059822872281074524, "learning_rate": 2.362252663622527e-06, "loss": 0.0004, "step": 8623 }, { "epoch": 7.875799086757991, "grad_norm": 2.3520195484161377, "learning_rate": 2.3612379502790463e-06, "loss": 0.0126, "step": 8624 }, { "epoch": 7.876712328767123, "grad_norm": 1.3887853622436523, "learning_rate": 2.360223236935566e-06, "loss": 0.0085, "step": 8625 }, { "epoch": 7.877625570776256, "grad_norm": 0.39965343475341797, "learning_rate": 2.3592085235920852e-06, "loss": 0.0024, "step": 8626 }, { "epoch": 7.878538812785388, "grad_norm": 0.026942281052470207, "learning_rate": 2.358193810248605e-06, "loss": 0.0001, "step": 8627 }, { "epoch": 7.879452054794521, "grad_norm": 0.43493884801864624, "learning_rate": 2.3571790969051246e-06, "loss": 0.0022, "step": 8628 }, { "epoch": 7.880365296803653, "grad_norm": 6.669591426849365, "learning_rate": 2.356164383561644e-06, "loss": 0.0524, "step": 8629 }, { "epoch": 7.8812785388127855, "grad_norm": 3.1628549098968506, "learning_rate": 2.3551496702181636e-06, "loss": 0.0183, "step": 8630 }, { "epoch": 7.882191780821918, "grad_norm": 0.7831675410270691, "learning_rate": 2.354134956874683e-06, "loss": 0.0052, "step": 8631 }, { "epoch": 7.88310502283105, "grad_norm": 0.28771525621414185, "learning_rate": 2.3531202435312025e-06, "loss": 0.0021, "step": 8632 }, { "epoch": 7.884018264840183, "grad_norm": 0.020392581820487976, "learning_rate": 2.3521055301877222e-06, "loss": 0.0001, "step": 8633 }, { "epoch": 7.884931506849315, "grad_norm": 1.506357192993164, "learning_rate": 2.351090816844242e-06, "loss": 0.0093, "step": 8634 }, { "epoch": 7.885844748858448, "grad_norm": 1.1452014446258545, "learning_rate": 2.350076103500761e-06, "loss": 0.0074, "step": 8635 }, { "epoch": 7.88675799086758, "grad_norm": 44.0944709777832, "learning_rate": 2.3490613901572805e-06, "loss": 0.3939, "step": 8636 }, { "epoch": 7.887671232876713, "grad_norm": 1.8767802715301514, "learning_rate": 2.3480466768138e-06, "loss": 0.0081, "step": 8637 }, { "epoch": 7.888584474885845, "grad_norm": 0.08108679950237274, "learning_rate": 2.34703196347032e-06, "loss": 0.0004, "step": 8638 }, { "epoch": 7.889497716894978, "grad_norm": 0.5094614624977112, "learning_rate": 2.3460172501268395e-06, "loss": 0.0002, "step": 8639 }, { "epoch": 7.890410958904109, "grad_norm": 0.01573014445602894, "learning_rate": 2.345002536783359e-06, "loss": 0.0001, "step": 8640 }, { "epoch": 7.8913242009132425, "grad_norm": 0.14107799530029297, "learning_rate": 2.343987823439878e-06, "loss": 0.0008, "step": 8641 }, { "epoch": 7.892237442922374, "grad_norm": 2.0626773834228516, "learning_rate": 2.3429731100963978e-06, "loss": 0.0132, "step": 8642 }, { "epoch": 7.8931506849315065, "grad_norm": 0.5450036525726318, "learning_rate": 2.3419583967529175e-06, "loss": 0.0027, "step": 8643 }, { "epoch": 7.894063926940639, "grad_norm": 0.16824983060359955, "learning_rate": 2.340943683409437e-06, "loss": 0.0011, "step": 8644 }, { "epoch": 7.894977168949771, "grad_norm": 6.812829971313477, "learning_rate": 2.339928970065957e-06, "loss": 0.0473, "step": 8645 }, { "epoch": 7.895890410958904, "grad_norm": 0.2691633999347687, "learning_rate": 2.338914256722476e-06, "loss": 0.0015, "step": 8646 }, { "epoch": 7.896803652968036, "grad_norm": 127.51415252685547, "learning_rate": 2.3378995433789954e-06, "loss": 2.0563, "step": 8647 }, { "epoch": 7.897716894977169, "grad_norm": 0.5588159561157227, "learning_rate": 2.336884830035515e-06, "loss": 0.003, "step": 8648 }, { "epoch": 7.898630136986301, "grad_norm": 6.737799644470215, "learning_rate": 2.3358701166920348e-06, "loss": 0.0491, "step": 8649 }, { "epoch": 7.899543378995434, "grad_norm": 2.6937692165374756, "learning_rate": 2.3348554033485545e-06, "loss": 0.0103, "step": 8650 }, { "epoch": 7.900456621004566, "grad_norm": 2.184013843536377, "learning_rate": 2.3338406900050737e-06, "loss": 0.009, "step": 8651 }, { "epoch": 7.901369863013699, "grad_norm": 3.928253173828125, "learning_rate": 2.3328259766615934e-06, "loss": 0.0181, "step": 8652 }, { "epoch": 7.902283105022831, "grad_norm": 1.1347641944885254, "learning_rate": 2.3318112633181127e-06, "loss": 0.0069, "step": 8653 }, { "epoch": 7.9031963470319635, "grad_norm": 1.7182731628417969, "learning_rate": 2.3307965499746324e-06, "loss": 0.0116, "step": 8654 }, { "epoch": 7.904109589041096, "grad_norm": 1.2105790376663208, "learning_rate": 2.329781836631152e-06, "loss": 0.0058, "step": 8655 }, { "epoch": 7.905022831050228, "grad_norm": 0.05096876621246338, "learning_rate": 2.3287671232876713e-06, "loss": 0.0003, "step": 8656 }, { "epoch": 7.905936073059361, "grad_norm": 0.42546164989471436, "learning_rate": 2.327752409944191e-06, "loss": 0.003, "step": 8657 }, { "epoch": 7.906849315068493, "grad_norm": 0.2573600113391876, "learning_rate": 2.3267376966007103e-06, "loss": 0.0015, "step": 8658 }, { "epoch": 7.907762557077626, "grad_norm": 0.036481987684965134, "learning_rate": 2.32572298325723e-06, "loss": 0.0002, "step": 8659 }, { "epoch": 7.908675799086758, "grad_norm": 0.14605712890625, "learning_rate": 2.3247082699137497e-06, "loss": 0.0011, "step": 8660 }, { "epoch": 7.909589041095891, "grad_norm": 0.21132738888263702, "learning_rate": 2.323693556570269e-06, "loss": 0.0014, "step": 8661 }, { "epoch": 7.910502283105023, "grad_norm": 0.32445773482322693, "learning_rate": 2.3226788432267887e-06, "loss": 0.0015, "step": 8662 }, { "epoch": 7.911415525114156, "grad_norm": 1.974599838256836, "learning_rate": 2.3216641298833083e-06, "loss": 0.016, "step": 8663 }, { "epoch": 7.912328767123288, "grad_norm": 0.00903486367315054, "learning_rate": 2.3206494165398276e-06, "loss": 0.0001, "step": 8664 }, { "epoch": 7.91324200913242, "grad_norm": 0.08537925779819489, "learning_rate": 2.3196347031963473e-06, "loss": 0.0004, "step": 8665 }, { "epoch": 7.914155251141553, "grad_norm": 0.35153672099113464, "learning_rate": 2.3186199898528666e-06, "loss": 0.0019, "step": 8666 }, { "epoch": 7.9150684931506845, "grad_norm": 12.014708518981934, "learning_rate": 2.3176052765093863e-06, "loss": 0.0542, "step": 8667 }, { "epoch": 7.915981735159818, "grad_norm": 4.257312774658203, "learning_rate": 2.316590563165906e-06, "loss": 0.0213, "step": 8668 }, { "epoch": 7.916894977168949, "grad_norm": 1.217321753501892, "learning_rate": 2.3155758498224252e-06, "loss": 0.0078, "step": 8669 }, { "epoch": 7.917808219178082, "grad_norm": 11.762221336364746, "learning_rate": 2.314561136478945e-06, "loss": 0.0556, "step": 8670 }, { "epoch": 7.918721461187214, "grad_norm": 7.731950283050537, "learning_rate": 2.313546423135464e-06, "loss": 0.0364, "step": 8671 }, { "epoch": 7.919634703196347, "grad_norm": 1.2634953260421753, "learning_rate": 2.312531709791984e-06, "loss": 0.0076, "step": 8672 }, { "epoch": 7.920547945205479, "grad_norm": 0.07754623144865036, "learning_rate": 2.3115169964485036e-06, "loss": 0.0004, "step": 8673 }, { "epoch": 7.921461187214612, "grad_norm": 6.456605911254883, "learning_rate": 2.3105022831050233e-06, "loss": 0.0401, "step": 8674 }, { "epoch": 7.922374429223744, "grad_norm": 1.6628679037094116, "learning_rate": 2.3094875697615425e-06, "loss": 0.007, "step": 8675 }, { "epoch": 7.923287671232877, "grad_norm": 0.331478476524353, "learning_rate": 2.308472856418062e-06, "loss": 0.0021, "step": 8676 }, { "epoch": 7.924200913242009, "grad_norm": 0.02848348207771778, "learning_rate": 2.3074581430745815e-06, "loss": 0.0002, "step": 8677 }, { "epoch": 7.9251141552511415, "grad_norm": 0.7820529341697693, "learning_rate": 2.306443429731101e-06, "loss": 0.0041, "step": 8678 }, { "epoch": 7.926027397260274, "grad_norm": 1.502226710319519, "learning_rate": 2.305428716387621e-06, "loss": 0.0102, "step": 8679 }, { "epoch": 7.926940639269406, "grad_norm": 0.5146804451942444, "learning_rate": 2.30441400304414e-06, "loss": 0.0016, "step": 8680 }, { "epoch": 7.927853881278539, "grad_norm": 22.42821502685547, "learning_rate": 2.3033992897006594e-06, "loss": 0.1344, "step": 8681 }, { "epoch": 7.928767123287671, "grad_norm": 0.765306830406189, "learning_rate": 2.302384576357179e-06, "loss": 0.0048, "step": 8682 }, { "epoch": 7.929680365296804, "grad_norm": 14.177045822143555, "learning_rate": 2.301369863013699e-06, "loss": 0.1046, "step": 8683 }, { "epoch": 7.930593607305936, "grad_norm": 0.10759501904249191, "learning_rate": 2.3003551496702185e-06, "loss": 0.0007, "step": 8684 }, { "epoch": 7.931506849315069, "grad_norm": 65.41170501708984, "learning_rate": 2.299340436326738e-06, "loss": 0.7056, "step": 8685 }, { "epoch": 7.932420091324201, "grad_norm": 0.11246052384376526, "learning_rate": 2.2983257229832575e-06, "loss": 0.0007, "step": 8686 }, { "epoch": 7.933333333333334, "grad_norm": 0.4308805465698242, "learning_rate": 2.2973110096397767e-06, "loss": 0.0028, "step": 8687 }, { "epoch": 7.934246575342466, "grad_norm": 0.25941792130470276, "learning_rate": 2.2962962962962964e-06, "loss": 0.0019, "step": 8688 }, { "epoch": 7.9351598173515985, "grad_norm": 0.6312593221664429, "learning_rate": 2.295281582952816e-06, "loss": 0.0042, "step": 8689 }, { "epoch": 7.936073059360731, "grad_norm": 1.5359779596328735, "learning_rate": 2.294266869609336e-06, "loss": 0.0089, "step": 8690 }, { "epoch": 7.936986301369863, "grad_norm": 0.426585853099823, "learning_rate": 2.293252156265855e-06, "loss": 0.0025, "step": 8691 }, { "epoch": 7.937899543378995, "grad_norm": 46.41415786743164, "learning_rate": 2.2922374429223748e-06, "loss": 0.3748, "step": 8692 }, { "epoch": 7.938812785388128, "grad_norm": 0.777534544467926, "learning_rate": 2.291222729578894e-06, "loss": 0.0035, "step": 8693 }, { "epoch": 7.93972602739726, "grad_norm": 0.1706392467021942, "learning_rate": 2.2902080162354137e-06, "loss": 0.0012, "step": 8694 }, { "epoch": 7.940639269406392, "grad_norm": 0.42136508226394653, "learning_rate": 2.2891933028919334e-06, "loss": 0.0017, "step": 8695 }, { "epoch": 7.941552511415525, "grad_norm": 0.015042421407997608, "learning_rate": 2.2881785895484527e-06, "loss": 0.0001, "step": 8696 }, { "epoch": 7.942465753424657, "grad_norm": 0.24134813249111176, "learning_rate": 2.2871638762049724e-06, "loss": 0.0014, "step": 8697 }, { "epoch": 7.94337899543379, "grad_norm": 0.12941113114356995, "learning_rate": 2.2861491628614916e-06, "loss": 0.0007, "step": 8698 }, { "epoch": 7.944292237442922, "grad_norm": 11.816582679748535, "learning_rate": 2.2851344495180113e-06, "loss": 0.0677, "step": 8699 }, { "epoch": 7.945205479452055, "grad_norm": 0.19199322164058685, "learning_rate": 2.284119736174531e-06, "loss": 0.0013, "step": 8700 }, { "epoch": 7.946118721461187, "grad_norm": 0.8946317434310913, "learning_rate": 2.2831050228310503e-06, "loss": 0.0041, "step": 8701 }, { "epoch": 7.9470319634703195, "grad_norm": 30.70130157470703, "learning_rate": 2.28209030948757e-06, "loss": 0.1601, "step": 8702 }, { "epoch": 7.947945205479452, "grad_norm": 0.6731172800064087, "learning_rate": 2.2810755961440897e-06, "loss": 0.0037, "step": 8703 }, { "epoch": 7.948858447488584, "grad_norm": 0.12807349860668182, "learning_rate": 2.280060882800609e-06, "loss": 0.0009, "step": 8704 }, { "epoch": 7.949771689497717, "grad_norm": 0.6762524247169495, "learning_rate": 2.2790461694571286e-06, "loss": 0.0032, "step": 8705 }, { "epoch": 7.950684931506849, "grad_norm": 0.21822278201580048, "learning_rate": 2.278031456113648e-06, "loss": 0.0017, "step": 8706 }, { "epoch": 7.951598173515982, "grad_norm": 2.2169511318206787, "learning_rate": 2.2770167427701676e-06, "loss": 0.0195, "step": 8707 }, { "epoch": 7.952511415525114, "grad_norm": 0.5049201846122742, "learning_rate": 2.2760020294266873e-06, "loss": 0.0027, "step": 8708 }, { "epoch": 7.953424657534247, "grad_norm": 2.5018539428710938, "learning_rate": 2.2749873160832066e-06, "loss": 0.0205, "step": 8709 }, { "epoch": 7.954337899543379, "grad_norm": 1.171090006828308, "learning_rate": 2.2739726027397262e-06, "loss": 0.0077, "step": 8710 }, { "epoch": 7.955251141552512, "grad_norm": 0.03131505846977234, "learning_rate": 2.2729578893962455e-06, "loss": 0.0003, "step": 8711 }, { "epoch": 7.956164383561644, "grad_norm": 0.40983298420906067, "learning_rate": 2.271943176052765e-06, "loss": 0.0025, "step": 8712 }, { "epoch": 7.9570776255707765, "grad_norm": 13.11805248260498, "learning_rate": 2.270928462709285e-06, "loss": 0.0826, "step": 8713 }, { "epoch": 7.957990867579909, "grad_norm": 8.861773490905762, "learning_rate": 2.2699137493658046e-06, "loss": 0.0508, "step": 8714 }, { "epoch": 7.958904109589041, "grad_norm": 2.168978452682495, "learning_rate": 2.268899036022324e-06, "loss": 0.015, "step": 8715 }, { "epoch": 7.959817351598174, "grad_norm": 129.06517028808594, "learning_rate": 2.267884322678843e-06, "loss": 3.1863, "step": 8716 }, { "epoch": 7.960730593607306, "grad_norm": 0.24518120288848877, "learning_rate": 2.266869609335363e-06, "loss": 0.0014, "step": 8717 }, { "epoch": 7.961643835616439, "grad_norm": 0.06821990013122559, "learning_rate": 2.2658548959918825e-06, "loss": 0.0002, "step": 8718 }, { "epoch": 7.96255707762557, "grad_norm": 2.0370309352874756, "learning_rate": 2.264840182648402e-06, "loss": 0.0125, "step": 8719 }, { "epoch": 7.963470319634704, "grad_norm": 1.1074812412261963, "learning_rate": 2.2638254693049215e-06, "loss": 0.0049, "step": 8720 }, { "epoch": 7.964383561643835, "grad_norm": 14.30248737335205, "learning_rate": 2.2628107559614407e-06, "loss": 0.0558, "step": 8721 }, { "epoch": 7.965296803652968, "grad_norm": 0.44594141840934753, "learning_rate": 2.2617960426179604e-06, "loss": 0.0013, "step": 8722 }, { "epoch": 7.9662100456621, "grad_norm": 0.6995487213134766, "learning_rate": 2.26078132927448e-06, "loss": 0.0043, "step": 8723 }, { "epoch": 7.967123287671233, "grad_norm": 0.2759765386581421, "learning_rate": 2.259766615931e-06, "loss": 0.0013, "step": 8724 }, { "epoch": 7.968036529680365, "grad_norm": 0.17133931815624237, "learning_rate": 2.2587519025875195e-06, "loss": 0.0011, "step": 8725 }, { "epoch": 7.9689497716894975, "grad_norm": 2.5975496768951416, "learning_rate": 2.2577371892440388e-06, "loss": 0.0155, "step": 8726 }, { "epoch": 7.96986301369863, "grad_norm": 0.235341414809227, "learning_rate": 2.256722475900558e-06, "loss": 0.0013, "step": 8727 }, { "epoch": 7.970776255707762, "grad_norm": 0.277878999710083, "learning_rate": 2.2557077625570777e-06, "loss": 0.0016, "step": 8728 }, { "epoch": 7.971689497716895, "grad_norm": 0.753678023815155, "learning_rate": 2.2546930492135974e-06, "loss": 0.0033, "step": 8729 }, { "epoch": 7.972602739726027, "grad_norm": 0.9560950994491577, "learning_rate": 2.253678335870117e-06, "loss": 0.0067, "step": 8730 }, { "epoch": 7.97351598173516, "grad_norm": 0.22181059420108795, "learning_rate": 2.2526636225266364e-06, "loss": 0.0014, "step": 8731 }, { "epoch": 7.974429223744292, "grad_norm": 2.878368616104126, "learning_rate": 2.2516489091831557e-06, "loss": 0.0065, "step": 8732 }, { "epoch": 7.975342465753425, "grad_norm": 0.6497061848640442, "learning_rate": 2.2506341958396754e-06, "loss": 0.0037, "step": 8733 }, { "epoch": 7.976255707762557, "grad_norm": 0.6347007751464844, "learning_rate": 2.249619482496195e-06, "loss": 0.004, "step": 8734 }, { "epoch": 7.9771689497716896, "grad_norm": 0.1394694298505783, "learning_rate": 2.2486047691527147e-06, "loss": 0.0011, "step": 8735 }, { "epoch": 7.978082191780822, "grad_norm": 0.011442429386079311, "learning_rate": 2.247590055809234e-06, "loss": 0.0001, "step": 8736 }, { "epoch": 7.9789954337899545, "grad_norm": 0.6546228528022766, "learning_rate": 2.2465753424657537e-06, "loss": 0.0034, "step": 8737 }, { "epoch": 7.979908675799087, "grad_norm": 0.5078375339508057, "learning_rate": 2.245560629122273e-06, "loss": 0.0033, "step": 8738 }, { "epoch": 7.980821917808219, "grad_norm": 6.696829795837402, "learning_rate": 2.2445459157787927e-06, "loss": 0.0244, "step": 8739 }, { "epoch": 7.981735159817352, "grad_norm": 1.2173877954483032, "learning_rate": 2.2435312024353124e-06, "loss": 0.0092, "step": 8740 }, { "epoch": 7.982648401826484, "grad_norm": 121.71446228027344, "learning_rate": 2.2425164890918316e-06, "loss": 7.3769, "step": 8741 }, { "epoch": 7.983561643835617, "grad_norm": 0.6220061182975769, "learning_rate": 2.2415017757483513e-06, "loss": 0.003, "step": 8742 }, { "epoch": 7.984474885844749, "grad_norm": 47.195167541503906, "learning_rate": 2.240487062404871e-06, "loss": 0.1921, "step": 8743 }, { "epoch": 7.985388127853882, "grad_norm": 0.15572910010814667, "learning_rate": 2.2394723490613903e-06, "loss": 0.0008, "step": 8744 }, { "epoch": 7.986301369863014, "grad_norm": 0.7817493677139282, "learning_rate": 2.23845763571791e-06, "loss": 0.0049, "step": 8745 }, { "epoch": 7.987214611872146, "grad_norm": 0.4962098300457001, "learning_rate": 2.2374429223744292e-06, "loss": 0.0026, "step": 8746 }, { "epoch": 7.988127853881279, "grad_norm": 0.3359716534614563, "learning_rate": 2.236428209030949e-06, "loss": 0.0013, "step": 8747 }, { "epoch": 7.989041095890411, "grad_norm": 1.1259573698043823, "learning_rate": 2.2354134956874686e-06, "loss": 0.0054, "step": 8748 }, { "epoch": 7.989954337899543, "grad_norm": 143.63510131835938, "learning_rate": 2.234398782343988e-06, "loss": 0.8047, "step": 8749 }, { "epoch": 7.9908675799086755, "grad_norm": 1.4526798725128174, "learning_rate": 2.2333840690005076e-06, "loss": 0.007, "step": 8750 }, { "epoch": 7.991780821917808, "grad_norm": 0.23376937210559845, "learning_rate": 2.232369355657027e-06, "loss": 0.0014, "step": 8751 }, { "epoch": 7.99269406392694, "grad_norm": 0.3621926009654999, "learning_rate": 2.2313546423135465e-06, "loss": 0.0008, "step": 8752 }, { "epoch": 7.993607305936073, "grad_norm": 5.0306596755981445, "learning_rate": 2.2303399289700662e-06, "loss": 0.0306, "step": 8753 }, { "epoch": 7.994520547945205, "grad_norm": 1.6740788221359253, "learning_rate": 2.229325215626586e-06, "loss": 0.0087, "step": 8754 }, { "epoch": 7.995433789954338, "grad_norm": 10.40986442565918, "learning_rate": 2.228310502283105e-06, "loss": 0.0463, "step": 8755 }, { "epoch": 7.99634703196347, "grad_norm": 0.08996139466762543, "learning_rate": 2.2272957889396245e-06, "loss": 0.0004, "step": 8756 }, { "epoch": 7.997260273972603, "grad_norm": 5.379939079284668, "learning_rate": 2.226281075596144e-06, "loss": 0.0351, "step": 8757 }, { "epoch": 7.998173515981735, "grad_norm": 0.051301125437021255, "learning_rate": 2.225266362252664e-06, "loss": 0.0003, "step": 8758 }, { "epoch": 7.9990867579908675, "grad_norm": 0.23117657005786896, "learning_rate": 2.2242516489091835e-06, "loss": 0.0015, "step": 8759 }, { "epoch": 8.0, "grad_norm": 0.12424344569444656, "learning_rate": 2.223236935565703e-06, "loss": 0.0008, "step": 8760 }, { "epoch": 8.000913242009132, "grad_norm": 8.419550895690918, "learning_rate": 2.222222222222222e-06, "loss": 0.0603, "step": 8761 }, { "epoch": 8.001826484018265, "grad_norm": 0.1302221268415451, "learning_rate": 2.2212075088787418e-06, "loss": 0.0009, "step": 8762 }, { "epoch": 8.002739726027396, "grad_norm": 0.0755145475268364, "learning_rate": 2.2201927955352615e-06, "loss": 0.0003, "step": 8763 }, { "epoch": 8.00365296803653, "grad_norm": 1.3536001443862915, "learning_rate": 2.219178082191781e-06, "loss": 0.0072, "step": 8764 }, { "epoch": 8.004566210045661, "grad_norm": 3.866661548614502, "learning_rate": 2.218163368848301e-06, "loss": 0.0157, "step": 8765 }, { "epoch": 8.005479452054795, "grad_norm": 0.2678946852684021, "learning_rate": 2.21714865550482e-06, "loss": 0.0014, "step": 8766 }, { "epoch": 8.006392694063926, "grad_norm": 6.915736198425293, "learning_rate": 2.2161339421613394e-06, "loss": 0.0431, "step": 8767 }, { "epoch": 8.00730593607306, "grad_norm": 0.26297858357429504, "learning_rate": 2.215119228817859e-06, "loss": 0.0014, "step": 8768 }, { "epoch": 8.008219178082191, "grad_norm": 4.605223178863525, "learning_rate": 2.2141045154743788e-06, "loss": 0.0273, "step": 8769 }, { "epoch": 8.009132420091325, "grad_norm": 5.640315055847168, "learning_rate": 2.2130898021308985e-06, "loss": 0.0394, "step": 8770 }, { "epoch": 8.010045662100456, "grad_norm": 4.122231960296631, "learning_rate": 2.2120750887874177e-06, "loss": 0.0152, "step": 8771 }, { "epoch": 8.01095890410959, "grad_norm": 0.13915082812309265, "learning_rate": 2.211060375443937e-06, "loss": 0.0011, "step": 8772 }, { "epoch": 8.011872146118721, "grad_norm": 0.24740096926689148, "learning_rate": 2.2100456621004567e-06, "loss": 0.0017, "step": 8773 }, { "epoch": 8.012785388127854, "grad_norm": 2.460348606109619, "learning_rate": 2.2090309487569764e-06, "loss": 0.012, "step": 8774 }, { "epoch": 8.013698630136986, "grad_norm": 12.462983131408691, "learning_rate": 2.208016235413496e-06, "loss": 0.0729, "step": 8775 }, { "epoch": 8.01461187214612, "grad_norm": 0.01834648661315441, "learning_rate": 2.2070015220700153e-06, "loss": 0.0001, "step": 8776 }, { "epoch": 8.01552511415525, "grad_norm": 14.07861042022705, "learning_rate": 2.205986808726535e-06, "loss": 0.0903, "step": 8777 }, { "epoch": 8.016438356164384, "grad_norm": 5.340443134307861, "learning_rate": 2.2049720953830543e-06, "loss": 0.026, "step": 8778 }, { "epoch": 8.017351598173516, "grad_norm": 13.698121070861816, "learning_rate": 2.203957382039574e-06, "loss": 0.0603, "step": 8779 }, { "epoch": 8.018264840182649, "grad_norm": 16.156597137451172, "learning_rate": 2.2029426686960937e-06, "loss": 0.0808, "step": 8780 }, { "epoch": 8.01917808219178, "grad_norm": 0.3961690068244934, "learning_rate": 2.201927955352613e-06, "loss": 0.0024, "step": 8781 }, { "epoch": 8.020091324200914, "grad_norm": 1.0966075658798218, "learning_rate": 2.2009132420091326e-06, "loss": 0.0072, "step": 8782 }, { "epoch": 8.021004566210046, "grad_norm": 11.100069046020508, "learning_rate": 2.199898528665652e-06, "loss": 0.0566, "step": 8783 }, { "epoch": 8.021917808219179, "grad_norm": 0.9903305768966675, "learning_rate": 2.1988838153221716e-06, "loss": 0.0053, "step": 8784 }, { "epoch": 8.02283105022831, "grad_norm": 0.5797539353370667, "learning_rate": 2.1978691019786913e-06, "loss": 0.0029, "step": 8785 }, { "epoch": 8.023744292237444, "grad_norm": 22.02385902404785, "learning_rate": 2.1968543886352106e-06, "loss": 0.1084, "step": 8786 }, { "epoch": 8.024657534246575, "grad_norm": 7.54217529296875, "learning_rate": 2.1958396752917303e-06, "loss": 0.063, "step": 8787 }, { "epoch": 8.025570776255707, "grad_norm": 0.5055306553840637, "learning_rate": 2.19482496194825e-06, "loss": 0.0038, "step": 8788 }, { "epoch": 8.02648401826484, "grad_norm": 0.9577555656433105, "learning_rate": 2.1938102486047692e-06, "loss": 0.0075, "step": 8789 }, { "epoch": 8.027397260273972, "grad_norm": 0.09328955411911011, "learning_rate": 2.192795535261289e-06, "loss": 0.0007, "step": 8790 }, { "epoch": 8.028310502283105, "grad_norm": 0.01498815044760704, "learning_rate": 2.191780821917808e-06, "loss": 0.0001, "step": 8791 }, { "epoch": 8.029223744292237, "grad_norm": 1.3484541177749634, "learning_rate": 2.190766108574328e-06, "loss": 0.0071, "step": 8792 }, { "epoch": 8.03013698630137, "grad_norm": 13.563158988952637, "learning_rate": 2.1897513952308476e-06, "loss": 0.108, "step": 8793 }, { "epoch": 8.031050228310502, "grad_norm": 1.2872376441955566, "learning_rate": 2.1887366818873673e-06, "loss": 0.0071, "step": 8794 }, { "epoch": 8.031963470319635, "grad_norm": 19.978044509887695, "learning_rate": 2.1877219685438865e-06, "loss": 0.1177, "step": 8795 }, { "epoch": 8.032876712328767, "grad_norm": 104.1176986694336, "learning_rate": 2.186707255200406e-06, "loss": 1.1279, "step": 8796 }, { "epoch": 8.0337899543379, "grad_norm": 0.7932170033454895, "learning_rate": 2.1856925418569255e-06, "loss": 0.0034, "step": 8797 }, { "epoch": 8.034703196347031, "grad_norm": 1.692124605178833, "learning_rate": 2.184677828513445e-06, "loss": 0.0092, "step": 8798 }, { "epoch": 8.035616438356165, "grad_norm": 0.00680588511750102, "learning_rate": 2.183663115169965e-06, "loss": 0.0, "step": 8799 }, { "epoch": 8.036529680365296, "grad_norm": 0.4439351260662079, "learning_rate": 2.182648401826484e-06, "loss": 0.0018, "step": 8800 }, { "epoch": 8.03744292237443, "grad_norm": 8.281052589416504, "learning_rate": 2.1816336884830034e-06, "loss": 0.047, "step": 8801 }, { "epoch": 8.038356164383561, "grad_norm": 0.6041961908340454, "learning_rate": 2.180618975139523e-06, "loss": 0.003, "step": 8802 }, { "epoch": 8.039269406392695, "grad_norm": 14.38916015625, "learning_rate": 2.179604261796043e-06, "loss": 0.0908, "step": 8803 }, { "epoch": 8.040182648401826, "grad_norm": 4.126951217651367, "learning_rate": 2.1785895484525625e-06, "loss": 0.0217, "step": 8804 }, { "epoch": 8.04109589041096, "grad_norm": 0.09926770627498627, "learning_rate": 2.177574835109082e-06, "loss": 0.0007, "step": 8805 }, { "epoch": 8.042009132420091, "grad_norm": 0.4772167503833771, "learning_rate": 2.1765601217656014e-06, "loss": 0.0029, "step": 8806 }, { "epoch": 8.042922374429224, "grad_norm": 2.1539320945739746, "learning_rate": 2.1755454084221207e-06, "loss": 0.009, "step": 8807 }, { "epoch": 8.043835616438356, "grad_norm": 2.165957450866699, "learning_rate": 2.1745306950786404e-06, "loss": 0.0055, "step": 8808 }, { "epoch": 8.04474885844749, "grad_norm": 0.06739775836467743, "learning_rate": 2.17351598173516e-06, "loss": 0.0006, "step": 8809 }, { "epoch": 8.045662100456621, "grad_norm": 5.95347261428833, "learning_rate": 2.1725012683916798e-06, "loss": 0.0377, "step": 8810 }, { "epoch": 8.046575342465754, "grad_norm": 1.2434146404266357, "learning_rate": 2.171486555048199e-06, "loss": 0.0063, "step": 8811 }, { "epoch": 8.047488584474886, "grad_norm": 5.797818660736084, "learning_rate": 2.1704718417047183e-06, "loss": 0.0412, "step": 8812 }, { "epoch": 8.04840182648402, "grad_norm": 0.4657653272151947, "learning_rate": 2.169457128361238e-06, "loss": 0.0035, "step": 8813 }, { "epoch": 8.04931506849315, "grad_norm": 0.5439661145210266, "learning_rate": 2.1684424150177577e-06, "loss": 0.0024, "step": 8814 }, { "epoch": 8.050228310502282, "grad_norm": 2.1787054538726807, "learning_rate": 2.1674277016742774e-06, "loss": 0.0095, "step": 8815 }, { "epoch": 8.051141552511416, "grad_norm": 0.3113441467285156, "learning_rate": 2.1664129883307967e-06, "loss": 0.0017, "step": 8816 }, { "epoch": 8.052054794520547, "grad_norm": 0.19010767340660095, "learning_rate": 2.1653982749873164e-06, "loss": 0.0011, "step": 8817 }, { "epoch": 8.05296803652968, "grad_norm": 6.32167911529541, "learning_rate": 2.1643835616438356e-06, "loss": 0.0328, "step": 8818 }, { "epoch": 8.053881278538812, "grad_norm": 0.006179508287459612, "learning_rate": 2.1633688483003553e-06, "loss": 0.0, "step": 8819 }, { "epoch": 8.054794520547945, "grad_norm": 0.5066574811935425, "learning_rate": 2.162354134956875e-06, "loss": 0.0032, "step": 8820 }, { "epoch": 8.055707762557077, "grad_norm": 0.051369015127420425, "learning_rate": 2.1613394216133943e-06, "loss": 0.0003, "step": 8821 }, { "epoch": 8.05662100456621, "grad_norm": 0.42337164282798767, "learning_rate": 2.160324708269914e-06, "loss": 0.0019, "step": 8822 }, { "epoch": 8.057534246575342, "grad_norm": 0.1205122321844101, "learning_rate": 2.1593099949264332e-06, "loss": 0.0006, "step": 8823 }, { "epoch": 8.058447488584475, "grad_norm": 0.25686606764793396, "learning_rate": 2.158295281582953e-06, "loss": 0.0019, "step": 8824 }, { "epoch": 8.059360730593607, "grad_norm": 0.2962682247161865, "learning_rate": 2.1572805682394726e-06, "loss": 0.0013, "step": 8825 }, { "epoch": 8.06027397260274, "grad_norm": 0.633305549621582, "learning_rate": 2.156265854895992e-06, "loss": 0.0039, "step": 8826 }, { "epoch": 8.061187214611872, "grad_norm": 0.45960360765457153, "learning_rate": 2.1552511415525116e-06, "loss": 0.0032, "step": 8827 }, { "epoch": 8.062100456621005, "grad_norm": 1.7493807077407837, "learning_rate": 2.1542364282090313e-06, "loss": 0.0067, "step": 8828 }, { "epoch": 8.063013698630137, "grad_norm": 7.183986663818359, "learning_rate": 2.1532217148655505e-06, "loss": 0.041, "step": 8829 }, { "epoch": 8.06392694063927, "grad_norm": 0.3333410322666168, "learning_rate": 2.1522070015220702e-06, "loss": 0.0019, "step": 8830 }, { "epoch": 8.064840182648402, "grad_norm": 3.907606363296509, "learning_rate": 2.1511922881785895e-06, "loss": 0.0259, "step": 8831 }, { "epoch": 8.065753424657535, "grad_norm": 0.037609752267599106, "learning_rate": 2.150177574835109e-06, "loss": 0.0003, "step": 8832 }, { "epoch": 8.066666666666666, "grad_norm": 1.554431438446045, "learning_rate": 2.149162861491629e-06, "loss": 0.0062, "step": 8833 }, { "epoch": 8.0675799086758, "grad_norm": 0.14742180705070496, "learning_rate": 2.148148148148148e-06, "loss": 0.001, "step": 8834 }, { "epoch": 8.068493150684931, "grad_norm": 0.3214658498764038, "learning_rate": 2.147133434804668e-06, "loss": 0.0022, "step": 8835 }, { "epoch": 8.069406392694065, "grad_norm": 3.5714149475097656, "learning_rate": 2.146118721461187e-06, "loss": 0.0183, "step": 8836 }, { "epoch": 8.070319634703196, "grad_norm": 0.02553725242614746, "learning_rate": 2.145104008117707e-06, "loss": 0.0001, "step": 8837 }, { "epoch": 8.07123287671233, "grad_norm": 1.0314319133758545, "learning_rate": 2.1440892947742265e-06, "loss": 0.0051, "step": 8838 }, { "epoch": 8.072146118721461, "grad_norm": 0.5222984552383423, "learning_rate": 2.143074581430746e-06, "loss": 0.0033, "step": 8839 }, { "epoch": 8.073059360730593, "grad_norm": 2.495467185974121, "learning_rate": 2.1420598680872655e-06, "loss": 0.0105, "step": 8840 }, { "epoch": 8.073972602739726, "grad_norm": 0.16944347321987152, "learning_rate": 2.1410451547437847e-06, "loss": 0.0009, "step": 8841 }, { "epoch": 8.074885844748858, "grad_norm": 0.40060311555862427, "learning_rate": 2.1400304414003044e-06, "loss": 0.0025, "step": 8842 }, { "epoch": 8.075799086757991, "grad_norm": 13.353869438171387, "learning_rate": 2.139015728056824e-06, "loss": 0.0546, "step": 8843 }, { "epoch": 8.076712328767123, "grad_norm": 3.046522617340088, "learning_rate": 2.138001014713344e-06, "loss": 0.0161, "step": 8844 }, { "epoch": 8.077625570776256, "grad_norm": 2.0379586219787598, "learning_rate": 2.1369863013698635e-06, "loss": 0.0114, "step": 8845 }, { "epoch": 8.078538812785387, "grad_norm": 0.5287152528762817, "learning_rate": 2.1359715880263828e-06, "loss": 0.0035, "step": 8846 }, { "epoch": 8.07945205479452, "grad_norm": 1.1061186790466309, "learning_rate": 2.134956874682902e-06, "loss": 0.0059, "step": 8847 }, { "epoch": 8.080365296803652, "grad_norm": 0.11296027898788452, "learning_rate": 2.1339421613394217e-06, "loss": 0.0008, "step": 8848 }, { "epoch": 8.081278538812786, "grad_norm": 0.45755571126937866, "learning_rate": 2.1329274479959414e-06, "loss": 0.0036, "step": 8849 }, { "epoch": 8.082191780821917, "grad_norm": 5.666804790496826, "learning_rate": 2.131912734652461e-06, "loss": 0.0251, "step": 8850 }, { "epoch": 8.08310502283105, "grad_norm": 79.92664337158203, "learning_rate": 2.1308980213089804e-06, "loss": 0.8277, "step": 8851 }, { "epoch": 8.084018264840182, "grad_norm": 0.8770141005516052, "learning_rate": 2.1298833079654997e-06, "loss": 0.0057, "step": 8852 }, { "epoch": 8.084931506849315, "grad_norm": 3.237903356552124, "learning_rate": 2.1288685946220193e-06, "loss": 0.0224, "step": 8853 }, { "epoch": 8.085844748858447, "grad_norm": 2.496565103530884, "learning_rate": 2.127853881278539e-06, "loss": 0.0098, "step": 8854 }, { "epoch": 8.08675799086758, "grad_norm": 1.0746411085128784, "learning_rate": 2.1268391679350587e-06, "loss": 0.0039, "step": 8855 }, { "epoch": 8.087671232876712, "grad_norm": 2.3414835929870605, "learning_rate": 2.125824454591578e-06, "loss": 0.0113, "step": 8856 }, { "epoch": 8.088584474885845, "grad_norm": 3.829552412033081, "learning_rate": 2.1248097412480977e-06, "loss": 0.0199, "step": 8857 }, { "epoch": 8.089497716894977, "grad_norm": 1.8105577230453491, "learning_rate": 2.123795027904617e-06, "loss": 0.0096, "step": 8858 }, { "epoch": 8.09041095890411, "grad_norm": 0.06447520852088928, "learning_rate": 2.1227803145611367e-06, "loss": 0.0005, "step": 8859 }, { "epoch": 8.091324200913242, "grad_norm": 0.13151755928993225, "learning_rate": 2.1217656012176563e-06, "loss": 0.0008, "step": 8860 }, { "epoch": 8.092237442922375, "grad_norm": 5.409876346588135, "learning_rate": 2.1207508878741756e-06, "loss": 0.0298, "step": 8861 }, { "epoch": 8.093150684931507, "grad_norm": 0.5181869268417358, "learning_rate": 2.1197361745306953e-06, "loss": 0.0024, "step": 8862 }, { "epoch": 8.09406392694064, "grad_norm": 2.8646678924560547, "learning_rate": 2.1187214611872146e-06, "loss": 0.016, "step": 8863 }, { "epoch": 8.094977168949772, "grad_norm": 0.045178771018981934, "learning_rate": 2.1177067478437343e-06, "loss": 0.0003, "step": 8864 }, { "epoch": 8.095890410958905, "grad_norm": 1.6954160928726196, "learning_rate": 2.116692034500254e-06, "loss": 0.0076, "step": 8865 }, { "epoch": 8.096803652968037, "grad_norm": 5.993897914886475, "learning_rate": 2.1156773211567732e-06, "loss": 0.0312, "step": 8866 }, { "epoch": 8.097716894977168, "grad_norm": 9.014994621276855, "learning_rate": 2.114662607813293e-06, "loss": 0.0498, "step": 8867 }, { "epoch": 8.098630136986301, "grad_norm": 0.12840022146701813, "learning_rate": 2.1136478944698126e-06, "loss": 0.0008, "step": 8868 }, { "epoch": 8.099543378995433, "grad_norm": 0.5743170380592346, "learning_rate": 2.112633181126332e-06, "loss": 0.003, "step": 8869 }, { "epoch": 8.100456621004566, "grad_norm": 1.2945736646652222, "learning_rate": 2.1116184677828516e-06, "loss": 0.006, "step": 8870 }, { "epoch": 8.101369863013698, "grad_norm": 0.0093104038387537, "learning_rate": 2.110603754439371e-06, "loss": 0.0, "step": 8871 }, { "epoch": 8.102283105022831, "grad_norm": 1.5068415403366089, "learning_rate": 2.1095890410958905e-06, "loss": 0.0118, "step": 8872 }, { "epoch": 8.103196347031963, "grad_norm": 3.986407995223999, "learning_rate": 2.1085743277524102e-06, "loss": 0.0183, "step": 8873 }, { "epoch": 8.104109589041096, "grad_norm": 0.018633464351296425, "learning_rate": 2.1075596144089295e-06, "loss": 0.0001, "step": 8874 }, { "epoch": 8.105022831050228, "grad_norm": 5.472740650177002, "learning_rate": 2.106544901065449e-06, "loss": 0.0318, "step": 8875 }, { "epoch": 8.105936073059361, "grad_norm": 0.2979547083377838, "learning_rate": 2.1055301877219685e-06, "loss": 0.0023, "step": 8876 }, { "epoch": 8.106849315068493, "grad_norm": 1.1676688194274902, "learning_rate": 2.104515474378488e-06, "loss": 0.0067, "step": 8877 }, { "epoch": 8.107762557077626, "grad_norm": 3.9752144813537598, "learning_rate": 2.103500761035008e-06, "loss": 0.0136, "step": 8878 }, { "epoch": 8.108675799086758, "grad_norm": 0.5789807438850403, "learning_rate": 2.1024860476915275e-06, "loss": 0.0026, "step": 8879 }, { "epoch": 8.10958904109589, "grad_norm": 0.4895038902759552, "learning_rate": 2.101471334348047e-06, "loss": 0.0031, "step": 8880 }, { "epoch": 8.110502283105022, "grad_norm": 0.37347131967544556, "learning_rate": 2.100456621004566e-06, "loss": 0.002, "step": 8881 }, { "epoch": 8.111415525114156, "grad_norm": 38.012237548828125, "learning_rate": 2.0994419076610858e-06, "loss": 0.1803, "step": 8882 }, { "epoch": 8.112328767123287, "grad_norm": 2.2069759368896484, "learning_rate": 2.0984271943176054e-06, "loss": 0.015, "step": 8883 }, { "epoch": 8.11324200913242, "grad_norm": 0.09660995751619339, "learning_rate": 2.097412480974125e-06, "loss": 0.0005, "step": 8884 }, { "epoch": 8.114155251141552, "grad_norm": 0.11740870773792267, "learning_rate": 2.0963977676306444e-06, "loss": 0.0006, "step": 8885 }, { "epoch": 8.115068493150686, "grad_norm": 0.02049860544502735, "learning_rate": 2.095383054287164e-06, "loss": 0.0001, "step": 8886 }, { "epoch": 8.115981735159817, "grad_norm": 6.709043979644775, "learning_rate": 2.0943683409436834e-06, "loss": 0.0398, "step": 8887 }, { "epoch": 8.11689497716895, "grad_norm": 0.13545717298984528, "learning_rate": 2.093353627600203e-06, "loss": 0.0009, "step": 8888 }, { "epoch": 8.117808219178082, "grad_norm": 45.39311981201172, "learning_rate": 2.0923389142567228e-06, "loss": 0.2726, "step": 8889 }, { "epoch": 8.118721461187215, "grad_norm": 104.78461456298828, "learning_rate": 2.0913242009132424e-06, "loss": 2.0588, "step": 8890 }, { "epoch": 8.119634703196347, "grad_norm": 5.960363864898682, "learning_rate": 2.0903094875697617e-06, "loss": 0.0362, "step": 8891 }, { "epoch": 8.12054794520548, "grad_norm": 0.05170811340212822, "learning_rate": 2.089294774226281e-06, "loss": 0.0003, "step": 8892 }, { "epoch": 8.121461187214612, "grad_norm": 0.6037024855613708, "learning_rate": 2.0882800608828007e-06, "loss": 0.0028, "step": 8893 }, { "epoch": 8.122374429223743, "grad_norm": 11.10428237915039, "learning_rate": 2.0872653475393204e-06, "loss": 0.0647, "step": 8894 }, { "epoch": 8.123287671232877, "grad_norm": 1.0164142847061157, "learning_rate": 2.08625063419584e-06, "loss": 0.006, "step": 8895 }, { "epoch": 8.124200913242008, "grad_norm": 0.03302360698580742, "learning_rate": 2.0852359208523593e-06, "loss": 0.0002, "step": 8896 }, { "epoch": 8.125114155251142, "grad_norm": 0.9780012369155884, "learning_rate": 2.084221207508879e-06, "loss": 0.0046, "step": 8897 }, { "epoch": 8.126027397260273, "grad_norm": 0.45824113488197327, "learning_rate": 2.0832064941653983e-06, "loss": 0.0013, "step": 8898 }, { "epoch": 8.126940639269407, "grad_norm": 4.541036605834961, "learning_rate": 2.082191780821918e-06, "loss": 0.0275, "step": 8899 }, { "epoch": 8.127853881278538, "grad_norm": 0.35781145095825195, "learning_rate": 2.0811770674784377e-06, "loss": 0.0024, "step": 8900 }, { "epoch": 8.128767123287671, "grad_norm": 1.9116946458816528, "learning_rate": 2.080162354134957e-06, "loss": 0.0133, "step": 8901 }, { "epoch": 8.129680365296803, "grad_norm": 1.842751145362854, "learning_rate": 2.0791476407914766e-06, "loss": 0.0085, "step": 8902 }, { "epoch": 8.130593607305936, "grad_norm": 0.7213819622993469, "learning_rate": 2.078132927447996e-06, "loss": 0.0049, "step": 8903 }, { "epoch": 8.131506849315068, "grad_norm": 3.0679945945739746, "learning_rate": 2.0771182141045156e-06, "loss": 0.017, "step": 8904 }, { "epoch": 8.132420091324201, "grad_norm": 0.006846337579190731, "learning_rate": 2.0761035007610353e-06, "loss": 0.0, "step": 8905 }, { "epoch": 8.133333333333333, "grad_norm": 2.3012936115264893, "learning_rate": 2.0750887874175546e-06, "loss": 0.0144, "step": 8906 }, { "epoch": 8.134246575342466, "grad_norm": 0.9293278455734253, "learning_rate": 2.0740740740740742e-06, "loss": 0.0062, "step": 8907 }, { "epoch": 8.135159817351598, "grad_norm": 0.12711577117443085, "learning_rate": 2.073059360730594e-06, "loss": 0.001, "step": 8908 }, { "epoch": 8.136073059360731, "grad_norm": 7.438103199005127, "learning_rate": 2.072044647387113e-06, "loss": 0.0259, "step": 8909 }, { "epoch": 8.136986301369863, "grad_norm": 8.63729190826416, "learning_rate": 2.071029934043633e-06, "loss": 0.0356, "step": 8910 }, { "epoch": 8.137899543378996, "grad_norm": 0.02475663833320141, "learning_rate": 2.070015220700152e-06, "loss": 0.0001, "step": 8911 }, { "epoch": 8.138812785388128, "grad_norm": 6.574428558349609, "learning_rate": 2.069000507356672e-06, "loss": 0.0324, "step": 8912 }, { "epoch": 8.139726027397261, "grad_norm": 0.33260297775268555, "learning_rate": 2.0679857940131916e-06, "loss": 0.0019, "step": 8913 }, { "epoch": 8.140639269406392, "grad_norm": 27.791522979736328, "learning_rate": 2.066971080669711e-06, "loss": 0.1851, "step": 8914 }, { "epoch": 8.141552511415526, "grad_norm": 0.2418963462114334, "learning_rate": 2.0659563673262305e-06, "loss": 0.0013, "step": 8915 }, { "epoch": 8.142465753424657, "grad_norm": 1.8604817390441895, "learning_rate": 2.0649416539827498e-06, "loss": 0.0089, "step": 8916 }, { "epoch": 8.14337899543379, "grad_norm": 1.3245757818222046, "learning_rate": 2.0639269406392695e-06, "loss": 0.0065, "step": 8917 }, { "epoch": 8.144292237442922, "grad_norm": 0.25171148777008057, "learning_rate": 2.062912227295789e-06, "loss": 0.0015, "step": 8918 }, { "epoch": 8.145205479452056, "grad_norm": 0.22932392358779907, "learning_rate": 2.061897513952309e-06, "loss": 0.0012, "step": 8919 }, { "epoch": 8.146118721461187, "grad_norm": 11.967888832092285, "learning_rate": 2.060882800608828e-06, "loss": 0.0528, "step": 8920 }, { "epoch": 8.147031963470319, "grad_norm": 0.0961579754948616, "learning_rate": 2.0598680872653474e-06, "loss": 0.0005, "step": 8921 }, { "epoch": 8.147945205479452, "grad_norm": 0.0488375760614872, "learning_rate": 2.058853373921867e-06, "loss": 0.0002, "step": 8922 }, { "epoch": 8.148858447488584, "grad_norm": 3.640669584274292, "learning_rate": 2.0578386605783868e-06, "loss": 0.0233, "step": 8923 }, { "epoch": 8.149771689497717, "grad_norm": 0.29550275206565857, "learning_rate": 2.0568239472349065e-06, "loss": 0.0021, "step": 8924 }, { "epoch": 8.150684931506849, "grad_norm": 0.3440670371055603, "learning_rate": 2.0558092338914257e-06, "loss": 0.0019, "step": 8925 }, { "epoch": 8.151598173515982, "grad_norm": 0.3732658326625824, "learning_rate": 2.0547945205479454e-06, "loss": 0.0017, "step": 8926 }, { "epoch": 8.152511415525113, "grad_norm": 0.0032526827417314053, "learning_rate": 2.0537798072044647e-06, "loss": 0.0, "step": 8927 }, { "epoch": 8.153424657534247, "grad_norm": 7.986534595489502, "learning_rate": 2.0527650938609844e-06, "loss": 0.0334, "step": 8928 }, { "epoch": 8.154337899543378, "grad_norm": 1.73414945602417, "learning_rate": 2.051750380517504e-06, "loss": 0.014, "step": 8929 }, { "epoch": 8.155251141552512, "grad_norm": 0.134624183177948, "learning_rate": 2.0507356671740238e-06, "loss": 0.0008, "step": 8930 }, { "epoch": 8.156164383561643, "grad_norm": 0.2813161611557007, "learning_rate": 2.049720953830543e-06, "loss": 0.0015, "step": 8931 }, { "epoch": 8.157077625570777, "grad_norm": 0.31281378865242004, "learning_rate": 2.0487062404870623e-06, "loss": 0.0012, "step": 8932 }, { "epoch": 8.157990867579908, "grad_norm": 0.24533185362815857, "learning_rate": 2.047691527143582e-06, "loss": 0.0016, "step": 8933 }, { "epoch": 8.158904109589042, "grad_norm": 20.09952735900879, "learning_rate": 2.0466768138001017e-06, "loss": 0.1121, "step": 8934 }, { "epoch": 8.159817351598173, "grad_norm": 0.2668361961841583, "learning_rate": 2.0456621004566214e-06, "loss": 0.0017, "step": 8935 }, { "epoch": 8.160730593607306, "grad_norm": 0.0648585632443428, "learning_rate": 2.0446473871131407e-06, "loss": 0.0004, "step": 8936 }, { "epoch": 8.161643835616438, "grad_norm": 0.036719877272844315, "learning_rate": 2.0436326737696604e-06, "loss": 0.0002, "step": 8937 }, { "epoch": 8.162557077625571, "grad_norm": 0.0663004070520401, "learning_rate": 2.0426179604261796e-06, "loss": 0.0004, "step": 8938 }, { "epoch": 8.163470319634703, "grad_norm": 0.5315661430358887, "learning_rate": 2.0416032470826993e-06, "loss": 0.0032, "step": 8939 }, { "epoch": 8.164383561643836, "grad_norm": 0.0014689594972878695, "learning_rate": 2.040588533739219e-06, "loss": 0.0, "step": 8940 }, { "epoch": 8.165296803652968, "grad_norm": 48.17806625366211, "learning_rate": 2.0395738203957383e-06, "loss": 0.4485, "step": 8941 }, { "epoch": 8.166210045662101, "grad_norm": 0.4880548417568207, "learning_rate": 2.038559107052258e-06, "loss": 0.0035, "step": 8942 }, { "epoch": 8.167123287671233, "grad_norm": 0.5082260370254517, "learning_rate": 2.0375443937087772e-06, "loss": 0.0032, "step": 8943 }, { "epoch": 8.168036529680366, "grad_norm": 0.4262779951095581, "learning_rate": 2.036529680365297e-06, "loss": 0.0021, "step": 8944 }, { "epoch": 8.168949771689498, "grad_norm": 0.16933150589466095, "learning_rate": 2.0355149670218166e-06, "loss": 0.0009, "step": 8945 }, { "epoch": 8.169863013698631, "grad_norm": 9.751063346862793, "learning_rate": 2.034500253678336e-06, "loss": 0.0442, "step": 8946 }, { "epoch": 8.170776255707763, "grad_norm": 4.7485246658325195, "learning_rate": 2.0334855403348556e-06, "loss": 0.0299, "step": 8947 }, { "epoch": 8.171689497716894, "grad_norm": 12.289098739624023, "learning_rate": 2.0324708269913753e-06, "loss": 0.0484, "step": 8948 }, { "epoch": 8.172602739726027, "grad_norm": 0.46602877974510193, "learning_rate": 2.0314561136478945e-06, "loss": 0.0031, "step": 8949 }, { "epoch": 8.173515981735159, "grad_norm": 0.06048088148236275, "learning_rate": 2.0304414003044142e-06, "loss": 0.0003, "step": 8950 }, { "epoch": 8.174429223744292, "grad_norm": 0.0017127222381532192, "learning_rate": 2.0294266869609335e-06, "loss": 0.0, "step": 8951 }, { "epoch": 8.175342465753424, "grad_norm": 0.05022129788994789, "learning_rate": 2.028411973617453e-06, "loss": 0.0003, "step": 8952 }, { "epoch": 8.176255707762557, "grad_norm": 24.72633171081543, "learning_rate": 2.027397260273973e-06, "loss": 0.1282, "step": 8953 }, { "epoch": 8.177168949771689, "grad_norm": 0.22600747644901276, "learning_rate": 2.026382546930492e-06, "loss": 0.0018, "step": 8954 }, { "epoch": 8.178082191780822, "grad_norm": 0.36708730459213257, "learning_rate": 2.025367833587012e-06, "loss": 0.0021, "step": 8955 }, { "epoch": 8.178995433789954, "grad_norm": 5.678482532501221, "learning_rate": 2.024353120243531e-06, "loss": 0.0235, "step": 8956 }, { "epoch": 8.179908675799087, "grad_norm": 0.06422001123428345, "learning_rate": 2.023338406900051e-06, "loss": 0.0003, "step": 8957 }, { "epoch": 8.180821917808219, "grad_norm": 0.8663026094436646, "learning_rate": 2.0223236935565705e-06, "loss": 0.0057, "step": 8958 }, { "epoch": 8.181735159817352, "grad_norm": 15.882858276367188, "learning_rate": 2.02130898021309e-06, "loss": 0.0686, "step": 8959 }, { "epoch": 8.182648401826484, "grad_norm": 0.396340936422348, "learning_rate": 2.0202942668696095e-06, "loss": 0.0022, "step": 8960 }, { "epoch": 8.183561643835617, "grad_norm": 12.841832160949707, "learning_rate": 2.0192795535261287e-06, "loss": 0.0749, "step": 8961 }, { "epoch": 8.184474885844748, "grad_norm": 0.2369745820760727, "learning_rate": 2.0182648401826484e-06, "loss": 0.0012, "step": 8962 }, { "epoch": 8.185388127853882, "grad_norm": 0.6419878602027893, "learning_rate": 2.017250126839168e-06, "loss": 0.0026, "step": 8963 }, { "epoch": 8.186301369863013, "grad_norm": 0.9327883720397949, "learning_rate": 2.016235413495688e-06, "loss": 0.0051, "step": 8964 }, { "epoch": 8.187214611872147, "grad_norm": 1.7815115451812744, "learning_rate": 2.015220700152207e-06, "loss": 0.0097, "step": 8965 }, { "epoch": 8.188127853881278, "grad_norm": 9.07594108581543, "learning_rate": 2.0142059868087268e-06, "loss": 0.0402, "step": 8966 }, { "epoch": 8.189041095890412, "grad_norm": 1.2318392992019653, "learning_rate": 2.013191273465246e-06, "loss": 0.0054, "step": 8967 }, { "epoch": 8.189954337899543, "grad_norm": 0.33607134222984314, "learning_rate": 2.0121765601217657e-06, "loss": 0.0011, "step": 8968 }, { "epoch": 8.190867579908677, "grad_norm": 0.021537482738494873, "learning_rate": 2.0111618467782854e-06, "loss": 0.0001, "step": 8969 }, { "epoch": 8.191780821917808, "grad_norm": 2.010549545288086, "learning_rate": 2.010147133434805e-06, "loss": 0.0126, "step": 8970 }, { "epoch": 8.192694063926941, "grad_norm": 0.018597297370433807, "learning_rate": 2.0091324200913244e-06, "loss": 0.0001, "step": 8971 }, { "epoch": 8.193607305936073, "grad_norm": 0.05552184581756592, "learning_rate": 2.0081177067478436e-06, "loss": 0.0004, "step": 8972 }, { "epoch": 8.194520547945206, "grad_norm": 0.20083428919315338, "learning_rate": 2.0071029934043633e-06, "loss": 0.0012, "step": 8973 }, { "epoch": 8.195433789954338, "grad_norm": 36.318450927734375, "learning_rate": 2.006088280060883e-06, "loss": 0.0654, "step": 8974 }, { "epoch": 8.19634703196347, "grad_norm": 1.1606768369674683, "learning_rate": 2.0050735667174027e-06, "loss": 0.0067, "step": 8975 }, { "epoch": 8.197260273972603, "grad_norm": 6.732656002044678, "learning_rate": 2.004058853373922e-06, "loss": 0.0404, "step": 8976 }, { "epoch": 8.198173515981734, "grad_norm": 3.04176664352417, "learning_rate": 2.0030441400304417e-06, "loss": 0.0132, "step": 8977 }, { "epoch": 8.199086757990868, "grad_norm": 0.5224707722663879, "learning_rate": 2.002029426686961e-06, "loss": 0.0024, "step": 8978 }, { "epoch": 8.2, "grad_norm": 0.19248341023921967, "learning_rate": 2.0010147133434806e-06, "loss": 0.0011, "step": 8979 }, { "epoch": 8.200913242009133, "grad_norm": 1.3838825225830078, "learning_rate": 2.0000000000000003e-06, "loss": 0.0092, "step": 8980 }, { "epoch": 8.201826484018264, "grad_norm": 1.865639090538025, "learning_rate": 1.9989852866565196e-06, "loss": 0.0093, "step": 8981 }, { "epoch": 8.202739726027398, "grad_norm": 3.417637825012207, "learning_rate": 1.9979705733130393e-06, "loss": 0.0129, "step": 8982 }, { "epoch": 8.203652968036529, "grad_norm": 0.8987341523170471, "learning_rate": 1.9969558599695586e-06, "loss": 0.0055, "step": 8983 }, { "epoch": 8.204566210045662, "grad_norm": 0.11961231380701065, "learning_rate": 1.9959411466260783e-06, "loss": 0.0007, "step": 8984 }, { "epoch": 8.205479452054794, "grad_norm": 0.9322972297668457, "learning_rate": 1.994926433282598e-06, "loss": 0.0067, "step": 8985 }, { "epoch": 8.206392694063927, "grad_norm": 0.657634973526001, "learning_rate": 1.9939117199391172e-06, "loss": 0.0047, "step": 8986 }, { "epoch": 8.207305936073059, "grad_norm": 4.7411909103393555, "learning_rate": 1.992897006595637e-06, "loss": 0.0215, "step": 8987 }, { "epoch": 8.208219178082192, "grad_norm": 0.9862819910049438, "learning_rate": 1.9918822932521566e-06, "loss": 0.0071, "step": 8988 }, { "epoch": 8.209132420091324, "grad_norm": 0.6203494071960449, "learning_rate": 1.990867579908676e-06, "loss": 0.003, "step": 8989 }, { "epoch": 8.210045662100457, "grad_norm": 0.005058143753558397, "learning_rate": 1.9898528665651956e-06, "loss": 0.0, "step": 8990 }, { "epoch": 8.210958904109589, "grad_norm": 1.0293577909469604, "learning_rate": 1.988838153221715e-06, "loss": 0.0061, "step": 8991 }, { "epoch": 8.211872146118722, "grad_norm": 0.26585379242897034, "learning_rate": 1.9878234398782345e-06, "loss": 0.0013, "step": 8992 }, { "epoch": 8.212785388127854, "grad_norm": 0.1291116327047348, "learning_rate": 1.9868087265347542e-06, "loss": 0.0006, "step": 8993 }, { "epoch": 8.213698630136987, "grad_norm": 2.4860758781433105, "learning_rate": 1.9857940131912735e-06, "loss": 0.0097, "step": 8994 }, { "epoch": 8.214611872146119, "grad_norm": 0.05064117908477783, "learning_rate": 1.984779299847793e-06, "loss": 0.0002, "step": 8995 }, { "epoch": 8.215525114155252, "grad_norm": 0.024698380380868912, "learning_rate": 1.9837645865043124e-06, "loss": 0.0001, "step": 8996 }, { "epoch": 8.216438356164383, "grad_norm": 3.696268081665039, "learning_rate": 1.982749873160832e-06, "loss": 0.0147, "step": 8997 }, { "epoch": 8.217351598173517, "grad_norm": 0.43740344047546387, "learning_rate": 1.981735159817352e-06, "loss": 0.0028, "step": 8998 }, { "epoch": 8.218264840182648, "grad_norm": 0.05633733421564102, "learning_rate": 1.9807204464738715e-06, "loss": 0.0003, "step": 8999 }, { "epoch": 8.219178082191782, "grad_norm": 0.8588171005249023, "learning_rate": 1.9797057331303908e-06, "loss": 0.0056, "step": 9000 }, { "epoch": 8.220091324200913, "grad_norm": 0.03786709904670715, "learning_rate": 1.97869101978691e-06, "loss": 0.0002, "step": 9001 }, { "epoch": 8.221004566210045, "grad_norm": 0.12619255483150482, "learning_rate": 1.9776763064434297e-06, "loss": 0.0005, "step": 9002 }, { "epoch": 8.221917808219178, "grad_norm": 0.04165070876479149, "learning_rate": 1.9766615930999494e-06, "loss": 0.0002, "step": 9003 }, { "epoch": 8.22283105022831, "grad_norm": 0.6139034628868103, "learning_rate": 1.975646879756469e-06, "loss": 0.0036, "step": 9004 }, { "epoch": 8.223744292237443, "grad_norm": 2.0991735458374023, "learning_rate": 1.9746321664129884e-06, "loss": 0.0127, "step": 9005 }, { "epoch": 8.224657534246575, "grad_norm": 1.8651373386383057, "learning_rate": 1.973617453069508e-06, "loss": 0.0103, "step": 9006 }, { "epoch": 8.225570776255708, "grad_norm": 4.607046604156494, "learning_rate": 1.9726027397260274e-06, "loss": 0.0239, "step": 9007 }, { "epoch": 8.22648401826484, "grad_norm": 0.6125046014785767, "learning_rate": 1.971588026382547e-06, "loss": 0.0025, "step": 9008 }, { "epoch": 8.227397260273973, "grad_norm": 0.4638424217700958, "learning_rate": 1.9705733130390667e-06, "loss": 0.0032, "step": 9009 }, { "epoch": 8.228310502283104, "grad_norm": 21.441776275634766, "learning_rate": 1.9695585996955864e-06, "loss": 0.1371, "step": 9010 }, { "epoch": 8.229223744292238, "grad_norm": 7.753342151641846, "learning_rate": 1.9685438863521057e-06, "loss": 0.0555, "step": 9011 }, { "epoch": 8.23013698630137, "grad_norm": 0.37193506956100464, "learning_rate": 1.967529173008625e-06, "loss": 0.0019, "step": 9012 }, { "epoch": 8.231050228310503, "grad_norm": 1.5861804485321045, "learning_rate": 1.9665144596651447e-06, "loss": 0.007, "step": 9013 }, { "epoch": 8.231963470319634, "grad_norm": 3.7313995361328125, "learning_rate": 1.9654997463216644e-06, "loss": 0.0214, "step": 9014 }, { "epoch": 8.232876712328768, "grad_norm": 1.257755160331726, "learning_rate": 1.964485032978184e-06, "loss": 0.0071, "step": 9015 }, { "epoch": 8.2337899543379, "grad_norm": 0.08243855088949203, "learning_rate": 1.9634703196347033e-06, "loss": 0.0005, "step": 9016 }, { "epoch": 8.234703196347033, "grad_norm": 0.007462960202246904, "learning_rate": 1.962455606291223e-06, "loss": 0.0001, "step": 9017 }, { "epoch": 8.235616438356164, "grad_norm": 0.12173871695995331, "learning_rate": 1.9614408929477423e-06, "loss": 0.0007, "step": 9018 }, { "epoch": 8.236529680365297, "grad_norm": 0.46243172883987427, "learning_rate": 1.960426179604262e-06, "loss": 0.0021, "step": 9019 }, { "epoch": 8.237442922374429, "grad_norm": 0.00533053744584322, "learning_rate": 1.9594114662607817e-06, "loss": 0.0, "step": 9020 }, { "epoch": 8.238356164383562, "grad_norm": 0.6671556234359741, "learning_rate": 1.958396752917301e-06, "loss": 0.0029, "step": 9021 }, { "epoch": 8.239269406392694, "grad_norm": 0.2801554501056671, "learning_rate": 1.9573820395738206e-06, "loss": 0.0014, "step": 9022 }, { "epoch": 8.240182648401827, "grad_norm": 3.9048972129821777, "learning_rate": 1.95636732623034e-06, "loss": 0.0218, "step": 9023 }, { "epoch": 8.241095890410959, "grad_norm": 2.258540153503418, "learning_rate": 1.9553526128868596e-06, "loss": 0.0094, "step": 9024 }, { "epoch": 8.242009132420092, "grad_norm": 18.490989685058594, "learning_rate": 1.9543378995433793e-06, "loss": 0.0821, "step": 9025 }, { "epoch": 8.242922374429224, "grad_norm": 0.08919913321733475, "learning_rate": 1.9533231861998985e-06, "loss": 0.0006, "step": 9026 }, { "epoch": 8.243835616438357, "grad_norm": 0.0845438614487648, "learning_rate": 1.9523084728564182e-06, "loss": 0.0004, "step": 9027 }, { "epoch": 8.244748858447489, "grad_norm": 0.06941215693950653, "learning_rate": 1.951293759512938e-06, "loss": 0.0002, "step": 9028 }, { "epoch": 8.24566210045662, "grad_norm": 0.20400404930114746, "learning_rate": 1.950279046169457e-06, "loss": 0.0009, "step": 9029 }, { "epoch": 8.246575342465754, "grad_norm": 0.40595176815986633, "learning_rate": 1.949264332825977e-06, "loss": 0.0023, "step": 9030 }, { "epoch": 8.247488584474885, "grad_norm": 4.382347106933594, "learning_rate": 1.948249619482496e-06, "loss": 0.0213, "step": 9031 }, { "epoch": 8.248401826484018, "grad_norm": 0.6769644618034363, "learning_rate": 1.947234906139016e-06, "loss": 0.0036, "step": 9032 }, { "epoch": 8.24931506849315, "grad_norm": 0.011800955981016159, "learning_rate": 1.9462201927955355e-06, "loss": 0.0001, "step": 9033 }, { "epoch": 8.250228310502283, "grad_norm": 0.14085286855697632, "learning_rate": 1.945205479452055e-06, "loss": 0.0007, "step": 9034 }, { "epoch": 8.251141552511415, "grad_norm": 0.087453193962574, "learning_rate": 1.9441907661085745e-06, "loss": 0.0004, "step": 9035 }, { "epoch": 8.252054794520548, "grad_norm": 0.871954619884491, "learning_rate": 1.9431760527650938e-06, "loss": 0.0061, "step": 9036 }, { "epoch": 8.25296803652968, "grad_norm": 0.4497056305408478, "learning_rate": 1.9421613394216135e-06, "loss": 0.0026, "step": 9037 }, { "epoch": 8.253881278538813, "grad_norm": 105.39253234863281, "learning_rate": 1.941146626078133e-06, "loss": 1.2471, "step": 9038 }, { "epoch": 8.254794520547945, "grad_norm": 7.048346042633057, "learning_rate": 1.940131912734653e-06, "loss": 0.0228, "step": 9039 }, { "epoch": 8.255707762557078, "grad_norm": 2.813459873199463, "learning_rate": 1.939117199391172e-06, "loss": 0.0127, "step": 9040 }, { "epoch": 8.25662100456621, "grad_norm": 0.20887391269207, "learning_rate": 1.9381024860476914e-06, "loss": 0.0014, "step": 9041 }, { "epoch": 8.257534246575343, "grad_norm": 177.58641052246094, "learning_rate": 1.937087772704211e-06, "loss": 1.2929, "step": 9042 }, { "epoch": 8.258447488584475, "grad_norm": 2.6546552181243896, "learning_rate": 1.9360730593607308e-06, "loss": 0.0186, "step": 9043 }, { "epoch": 8.259360730593608, "grad_norm": 0.5352148413658142, "learning_rate": 1.9350583460172505e-06, "loss": 0.0029, "step": 9044 }, { "epoch": 8.26027397260274, "grad_norm": 0.8506932258605957, "learning_rate": 1.9340436326737697e-06, "loss": 0.0046, "step": 9045 }, { "epoch": 8.261187214611873, "grad_norm": 2.770678758621216, "learning_rate": 1.9330289193302894e-06, "loss": 0.0187, "step": 9046 }, { "epoch": 8.262100456621004, "grad_norm": 0.02515122853219509, "learning_rate": 1.9320142059868087e-06, "loss": 0.0001, "step": 9047 }, { "epoch": 8.263013698630138, "grad_norm": 0.22212238609790802, "learning_rate": 1.9309994926433284e-06, "loss": 0.0014, "step": 9048 }, { "epoch": 8.26392694063927, "grad_norm": 39.73997116088867, "learning_rate": 1.929984779299848e-06, "loss": 0.2556, "step": 9049 }, { "epoch": 8.264840182648403, "grad_norm": 2.9227466583251953, "learning_rate": 1.9289700659563678e-06, "loss": 0.0132, "step": 9050 }, { "epoch": 8.265753424657534, "grad_norm": 0.022633982822299004, "learning_rate": 1.927955352612887e-06, "loss": 0.0001, "step": 9051 }, { "epoch": 8.266666666666667, "grad_norm": 1.6456602811813354, "learning_rate": 1.9269406392694063e-06, "loss": 0.0062, "step": 9052 }, { "epoch": 8.267579908675799, "grad_norm": 144.61080932617188, "learning_rate": 1.925925925925926e-06, "loss": 7.3343, "step": 9053 }, { "epoch": 8.268493150684932, "grad_norm": 6.513430595397949, "learning_rate": 1.9249112125824457e-06, "loss": 0.0349, "step": 9054 }, { "epoch": 8.269406392694064, "grad_norm": 0.6199832558631897, "learning_rate": 1.9238964992389654e-06, "loss": 0.004, "step": 9055 }, { "epoch": 8.270319634703196, "grad_norm": 0.1382068693637848, "learning_rate": 1.9228817858954846e-06, "loss": 0.0009, "step": 9056 }, { "epoch": 8.271232876712329, "grad_norm": 15.725738525390625, "learning_rate": 1.9218670725520043e-06, "loss": 0.0946, "step": 9057 }, { "epoch": 8.27214611872146, "grad_norm": 0.22072330117225647, "learning_rate": 1.9208523592085236e-06, "loss": 0.001, "step": 9058 }, { "epoch": 8.273059360730594, "grad_norm": 18.227191925048828, "learning_rate": 1.9198376458650433e-06, "loss": 0.0603, "step": 9059 }, { "epoch": 8.273972602739725, "grad_norm": 1.5589618682861328, "learning_rate": 1.918822932521563e-06, "loss": 0.007, "step": 9060 }, { "epoch": 8.274885844748859, "grad_norm": 13.04604434967041, "learning_rate": 1.9178082191780823e-06, "loss": 0.0447, "step": 9061 }, { "epoch": 8.27579908675799, "grad_norm": 0.5016592144966125, "learning_rate": 1.916793505834602e-06, "loss": 0.003, "step": 9062 }, { "epoch": 8.276712328767124, "grad_norm": 0.4742789566516876, "learning_rate": 1.9157787924911212e-06, "loss": 0.0023, "step": 9063 }, { "epoch": 8.277625570776255, "grad_norm": 1.255481243133545, "learning_rate": 1.914764079147641e-06, "loss": 0.0037, "step": 9064 }, { "epoch": 8.278538812785389, "grad_norm": 2.8060147762298584, "learning_rate": 1.9137493658041606e-06, "loss": 0.0155, "step": 9065 }, { "epoch": 8.27945205479452, "grad_norm": 0.3952906131744385, "learning_rate": 1.91273465246068e-06, "loss": 0.0027, "step": 9066 }, { "epoch": 8.280365296803653, "grad_norm": 0.1449020504951477, "learning_rate": 1.9117199391171996e-06, "loss": 0.0007, "step": 9067 }, { "epoch": 8.281278538812785, "grad_norm": 2.684607744216919, "learning_rate": 1.9107052257737193e-06, "loss": 0.0136, "step": 9068 }, { "epoch": 8.282191780821918, "grad_norm": 1.3806324005126953, "learning_rate": 1.9096905124302385e-06, "loss": 0.0088, "step": 9069 }, { "epoch": 8.28310502283105, "grad_norm": 0.3173430860042572, "learning_rate": 1.9086757990867582e-06, "loss": 0.0017, "step": 9070 }, { "epoch": 8.284018264840183, "grad_norm": 2.133092164993286, "learning_rate": 1.9076610857432775e-06, "loss": 0.0114, "step": 9071 }, { "epoch": 8.284931506849315, "grad_norm": 11.960549354553223, "learning_rate": 1.9066463723997972e-06, "loss": 0.074, "step": 9072 }, { "epoch": 8.285844748858448, "grad_norm": 1.8969818353652954, "learning_rate": 1.9056316590563167e-06, "loss": 0.0099, "step": 9073 }, { "epoch": 8.28675799086758, "grad_norm": 8.546274185180664, "learning_rate": 1.9046169457128364e-06, "loss": 0.034, "step": 9074 }, { "epoch": 8.287671232876713, "grad_norm": 17.0416259765625, "learning_rate": 1.9036022323693558e-06, "loss": 0.0907, "step": 9075 }, { "epoch": 8.288584474885845, "grad_norm": 0.030568385496735573, "learning_rate": 1.9025875190258753e-06, "loss": 0.0002, "step": 9076 }, { "epoch": 8.289497716894978, "grad_norm": 0.3761798143386841, "learning_rate": 1.9015728056823948e-06, "loss": 0.0022, "step": 9077 }, { "epoch": 8.29041095890411, "grad_norm": 0.2755671739578247, "learning_rate": 1.9005580923389145e-06, "loss": 0.0014, "step": 9078 }, { "epoch": 8.291324200913243, "grad_norm": 0.3910672962665558, "learning_rate": 1.899543378995434e-06, "loss": 0.0023, "step": 9079 }, { "epoch": 8.292237442922374, "grad_norm": 3.2797155380249023, "learning_rate": 1.8985286656519537e-06, "loss": 0.0196, "step": 9080 }, { "epoch": 8.293150684931508, "grad_norm": 0.07049937546253204, "learning_rate": 1.897513952308473e-06, "loss": 0.0004, "step": 9081 }, { "epoch": 8.29406392694064, "grad_norm": 0.4448471963405609, "learning_rate": 1.8964992389649924e-06, "loss": 0.0019, "step": 9082 }, { "epoch": 8.29497716894977, "grad_norm": 6.884011268615723, "learning_rate": 1.895484525621512e-06, "loss": 0.038, "step": 9083 }, { "epoch": 8.295890410958904, "grad_norm": 0.18424585461616516, "learning_rate": 1.8944698122780316e-06, "loss": 0.001, "step": 9084 }, { "epoch": 8.296803652968036, "grad_norm": 0.023515891283750534, "learning_rate": 1.8934550989345513e-06, "loss": 0.0002, "step": 9085 }, { "epoch": 8.29771689497717, "grad_norm": 0.1348266750574112, "learning_rate": 1.8924403855910705e-06, "loss": 0.0006, "step": 9086 }, { "epoch": 8.2986301369863, "grad_norm": 0.35746991634368896, "learning_rate": 1.8914256722475902e-06, "loss": 0.0027, "step": 9087 }, { "epoch": 8.299543378995434, "grad_norm": 1.8423937559127808, "learning_rate": 1.8904109589041097e-06, "loss": 0.0077, "step": 9088 }, { "epoch": 8.300456621004566, "grad_norm": 0.46338656544685364, "learning_rate": 1.8893962455606294e-06, "loss": 0.0022, "step": 9089 }, { "epoch": 8.301369863013699, "grad_norm": 0.39756664633750916, "learning_rate": 1.8883815322171489e-06, "loss": 0.0016, "step": 9090 }, { "epoch": 8.30228310502283, "grad_norm": 0.6125566959381104, "learning_rate": 1.8873668188736682e-06, "loss": 0.0037, "step": 9091 }, { "epoch": 8.303196347031964, "grad_norm": 0.8956695199012756, "learning_rate": 1.8863521055301878e-06, "loss": 0.006, "step": 9092 }, { "epoch": 8.304109589041095, "grad_norm": 0.03235888108611107, "learning_rate": 1.8853373921867073e-06, "loss": 0.0003, "step": 9093 }, { "epoch": 8.305022831050229, "grad_norm": 0.1764174848794937, "learning_rate": 1.884322678843227e-06, "loss": 0.0011, "step": 9094 }, { "epoch": 8.30593607305936, "grad_norm": 3.997727155685425, "learning_rate": 1.8833079654997465e-06, "loss": 0.0087, "step": 9095 }, { "epoch": 8.306849315068494, "grad_norm": 0.06770595163106918, "learning_rate": 1.882293252156266e-06, "loss": 0.0003, "step": 9096 }, { "epoch": 8.307762557077625, "grad_norm": 0.4398794174194336, "learning_rate": 1.8812785388127855e-06, "loss": 0.0029, "step": 9097 }, { "epoch": 8.308675799086759, "grad_norm": 0.6122574806213379, "learning_rate": 1.8802638254693052e-06, "loss": 0.0042, "step": 9098 }, { "epoch": 8.30958904109589, "grad_norm": 0.4526520073413849, "learning_rate": 1.8792491121258246e-06, "loss": 0.0035, "step": 9099 }, { "epoch": 8.310502283105023, "grad_norm": 0.10073140263557434, "learning_rate": 1.8782343987823443e-06, "loss": 0.0005, "step": 9100 }, { "epoch": 8.311415525114155, "grad_norm": 0.38346776366233826, "learning_rate": 1.8772196854388636e-06, "loss": 0.0022, "step": 9101 }, { "epoch": 8.312328767123288, "grad_norm": 0.051234032958745956, "learning_rate": 1.876204972095383e-06, "loss": 0.0003, "step": 9102 }, { "epoch": 8.31324200913242, "grad_norm": 0.8096350431442261, "learning_rate": 1.8751902587519028e-06, "loss": 0.0048, "step": 9103 }, { "epoch": 8.314155251141553, "grad_norm": 28.602277755737305, "learning_rate": 1.8741755454084222e-06, "loss": 0.1611, "step": 9104 }, { "epoch": 8.315068493150685, "grad_norm": 4.494600296020508, "learning_rate": 1.873160832064942e-06, "loss": 0.0233, "step": 9105 }, { "epoch": 8.315981735159818, "grad_norm": 0.6694157123565674, "learning_rate": 1.8721461187214612e-06, "loss": 0.004, "step": 9106 }, { "epoch": 8.31689497716895, "grad_norm": 2.6227691173553467, "learning_rate": 1.871131405377981e-06, "loss": 0.02, "step": 9107 }, { "epoch": 8.317808219178083, "grad_norm": 0.03698158636689186, "learning_rate": 1.8701166920345004e-06, "loss": 0.0002, "step": 9108 }, { "epoch": 8.318721461187215, "grad_norm": 0.840018093585968, "learning_rate": 1.86910197869102e-06, "loss": 0.0031, "step": 9109 }, { "epoch": 8.319634703196346, "grad_norm": 0.3936038613319397, "learning_rate": 1.8680872653475395e-06, "loss": 0.0022, "step": 9110 }, { "epoch": 8.32054794520548, "grad_norm": 0.6655344367027283, "learning_rate": 1.8670725520040588e-06, "loss": 0.003, "step": 9111 }, { "epoch": 8.321461187214611, "grad_norm": 5.216588973999023, "learning_rate": 1.8660578386605785e-06, "loss": 0.024, "step": 9112 }, { "epoch": 8.322374429223744, "grad_norm": 0.8796241283416748, "learning_rate": 1.865043125317098e-06, "loss": 0.0039, "step": 9113 }, { "epoch": 8.323287671232876, "grad_norm": 0.2167358696460724, "learning_rate": 1.8640284119736177e-06, "loss": 0.001, "step": 9114 }, { "epoch": 8.32420091324201, "grad_norm": 0.0497397780418396, "learning_rate": 1.8630136986301372e-06, "loss": 0.0002, "step": 9115 }, { "epoch": 8.325114155251141, "grad_norm": 0.6091943979263306, "learning_rate": 1.8619989852866566e-06, "loss": 0.0029, "step": 9116 }, { "epoch": 8.326027397260274, "grad_norm": 1.471950650215149, "learning_rate": 1.8609842719431761e-06, "loss": 0.0087, "step": 9117 }, { "epoch": 8.326940639269406, "grad_norm": 0.15780478715896606, "learning_rate": 1.8599695585996958e-06, "loss": 0.0009, "step": 9118 }, { "epoch": 8.32785388127854, "grad_norm": 10.242416381835938, "learning_rate": 1.8589548452562153e-06, "loss": 0.0328, "step": 9119 }, { "epoch": 8.32876712328767, "grad_norm": 16.707487106323242, "learning_rate": 1.857940131912735e-06, "loss": 0.0827, "step": 9120 }, { "epoch": 8.329680365296804, "grad_norm": 0.014839738607406616, "learning_rate": 1.8569254185692543e-06, "loss": 0.0001, "step": 9121 }, { "epoch": 8.330593607305936, "grad_norm": 0.3766792416572571, "learning_rate": 1.8559107052257737e-06, "loss": 0.0018, "step": 9122 }, { "epoch": 8.331506849315069, "grad_norm": 0.09129523485898972, "learning_rate": 1.8548959918822934e-06, "loss": 0.0005, "step": 9123 }, { "epoch": 8.3324200913242, "grad_norm": 0.014130211435258389, "learning_rate": 1.853881278538813e-06, "loss": 0.0001, "step": 9124 }, { "epoch": 8.333333333333334, "grad_norm": 0.015650304034352303, "learning_rate": 1.8528665651953326e-06, "loss": 0.0001, "step": 9125 }, { "epoch": 8.334246575342465, "grad_norm": 0.07969406247138977, "learning_rate": 1.8518518518518519e-06, "loss": 0.0005, "step": 9126 }, { "epoch": 8.335159817351599, "grad_norm": 0.3979538083076477, "learning_rate": 1.8508371385083716e-06, "loss": 0.0025, "step": 9127 }, { "epoch": 8.33607305936073, "grad_norm": 2.12269926071167, "learning_rate": 1.849822425164891e-06, "loss": 0.0107, "step": 9128 }, { "epoch": 8.336986301369864, "grad_norm": 3.231083393096924, "learning_rate": 1.8488077118214107e-06, "loss": 0.0194, "step": 9129 }, { "epoch": 8.337899543378995, "grad_norm": 1.1983546018600464, "learning_rate": 1.8477929984779302e-06, "loss": 0.0059, "step": 9130 }, { "epoch": 8.338812785388129, "grad_norm": 0.6143553256988525, "learning_rate": 1.8467782851344495e-06, "loss": 0.0035, "step": 9131 }, { "epoch": 8.33972602739726, "grad_norm": 40.92414855957031, "learning_rate": 1.8457635717909692e-06, "loss": 0.7002, "step": 9132 }, { "epoch": 8.340639269406394, "grad_norm": 13.138723373413086, "learning_rate": 1.8447488584474887e-06, "loss": 0.063, "step": 9133 }, { "epoch": 8.341552511415525, "grad_norm": 6.386185169219971, "learning_rate": 1.8437341451040083e-06, "loss": 0.0418, "step": 9134 }, { "epoch": 8.342465753424657, "grad_norm": 0.6618178486824036, "learning_rate": 1.8427194317605278e-06, "loss": 0.0034, "step": 9135 }, { "epoch": 8.34337899543379, "grad_norm": 0.3678268492221832, "learning_rate": 1.8417047184170473e-06, "loss": 0.0021, "step": 9136 }, { "epoch": 8.344292237442922, "grad_norm": 0.1366378515958786, "learning_rate": 1.8406900050735668e-06, "loss": 0.0007, "step": 9137 }, { "epoch": 8.345205479452055, "grad_norm": 0.5384151339530945, "learning_rate": 1.8396752917300865e-06, "loss": 0.0032, "step": 9138 }, { "epoch": 8.346118721461186, "grad_norm": 3.9250805377960205, "learning_rate": 1.838660578386606e-06, "loss": 0.0242, "step": 9139 }, { "epoch": 8.34703196347032, "grad_norm": 0.17753006517887115, "learning_rate": 1.8376458650431257e-06, "loss": 0.0009, "step": 9140 }, { "epoch": 8.347945205479451, "grad_norm": 0.8712512850761414, "learning_rate": 1.836631151699645e-06, "loss": 0.0055, "step": 9141 }, { "epoch": 8.348858447488585, "grad_norm": 0.0063033327460289, "learning_rate": 1.8356164383561644e-06, "loss": 0.0, "step": 9142 }, { "epoch": 8.349771689497716, "grad_norm": 22.201120376586914, "learning_rate": 1.834601725012684e-06, "loss": 0.0841, "step": 9143 }, { "epoch": 8.35068493150685, "grad_norm": 2.517026424407959, "learning_rate": 1.8335870116692036e-06, "loss": 0.0146, "step": 9144 }, { "epoch": 8.351598173515981, "grad_norm": 0.3228141963481903, "learning_rate": 1.8325722983257233e-06, "loss": 0.0012, "step": 9145 }, { "epoch": 8.352511415525115, "grad_norm": 19.778343200683594, "learning_rate": 1.8315575849822425e-06, "loss": 0.1182, "step": 9146 }, { "epoch": 8.353424657534246, "grad_norm": 1.0199978351593018, "learning_rate": 1.8305428716387622e-06, "loss": 0.0054, "step": 9147 }, { "epoch": 8.35433789954338, "grad_norm": 0.5990049242973328, "learning_rate": 1.8295281582952817e-06, "loss": 0.0026, "step": 9148 }, { "epoch": 8.355251141552511, "grad_norm": 1.8038235902786255, "learning_rate": 1.8285134449518014e-06, "loss": 0.0103, "step": 9149 }, { "epoch": 8.356164383561644, "grad_norm": 0.024779343977570534, "learning_rate": 1.8274987316083209e-06, "loss": 0.0002, "step": 9150 }, { "epoch": 8.357077625570776, "grad_norm": 14.427268981933594, "learning_rate": 1.8264840182648401e-06, "loss": 0.0604, "step": 9151 }, { "epoch": 8.35799086757991, "grad_norm": 0.004092761315405369, "learning_rate": 1.8254693049213598e-06, "loss": 0.0, "step": 9152 }, { "epoch": 8.35890410958904, "grad_norm": 0.04444742947816849, "learning_rate": 1.8244545915778793e-06, "loss": 0.0004, "step": 9153 }, { "epoch": 8.359817351598174, "grad_norm": 1.713154911994934, "learning_rate": 1.823439878234399e-06, "loss": 0.0071, "step": 9154 }, { "epoch": 8.360730593607306, "grad_norm": 0.16407610476016998, "learning_rate": 1.8224251648909185e-06, "loss": 0.0008, "step": 9155 }, { "epoch": 8.361643835616439, "grad_norm": 1.4960509538650513, "learning_rate": 1.821410451547438e-06, "loss": 0.0082, "step": 9156 }, { "epoch": 8.36255707762557, "grad_norm": 0.9425860643386841, "learning_rate": 1.8203957382039575e-06, "loss": 0.0056, "step": 9157 }, { "epoch": 8.363470319634704, "grad_norm": 0.06727273017168045, "learning_rate": 1.8193810248604771e-06, "loss": 0.0004, "step": 9158 }, { "epoch": 8.364383561643836, "grad_norm": 0.1410605013370514, "learning_rate": 1.8183663115169966e-06, "loss": 0.0007, "step": 9159 }, { "epoch": 8.365296803652967, "grad_norm": 15.807878494262695, "learning_rate": 1.8173515981735163e-06, "loss": 0.098, "step": 9160 }, { "epoch": 8.3662100456621, "grad_norm": 1.4997135400772095, "learning_rate": 1.8163368848300356e-06, "loss": 0.0077, "step": 9161 }, { "epoch": 8.367123287671232, "grad_norm": 7.678688049316406, "learning_rate": 1.815322171486555e-06, "loss": 0.0361, "step": 9162 }, { "epoch": 8.368036529680365, "grad_norm": 0.6672989130020142, "learning_rate": 1.8143074581430748e-06, "loss": 0.0033, "step": 9163 }, { "epoch": 8.368949771689497, "grad_norm": 17.895954132080078, "learning_rate": 1.8132927447995942e-06, "loss": 0.0541, "step": 9164 }, { "epoch": 8.36986301369863, "grad_norm": 0.12182532250881195, "learning_rate": 1.812278031456114e-06, "loss": 0.001, "step": 9165 }, { "epoch": 8.370776255707762, "grad_norm": 3.970588445663452, "learning_rate": 1.8112633181126332e-06, "loss": 0.0372, "step": 9166 }, { "epoch": 8.371689497716895, "grad_norm": 0.02594916895031929, "learning_rate": 1.8102486047691529e-06, "loss": 0.0002, "step": 9167 }, { "epoch": 8.372602739726027, "grad_norm": 0.5078279376029968, "learning_rate": 1.8092338914256724e-06, "loss": 0.0038, "step": 9168 }, { "epoch": 8.37351598173516, "grad_norm": 7.0706305503845215, "learning_rate": 1.808219178082192e-06, "loss": 0.0266, "step": 9169 }, { "epoch": 8.374429223744292, "grad_norm": 10.528640747070312, "learning_rate": 1.8072044647387115e-06, "loss": 0.0672, "step": 9170 }, { "epoch": 8.375342465753425, "grad_norm": 0.8274602890014648, "learning_rate": 1.8061897513952308e-06, "loss": 0.0044, "step": 9171 }, { "epoch": 8.376255707762557, "grad_norm": 10.823884010314941, "learning_rate": 1.8051750380517505e-06, "loss": 0.0516, "step": 9172 }, { "epoch": 8.37716894977169, "grad_norm": 0.3463698923587799, "learning_rate": 1.80416032470827e-06, "loss": 0.0021, "step": 9173 }, { "epoch": 8.378082191780821, "grad_norm": 0.48770391941070557, "learning_rate": 1.8031456113647897e-06, "loss": 0.0026, "step": 9174 }, { "epoch": 8.378995433789955, "grad_norm": 0.1543285846710205, "learning_rate": 1.8021308980213092e-06, "loss": 0.0011, "step": 9175 }, { "epoch": 8.379908675799086, "grad_norm": 0.09247571229934692, "learning_rate": 1.8011161846778286e-06, "loss": 0.0005, "step": 9176 }, { "epoch": 8.38082191780822, "grad_norm": 1.9467861652374268, "learning_rate": 1.8001014713343481e-06, "loss": 0.0139, "step": 9177 }, { "epoch": 8.381735159817351, "grad_norm": 7.354555606842041, "learning_rate": 1.7990867579908678e-06, "loss": 0.0395, "step": 9178 }, { "epoch": 8.382648401826485, "grad_norm": 24.536022186279297, "learning_rate": 1.7980720446473873e-06, "loss": 0.1408, "step": 9179 }, { "epoch": 8.383561643835616, "grad_norm": 0.16182802617549896, "learning_rate": 1.797057331303907e-06, "loss": 0.0008, "step": 9180 }, { "epoch": 8.38447488584475, "grad_norm": 0.7976815700531006, "learning_rate": 1.7960426179604263e-06, "loss": 0.0049, "step": 9181 }, { "epoch": 8.385388127853881, "grad_norm": 1.0297608375549316, "learning_rate": 1.7950279046169457e-06, "loss": 0.006, "step": 9182 }, { "epoch": 8.386301369863014, "grad_norm": 0.0972813069820404, "learning_rate": 1.7940131912734654e-06, "loss": 0.0007, "step": 9183 }, { "epoch": 8.387214611872146, "grad_norm": 1.3982313871383667, "learning_rate": 1.792998477929985e-06, "loss": 0.0069, "step": 9184 }, { "epoch": 8.38812785388128, "grad_norm": 0.8359190225601196, "learning_rate": 1.7919837645865046e-06, "loss": 0.0064, "step": 9185 }, { "epoch": 8.389041095890411, "grad_norm": 0.08699057251214981, "learning_rate": 1.7909690512430239e-06, "loss": 0.0006, "step": 9186 }, { "epoch": 8.389954337899542, "grad_norm": 0.06874442100524902, "learning_rate": 1.7899543378995436e-06, "loss": 0.0005, "step": 9187 }, { "epoch": 8.390867579908676, "grad_norm": 3.3842663764953613, "learning_rate": 1.788939624556063e-06, "loss": 0.0203, "step": 9188 }, { "epoch": 8.391780821917807, "grad_norm": 0.5166445374488831, "learning_rate": 1.7879249112125827e-06, "loss": 0.0031, "step": 9189 }, { "epoch": 8.39269406392694, "grad_norm": 0.8591836094856262, "learning_rate": 1.7869101978691022e-06, "loss": 0.004, "step": 9190 }, { "epoch": 8.393607305936072, "grad_norm": 0.3835631310939789, "learning_rate": 1.7858954845256215e-06, "loss": 0.0016, "step": 9191 }, { "epoch": 8.394520547945206, "grad_norm": 7.865382671356201, "learning_rate": 1.7848807711821412e-06, "loss": 0.0381, "step": 9192 }, { "epoch": 8.395433789954337, "grad_norm": 0.5217825174331665, "learning_rate": 1.7838660578386607e-06, "loss": 0.0033, "step": 9193 }, { "epoch": 8.39634703196347, "grad_norm": 0.07025226950645447, "learning_rate": 1.7828513444951803e-06, "loss": 0.0005, "step": 9194 }, { "epoch": 8.397260273972602, "grad_norm": 2.1087124347686768, "learning_rate": 1.7818366311516998e-06, "loss": 0.0134, "step": 9195 }, { "epoch": 8.398173515981735, "grad_norm": 0.013790725730359554, "learning_rate": 1.7808219178082193e-06, "loss": 0.0001, "step": 9196 }, { "epoch": 8.399086757990867, "grad_norm": 0.05415932089090347, "learning_rate": 1.7798072044647388e-06, "loss": 0.0002, "step": 9197 }, { "epoch": 8.4, "grad_norm": 0.7969627380371094, "learning_rate": 1.7787924911212585e-06, "loss": 0.0041, "step": 9198 }, { "epoch": 8.400913242009132, "grad_norm": 0.3175637722015381, "learning_rate": 1.777777777777778e-06, "loss": 0.0019, "step": 9199 }, { "epoch": 8.401826484018265, "grad_norm": 12.608522415161133, "learning_rate": 1.7767630644342976e-06, "loss": 0.0572, "step": 9200 }, { "epoch": 8.402739726027397, "grad_norm": 6.344038486480713, "learning_rate": 1.775748351090817e-06, "loss": 0.0315, "step": 9201 }, { "epoch": 8.40365296803653, "grad_norm": 0.05102664604783058, "learning_rate": 1.7747336377473364e-06, "loss": 0.0003, "step": 9202 }, { "epoch": 8.404566210045662, "grad_norm": 0.5597420930862427, "learning_rate": 1.773718924403856e-06, "loss": 0.0034, "step": 9203 }, { "epoch": 8.405479452054795, "grad_norm": 0.024468837305903435, "learning_rate": 1.7727042110603756e-06, "loss": 0.0002, "step": 9204 }, { "epoch": 8.406392694063927, "grad_norm": 2.1616435050964355, "learning_rate": 1.7716894977168953e-06, "loss": 0.0124, "step": 9205 }, { "epoch": 8.40730593607306, "grad_norm": 12.036452293395996, "learning_rate": 1.7706747843734145e-06, "loss": 0.0701, "step": 9206 }, { "epoch": 8.408219178082192, "grad_norm": 0.10482850670814514, "learning_rate": 1.7696600710299342e-06, "loss": 0.0006, "step": 9207 }, { "epoch": 8.409132420091325, "grad_norm": 18.286312103271484, "learning_rate": 1.7686453576864537e-06, "loss": 0.0654, "step": 9208 }, { "epoch": 8.410045662100456, "grad_norm": 3.3974170684814453, "learning_rate": 1.7676306443429734e-06, "loss": 0.0214, "step": 9209 }, { "epoch": 8.41095890410959, "grad_norm": 0.20608870685100555, "learning_rate": 1.7666159309994929e-06, "loss": 0.0011, "step": 9210 }, { "epoch": 8.411872146118721, "grad_norm": 0.006976564414799213, "learning_rate": 1.7656012176560121e-06, "loss": 0.0, "step": 9211 }, { "epoch": 8.412785388127855, "grad_norm": 2.5653326511383057, "learning_rate": 1.7645865043125318e-06, "loss": 0.014, "step": 9212 }, { "epoch": 8.413698630136986, "grad_norm": 9.76844596862793, "learning_rate": 1.7635717909690513e-06, "loss": 0.0502, "step": 9213 }, { "epoch": 8.414611872146118, "grad_norm": 0.026861947029829025, "learning_rate": 1.762557077625571e-06, "loss": 0.0002, "step": 9214 }, { "epoch": 8.415525114155251, "grad_norm": 36.99720764160156, "learning_rate": 1.7615423642820905e-06, "loss": 0.4237, "step": 9215 }, { "epoch": 8.416438356164383, "grad_norm": 11.441916465759277, "learning_rate": 1.76052765093861e-06, "loss": 0.055, "step": 9216 }, { "epoch": 8.417351598173516, "grad_norm": 0.17668455839157104, "learning_rate": 1.7595129375951294e-06, "loss": 0.0012, "step": 9217 }, { "epoch": 8.418264840182648, "grad_norm": 2.5116913318634033, "learning_rate": 1.7584982242516491e-06, "loss": 0.0133, "step": 9218 }, { "epoch": 8.419178082191781, "grad_norm": 0.3511427938938141, "learning_rate": 1.7574835109081686e-06, "loss": 0.0019, "step": 9219 }, { "epoch": 8.420091324200913, "grad_norm": 0.22044794261455536, "learning_rate": 1.7564687975646883e-06, "loss": 0.001, "step": 9220 }, { "epoch": 8.421004566210046, "grad_norm": 15.510539054870605, "learning_rate": 1.7554540842212076e-06, "loss": 0.0887, "step": 9221 }, { "epoch": 8.421917808219177, "grad_norm": 3.232815980911255, "learning_rate": 1.754439370877727e-06, "loss": 0.0176, "step": 9222 }, { "epoch": 8.42283105022831, "grad_norm": 1.1459887027740479, "learning_rate": 1.7534246575342468e-06, "loss": 0.0056, "step": 9223 }, { "epoch": 8.423744292237442, "grad_norm": 0.6239019632339478, "learning_rate": 1.7524099441907662e-06, "loss": 0.0025, "step": 9224 }, { "epoch": 8.424657534246576, "grad_norm": 0.494392067193985, "learning_rate": 1.751395230847286e-06, "loss": 0.0019, "step": 9225 }, { "epoch": 8.425570776255707, "grad_norm": 4.568635940551758, "learning_rate": 1.7503805175038052e-06, "loss": 0.0209, "step": 9226 }, { "epoch": 8.42648401826484, "grad_norm": 112.23676300048828, "learning_rate": 1.7493658041603249e-06, "loss": 1.3579, "step": 9227 }, { "epoch": 8.427397260273972, "grad_norm": 3.535353899002075, "learning_rate": 1.7483510908168444e-06, "loss": 0.0183, "step": 9228 }, { "epoch": 8.428310502283106, "grad_norm": 0.0013998337090015411, "learning_rate": 1.747336377473364e-06, "loss": 0.0, "step": 9229 }, { "epoch": 8.429223744292237, "grad_norm": 0.10975460708141327, "learning_rate": 1.7463216641298835e-06, "loss": 0.0006, "step": 9230 }, { "epoch": 8.43013698630137, "grad_norm": 0.03297977149486542, "learning_rate": 1.7453069507864028e-06, "loss": 0.0002, "step": 9231 }, { "epoch": 8.431050228310502, "grad_norm": 1.6338930130004883, "learning_rate": 1.7442922374429225e-06, "loss": 0.0072, "step": 9232 }, { "epoch": 8.431963470319635, "grad_norm": 0.3952277898788452, "learning_rate": 1.743277524099442e-06, "loss": 0.0023, "step": 9233 }, { "epoch": 8.432876712328767, "grad_norm": 0.029764611274003983, "learning_rate": 1.7422628107559617e-06, "loss": 0.0002, "step": 9234 }, { "epoch": 8.4337899543379, "grad_norm": 0.041652482002973557, "learning_rate": 1.7412480974124812e-06, "loss": 0.0002, "step": 9235 }, { "epoch": 8.434703196347032, "grad_norm": 0.0756063312292099, "learning_rate": 1.7402333840690006e-06, "loss": 0.0005, "step": 9236 }, { "epoch": 8.435616438356165, "grad_norm": 22.83749771118164, "learning_rate": 1.7392186707255201e-06, "loss": 0.1013, "step": 9237 }, { "epoch": 8.436529680365297, "grad_norm": 0.6367539763450623, "learning_rate": 1.7382039573820398e-06, "loss": 0.0041, "step": 9238 }, { "epoch": 8.43744292237443, "grad_norm": 0.8018650412559509, "learning_rate": 1.7371892440385593e-06, "loss": 0.0049, "step": 9239 }, { "epoch": 8.438356164383562, "grad_norm": 0.8851044178009033, "learning_rate": 1.736174530695079e-06, "loss": 0.0045, "step": 9240 }, { "epoch": 8.439269406392693, "grad_norm": 0.24843855202198029, "learning_rate": 1.7351598173515982e-06, "loss": 0.0012, "step": 9241 }, { "epoch": 8.440182648401827, "grad_norm": 0.015362671576440334, "learning_rate": 1.7341451040081177e-06, "loss": 0.0001, "step": 9242 }, { "epoch": 8.441095890410958, "grad_norm": 0.18882031738758087, "learning_rate": 1.7331303906646374e-06, "loss": 0.001, "step": 9243 }, { "epoch": 8.442009132420091, "grad_norm": 1.1944091320037842, "learning_rate": 1.732115677321157e-06, "loss": 0.0079, "step": 9244 }, { "epoch": 8.442922374429223, "grad_norm": 31.414268493652344, "learning_rate": 1.7311009639776766e-06, "loss": 0.2304, "step": 9245 }, { "epoch": 8.443835616438356, "grad_norm": 29.69770622253418, "learning_rate": 1.7300862506341959e-06, "loss": 0.1676, "step": 9246 }, { "epoch": 8.444748858447488, "grad_norm": 6.7130889892578125, "learning_rate": 1.7290715372907156e-06, "loss": 0.0349, "step": 9247 }, { "epoch": 8.445662100456621, "grad_norm": 4.257328033447266, "learning_rate": 1.728056823947235e-06, "loss": 0.0225, "step": 9248 }, { "epoch": 8.446575342465753, "grad_norm": 0.12741892039775848, "learning_rate": 1.7270421106037547e-06, "loss": 0.0008, "step": 9249 }, { "epoch": 8.447488584474886, "grad_norm": 1.1139836311340332, "learning_rate": 1.7260273972602742e-06, "loss": 0.0069, "step": 9250 }, { "epoch": 8.448401826484018, "grad_norm": 1.5317515134811401, "learning_rate": 1.7250126839167935e-06, "loss": 0.0081, "step": 9251 }, { "epoch": 8.449315068493151, "grad_norm": 0.029847685247659683, "learning_rate": 1.7239979705733132e-06, "loss": 0.0001, "step": 9252 }, { "epoch": 8.450228310502283, "grad_norm": 3.841481924057007, "learning_rate": 1.7229832572298326e-06, "loss": 0.0284, "step": 9253 }, { "epoch": 8.451141552511416, "grad_norm": 2.4292383193969727, "learning_rate": 1.7219685438863523e-06, "loss": 0.0129, "step": 9254 }, { "epoch": 8.452054794520548, "grad_norm": 24.926143646240234, "learning_rate": 1.7209538305428718e-06, "loss": 0.1324, "step": 9255 }, { "epoch": 8.45296803652968, "grad_norm": 1.517297387123108, "learning_rate": 1.7199391171993913e-06, "loss": 0.0054, "step": 9256 }, { "epoch": 8.453881278538812, "grad_norm": 0.2792603373527527, "learning_rate": 1.7189244038559108e-06, "loss": 0.0018, "step": 9257 }, { "epoch": 8.454794520547946, "grad_norm": 0.6637815237045288, "learning_rate": 1.7179096905124305e-06, "loss": 0.0034, "step": 9258 }, { "epoch": 8.455707762557077, "grad_norm": 2.4148948192596436, "learning_rate": 1.71689497716895e-06, "loss": 0.016, "step": 9259 }, { "epoch": 8.45662100456621, "grad_norm": 0.8128251433372498, "learning_rate": 1.7158802638254696e-06, "loss": 0.0042, "step": 9260 }, { "epoch": 8.457534246575342, "grad_norm": 4.727563381195068, "learning_rate": 1.714865550481989e-06, "loss": 0.0249, "step": 9261 }, { "epoch": 8.458447488584476, "grad_norm": 0.27871832251548767, "learning_rate": 1.7138508371385084e-06, "loss": 0.0017, "step": 9262 }, { "epoch": 8.459360730593607, "grad_norm": 0.3501516580581665, "learning_rate": 1.712836123795028e-06, "loss": 0.0028, "step": 9263 }, { "epoch": 8.46027397260274, "grad_norm": 42.041358947753906, "learning_rate": 1.7118214104515476e-06, "loss": 0.1517, "step": 9264 }, { "epoch": 8.461187214611872, "grad_norm": 13.172552108764648, "learning_rate": 1.7108066971080673e-06, "loss": 0.0643, "step": 9265 }, { "epoch": 8.462100456621005, "grad_norm": 0.830193817615509, "learning_rate": 1.7097919837645865e-06, "loss": 0.0055, "step": 9266 }, { "epoch": 8.463013698630137, "grad_norm": 21.192598342895508, "learning_rate": 1.7087772704211062e-06, "loss": 0.1416, "step": 9267 }, { "epoch": 8.463926940639269, "grad_norm": 0.338199645280838, "learning_rate": 1.7077625570776257e-06, "loss": 0.0017, "step": 9268 }, { "epoch": 8.464840182648402, "grad_norm": 0.6424535512924194, "learning_rate": 1.7067478437341454e-06, "loss": 0.0031, "step": 9269 }, { "epoch": 8.465753424657533, "grad_norm": 1.0065701007843018, "learning_rate": 1.7057331303906649e-06, "loss": 0.0063, "step": 9270 }, { "epoch": 8.466666666666667, "grad_norm": 0.2033459097146988, "learning_rate": 1.7047184170471841e-06, "loss": 0.0011, "step": 9271 }, { "epoch": 8.467579908675798, "grad_norm": 5.004323959350586, "learning_rate": 1.7037037037037038e-06, "loss": 0.0247, "step": 9272 }, { "epoch": 8.468493150684932, "grad_norm": 97.98253631591797, "learning_rate": 1.7026889903602233e-06, "loss": 1.8369, "step": 9273 }, { "epoch": 8.469406392694063, "grad_norm": 2.399918794631958, "learning_rate": 1.701674277016743e-06, "loss": 0.0128, "step": 9274 }, { "epoch": 8.470319634703197, "grad_norm": 0.06100725382566452, "learning_rate": 1.7006595636732625e-06, "loss": 0.0004, "step": 9275 }, { "epoch": 8.471232876712328, "grad_norm": 112.7789535522461, "learning_rate": 1.699644850329782e-06, "loss": 1.4378, "step": 9276 }, { "epoch": 8.472146118721462, "grad_norm": 1.2421528100967407, "learning_rate": 1.6986301369863014e-06, "loss": 0.005, "step": 9277 }, { "epoch": 8.473059360730593, "grad_norm": 0.09812675416469574, "learning_rate": 1.6976154236428211e-06, "loss": 0.0005, "step": 9278 }, { "epoch": 8.473972602739726, "grad_norm": 0.003200126113370061, "learning_rate": 1.6966007102993406e-06, "loss": 0.0, "step": 9279 }, { "epoch": 8.474885844748858, "grad_norm": 0.28330183029174805, "learning_rate": 1.6955859969558603e-06, "loss": 0.0015, "step": 9280 }, { "epoch": 8.475799086757991, "grad_norm": 8.830621719360352, "learning_rate": 1.6945712836123796e-06, "loss": 0.0545, "step": 9281 }, { "epoch": 8.476712328767123, "grad_norm": 0.061187610030174255, "learning_rate": 1.693556570268899e-06, "loss": 0.0003, "step": 9282 }, { "epoch": 8.477625570776256, "grad_norm": 0.4441492259502411, "learning_rate": 1.6925418569254187e-06, "loss": 0.0018, "step": 9283 }, { "epoch": 8.478538812785388, "grad_norm": 1.565403699874878, "learning_rate": 1.6915271435819382e-06, "loss": 0.0103, "step": 9284 }, { "epoch": 8.479452054794521, "grad_norm": 17.70329475402832, "learning_rate": 1.690512430238458e-06, "loss": 0.0644, "step": 9285 }, { "epoch": 8.480365296803653, "grad_norm": 10.311530113220215, "learning_rate": 1.6894977168949772e-06, "loss": 0.0412, "step": 9286 }, { "epoch": 8.481278538812786, "grad_norm": 34.291053771972656, "learning_rate": 1.6884830035514969e-06, "loss": 0.2699, "step": 9287 }, { "epoch": 8.482191780821918, "grad_norm": 2.4524753093719482, "learning_rate": 1.6874682902080164e-06, "loss": 0.0121, "step": 9288 }, { "epoch": 8.483105022831051, "grad_norm": 0.5259686708450317, "learning_rate": 1.686453576864536e-06, "loss": 0.0023, "step": 9289 }, { "epoch": 8.484018264840183, "grad_norm": 0.05868885666131973, "learning_rate": 1.6854388635210555e-06, "loss": 0.0003, "step": 9290 }, { "epoch": 8.484931506849316, "grad_norm": 0.5432873964309692, "learning_rate": 1.6844241501775748e-06, "loss": 0.0027, "step": 9291 }, { "epoch": 8.485844748858447, "grad_norm": 14.030356407165527, "learning_rate": 1.6834094368340945e-06, "loss": 0.088, "step": 9292 }, { "epoch": 8.48675799086758, "grad_norm": 32.08101272583008, "learning_rate": 1.682394723490614e-06, "loss": 0.2132, "step": 9293 }, { "epoch": 8.487671232876712, "grad_norm": 3.0640740394592285, "learning_rate": 1.6813800101471337e-06, "loss": 0.0183, "step": 9294 }, { "epoch": 8.488584474885844, "grad_norm": 1.3202948570251465, "learning_rate": 1.6803652968036531e-06, "loss": 0.0081, "step": 9295 }, { "epoch": 8.489497716894977, "grad_norm": 0.4873354136943817, "learning_rate": 1.6793505834601726e-06, "loss": 0.0034, "step": 9296 }, { "epoch": 8.490410958904109, "grad_norm": 4.98423957824707, "learning_rate": 1.6783358701166921e-06, "loss": 0.0382, "step": 9297 }, { "epoch": 8.491324200913242, "grad_norm": 0.25250938534736633, "learning_rate": 1.6773211567732118e-06, "loss": 0.001, "step": 9298 }, { "epoch": 8.492237442922374, "grad_norm": 0.8383185267448425, "learning_rate": 1.6763064434297313e-06, "loss": 0.0063, "step": 9299 }, { "epoch": 8.493150684931507, "grad_norm": 6.433079719543457, "learning_rate": 1.675291730086251e-06, "loss": 0.0318, "step": 9300 }, { "epoch": 8.494063926940639, "grad_norm": 2.352752685546875, "learning_rate": 1.6742770167427702e-06, "loss": 0.0114, "step": 9301 }, { "epoch": 8.494977168949772, "grad_norm": 1.0725914239883423, "learning_rate": 1.6732623033992897e-06, "loss": 0.0052, "step": 9302 }, { "epoch": 8.495890410958904, "grad_norm": 0.4576856791973114, "learning_rate": 1.6722475900558094e-06, "loss": 0.002, "step": 9303 }, { "epoch": 8.496803652968037, "grad_norm": 1.033736228942871, "learning_rate": 1.671232876712329e-06, "loss": 0.0041, "step": 9304 }, { "epoch": 8.497716894977168, "grad_norm": 85.40516662597656, "learning_rate": 1.6702181633688486e-06, "loss": 1.5557, "step": 9305 }, { "epoch": 8.498630136986302, "grad_norm": 15.129812240600586, "learning_rate": 1.6692034500253679e-06, "loss": 0.0756, "step": 9306 }, { "epoch": 8.499543378995433, "grad_norm": 0.2824270725250244, "learning_rate": 1.6681887366818875e-06, "loss": 0.0014, "step": 9307 }, { "epoch": 8.500456621004567, "grad_norm": 0.018002262338995934, "learning_rate": 1.667174023338407e-06, "loss": 0.0001, "step": 9308 }, { "epoch": 8.501369863013698, "grad_norm": 7.628543376922607, "learning_rate": 1.6661593099949267e-06, "loss": 0.0588, "step": 9309 }, { "epoch": 8.502283105022832, "grad_norm": 1.7282942533493042, "learning_rate": 1.6651445966514462e-06, "loss": 0.0031, "step": 9310 }, { "epoch": 8.503196347031963, "grad_norm": 2.792978048324585, "learning_rate": 1.6641298833079655e-06, "loss": 0.0098, "step": 9311 }, { "epoch": 8.504109589041096, "grad_norm": 0.7188498973846436, "learning_rate": 1.6631151699644852e-06, "loss": 0.0028, "step": 9312 }, { "epoch": 8.505022831050228, "grad_norm": 1.0115221738815308, "learning_rate": 1.6621004566210046e-06, "loss": 0.0061, "step": 9313 }, { "epoch": 8.505936073059361, "grad_norm": 0.06348855048418045, "learning_rate": 1.6610857432775243e-06, "loss": 0.0004, "step": 9314 }, { "epoch": 8.506849315068493, "grad_norm": 4.800436019897461, "learning_rate": 1.6600710299340438e-06, "loss": 0.0246, "step": 9315 }, { "epoch": 8.507762557077626, "grad_norm": 1.7879867553710938, "learning_rate": 1.6590563165905633e-06, "loss": 0.0077, "step": 9316 }, { "epoch": 8.508675799086758, "grad_norm": 27.990371704101562, "learning_rate": 1.6580416032470828e-06, "loss": 0.1157, "step": 9317 }, { "epoch": 8.509589041095891, "grad_norm": 0.6198499202728271, "learning_rate": 1.6570268899036025e-06, "loss": 0.0031, "step": 9318 }, { "epoch": 8.510502283105023, "grad_norm": 1.0467368364334106, "learning_rate": 1.656012176560122e-06, "loss": 0.006, "step": 9319 }, { "epoch": 8.511415525114156, "grad_norm": 0.04913211986422539, "learning_rate": 1.6549974632166416e-06, "loss": 0.0003, "step": 9320 }, { "epoch": 8.512328767123288, "grad_norm": 0.6911165714263916, "learning_rate": 1.653982749873161e-06, "loss": 0.0039, "step": 9321 }, { "epoch": 8.51324200913242, "grad_norm": 0.1167692318558693, "learning_rate": 1.6529680365296804e-06, "loss": 0.0007, "step": 9322 }, { "epoch": 8.514155251141553, "grad_norm": 9.042617797851562, "learning_rate": 1.6519533231862e-06, "loss": 0.076, "step": 9323 }, { "epoch": 8.515068493150684, "grad_norm": 1.0015844106674194, "learning_rate": 1.6509386098427196e-06, "loss": 0.0071, "step": 9324 }, { "epoch": 8.515981735159817, "grad_norm": 49.05752944946289, "learning_rate": 1.6499238964992393e-06, "loss": 0.248, "step": 9325 }, { "epoch": 8.516894977168949, "grad_norm": 1.8741137981414795, "learning_rate": 1.6489091831557585e-06, "loss": 0.0076, "step": 9326 }, { "epoch": 8.517808219178082, "grad_norm": 2.949714183807373, "learning_rate": 1.6478944698122782e-06, "loss": 0.015, "step": 9327 }, { "epoch": 8.518721461187214, "grad_norm": 0.07564451545476913, "learning_rate": 1.6468797564687977e-06, "loss": 0.0003, "step": 9328 }, { "epoch": 8.519634703196347, "grad_norm": 0.6978656649589539, "learning_rate": 1.6458650431253174e-06, "loss": 0.0035, "step": 9329 }, { "epoch": 8.520547945205479, "grad_norm": 0.2280295193195343, "learning_rate": 1.6448503297818369e-06, "loss": 0.0011, "step": 9330 }, { "epoch": 8.521461187214612, "grad_norm": 2.787147283554077, "learning_rate": 1.6438356164383561e-06, "loss": 0.0084, "step": 9331 }, { "epoch": 8.522374429223744, "grad_norm": 0.04562496393918991, "learning_rate": 1.6428209030948758e-06, "loss": 0.0003, "step": 9332 }, { "epoch": 8.523287671232877, "grad_norm": 0.6554849147796631, "learning_rate": 1.6418061897513953e-06, "loss": 0.0048, "step": 9333 }, { "epoch": 8.524200913242009, "grad_norm": 8.833813667297363, "learning_rate": 1.640791476407915e-06, "loss": 0.0338, "step": 9334 }, { "epoch": 8.525114155251142, "grad_norm": 0.7082733511924744, "learning_rate": 1.6397767630644345e-06, "loss": 0.0049, "step": 9335 }, { "epoch": 8.526027397260274, "grad_norm": 2.171592950820923, "learning_rate": 1.638762049720954e-06, "loss": 0.0096, "step": 9336 }, { "epoch": 8.526940639269407, "grad_norm": 2.4724903106689453, "learning_rate": 1.6377473363774734e-06, "loss": 0.0105, "step": 9337 }, { "epoch": 8.527853881278538, "grad_norm": 0.7819638252258301, "learning_rate": 1.6367326230339931e-06, "loss": 0.003, "step": 9338 }, { "epoch": 8.528767123287672, "grad_norm": 4.537454128265381, "learning_rate": 1.6357179096905126e-06, "loss": 0.0262, "step": 9339 }, { "epoch": 8.529680365296803, "grad_norm": 0.18481573462486267, "learning_rate": 1.6347031963470323e-06, "loss": 0.0013, "step": 9340 }, { "epoch": 8.530593607305937, "grad_norm": 0.0993756502866745, "learning_rate": 1.6336884830035516e-06, "loss": 0.0005, "step": 9341 }, { "epoch": 8.531506849315068, "grad_norm": 0.3507384955883026, "learning_rate": 1.632673769660071e-06, "loss": 0.002, "step": 9342 }, { "epoch": 8.532420091324202, "grad_norm": 1.1284230947494507, "learning_rate": 1.6316590563165907e-06, "loss": 0.0069, "step": 9343 }, { "epoch": 8.533333333333333, "grad_norm": 0.7857393026351929, "learning_rate": 1.6306443429731102e-06, "loss": 0.0026, "step": 9344 }, { "epoch": 8.534246575342467, "grad_norm": 3.253368616104126, "learning_rate": 1.62962962962963e-06, "loss": 0.0236, "step": 9345 }, { "epoch": 8.535159817351598, "grad_norm": 0.096982441842556, "learning_rate": 1.6286149162861492e-06, "loss": 0.0005, "step": 9346 }, { "epoch": 8.536073059360731, "grad_norm": 4.406095027923584, "learning_rate": 1.6276002029426689e-06, "loss": 0.0354, "step": 9347 }, { "epoch": 8.536986301369863, "grad_norm": 0.35221338272094727, "learning_rate": 1.6265854895991884e-06, "loss": 0.0025, "step": 9348 }, { "epoch": 8.537899543378995, "grad_norm": 1.4980005025863647, "learning_rate": 1.625570776255708e-06, "loss": 0.0066, "step": 9349 }, { "epoch": 8.538812785388128, "grad_norm": 15.81311321258545, "learning_rate": 1.6245560629122275e-06, "loss": 0.0994, "step": 9350 }, { "epoch": 8.53972602739726, "grad_norm": 0.9614526629447937, "learning_rate": 1.6235413495687468e-06, "loss": 0.0058, "step": 9351 }, { "epoch": 8.540639269406393, "grad_norm": 0.41451606154441833, "learning_rate": 1.6225266362252665e-06, "loss": 0.0028, "step": 9352 }, { "epoch": 8.541552511415524, "grad_norm": 1.7722513675689697, "learning_rate": 1.621511922881786e-06, "loss": 0.0096, "step": 9353 }, { "epoch": 8.542465753424658, "grad_norm": 0.05573045462369919, "learning_rate": 1.6204972095383057e-06, "loss": 0.0003, "step": 9354 }, { "epoch": 8.54337899543379, "grad_norm": 8.531365394592285, "learning_rate": 1.6194824961948251e-06, "loss": 0.0401, "step": 9355 }, { "epoch": 8.544292237442923, "grad_norm": 0.7319523096084595, "learning_rate": 1.6184677828513446e-06, "loss": 0.0033, "step": 9356 }, { "epoch": 8.545205479452054, "grad_norm": 2.9658591747283936, "learning_rate": 1.617453069507864e-06, "loss": 0.0135, "step": 9357 }, { "epoch": 8.546118721461188, "grad_norm": 0.24718841910362244, "learning_rate": 1.6164383561643838e-06, "loss": 0.0007, "step": 9358 }, { "epoch": 8.54703196347032, "grad_norm": 0.11328105628490448, "learning_rate": 1.6154236428209033e-06, "loss": 0.0004, "step": 9359 }, { "epoch": 8.547945205479452, "grad_norm": 1.0161532163619995, "learning_rate": 1.614408929477423e-06, "loss": 0.005, "step": 9360 }, { "epoch": 8.548858447488584, "grad_norm": 1.6476280689239502, "learning_rate": 1.6133942161339422e-06, "loss": 0.0072, "step": 9361 }, { "epoch": 8.549771689497717, "grad_norm": 0.10399926453828812, "learning_rate": 1.6123795027904617e-06, "loss": 0.0004, "step": 9362 }, { "epoch": 8.550684931506849, "grad_norm": 0.5850067138671875, "learning_rate": 1.6113647894469814e-06, "loss": 0.0038, "step": 9363 }, { "epoch": 8.551598173515982, "grad_norm": 0.029643891379237175, "learning_rate": 1.6103500761035009e-06, "loss": 0.0002, "step": 9364 }, { "epoch": 8.552511415525114, "grad_norm": 0.031315095722675323, "learning_rate": 1.6093353627600206e-06, "loss": 0.0002, "step": 9365 }, { "epoch": 8.553424657534247, "grad_norm": 91.3435287475586, "learning_rate": 1.6083206494165399e-06, "loss": 0.5346, "step": 9366 }, { "epoch": 8.554337899543379, "grad_norm": 1.2021714448928833, "learning_rate": 1.6073059360730595e-06, "loss": 0.0059, "step": 9367 }, { "epoch": 8.555251141552512, "grad_norm": 0.32936856150627136, "learning_rate": 1.606291222729579e-06, "loss": 0.0023, "step": 9368 }, { "epoch": 8.556164383561644, "grad_norm": 0.20818400382995605, "learning_rate": 1.6052765093860987e-06, "loss": 0.0008, "step": 9369 }, { "epoch": 8.557077625570777, "grad_norm": 0.14077144861221313, "learning_rate": 1.6042617960426182e-06, "loss": 0.0009, "step": 9370 }, { "epoch": 8.557990867579909, "grad_norm": 0.7633925676345825, "learning_rate": 1.6032470826991375e-06, "loss": 0.004, "step": 9371 }, { "epoch": 8.558904109589042, "grad_norm": 0.10747919976711273, "learning_rate": 1.6022323693556572e-06, "loss": 0.0006, "step": 9372 }, { "epoch": 8.559817351598173, "grad_norm": 4.992493152618408, "learning_rate": 1.6012176560121766e-06, "loss": 0.0296, "step": 9373 }, { "epoch": 8.560730593607307, "grad_norm": 0.03058890998363495, "learning_rate": 1.6002029426686963e-06, "loss": 0.0002, "step": 9374 }, { "epoch": 8.561643835616438, "grad_norm": 0.8239970207214355, "learning_rate": 1.5991882293252158e-06, "loss": 0.0069, "step": 9375 }, { "epoch": 8.56255707762557, "grad_norm": 1.4895943403244019, "learning_rate": 1.5981735159817353e-06, "loss": 0.0077, "step": 9376 }, { "epoch": 8.563470319634703, "grad_norm": 4.046015739440918, "learning_rate": 1.5971588026382548e-06, "loss": 0.0202, "step": 9377 }, { "epoch": 8.564383561643835, "grad_norm": 0.9046680927276611, "learning_rate": 1.5961440892947745e-06, "loss": 0.0049, "step": 9378 }, { "epoch": 8.565296803652968, "grad_norm": 0.32079482078552246, "learning_rate": 1.595129375951294e-06, "loss": 0.0022, "step": 9379 }, { "epoch": 8.5662100456621, "grad_norm": 2.845158338546753, "learning_rate": 1.5941146626078136e-06, "loss": 0.0202, "step": 9380 }, { "epoch": 8.567123287671233, "grad_norm": 39.93363952636719, "learning_rate": 1.593099949264333e-06, "loss": 0.0905, "step": 9381 }, { "epoch": 8.568036529680365, "grad_norm": 3.335416793823242, "learning_rate": 1.5920852359208524e-06, "loss": 0.0174, "step": 9382 }, { "epoch": 8.568949771689498, "grad_norm": 0.3291856348514557, "learning_rate": 1.591070522577372e-06, "loss": 0.0025, "step": 9383 }, { "epoch": 8.56986301369863, "grad_norm": 3.6802496910095215, "learning_rate": 1.5900558092338916e-06, "loss": 0.0179, "step": 9384 }, { "epoch": 8.570776255707763, "grad_norm": 0.5855530500411987, "learning_rate": 1.5890410958904112e-06, "loss": 0.0034, "step": 9385 }, { "epoch": 8.571689497716894, "grad_norm": 3.067700147628784, "learning_rate": 1.5880263825469305e-06, "loss": 0.0139, "step": 9386 }, { "epoch": 8.572602739726028, "grad_norm": 0.06438665837049484, "learning_rate": 1.5870116692034502e-06, "loss": 0.0002, "step": 9387 }, { "epoch": 8.57351598173516, "grad_norm": 0.2267671674489975, "learning_rate": 1.5859969558599697e-06, "loss": 0.0014, "step": 9388 }, { "epoch": 8.574429223744293, "grad_norm": 13.431550025939941, "learning_rate": 1.5849822425164894e-06, "loss": 0.0563, "step": 9389 }, { "epoch": 8.575342465753424, "grad_norm": 0.06073077768087387, "learning_rate": 1.5839675291730089e-06, "loss": 0.0003, "step": 9390 }, { "epoch": 8.576255707762558, "grad_norm": 0.10919960588216782, "learning_rate": 1.5829528158295281e-06, "loss": 0.0004, "step": 9391 }, { "epoch": 8.57716894977169, "grad_norm": 4.639636516571045, "learning_rate": 1.5819381024860478e-06, "loss": 0.0237, "step": 9392 }, { "epoch": 8.578082191780823, "grad_norm": 17.73045539855957, "learning_rate": 1.5809233891425673e-06, "loss": 0.1115, "step": 9393 }, { "epoch": 8.578995433789954, "grad_norm": 0.6195639967918396, "learning_rate": 1.579908675799087e-06, "loss": 0.0037, "step": 9394 }, { "epoch": 8.579908675799087, "grad_norm": 0.07260093837976456, "learning_rate": 1.5788939624556065e-06, "loss": 0.0004, "step": 9395 }, { "epoch": 8.580821917808219, "grad_norm": 0.24947436153888702, "learning_rate": 1.577879249112126e-06, "loss": 0.0015, "step": 9396 }, { "epoch": 8.581735159817352, "grad_norm": 0.007796779740601778, "learning_rate": 1.5768645357686454e-06, "loss": 0.0001, "step": 9397 }, { "epoch": 8.582648401826484, "grad_norm": 1.4771233797073364, "learning_rate": 1.5758498224251651e-06, "loss": 0.0071, "step": 9398 }, { "epoch": 8.583561643835617, "grad_norm": 2.250291347503662, "learning_rate": 1.5748351090816846e-06, "loss": 0.0107, "step": 9399 }, { "epoch": 8.584474885844749, "grad_norm": 0.005554442759603262, "learning_rate": 1.5738203957382043e-06, "loss": 0.0, "step": 9400 }, { "epoch": 8.585388127853882, "grad_norm": 0.6296189427375793, "learning_rate": 1.5728056823947236e-06, "loss": 0.002, "step": 9401 }, { "epoch": 8.586301369863014, "grad_norm": 1.2765424251556396, "learning_rate": 1.571790969051243e-06, "loss": 0.0055, "step": 9402 }, { "epoch": 8.587214611872145, "grad_norm": 0.33227431774139404, "learning_rate": 1.5707762557077627e-06, "loss": 0.0019, "step": 9403 }, { "epoch": 8.588127853881279, "grad_norm": 5.948217868804932, "learning_rate": 1.5697615423642822e-06, "loss": 0.0324, "step": 9404 }, { "epoch": 8.58904109589041, "grad_norm": 7.169498920440674, "learning_rate": 1.568746829020802e-06, "loss": 0.0248, "step": 9405 }, { "epoch": 8.589954337899544, "grad_norm": 25.252775192260742, "learning_rate": 1.5677321156773212e-06, "loss": 0.1735, "step": 9406 }, { "epoch": 8.590867579908675, "grad_norm": 0.5182188153266907, "learning_rate": 1.5667174023338409e-06, "loss": 0.0023, "step": 9407 }, { "epoch": 8.591780821917808, "grad_norm": 0.16975213587284088, "learning_rate": 1.5657026889903604e-06, "loss": 0.001, "step": 9408 }, { "epoch": 8.59269406392694, "grad_norm": 1.6402431726455688, "learning_rate": 1.56468797564688e-06, "loss": 0.0077, "step": 9409 }, { "epoch": 8.593607305936073, "grad_norm": 0.14534126222133636, "learning_rate": 1.5636732623033995e-06, "loss": 0.001, "step": 9410 }, { "epoch": 8.594520547945205, "grad_norm": 3.563138484954834, "learning_rate": 1.5626585489599188e-06, "loss": 0.024, "step": 9411 }, { "epoch": 8.595433789954338, "grad_norm": 0.5628222227096558, "learning_rate": 1.5616438356164385e-06, "loss": 0.0031, "step": 9412 }, { "epoch": 8.59634703196347, "grad_norm": 0.34329232573509216, "learning_rate": 1.560629122272958e-06, "loss": 0.0018, "step": 9413 }, { "epoch": 8.597260273972603, "grad_norm": 4.3805060386657715, "learning_rate": 1.5596144089294777e-06, "loss": 0.0293, "step": 9414 }, { "epoch": 8.598173515981735, "grad_norm": 2.5812525749206543, "learning_rate": 1.5585996955859971e-06, "loss": 0.0116, "step": 9415 }, { "epoch": 8.599086757990868, "grad_norm": 3.837311267852783, "learning_rate": 1.5575849822425166e-06, "loss": 0.0191, "step": 9416 }, { "epoch": 8.6, "grad_norm": 99.72293090820312, "learning_rate": 1.556570268899036e-06, "loss": 0.7053, "step": 9417 }, { "epoch": 8.600913242009133, "grad_norm": 0.12144681066274643, "learning_rate": 1.5555555555555558e-06, "loss": 0.0008, "step": 9418 }, { "epoch": 8.601826484018265, "grad_norm": 7.70545768737793, "learning_rate": 1.5545408422120753e-06, "loss": 0.0339, "step": 9419 }, { "epoch": 8.602739726027398, "grad_norm": 0.005188962444663048, "learning_rate": 1.553526128868595e-06, "loss": 0.0, "step": 9420 }, { "epoch": 8.60365296803653, "grad_norm": 1.7297736406326294, "learning_rate": 1.5525114155251142e-06, "loss": 0.0093, "step": 9421 }, { "epoch": 8.604566210045663, "grad_norm": 0.004346971400082111, "learning_rate": 1.5514967021816337e-06, "loss": 0.0, "step": 9422 }, { "epoch": 8.605479452054794, "grad_norm": 0.5331976413726807, "learning_rate": 1.5504819888381534e-06, "loss": 0.0033, "step": 9423 }, { "epoch": 8.606392694063928, "grad_norm": 0.05881304666399956, "learning_rate": 1.5494672754946729e-06, "loss": 0.0003, "step": 9424 }, { "epoch": 8.60730593607306, "grad_norm": 0.1647057682275772, "learning_rate": 1.5484525621511926e-06, "loss": 0.0011, "step": 9425 }, { "epoch": 8.608219178082193, "grad_norm": 0.03384897857904434, "learning_rate": 1.5474378488077118e-06, "loss": 0.0003, "step": 9426 }, { "epoch": 8.609132420091324, "grad_norm": 0.3480058014392853, "learning_rate": 1.5464231354642315e-06, "loss": 0.0023, "step": 9427 }, { "epoch": 8.610045662100458, "grad_norm": 8.329568862915039, "learning_rate": 1.545408422120751e-06, "loss": 0.0348, "step": 9428 }, { "epoch": 8.610958904109589, "grad_norm": 0.024876827374100685, "learning_rate": 1.5443937087772707e-06, "loss": 0.0002, "step": 9429 }, { "epoch": 8.61187214611872, "grad_norm": 0.6899353265762329, "learning_rate": 1.5433789954337902e-06, "loss": 0.0034, "step": 9430 }, { "epoch": 8.612785388127854, "grad_norm": 0.034917473793029785, "learning_rate": 1.5423642820903095e-06, "loss": 0.0001, "step": 9431 }, { "epoch": 8.613698630136986, "grad_norm": 0.043435707688331604, "learning_rate": 1.5413495687468292e-06, "loss": 0.0003, "step": 9432 }, { "epoch": 8.614611872146119, "grad_norm": 0.05797722935676575, "learning_rate": 1.5403348554033486e-06, "loss": 0.0004, "step": 9433 }, { "epoch": 8.61552511415525, "grad_norm": 0.00471965316683054, "learning_rate": 1.5393201420598683e-06, "loss": 0.0, "step": 9434 }, { "epoch": 8.616438356164384, "grad_norm": 0.03471315652132034, "learning_rate": 1.5383054287163878e-06, "loss": 0.0002, "step": 9435 }, { "epoch": 8.617351598173515, "grad_norm": 1.034301519393921, "learning_rate": 1.5372907153729073e-06, "loss": 0.0059, "step": 9436 }, { "epoch": 8.618264840182649, "grad_norm": 0.0344233363866806, "learning_rate": 1.5362760020294268e-06, "loss": 0.0003, "step": 9437 }, { "epoch": 8.61917808219178, "grad_norm": 5.331840991973877, "learning_rate": 1.5352612886859465e-06, "loss": 0.0267, "step": 9438 }, { "epoch": 8.620091324200914, "grad_norm": 2.8415749073028564, "learning_rate": 1.534246575342466e-06, "loss": 0.0199, "step": 9439 }, { "epoch": 8.621004566210045, "grad_norm": 9.165513038635254, "learning_rate": 1.5332318619989856e-06, "loss": 0.0554, "step": 9440 }, { "epoch": 8.621917808219179, "grad_norm": 103.93152618408203, "learning_rate": 1.532217148655505e-06, "loss": 0.5672, "step": 9441 }, { "epoch": 8.62283105022831, "grad_norm": 0.9283806681632996, "learning_rate": 1.5312024353120244e-06, "loss": 0.0053, "step": 9442 }, { "epoch": 8.623744292237443, "grad_norm": 6.550372123718262, "learning_rate": 1.530187721968544e-06, "loss": 0.0362, "step": 9443 }, { "epoch": 8.624657534246575, "grad_norm": 5.968670845031738, "learning_rate": 1.5291730086250635e-06, "loss": 0.0311, "step": 9444 }, { "epoch": 8.625570776255708, "grad_norm": 2.5352659225463867, "learning_rate": 1.5281582952815832e-06, "loss": 0.0113, "step": 9445 }, { "epoch": 8.62648401826484, "grad_norm": 1.245534062385559, "learning_rate": 1.5271435819381025e-06, "loss": 0.0041, "step": 9446 }, { "epoch": 8.627397260273973, "grad_norm": 0.5516621470451355, "learning_rate": 1.5261288685946222e-06, "loss": 0.0029, "step": 9447 }, { "epoch": 8.628310502283105, "grad_norm": 1.3947902917861938, "learning_rate": 1.5251141552511417e-06, "loss": 0.0082, "step": 9448 }, { "epoch": 8.629223744292238, "grad_norm": 0.17222459614276886, "learning_rate": 1.5240994419076614e-06, "loss": 0.001, "step": 9449 }, { "epoch": 8.63013698630137, "grad_norm": 0.1872043013572693, "learning_rate": 1.5230847285641809e-06, "loss": 0.0012, "step": 9450 }, { "epoch": 8.631050228310503, "grad_norm": 0.4737783372402191, "learning_rate": 1.5220700152207001e-06, "loss": 0.0031, "step": 9451 }, { "epoch": 8.631963470319635, "grad_norm": 0.9182717204093933, "learning_rate": 1.5210553018772198e-06, "loss": 0.0066, "step": 9452 }, { "epoch": 8.632876712328766, "grad_norm": 1.7867404222488403, "learning_rate": 1.5200405885337393e-06, "loss": 0.0112, "step": 9453 }, { "epoch": 8.6337899543379, "grad_norm": 0.17152006924152374, "learning_rate": 1.519025875190259e-06, "loss": 0.001, "step": 9454 }, { "epoch": 8.634703196347033, "grad_norm": 199.22039794921875, "learning_rate": 1.5180111618467785e-06, "loss": 0.084, "step": 9455 }, { "epoch": 8.635616438356164, "grad_norm": 0.040636152029037476, "learning_rate": 1.516996448503298e-06, "loss": 0.0002, "step": 9456 }, { "epoch": 8.636529680365296, "grad_norm": 2.32599139213562, "learning_rate": 1.5159817351598174e-06, "loss": 0.0144, "step": 9457 }, { "epoch": 8.63744292237443, "grad_norm": 0.07789665460586548, "learning_rate": 1.5149670218163371e-06, "loss": 0.0006, "step": 9458 }, { "epoch": 8.638356164383561, "grad_norm": 0.5470951199531555, "learning_rate": 1.5139523084728566e-06, "loss": 0.0039, "step": 9459 }, { "epoch": 8.639269406392694, "grad_norm": 0.06637442857027054, "learning_rate": 1.5129375951293763e-06, "loss": 0.0004, "step": 9460 }, { "epoch": 8.640182648401826, "grad_norm": 0.1582905501127243, "learning_rate": 1.5119228817858956e-06, "loss": 0.0009, "step": 9461 }, { "epoch": 8.64109589041096, "grad_norm": 4.016791820526123, "learning_rate": 1.510908168442415e-06, "loss": 0.0302, "step": 9462 }, { "epoch": 8.64200913242009, "grad_norm": 0.7738503813743591, "learning_rate": 1.5098934550989347e-06, "loss": 0.0044, "step": 9463 }, { "epoch": 8.642922374429224, "grad_norm": 0.06401150673627853, "learning_rate": 1.5088787417554542e-06, "loss": 0.0003, "step": 9464 }, { "epoch": 8.643835616438356, "grad_norm": 0.947420597076416, "learning_rate": 1.507864028411974e-06, "loss": 0.0041, "step": 9465 }, { "epoch": 8.644748858447489, "grad_norm": 0.2551170587539673, "learning_rate": 1.5068493150684932e-06, "loss": 0.0012, "step": 9466 }, { "epoch": 8.64566210045662, "grad_norm": 0.7770528793334961, "learning_rate": 1.5058346017250129e-06, "loss": 0.0042, "step": 9467 }, { "epoch": 8.646575342465754, "grad_norm": 1.3048948049545288, "learning_rate": 1.5048198883815323e-06, "loss": 0.0089, "step": 9468 }, { "epoch": 8.647488584474885, "grad_norm": 0.971876323223114, "learning_rate": 1.503805175038052e-06, "loss": 0.0068, "step": 9469 }, { "epoch": 8.648401826484019, "grad_norm": 3.15602707862854, "learning_rate": 1.5027904616945715e-06, "loss": 0.0141, "step": 9470 }, { "epoch": 8.64931506849315, "grad_norm": 0.023990143090486526, "learning_rate": 1.5017757483510908e-06, "loss": 0.0001, "step": 9471 }, { "epoch": 8.650228310502284, "grad_norm": 9.794625282287598, "learning_rate": 1.5007610350076105e-06, "loss": 0.0643, "step": 9472 }, { "epoch": 8.651141552511415, "grad_norm": 7.312427520751953, "learning_rate": 1.49974632166413e-06, "loss": 0.027, "step": 9473 }, { "epoch": 8.652054794520549, "grad_norm": 108.63482666015625, "learning_rate": 1.4987316083206497e-06, "loss": 0.7945, "step": 9474 }, { "epoch": 8.65296803652968, "grad_norm": 1.6788545846939087, "learning_rate": 1.497716894977169e-06, "loss": 0.0066, "step": 9475 }, { "epoch": 8.653881278538814, "grad_norm": 3.8198304176330566, "learning_rate": 1.4967021816336886e-06, "loss": 0.0061, "step": 9476 }, { "epoch": 8.654794520547945, "grad_norm": 0.5605168342590332, "learning_rate": 1.495687468290208e-06, "loss": 0.005, "step": 9477 }, { "epoch": 8.655707762557078, "grad_norm": 1.0303813219070435, "learning_rate": 1.4946727549467278e-06, "loss": 0.0069, "step": 9478 }, { "epoch": 8.65662100456621, "grad_norm": 0.024443862959742546, "learning_rate": 1.4936580416032473e-06, "loss": 0.0002, "step": 9479 }, { "epoch": 8.657534246575342, "grad_norm": 0.12093818932771683, "learning_rate": 1.4926433282597665e-06, "loss": 0.0007, "step": 9480 }, { "epoch": 8.658447488584475, "grad_norm": 22.245840072631836, "learning_rate": 1.4916286149162862e-06, "loss": 0.2426, "step": 9481 }, { "epoch": 8.659360730593608, "grad_norm": 0.1789640337228775, "learning_rate": 1.4906139015728057e-06, "loss": 0.001, "step": 9482 }, { "epoch": 8.66027397260274, "grad_norm": 143.5428009033203, "learning_rate": 1.4895991882293254e-06, "loss": 1.953, "step": 9483 }, { "epoch": 8.661187214611871, "grad_norm": 0.05670319125056267, "learning_rate": 1.4885844748858449e-06, "loss": 0.0003, "step": 9484 }, { "epoch": 8.662100456621005, "grad_norm": 0.15334652364253998, "learning_rate": 1.4875697615423644e-06, "loss": 0.0007, "step": 9485 }, { "epoch": 8.663013698630136, "grad_norm": 0.17580565810203552, "learning_rate": 1.4865550481988838e-06, "loss": 0.001, "step": 9486 }, { "epoch": 8.66392694063927, "grad_norm": 0.2939460873603821, "learning_rate": 1.4855403348554035e-06, "loss": 0.0016, "step": 9487 }, { "epoch": 8.664840182648401, "grad_norm": 0.41236451268196106, "learning_rate": 1.484525621511923e-06, "loss": 0.002, "step": 9488 }, { "epoch": 8.665753424657535, "grad_norm": 18.895423889160156, "learning_rate": 1.4835109081684427e-06, "loss": 0.1344, "step": 9489 }, { "epoch": 8.666666666666666, "grad_norm": 1.2866530418395996, "learning_rate": 1.482496194824962e-06, "loss": 0.0087, "step": 9490 }, { "epoch": 8.6675799086758, "grad_norm": 5.232554912567139, "learning_rate": 1.4814814814814815e-06, "loss": 0.0304, "step": 9491 }, { "epoch": 8.668493150684931, "grad_norm": 0.05104062706232071, "learning_rate": 1.4804667681380011e-06, "loss": 0.0003, "step": 9492 }, { "epoch": 8.669406392694064, "grad_norm": 0.20485611259937286, "learning_rate": 1.4794520547945206e-06, "loss": 0.0015, "step": 9493 }, { "epoch": 8.670319634703196, "grad_norm": 0.03374442085623741, "learning_rate": 1.4784373414510403e-06, "loss": 0.0001, "step": 9494 }, { "epoch": 8.67123287671233, "grad_norm": 2.7633373737335205, "learning_rate": 1.4774226281075596e-06, "loss": 0.0122, "step": 9495 }, { "epoch": 8.67214611872146, "grad_norm": 1.1054388284683228, "learning_rate": 1.4764079147640793e-06, "loss": 0.0072, "step": 9496 }, { "epoch": 8.673059360730594, "grad_norm": 0.07647157460451126, "learning_rate": 1.4753932014205988e-06, "loss": 0.0004, "step": 9497 }, { "epoch": 8.673972602739726, "grad_norm": 0.31416040658950806, "learning_rate": 1.4743784880771185e-06, "loss": 0.0012, "step": 9498 }, { "epoch": 8.674885844748859, "grad_norm": 0.020309612154960632, "learning_rate": 1.473363774733638e-06, "loss": 0.0001, "step": 9499 }, { "epoch": 8.67579908675799, "grad_norm": 5.285809516906738, "learning_rate": 1.4723490613901572e-06, "loss": 0.0303, "step": 9500 }, { "epoch": 8.676712328767124, "grad_norm": 0.4604354798793793, "learning_rate": 1.4713343480466769e-06, "loss": 0.0032, "step": 9501 }, { "epoch": 8.677625570776256, "grad_norm": 0.20485106110572815, "learning_rate": 1.4703196347031964e-06, "loss": 0.0011, "step": 9502 }, { "epoch": 8.678538812785389, "grad_norm": 0.46639105677604675, "learning_rate": 1.469304921359716e-06, "loss": 0.0025, "step": 9503 }, { "epoch": 8.67945205479452, "grad_norm": 2.897679328918457, "learning_rate": 1.4682902080162355e-06, "loss": 0.0242, "step": 9504 }, { "epoch": 8.680365296803654, "grad_norm": 0.7434309124946594, "learning_rate": 1.467275494672755e-06, "loss": 0.0039, "step": 9505 }, { "epoch": 8.681278538812785, "grad_norm": 0.10140382498502731, "learning_rate": 1.4662607813292745e-06, "loss": 0.0008, "step": 9506 }, { "epoch": 8.682191780821917, "grad_norm": 0.7623801827430725, "learning_rate": 1.4652460679857942e-06, "loss": 0.003, "step": 9507 }, { "epoch": 8.68310502283105, "grad_norm": 0.10574105381965637, "learning_rate": 1.4642313546423137e-06, "loss": 0.0006, "step": 9508 }, { "epoch": 8.684018264840184, "grad_norm": 0.3561458885669708, "learning_rate": 1.4632166412988334e-06, "loss": 0.0018, "step": 9509 }, { "epoch": 8.684931506849315, "grad_norm": 15.229548454284668, "learning_rate": 1.4622019279553526e-06, "loss": 0.0305, "step": 9510 }, { "epoch": 8.685844748858447, "grad_norm": 0.9682297706604004, "learning_rate": 1.4611872146118721e-06, "loss": 0.0063, "step": 9511 }, { "epoch": 8.68675799086758, "grad_norm": 0.5251151919364929, "learning_rate": 1.4601725012683918e-06, "loss": 0.0029, "step": 9512 }, { "epoch": 8.687671232876712, "grad_norm": 0.8982272744178772, "learning_rate": 1.4591577879249113e-06, "loss": 0.0063, "step": 9513 }, { "epoch": 8.688584474885845, "grad_norm": 40.40596389770508, "learning_rate": 1.458143074581431e-06, "loss": 0.1868, "step": 9514 }, { "epoch": 8.689497716894977, "grad_norm": 4.441507816314697, "learning_rate": 1.4571283612379503e-06, "loss": 0.0256, "step": 9515 }, { "epoch": 8.69041095890411, "grad_norm": 1.4216493368148804, "learning_rate": 1.45611364789447e-06, "loss": 0.0074, "step": 9516 }, { "epoch": 8.691324200913241, "grad_norm": 0.22314542531967163, "learning_rate": 1.4550989345509894e-06, "loss": 0.0012, "step": 9517 }, { "epoch": 8.692237442922375, "grad_norm": 0.2428581714630127, "learning_rate": 1.4540842212075091e-06, "loss": 0.0012, "step": 9518 }, { "epoch": 8.693150684931506, "grad_norm": 0.03280951827764511, "learning_rate": 1.4530695078640286e-06, "loss": 0.0002, "step": 9519 }, { "epoch": 8.69406392694064, "grad_norm": 1.2536081075668335, "learning_rate": 1.4520547945205479e-06, "loss": 0.0088, "step": 9520 }, { "epoch": 8.694977168949771, "grad_norm": 0.36569681763648987, "learning_rate": 1.4510400811770676e-06, "loss": 0.002, "step": 9521 }, { "epoch": 8.695890410958905, "grad_norm": 0.9263867735862732, "learning_rate": 1.450025367833587e-06, "loss": 0.0029, "step": 9522 }, { "epoch": 8.696803652968036, "grad_norm": 2.6561312675476074, "learning_rate": 1.4490106544901067e-06, "loss": 0.0161, "step": 9523 }, { "epoch": 8.69771689497717, "grad_norm": 2.436506986618042, "learning_rate": 1.4479959411466262e-06, "loss": 0.0132, "step": 9524 }, { "epoch": 8.698630136986301, "grad_norm": 1.6664372682571411, "learning_rate": 1.4469812278031457e-06, "loss": 0.0063, "step": 9525 }, { "epoch": 8.699543378995434, "grad_norm": 0.06704549491405487, "learning_rate": 1.4459665144596652e-06, "loss": 0.0003, "step": 9526 }, { "epoch": 8.700456621004566, "grad_norm": 1.1742684841156006, "learning_rate": 1.4449518011161849e-06, "loss": 0.0036, "step": 9527 }, { "epoch": 8.7013698630137, "grad_norm": 0.0182922575622797, "learning_rate": 1.4439370877727043e-06, "loss": 0.0001, "step": 9528 }, { "epoch": 8.70228310502283, "grad_norm": 0.12276999652385712, "learning_rate": 1.442922374429224e-06, "loss": 0.0008, "step": 9529 }, { "epoch": 8.703196347031964, "grad_norm": 0.5460125803947449, "learning_rate": 1.4419076610857433e-06, "loss": 0.0041, "step": 9530 }, { "epoch": 8.704109589041096, "grad_norm": 0.021035823971033096, "learning_rate": 1.4408929477422628e-06, "loss": 0.0001, "step": 9531 }, { "epoch": 8.70502283105023, "grad_norm": 0.38509947061538696, "learning_rate": 1.4398782343987825e-06, "loss": 0.0016, "step": 9532 }, { "epoch": 8.70593607305936, "grad_norm": 0.0729527473449707, "learning_rate": 1.438863521055302e-06, "loss": 0.0004, "step": 9533 }, { "epoch": 8.706849315068492, "grad_norm": 0.8117966055870056, "learning_rate": 1.4378488077118216e-06, "loss": 0.0044, "step": 9534 }, { "epoch": 8.707762557077626, "grad_norm": 0.2765999734401703, "learning_rate": 1.436834094368341e-06, "loss": 0.0018, "step": 9535 }, { "epoch": 8.708675799086759, "grad_norm": 1.459311842918396, "learning_rate": 1.4358193810248606e-06, "loss": 0.0071, "step": 9536 }, { "epoch": 8.70958904109589, "grad_norm": 0.9991821050643921, "learning_rate": 1.43480466768138e-06, "loss": 0.0071, "step": 9537 }, { "epoch": 8.710502283105022, "grad_norm": 0.4586906433105469, "learning_rate": 1.4337899543378998e-06, "loss": 0.0034, "step": 9538 }, { "epoch": 8.711415525114155, "grad_norm": 117.1446533203125, "learning_rate": 1.4327752409944193e-06, "loss": 1.4217, "step": 9539 }, { "epoch": 8.712328767123287, "grad_norm": 0.022863009944558144, "learning_rate": 1.4317605276509385e-06, "loss": 0.0001, "step": 9540 }, { "epoch": 8.71324200913242, "grad_norm": 6.466253757476807, "learning_rate": 1.4307458143074582e-06, "loss": 0.0275, "step": 9541 }, { "epoch": 8.714155251141552, "grad_norm": 0.21941685676574707, "learning_rate": 1.4297311009639777e-06, "loss": 0.0012, "step": 9542 }, { "epoch": 8.715068493150685, "grad_norm": 3.7865748405456543, "learning_rate": 1.4287163876204974e-06, "loss": 0.012, "step": 9543 }, { "epoch": 8.715981735159817, "grad_norm": 8.116963386535645, "learning_rate": 1.4277016742770169e-06, "loss": 0.039, "step": 9544 }, { "epoch": 8.71689497716895, "grad_norm": 0.09751231968402863, "learning_rate": 1.4266869609335364e-06, "loss": 0.0005, "step": 9545 }, { "epoch": 8.717808219178082, "grad_norm": 0.3233429789543152, "learning_rate": 1.4256722475900558e-06, "loss": 0.0021, "step": 9546 }, { "epoch": 8.718721461187215, "grad_norm": 0.7598696947097778, "learning_rate": 1.4246575342465755e-06, "loss": 0.0067, "step": 9547 }, { "epoch": 8.719634703196347, "grad_norm": 0.888417661190033, "learning_rate": 1.423642820903095e-06, "loss": 0.0036, "step": 9548 }, { "epoch": 8.72054794520548, "grad_norm": 32.35342025756836, "learning_rate": 1.4226281075596147e-06, "loss": 0.1733, "step": 9549 }, { "epoch": 8.721461187214611, "grad_norm": 9.129500389099121, "learning_rate": 1.421613394216134e-06, "loss": 0.0464, "step": 9550 }, { "epoch": 8.722374429223745, "grad_norm": 23.585952758789062, "learning_rate": 1.4205986808726534e-06, "loss": 0.1129, "step": 9551 }, { "epoch": 8.723287671232876, "grad_norm": 0.26294073462486267, "learning_rate": 1.4195839675291731e-06, "loss": 0.0014, "step": 9552 }, { "epoch": 8.72420091324201, "grad_norm": 4.809146404266357, "learning_rate": 1.4185692541856926e-06, "loss": 0.0252, "step": 9553 }, { "epoch": 8.725114155251141, "grad_norm": 0.27061957120895386, "learning_rate": 1.4175545408422123e-06, "loss": 0.0017, "step": 9554 }, { "epoch": 8.726027397260275, "grad_norm": 0.5492986440658569, "learning_rate": 1.4165398274987316e-06, "loss": 0.0028, "step": 9555 }, { "epoch": 8.726940639269406, "grad_norm": 0.3898020386695862, "learning_rate": 1.4155251141552513e-06, "loss": 0.0036, "step": 9556 }, { "epoch": 8.72785388127854, "grad_norm": 3.3378140926361084, "learning_rate": 1.4145104008117708e-06, "loss": 0.021, "step": 9557 }, { "epoch": 8.728767123287671, "grad_norm": 1.2933413982391357, "learning_rate": 1.4134956874682904e-06, "loss": 0.0065, "step": 9558 }, { "epoch": 8.729680365296804, "grad_norm": 3.157425880432129, "learning_rate": 1.41248097412481e-06, "loss": 0.0249, "step": 9559 }, { "epoch": 8.730593607305936, "grad_norm": 1.1465338468551636, "learning_rate": 1.4114662607813292e-06, "loss": 0.0055, "step": 9560 }, { "epoch": 8.731506849315068, "grad_norm": 0.02821960672736168, "learning_rate": 1.4104515474378489e-06, "loss": 0.0002, "step": 9561 }, { "epoch": 8.732420091324201, "grad_norm": 0.6367610096931458, "learning_rate": 1.4094368340943684e-06, "loss": 0.0033, "step": 9562 }, { "epoch": 8.733333333333333, "grad_norm": 0.3318540155887604, "learning_rate": 1.408422120750888e-06, "loss": 0.002, "step": 9563 }, { "epoch": 8.734246575342466, "grad_norm": 0.16112443804740906, "learning_rate": 1.4074074074074075e-06, "loss": 0.001, "step": 9564 }, { "epoch": 8.735159817351597, "grad_norm": 1.1261696815490723, "learning_rate": 1.406392694063927e-06, "loss": 0.0056, "step": 9565 }, { "epoch": 8.73607305936073, "grad_norm": 0.04807722941040993, "learning_rate": 1.4053779807204465e-06, "loss": 0.0002, "step": 9566 }, { "epoch": 8.736986301369862, "grad_norm": 1.5781506299972534, "learning_rate": 1.4043632673769662e-06, "loss": 0.0071, "step": 9567 }, { "epoch": 8.737899543378996, "grad_norm": 0.03973580151796341, "learning_rate": 1.4033485540334857e-06, "loss": 0.0002, "step": 9568 }, { "epoch": 8.738812785388127, "grad_norm": 1.1398874521255493, "learning_rate": 1.4023338406900054e-06, "loss": 0.0075, "step": 9569 }, { "epoch": 8.73972602739726, "grad_norm": 16.506513595581055, "learning_rate": 1.4013191273465246e-06, "loss": 0.0879, "step": 9570 }, { "epoch": 8.740639269406392, "grad_norm": 19.350374221801758, "learning_rate": 1.4003044140030441e-06, "loss": 0.1182, "step": 9571 }, { "epoch": 8.741552511415525, "grad_norm": 0.022722849622368813, "learning_rate": 1.3992897006595638e-06, "loss": 0.0001, "step": 9572 }, { "epoch": 8.742465753424657, "grad_norm": 1.5861023664474487, "learning_rate": 1.3982749873160833e-06, "loss": 0.0065, "step": 9573 }, { "epoch": 8.74337899543379, "grad_norm": 0.06019074469804764, "learning_rate": 1.397260273972603e-06, "loss": 0.0005, "step": 9574 }, { "epoch": 8.744292237442922, "grad_norm": 23.58765983581543, "learning_rate": 1.3962455606291222e-06, "loss": 0.1346, "step": 9575 }, { "epoch": 8.745205479452055, "grad_norm": 0.25624075531959534, "learning_rate": 1.395230847285642e-06, "loss": 0.0016, "step": 9576 }, { "epoch": 8.746118721461187, "grad_norm": 0.1305563747882843, "learning_rate": 1.3942161339421614e-06, "loss": 0.0008, "step": 9577 }, { "epoch": 8.74703196347032, "grad_norm": 0.012552458792924881, "learning_rate": 1.3932014205986811e-06, "loss": 0.0001, "step": 9578 }, { "epoch": 8.747945205479452, "grad_norm": 0.9103161692619324, "learning_rate": 1.3921867072552006e-06, "loss": 0.0062, "step": 9579 }, { "epoch": 8.748858447488585, "grad_norm": 1.8793379068374634, "learning_rate": 1.3911719939117199e-06, "loss": 0.0108, "step": 9580 }, { "epoch": 8.749771689497717, "grad_norm": 14.563716888427734, "learning_rate": 1.3901572805682396e-06, "loss": 0.0901, "step": 9581 }, { "epoch": 8.75068493150685, "grad_norm": 6.924579620361328, "learning_rate": 1.389142567224759e-06, "loss": 0.0471, "step": 9582 }, { "epoch": 8.751598173515982, "grad_norm": 0.3113303482532501, "learning_rate": 1.3881278538812787e-06, "loss": 0.0013, "step": 9583 }, { "epoch": 8.752511415525115, "grad_norm": 0.2460615336894989, "learning_rate": 1.3871131405377982e-06, "loss": 0.0014, "step": 9584 }, { "epoch": 8.753424657534246, "grad_norm": 0.7378223538398743, "learning_rate": 1.3860984271943177e-06, "loss": 0.004, "step": 9585 }, { "epoch": 8.75433789954338, "grad_norm": 1.136569619178772, "learning_rate": 1.3850837138508372e-06, "loss": 0.0074, "step": 9586 }, { "epoch": 8.755251141552511, "grad_norm": 4.713892459869385, "learning_rate": 1.3840690005073569e-06, "loss": 0.0321, "step": 9587 }, { "epoch": 8.756164383561643, "grad_norm": 0.009785049594938755, "learning_rate": 1.3830542871638763e-06, "loss": 0.0, "step": 9588 }, { "epoch": 8.757077625570776, "grad_norm": 3.04779314994812, "learning_rate": 1.382039573820396e-06, "loss": 0.0209, "step": 9589 }, { "epoch": 8.757990867579908, "grad_norm": 6.8957061767578125, "learning_rate": 1.3810248604769153e-06, "loss": 0.0319, "step": 9590 }, { "epoch": 8.758904109589041, "grad_norm": 4.3830413818359375, "learning_rate": 1.3800101471334348e-06, "loss": 0.0218, "step": 9591 }, { "epoch": 8.759817351598173, "grad_norm": 0.04310114309191704, "learning_rate": 1.3789954337899545e-06, "loss": 0.0002, "step": 9592 }, { "epoch": 8.760730593607306, "grad_norm": 0.3553105592727661, "learning_rate": 1.377980720446474e-06, "loss": 0.0021, "step": 9593 }, { "epoch": 8.761643835616438, "grad_norm": 0.2443685680627823, "learning_rate": 1.3769660071029936e-06, "loss": 0.0014, "step": 9594 }, { "epoch": 8.762557077625571, "grad_norm": 0.27837616205215454, "learning_rate": 1.375951293759513e-06, "loss": 0.0019, "step": 9595 }, { "epoch": 8.763470319634703, "grad_norm": 0.5376394391059875, "learning_rate": 1.3749365804160326e-06, "loss": 0.0028, "step": 9596 }, { "epoch": 8.764383561643836, "grad_norm": 1.77074134349823, "learning_rate": 1.373921867072552e-06, "loss": 0.007, "step": 9597 }, { "epoch": 8.765296803652967, "grad_norm": 6.6541032791137695, "learning_rate": 1.3729071537290718e-06, "loss": 0.0405, "step": 9598 }, { "epoch": 8.7662100456621, "grad_norm": 2.0079708099365234, "learning_rate": 1.3718924403855913e-06, "loss": 0.0123, "step": 9599 }, { "epoch": 8.767123287671232, "grad_norm": 3.044673442840576, "learning_rate": 1.3708777270421105e-06, "loss": 0.0201, "step": 9600 }, { "epoch": 8.768036529680366, "grad_norm": 2.174423933029175, "learning_rate": 1.3698630136986302e-06, "loss": 0.0121, "step": 9601 }, { "epoch": 8.768949771689497, "grad_norm": 0.32737231254577637, "learning_rate": 1.3688483003551497e-06, "loss": 0.0022, "step": 9602 }, { "epoch": 8.76986301369863, "grad_norm": 0.8520104289054871, "learning_rate": 1.3678335870116694e-06, "loss": 0.0062, "step": 9603 }, { "epoch": 8.770776255707762, "grad_norm": 0.5322208404541016, "learning_rate": 1.3668188736681889e-06, "loss": 0.0035, "step": 9604 }, { "epoch": 8.771689497716896, "grad_norm": 0.051015377044677734, "learning_rate": 1.3658041603247084e-06, "loss": 0.0003, "step": 9605 }, { "epoch": 8.772602739726027, "grad_norm": 0.10585712641477585, "learning_rate": 1.3647894469812278e-06, "loss": 0.0007, "step": 9606 }, { "epoch": 8.77351598173516, "grad_norm": 0.3959815502166748, "learning_rate": 1.3637747336377475e-06, "loss": 0.0027, "step": 9607 }, { "epoch": 8.774429223744292, "grad_norm": 0.005063083954155445, "learning_rate": 1.362760020294267e-06, "loss": 0.0, "step": 9608 }, { "epoch": 8.775342465753425, "grad_norm": 0.007106944918632507, "learning_rate": 1.3617453069507867e-06, "loss": 0.0, "step": 9609 }, { "epoch": 8.776255707762557, "grad_norm": 1.0355409383773804, "learning_rate": 1.360730593607306e-06, "loss": 0.0051, "step": 9610 }, { "epoch": 8.77716894977169, "grad_norm": 1.8386783599853516, "learning_rate": 1.3597158802638254e-06, "loss": 0.0107, "step": 9611 }, { "epoch": 8.778082191780822, "grad_norm": 0.4392533004283905, "learning_rate": 1.3587011669203451e-06, "loss": 0.0021, "step": 9612 }, { "epoch": 8.778995433789955, "grad_norm": 0.7075071334838867, "learning_rate": 1.3576864535768646e-06, "loss": 0.0053, "step": 9613 }, { "epoch": 8.779908675799087, "grad_norm": 2.4251043796539307, "learning_rate": 1.3566717402333843e-06, "loss": 0.0062, "step": 9614 }, { "epoch": 8.780821917808218, "grad_norm": 0.43473654985427856, "learning_rate": 1.3556570268899036e-06, "loss": 0.0024, "step": 9615 }, { "epoch": 8.781735159817352, "grad_norm": 2.7619822025299072, "learning_rate": 1.3546423135464233e-06, "loss": 0.0132, "step": 9616 }, { "epoch": 8.782648401826483, "grad_norm": 1.8467897176742554, "learning_rate": 1.3536276002029427e-06, "loss": 0.0075, "step": 9617 }, { "epoch": 8.783561643835617, "grad_norm": 0.11526145786046982, "learning_rate": 1.3526128868594624e-06, "loss": 0.0004, "step": 9618 }, { "epoch": 8.784474885844748, "grad_norm": 0.15778794884681702, "learning_rate": 1.351598173515982e-06, "loss": 0.0007, "step": 9619 }, { "epoch": 8.785388127853881, "grad_norm": 0.16831836104393005, "learning_rate": 1.3505834601725012e-06, "loss": 0.0009, "step": 9620 }, { "epoch": 8.786301369863013, "grad_norm": 0.13209225237369537, "learning_rate": 1.3495687468290209e-06, "loss": 0.0008, "step": 9621 }, { "epoch": 8.787214611872146, "grad_norm": 2.952451467514038, "learning_rate": 1.3485540334855404e-06, "loss": 0.0138, "step": 9622 }, { "epoch": 8.788127853881278, "grad_norm": 1.2832752466201782, "learning_rate": 1.34753932014206e-06, "loss": 0.006, "step": 9623 }, { "epoch": 8.789041095890411, "grad_norm": 0.9671258330345154, "learning_rate": 1.3465246067985795e-06, "loss": 0.0061, "step": 9624 }, { "epoch": 8.789954337899543, "grad_norm": 0.01405975129455328, "learning_rate": 1.345509893455099e-06, "loss": 0.0001, "step": 9625 }, { "epoch": 8.790867579908676, "grad_norm": 0.5041508674621582, "learning_rate": 1.3444951801116185e-06, "loss": 0.0038, "step": 9626 }, { "epoch": 8.791780821917808, "grad_norm": 26.533447265625, "learning_rate": 1.3434804667681382e-06, "loss": 0.3326, "step": 9627 }, { "epoch": 8.792694063926941, "grad_norm": 0.0840916559100151, "learning_rate": 1.3424657534246577e-06, "loss": 0.0005, "step": 9628 }, { "epoch": 8.793607305936073, "grad_norm": 121.97681427001953, "learning_rate": 1.3414510400811774e-06, "loss": 1.5512, "step": 9629 }, { "epoch": 8.794520547945206, "grad_norm": 0.1542767435312271, "learning_rate": 1.3404363267376966e-06, "loss": 0.0011, "step": 9630 }, { "epoch": 8.795433789954338, "grad_norm": 0.5775057077407837, "learning_rate": 1.3394216133942161e-06, "loss": 0.0031, "step": 9631 }, { "epoch": 8.796347031963471, "grad_norm": 1.0554691553115845, "learning_rate": 1.3384069000507358e-06, "loss": 0.0054, "step": 9632 }, { "epoch": 8.797260273972602, "grad_norm": 0.82159423828125, "learning_rate": 1.3373921867072553e-06, "loss": 0.0038, "step": 9633 }, { "epoch": 8.798173515981736, "grad_norm": 2.0667614936828613, "learning_rate": 1.336377473363775e-06, "loss": 0.0102, "step": 9634 }, { "epoch": 8.799086757990867, "grad_norm": 1.6722238063812256, "learning_rate": 1.3353627600202942e-06, "loss": 0.0072, "step": 9635 }, { "epoch": 8.8, "grad_norm": 1.0300134420394897, "learning_rate": 1.334348046676814e-06, "loss": 0.0061, "step": 9636 }, { "epoch": 8.800913242009132, "grad_norm": 9.180614471435547, "learning_rate": 1.3333333333333334e-06, "loss": 0.0569, "step": 9637 }, { "epoch": 8.801826484018266, "grad_norm": 0.16025786101818085, "learning_rate": 1.3323186199898531e-06, "loss": 0.0009, "step": 9638 }, { "epoch": 8.802739726027397, "grad_norm": 3.5208792686462402, "learning_rate": 1.3313039066463726e-06, "loss": 0.0301, "step": 9639 }, { "epoch": 8.80365296803653, "grad_norm": 6.0318379402160645, "learning_rate": 1.3302891933028919e-06, "loss": 0.0282, "step": 9640 }, { "epoch": 8.804566210045662, "grad_norm": 0.1584593653678894, "learning_rate": 1.3292744799594115e-06, "loss": 0.001, "step": 9641 }, { "epoch": 8.805479452054794, "grad_norm": 5.968109130859375, "learning_rate": 1.328259766615931e-06, "loss": 0.0188, "step": 9642 }, { "epoch": 8.806392694063927, "grad_norm": 0.4274747669696808, "learning_rate": 1.3272450532724507e-06, "loss": 0.0027, "step": 9643 }, { "epoch": 8.807305936073059, "grad_norm": 0.3900070786476135, "learning_rate": 1.3262303399289702e-06, "loss": 0.0024, "step": 9644 }, { "epoch": 8.808219178082192, "grad_norm": 32.17335891723633, "learning_rate": 1.3252156265854897e-06, "loss": 0.1781, "step": 9645 }, { "epoch": 8.809132420091323, "grad_norm": 0.12945176661014557, "learning_rate": 1.3242009132420092e-06, "loss": 0.0006, "step": 9646 }, { "epoch": 8.810045662100457, "grad_norm": 0.290078341960907, "learning_rate": 1.3231861998985289e-06, "loss": 0.0018, "step": 9647 }, { "epoch": 8.810958904109588, "grad_norm": 0.2678794860839844, "learning_rate": 1.3221714865550483e-06, "loss": 0.0019, "step": 9648 }, { "epoch": 8.811872146118722, "grad_norm": 0.46717309951782227, "learning_rate": 1.321156773211568e-06, "loss": 0.003, "step": 9649 }, { "epoch": 8.812785388127853, "grad_norm": 0.03855760768055916, "learning_rate": 1.3201420598680873e-06, "loss": 0.0002, "step": 9650 }, { "epoch": 8.813698630136987, "grad_norm": 0.369246244430542, "learning_rate": 1.3191273465246068e-06, "loss": 0.0015, "step": 9651 }, { "epoch": 8.814611872146118, "grad_norm": 1.131170630455017, "learning_rate": 1.3181126331811265e-06, "loss": 0.0063, "step": 9652 }, { "epoch": 8.815525114155252, "grad_norm": 0.6960391402244568, "learning_rate": 1.317097919837646e-06, "loss": 0.0041, "step": 9653 }, { "epoch": 8.816438356164383, "grad_norm": 0.307839035987854, "learning_rate": 1.3160832064941656e-06, "loss": 0.0022, "step": 9654 }, { "epoch": 8.817351598173516, "grad_norm": 20.68906593322754, "learning_rate": 1.315068493150685e-06, "loss": 0.2354, "step": 9655 }, { "epoch": 8.818264840182648, "grad_norm": 4.3276567459106445, "learning_rate": 1.3140537798072046e-06, "loss": 0.0193, "step": 9656 }, { "epoch": 8.819178082191781, "grad_norm": 0.686418354511261, "learning_rate": 1.313039066463724e-06, "loss": 0.0047, "step": 9657 }, { "epoch": 8.820091324200913, "grad_norm": 0.08939044922590256, "learning_rate": 1.3120243531202438e-06, "loss": 0.0004, "step": 9658 }, { "epoch": 8.821004566210046, "grad_norm": 0.13784776628017426, "learning_rate": 1.3110096397767633e-06, "loss": 0.0008, "step": 9659 }, { "epoch": 8.821917808219178, "grad_norm": 0.6518397331237793, "learning_rate": 1.3099949264332825e-06, "loss": 0.0043, "step": 9660 }, { "epoch": 8.822831050228311, "grad_norm": 2.699903964996338, "learning_rate": 1.3089802130898022e-06, "loss": 0.0109, "step": 9661 }, { "epoch": 8.823744292237443, "grad_norm": 89.10050964355469, "learning_rate": 1.3079654997463217e-06, "loss": 0.8522, "step": 9662 }, { "epoch": 8.824657534246576, "grad_norm": 4.080134391784668, "learning_rate": 1.3069507864028414e-06, "loss": 0.0193, "step": 9663 }, { "epoch": 8.825570776255708, "grad_norm": 0.27984243631362915, "learning_rate": 1.3059360730593609e-06, "loss": 0.0018, "step": 9664 }, { "epoch": 8.826484018264841, "grad_norm": 0.2696906328201294, "learning_rate": 1.3049213597158803e-06, "loss": 0.0022, "step": 9665 }, { "epoch": 8.827397260273973, "grad_norm": 1.1128484010696411, "learning_rate": 1.3039066463723998e-06, "loss": 0.0067, "step": 9666 }, { "epoch": 8.828310502283106, "grad_norm": 0.5606022477149963, "learning_rate": 1.3028919330289195e-06, "loss": 0.0031, "step": 9667 }, { "epoch": 8.829223744292237, "grad_norm": 23.132305145263672, "learning_rate": 1.301877219685439e-06, "loss": 0.1166, "step": 9668 }, { "epoch": 8.830136986301369, "grad_norm": 3.183936357498169, "learning_rate": 1.3008625063419587e-06, "loss": 0.0211, "step": 9669 }, { "epoch": 8.831050228310502, "grad_norm": 0.042966328561306, "learning_rate": 1.299847792998478e-06, "loss": 0.0002, "step": 9670 }, { "epoch": 8.831963470319634, "grad_norm": 0.5559861660003662, "learning_rate": 1.2988330796549974e-06, "loss": 0.0023, "step": 9671 }, { "epoch": 8.832876712328767, "grad_norm": 0.31401199102401733, "learning_rate": 1.2978183663115171e-06, "loss": 0.0011, "step": 9672 }, { "epoch": 8.833789954337899, "grad_norm": 3.1722848415374756, "learning_rate": 1.2968036529680366e-06, "loss": 0.0142, "step": 9673 }, { "epoch": 8.834703196347032, "grad_norm": 0.1674976348876953, "learning_rate": 1.2957889396245563e-06, "loss": 0.0009, "step": 9674 }, { "epoch": 8.835616438356164, "grad_norm": 0.8023215532302856, "learning_rate": 1.2947742262810756e-06, "loss": 0.0053, "step": 9675 }, { "epoch": 8.836529680365297, "grad_norm": 11.406435012817383, "learning_rate": 1.2937595129375953e-06, "loss": 0.0482, "step": 9676 }, { "epoch": 8.837442922374429, "grad_norm": 81.61151123046875, "learning_rate": 1.2927447995941147e-06, "loss": 0.282, "step": 9677 }, { "epoch": 8.838356164383562, "grad_norm": 2.0333423614501953, "learning_rate": 1.2917300862506344e-06, "loss": 0.0127, "step": 9678 }, { "epoch": 8.839269406392694, "grad_norm": 72.88948822021484, "learning_rate": 1.290715372907154e-06, "loss": 0.7348, "step": 9679 }, { "epoch": 8.840182648401827, "grad_norm": 3.126664638519287, "learning_rate": 1.2897006595636732e-06, "loss": 0.0128, "step": 9680 }, { "epoch": 8.841095890410958, "grad_norm": 0.281814843416214, "learning_rate": 1.2886859462201929e-06, "loss": 0.0013, "step": 9681 }, { "epoch": 8.842009132420092, "grad_norm": 0.0033842413686215878, "learning_rate": 1.2876712328767124e-06, "loss": 0.0, "step": 9682 }, { "epoch": 8.842922374429223, "grad_norm": 7.9141621589660645, "learning_rate": 1.286656519533232e-06, "loss": 0.06, "step": 9683 }, { "epoch": 8.843835616438357, "grad_norm": 6.6083831787109375, "learning_rate": 1.2856418061897515e-06, "loss": 0.0472, "step": 9684 }, { "epoch": 8.844748858447488, "grad_norm": 11.42835807800293, "learning_rate": 1.284627092846271e-06, "loss": 0.0663, "step": 9685 }, { "epoch": 8.845662100456622, "grad_norm": 0.0894039198756218, "learning_rate": 1.2836123795027905e-06, "loss": 0.0005, "step": 9686 }, { "epoch": 8.846575342465753, "grad_norm": 0.9304203987121582, "learning_rate": 1.2825976661593102e-06, "loss": 0.0068, "step": 9687 }, { "epoch": 8.847488584474887, "grad_norm": 0.6729381680488586, "learning_rate": 1.2815829528158297e-06, "loss": 0.0037, "step": 9688 }, { "epoch": 8.848401826484018, "grad_norm": 4.800541877746582, "learning_rate": 1.2805682394723494e-06, "loss": 0.0257, "step": 9689 }, { "epoch": 8.849315068493151, "grad_norm": 3.1245737075805664, "learning_rate": 1.2795535261288686e-06, "loss": 0.0191, "step": 9690 }, { "epoch": 8.850228310502283, "grad_norm": 1.1268409490585327, "learning_rate": 1.278538812785388e-06, "loss": 0.0076, "step": 9691 }, { "epoch": 8.851141552511416, "grad_norm": 0.23602767288684845, "learning_rate": 1.2775240994419078e-06, "loss": 0.0015, "step": 9692 }, { "epoch": 8.852054794520548, "grad_norm": 0.5383099913597107, "learning_rate": 1.2765093860984273e-06, "loss": 0.003, "step": 9693 }, { "epoch": 8.852968036529681, "grad_norm": 1.443052887916565, "learning_rate": 1.275494672754947e-06, "loss": 0.0042, "step": 9694 }, { "epoch": 8.853881278538813, "grad_norm": 0.05222269147634506, "learning_rate": 1.2744799594114662e-06, "loss": 0.0003, "step": 9695 }, { "epoch": 8.854794520547944, "grad_norm": 0.25890713930130005, "learning_rate": 1.273465246067986e-06, "loss": 0.0015, "step": 9696 }, { "epoch": 8.855707762557078, "grad_norm": 0.1495324969291687, "learning_rate": 1.2724505327245054e-06, "loss": 0.0009, "step": 9697 }, { "epoch": 8.85662100456621, "grad_norm": 0.551293671131134, "learning_rate": 1.271435819381025e-06, "loss": 0.0026, "step": 9698 }, { "epoch": 8.857534246575343, "grad_norm": 8.437406539916992, "learning_rate": 1.2704211060375446e-06, "loss": 0.06, "step": 9699 }, { "epoch": 8.858447488584474, "grad_norm": 0.4098663628101349, "learning_rate": 1.2694063926940639e-06, "loss": 0.0018, "step": 9700 }, { "epoch": 8.859360730593608, "grad_norm": 5.836555480957031, "learning_rate": 1.2683916793505835e-06, "loss": 0.0127, "step": 9701 }, { "epoch": 8.860273972602739, "grad_norm": 0.09483985602855682, "learning_rate": 1.267376966007103e-06, "loss": 0.0005, "step": 9702 }, { "epoch": 8.861187214611872, "grad_norm": 0.7040771842002869, "learning_rate": 1.2663622526636227e-06, "loss": 0.0043, "step": 9703 }, { "epoch": 8.862100456621004, "grad_norm": 0.0330829918384552, "learning_rate": 1.2653475393201422e-06, "loss": 0.0002, "step": 9704 }, { "epoch": 8.863013698630137, "grad_norm": 0.5044549107551575, "learning_rate": 1.2643328259766617e-06, "loss": 0.0024, "step": 9705 }, { "epoch": 8.863926940639269, "grad_norm": 1.7172616720199585, "learning_rate": 1.2633181126331812e-06, "loss": 0.0084, "step": 9706 }, { "epoch": 8.864840182648402, "grad_norm": 1.8229087591171265, "learning_rate": 1.2623033992897008e-06, "loss": 0.005, "step": 9707 }, { "epoch": 8.865753424657534, "grad_norm": 1.4484426975250244, "learning_rate": 1.2612886859462203e-06, "loss": 0.0075, "step": 9708 }, { "epoch": 8.866666666666667, "grad_norm": 1.7659810781478882, "learning_rate": 1.26027397260274e-06, "loss": 0.008, "step": 9709 }, { "epoch": 8.867579908675799, "grad_norm": 5.744355201721191, "learning_rate": 1.2592592592592593e-06, "loss": 0.0284, "step": 9710 }, { "epoch": 8.868493150684932, "grad_norm": 6.994355201721191, "learning_rate": 1.2582445459157788e-06, "loss": 0.0335, "step": 9711 }, { "epoch": 8.869406392694064, "grad_norm": 0.08658496290445328, "learning_rate": 1.2572298325722985e-06, "loss": 0.0006, "step": 9712 }, { "epoch": 8.870319634703197, "grad_norm": 6.977646827697754, "learning_rate": 1.256215119228818e-06, "loss": 0.039, "step": 9713 }, { "epoch": 8.871232876712329, "grad_norm": 2.991029739379883, "learning_rate": 1.2552004058853376e-06, "loss": 0.0169, "step": 9714 }, { "epoch": 8.872146118721462, "grad_norm": 1.5611714124679565, "learning_rate": 1.254185692541857e-06, "loss": 0.0117, "step": 9715 }, { "epoch": 8.873059360730593, "grad_norm": 0.4592558443546295, "learning_rate": 1.2531709791983766e-06, "loss": 0.0026, "step": 9716 }, { "epoch": 8.873972602739727, "grad_norm": 7.506753921508789, "learning_rate": 1.252156265854896e-06, "loss": 0.0433, "step": 9717 }, { "epoch": 8.874885844748858, "grad_norm": 1.0694364309310913, "learning_rate": 1.2511415525114158e-06, "loss": 0.006, "step": 9718 }, { "epoch": 8.875799086757992, "grad_norm": 6.676945209503174, "learning_rate": 1.2501268391679352e-06, "loss": 0.0415, "step": 9719 }, { "epoch": 8.876712328767123, "grad_norm": 0.9313507676124573, "learning_rate": 1.2491121258244547e-06, "loss": 0.0056, "step": 9720 }, { "epoch": 8.877625570776257, "grad_norm": 0.1123519167304039, "learning_rate": 1.2480974124809742e-06, "loss": 0.0007, "step": 9721 }, { "epoch": 8.878538812785388, "grad_norm": 0.017076542600989342, "learning_rate": 1.2470826991374937e-06, "loss": 0.0001, "step": 9722 }, { "epoch": 8.87945205479452, "grad_norm": 1.7007049322128296, "learning_rate": 1.2460679857940134e-06, "loss": 0.0068, "step": 9723 }, { "epoch": 8.880365296803653, "grad_norm": 0.305931955575943, "learning_rate": 1.2450532724505329e-06, "loss": 0.0016, "step": 9724 }, { "epoch": 8.881278538812785, "grad_norm": 0.09277945011854172, "learning_rate": 1.2440385591070523e-06, "loss": 0.0006, "step": 9725 }, { "epoch": 8.882191780821918, "grad_norm": 0.08413972705602646, "learning_rate": 1.2430238457635718e-06, "loss": 0.0005, "step": 9726 }, { "epoch": 8.88310502283105, "grad_norm": 3.0284337997436523, "learning_rate": 1.2420091324200915e-06, "loss": 0.0214, "step": 9727 }, { "epoch": 8.884018264840183, "grad_norm": 1.6615945100784302, "learning_rate": 1.240994419076611e-06, "loss": 0.0106, "step": 9728 }, { "epoch": 8.884931506849314, "grad_norm": 0.018086027354002, "learning_rate": 1.2399797057331305e-06, "loss": 0.0001, "step": 9729 }, { "epoch": 8.885844748858448, "grad_norm": 0.03937605395913124, "learning_rate": 1.23896499238965e-06, "loss": 0.0002, "step": 9730 }, { "epoch": 8.88675799086758, "grad_norm": 3.524517059326172, "learning_rate": 1.2379502790461694e-06, "loss": 0.0131, "step": 9731 }, { "epoch": 8.887671232876713, "grad_norm": 4.795469284057617, "learning_rate": 1.2369355657026891e-06, "loss": 0.0195, "step": 9732 }, { "epoch": 8.888584474885844, "grad_norm": 0.33856719732284546, "learning_rate": 1.2359208523592086e-06, "loss": 0.0018, "step": 9733 }, { "epoch": 8.889497716894978, "grad_norm": 0.8565047383308411, "learning_rate": 1.234906139015728e-06, "loss": 0.0048, "step": 9734 }, { "epoch": 8.89041095890411, "grad_norm": 24.099308013916016, "learning_rate": 1.2338914256722478e-06, "loss": 0.1385, "step": 9735 }, { "epoch": 8.891324200913242, "grad_norm": 10.66356086730957, "learning_rate": 1.2328767123287673e-06, "loss": 0.0592, "step": 9736 }, { "epoch": 8.892237442922374, "grad_norm": 0.1356058269739151, "learning_rate": 1.2318619989852867e-06, "loss": 0.0006, "step": 9737 }, { "epoch": 8.893150684931507, "grad_norm": 0.6035162210464478, "learning_rate": 1.2308472856418064e-06, "loss": 0.0051, "step": 9738 }, { "epoch": 8.894063926940639, "grad_norm": 1.1142547130584717, "learning_rate": 1.2298325722983257e-06, "loss": 0.0039, "step": 9739 }, { "epoch": 8.894977168949772, "grad_norm": 1.1447805166244507, "learning_rate": 1.2288178589548454e-06, "loss": 0.0051, "step": 9740 }, { "epoch": 8.895890410958904, "grad_norm": 0.21658724546432495, "learning_rate": 1.2278031456113649e-06, "loss": 0.0012, "step": 9741 }, { "epoch": 8.896803652968037, "grad_norm": 2.898897886276245, "learning_rate": 1.2267884322678844e-06, "loss": 0.0183, "step": 9742 }, { "epoch": 8.897716894977169, "grad_norm": 0.01990916207432747, "learning_rate": 1.225773718924404e-06, "loss": 0.0001, "step": 9743 }, { "epoch": 8.898630136986302, "grad_norm": 2.3559460639953613, "learning_rate": 1.2247590055809235e-06, "loss": 0.0127, "step": 9744 }, { "epoch": 8.899543378995434, "grad_norm": 1.1757619380950928, "learning_rate": 1.223744292237443e-06, "loss": 0.0059, "step": 9745 }, { "epoch": 8.900456621004567, "grad_norm": 2.8522560596466064, "learning_rate": 1.2227295788939625e-06, "loss": 0.0165, "step": 9746 }, { "epoch": 8.901369863013699, "grad_norm": 0.14688876271247864, "learning_rate": 1.2217148655504822e-06, "loss": 0.001, "step": 9747 }, { "epoch": 8.902283105022832, "grad_norm": 1.309316635131836, "learning_rate": 1.2207001522070017e-06, "loss": 0.0084, "step": 9748 }, { "epoch": 8.903196347031963, "grad_norm": 0.331391841173172, "learning_rate": 1.2196854388635211e-06, "loss": 0.0013, "step": 9749 }, { "epoch": 8.904109589041095, "grad_norm": 0.12114604562520981, "learning_rate": 1.2186707255200406e-06, "loss": 0.0009, "step": 9750 }, { "epoch": 8.905022831050228, "grad_norm": 3.6679296493530273, "learning_rate": 1.21765601217656e-06, "loss": 0.019, "step": 9751 }, { "epoch": 8.90593607305936, "grad_norm": 0.4533635079860687, "learning_rate": 1.2166412988330798e-06, "loss": 0.0023, "step": 9752 }, { "epoch": 8.906849315068493, "grad_norm": 26.22311019897461, "learning_rate": 1.2156265854895993e-06, "loss": 0.1099, "step": 9753 }, { "epoch": 8.907762557077625, "grad_norm": 0.054222483187913895, "learning_rate": 1.2146118721461188e-06, "loss": 0.0002, "step": 9754 }, { "epoch": 8.908675799086758, "grad_norm": 20.46169662475586, "learning_rate": 1.2135971588026384e-06, "loss": 0.0992, "step": 9755 }, { "epoch": 8.90958904109589, "grad_norm": 0.023045243695378304, "learning_rate": 1.212582445459158e-06, "loss": 0.0001, "step": 9756 }, { "epoch": 8.910502283105023, "grad_norm": 0.061940453946590424, "learning_rate": 1.2115677321156774e-06, "loss": 0.0005, "step": 9757 }, { "epoch": 8.911415525114155, "grad_norm": 0.8392744064331055, "learning_rate": 1.210553018772197e-06, "loss": 0.0033, "step": 9758 }, { "epoch": 8.912328767123288, "grad_norm": 124.48957061767578, "learning_rate": 1.2095383054287164e-06, "loss": 1.2272, "step": 9759 }, { "epoch": 8.91324200913242, "grad_norm": 8.594968795776367, "learning_rate": 1.208523592085236e-06, "loss": 0.0467, "step": 9760 }, { "epoch": 8.914155251141553, "grad_norm": 5.278337478637695, "learning_rate": 1.2075088787417555e-06, "loss": 0.0305, "step": 9761 }, { "epoch": 8.915068493150685, "grad_norm": 1.504379391670227, "learning_rate": 1.206494165398275e-06, "loss": 0.006, "step": 9762 }, { "epoch": 8.915981735159818, "grad_norm": 0.20954090356826782, "learning_rate": 1.2054794520547947e-06, "loss": 0.0014, "step": 9763 }, { "epoch": 8.91689497716895, "grad_norm": 0.010218850336968899, "learning_rate": 1.2044647387113142e-06, "loss": 0.0001, "step": 9764 }, { "epoch": 8.917808219178083, "grad_norm": 0.1709793210029602, "learning_rate": 1.2034500253678337e-06, "loss": 0.0012, "step": 9765 }, { "epoch": 8.918721461187214, "grad_norm": 13.290421485900879, "learning_rate": 1.2024353120243532e-06, "loss": 0.0802, "step": 9766 }, { "epoch": 8.919634703196348, "grad_norm": 0.05041903257369995, "learning_rate": 1.2014205986808728e-06, "loss": 0.0003, "step": 9767 }, { "epoch": 8.92054794520548, "grad_norm": 0.22123576700687408, "learning_rate": 1.2004058853373923e-06, "loss": 0.0011, "step": 9768 }, { "epoch": 8.921461187214613, "grad_norm": 0.09592333436012268, "learning_rate": 1.1993911719939118e-06, "loss": 0.0007, "step": 9769 }, { "epoch": 8.922374429223744, "grad_norm": 1.7134406566619873, "learning_rate": 1.1983764586504313e-06, "loss": 0.0066, "step": 9770 }, { "epoch": 8.923287671232877, "grad_norm": 6.868242263793945, "learning_rate": 1.1973617453069508e-06, "loss": 0.0305, "step": 9771 }, { "epoch": 8.924200913242009, "grad_norm": 13.885496139526367, "learning_rate": 1.1963470319634705e-06, "loss": 0.0883, "step": 9772 }, { "epoch": 8.925114155251142, "grad_norm": 24.143814086914062, "learning_rate": 1.19533231861999e-06, "loss": 0.1088, "step": 9773 }, { "epoch": 8.926027397260274, "grad_norm": 0.7731567621231079, "learning_rate": 1.1943176052765094e-06, "loss": 0.0032, "step": 9774 }, { "epoch": 8.926940639269407, "grad_norm": 16.130725860595703, "learning_rate": 1.1933028919330291e-06, "loss": 0.0537, "step": 9775 }, { "epoch": 8.927853881278539, "grad_norm": 2.0101699829101562, "learning_rate": 1.1922881785895486e-06, "loss": 0.0089, "step": 9776 }, { "epoch": 8.92876712328767, "grad_norm": 0.03956279158592224, "learning_rate": 1.191273465246068e-06, "loss": 0.0002, "step": 9777 }, { "epoch": 8.929680365296804, "grad_norm": 3.663029193878174, "learning_rate": 1.1902587519025878e-06, "loss": 0.0228, "step": 9778 }, { "epoch": 8.930593607305935, "grad_norm": 0.03720201924443245, "learning_rate": 1.189244038559107e-06, "loss": 0.0002, "step": 9779 }, { "epoch": 8.931506849315069, "grad_norm": 0.9707365036010742, "learning_rate": 1.1882293252156267e-06, "loss": 0.0082, "step": 9780 }, { "epoch": 8.9324200913242, "grad_norm": 157.880615234375, "learning_rate": 1.1872146118721462e-06, "loss": 1.7181, "step": 9781 }, { "epoch": 8.933333333333334, "grad_norm": 14.25920295715332, "learning_rate": 1.1861998985286657e-06, "loss": 0.0845, "step": 9782 }, { "epoch": 8.934246575342465, "grad_norm": 14.405914306640625, "learning_rate": 1.1851851851851854e-06, "loss": 0.0856, "step": 9783 }, { "epoch": 8.935159817351598, "grad_norm": 29.85616111755371, "learning_rate": 1.1841704718417049e-06, "loss": 0.1728, "step": 9784 }, { "epoch": 8.93607305936073, "grad_norm": 24.026073455810547, "learning_rate": 1.1831557584982243e-06, "loss": 0.2751, "step": 9785 }, { "epoch": 8.936986301369863, "grad_norm": 0.22756685316562653, "learning_rate": 1.1821410451547438e-06, "loss": 0.0014, "step": 9786 }, { "epoch": 8.937899543378995, "grad_norm": 4.254950523376465, "learning_rate": 1.1811263318112635e-06, "loss": 0.0181, "step": 9787 }, { "epoch": 8.938812785388128, "grad_norm": 5.8314924240112305, "learning_rate": 1.180111618467783e-06, "loss": 0.0289, "step": 9788 }, { "epoch": 8.93972602739726, "grad_norm": 0.5981774926185608, "learning_rate": 1.1790969051243025e-06, "loss": 0.0033, "step": 9789 }, { "epoch": 8.940639269406393, "grad_norm": 2.147428512573242, "learning_rate": 1.178082191780822e-06, "loss": 0.0099, "step": 9790 }, { "epoch": 8.941552511415525, "grad_norm": 0.06241823732852936, "learning_rate": 1.1770674784373414e-06, "loss": 0.0002, "step": 9791 }, { "epoch": 8.942465753424658, "grad_norm": 0.4320656359195709, "learning_rate": 1.1760527650938611e-06, "loss": 0.002, "step": 9792 }, { "epoch": 8.94337899543379, "grad_norm": 0.27363765239715576, "learning_rate": 1.1750380517503806e-06, "loss": 0.0013, "step": 9793 }, { "epoch": 8.944292237442923, "grad_norm": 11.6824951171875, "learning_rate": 1.1740233384069e-06, "loss": 0.0424, "step": 9794 }, { "epoch": 8.945205479452055, "grad_norm": 2.3616607189178467, "learning_rate": 1.1730086250634198e-06, "loss": 0.0141, "step": 9795 }, { "epoch": 8.946118721461188, "grad_norm": 14.448777198791504, "learning_rate": 1.171993911719939e-06, "loss": 0.1391, "step": 9796 }, { "epoch": 8.94703196347032, "grad_norm": 0.058572281152009964, "learning_rate": 1.1709791983764587e-06, "loss": 0.0003, "step": 9797 }, { "epoch": 8.947945205479453, "grad_norm": 28.9643611907959, "learning_rate": 1.1699644850329784e-06, "loss": 0.1439, "step": 9798 }, { "epoch": 8.948858447488584, "grad_norm": 0.474638432264328, "learning_rate": 1.1689497716894977e-06, "loss": 0.0028, "step": 9799 }, { "epoch": 8.949771689497716, "grad_norm": 28.300525665283203, "learning_rate": 1.1679350583460174e-06, "loss": 0.1898, "step": 9800 }, { "epoch": 8.95068493150685, "grad_norm": 0.26180392503738403, "learning_rate": 1.1669203450025369e-06, "loss": 0.0017, "step": 9801 }, { "epoch": 8.951598173515983, "grad_norm": 0.7964248657226562, "learning_rate": 1.1659056316590563e-06, "loss": 0.0037, "step": 9802 }, { "epoch": 8.952511415525114, "grad_norm": 0.4387677013874054, "learning_rate": 1.164890918315576e-06, "loss": 0.0028, "step": 9803 }, { "epoch": 8.953424657534246, "grad_norm": 37.2636833190918, "learning_rate": 1.1638762049720955e-06, "loss": 0.2294, "step": 9804 }, { "epoch": 8.954337899543379, "grad_norm": 0.7868101000785828, "learning_rate": 1.162861491628615e-06, "loss": 0.0047, "step": 9805 }, { "epoch": 8.95525114155251, "grad_norm": 1.1431406736373901, "learning_rate": 1.1618467782851345e-06, "loss": 0.0054, "step": 9806 }, { "epoch": 8.956164383561644, "grad_norm": 1.6987078189849854, "learning_rate": 1.1608320649416542e-06, "loss": 0.0084, "step": 9807 }, { "epoch": 8.957077625570776, "grad_norm": 0.4893019199371338, "learning_rate": 1.1598173515981737e-06, "loss": 0.0037, "step": 9808 }, { "epoch": 8.957990867579909, "grad_norm": 0.031245365738868713, "learning_rate": 1.1588026382546931e-06, "loss": 0.0002, "step": 9809 }, { "epoch": 8.95890410958904, "grad_norm": 0.8406923413276672, "learning_rate": 1.1577879249112126e-06, "loss": 0.0041, "step": 9810 }, { "epoch": 8.959817351598174, "grad_norm": 0.19969360530376434, "learning_rate": 1.156773211567732e-06, "loss": 0.001, "step": 9811 }, { "epoch": 8.960730593607305, "grad_norm": 2.461169958114624, "learning_rate": 1.1557584982242518e-06, "loss": 0.0114, "step": 9812 }, { "epoch": 8.961643835616439, "grad_norm": 29.251991271972656, "learning_rate": 1.1547437848807713e-06, "loss": 0.1636, "step": 9813 }, { "epoch": 8.96255707762557, "grad_norm": 0.4899478256702423, "learning_rate": 1.1537290715372907e-06, "loss": 0.0023, "step": 9814 }, { "epoch": 8.963470319634704, "grad_norm": 0.956622302532196, "learning_rate": 1.1527143581938104e-06, "loss": 0.0057, "step": 9815 }, { "epoch": 8.964383561643835, "grad_norm": 0.2886306047439575, "learning_rate": 1.1516996448503297e-06, "loss": 0.0016, "step": 9816 }, { "epoch": 8.965296803652969, "grad_norm": 0.2268408089876175, "learning_rate": 1.1506849315068494e-06, "loss": 0.0014, "step": 9817 }, { "epoch": 8.9662100456621, "grad_norm": 0.28143683075904846, "learning_rate": 1.149670218163369e-06, "loss": 0.0014, "step": 9818 }, { "epoch": 8.967123287671233, "grad_norm": 0.05686238035559654, "learning_rate": 1.1486555048198884e-06, "loss": 0.0004, "step": 9819 }, { "epoch": 8.968036529680365, "grad_norm": 2.746457576751709, "learning_rate": 1.147640791476408e-06, "loss": 0.0174, "step": 9820 }, { "epoch": 8.968949771689498, "grad_norm": 0.9722945690155029, "learning_rate": 1.1466260781329275e-06, "loss": 0.0021, "step": 9821 }, { "epoch": 8.96986301369863, "grad_norm": 0.09890467673540115, "learning_rate": 1.145611364789447e-06, "loss": 0.0006, "step": 9822 }, { "epoch": 8.970776255707763, "grad_norm": 0.11426424235105515, "learning_rate": 1.1445966514459667e-06, "loss": 0.0006, "step": 9823 }, { "epoch": 8.971689497716895, "grad_norm": 0.5031099319458008, "learning_rate": 1.1435819381024862e-06, "loss": 0.0029, "step": 9824 }, { "epoch": 8.972602739726028, "grad_norm": 0.06175339221954346, "learning_rate": 1.1425672247590057e-06, "loss": 0.0004, "step": 9825 }, { "epoch": 8.97351598173516, "grad_norm": 1.9063148498535156, "learning_rate": 1.1415525114155251e-06, "loss": 0.0126, "step": 9826 }, { "epoch": 8.974429223744291, "grad_norm": 3.528712511062622, "learning_rate": 1.1405377980720448e-06, "loss": 0.0239, "step": 9827 }, { "epoch": 8.975342465753425, "grad_norm": 0.05131847411394119, "learning_rate": 1.1395230847285643e-06, "loss": 0.0003, "step": 9828 }, { "epoch": 8.976255707762558, "grad_norm": 0.05807149410247803, "learning_rate": 1.1385083713850838e-06, "loss": 0.0004, "step": 9829 }, { "epoch": 8.97716894977169, "grad_norm": 1.0052212476730347, "learning_rate": 1.1374936580416033e-06, "loss": 0.0077, "step": 9830 }, { "epoch": 8.978082191780821, "grad_norm": 0.08861066401004791, "learning_rate": 1.1364789446981228e-06, "loss": 0.0005, "step": 9831 }, { "epoch": 8.978995433789954, "grad_norm": 5.150921821594238, "learning_rate": 1.1354642313546425e-06, "loss": 0.0318, "step": 9832 }, { "epoch": 8.979908675799086, "grad_norm": 1.364149570465088, "learning_rate": 1.134449518011162e-06, "loss": 0.0082, "step": 9833 }, { "epoch": 8.98082191780822, "grad_norm": 0.19222848117351532, "learning_rate": 1.1334348046676814e-06, "loss": 0.001, "step": 9834 }, { "epoch": 8.981735159817351, "grad_norm": 4.661151885986328, "learning_rate": 1.132420091324201e-06, "loss": 0.0257, "step": 9835 }, { "epoch": 8.982648401826484, "grad_norm": 8.728643417358398, "learning_rate": 1.1314053779807204e-06, "loss": 0.0516, "step": 9836 }, { "epoch": 8.983561643835616, "grad_norm": 74.83451843261719, "learning_rate": 1.13039066463724e-06, "loss": 0.9839, "step": 9837 }, { "epoch": 8.98447488584475, "grad_norm": 0.22413042187690735, "learning_rate": 1.1293759512937598e-06, "loss": 0.0017, "step": 9838 }, { "epoch": 8.98538812785388, "grad_norm": 17.630952835083008, "learning_rate": 1.128361237950279e-06, "loss": 0.1419, "step": 9839 }, { "epoch": 8.986301369863014, "grad_norm": 5.496467113494873, "learning_rate": 1.1273465246067987e-06, "loss": 0.0338, "step": 9840 }, { "epoch": 8.987214611872146, "grad_norm": 0.9291619062423706, "learning_rate": 1.1263318112633182e-06, "loss": 0.0056, "step": 9841 }, { "epoch": 8.988127853881279, "grad_norm": 0.4600101113319397, "learning_rate": 1.1253170979198377e-06, "loss": 0.0031, "step": 9842 }, { "epoch": 8.98904109589041, "grad_norm": 0.09243566542863846, "learning_rate": 1.1243023845763574e-06, "loss": 0.0005, "step": 9843 }, { "epoch": 8.989954337899544, "grad_norm": 0.11683381348848343, "learning_rate": 1.1232876712328769e-06, "loss": 0.0006, "step": 9844 }, { "epoch": 8.990867579908675, "grad_norm": 48.02253341674805, "learning_rate": 1.1222729578893963e-06, "loss": 0.1801, "step": 9845 }, { "epoch": 8.991780821917809, "grad_norm": 2.9042551517486572, "learning_rate": 1.1212582445459158e-06, "loss": 0.015, "step": 9846 }, { "epoch": 8.99269406392694, "grad_norm": 0.018116144463419914, "learning_rate": 1.1202435312024355e-06, "loss": 0.0001, "step": 9847 }, { "epoch": 8.993607305936074, "grad_norm": 0.5926612615585327, "learning_rate": 1.119228817858955e-06, "loss": 0.0022, "step": 9848 }, { "epoch": 8.994520547945205, "grad_norm": 3.0790112018585205, "learning_rate": 1.1182141045154745e-06, "loss": 0.0149, "step": 9849 }, { "epoch": 8.995433789954339, "grad_norm": 10.949533462524414, "learning_rate": 1.117199391171994e-06, "loss": 0.055, "step": 9850 }, { "epoch": 8.99634703196347, "grad_norm": 0.24222388863563538, "learning_rate": 1.1161846778285134e-06, "loss": 0.0014, "step": 9851 }, { "epoch": 8.997260273972604, "grad_norm": 2.154059648513794, "learning_rate": 1.1151699644850331e-06, "loss": 0.0097, "step": 9852 }, { "epoch": 8.998173515981735, "grad_norm": 0.17033296823501587, "learning_rate": 1.1141552511415526e-06, "loss": 0.001, "step": 9853 }, { "epoch": 8.999086757990867, "grad_norm": 9.914690017700195, "learning_rate": 1.113140537798072e-06, "loss": 0.0523, "step": 9854 }, { "epoch": 9.0, "grad_norm": 0.19795550405979156, "learning_rate": 1.1121258244545918e-06, "loss": 0.0012, "step": 9855 }, { "epoch": 9.000913242009132, "grad_norm": 0.7033194303512573, "learning_rate": 1.111111111111111e-06, "loss": 0.0039, "step": 9856 }, { "epoch": 9.001826484018265, "grad_norm": 0.2461850941181183, "learning_rate": 1.1100963977676307e-06, "loss": 0.001, "step": 9857 }, { "epoch": 9.002739726027396, "grad_norm": 0.23342566192150116, "learning_rate": 1.1090816844241504e-06, "loss": 0.0008, "step": 9858 }, { "epoch": 9.00365296803653, "grad_norm": 0.23845785856246948, "learning_rate": 1.1080669710806697e-06, "loss": 0.001, "step": 9859 }, { "epoch": 9.004566210045661, "grad_norm": 0.0790095329284668, "learning_rate": 1.1070522577371894e-06, "loss": 0.0005, "step": 9860 }, { "epoch": 9.005479452054795, "grad_norm": 0.11795629560947418, "learning_rate": 1.1060375443937089e-06, "loss": 0.0007, "step": 9861 }, { "epoch": 9.006392694063926, "grad_norm": 6.167659282684326, "learning_rate": 1.1050228310502283e-06, "loss": 0.0549, "step": 9862 }, { "epoch": 9.00730593607306, "grad_norm": 0.004712546709924936, "learning_rate": 1.104008117706748e-06, "loss": 0.0, "step": 9863 }, { "epoch": 9.008219178082191, "grad_norm": 2.4079270362854004, "learning_rate": 1.1029934043632675e-06, "loss": 0.0118, "step": 9864 }, { "epoch": 9.009132420091325, "grad_norm": 12.668756484985352, "learning_rate": 1.101978691019787e-06, "loss": 0.0925, "step": 9865 }, { "epoch": 9.010045662100456, "grad_norm": 0.0034626435954123735, "learning_rate": 1.1009639776763065e-06, "loss": 0.0, "step": 9866 }, { "epoch": 9.01095890410959, "grad_norm": 18.129928588867188, "learning_rate": 1.099949264332826e-06, "loss": 0.0759, "step": 9867 }, { "epoch": 9.011872146118721, "grad_norm": 7.315100193023682, "learning_rate": 1.0989345509893456e-06, "loss": 0.0568, "step": 9868 }, { "epoch": 9.012785388127854, "grad_norm": 0.8563827276229858, "learning_rate": 1.0979198376458651e-06, "loss": 0.0037, "step": 9869 }, { "epoch": 9.013698630136986, "grad_norm": 0.6419850587844849, "learning_rate": 1.0969051243023846e-06, "loss": 0.0051, "step": 9870 }, { "epoch": 9.01461187214612, "grad_norm": 0.02537386305630207, "learning_rate": 1.095890410958904e-06, "loss": 0.0001, "step": 9871 }, { "epoch": 9.01552511415525, "grad_norm": 1.1544907093048096, "learning_rate": 1.0948756976154238e-06, "loss": 0.0061, "step": 9872 }, { "epoch": 9.016438356164384, "grad_norm": 0.025378525257110596, "learning_rate": 1.0938609842719433e-06, "loss": 0.0002, "step": 9873 }, { "epoch": 9.017351598173516, "grad_norm": 2.827392816543579, "learning_rate": 1.0928462709284627e-06, "loss": 0.0199, "step": 9874 }, { "epoch": 9.018264840182649, "grad_norm": 0.6295041441917419, "learning_rate": 1.0918315575849824e-06, "loss": 0.0029, "step": 9875 }, { "epoch": 9.01917808219178, "grad_norm": 0.047370947897434235, "learning_rate": 1.0908168442415017e-06, "loss": 0.0003, "step": 9876 }, { "epoch": 9.020091324200914, "grad_norm": 0.11303432285785675, "learning_rate": 1.0898021308980214e-06, "loss": 0.0005, "step": 9877 }, { "epoch": 9.021004566210046, "grad_norm": 0.017191411927342415, "learning_rate": 1.088787417554541e-06, "loss": 0.0001, "step": 9878 }, { "epoch": 9.021917808219179, "grad_norm": 3.1137351989746094, "learning_rate": 1.0877727042110604e-06, "loss": 0.0158, "step": 9879 }, { "epoch": 9.02283105022831, "grad_norm": 0.7635066509246826, "learning_rate": 1.08675799086758e-06, "loss": 0.0049, "step": 9880 }, { "epoch": 9.023744292237444, "grad_norm": 0.13522183895111084, "learning_rate": 1.0857432775240995e-06, "loss": 0.0011, "step": 9881 }, { "epoch": 9.024657534246575, "grad_norm": 0.07950534671545029, "learning_rate": 1.084728564180619e-06, "loss": 0.0005, "step": 9882 }, { "epoch": 9.025570776255707, "grad_norm": 2.5981945991516113, "learning_rate": 1.0837138508371387e-06, "loss": 0.0143, "step": 9883 }, { "epoch": 9.02648401826484, "grad_norm": 1.0299817323684692, "learning_rate": 1.0826991374936582e-06, "loss": 0.0043, "step": 9884 }, { "epoch": 9.027397260273972, "grad_norm": 0.348317950963974, "learning_rate": 1.0816844241501777e-06, "loss": 0.002, "step": 9885 }, { "epoch": 9.028310502283105, "grad_norm": 2.888322114944458, "learning_rate": 1.0806697108066971e-06, "loss": 0.0201, "step": 9886 }, { "epoch": 9.029223744292237, "grad_norm": 0.14222215116024017, "learning_rate": 1.0796549974632166e-06, "loss": 0.0007, "step": 9887 }, { "epoch": 9.03013698630137, "grad_norm": 62.58492660522461, "learning_rate": 1.0786402841197363e-06, "loss": 0.3474, "step": 9888 }, { "epoch": 9.031050228310502, "grad_norm": 6.529070854187012, "learning_rate": 1.0776255707762558e-06, "loss": 0.0472, "step": 9889 }, { "epoch": 9.031963470319635, "grad_norm": 0.6986181139945984, "learning_rate": 1.0766108574327753e-06, "loss": 0.003, "step": 9890 }, { "epoch": 9.032876712328767, "grad_norm": 0.2958110272884369, "learning_rate": 1.0755961440892948e-06, "loss": 0.0019, "step": 9891 }, { "epoch": 9.0337899543379, "grad_norm": 0.22048281133174896, "learning_rate": 1.0745814307458144e-06, "loss": 0.0013, "step": 9892 }, { "epoch": 9.034703196347031, "grad_norm": 3.088435649871826, "learning_rate": 1.073566717402334e-06, "loss": 0.0139, "step": 9893 }, { "epoch": 9.035616438356165, "grad_norm": 0.07251207530498505, "learning_rate": 1.0725520040588534e-06, "loss": 0.0004, "step": 9894 }, { "epoch": 9.036529680365296, "grad_norm": 1.7766127586364746, "learning_rate": 1.071537290715373e-06, "loss": 0.0139, "step": 9895 }, { "epoch": 9.03744292237443, "grad_norm": 7.084333896636963, "learning_rate": 1.0705225773718924e-06, "loss": 0.0489, "step": 9896 }, { "epoch": 9.038356164383561, "grad_norm": 0.010226517915725708, "learning_rate": 1.069507864028412e-06, "loss": 0.0001, "step": 9897 }, { "epoch": 9.039269406392695, "grad_norm": 32.82107925415039, "learning_rate": 1.0684931506849318e-06, "loss": 0.2811, "step": 9898 }, { "epoch": 9.040182648401826, "grad_norm": 0.20060104131698608, "learning_rate": 1.067478437341451e-06, "loss": 0.0011, "step": 9899 }, { "epoch": 9.04109589041096, "grad_norm": 0.41396239399909973, "learning_rate": 1.0664637239979707e-06, "loss": 0.0021, "step": 9900 }, { "epoch": 9.042009132420091, "grad_norm": 0.09719087928533554, "learning_rate": 1.0654490106544902e-06, "loss": 0.0005, "step": 9901 }, { "epoch": 9.042922374429224, "grad_norm": 0.16014406085014343, "learning_rate": 1.0644342973110097e-06, "loss": 0.0011, "step": 9902 }, { "epoch": 9.043835616438356, "grad_norm": 0.1729292869567871, "learning_rate": 1.0634195839675294e-06, "loss": 0.0012, "step": 9903 }, { "epoch": 9.04474885844749, "grad_norm": 0.7863045334815979, "learning_rate": 1.0624048706240488e-06, "loss": 0.0044, "step": 9904 }, { "epoch": 9.045662100456621, "grad_norm": 0.26320981979370117, "learning_rate": 1.0613901572805683e-06, "loss": 0.0011, "step": 9905 }, { "epoch": 9.046575342465754, "grad_norm": 0.8390126824378967, "learning_rate": 1.0603754439370878e-06, "loss": 0.006, "step": 9906 }, { "epoch": 9.047488584474886, "grad_norm": 0.8700091242790222, "learning_rate": 1.0593607305936073e-06, "loss": 0.0057, "step": 9907 }, { "epoch": 9.04840182648402, "grad_norm": 0.17124205827713013, "learning_rate": 1.058346017250127e-06, "loss": 0.0009, "step": 9908 }, { "epoch": 9.04931506849315, "grad_norm": 0.1567191779613495, "learning_rate": 1.0573313039066465e-06, "loss": 0.0013, "step": 9909 }, { "epoch": 9.050228310502282, "grad_norm": 9.241109848022461, "learning_rate": 1.056316590563166e-06, "loss": 0.0741, "step": 9910 }, { "epoch": 9.051141552511416, "grad_norm": 0.7069733738899231, "learning_rate": 1.0553018772196854e-06, "loss": 0.0051, "step": 9911 }, { "epoch": 9.052054794520547, "grad_norm": 7.405593395233154, "learning_rate": 1.0542871638762051e-06, "loss": 0.0356, "step": 9912 }, { "epoch": 9.05296803652968, "grad_norm": 16.51383399963379, "learning_rate": 1.0532724505327246e-06, "loss": 0.0333, "step": 9913 }, { "epoch": 9.053881278538812, "grad_norm": 0.27233386039733887, "learning_rate": 1.052257737189244e-06, "loss": 0.0013, "step": 9914 }, { "epoch": 9.054794520547945, "grad_norm": 14.045557022094727, "learning_rate": 1.0512430238457638e-06, "loss": 0.1041, "step": 9915 }, { "epoch": 9.055707762557077, "grad_norm": 0.0030863203573971987, "learning_rate": 1.050228310502283e-06, "loss": 0.0, "step": 9916 }, { "epoch": 9.05662100456621, "grad_norm": 1.5302170515060425, "learning_rate": 1.0492135971588027e-06, "loss": 0.0079, "step": 9917 }, { "epoch": 9.057534246575342, "grad_norm": 0.19608023762702942, "learning_rate": 1.0481988838153222e-06, "loss": 0.001, "step": 9918 }, { "epoch": 9.058447488584475, "grad_norm": 0.28417614102363586, "learning_rate": 1.0471841704718417e-06, "loss": 0.0018, "step": 9919 }, { "epoch": 9.059360730593607, "grad_norm": 0.0017333902651444077, "learning_rate": 1.0461694571283614e-06, "loss": 0.0, "step": 9920 }, { "epoch": 9.06027397260274, "grad_norm": 2.863309621810913, "learning_rate": 1.0451547437848809e-06, "loss": 0.0132, "step": 9921 }, { "epoch": 9.061187214611872, "grad_norm": 0.7820354104042053, "learning_rate": 1.0441400304414003e-06, "loss": 0.0025, "step": 9922 }, { "epoch": 9.062100456621005, "grad_norm": 0.30107325315475464, "learning_rate": 1.04312531709792e-06, "loss": 0.0012, "step": 9923 }, { "epoch": 9.063013698630137, "grad_norm": 1.6293275356292725, "learning_rate": 1.0421106037544395e-06, "loss": 0.0085, "step": 9924 }, { "epoch": 9.06392694063927, "grad_norm": 0.30738553404808044, "learning_rate": 1.041095890410959e-06, "loss": 0.0019, "step": 9925 }, { "epoch": 9.064840182648402, "grad_norm": 0.08441651612520218, "learning_rate": 1.0400811770674785e-06, "loss": 0.0005, "step": 9926 }, { "epoch": 9.065753424657535, "grad_norm": 0.09297223389148712, "learning_rate": 1.039066463723998e-06, "loss": 0.0007, "step": 9927 }, { "epoch": 9.066666666666666, "grad_norm": 6.899469375610352, "learning_rate": 1.0380517503805176e-06, "loss": 0.0343, "step": 9928 }, { "epoch": 9.0675799086758, "grad_norm": 0.0629410445690155, "learning_rate": 1.0370370370370371e-06, "loss": 0.0004, "step": 9929 }, { "epoch": 9.068493150684931, "grad_norm": 0.04359840974211693, "learning_rate": 1.0360223236935566e-06, "loss": 0.0001, "step": 9930 }, { "epoch": 9.069406392694065, "grad_norm": 0.19652488827705383, "learning_rate": 1.035007610350076e-06, "loss": 0.0011, "step": 9931 }, { "epoch": 9.070319634703196, "grad_norm": 0.10855748504400253, "learning_rate": 1.0339928970065958e-06, "loss": 0.0004, "step": 9932 }, { "epoch": 9.07123287671233, "grad_norm": 0.45632678270339966, "learning_rate": 1.0329781836631153e-06, "loss": 0.0031, "step": 9933 }, { "epoch": 9.072146118721461, "grad_norm": 0.7137284278869629, "learning_rate": 1.0319634703196347e-06, "loss": 0.0039, "step": 9934 }, { "epoch": 9.073059360730593, "grad_norm": 6.26211404800415, "learning_rate": 1.0309487569761544e-06, "loss": 0.0264, "step": 9935 }, { "epoch": 9.073972602739726, "grad_norm": 1.8947131633758545, "learning_rate": 1.0299340436326737e-06, "loss": 0.0068, "step": 9936 }, { "epoch": 9.074885844748858, "grad_norm": 0.7382813096046448, "learning_rate": 1.0289193302891934e-06, "loss": 0.003, "step": 9937 }, { "epoch": 9.075799086757991, "grad_norm": 0.12381307780742645, "learning_rate": 1.0279046169457129e-06, "loss": 0.0008, "step": 9938 }, { "epoch": 9.076712328767123, "grad_norm": 0.012841838411986828, "learning_rate": 1.0268899036022324e-06, "loss": 0.0001, "step": 9939 }, { "epoch": 9.077625570776256, "grad_norm": 0.23924191296100616, "learning_rate": 1.025875190258752e-06, "loss": 0.0016, "step": 9940 }, { "epoch": 9.078538812785387, "grad_norm": 131.10745239257812, "learning_rate": 1.0248604769152715e-06, "loss": 0.7321, "step": 9941 }, { "epoch": 9.07945205479452, "grad_norm": 6.1469244956970215, "learning_rate": 1.023845763571791e-06, "loss": 0.0353, "step": 9942 }, { "epoch": 9.080365296803652, "grad_norm": 0.4582687020301819, "learning_rate": 1.0228310502283107e-06, "loss": 0.0025, "step": 9943 }, { "epoch": 9.081278538812786, "grad_norm": 0.026813605800271034, "learning_rate": 1.0218163368848302e-06, "loss": 0.0002, "step": 9944 }, { "epoch": 9.082191780821917, "grad_norm": 34.62252426147461, "learning_rate": 1.0208016235413497e-06, "loss": 0.2646, "step": 9945 }, { "epoch": 9.08310502283105, "grad_norm": 1.3972437381744385, "learning_rate": 1.0197869101978691e-06, "loss": 0.0068, "step": 9946 }, { "epoch": 9.084018264840182, "grad_norm": 0.1345592588186264, "learning_rate": 1.0187721968543886e-06, "loss": 0.001, "step": 9947 }, { "epoch": 9.084931506849315, "grad_norm": 15.041068077087402, "learning_rate": 1.0177574835109083e-06, "loss": 0.0725, "step": 9948 }, { "epoch": 9.085844748858447, "grad_norm": 0.25102272629737854, "learning_rate": 1.0167427701674278e-06, "loss": 0.0011, "step": 9949 }, { "epoch": 9.08675799086758, "grad_norm": 2.709902286529541, "learning_rate": 1.0157280568239473e-06, "loss": 0.0109, "step": 9950 }, { "epoch": 9.087671232876712, "grad_norm": 0.20069323480129242, "learning_rate": 1.0147133434804667e-06, "loss": 0.0011, "step": 9951 }, { "epoch": 9.088584474885845, "grad_norm": 1.5916049480438232, "learning_rate": 1.0136986301369864e-06, "loss": 0.0073, "step": 9952 }, { "epoch": 9.089497716894977, "grad_norm": 0.06724170595407486, "learning_rate": 1.012683916793506e-06, "loss": 0.0002, "step": 9953 }, { "epoch": 9.09041095890411, "grad_norm": 0.6219491362571716, "learning_rate": 1.0116692034500254e-06, "loss": 0.0042, "step": 9954 }, { "epoch": 9.091324200913242, "grad_norm": 0.5238685607910156, "learning_rate": 1.010654490106545e-06, "loss": 0.0031, "step": 9955 }, { "epoch": 9.092237442922375, "grad_norm": 0.13240867853164673, "learning_rate": 1.0096397767630644e-06, "loss": 0.0006, "step": 9956 }, { "epoch": 9.093150684931507, "grad_norm": 1.9757167100906372, "learning_rate": 1.008625063419584e-06, "loss": 0.0083, "step": 9957 }, { "epoch": 9.09406392694064, "grad_norm": 0.29185470938682556, "learning_rate": 1.0076103500761035e-06, "loss": 0.0023, "step": 9958 }, { "epoch": 9.094977168949772, "grad_norm": 0.0854046419262886, "learning_rate": 1.006595636732623e-06, "loss": 0.0007, "step": 9959 }, { "epoch": 9.095890410958905, "grad_norm": 0.004793241154402494, "learning_rate": 1.0055809233891427e-06, "loss": 0.0, "step": 9960 }, { "epoch": 9.096803652968037, "grad_norm": 9.810125350952148, "learning_rate": 1.0045662100456622e-06, "loss": 0.0619, "step": 9961 }, { "epoch": 9.097716894977168, "grad_norm": 0.161543607711792, "learning_rate": 1.0035514967021817e-06, "loss": 0.0009, "step": 9962 }, { "epoch": 9.098630136986301, "grad_norm": 0.9641469120979309, "learning_rate": 1.0025367833587014e-06, "loss": 0.0049, "step": 9963 }, { "epoch": 9.099543378995433, "grad_norm": 4.808862209320068, "learning_rate": 1.0015220700152208e-06, "loss": 0.0264, "step": 9964 }, { "epoch": 9.100456621004566, "grad_norm": 4.119946002960205, "learning_rate": 1.0005073566717403e-06, "loss": 0.0181, "step": 9965 }, { "epoch": 9.101369863013698, "grad_norm": 0.5403664112091064, "learning_rate": 9.994926433282598e-07, "loss": 0.0028, "step": 9966 }, { "epoch": 9.102283105022831, "grad_norm": 1.2893764972686768, "learning_rate": 9.984779299847793e-07, "loss": 0.0062, "step": 9967 }, { "epoch": 9.103196347031963, "grad_norm": 0.16790105402469635, "learning_rate": 9.97463216641299e-07, "loss": 0.0011, "step": 9968 }, { "epoch": 9.104109589041096, "grad_norm": 0.022130334749817848, "learning_rate": 9.964485032978185e-07, "loss": 0.0001, "step": 9969 }, { "epoch": 9.105022831050228, "grad_norm": 63.41477966308594, "learning_rate": 9.95433789954338e-07, "loss": 0.4116, "step": 9970 }, { "epoch": 9.105936073059361, "grad_norm": 0.06815167516469955, "learning_rate": 9.944190766108574e-07, "loss": 0.0005, "step": 9971 }, { "epoch": 9.106849315068493, "grad_norm": 0.04162253811955452, "learning_rate": 9.934043632673771e-07, "loss": 0.0003, "step": 9972 }, { "epoch": 9.107762557077626, "grad_norm": 0.5318533182144165, "learning_rate": 9.923896499238966e-07, "loss": 0.0035, "step": 9973 }, { "epoch": 9.108675799086758, "grad_norm": 6.456920623779297, "learning_rate": 9.91374936580416e-07, "loss": 0.0335, "step": 9974 }, { "epoch": 9.10958904109589, "grad_norm": 3.59639573097229, "learning_rate": 9.903602232369358e-07, "loss": 0.0221, "step": 9975 }, { "epoch": 9.110502283105022, "grad_norm": 1.6202579736709595, "learning_rate": 9.89345509893455e-07, "loss": 0.0065, "step": 9976 }, { "epoch": 9.111415525114156, "grad_norm": 7.147988796234131, "learning_rate": 9.883307965499747e-07, "loss": 0.0375, "step": 9977 }, { "epoch": 9.112328767123287, "grad_norm": 0.09781020134687424, "learning_rate": 9.873160832064942e-07, "loss": 0.0006, "step": 9978 }, { "epoch": 9.11324200913242, "grad_norm": 0.2267870157957077, "learning_rate": 9.863013698630137e-07, "loss": 0.0015, "step": 9979 }, { "epoch": 9.114155251141552, "grad_norm": 0.3728238046169281, "learning_rate": 9.852866565195334e-07, "loss": 0.0026, "step": 9980 }, { "epoch": 9.115068493150686, "grad_norm": 0.4645046293735504, "learning_rate": 9.842719431760529e-07, "loss": 0.0026, "step": 9981 }, { "epoch": 9.115981735159817, "grad_norm": 3.0685226917266846, "learning_rate": 9.832572298325723e-07, "loss": 0.0188, "step": 9982 }, { "epoch": 9.11689497716895, "grad_norm": 2.673239231109619, "learning_rate": 9.82242516489092e-07, "loss": 0.0091, "step": 9983 }, { "epoch": 9.117808219178082, "grad_norm": 0.8918501734733582, "learning_rate": 9.812278031456115e-07, "loss": 0.005, "step": 9984 }, { "epoch": 9.118721461187215, "grad_norm": 6.780092716217041, "learning_rate": 9.80213089802131e-07, "loss": 0.031, "step": 9985 }, { "epoch": 9.119634703196347, "grad_norm": 0.7985196709632874, "learning_rate": 9.791983764586505e-07, "loss": 0.0036, "step": 9986 }, { "epoch": 9.12054794520548, "grad_norm": 9.582704544067383, "learning_rate": 9.7818366311517e-07, "loss": 0.0352, "step": 9987 }, { "epoch": 9.121461187214612, "grad_norm": 0.46894705295562744, "learning_rate": 9.771689497716896e-07, "loss": 0.002, "step": 9988 }, { "epoch": 9.122374429223743, "grad_norm": 0.07887005805969238, "learning_rate": 9.761542364282091e-07, "loss": 0.0004, "step": 9989 }, { "epoch": 9.123287671232877, "grad_norm": 1.3474290370941162, "learning_rate": 9.751395230847286e-07, "loss": 0.0077, "step": 9990 }, { "epoch": 9.124200913242008, "grad_norm": 0.6958613395690918, "learning_rate": 9.74124809741248e-07, "loss": 0.0042, "step": 9991 }, { "epoch": 9.125114155251142, "grad_norm": 0.3798616826534271, "learning_rate": 9.731100963977678e-07, "loss": 0.0019, "step": 9992 }, { "epoch": 9.126027397260273, "grad_norm": 0.002431025728583336, "learning_rate": 9.720953830542873e-07, "loss": 0.0, "step": 9993 }, { "epoch": 9.126940639269407, "grad_norm": 0.7110573649406433, "learning_rate": 9.710806697108067e-07, "loss": 0.0049, "step": 9994 }, { "epoch": 9.127853881278538, "grad_norm": 0.6258673667907715, "learning_rate": 9.700659563673264e-07, "loss": 0.0056, "step": 9995 }, { "epoch": 9.128767123287671, "grad_norm": 0.2339453548192978, "learning_rate": 9.690512430238457e-07, "loss": 0.0011, "step": 9996 }, { "epoch": 9.129680365296803, "grad_norm": 0.15655706822872162, "learning_rate": 9.680365296803654e-07, "loss": 0.0008, "step": 9997 }, { "epoch": 9.130593607305936, "grad_norm": 0.05215831473469734, "learning_rate": 9.670218163368849e-07, "loss": 0.0002, "step": 9998 }, { "epoch": 9.131506849315068, "grad_norm": 18.61208152770996, "learning_rate": 9.660071029934043e-07, "loss": 0.1335, "step": 9999 }, { "epoch": 9.132420091324201, "grad_norm": 0.27445361018180847, "learning_rate": 9.64992389649924e-07, "loss": 0.0019, "step": 10000 }, { "epoch": 9.133333333333333, "grad_norm": 7.265506744384766, "learning_rate": 9.639776763064435e-07, "loss": 0.0475, "step": 10001 }, { "epoch": 9.134246575342466, "grad_norm": 0.21330446004867554, "learning_rate": 9.62962962962963e-07, "loss": 0.0013, "step": 10002 }, { "epoch": 9.135159817351598, "grad_norm": 0.5785359740257263, "learning_rate": 9.619482496194827e-07, "loss": 0.0041, "step": 10003 }, { "epoch": 9.136073059360731, "grad_norm": 0.5393810272216797, "learning_rate": 9.609335362760022e-07, "loss": 0.0035, "step": 10004 }, { "epoch": 9.136986301369863, "grad_norm": 0.0013939120108261704, "learning_rate": 9.599188229325217e-07, "loss": 0.0, "step": 10005 }, { "epoch": 9.137899543378996, "grad_norm": 5.0689778327941895, "learning_rate": 9.589041095890411e-07, "loss": 0.0296, "step": 10006 }, { "epoch": 9.138812785388128, "grad_norm": 0.6787777543067932, "learning_rate": 9.578893962455606e-07, "loss": 0.0034, "step": 10007 }, { "epoch": 9.139726027397261, "grad_norm": 7.079331398010254, "learning_rate": 9.568746829020803e-07, "loss": 0.0442, "step": 10008 }, { "epoch": 9.140639269406392, "grad_norm": 0.16447493433952332, "learning_rate": 9.558599695585998e-07, "loss": 0.0007, "step": 10009 }, { "epoch": 9.141552511415526, "grad_norm": 3.114661693572998, "learning_rate": 9.548452562151193e-07, "loss": 0.015, "step": 10010 }, { "epoch": 9.142465753424657, "grad_norm": 0.7822824716567993, "learning_rate": 9.538305428716387e-07, "loss": 0.0034, "step": 10011 }, { "epoch": 9.14337899543379, "grad_norm": 0.03307078778743744, "learning_rate": 9.528158295281583e-07, "loss": 0.0001, "step": 10012 }, { "epoch": 9.144292237442922, "grad_norm": 0.6883293986320496, "learning_rate": 9.518011161846779e-07, "loss": 0.0034, "step": 10013 }, { "epoch": 9.145205479452056, "grad_norm": 0.13208110630512238, "learning_rate": 9.507864028411974e-07, "loss": 0.0007, "step": 10014 }, { "epoch": 9.146118721461187, "grad_norm": 0.1703663319349289, "learning_rate": 9.49771689497717e-07, "loss": 0.0008, "step": 10015 }, { "epoch": 9.147031963470319, "grad_norm": 1.1825382709503174, "learning_rate": 9.487569761542365e-07, "loss": 0.0044, "step": 10016 }, { "epoch": 9.147945205479452, "grad_norm": 5.461810111999512, "learning_rate": 9.47742262810756e-07, "loss": 0.0248, "step": 10017 }, { "epoch": 9.148858447488584, "grad_norm": 2.858227491378784, "learning_rate": 9.467275494672756e-07, "loss": 0.0115, "step": 10018 }, { "epoch": 9.149771689497717, "grad_norm": 1.1559343338012695, "learning_rate": 9.457128361237951e-07, "loss": 0.0057, "step": 10019 }, { "epoch": 9.150684931506849, "grad_norm": 0.14276720583438873, "learning_rate": 9.446981227803147e-07, "loss": 0.0009, "step": 10020 }, { "epoch": 9.151598173515982, "grad_norm": 0.4049501121044159, "learning_rate": 9.436834094368341e-07, "loss": 0.0017, "step": 10021 }, { "epoch": 9.152511415525113, "grad_norm": 4.957235813140869, "learning_rate": 9.426686960933537e-07, "loss": 0.0234, "step": 10022 }, { "epoch": 9.153424657534247, "grad_norm": 0.5400176644325256, "learning_rate": 9.416539827498732e-07, "loss": 0.0032, "step": 10023 }, { "epoch": 9.154337899543378, "grad_norm": 1.5851566791534424, "learning_rate": 9.406392694063927e-07, "loss": 0.0063, "step": 10024 }, { "epoch": 9.155251141552512, "grad_norm": 0.6380433440208435, "learning_rate": 9.396245560629123e-07, "loss": 0.0036, "step": 10025 }, { "epoch": 9.156164383561643, "grad_norm": 0.9292557835578918, "learning_rate": 9.386098427194318e-07, "loss": 0.0029, "step": 10026 }, { "epoch": 9.157077625570777, "grad_norm": 0.025374919176101685, "learning_rate": 9.375951293759514e-07, "loss": 0.0001, "step": 10027 }, { "epoch": 9.157990867579908, "grad_norm": 0.23823760449886322, "learning_rate": 9.36580416032471e-07, "loss": 0.0015, "step": 10028 }, { "epoch": 9.158904109589042, "grad_norm": 3.3788866996765137, "learning_rate": 9.355657026889904e-07, "loss": 0.0232, "step": 10029 }, { "epoch": 9.159817351598173, "grad_norm": 34.46326446533203, "learning_rate": 9.3455098934551e-07, "loss": 0.2691, "step": 10030 }, { "epoch": 9.160730593607306, "grad_norm": 2.058499336242676, "learning_rate": 9.335362760020294e-07, "loss": 0.0113, "step": 10031 }, { "epoch": 9.161643835616438, "grad_norm": 1.0373135805130005, "learning_rate": 9.32521562658549e-07, "loss": 0.0049, "step": 10032 }, { "epoch": 9.162557077625571, "grad_norm": 0.7121723294258118, "learning_rate": 9.315068493150686e-07, "loss": 0.0043, "step": 10033 }, { "epoch": 9.163470319634703, "grad_norm": 0.736646831035614, "learning_rate": 9.304921359715881e-07, "loss": 0.0041, "step": 10034 }, { "epoch": 9.164383561643836, "grad_norm": 0.3269042372703552, "learning_rate": 9.294774226281076e-07, "loss": 0.0017, "step": 10035 }, { "epoch": 9.165296803652968, "grad_norm": 0.03957519680261612, "learning_rate": 9.284627092846271e-07, "loss": 0.0002, "step": 10036 }, { "epoch": 9.166210045662101, "grad_norm": 0.23462018370628357, "learning_rate": 9.274479959411467e-07, "loss": 0.0013, "step": 10037 }, { "epoch": 9.167123287671233, "grad_norm": 0.28789037466049194, "learning_rate": 9.264332825976663e-07, "loss": 0.001, "step": 10038 }, { "epoch": 9.168036529680366, "grad_norm": 0.008782587945461273, "learning_rate": 9.254185692541858e-07, "loss": 0.0001, "step": 10039 }, { "epoch": 9.168949771689498, "grad_norm": 0.9529544711112976, "learning_rate": 9.244038559107054e-07, "loss": 0.0061, "step": 10040 }, { "epoch": 9.169863013698631, "grad_norm": 0.1799914687871933, "learning_rate": 9.233891425672247e-07, "loss": 0.0009, "step": 10041 }, { "epoch": 9.170776255707763, "grad_norm": 17.336702346801758, "learning_rate": 9.223744292237443e-07, "loss": 0.0919, "step": 10042 }, { "epoch": 9.171689497716894, "grad_norm": 2.7859182357788086, "learning_rate": 9.213597158802639e-07, "loss": 0.0149, "step": 10043 }, { "epoch": 9.172602739726027, "grad_norm": 2.1968305110931396, "learning_rate": 9.203450025367834e-07, "loss": 0.0142, "step": 10044 }, { "epoch": 9.173515981735159, "grad_norm": 12.347463607788086, "learning_rate": 9.19330289193303e-07, "loss": 0.0654, "step": 10045 }, { "epoch": 9.174429223744292, "grad_norm": 2.113609790802002, "learning_rate": 9.183155758498225e-07, "loss": 0.012, "step": 10046 }, { "epoch": 9.175342465753424, "grad_norm": 0.13230422139167786, "learning_rate": 9.17300862506342e-07, "loss": 0.001, "step": 10047 }, { "epoch": 9.176255707762557, "grad_norm": 0.02660995163023472, "learning_rate": 9.162861491628616e-07, "loss": 0.0001, "step": 10048 }, { "epoch": 9.177168949771689, "grad_norm": 1.3372746706008911, "learning_rate": 9.152714358193811e-07, "loss": 0.0093, "step": 10049 }, { "epoch": 9.178082191780822, "grad_norm": 72.43403625488281, "learning_rate": 9.142567224759007e-07, "loss": 0.3617, "step": 10050 }, { "epoch": 9.178995433789954, "grad_norm": 0.2385164052248001, "learning_rate": 9.132420091324201e-07, "loss": 0.0012, "step": 10051 }, { "epoch": 9.179908675799087, "grad_norm": 3.2495474815368652, "learning_rate": 9.122272957889397e-07, "loss": 0.0253, "step": 10052 }, { "epoch": 9.180821917808219, "grad_norm": 1.151357650756836, "learning_rate": 9.112125824454592e-07, "loss": 0.0061, "step": 10053 }, { "epoch": 9.181735159817352, "grad_norm": 3.1024534702301025, "learning_rate": 9.101978691019787e-07, "loss": 0.0109, "step": 10054 }, { "epoch": 9.182648401826484, "grad_norm": 0.39829275012016296, "learning_rate": 9.091831557584983e-07, "loss": 0.0022, "step": 10055 }, { "epoch": 9.183561643835617, "grad_norm": 0.01529125589877367, "learning_rate": 9.081684424150178e-07, "loss": 0.0001, "step": 10056 }, { "epoch": 9.184474885844748, "grad_norm": 0.7400612235069275, "learning_rate": 9.071537290715374e-07, "loss": 0.0039, "step": 10057 }, { "epoch": 9.185388127853882, "grad_norm": 3.2193522453308105, "learning_rate": 9.06139015728057e-07, "loss": 0.0226, "step": 10058 }, { "epoch": 9.186301369863013, "grad_norm": 0.6800092458724976, "learning_rate": 9.051243023845764e-07, "loss": 0.005, "step": 10059 }, { "epoch": 9.187214611872147, "grad_norm": 0.024923615157604218, "learning_rate": 9.04109589041096e-07, "loss": 0.0001, "step": 10060 }, { "epoch": 9.188127853881278, "grad_norm": 0.8851109743118286, "learning_rate": 9.030948756976154e-07, "loss": 0.0066, "step": 10061 }, { "epoch": 9.189041095890412, "grad_norm": 0.3963226079940796, "learning_rate": 9.02080162354135e-07, "loss": 0.0025, "step": 10062 }, { "epoch": 9.189954337899543, "grad_norm": 0.1542341560125351, "learning_rate": 9.010654490106546e-07, "loss": 0.001, "step": 10063 }, { "epoch": 9.190867579908677, "grad_norm": 0.7097460627555847, "learning_rate": 9.000507356671741e-07, "loss": 0.0038, "step": 10064 }, { "epoch": 9.191780821917808, "grad_norm": 1.3533381223678589, "learning_rate": 8.990360223236936e-07, "loss": 0.0049, "step": 10065 }, { "epoch": 9.192694063926941, "grad_norm": 0.07592297345399857, "learning_rate": 8.980213089802131e-07, "loss": 0.0005, "step": 10066 }, { "epoch": 9.193607305936073, "grad_norm": 0.3951699733734131, "learning_rate": 8.970065956367327e-07, "loss": 0.0019, "step": 10067 }, { "epoch": 9.194520547945206, "grad_norm": 2.4900455474853516, "learning_rate": 8.959918822932523e-07, "loss": 0.008, "step": 10068 }, { "epoch": 9.195433789954338, "grad_norm": 1.8674266338348389, "learning_rate": 8.949771689497718e-07, "loss": 0.0101, "step": 10069 }, { "epoch": 9.19634703196347, "grad_norm": 1.2604166269302368, "learning_rate": 8.939624556062914e-07, "loss": 0.0067, "step": 10070 }, { "epoch": 9.197260273972603, "grad_norm": 2.296602725982666, "learning_rate": 8.929477422628107e-07, "loss": 0.0106, "step": 10071 }, { "epoch": 9.198173515981734, "grad_norm": 0.038910336792469025, "learning_rate": 8.919330289193303e-07, "loss": 0.0002, "step": 10072 }, { "epoch": 9.199086757990868, "grad_norm": 2.0381479263305664, "learning_rate": 8.909183155758499e-07, "loss": 0.0215, "step": 10073 }, { "epoch": 9.2, "grad_norm": 0.7163468599319458, "learning_rate": 8.899036022323694e-07, "loss": 0.0043, "step": 10074 }, { "epoch": 9.200913242009133, "grad_norm": 6.1155900955200195, "learning_rate": 8.88888888888889e-07, "loss": 0.0385, "step": 10075 }, { "epoch": 9.201826484018264, "grad_norm": 1.3588963747024536, "learning_rate": 8.878741755454085e-07, "loss": 0.0093, "step": 10076 }, { "epoch": 9.202739726027398, "grad_norm": 0.04004841297864914, "learning_rate": 8.86859462201928e-07, "loss": 0.0003, "step": 10077 }, { "epoch": 9.203652968036529, "grad_norm": 0.07481807470321655, "learning_rate": 8.858447488584476e-07, "loss": 0.0004, "step": 10078 }, { "epoch": 9.204566210045662, "grad_norm": 2.83601713180542, "learning_rate": 8.848300355149671e-07, "loss": 0.0166, "step": 10079 }, { "epoch": 9.205479452054794, "grad_norm": 1.3895208835601807, "learning_rate": 8.838153221714867e-07, "loss": 0.0116, "step": 10080 }, { "epoch": 9.206392694063927, "grad_norm": 0.0012823361903429031, "learning_rate": 8.828006088280061e-07, "loss": 0.0, "step": 10081 }, { "epoch": 9.207305936073059, "grad_norm": 0.13773979246616364, "learning_rate": 8.817858954845257e-07, "loss": 0.001, "step": 10082 }, { "epoch": 9.208219178082192, "grad_norm": 0.021945929154753685, "learning_rate": 8.807711821410452e-07, "loss": 0.0002, "step": 10083 }, { "epoch": 9.209132420091324, "grad_norm": 0.12057247012853622, "learning_rate": 8.797564687975647e-07, "loss": 0.0008, "step": 10084 }, { "epoch": 9.210045662100457, "grad_norm": 0.0011962538119405508, "learning_rate": 8.787417554540843e-07, "loss": 0.0, "step": 10085 }, { "epoch": 9.210958904109589, "grad_norm": 0.048085156828165054, "learning_rate": 8.777270421106038e-07, "loss": 0.0003, "step": 10086 }, { "epoch": 9.211872146118722, "grad_norm": 1.5909799337387085, "learning_rate": 8.767123287671234e-07, "loss": 0.0069, "step": 10087 }, { "epoch": 9.212785388127854, "grad_norm": 0.7212578654289246, "learning_rate": 8.75697615423643e-07, "loss": 0.0035, "step": 10088 }, { "epoch": 9.213698630136987, "grad_norm": 0.403184175491333, "learning_rate": 8.746829020801624e-07, "loss": 0.0024, "step": 10089 }, { "epoch": 9.214611872146119, "grad_norm": 0.09552697837352753, "learning_rate": 8.73668188736682e-07, "loss": 0.0005, "step": 10090 }, { "epoch": 9.215525114155252, "grad_norm": 0.6526911854743958, "learning_rate": 8.726534753932014e-07, "loss": 0.0034, "step": 10091 }, { "epoch": 9.216438356164383, "grad_norm": 0.17966124415397644, "learning_rate": 8.71638762049721e-07, "loss": 0.0012, "step": 10092 }, { "epoch": 9.217351598173517, "grad_norm": 0.6303364038467407, "learning_rate": 8.706240487062406e-07, "loss": 0.0031, "step": 10093 }, { "epoch": 9.218264840182648, "grad_norm": 0.13458135724067688, "learning_rate": 8.696093353627601e-07, "loss": 0.0007, "step": 10094 }, { "epoch": 9.219178082191782, "grad_norm": 0.03334148973226547, "learning_rate": 8.685946220192796e-07, "loss": 0.0002, "step": 10095 }, { "epoch": 9.220091324200913, "grad_norm": 0.007540047634392977, "learning_rate": 8.675799086757991e-07, "loss": 0.0001, "step": 10096 }, { "epoch": 9.221004566210045, "grad_norm": 5.629220008850098, "learning_rate": 8.665651953323187e-07, "loss": 0.0239, "step": 10097 }, { "epoch": 9.221917808219178, "grad_norm": 0.6849389672279358, "learning_rate": 8.655504819888383e-07, "loss": 0.0042, "step": 10098 }, { "epoch": 9.22283105022831, "grad_norm": 1.0700299739837646, "learning_rate": 8.645357686453578e-07, "loss": 0.0076, "step": 10099 }, { "epoch": 9.223744292237443, "grad_norm": 1.3993974924087524, "learning_rate": 8.635210553018774e-07, "loss": 0.0112, "step": 10100 }, { "epoch": 9.224657534246575, "grad_norm": 0.13570059835910797, "learning_rate": 8.625063419583967e-07, "loss": 0.0011, "step": 10101 }, { "epoch": 9.225570776255708, "grad_norm": 0.3619762659072876, "learning_rate": 8.614916286149163e-07, "loss": 0.002, "step": 10102 }, { "epoch": 9.22648401826484, "grad_norm": 0.5051670670509338, "learning_rate": 8.604769152714359e-07, "loss": 0.0026, "step": 10103 }, { "epoch": 9.227397260273973, "grad_norm": 1.8270530700683594, "learning_rate": 8.594622019279554e-07, "loss": 0.0119, "step": 10104 }, { "epoch": 9.228310502283104, "grad_norm": 1.208714246749878, "learning_rate": 8.58447488584475e-07, "loss": 0.0068, "step": 10105 }, { "epoch": 9.229223744292238, "grad_norm": 0.5604667663574219, "learning_rate": 8.574327752409945e-07, "loss": 0.0034, "step": 10106 }, { "epoch": 9.23013698630137, "grad_norm": 0.1398620903491974, "learning_rate": 8.56418061897514e-07, "loss": 0.0009, "step": 10107 }, { "epoch": 9.231050228310503, "grad_norm": 4.422758102416992, "learning_rate": 8.554033485540336e-07, "loss": 0.0172, "step": 10108 }, { "epoch": 9.231963470319634, "grad_norm": 7.847346305847168, "learning_rate": 8.543886352105531e-07, "loss": 0.0441, "step": 10109 }, { "epoch": 9.232876712328768, "grad_norm": 1.1270171403884888, "learning_rate": 8.533739218670727e-07, "loss": 0.0044, "step": 10110 }, { "epoch": 9.2337899543379, "grad_norm": 5.643250942230225, "learning_rate": 8.523592085235921e-07, "loss": 0.0208, "step": 10111 }, { "epoch": 9.234703196347033, "grad_norm": 0.11349544674158096, "learning_rate": 8.513444951801117e-07, "loss": 0.0009, "step": 10112 }, { "epoch": 9.235616438356164, "grad_norm": 1.921470284461975, "learning_rate": 8.503297818366312e-07, "loss": 0.0103, "step": 10113 }, { "epoch": 9.236529680365297, "grad_norm": 7.240726947784424, "learning_rate": 8.493150684931507e-07, "loss": 0.0367, "step": 10114 }, { "epoch": 9.237442922374429, "grad_norm": 3.1503043174743652, "learning_rate": 8.483003551496703e-07, "loss": 0.0145, "step": 10115 }, { "epoch": 9.238356164383562, "grad_norm": 0.17905274033546448, "learning_rate": 8.472856418061898e-07, "loss": 0.0009, "step": 10116 }, { "epoch": 9.239269406392694, "grad_norm": 0.7239234447479248, "learning_rate": 8.462709284627094e-07, "loss": 0.0044, "step": 10117 }, { "epoch": 9.240182648401827, "grad_norm": 0.05223357304930687, "learning_rate": 8.45256215119229e-07, "loss": 0.0002, "step": 10118 }, { "epoch": 9.241095890410959, "grad_norm": 2.4307117462158203, "learning_rate": 8.442415017757484e-07, "loss": 0.0131, "step": 10119 }, { "epoch": 9.242009132420092, "grad_norm": 6.75257682800293, "learning_rate": 8.43226788432268e-07, "loss": 0.0366, "step": 10120 }, { "epoch": 9.242922374429224, "grad_norm": 9.842616081237793, "learning_rate": 8.422120750887874e-07, "loss": 0.0706, "step": 10121 }, { "epoch": 9.243835616438357, "grad_norm": 0.10954736918210983, "learning_rate": 8.41197361745307e-07, "loss": 0.0006, "step": 10122 }, { "epoch": 9.244748858447489, "grad_norm": 48.71136474609375, "learning_rate": 8.401826484018266e-07, "loss": 0.2556, "step": 10123 }, { "epoch": 9.24566210045662, "grad_norm": 0.4343299865722656, "learning_rate": 8.391679350583461e-07, "loss": 0.0026, "step": 10124 }, { "epoch": 9.246575342465754, "grad_norm": 0.03309822455048561, "learning_rate": 8.381532217148656e-07, "loss": 0.0002, "step": 10125 }, { "epoch": 9.247488584474885, "grad_norm": 0.8616377711296082, "learning_rate": 8.371385083713851e-07, "loss": 0.0036, "step": 10126 }, { "epoch": 9.248401826484018, "grad_norm": 0.002820259192958474, "learning_rate": 8.361237950279047e-07, "loss": 0.0, "step": 10127 }, { "epoch": 9.24931506849315, "grad_norm": 0.24918237328529358, "learning_rate": 8.351090816844243e-07, "loss": 0.0003, "step": 10128 }, { "epoch": 9.250228310502283, "grad_norm": 0.11791258305311203, "learning_rate": 8.340943683409438e-07, "loss": 0.0008, "step": 10129 }, { "epoch": 9.251141552511415, "grad_norm": 0.2644234001636505, "learning_rate": 8.330796549974634e-07, "loss": 0.0017, "step": 10130 }, { "epoch": 9.252054794520548, "grad_norm": 0.11214946955442429, "learning_rate": 8.320649416539827e-07, "loss": 0.0006, "step": 10131 }, { "epoch": 9.25296803652968, "grad_norm": 0.4992988705635071, "learning_rate": 8.310502283105023e-07, "loss": 0.0036, "step": 10132 }, { "epoch": 9.253881278538813, "grad_norm": 0.7057484984397888, "learning_rate": 8.300355149670219e-07, "loss": 0.0045, "step": 10133 }, { "epoch": 9.254794520547945, "grad_norm": 109.9399642944336, "learning_rate": 8.290208016235414e-07, "loss": 1.4312, "step": 10134 }, { "epoch": 9.255707762557078, "grad_norm": 0.1253427416086197, "learning_rate": 8.28006088280061e-07, "loss": 0.0007, "step": 10135 }, { "epoch": 9.25662100456621, "grad_norm": 0.8853659629821777, "learning_rate": 8.269913749365805e-07, "loss": 0.0041, "step": 10136 }, { "epoch": 9.257534246575343, "grad_norm": 6.9464335441589355, "learning_rate": 8.259766615931e-07, "loss": 0.0339, "step": 10137 }, { "epoch": 9.258447488584475, "grad_norm": 3.406160354614258, "learning_rate": 8.249619482496196e-07, "loss": 0.0256, "step": 10138 }, { "epoch": 9.259360730593608, "grad_norm": 2.7667722702026367, "learning_rate": 8.239472349061391e-07, "loss": 0.0255, "step": 10139 }, { "epoch": 9.26027397260274, "grad_norm": 0.3832886815071106, "learning_rate": 8.229325215626587e-07, "loss": 0.0019, "step": 10140 }, { "epoch": 9.261187214611873, "grad_norm": 0.013114632107317448, "learning_rate": 8.219178082191781e-07, "loss": 0.0001, "step": 10141 }, { "epoch": 9.262100456621004, "grad_norm": 0.3937835693359375, "learning_rate": 8.209030948756977e-07, "loss": 0.0028, "step": 10142 }, { "epoch": 9.263013698630138, "grad_norm": 0.0010824103374034166, "learning_rate": 8.198883815322172e-07, "loss": 0.0, "step": 10143 }, { "epoch": 9.26392694063927, "grad_norm": 7.733593463897705, "learning_rate": 8.188736681887367e-07, "loss": 0.0333, "step": 10144 }, { "epoch": 9.264840182648403, "grad_norm": 0.6468782424926758, "learning_rate": 8.178589548452563e-07, "loss": 0.0015, "step": 10145 }, { "epoch": 9.265753424657534, "grad_norm": 5.845919609069824, "learning_rate": 8.168442415017758e-07, "loss": 0.0221, "step": 10146 }, { "epoch": 9.266666666666667, "grad_norm": 2.0700159072875977, "learning_rate": 8.158295281582954e-07, "loss": 0.0074, "step": 10147 }, { "epoch": 9.267579908675799, "grad_norm": 72.18108367919922, "learning_rate": 8.14814814814815e-07, "loss": 0.368, "step": 10148 }, { "epoch": 9.268493150684932, "grad_norm": 3.524263381958008, "learning_rate": 8.138001014713344e-07, "loss": 0.0259, "step": 10149 }, { "epoch": 9.269406392694064, "grad_norm": 0.03976820036768913, "learning_rate": 8.12785388127854e-07, "loss": 0.0003, "step": 10150 }, { "epoch": 9.270319634703196, "grad_norm": 0.0175052247941494, "learning_rate": 8.117706747843734e-07, "loss": 0.0001, "step": 10151 }, { "epoch": 9.271232876712329, "grad_norm": 0.09407459944486618, "learning_rate": 8.10755961440893e-07, "loss": 0.0006, "step": 10152 }, { "epoch": 9.27214611872146, "grad_norm": 1.7169795036315918, "learning_rate": 8.097412480974126e-07, "loss": 0.009, "step": 10153 }, { "epoch": 9.273059360730594, "grad_norm": 0.011620745994150639, "learning_rate": 8.08726534753932e-07, "loss": 0.0001, "step": 10154 }, { "epoch": 9.273972602739725, "grad_norm": 2.1952154636383057, "learning_rate": 8.077118214104516e-07, "loss": 0.0036, "step": 10155 }, { "epoch": 9.274885844748859, "grad_norm": 0.2216523289680481, "learning_rate": 8.066971080669711e-07, "loss": 0.001, "step": 10156 }, { "epoch": 9.27579908675799, "grad_norm": 2.196373701095581, "learning_rate": 8.056823947234907e-07, "loss": 0.01, "step": 10157 }, { "epoch": 9.276712328767124, "grad_norm": 0.12761779129505157, "learning_rate": 8.046676813800103e-07, "loss": 0.0009, "step": 10158 }, { "epoch": 9.277625570776255, "grad_norm": 1.6184170246124268, "learning_rate": 8.036529680365298e-07, "loss": 0.0087, "step": 10159 }, { "epoch": 9.278538812785389, "grad_norm": 0.8536741733551025, "learning_rate": 8.026382546930494e-07, "loss": 0.004, "step": 10160 }, { "epoch": 9.27945205479452, "grad_norm": 0.6326210498809814, "learning_rate": 8.016235413495687e-07, "loss": 0.003, "step": 10161 }, { "epoch": 9.280365296803653, "grad_norm": 0.03065621480345726, "learning_rate": 8.006088280060883e-07, "loss": 0.0002, "step": 10162 }, { "epoch": 9.281278538812785, "grad_norm": 0.7780981063842773, "learning_rate": 7.995941146626079e-07, "loss": 0.0045, "step": 10163 }, { "epoch": 9.282191780821918, "grad_norm": 6.418294906616211, "learning_rate": 7.985794013191274e-07, "loss": 0.0384, "step": 10164 }, { "epoch": 9.28310502283105, "grad_norm": 0.39947381615638733, "learning_rate": 7.97564687975647e-07, "loss": 0.0024, "step": 10165 }, { "epoch": 9.284018264840183, "grad_norm": 2.295945644378662, "learning_rate": 7.965499746321665e-07, "loss": 0.0162, "step": 10166 }, { "epoch": 9.284931506849315, "grad_norm": 4.156048774719238, "learning_rate": 7.95535261288686e-07, "loss": 0.0285, "step": 10167 }, { "epoch": 9.285844748858448, "grad_norm": 0.22352248430252075, "learning_rate": 7.945205479452056e-07, "loss": 0.0017, "step": 10168 }, { "epoch": 9.28675799086758, "grad_norm": 0.7021631598472595, "learning_rate": 7.935058346017251e-07, "loss": 0.0038, "step": 10169 }, { "epoch": 9.287671232876713, "grad_norm": 0.27411922812461853, "learning_rate": 7.924911212582447e-07, "loss": 0.0015, "step": 10170 }, { "epoch": 9.288584474885845, "grad_norm": 2.3183834552764893, "learning_rate": 7.914764079147641e-07, "loss": 0.0126, "step": 10171 }, { "epoch": 9.289497716894978, "grad_norm": 73.23653411865234, "learning_rate": 7.904616945712837e-07, "loss": 0.7609, "step": 10172 }, { "epoch": 9.29041095890411, "grad_norm": 0.3851809501647949, "learning_rate": 7.894469812278032e-07, "loss": 0.0022, "step": 10173 }, { "epoch": 9.291324200913243, "grad_norm": 1.981040596961975, "learning_rate": 7.884322678843227e-07, "loss": 0.0113, "step": 10174 }, { "epoch": 9.292237442922374, "grad_norm": 2.0058131217956543, "learning_rate": 7.874175545408423e-07, "loss": 0.0094, "step": 10175 }, { "epoch": 9.293150684931508, "grad_norm": 20.222373962402344, "learning_rate": 7.864028411973618e-07, "loss": 0.1303, "step": 10176 }, { "epoch": 9.29406392694064, "grad_norm": 0.2795673906803131, "learning_rate": 7.853881278538814e-07, "loss": 0.0011, "step": 10177 }, { "epoch": 9.29497716894977, "grad_norm": 0.24593095481395721, "learning_rate": 7.84373414510401e-07, "loss": 0.0016, "step": 10178 }, { "epoch": 9.295890410958904, "grad_norm": 0.6751896739006042, "learning_rate": 7.833587011669204e-07, "loss": 0.0045, "step": 10179 }, { "epoch": 9.296803652968036, "grad_norm": 0.031228141859173775, "learning_rate": 7.8234398782344e-07, "loss": 0.0002, "step": 10180 }, { "epoch": 9.29771689497717, "grad_norm": 4.051883697509766, "learning_rate": 7.813292744799594e-07, "loss": 0.016, "step": 10181 }, { "epoch": 9.2986301369863, "grad_norm": 1.6734168529510498, "learning_rate": 7.80314561136479e-07, "loss": 0.01, "step": 10182 }, { "epoch": 9.299543378995434, "grad_norm": 0.4361071288585663, "learning_rate": 7.792998477929986e-07, "loss": 0.0025, "step": 10183 }, { "epoch": 9.300456621004566, "grad_norm": 0.05363297834992409, "learning_rate": 7.78285134449518e-07, "loss": 0.0004, "step": 10184 }, { "epoch": 9.301369863013699, "grad_norm": 1.1838338375091553, "learning_rate": 7.772704211060376e-07, "loss": 0.006, "step": 10185 }, { "epoch": 9.30228310502283, "grad_norm": 0.2962688207626343, "learning_rate": 7.762557077625571e-07, "loss": 0.0015, "step": 10186 }, { "epoch": 9.303196347031964, "grad_norm": 0.03767281025648117, "learning_rate": 7.752409944190767e-07, "loss": 0.0002, "step": 10187 }, { "epoch": 9.304109589041095, "grad_norm": 0.06327088177204132, "learning_rate": 7.742262810755963e-07, "loss": 0.0003, "step": 10188 }, { "epoch": 9.305022831050229, "grad_norm": 0.9902195930480957, "learning_rate": 7.732115677321158e-07, "loss": 0.0037, "step": 10189 }, { "epoch": 9.30593607305936, "grad_norm": 14.78976058959961, "learning_rate": 7.721968543886354e-07, "loss": 0.087, "step": 10190 }, { "epoch": 9.306849315068494, "grad_norm": 0.00324942241422832, "learning_rate": 7.711821410451547e-07, "loss": 0.0, "step": 10191 }, { "epoch": 9.307762557077625, "grad_norm": 0.3561753034591675, "learning_rate": 7.701674277016743e-07, "loss": 0.0015, "step": 10192 }, { "epoch": 9.308675799086759, "grad_norm": 0.013129128143191338, "learning_rate": 7.691527143581939e-07, "loss": 0.0001, "step": 10193 }, { "epoch": 9.30958904109589, "grad_norm": 0.5347163677215576, "learning_rate": 7.681380010147134e-07, "loss": 0.0039, "step": 10194 }, { "epoch": 9.310502283105023, "grad_norm": 0.3421196937561035, "learning_rate": 7.67123287671233e-07, "loss": 0.0019, "step": 10195 }, { "epoch": 9.311415525114155, "grad_norm": 0.03393350914120674, "learning_rate": 7.661085743277524e-07, "loss": 0.0002, "step": 10196 }, { "epoch": 9.312328767123288, "grad_norm": 7.282597541809082, "learning_rate": 7.65093860984272e-07, "loss": 0.0456, "step": 10197 }, { "epoch": 9.31324200913242, "grad_norm": 0.4549020528793335, "learning_rate": 7.640791476407916e-07, "loss": 0.0032, "step": 10198 }, { "epoch": 9.314155251141553, "grad_norm": 0.8077588677406311, "learning_rate": 7.630644342973111e-07, "loss": 0.0035, "step": 10199 }, { "epoch": 9.315068493150685, "grad_norm": 0.008762551471590996, "learning_rate": 7.620497209538307e-07, "loss": 0.0001, "step": 10200 }, { "epoch": 9.315981735159818, "grad_norm": 3.0787782669067383, "learning_rate": 7.610350076103501e-07, "loss": 0.0152, "step": 10201 }, { "epoch": 9.31689497716895, "grad_norm": 3.9146335124969482, "learning_rate": 7.600202942668696e-07, "loss": 0.0241, "step": 10202 }, { "epoch": 9.317808219178083, "grad_norm": 0.38717207312583923, "learning_rate": 7.590055809233892e-07, "loss": 0.0018, "step": 10203 }, { "epoch": 9.318721461187215, "grad_norm": 1.781189203262329, "learning_rate": 7.579908675799087e-07, "loss": 0.012, "step": 10204 }, { "epoch": 9.319634703196346, "grad_norm": 0.037408553063869476, "learning_rate": 7.569761542364283e-07, "loss": 0.0002, "step": 10205 }, { "epoch": 9.32054794520548, "grad_norm": 1.0027382373809814, "learning_rate": 7.559614408929478e-07, "loss": 0.0054, "step": 10206 }, { "epoch": 9.321461187214611, "grad_norm": 1.4837629795074463, "learning_rate": 7.549467275494674e-07, "loss": 0.0059, "step": 10207 }, { "epoch": 9.322374429223744, "grad_norm": 0.10229870676994324, "learning_rate": 7.53932014205987e-07, "loss": 0.0006, "step": 10208 }, { "epoch": 9.323287671232876, "grad_norm": 16.23421287536621, "learning_rate": 7.529173008625064e-07, "loss": 0.0648, "step": 10209 }, { "epoch": 9.32420091324201, "grad_norm": 3.384348154067993, "learning_rate": 7.51902587519026e-07, "loss": 0.0207, "step": 10210 }, { "epoch": 9.325114155251141, "grad_norm": 9.599610328674316, "learning_rate": 7.508878741755454e-07, "loss": 0.0475, "step": 10211 }, { "epoch": 9.326027397260274, "grad_norm": 66.3148193359375, "learning_rate": 7.49873160832065e-07, "loss": 0.9984, "step": 10212 }, { "epoch": 9.326940639269406, "grad_norm": 0.42649951577186584, "learning_rate": 7.488584474885845e-07, "loss": 0.0025, "step": 10213 }, { "epoch": 9.32785388127854, "grad_norm": 0.3253367245197296, "learning_rate": 7.47843734145104e-07, "loss": 0.0019, "step": 10214 }, { "epoch": 9.32876712328767, "grad_norm": 4.119230270385742, "learning_rate": 7.468290208016236e-07, "loss": 0.0192, "step": 10215 }, { "epoch": 9.329680365296804, "grad_norm": 3.3362231254577637, "learning_rate": 7.458143074581431e-07, "loss": 0.016, "step": 10216 }, { "epoch": 9.330593607305936, "grad_norm": 0.11406850069761276, "learning_rate": 7.447995941146627e-07, "loss": 0.0006, "step": 10217 }, { "epoch": 9.331506849315069, "grad_norm": 0.012201097793877125, "learning_rate": 7.437848807711822e-07, "loss": 0.0001, "step": 10218 }, { "epoch": 9.3324200913242, "grad_norm": 0.5130411386489868, "learning_rate": 7.427701674277018e-07, "loss": 0.0031, "step": 10219 }, { "epoch": 9.333333333333334, "grad_norm": 0.13434727489948273, "learning_rate": 7.417554540842214e-07, "loss": 0.0009, "step": 10220 }, { "epoch": 9.334246575342465, "grad_norm": 0.41915038228034973, "learning_rate": 7.407407407407407e-07, "loss": 0.0031, "step": 10221 }, { "epoch": 9.335159817351599, "grad_norm": 0.14588028192520142, "learning_rate": 7.397260273972603e-07, "loss": 0.0007, "step": 10222 }, { "epoch": 9.33607305936073, "grad_norm": 0.13669270277023315, "learning_rate": 7.387113140537798e-07, "loss": 0.0008, "step": 10223 }, { "epoch": 9.336986301369864, "grad_norm": 5.41655158996582, "learning_rate": 7.376966007102994e-07, "loss": 0.0267, "step": 10224 }, { "epoch": 9.337899543378995, "grad_norm": 0.753372073173523, "learning_rate": 7.36681887366819e-07, "loss": 0.0042, "step": 10225 }, { "epoch": 9.338812785388129, "grad_norm": 0.006478679832071066, "learning_rate": 7.356671740233384e-07, "loss": 0.0, "step": 10226 }, { "epoch": 9.33972602739726, "grad_norm": 10.630321502685547, "learning_rate": 7.34652460679858e-07, "loss": 0.0585, "step": 10227 }, { "epoch": 9.340639269406394, "grad_norm": 0.023904474452137947, "learning_rate": 7.336377473363775e-07, "loss": 0.0001, "step": 10228 }, { "epoch": 9.341552511415525, "grad_norm": 0.06254708021879196, "learning_rate": 7.326230339928971e-07, "loss": 0.0002, "step": 10229 }, { "epoch": 9.342465753424657, "grad_norm": 0.3556731343269348, "learning_rate": 7.316083206494167e-07, "loss": 0.0014, "step": 10230 }, { "epoch": 9.34337899543379, "grad_norm": 0.3167681396007538, "learning_rate": 7.305936073059361e-07, "loss": 0.0013, "step": 10231 }, { "epoch": 9.344292237442922, "grad_norm": 0.5869153738021851, "learning_rate": 7.295788939624556e-07, "loss": 0.0019, "step": 10232 }, { "epoch": 9.345205479452055, "grad_norm": 7.625328540802002, "learning_rate": 7.285641806189751e-07, "loss": 0.0682, "step": 10233 }, { "epoch": 9.346118721461186, "grad_norm": 0.09381933510303497, "learning_rate": 7.275494672754947e-07, "loss": 0.0005, "step": 10234 }, { "epoch": 9.34703196347032, "grad_norm": 2.8452606201171875, "learning_rate": 7.265347539320143e-07, "loss": 0.0153, "step": 10235 }, { "epoch": 9.347945205479451, "grad_norm": 1.099677324295044, "learning_rate": 7.255200405885338e-07, "loss": 0.0066, "step": 10236 }, { "epoch": 9.348858447488585, "grad_norm": 74.30960083007812, "learning_rate": 7.245053272450534e-07, "loss": 0.4324, "step": 10237 }, { "epoch": 9.349771689497716, "grad_norm": 1.0791187286376953, "learning_rate": 7.234906139015728e-07, "loss": 0.0051, "step": 10238 }, { "epoch": 9.35068493150685, "grad_norm": 0.37292495369911194, "learning_rate": 7.224759005580924e-07, "loss": 0.0023, "step": 10239 }, { "epoch": 9.351598173515981, "grad_norm": 0.264571875333786, "learning_rate": 7.21461187214612e-07, "loss": 0.0015, "step": 10240 }, { "epoch": 9.352511415525115, "grad_norm": 1.8455339670181274, "learning_rate": 7.204464738711314e-07, "loss": 0.0129, "step": 10241 }, { "epoch": 9.353424657534246, "grad_norm": 0.08686044067144394, "learning_rate": 7.19431760527651e-07, "loss": 0.0005, "step": 10242 }, { "epoch": 9.35433789954338, "grad_norm": 1.4387601613998413, "learning_rate": 7.184170471841705e-07, "loss": 0.0081, "step": 10243 }, { "epoch": 9.355251141552511, "grad_norm": 0.5960010886192322, "learning_rate": 7.1740233384069e-07, "loss": 0.0039, "step": 10244 }, { "epoch": 9.356164383561644, "grad_norm": 0.7574576735496521, "learning_rate": 7.163876204972096e-07, "loss": 0.0039, "step": 10245 }, { "epoch": 9.357077625570776, "grad_norm": 0.1755983531475067, "learning_rate": 7.153729071537291e-07, "loss": 0.0012, "step": 10246 }, { "epoch": 9.35799086757991, "grad_norm": 16.8652400970459, "learning_rate": 7.143581938102487e-07, "loss": 0.0967, "step": 10247 }, { "epoch": 9.35890410958904, "grad_norm": 0.22956602275371552, "learning_rate": 7.133434804667682e-07, "loss": 0.0011, "step": 10248 }, { "epoch": 9.359817351598174, "grad_norm": 6.500998020172119, "learning_rate": 7.123287671232878e-07, "loss": 0.0247, "step": 10249 }, { "epoch": 9.360730593607306, "grad_norm": 0.3702179789543152, "learning_rate": 7.113140537798073e-07, "loss": 0.0007, "step": 10250 }, { "epoch": 9.361643835616439, "grad_norm": 0.26178768277168274, "learning_rate": 7.102993404363267e-07, "loss": 0.0019, "step": 10251 }, { "epoch": 9.36255707762557, "grad_norm": 0.5254257321357727, "learning_rate": 7.092846270928463e-07, "loss": 0.0028, "step": 10252 }, { "epoch": 9.363470319634704, "grad_norm": 1.2695035934448242, "learning_rate": 7.082699137493658e-07, "loss": 0.0095, "step": 10253 }, { "epoch": 9.364383561643836, "grad_norm": 1.2308964729309082, "learning_rate": 7.072552004058854e-07, "loss": 0.0101, "step": 10254 }, { "epoch": 9.365296803652967, "grad_norm": 1.3997362852096558, "learning_rate": 7.06240487062405e-07, "loss": 0.0088, "step": 10255 }, { "epoch": 9.3662100456621, "grad_norm": 0.022238431498408318, "learning_rate": 7.052257737189244e-07, "loss": 0.0001, "step": 10256 }, { "epoch": 9.367123287671232, "grad_norm": 3.087482213973999, "learning_rate": 7.04211060375444e-07, "loss": 0.0185, "step": 10257 }, { "epoch": 9.368036529680365, "grad_norm": 2.3458104133605957, "learning_rate": 7.031963470319635e-07, "loss": 0.0126, "step": 10258 }, { "epoch": 9.368949771689497, "grad_norm": 0.9100714325904846, "learning_rate": 7.021816336884831e-07, "loss": 0.0067, "step": 10259 }, { "epoch": 9.36986301369863, "grad_norm": 3.447843551635742, "learning_rate": 7.011669203450027e-07, "loss": 0.0162, "step": 10260 }, { "epoch": 9.370776255707762, "grad_norm": 0.3932945728302002, "learning_rate": 7.001522070015221e-07, "loss": 0.0024, "step": 10261 }, { "epoch": 9.371689497716895, "grad_norm": 0.53150874376297, "learning_rate": 6.991374936580416e-07, "loss": 0.004, "step": 10262 }, { "epoch": 9.372602739726027, "grad_norm": 0.3711087703704834, "learning_rate": 6.981227803145611e-07, "loss": 0.0019, "step": 10263 }, { "epoch": 9.37351598173516, "grad_norm": 0.0071550337597727776, "learning_rate": 6.971080669710807e-07, "loss": 0.0, "step": 10264 }, { "epoch": 9.374429223744292, "grad_norm": 0.12074043601751328, "learning_rate": 6.960933536276003e-07, "loss": 0.0008, "step": 10265 }, { "epoch": 9.375342465753425, "grad_norm": 0.025314025580883026, "learning_rate": 6.950786402841198e-07, "loss": 0.0001, "step": 10266 }, { "epoch": 9.376255707762557, "grad_norm": 1.3318030834197998, "learning_rate": 6.940639269406394e-07, "loss": 0.006, "step": 10267 }, { "epoch": 9.37716894977169, "grad_norm": 4.739417552947998, "learning_rate": 6.930492135971588e-07, "loss": 0.033, "step": 10268 }, { "epoch": 9.378082191780821, "grad_norm": 2.6617157459259033, "learning_rate": 6.920345002536784e-07, "loss": 0.0132, "step": 10269 }, { "epoch": 9.378995433789955, "grad_norm": 7.957665920257568, "learning_rate": 6.91019786910198e-07, "loss": 0.0535, "step": 10270 }, { "epoch": 9.379908675799086, "grad_norm": 0.3478321135044098, "learning_rate": 6.900050735667174e-07, "loss": 0.0017, "step": 10271 }, { "epoch": 9.38082191780822, "grad_norm": 6.697783946990967, "learning_rate": 6.88990360223237e-07, "loss": 0.0356, "step": 10272 }, { "epoch": 9.381735159817351, "grad_norm": 0.3062914311885834, "learning_rate": 6.879756468797565e-07, "loss": 0.0021, "step": 10273 }, { "epoch": 9.382648401826485, "grad_norm": 5.1042680740356445, "learning_rate": 6.86960933536276e-07, "loss": 0.0249, "step": 10274 }, { "epoch": 9.383561643835616, "grad_norm": 1.1315122842788696, "learning_rate": 6.859462201927956e-07, "loss": 0.0054, "step": 10275 }, { "epoch": 9.38447488584475, "grad_norm": 1.8145970106124878, "learning_rate": 6.849315068493151e-07, "loss": 0.0126, "step": 10276 }, { "epoch": 9.385388127853881, "grad_norm": 2.30922269821167, "learning_rate": 6.839167935058347e-07, "loss": 0.0078, "step": 10277 }, { "epoch": 9.386301369863014, "grad_norm": 25.77382469177246, "learning_rate": 6.829020801623542e-07, "loss": 0.3006, "step": 10278 }, { "epoch": 9.387214611872146, "grad_norm": 0.17708657681941986, "learning_rate": 6.818873668188738e-07, "loss": 0.0015, "step": 10279 }, { "epoch": 9.38812785388128, "grad_norm": 0.1356334239244461, "learning_rate": 6.808726534753933e-07, "loss": 0.0008, "step": 10280 }, { "epoch": 9.389041095890411, "grad_norm": 7.738439083099365, "learning_rate": 6.798579401319127e-07, "loss": 0.0458, "step": 10281 }, { "epoch": 9.389954337899542, "grad_norm": 0.410676509141922, "learning_rate": 6.788432267884323e-07, "loss": 0.0033, "step": 10282 }, { "epoch": 9.390867579908676, "grad_norm": 0.10296501219272614, "learning_rate": 6.778285134449518e-07, "loss": 0.0004, "step": 10283 }, { "epoch": 9.391780821917807, "grad_norm": 0.20599524676799774, "learning_rate": 6.768138001014714e-07, "loss": 0.0011, "step": 10284 }, { "epoch": 9.39269406392694, "grad_norm": 1.2390989065170288, "learning_rate": 6.75799086757991e-07, "loss": 0.0053, "step": 10285 }, { "epoch": 9.393607305936072, "grad_norm": 1.382005214691162, "learning_rate": 6.747843734145104e-07, "loss": 0.0088, "step": 10286 }, { "epoch": 9.394520547945206, "grad_norm": 0.4604414403438568, "learning_rate": 6.7376966007103e-07, "loss": 0.002, "step": 10287 }, { "epoch": 9.395433789954337, "grad_norm": 0.40135249495506287, "learning_rate": 6.727549467275495e-07, "loss": 0.0023, "step": 10288 }, { "epoch": 9.39634703196347, "grad_norm": 51.610443115234375, "learning_rate": 6.717402333840691e-07, "loss": 0.0389, "step": 10289 }, { "epoch": 9.397260273972602, "grad_norm": 1.1160739660263062, "learning_rate": 6.707255200405887e-07, "loss": 0.0072, "step": 10290 }, { "epoch": 9.398173515981735, "grad_norm": 0.10082584619522095, "learning_rate": 6.697108066971081e-07, "loss": 0.0005, "step": 10291 }, { "epoch": 9.399086757990867, "grad_norm": 3.029304027557373, "learning_rate": 6.686960933536276e-07, "loss": 0.0098, "step": 10292 }, { "epoch": 9.4, "grad_norm": 0.1479111909866333, "learning_rate": 6.676813800101471e-07, "loss": 0.0009, "step": 10293 }, { "epoch": 9.400913242009132, "grad_norm": 0.32565248012542725, "learning_rate": 6.666666666666667e-07, "loss": 0.0014, "step": 10294 }, { "epoch": 9.401826484018265, "grad_norm": 103.26589965820312, "learning_rate": 6.656519533231863e-07, "loss": 0.7278, "step": 10295 }, { "epoch": 9.402739726027397, "grad_norm": 0.06559903919696808, "learning_rate": 6.646372399797058e-07, "loss": 0.0004, "step": 10296 }, { "epoch": 9.40365296803653, "grad_norm": 19.022432327270508, "learning_rate": 6.636225266362254e-07, "loss": 0.0879, "step": 10297 }, { "epoch": 9.404566210045662, "grad_norm": 1.7663840055465698, "learning_rate": 6.626078132927448e-07, "loss": 0.0093, "step": 10298 }, { "epoch": 9.405479452054795, "grad_norm": 0.20666535198688507, "learning_rate": 6.615930999492644e-07, "loss": 0.001, "step": 10299 }, { "epoch": 9.406392694063927, "grad_norm": 3.4122109413146973, "learning_rate": 6.60578386605784e-07, "loss": 0.0151, "step": 10300 }, { "epoch": 9.40730593607306, "grad_norm": 9.704099655151367, "learning_rate": 6.595636732623034e-07, "loss": 0.0502, "step": 10301 }, { "epoch": 9.408219178082192, "grad_norm": 1.6637609004974365, "learning_rate": 6.58548959918823e-07, "loss": 0.0083, "step": 10302 }, { "epoch": 9.409132420091325, "grad_norm": 0.041865576058626175, "learning_rate": 6.575342465753425e-07, "loss": 0.0002, "step": 10303 }, { "epoch": 9.410045662100456, "grad_norm": 0.4933309257030487, "learning_rate": 6.56519533231862e-07, "loss": 0.0026, "step": 10304 }, { "epoch": 9.41095890410959, "grad_norm": 0.37403950095176697, "learning_rate": 6.555048198883816e-07, "loss": 0.0022, "step": 10305 }, { "epoch": 9.411872146118721, "grad_norm": 0.18936660885810852, "learning_rate": 6.544901065449011e-07, "loss": 0.0011, "step": 10306 }, { "epoch": 9.412785388127855, "grad_norm": 1.6618555784225464, "learning_rate": 6.534753932014207e-07, "loss": 0.0061, "step": 10307 }, { "epoch": 9.413698630136986, "grad_norm": 0.06248241662979126, "learning_rate": 6.524606798579402e-07, "loss": 0.0003, "step": 10308 }, { "epoch": 9.414611872146118, "grad_norm": 4.237881183624268, "learning_rate": 6.514459665144598e-07, "loss": 0.0222, "step": 10309 }, { "epoch": 9.415525114155251, "grad_norm": 1.6263912916183472, "learning_rate": 6.504312531709793e-07, "loss": 0.0073, "step": 10310 }, { "epoch": 9.416438356164383, "grad_norm": 0.28729844093322754, "learning_rate": 6.494165398274987e-07, "loss": 0.0013, "step": 10311 }, { "epoch": 9.417351598173516, "grad_norm": 0.040741581469774246, "learning_rate": 6.484018264840183e-07, "loss": 0.0002, "step": 10312 }, { "epoch": 9.418264840182648, "grad_norm": 1.0983937978744507, "learning_rate": 6.473871131405378e-07, "loss": 0.0056, "step": 10313 }, { "epoch": 9.419178082191781, "grad_norm": 0.14335940778255463, "learning_rate": 6.463723997970574e-07, "loss": 0.0007, "step": 10314 }, { "epoch": 9.420091324200913, "grad_norm": 0.2422093152999878, "learning_rate": 6.45357686453577e-07, "loss": 0.0013, "step": 10315 }, { "epoch": 9.421004566210046, "grad_norm": 0.06429034471511841, "learning_rate": 6.443429731100964e-07, "loss": 0.0003, "step": 10316 }, { "epoch": 9.421917808219177, "grad_norm": 0.01621151901781559, "learning_rate": 6.43328259766616e-07, "loss": 0.0001, "step": 10317 }, { "epoch": 9.42283105022831, "grad_norm": 0.12102001160383224, "learning_rate": 6.423135464231355e-07, "loss": 0.0006, "step": 10318 }, { "epoch": 9.423744292237442, "grad_norm": 0.08584211021661758, "learning_rate": 6.412988330796551e-07, "loss": 0.0006, "step": 10319 }, { "epoch": 9.424657534246576, "grad_norm": 17.314638137817383, "learning_rate": 6.402841197361747e-07, "loss": 0.0979, "step": 10320 }, { "epoch": 9.425570776255707, "grad_norm": 0.0761321559548378, "learning_rate": 6.39269406392694e-07, "loss": 0.0004, "step": 10321 }, { "epoch": 9.42648401826484, "grad_norm": 1.6370301246643066, "learning_rate": 6.382546930492136e-07, "loss": 0.0055, "step": 10322 }, { "epoch": 9.427397260273972, "grad_norm": 0.0917026549577713, "learning_rate": 6.372399797057331e-07, "loss": 0.0005, "step": 10323 }, { "epoch": 9.428310502283106, "grad_norm": 0.06813373416662216, "learning_rate": 6.362252663622527e-07, "loss": 0.0003, "step": 10324 }, { "epoch": 9.429223744292237, "grad_norm": 0.9216254949569702, "learning_rate": 6.352105530187723e-07, "loss": 0.0056, "step": 10325 }, { "epoch": 9.43013698630137, "grad_norm": 0.00441113393753767, "learning_rate": 6.341958396752918e-07, "loss": 0.0, "step": 10326 }, { "epoch": 9.431050228310502, "grad_norm": 0.11919418722391129, "learning_rate": 6.331811263318114e-07, "loss": 0.0006, "step": 10327 }, { "epoch": 9.431963470319635, "grad_norm": 1.3600306510925293, "learning_rate": 6.321664129883308e-07, "loss": 0.0085, "step": 10328 }, { "epoch": 9.432876712328767, "grad_norm": 1.855618953704834, "learning_rate": 6.311516996448504e-07, "loss": 0.0125, "step": 10329 }, { "epoch": 9.4337899543379, "grad_norm": 2.18448805809021, "learning_rate": 6.3013698630137e-07, "loss": 0.0141, "step": 10330 }, { "epoch": 9.434703196347032, "grad_norm": 2.478743553161621, "learning_rate": 6.291222729578894e-07, "loss": 0.0174, "step": 10331 }, { "epoch": 9.435616438356165, "grad_norm": 3.4580788612365723, "learning_rate": 6.28107559614409e-07, "loss": 0.0165, "step": 10332 }, { "epoch": 9.436529680365297, "grad_norm": 0.9894701242446899, "learning_rate": 6.270928462709285e-07, "loss": 0.006, "step": 10333 }, { "epoch": 9.43744292237443, "grad_norm": 3.8007454872131348, "learning_rate": 6.26078132927448e-07, "loss": 0.0234, "step": 10334 }, { "epoch": 9.438356164383562, "grad_norm": 49.072669982910156, "learning_rate": 6.250634195839676e-07, "loss": 0.3437, "step": 10335 }, { "epoch": 9.439269406392693, "grad_norm": 5.742048740386963, "learning_rate": 6.240487062404871e-07, "loss": 0.0303, "step": 10336 }, { "epoch": 9.440182648401827, "grad_norm": 0.19437766075134277, "learning_rate": 6.230339928970067e-07, "loss": 0.001, "step": 10337 }, { "epoch": 9.441095890410958, "grad_norm": 22.499755859375, "learning_rate": 6.220192795535262e-07, "loss": 0.1257, "step": 10338 }, { "epoch": 9.442009132420091, "grad_norm": 37.91051483154297, "learning_rate": 6.210045662100458e-07, "loss": 0.1811, "step": 10339 }, { "epoch": 9.442922374429223, "grad_norm": 0.4415449798107147, "learning_rate": 6.199898528665652e-07, "loss": 0.003, "step": 10340 }, { "epoch": 9.443835616438356, "grad_norm": 1.497240424156189, "learning_rate": 6.189751395230847e-07, "loss": 0.0085, "step": 10341 }, { "epoch": 9.444748858447488, "grad_norm": 7.924038887023926, "learning_rate": 6.179604261796043e-07, "loss": 0.0146, "step": 10342 }, { "epoch": 9.445662100456621, "grad_norm": 0.7133127450942993, "learning_rate": 6.169457128361239e-07, "loss": 0.0047, "step": 10343 }, { "epoch": 9.446575342465753, "grad_norm": 0.058477018028497696, "learning_rate": 6.159309994926434e-07, "loss": 0.0003, "step": 10344 }, { "epoch": 9.447488584474886, "grad_norm": 10.328676223754883, "learning_rate": 6.149162861491628e-07, "loss": 0.062, "step": 10345 }, { "epoch": 9.448401826484018, "grad_norm": 0.039935898035764694, "learning_rate": 6.139015728056824e-07, "loss": 0.0003, "step": 10346 }, { "epoch": 9.449315068493151, "grad_norm": 0.13386715948581696, "learning_rate": 6.12886859462202e-07, "loss": 0.0008, "step": 10347 }, { "epoch": 9.450228310502283, "grad_norm": 0.15596823394298553, "learning_rate": 6.118721461187215e-07, "loss": 0.0008, "step": 10348 }, { "epoch": 9.451141552511416, "grad_norm": 4.013532638549805, "learning_rate": 6.108574327752411e-07, "loss": 0.018, "step": 10349 }, { "epoch": 9.452054794520548, "grad_norm": 0.22210679948329926, "learning_rate": 6.098427194317606e-07, "loss": 0.001, "step": 10350 }, { "epoch": 9.45296803652968, "grad_norm": 0.01674947701394558, "learning_rate": 6.0882800608828e-07, "loss": 0.0001, "step": 10351 }, { "epoch": 9.453881278538812, "grad_norm": 0.03994818031787872, "learning_rate": 6.078132927447996e-07, "loss": 0.0002, "step": 10352 }, { "epoch": 9.454794520547946, "grad_norm": 0.12888845801353455, "learning_rate": 6.067985794013192e-07, "loss": 0.0007, "step": 10353 }, { "epoch": 9.455707762557077, "grad_norm": 0.5039073824882507, "learning_rate": 6.057838660578387e-07, "loss": 0.0031, "step": 10354 }, { "epoch": 9.45662100456621, "grad_norm": 0.11178150773048401, "learning_rate": 6.047691527143582e-07, "loss": 0.0005, "step": 10355 }, { "epoch": 9.457534246575342, "grad_norm": 0.427119642496109, "learning_rate": 6.037544393708778e-07, "loss": 0.0019, "step": 10356 }, { "epoch": 9.458447488584476, "grad_norm": 2.859621524810791, "learning_rate": 6.027397260273974e-07, "loss": 0.0122, "step": 10357 }, { "epoch": 9.459360730593607, "grad_norm": 0.8908593654632568, "learning_rate": 6.017250126839168e-07, "loss": 0.0067, "step": 10358 }, { "epoch": 9.46027397260274, "grad_norm": 0.27170541882514954, "learning_rate": 6.007102993404364e-07, "loss": 0.0011, "step": 10359 }, { "epoch": 9.461187214611872, "grad_norm": 14.694255828857422, "learning_rate": 5.996955859969559e-07, "loss": 0.0705, "step": 10360 }, { "epoch": 9.462100456621005, "grad_norm": 10.999971389770508, "learning_rate": 5.986808726534754e-07, "loss": 0.0906, "step": 10361 }, { "epoch": 9.463013698630137, "grad_norm": 4.1875176429748535, "learning_rate": 5.97666159309995e-07, "loss": 0.0161, "step": 10362 }, { "epoch": 9.463926940639269, "grad_norm": 0.5638141632080078, "learning_rate": 5.966514459665146e-07, "loss": 0.0039, "step": 10363 }, { "epoch": 9.464840182648402, "grad_norm": 0.09764942526817322, "learning_rate": 5.95636732623034e-07, "loss": 0.0004, "step": 10364 }, { "epoch": 9.465753424657533, "grad_norm": 1.8569709062576294, "learning_rate": 5.946220192795535e-07, "loss": 0.0121, "step": 10365 }, { "epoch": 9.466666666666667, "grad_norm": 0.13163650035858154, "learning_rate": 5.936073059360731e-07, "loss": 0.0008, "step": 10366 }, { "epoch": 9.467579908675798, "grad_norm": 7.07789945602417, "learning_rate": 5.925925925925927e-07, "loss": 0.0462, "step": 10367 }, { "epoch": 9.468493150684932, "grad_norm": 0.2967446744441986, "learning_rate": 5.915778792491122e-07, "loss": 0.0022, "step": 10368 }, { "epoch": 9.469406392694063, "grad_norm": 1.5243672132492065, "learning_rate": 5.905631659056318e-07, "loss": 0.0082, "step": 10369 }, { "epoch": 9.470319634703197, "grad_norm": 31.385068893432617, "learning_rate": 5.895484525621512e-07, "loss": 0.1533, "step": 10370 }, { "epoch": 9.471232876712328, "grad_norm": 12.935246467590332, "learning_rate": 5.885337392186707e-07, "loss": 0.0607, "step": 10371 }, { "epoch": 9.472146118721462, "grad_norm": 2.165856122970581, "learning_rate": 5.875190258751903e-07, "loss": 0.0108, "step": 10372 }, { "epoch": 9.473059360730593, "grad_norm": 3.256988286972046, "learning_rate": 5.865043125317099e-07, "loss": 0.0155, "step": 10373 }, { "epoch": 9.473972602739726, "grad_norm": 7.625792980194092, "learning_rate": 5.854895991882294e-07, "loss": 0.0463, "step": 10374 }, { "epoch": 9.474885844748858, "grad_norm": 0.19077105820178986, "learning_rate": 5.844748858447488e-07, "loss": 0.001, "step": 10375 }, { "epoch": 9.475799086757991, "grad_norm": 1.7169674634933472, "learning_rate": 5.834601725012684e-07, "loss": 0.0089, "step": 10376 }, { "epoch": 9.476712328767123, "grad_norm": 0.2042214274406433, "learning_rate": 5.82445459157788e-07, "loss": 0.0011, "step": 10377 }, { "epoch": 9.477625570776256, "grad_norm": 1.200820803642273, "learning_rate": 5.814307458143075e-07, "loss": 0.006, "step": 10378 }, { "epoch": 9.478538812785388, "grad_norm": 1.6526397466659546, "learning_rate": 5.804160324708271e-07, "loss": 0.0061, "step": 10379 }, { "epoch": 9.479452054794521, "grad_norm": 0.0518312081694603, "learning_rate": 5.794013191273466e-07, "loss": 0.0003, "step": 10380 }, { "epoch": 9.480365296803653, "grad_norm": 0.1493661105632782, "learning_rate": 5.78386605783866e-07, "loss": 0.0008, "step": 10381 }, { "epoch": 9.481278538812786, "grad_norm": 0.061721380800008774, "learning_rate": 5.773718924403856e-07, "loss": 0.0003, "step": 10382 }, { "epoch": 9.482191780821918, "grad_norm": 0.2546387016773224, "learning_rate": 5.763571790969052e-07, "loss": 0.001, "step": 10383 }, { "epoch": 9.483105022831051, "grad_norm": 0.10855932533740997, "learning_rate": 5.753424657534247e-07, "loss": 0.0006, "step": 10384 }, { "epoch": 9.484018264840183, "grad_norm": 0.41703125834465027, "learning_rate": 5.743277524099442e-07, "loss": 0.0035, "step": 10385 }, { "epoch": 9.484931506849316, "grad_norm": 0.25314339995384216, "learning_rate": 5.733130390664638e-07, "loss": 0.0014, "step": 10386 }, { "epoch": 9.485844748858447, "grad_norm": 4.540770053863525, "learning_rate": 5.722983257229834e-07, "loss": 0.0315, "step": 10387 }, { "epoch": 9.48675799086758, "grad_norm": 0.8274728655815125, "learning_rate": 5.712836123795028e-07, "loss": 0.0039, "step": 10388 }, { "epoch": 9.487671232876712, "grad_norm": 22.464143753051758, "learning_rate": 5.702688990360224e-07, "loss": 0.1147, "step": 10389 }, { "epoch": 9.488584474885844, "grad_norm": 1.4614416360855103, "learning_rate": 5.692541856925419e-07, "loss": 0.0085, "step": 10390 }, { "epoch": 9.489497716894977, "grad_norm": 0.12131176888942719, "learning_rate": 5.682394723490614e-07, "loss": 0.0009, "step": 10391 }, { "epoch": 9.490410958904109, "grad_norm": 0.018013767898082733, "learning_rate": 5.67224759005581e-07, "loss": 0.0001, "step": 10392 }, { "epoch": 9.491324200913242, "grad_norm": 9.785690307617188, "learning_rate": 5.662100456621006e-07, "loss": 0.0329, "step": 10393 }, { "epoch": 9.492237442922374, "grad_norm": 0.018454965204000473, "learning_rate": 5.6519533231862e-07, "loss": 0.0001, "step": 10394 }, { "epoch": 9.493150684931507, "grad_norm": 0.40716856718063354, "learning_rate": 5.641806189751395e-07, "loss": 0.0019, "step": 10395 }, { "epoch": 9.494063926940639, "grad_norm": 0.10198758542537689, "learning_rate": 5.631659056316591e-07, "loss": 0.0007, "step": 10396 }, { "epoch": 9.494977168949772, "grad_norm": 0.0015715701738372445, "learning_rate": 5.621511922881787e-07, "loss": 0.0, "step": 10397 }, { "epoch": 9.495890410958904, "grad_norm": 1.912717580795288, "learning_rate": 5.611364789446982e-07, "loss": 0.0116, "step": 10398 }, { "epoch": 9.496803652968037, "grad_norm": 0.5375090837478638, "learning_rate": 5.601217656012178e-07, "loss": 0.0031, "step": 10399 }, { "epoch": 9.497716894977168, "grad_norm": 0.10280657559633255, "learning_rate": 5.591070522577372e-07, "loss": 0.0006, "step": 10400 }, { "epoch": 9.498630136986302, "grad_norm": 0.05292952060699463, "learning_rate": 5.580923389142567e-07, "loss": 0.0004, "step": 10401 }, { "epoch": 9.499543378995433, "grad_norm": 0.10884721577167511, "learning_rate": 5.570776255707763e-07, "loss": 0.0007, "step": 10402 }, { "epoch": 9.500456621004567, "grad_norm": 1.365371584892273, "learning_rate": 5.560629122272959e-07, "loss": 0.0039, "step": 10403 }, { "epoch": 9.501369863013698, "grad_norm": 5.904114723205566, "learning_rate": 5.550481988838154e-07, "loss": 0.0306, "step": 10404 }, { "epoch": 9.502283105022832, "grad_norm": 1.1882370710372925, "learning_rate": 5.540334855403348e-07, "loss": 0.0077, "step": 10405 }, { "epoch": 9.503196347031963, "grad_norm": 1.7470676898956299, "learning_rate": 5.530187721968544e-07, "loss": 0.0121, "step": 10406 }, { "epoch": 9.504109589041096, "grad_norm": 0.14477339386940002, "learning_rate": 5.52004058853374e-07, "loss": 0.0007, "step": 10407 }, { "epoch": 9.505022831050228, "grad_norm": 0.1458183228969574, "learning_rate": 5.509893455098935e-07, "loss": 0.0008, "step": 10408 }, { "epoch": 9.505936073059361, "grad_norm": 0.1399746984243393, "learning_rate": 5.49974632166413e-07, "loss": 0.0007, "step": 10409 }, { "epoch": 9.506849315068493, "grad_norm": 0.023899005725979805, "learning_rate": 5.489599188229326e-07, "loss": 0.0001, "step": 10410 }, { "epoch": 9.507762557077626, "grad_norm": 0.47368231415748596, "learning_rate": 5.47945205479452e-07, "loss": 0.0019, "step": 10411 }, { "epoch": 9.508675799086758, "grad_norm": 0.4180942475795746, "learning_rate": 5.469304921359716e-07, "loss": 0.0018, "step": 10412 }, { "epoch": 9.509589041095891, "grad_norm": 0.06973256170749664, "learning_rate": 5.459157787924912e-07, "loss": 0.0003, "step": 10413 }, { "epoch": 9.510502283105023, "grad_norm": 0.22999326884746552, "learning_rate": 5.449010654490107e-07, "loss": 0.0013, "step": 10414 }, { "epoch": 9.511415525114156, "grad_norm": 2.2151403427124023, "learning_rate": 5.438863521055302e-07, "loss": 0.009, "step": 10415 }, { "epoch": 9.512328767123288, "grad_norm": 2.708052396774292, "learning_rate": 5.428716387620498e-07, "loss": 0.0178, "step": 10416 }, { "epoch": 9.51324200913242, "grad_norm": 1.7108937501907349, "learning_rate": 5.418569254185693e-07, "loss": 0.0126, "step": 10417 }, { "epoch": 9.514155251141553, "grad_norm": 0.02753269299864769, "learning_rate": 5.408422120750888e-07, "loss": 0.0002, "step": 10418 }, { "epoch": 9.515068493150684, "grad_norm": 5.544773101806641, "learning_rate": 5.398274987316083e-07, "loss": 0.0362, "step": 10419 }, { "epoch": 9.515981735159817, "grad_norm": 0.47571223974227905, "learning_rate": 5.388127853881279e-07, "loss": 0.0023, "step": 10420 }, { "epoch": 9.516894977168949, "grad_norm": 0.42231497168540955, "learning_rate": 5.377980720446474e-07, "loss": 0.0022, "step": 10421 }, { "epoch": 9.517808219178082, "grad_norm": 0.054833486676216125, "learning_rate": 5.36783358701167e-07, "loss": 0.0002, "step": 10422 }, { "epoch": 9.518721461187214, "grad_norm": 1.0691111087799072, "learning_rate": 5.357686453576865e-07, "loss": 0.0078, "step": 10423 }, { "epoch": 9.519634703196347, "grad_norm": 0.355144202709198, "learning_rate": 5.34753932014206e-07, "loss": 0.002, "step": 10424 }, { "epoch": 9.520547945205479, "grad_norm": 18.30998420715332, "learning_rate": 5.337392186707255e-07, "loss": 0.1163, "step": 10425 }, { "epoch": 9.521461187214612, "grad_norm": 12.899648666381836, "learning_rate": 5.327245053272451e-07, "loss": 0.0631, "step": 10426 }, { "epoch": 9.522374429223744, "grad_norm": 0.48143357038497925, "learning_rate": 5.317097919837647e-07, "loss": 0.0029, "step": 10427 }, { "epoch": 9.523287671232877, "grad_norm": 0.027472756803035736, "learning_rate": 5.306950786402842e-07, "loss": 0.0002, "step": 10428 }, { "epoch": 9.524200913242009, "grad_norm": 0.10287013649940491, "learning_rate": 5.296803652968036e-07, "loss": 0.0002, "step": 10429 }, { "epoch": 9.525114155251142, "grad_norm": 2.798794746398926, "learning_rate": 5.286656519533232e-07, "loss": 0.0173, "step": 10430 }, { "epoch": 9.526027397260274, "grad_norm": 13.65976619720459, "learning_rate": 5.276509386098427e-07, "loss": 0.0571, "step": 10431 }, { "epoch": 9.526940639269407, "grad_norm": 0.016985280439257622, "learning_rate": 5.266362252663623e-07, "loss": 0.0001, "step": 10432 }, { "epoch": 9.527853881278538, "grad_norm": 0.8065924048423767, "learning_rate": 5.256215119228819e-07, "loss": 0.0044, "step": 10433 }, { "epoch": 9.528767123287672, "grad_norm": 0.11448769271373749, "learning_rate": 5.246067985794014e-07, "loss": 0.0005, "step": 10434 }, { "epoch": 9.529680365296803, "grad_norm": 1.4970993995666504, "learning_rate": 5.235920852359208e-07, "loss": 0.0099, "step": 10435 }, { "epoch": 9.530593607305937, "grad_norm": 0.0020495043136179447, "learning_rate": 5.225773718924404e-07, "loss": 0.0, "step": 10436 }, { "epoch": 9.531506849315068, "grad_norm": 11.1576566696167, "learning_rate": 5.2156265854896e-07, "loss": 0.0584, "step": 10437 }, { "epoch": 9.532420091324202, "grad_norm": 1.6832138299942017, "learning_rate": 5.205479452054795e-07, "loss": 0.0134, "step": 10438 }, { "epoch": 9.533333333333333, "grad_norm": 3.516587734222412, "learning_rate": 5.19533231861999e-07, "loss": 0.0266, "step": 10439 }, { "epoch": 9.534246575342467, "grad_norm": 14.390929222106934, "learning_rate": 5.185185185185186e-07, "loss": 0.0686, "step": 10440 }, { "epoch": 9.535159817351598, "grad_norm": 3.2026724815368652, "learning_rate": 5.17503805175038e-07, "loss": 0.0161, "step": 10441 }, { "epoch": 9.536073059360731, "grad_norm": 2.847811222076416, "learning_rate": 5.164890918315576e-07, "loss": 0.0084, "step": 10442 }, { "epoch": 9.536986301369863, "grad_norm": 0.6330407857894897, "learning_rate": 5.154743784880772e-07, "loss": 0.0022, "step": 10443 }, { "epoch": 9.537899543378995, "grad_norm": 6.623506546020508, "learning_rate": 5.144596651445967e-07, "loss": 0.0405, "step": 10444 }, { "epoch": 9.538812785388128, "grad_norm": 0.040037985891103745, "learning_rate": 5.134449518011162e-07, "loss": 0.0002, "step": 10445 }, { "epoch": 9.53972602739726, "grad_norm": 3.1375818252563477, "learning_rate": 5.124302384576358e-07, "loss": 0.0099, "step": 10446 }, { "epoch": 9.540639269406393, "grad_norm": 0.7772542834281921, "learning_rate": 5.114155251141553e-07, "loss": 0.0036, "step": 10447 }, { "epoch": 9.541552511415524, "grad_norm": 0.5246714949607849, "learning_rate": 5.104008117706748e-07, "loss": 0.0039, "step": 10448 }, { "epoch": 9.542465753424658, "grad_norm": 26.946239471435547, "learning_rate": 5.093860984271943e-07, "loss": 0.1411, "step": 10449 }, { "epoch": 9.54337899543379, "grad_norm": 2.0678415298461914, "learning_rate": 5.083713850837139e-07, "loss": 0.015, "step": 10450 }, { "epoch": 9.544292237442923, "grad_norm": 0.029268402606248856, "learning_rate": 5.073566717402334e-07, "loss": 0.0002, "step": 10451 }, { "epoch": 9.545205479452054, "grad_norm": 10.489130020141602, "learning_rate": 5.06341958396753e-07, "loss": 0.0849, "step": 10452 }, { "epoch": 9.546118721461188, "grad_norm": 3.016216516494751, "learning_rate": 5.053272450532725e-07, "loss": 0.0173, "step": 10453 }, { "epoch": 9.54703196347032, "grad_norm": 0.19424284994602203, "learning_rate": 5.04312531709792e-07, "loss": 0.0014, "step": 10454 }, { "epoch": 9.547945205479452, "grad_norm": 1.864060878753662, "learning_rate": 5.032978183663115e-07, "loss": 0.0099, "step": 10455 }, { "epoch": 9.548858447488584, "grad_norm": 0.3945363759994507, "learning_rate": 5.022831050228311e-07, "loss": 0.0024, "step": 10456 }, { "epoch": 9.549771689497717, "grad_norm": 0.14496882259845734, "learning_rate": 5.012683916793507e-07, "loss": 0.0007, "step": 10457 }, { "epoch": 9.550684931506849, "grad_norm": 0.2900809049606323, "learning_rate": 5.002536783358702e-07, "loss": 0.0016, "step": 10458 }, { "epoch": 9.551598173515982, "grad_norm": 0.9737546443939209, "learning_rate": 4.992389649923896e-07, "loss": 0.0062, "step": 10459 }, { "epoch": 9.552511415525114, "grad_norm": 0.05054236948490143, "learning_rate": 4.982242516489092e-07, "loss": 0.0003, "step": 10460 }, { "epoch": 9.553424657534247, "grad_norm": 0.3850627541542053, "learning_rate": 4.972095383054287e-07, "loss": 0.0022, "step": 10461 }, { "epoch": 9.554337899543379, "grad_norm": 2.7514898777008057, "learning_rate": 4.961948249619483e-07, "loss": 0.0136, "step": 10462 }, { "epoch": 9.555251141552512, "grad_norm": 0.006919137667864561, "learning_rate": 4.951801116184679e-07, "loss": 0.0, "step": 10463 }, { "epoch": 9.556164383561644, "grad_norm": 0.01898546889424324, "learning_rate": 4.941653982749874e-07, "loss": 0.0001, "step": 10464 }, { "epoch": 9.557077625570777, "grad_norm": 0.06983133405447006, "learning_rate": 4.931506849315068e-07, "loss": 0.0004, "step": 10465 }, { "epoch": 9.557990867579909, "grad_norm": 6.132462978363037, "learning_rate": 4.921359715880264e-07, "loss": 0.0258, "step": 10466 }, { "epoch": 9.558904109589042, "grad_norm": 2.101525068283081, "learning_rate": 4.91121258244546e-07, "loss": 0.0126, "step": 10467 }, { "epoch": 9.559817351598173, "grad_norm": 0.08675671368837357, "learning_rate": 4.901065449010655e-07, "loss": 0.0007, "step": 10468 }, { "epoch": 9.560730593607307, "grad_norm": 0.6226363182067871, "learning_rate": 4.89091831557585e-07, "loss": 0.0045, "step": 10469 }, { "epoch": 9.561643835616438, "grad_norm": 5.146440029144287, "learning_rate": 4.880771182141046e-07, "loss": 0.0193, "step": 10470 }, { "epoch": 9.56255707762557, "grad_norm": 0.17680992186069489, "learning_rate": 4.87062404870624e-07, "loss": 0.001, "step": 10471 }, { "epoch": 9.563470319634703, "grad_norm": 0.1317344307899475, "learning_rate": 4.860476915271436e-07, "loss": 0.0007, "step": 10472 }, { "epoch": 9.564383561643835, "grad_norm": 0.2993122935295105, "learning_rate": 4.850329781836632e-07, "loss": 0.0021, "step": 10473 }, { "epoch": 9.565296803652968, "grad_norm": 4.217282772064209, "learning_rate": 4.840182648401827e-07, "loss": 0.0248, "step": 10474 }, { "epoch": 9.5662100456621, "grad_norm": 15.240309715270996, "learning_rate": 4.830035514967022e-07, "loss": 0.0806, "step": 10475 }, { "epoch": 9.567123287671233, "grad_norm": 1.5540140867233276, "learning_rate": 4.819888381532218e-07, "loss": 0.0114, "step": 10476 }, { "epoch": 9.568036529680365, "grad_norm": 3.323908567428589, "learning_rate": 4.809741248097413e-07, "loss": 0.0134, "step": 10477 }, { "epoch": 9.568949771689498, "grad_norm": 5.733242511749268, "learning_rate": 4.799594114662608e-07, "loss": 0.0218, "step": 10478 }, { "epoch": 9.56986301369863, "grad_norm": 8.574411392211914, "learning_rate": 4.789446981227803e-07, "loss": 0.0554, "step": 10479 }, { "epoch": 9.570776255707763, "grad_norm": 0.42142748832702637, "learning_rate": 4.779299847792999e-07, "loss": 0.0022, "step": 10480 }, { "epoch": 9.571689497716894, "grad_norm": 0.49144190549850464, "learning_rate": 4.769152714358194e-07, "loss": 0.0025, "step": 10481 }, { "epoch": 9.572602739726028, "grad_norm": 26.655048370361328, "learning_rate": 4.7590055809233896e-07, "loss": 0.114, "step": 10482 }, { "epoch": 9.57351598173516, "grad_norm": 0.1608947068452835, "learning_rate": 4.748858447488585e-07, "loss": 0.0009, "step": 10483 }, { "epoch": 9.574429223744293, "grad_norm": 0.010127500630915165, "learning_rate": 4.73871131405378e-07, "loss": 0.0001, "step": 10484 }, { "epoch": 9.575342465753424, "grad_norm": 4.974613189697266, "learning_rate": 4.7285641806189756e-07, "loss": 0.024, "step": 10485 }, { "epoch": 9.576255707762558, "grad_norm": 0.09279605001211166, "learning_rate": 4.7184170471841704e-07, "loss": 0.0006, "step": 10486 }, { "epoch": 9.57716894977169, "grad_norm": 1.0792882442474365, "learning_rate": 4.708269913749366e-07, "loss": 0.0036, "step": 10487 }, { "epoch": 9.578082191780823, "grad_norm": 0.018019547685980797, "learning_rate": 4.6981227803145616e-07, "loss": 0.0001, "step": 10488 }, { "epoch": 9.578995433789954, "grad_norm": 0.9549350142478943, "learning_rate": 4.687975646879757e-07, "loss": 0.0053, "step": 10489 }, { "epoch": 9.579908675799087, "grad_norm": 2.689774751663208, "learning_rate": 4.677828513444952e-07, "loss": 0.0117, "step": 10490 }, { "epoch": 9.580821917808219, "grad_norm": 1.5775636434555054, "learning_rate": 4.667681380010147e-07, "loss": 0.0067, "step": 10491 }, { "epoch": 9.581735159817352, "grad_norm": 0.03506266325712204, "learning_rate": 4.657534246575343e-07, "loss": 0.0002, "step": 10492 }, { "epoch": 9.582648401826484, "grad_norm": 1.3749014139175415, "learning_rate": 4.647387113140538e-07, "loss": 0.0081, "step": 10493 }, { "epoch": 9.583561643835617, "grad_norm": 0.08925909548997879, "learning_rate": 4.6372399797057336e-07, "loss": 0.0006, "step": 10494 }, { "epoch": 9.584474885844749, "grad_norm": 0.22139865159988403, "learning_rate": 4.627092846270929e-07, "loss": 0.0013, "step": 10495 }, { "epoch": 9.585388127853882, "grad_norm": 0.24019823968410492, "learning_rate": 4.6169457128361237e-07, "loss": 0.0015, "step": 10496 }, { "epoch": 9.586301369863014, "grad_norm": 28.445301055908203, "learning_rate": 4.6067985794013196e-07, "loss": 0.1242, "step": 10497 }, { "epoch": 9.587214611872145, "grad_norm": 0.2329699546098709, "learning_rate": 4.596651445966515e-07, "loss": 0.0014, "step": 10498 }, { "epoch": 9.588127853881279, "grad_norm": 12.325511932373047, "learning_rate": 4.58650431253171e-07, "loss": 0.0439, "step": 10499 }, { "epoch": 9.58904109589041, "grad_norm": 0.1972113400697708, "learning_rate": 4.5763571790969056e-07, "loss": 0.0009, "step": 10500 }, { "epoch": 9.589954337899544, "grad_norm": 0.5895698666572571, "learning_rate": 4.5662100456621004e-07, "loss": 0.0034, "step": 10501 }, { "epoch": 9.590867579908675, "grad_norm": 0.17655298113822937, "learning_rate": 4.556062912227296e-07, "loss": 0.0008, "step": 10502 }, { "epoch": 9.591780821917808, "grad_norm": 0.29813215136528015, "learning_rate": 4.5459157787924916e-07, "loss": 0.0018, "step": 10503 }, { "epoch": 9.59269406392694, "grad_norm": 0.2680056691169739, "learning_rate": 4.535768645357687e-07, "loss": 0.0017, "step": 10504 }, { "epoch": 9.593607305936073, "grad_norm": 0.27889710664749146, "learning_rate": 4.525621511922882e-07, "loss": 0.0013, "step": 10505 }, { "epoch": 9.594520547945205, "grad_norm": 30.157503128051758, "learning_rate": 4.515474378488077e-07, "loss": 0.177, "step": 10506 }, { "epoch": 9.595433789954338, "grad_norm": 0.21362631022930145, "learning_rate": 4.505327245053273e-07, "loss": 0.001, "step": 10507 }, { "epoch": 9.59634703196347, "grad_norm": 0.6169705986976624, "learning_rate": 4.495180111618468e-07, "loss": 0.0043, "step": 10508 }, { "epoch": 9.597260273972603, "grad_norm": 0.1262662708759308, "learning_rate": 4.4850329781836636e-07, "loss": 0.0008, "step": 10509 }, { "epoch": 9.598173515981735, "grad_norm": 1.1948490142822266, "learning_rate": 4.474885844748859e-07, "loss": 0.0052, "step": 10510 }, { "epoch": 9.599086757990868, "grad_norm": 2.6042287349700928, "learning_rate": 4.4647387113140537e-07, "loss": 0.0099, "step": 10511 }, { "epoch": 9.6, "grad_norm": 0.052663933485746384, "learning_rate": 4.4545915778792496e-07, "loss": 0.0003, "step": 10512 }, { "epoch": 9.600913242009133, "grad_norm": 0.8079361319541931, "learning_rate": 4.444444444444445e-07, "loss": 0.0045, "step": 10513 }, { "epoch": 9.601826484018265, "grad_norm": 60.02363204956055, "learning_rate": 4.43429731100964e-07, "loss": 0.5283, "step": 10514 }, { "epoch": 9.602739726027398, "grad_norm": 0.037067387253046036, "learning_rate": 4.4241501775748356e-07, "loss": 0.0002, "step": 10515 }, { "epoch": 9.60365296803653, "grad_norm": 5.851842880249023, "learning_rate": 4.4140030441400304e-07, "loss": 0.0232, "step": 10516 }, { "epoch": 9.604566210045663, "grad_norm": 1.809003233909607, "learning_rate": 4.403855910705226e-07, "loss": 0.0062, "step": 10517 }, { "epoch": 9.605479452054794, "grad_norm": 0.12929686903953552, "learning_rate": 4.3937087772704216e-07, "loss": 0.0006, "step": 10518 }, { "epoch": 9.606392694063928, "grad_norm": 0.4262963831424713, "learning_rate": 4.383561643835617e-07, "loss": 0.0023, "step": 10519 }, { "epoch": 9.60730593607306, "grad_norm": 1.0248862504959106, "learning_rate": 4.373414510400812e-07, "loss": 0.0049, "step": 10520 }, { "epoch": 9.608219178082193, "grad_norm": 0.8770782947540283, "learning_rate": 4.363267376966007e-07, "loss": 0.0029, "step": 10521 }, { "epoch": 9.609132420091324, "grad_norm": 1.166753888130188, "learning_rate": 4.353120243531203e-07, "loss": 0.0061, "step": 10522 }, { "epoch": 9.610045662100458, "grad_norm": 16.273805618286133, "learning_rate": 4.342973110096398e-07, "loss": 0.0984, "step": 10523 }, { "epoch": 9.610958904109589, "grad_norm": 40.23348617553711, "learning_rate": 4.3328259766615935e-07, "loss": 0.2125, "step": 10524 }, { "epoch": 9.61187214611872, "grad_norm": 0.36671727895736694, "learning_rate": 4.322678843226789e-07, "loss": 0.0025, "step": 10525 }, { "epoch": 9.612785388127854, "grad_norm": 0.45933496952056885, "learning_rate": 4.3125317097919837e-07, "loss": 0.0038, "step": 10526 }, { "epoch": 9.613698630136986, "grad_norm": 1.2133738994598389, "learning_rate": 4.3023845763571795e-07, "loss": 0.0065, "step": 10527 }, { "epoch": 9.614611872146119, "grad_norm": 1.5034552812576294, "learning_rate": 4.292237442922375e-07, "loss": 0.0104, "step": 10528 }, { "epoch": 9.61552511415525, "grad_norm": 8.509368896484375, "learning_rate": 4.28209030948757e-07, "loss": 0.0399, "step": 10529 }, { "epoch": 9.616438356164384, "grad_norm": 0.38122522830963135, "learning_rate": 4.2719431760527655e-07, "loss": 0.0026, "step": 10530 }, { "epoch": 9.617351598173515, "grad_norm": 2.9942426681518555, "learning_rate": 4.2617960426179603e-07, "loss": 0.0141, "step": 10531 }, { "epoch": 9.618264840182649, "grad_norm": 15.33066463470459, "learning_rate": 4.251648909183156e-07, "loss": 0.1021, "step": 10532 }, { "epoch": 9.61917808219178, "grad_norm": 0.017736736685037613, "learning_rate": 4.2415017757483515e-07, "loss": 0.0001, "step": 10533 }, { "epoch": 9.620091324200914, "grad_norm": 0.013833190314471722, "learning_rate": 4.231354642313547e-07, "loss": 0.0001, "step": 10534 }, { "epoch": 9.621004566210045, "grad_norm": 0.2372397482395172, "learning_rate": 4.221207508878742e-07, "loss": 0.0014, "step": 10535 }, { "epoch": 9.621917808219179, "grad_norm": 16.01127052307129, "learning_rate": 4.211060375443937e-07, "loss": 0.0415, "step": 10536 }, { "epoch": 9.62283105022831, "grad_norm": 0.6748915910720825, "learning_rate": 4.200913242009133e-07, "loss": 0.0037, "step": 10537 }, { "epoch": 9.623744292237443, "grad_norm": 0.12879452109336853, "learning_rate": 4.190766108574328e-07, "loss": 0.0011, "step": 10538 }, { "epoch": 9.624657534246575, "grad_norm": 0.07780712842941284, "learning_rate": 4.1806189751395235e-07, "loss": 0.0005, "step": 10539 }, { "epoch": 9.625570776255708, "grad_norm": 0.03396953269839287, "learning_rate": 4.170471841704719e-07, "loss": 0.0003, "step": 10540 }, { "epoch": 9.62648401826484, "grad_norm": 6.319369792938232, "learning_rate": 4.1603247082699137e-07, "loss": 0.0321, "step": 10541 }, { "epoch": 9.627397260273973, "grad_norm": 0.8412399291992188, "learning_rate": 4.1501775748351095e-07, "loss": 0.0035, "step": 10542 }, { "epoch": 9.628310502283105, "grad_norm": 22.109500885009766, "learning_rate": 4.140030441400305e-07, "loss": 0.097, "step": 10543 }, { "epoch": 9.629223744292238, "grad_norm": 1.0430890321731567, "learning_rate": 4.1298833079655e-07, "loss": 0.0051, "step": 10544 }, { "epoch": 9.63013698630137, "grad_norm": 0.33754363656044006, "learning_rate": 4.1197361745306955e-07, "loss": 0.0013, "step": 10545 }, { "epoch": 9.631050228310503, "grad_norm": 0.421532541513443, "learning_rate": 4.1095890410958903e-07, "loss": 0.0018, "step": 10546 }, { "epoch": 9.631963470319635, "grad_norm": 1.5355422496795654, "learning_rate": 4.099441907661086e-07, "loss": 0.0073, "step": 10547 }, { "epoch": 9.632876712328766, "grad_norm": 0.7543661594390869, "learning_rate": 4.0892947742262815e-07, "loss": 0.0027, "step": 10548 }, { "epoch": 9.6337899543379, "grad_norm": 1.8691517114639282, "learning_rate": 4.079147640791477e-07, "loss": 0.0091, "step": 10549 }, { "epoch": 9.634703196347033, "grad_norm": 2.919363260269165, "learning_rate": 4.069000507356672e-07, "loss": 0.0131, "step": 10550 }, { "epoch": 9.635616438356164, "grad_norm": 2.125891923904419, "learning_rate": 4.058853373921867e-07, "loss": 0.0149, "step": 10551 }, { "epoch": 9.636529680365296, "grad_norm": 0.005853475071489811, "learning_rate": 4.048706240487063e-07, "loss": 0.0, "step": 10552 }, { "epoch": 9.63744292237443, "grad_norm": 0.16494424641132355, "learning_rate": 4.038559107052258e-07, "loss": 0.0004, "step": 10553 }, { "epoch": 9.638356164383561, "grad_norm": 1.4753345251083374, "learning_rate": 4.0284119736174535e-07, "loss": 0.0097, "step": 10554 }, { "epoch": 9.639269406392694, "grad_norm": 7.062949180603027, "learning_rate": 4.018264840182649e-07, "loss": 0.0338, "step": 10555 }, { "epoch": 9.640182648401826, "grad_norm": 2.3788137435913086, "learning_rate": 4.0081177067478437e-07, "loss": 0.0128, "step": 10556 }, { "epoch": 9.64109589041096, "grad_norm": 0.5192857384681702, "learning_rate": 3.9979705733130395e-07, "loss": 0.0023, "step": 10557 }, { "epoch": 9.64200913242009, "grad_norm": 19.664186477661133, "learning_rate": 3.987823439878235e-07, "loss": 0.1119, "step": 10558 }, { "epoch": 9.642922374429224, "grad_norm": 0.5061407089233398, "learning_rate": 3.97767630644343e-07, "loss": 0.0028, "step": 10559 }, { "epoch": 9.643835616438356, "grad_norm": 0.018307602033019066, "learning_rate": 3.9675291730086255e-07, "loss": 0.0001, "step": 10560 }, { "epoch": 9.644748858447489, "grad_norm": 0.049493446946144104, "learning_rate": 3.9573820395738203e-07, "loss": 0.0004, "step": 10561 }, { "epoch": 9.64566210045662, "grad_norm": 0.10144434869289398, "learning_rate": 3.947234906139016e-07, "loss": 0.0005, "step": 10562 }, { "epoch": 9.646575342465754, "grad_norm": 0.33820122480392456, "learning_rate": 3.9370877727042115e-07, "loss": 0.0025, "step": 10563 }, { "epoch": 9.647488584474885, "grad_norm": 0.08232814073562622, "learning_rate": 3.926940639269407e-07, "loss": 0.0005, "step": 10564 }, { "epoch": 9.648401826484019, "grad_norm": 0.07879738509654999, "learning_rate": 3.916793505834602e-07, "loss": 0.0006, "step": 10565 }, { "epoch": 9.64931506849315, "grad_norm": 0.2580952048301697, "learning_rate": 3.906646372399797e-07, "loss": 0.0014, "step": 10566 }, { "epoch": 9.650228310502284, "grad_norm": 0.5580341815948486, "learning_rate": 3.896499238964993e-07, "loss": 0.0039, "step": 10567 }, { "epoch": 9.651141552511415, "grad_norm": 3.275768280029297, "learning_rate": 3.886352105530188e-07, "loss": 0.0152, "step": 10568 }, { "epoch": 9.652054794520549, "grad_norm": 0.6450706124305725, "learning_rate": 3.8762049720953835e-07, "loss": 0.004, "step": 10569 }, { "epoch": 9.65296803652968, "grad_norm": 0.13708117604255676, "learning_rate": 3.866057838660579e-07, "loss": 0.0008, "step": 10570 }, { "epoch": 9.653881278538814, "grad_norm": 2.854053258895874, "learning_rate": 3.8559107052257736e-07, "loss": 0.0109, "step": 10571 }, { "epoch": 9.654794520547945, "grad_norm": 2.374911069869995, "learning_rate": 3.8457635717909695e-07, "loss": 0.0105, "step": 10572 }, { "epoch": 9.655707762557078, "grad_norm": 0.49884527921676636, "learning_rate": 3.835616438356165e-07, "loss": 0.0037, "step": 10573 }, { "epoch": 9.65662100456621, "grad_norm": 0.06462673097848892, "learning_rate": 3.82546930492136e-07, "loss": 0.0005, "step": 10574 }, { "epoch": 9.657534246575342, "grad_norm": 14.256170272827148, "learning_rate": 3.8153221714865555e-07, "loss": 0.064, "step": 10575 }, { "epoch": 9.658447488584475, "grad_norm": 0.6069503426551819, "learning_rate": 3.8051750380517503e-07, "loss": 0.0027, "step": 10576 }, { "epoch": 9.659360730593608, "grad_norm": 0.45889991521835327, "learning_rate": 3.795027904616946e-07, "loss": 0.0027, "step": 10577 }, { "epoch": 9.66027397260274, "grad_norm": 0.03492644801735878, "learning_rate": 3.7848807711821415e-07, "loss": 0.0002, "step": 10578 }, { "epoch": 9.661187214611871, "grad_norm": 0.06104857474565506, "learning_rate": 3.774733637747337e-07, "loss": 0.0003, "step": 10579 }, { "epoch": 9.662100456621005, "grad_norm": 0.19619262218475342, "learning_rate": 3.764586504312532e-07, "loss": 0.0012, "step": 10580 }, { "epoch": 9.663013698630136, "grad_norm": 0.863470196723938, "learning_rate": 3.754439370877727e-07, "loss": 0.0055, "step": 10581 }, { "epoch": 9.66392694063927, "grad_norm": 1.0160259008407593, "learning_rate": 3.7442922374429223e-07, "loss": 0.0044, "step": 10582 }, { "epoch": 9.664840182648401, "grad_norm": 0.18092401325702667, "learning_rate": 3.734145104008118e-07, "loss": 0.0013, "step": 10583 }, { "epoch": 9.665753424657535, "grad_norm": 0.42618486285209656, "learning_rate": 3.7239979705733135e-07, "loss": 0.0028, "step": 10584 }, { "epoch": 9.666666666666666, "grad_norm": 0.004286778625100851, "learning_rate": 3.713850837138509e-07, "loss": 0.0, "step": 10585 }, { "epoch": 9.6675799086758, "grad_norm": 0.1639488786458969, "learning_rate": 3.7037037037037036e-07, "loss": 0.0011, "step": 10586 }, { "epoch": 9.668493150684931, "grad_norm": 0.5798527598381042, "learning_rate": 3.693556570268899e-07, "loss": 0.0042, "step": 10587 }, { "epoch": 9.669406392694064, "grad_norm": 0.025639913976192474, "learning_rate": 3.683409436834095e-07, "loss": 0.0002, "step": 10588 }, { "epoch": 9.670319634703196, "grad_norm": 0.027381183579564095, "learning_rate": 3.67326230339929e-07, "loss": 0.0001, "step": 10589 }, { "epoch": 9.67123287671233, "grad_norm": 0.02848464623093605, "learning_rate": 3.6631151699644855e-07, "loss": 0.0002, "step": 10590 }, { "epoch": 9.67214611872146, "grad_norm": 5.453968048095703, "learning_rate": 3.6529680365296803e-07, "loss": 0.017, "step": 10591 }, { "epoch": 9.673059360730594, "grad_norm": 1.104219913482666, "learning_rate": 3.6428209030948756e-07, "loss": 0.0063, "step": 10592 }, { "epoch": 9.673972602739726, "grad_norm": 0.006282254587858915, "learning_rate": 3.6326737696600715e-07, "loss": 0.0, "step": 10593 }, { "epoch": 9.674885844748859, "grad_norm": 0.8346133828163147, "learning_rate": 3.622526636225267e-07, "loss": 0.0041, "step": 10594 }, { "epoch": 9.67579908675799, "grad_norm": 2.2515511512756348, "learning_rate": 3.612379502790462e-07, "loss": 0.008, "step": 10595 }, { "epoch": 9.676712328767124, "grad_norm": 1.9532406330108643, "learning_rate": 3.602232369355657e-07, "loss": 0.0103, "step": 10596 }, { "epoch": 9.677625570776256, "grad_norm": 0.10945838689804077, "learning_rate": 3.5920852359208523e-07, "loss": 0.0007, "step": 10597 }, { "epoch": 9.678538812785389, "grad_norm": 0.18958072364330292, "learning_rate": 3.581938102486048e-07, "loss": 0.001, "step": 10598 }, { "epoch": 9.67945205479452, "grad_norm": 13.51171875, "learning_rate": 3.5717909690512435e-07, "loss": 0.0636, "step": 10599 }, { "epoch": 9.680365296803654, "grad_norm": 0.6077075600624084, "learning_rate": 3.561643835616439e-07, "loss": 0.0022, "step": 10600 }, { "epoch": 9.681278538812785, "grad_norm": 0.2445686310529709, "learning_rate": 3.5514967021816336e-07, "loss": 0.0016, "step": 10601 }, { "epoch": 9.682191780821917, "grad_norm": 0.02624424360692501, "learning_rate": 3.541349568746829e-07, "loss": 0.0002, "step": 10602 }, { "epoch": 9.68310502283105, "grad_norm": 1.520774245262146, "learning_rate": 3.531202435312025e-07, "loss": 0.0016, "step": 10603 }, { "epoch": 9.684018264840184, "grad_norm": 0.029204409569501877, "learning_rate": 3.52105530187722e-07, "loss": 0.0002, "step": 10604 }, { "epoch": 9.684931506849315, "grad_norm": 1.0088368654251099, "learning_rate": 3.5109081684424155e-07, "loss": 0.0055, "step": 10605 }, { "epoch": 9.685844748858447, "grad_norm": 0.07041903585195541, "learning_rate": 3.5007610350076103e-07, "loss": 0.0005, "step": 10606 }, { "epoch": 9.68675799086758, "grad_norm": 1.5814869403839111, "learning_rate": 3.4906139015728056e-07, "loss": 0.0071, "step": 10607 }, { "epoch": 9.687671232876712, "grad_norm": 0.402261883020401, "learning_rate": 3.4804667681380015e-07, "loss": 0.0022, "step": 10608 }, { "epoch": 9.688584474885845, "grad_norm": 0.018866198137402534, "learning_rate": 3.470319634703197e-07, "loss": 0.0001, "step": 10609 }, { "epoch": 9.689497716894977, "grad_norm": 3.581183671951294, "learning_rate": 3.460172501268392e-07, "loss": 0.0175, "step": 10610 }, { "epoch": 9.69041095890411, "grad_norm": 0.46922433376312256, "learning_rate": 3.450025367833587e-07, "loss": 0.0022, "step": 10611 }, { "epoch": 9.691324200913241, "grad_norm": 2.6995270252227783, "learning_rate": 3.4398782343987823e-07, "loss": 0.0143, "step": 10612 }, { "epoch": 9.692237442922375, "grad_norm": 25.127948760986328, "learning_rate": 3.429731100963978e-07, "loss": 0.1299, "step": 10613 }, { "epoch": 9.693150684931506, "grad_norm": 0.206023171544075, "learning_rate": 3.4195839675291735e-07, "loss": 0.0009, "step": 10614 }, { "epoch": 9.69406392694064, "grad_norm": 0.13815881311893463, "learning_rate": 3.409436834094369e-07, "loss": 0.0007, "step": 10615 }, { "epoch": 9.694977168949771, "grad_norm": 0.5354087352752686, "learning_rate": 3.3992897006595636e-07, "loss": 0.0041, "step": 10616 }, { "epoch": 9.695890410958905, "grad_norm": 0.3566213846206665, "learning_rate": 3.389142567224759e-07, "loss": 0.0025, "step": 10617 }, { "epoch": 9.696803652968036, "grad_norm": 0.1710534542798996, "learning_rate": 3.378995433789955e-07, "loss": 0.001, "step": 10618 }, { "epoch": 9.69771689497717, "grad_norm": 0.003579284530133009, "learning_rate": 3.36884830035515e-07, "loss": 0.0, "step": 10619 }, { "epoch": 9.698630136986301, "grad_norm": 0.14180707931518555, "learning_rate": 3.3587011669203455e-07, "loss": 0.0008, "step": 10620 }, { "epoch": 9.699543378995434, "grad_norm": 2.5931925773620605, "learning_rate": 3.3485540334855403e-07, "loss": 0.0134, "step": 10621 }, { "epoch": 9.700456621004566, "grad_norm": 0.40981724858283997, "learning_rate": 3.3384069000507356e-07, "loss": 0.0022, "step": 10622 }, { "epoch": 9.7013698630137, "grad_norm": 0.359395831823349, "learning_rate": 3.3282597666159315e-07, "loss": 0.0022, "step": 10623 }, { "epoch": 9.70228310502283, "grad_norm": 0.6488996744155884, "learning_rate": 3.318112633181127e-07, "loss": 0.0032, "step": 10624 }, { "epoch": 9.703196347031964, "grad_norm": 3.263293504714966, "learning_rate": 3.307965499746322e-07, "loss": 0.0119, "step": 10625 }, { "epoch": 9.704109589041096, "grad_norm": 8.72745132446289, "learning_rate": 3.297818366311517e-07, "loss": 0.0357, "step": 10626 }, { "epoch": 9.70502283105023, "grad_norm": 0.00450256559997797, "learning_rate": 3.2876712328767123e-07, "loss": 0.0, "step": 10627 }, { "epoch": 9.70593607305936, "grad_norm": 0.4118576943874359, "learning_rate": 3.277524099441908e-07, "loss": 0.0016, "step": 10628 }, { "epoch": 9.706849315068492, "grad_norm": 36.196739196777344, "learning_rate": 3.2673769660071035e-07, "loss": 0.2043, "step": 10629 }, { "epoch": 9.707762557077626, "grad_norm": 1.99886155128479, "learning_rate": 3.257229832572299e-07, "loss": 0.0014, "step": 10630 }, { "epoch": 9.708675799086759, "grad_norm": 2.2257680892944336, "learning_rate": 3.2470826991374936e-07, "loss": 0.0134, "step": 10631 }, { "epoch": 9.70958904109589, "grad_norm": 7.497061252593994, "learning_rate": 3.236935565702689e-07, "loss": 0.0383, "step": 10632 }, { "epoch": 9.710502283105022, "grad_norm": 0.8917825818061829, "learning_rate": 3.226788432267885e-07, "loss": 0.0054, "step": 10633 }, { "epoch": 9.711415525114155, "grad_norm": 0.4415012001991272, "learning_rate": 3.21664129883308e-07, "loss": 0.0026, "step": 10634 }, { "epoch": 9.712328767123287, "grad_norm": 0.4319545328617096, "learning_rate": 3.2064941653982755e-07, "loss": 0.0019, "step": 10635 }, { "epoch": 9.71324200913242, "grad_norm": 1.1147181987762451, "learning_rate": 3.19634703196347e-07, "loss": 0.0051, "step": 10636 }, { "epoch": 9.714155251141552, "grad_norm": 4.410091876983643, "learning_rate": 3.1861998985286656e-07, "loss": 0.0233, "step": 10637 }, { "epoch": 9.715068493150685, "grad_norm": 1.2448370456695557, "learning_rate": 3.1760527650938615e-07, "loss": 0.0068, "step": 10638 }, { "epoch": 9.715981735159817, "grad_norm": 0.047893572598695755, "learning_rate": 3.165905631659057e-07, "loss": 0.0002, "step": 10639 }, { "epoch": 9.71689497716895, "grad_norm": 0.8786249160766602, "learning_rate": 3.155758498224252e-07, "loss": 0.0052, "step": 10640 }, { "epoch": 9.717808219178082, "grad_norm": 0.7334330081939697, "learning_rate": 3.145611364789447e-07, "loss": 0.003, "step": 10641 }, { "epoch": 9.718721461187215, "grad_norm": 0.2355203926563263, "learning_rate": 3.135464231354642e-07, "loss": 0.0014, "step": 10642 }, { "epoch": 9.719634703196347, "grad_norm": 0.002697082469239831, "learning_rate": 3.125317097919838e-07, "loss": 0.0, "step": 10643 }, { "epoch": 9.72054794520548, "grad_norm": 29.69963836669922, "learning_rate": 3.1151699644850334e-07, "loss": 0.2203, "step": 10644 }, { "epoch": 9.721461187214611, "grad_norm": 0.8849590420722961, "learning_rate": 3.105022831050229e-07, "loss": 0.0052, "step": 10645 }, { "epoch": 9.722374429223745, "grad_norm": 19.320348739624023, "learning_rate": 3.0948756976154236e-07, "loss": 0.0997, "step": 10646 }, { "epoch": 9.723287671232876, "grad_norm": 0.1412627398967743, "learning_rate": 3.0847285641806194e-07, "loss": 0.0009, "step": 10647 }, { "epoch": 9.72420091324201, "grad_norm": 0.19387581944465637, "learning_rate": 3.074581430745814e-07, "loss": 0.0008, "step": 10648 }, { "epoch": 9.725114155251141, "grad_norm": 0.23900113999843597, "learning_rate": 3.06443429731101e-07, "loss": 0.0009, "step": 10649 }, { "epoch": 9.726027397260275, "grad_norm": 0.011887981556355953, "learning_rate": 3.0542871638762054e-07, "loss": 0.0001, "step": 10650 }, { "epoch": 9.726940639269406, "grad_norm": 0.5387457609176636, "learning_rate": 3.0441400304414e-07, "loss": 0.0031, "step": 10651 }, { "epoch": 9.72785388127854, "grad_norm": 7.270473003387451, "learning_rate": 3.033992897006596e-07, "loss": 0.0472, "step": 10652 }, { "epoch": 9.728767123287671, "grad_norm": 3.372631788253784, "learning_rate": 3.023845763571791e-07, "loss": 0.0141, "step": 10653 }, { "epoch": 9.729680365296804, "grad_norm": 0.2739899158477783, "learning_rate": 3.013698630136987e-07, "loss": 0.0009, "step": 10654 }, { "epoch": 9.730593607305936, "grad_norm": 0.04774124175310135, "learning_rate": 3.003551496702182e-07, "loss": 0.0004, "step": 10655 }, { "epoch": 9.731506849315068, "grad_norm": 0.1124042421579361, "learning_rate": 2.993404363267377e-07, "loss": 0.0007, "step": 10656 }, { "epoch": 9.732420091324201, "grad_norm": 5.741641521453857, "learning_rate": 2.983257229832573e-07, "loss": 0.028, "step": 10657 }, { "epoch": 9.733333333333333, "grad_norm": 0.16670581698417664, "learning_rate": 2.9731100963977676e-07, "loss": 0.001, "step": 10658 }, { "epoch": 9.734246575342466, "grad_norm": 0.2908904254436493, "learning_rate": 2.9629629629629634e-07, "loss": 0.0025, "step": 10659 }, { "epoch": 9.735159817351597, "grad_norm": 0.41129618883132935, "learning_rate": 2.952815829528159e-07, "loss": 0.0019, "step": 10660 }, { "epoch": 9.73607305936073, "grad_norm": 0.0359664112329483, "learning_rate": 2.9426686960933536e-07, "loss": 0.0002, "step": 10661 }, { "epoch": 9.736986301369862, "grad_norm": 0.10171351581811905, "learning_rate": 2.9325215626585494e-07, "loss": 0.0006, "step": 10662 }, { "epoch": 9.737899543378996, "grad_norm": 0.2469593733549118, "learning_rate": 2.922374429223744e-07, "loss": 0.0012, "step": 10663 }, { "epoch": 9.738812785388127, "grad_norm": 0.22819307446479797, "learning_rate": 2.91222729578894e-07, "loss": 0.001, "step": 10664 }, { "epoch": 9.73972602739726, "grad_norm": 0.5989862680435181, "learning_rate": 2.9020801623541354e-07, "loss": 0.0024, "step": 10665 }, { "epoch": 9.740639269406392, "grad_norm": 0.029059525579214096, "learning_rate": 2.89193302891933e-07, "loss": 0.0001, "step": 10666 }, { "epoch": 9.741552511415525, "grad_norm": 0.44685402512550354, "learning_rate": 2.881785895484526e-07, "loss": 0.0024, "step": 10667 }, { "epoch": 9.742465753424657, "grad_norm": 0.10753301531076431, "learning_rate": 2.871638762049721e-07, "loss": 0.0006, "step": 10668 }, { "epoch": 9.74337899543379, "grad_norm": 0.20386040210723877, "learning_rate": 2.861491628614917e-07, "loss": 0.001, "step": 10669 }, { "epoch": 9.744292237442922, "grad_norm": 6.003828525543213, "learning_rate": 2.851344495180112e-07, "loss": 0.0253, "step": 10670 }, { "epoch": 9.745205479452055, "grad_norm": 0.11064280569553375, "learning_rate": 2.841197361745307e-07, "loss": 0.0005, "step": 10671 }, { "epoch": 9.746118721461187, "grad_norm": 0.36990705132484436, "learning_rate": 2.831050228310503e-07, "loss": 0.0022, "step": 10672 }, { "epoch": 9.74703196347032, "grad_norm": 4.629826068878174, "learning_rate": 2.8209030948756976e-07, "loss": 0.0257, "step": 10673 }, { "epoch": 9.747945205479452, "grad_norm": 0.6786235570907593, "learning_rate": 2.8107559614408934e-07, "loss": 0.0043, "step": 10674 }, { "epoch": 9.748858447488585, "grad_norm": 2.966020107269287, "learning_rate": 2.800608828006089e-07, "loss": 0.0129, "step": 10675 }, { "epoch": 9.749771689497717, "grad_norm": 4.246496677398682, "learning_rate": 2.7904616945712836e-07, "loss": 0.0133, "step": 10676 }, { "epoch": 9.75068493150685, "grad_norm": 3.61137318611145, "learning_rate": 2.7803145611364794e-07, "loss": 0.0287, "step": 10677 }, { "epoch": 9.751598173515982, "grad_norm": 0.6952420473098755, "learning_rate": 2.770167427701674e-07, "loss": 0.0032, "step": 10678 }, { "epoch": 9.752511415525115, "grad_norm": 0.05837055668234825, "learning_rate": 2.76002029426687e-07, "loss": 0.0003, "step": 10679 }, { "epoch": 9.753424657534246, "grad_norm": 2.2214128971099854, "learning_rate": 2.749873160832065e-07, "loss": 0.0036, "step": 10680 }, { "epoch": 9.75433789954338, "grad_norm": 2.097791910171509, "learning_rate": 2.73972602739726e-07, "loss": 0.0114, "step": 10681 }, { "epoch": 9.755251141552511, "grad_norm": 0.21917808055877686, "learning_rate": 2.729578893962456e-07, "loss": 0.0013, "step": 10682 }, { "epoch": 9.756164383561643, "grad_norm": 0.07014655321836472, "learning_rate": 2.719431760527651e-07, "loss": 0.0005, "step": 10683 }, { "epoch": 9.757077625570776, "grad_norm": 1.7536795139312744, "learning_rate": 2.709284627092847e-07, "loss": 0.0077, "step": 10684 }, { "epoch": 9.757990867579908, "grad_norm": 0.4595399498939514, "learning_rate": 2.6991374936580416e-07, "loss": 0.0017, "step": 10685 }, { "epoch": 9.758904109589041, "grad_norm": 0.16954965889453888, "learning_rate": 2.688990360223237e-07, "loss": 0.0008, "step": 10686 }, { "epoch": 9.759817351598173, "grad_norm": 4.876499176025391, "learning_rate": 2.678843226788433e-07, "loss": 0.0207, "step": 10687 }, { "epoch": 9.760730593607306, "grad_norm": 0.8515797257423401, "learning_rate": 2.6686960933536276e-07, "loss": 0.0061, "step": 10688 }, { "epoch": 9.761643835616438, "grad_norm": 1.6792666912078857, "learning_rate": 2.6585489599188234e-07, "loss": 0.0081, "step": 10689 }, { "epoch": 9.762557077625571, "grad_norm": 0.3476768434047699, "learning_rate": 2.648401826484018e-07, "loss": 0.0015, "step": 10690 }, { "epoch": 9.763470319634703, "grad_norm": 33.79060745239258, "learning_rate": 2.6382546930492135e-07, "loss": 0.1283, "step": 10691 }, { "epoch": 9.764383561643836, "grad_norm": 0.05546613410115242, "learning_rate": 2.6281075596144094e-07, "loss": 0.0003, "step": 10692 }, { "epoch": 9.765296803652967, "grad_norm": 0.2324116826057434, "learning_rate": 2.617960426179604e-07, "loss": 0.0006, "step": 10693 }, { "epoch": 9.7662100456621, "grad_norm": 0.8154028058052063, "learning_rate": 2.6078132927448e-07, "loss": 0.0042, "step": 10694 }, { "epoch": 9.767123287671232, "grad_norm": 0.7605946660041809, "learning_rate": 2.597666159309995e-07, "loss": 0.0045, "step": 10695 }, { "epoch": 9.768036529680366, "grad_norm": 12.328700065612793, "learning_rate": 2.58751902587519e-07, "loss": 0.0996, "step": 10696 }, { "epoch": 9.768949771689497, "grad_norm": 1.071829080581665, "learning_rate": 2.577371892440386e-07, "loss": 0.0075, "step": 10697 }, { "epoch": 9.76986301369863, "grad_norm": 0.1567387580871582, "learning_rate": 2.567224759005581e-07, "loss": 0.0006, "step": 10698 }, { "epoch": 9.770776255707762, "grad_norm": 0.4110407829284668, "learning_rate": 2.557077625570777e-07, "loss": 0.0026, "step": 10699 }, { "epoch": 9.771689497716896, "grad_norm": 0.723908543586731, "learning_rate": 2.5469304921359715e-07, "loss": 0.0035, "step": 10700 }, { "epoch": 9.772602739726027, "grad_norm": 0.35582923889160156, "learning_rate": 2.536783358701167e-07, "loss": 0.0024, "step": 10701 }, { "epoch": 9.77351598173516, "grad_norm": 3.668079137802124, "learning_rate": 2.5266362252663627e-07, "loss": 0.0212, "step": 10702 }, { "epoch": 9.774429223744292, "grad_norm": 0.41424494981765747, "learning_rate": 2.5164890918315575e-07, "loss": 0.0019, "step": 10703 }, { "epoch": 9.775342465753425, "grad_norm": 0.5245662331581116, "learning_rate": 2.5063419583967534e-07, "loss": 0.0035, "step": 10704 }, { "epoch": 9.776255707762557, "grad_norm": 0.16103582084178925, "learning_rate": 2.496194824961948e-07, "loss": 0.0008, "step": 10705 }, { "epoch": 9.77716894977169, "grad_norm": 0.23217807710170746, "learning_rate": 2.4860476915271435e-07, "loss": 0.0015, "step": 10706 }, { "epoch": 9.778082191780822, "grad_norm": 46.90675354003906, "learning_rate": 2.4759005580923394e-07, "loss": 0.3132, "step": 10707 }, { "epoch": 9.778995433789955, "grad_norm": 1.6451737880706787, "learning_rate": 2.465753424657534e-07, "loss": 0.0111, "step": 10708 }, { "epoch": 9.779908675799087, "grad_norm": 1.0650792121887207, "learning_rate": 2.45560629122273e-07, "loss": 0.0061, "step": 10709 }, { "epoch": 9.780821917808218, "grad_norm": 0.01074875146150589, "learning_rate": 2.445459157787925e-07, "loss": 0.0001, "step": 10710 }, { "epoch": 9.781735159817352, "grad_norm": 5.732211112976074, "learning_rate": 2.43531202435312e-07, "loss": 0.0264, "step": 10711 }, { "epoch": 9.782648401826483, "grad_norm": 0.1474676877260208, "learning_rate": 2.425164890918316e-07, "loss": 0.0008, "step": 10712 }, { "epoch": 9.783561643835617, "grad_norm": 2.841961622238159, "learning_rate": 2.415017757483511e-07, "loss": 0.0126, "step": 10713 }, { "epoch": 9.784474885844748, "grad_norm": 0.006212103180587292, "learning_rate": 2.4048706240487067e-07, "loss": 0.0, "step": 10714 }, { "epoch": 9.785388127853881, "grad_norm": 1.2180026769638062, "learning_rate": 2.3947234906139015e-07, "loss": 0.0052, "step": 10715 }, { "epoch": 9.786301369863013, "grad_norm": 0.09150490164756775, "learning_rate": 2.384576357179097e-07, "loss": 0.0003, "step": 10716 }, { "epoch": 9.787214611872146, "grad_norm": 0.10000083595514297, "learning_rate": 2.3744292237442925e-07, "loss": 0.0005, "step": 10717 }, { "epoch": 9.788127853881278, "grad_norm": 1.346853256225586, "learning_rate": 2.3642820903094878e-07, "loss": 0.0106, "step": 10718 }, { "epoch": 9.789041095890411, "grad_norm": 0.4174039363861084, "learning_rate": 2.354134956874683e-07, "loss": 0.0021, "step": 10719 }, { "epoch": 9.789954337899543, "grad_norm": 0.043866805732250214, "learning_rate": 2.3439878234398785e-07, "loss": 0.0002, "step": 10720 }, { "epoch": 9.790867579908676, "grad_norm": 1.3304355144500732, "learning_rate": 2.3338406900050735e-07, "loss": 0.0089, "step": 10721 }, { "epoch": 9.791780821917808, "grad_norm": 0.16209523379802704, "learning_rate": 2.323693556570269e-07, "loss": 0.001, "step": 10722 }, { "epoch": 9.792694063926941, "grad_norm": 0.03461967036128044, "learning_rate": 2.3135464231354645e-07, "loss": 0.0002, "step": 10723 }, { "epoch": 9.793607305936073, "grad_norm": 20.174509048461914, "learning_rate": 2.3033992897006598e-07, "loss": 0.0968, "step": 10724 }, { "epoch": 9.794520547945206, "grad_norm": 0.11509159952402115, "learning_rate": 2.293252156265855e-07, "loss": 0.0006, "step": 10725 }, { "epoch": 9.795433789954338, "grad_norm": 0.22954173386096954, "learning_rate": 2.2831050228310502e-07, "loss": 0.0016, "step": 10726 }, { "epoch": 9.796347031963471, "grad_norm": 1.256043553352356, "learning_rate": 2.2729578893962458e-07, "loss": 0.0093, "step": 10727 }, { "epoch": 9.797260273972602, "grad_norm": 1.5217626094818115, "learning_rate": 2.262810755961441e-07, "loss": 0.0075, "step": 10728 }, { "epoch": 9.798173515981736, "grad_norm": 3.6315505504608154, "learning_rate": 2.2526636225266364e-07, "loss": 0.0165, "step": 10729 }, { "epoch": 9.799086757990867, "grad_norm": 3.451401948928833, "learning_rate": 2.2425164890918318e-07, "loss": 0.0096, "step": 10730 }, { "epoch": 9.8, "grad_norm": 53.284297943115234, "learning_rate": 2.2323693556570268e-07, "loss": 0.5708, "step": 10731 }, { "epoch": 9.800913242009132, "grad_norm": 0.5066766738891602, "learning_rate": 2.2222222222222224e-07, "loss": 0.0019, "step": 10732 }, { "epoch": 9.801826484018266, "grad_norm": 4.5684685707092285, "learning_rate": 2.2120750887874178e-07, "loss": 0.0311, "step": 10733 }, { "epoch": 9.802739726027397, "grad_norm": 0.771625280380249, "learning_rate": 2.201927955352613e-07, "loss": 0.0031, "step": 10734 }, { "epoch": 9.80365296803653, "grad_norm": 2.4547979831695557, "learning_rate": 2.1917808219178084e-07, "loss": 0.016, "step": 10735 }, { "epoch": 9.804566210045662, "grad_norm": 0.3014325201511383, "learning_rate": 2.1816336884830035e-07, "loss": 0.0018, "step": 10736 }, { "epoch": 9.805479452054794, "grad_norm": 0.06966390460729599, "learning_rate": 2.171486555048199e-07, "loss": 0.0003, "step": 10737 }, { "epoch": 9.806392694063927, "grad_norm": 1.9007269144058228, "learning_rate": 2.1613394216133944e-07, "loss": 0.0124, "step": 10738 }, { "epoch": 9.807305936073059, "grad_norm": 0.33160221576690674, "learning_rate": 2.1511922881785898e-07, "loss": 0.0018, "step": 10739 }, { "epoch": 9.808219178082192, "grad_norm": 0.3331991732120514, "learning_rate": 2.141045154743785e-07, "loss": 0.0022, "step": 10740 }, { "epoch": 9.809132420091323, "grad_norm": 0.05923198163509369, "learning_rate": 2.1308980213089802e-07, "loss": 0.0003, "step": 10741 }, { "epoch": 9.810045662100457, "grad_norm": 1.5133708715438843, "learning_rate": 2.1207508878741758e-07, "loss": 0.0054, "step": 10742 }, { "epoch": 9.810958904109588, "grad_norm": 0.2359091192483902, "learning_rate": 2.110603754439371e-07, "loss": 0.0009, "step": 10743 }, { "epoch": 9.811872146118722, "grad_norm": 1.532950520515442, "learning_rate": 2.1004566210045664e-07, "loss": 0.0074, "step": 10744 }, { "epoch": 9.812785388127853, "grad_norm": 0.18100298941135406, "learning_rate": 2.0903094875697618e-07, "loss": 0.0007, "step": 10745 }, { "epoch": 9.813698630136987, "grad_norm": 0.31961458921432495, "learning_rate": 2.0801623541349568e-07, "loss": 0.0018, "step": 10746 }, { "epoch": 9.814611872146118, "grad_norm": 13.076247215270996, "learning_rate": 2.0700152207001524e-07, "loss": 0.0881, "step": 10747 }, { "epoch": 9.815525114155252, "grad_norm": 0.10896513611078262, "learning_rate": 2.0598680872653478e-07, "loss": 0.0007, "step": 10748 }, { "epoch": 9.816438356164383, "grad_norm": 4.8846940994262695, "learning_rate": 2.049720953830543e-07, "loss": 0.0153, "step": 10749 }, { "epoch": 9.817351598173516, "grad_norm": 0.016900276765227318, "learning_rate": 2.0395738203957384e-07, "loss": 0.0001, "step": 10750 }, { "epoch": 9.818264840182648, "grad_norm": 0.3008733093738556, "learning_rate": 2.0294266869609335e-07, "loss": 0.0013, "step": 10751 }, { "epoch": 9.819178082191781, "grad_norm": 0.13198347389698029, "learning_rate": 2.019279553526129e-07, "loss": 0.001, "step": 10752 }, { "epoch": 9.820091324200913, "grad_norm": 0.0019649916794151068, "learning_rate": 2.0091324200913244e-07, "loss": 0.0, "step": 10753 }, { "epoch": 9.821004566210046, "grad_norm": 0.9569011926651001, "learning_rate": 1.9989852866565198e-07, "loss": 0.0023, "step": 10754 }, { "epoch": 9.821917808219178, "grad_norm": 0.025311928242444992, "learning_rate": 1.988838153221715e-07, "loss": 0.0002, "step": 10755 }, { "epoch": 9.822831050228311, "grad_norm": 0.5614504218101501, "learning_rate": 1.9786910197869102e-07, "loss": 0.0032, "step": 10756 }, { "epoch": 9.823744292237443, "grad_norm": 0.0019572668243199587, "learning_rate": 1.9685438863521058e-07, "loss": 0.0, "step": 10757 }, { "epoch": 9.824657534246576, "grad_norm": 128.91673278808594, "learning_rate": 1.958396752917301e-07, "loss": 2.248, "step": 10758 }, { "epoch": 9.825570776255708, "grad_norm": 0.14242756366729736, "learning_rate": 1.9482496194824964e-07, "loss": 0.0007, "step": 10759 }, { "epoch": 9.826484018264841, "grad_norm": 1.1720141172409058, "learning_rate": 1.9381024860476918e-07, "loss": 0.0061, "step": 10760 }, { "epoch": 9.827397260273973, "grad_norm": 5.028863906860352, "learning_rate": 1.9279553526128868e-07, "loss": 0.0347, "step": 10761 }, { "epoch": 9.828310502283106, "grad_norm": 0.09718964993953705, "learning_rate": 1.9178082191780824e-07, "loss": 0.0007, "step": 10762 }, { "epoch": 9.829223744292237, "grad_norm": 28.65207862854004, "learning_rate": 1.9076610857432778e-07, "loss": 0.076, "step": 10763 }, { "epoch": 9.830136986301369, "grad_norm": 4.329260349273682, "learning_rate": 1.897513952308473e-07, "loss": 0.0137, "step": 10764 }, { "epoch": 9.831050228310502, "grad_norm": 0.473430335521698, "learning_rate": 1.8873668188736684e-07, "loss": 0.0021, "step": 10765 }, { "epoch": 9.831963470319634, "grad_norm": 0.20850031077861786, "learning_rate": 1.8772196854388635e-07, "loss": 0.0009, "step": 10766 }, { "epoch": 9.832876712328767, "grad_norm": 0.09202493727207184, "learning_rate": 1.867072552004059e-07, "loss": 0.0004, "step": 10767 }, { "epoch": 9.833789954337899, "grad_norm": 6.92636251449585, "learning_rate": 1.8569254185692544e-07, "loss": 0.0328, "step": 10768 }, { "epoch": 9.834703196347032, "grad_norm": 0.9603524804115295, "learning_rate": 1.8467782851344495e-07, "loss": 0.0062, "step": 10769 }, { "epoch": 9.835616438356164, "grad_norm": 0.11104227602481842, "learning_rate": 1.836631151699645e-07, "loss": 0.0006, "step": 10770 }, { "epoch": 9.836529680365297, "grad_norm": 20.398759841918945, "learning_rate": 1.8264840182648401e-07, "loss": 0.0817, "step": 10771 }, { "epoch": 9.837442922374429, "grad_norm": 2.598937511444092, "learning_rate": 1.8163368848300357e-07, "loss": 0.0133, "step": 10772 }, { "epoch": 9.838356164383562, "grad_norm": 1.0736331939697266, "learning_rate": 1.806189751395231e-07, "loss": 0.0042, "step": 10773 }, { "epoch": 9.839269406392694, "grad_norm": 0.14864017069339752, "learning_rate": 1.7960426179604261e-07, "loss": 0.0007, "step": 10774 }, { "epoch": 9.840182648401827, "grad_norm": 0.022589026018977165, "learning_rate": 1.7858954845256217e-07, "loss": 0.0001, "step": 10775 }, { "epoch": 9.841095890410958, "grad_norm": 0.09217992424964905, "learning_rate": 1.7757483510908168e-07, "loss": 0.0007, "step": 10776 }, { "epoch": 9.842009132420092, "grad_norm": 0.22001603245735168, "learning_rate": 1.7656012176560124e-07, "loss": 0.0013, "step": 10777 }, { "epoch": 9.842922374429223, "grad_norm": 0.24658240377902985, "learning_rate": 1.7554540842212077e-07, "loss": 0.0018, "step": 10778 }, { "epoch": 9.843835616438357, "grad_norm": 2.7396769523620605, "learning_rate": 1.7453069507864028e-07, "loss": 0.0102, "step": 10779 }, { "epoch": 9.844748858447488, "grad_norm": 1.7930890321731567, "learning_rate": 1.7351598173515984e-07, "loss": 0.0101, "step": 10780 }, { "epoch": 9.845662100456622, "grad_norm": 52.034358978271484, "learning_rate": 1.7250126839167935e-07, "loss": 0.2538, "step": 10781 }, { "epoch": 9.846575342465753, "grad_norm": 0.10084579139947891, "learning_rate": 1.714865550481989e-07, "loss": 0.0004, "step": 10782 }, { "epoch": 9.847488584474887, "grad_norm": 0.7589641213417053, "learning_rate": 1.7047184170471844e-07, "loss": 0.003, "step": 10783 }, { "epoch": 9.848401826484018, "grad_norm": 4.089865684509277, "learning_rate": 1.6945712836123795e-07, "loss": 0.023, "step": 10784 }, { "epoch": 9.849315068493151, "grad_norm": 0.0007738308049738407, "learning_rate": 1.684424150177575e-07, "loss": 0.0, "step": 10785 }, { "epoch": 9.850228310502283, "grad_norm": 0.7399449944496155, "learning_rate": 1.6742770167427701e-07, "loss": 0.0052, "step": 10786 }, { "epoch": 9.851141552511416, "grad_norm": 2.604933738708496, "learning_rate": 1.6641298833079657e-07, "loss": 0.0095, "step": 10787 }, { "epoch": 9.852054794520548, "grad_norm": 0.04459505155682564, "learning_rate": 1.653982749873161e-07, "loss": 0.0003, "step": 10788 }, { "epoch": 9.852968036529681, "grad_norm": 0.6419627070426941, "learning_rate": 1.6438356164383561e-07, "loss": 0.0032, "step": 10789 }, { "epoch": 9.853881278538813, "grad_norm": 18.80365562438965, "learning_rate": 1.6336884830035517e-07, "loss": 0.0987, "step": 10790 }, { "epoch": 9.854794520547944, "grad_norm": 0.09715565294027328, "learning_rate": 1.6235413495687468e-07, "loss": 0.0006, "step": 10791 }, { "epoch": 9.855707762557078, "grad_norm": 0.02254725620150566, "learning_rate": 1.6133942161339424e-07, "loss": 0.0002, "step": 10792 }, { "epoch": 9.85662100456621, "grad_norm": 0.2889353036880493, "learning_rate": 1.6032470826991377e-07, "loss": 0.0014, "step": 10793 }, { "epoch": 9.857534246575343, "grad_norm": 5.216639995574951, "learning_rate": 1.5930999492643328e-07, "loss": 0.033, "step": 10794 }, { "epoch": 9.858447488584474, "grad_norm": 16.424819946289062, "learning_rate": 1.5829528158295284e-07, "loss": 0.055, "step": 10795 }, { "epoch": 9.859360730593608, "grad_norm": 107.74028015136719, "learning_rate": 1.5728056823947235e-07, "loss": 0.7321, "step": 10796 }, { "epoch": 9.860273972602739, "grad_norm": 1.219688057899475, "learning_rate": 1.562658548959919e-07, "loss": 0.0069, "step": 10797 }, { "epoch": 9.861187214611872, "grad_norm": 0.8462106585502625, "learning_rate": 1.5525114155251144e-07, "loss": 0.0041, "step": 10798 }, { "epoch": 9.862100456621004, "grad_norm": 0.5319063067436218, "learning_rate": 1.5423642820903097e-07, "loss": 0.0027, "step": 10799 }, { "epoch": 9.863013698630137, "grad_norm": 74.40694427490234, "learning_rate": 1.532217148655505e-07, "loss": 0.4295, "step": 10800 }, { "epoch": 9.863926940639269, "grad_norm": 0.08256618678569794, "learning_rate": 1.5220700152207e-07, "loss": 0.0005, "step": 10801 }, { "epoch": 9.864840182648402, "grad_norm": 0.2280692756175995, "learning_rate": 1.5119228817858955e-07, "loss": 0.0014, "step": 10802 }, { "epoch": 9.865753424657534, "grad_norm": 0.24253323674201965, "learning_rate": 1.501775748351091e-07, "loss": 0.0016, "step": 10803 }, { "epoch": 9.866666666666667, "grad_norm": 0.1758652776479721, "learning_rate": 1.4916286149162864e-07, "loss": 0.0006, "step": 10804 }, { "epoch": 9.867579908675799, "grad_norm": 0.029157036915421486, "learning_rate": 1.4814814814814817e-07, "loss": 0.0002, "step": 10805 }, { "epoch": 9.868493150684932, "grad_norm": 41.64725875854492, "learning_rate": 1.4713343480466768e-07, "loss": 0.3386, "step": 10806 }, { "epoch": 9.869406392694064, "grad_norm": 0.7160253524780273, "learning_rate": 1.461187214611872e-07, "loss": 0.004, "step": 10807 }, { "epoch": 9.870319634703197, "grad_norm": 11.78588581085205, "learning_rate": 1.4510400811770677e-07, "loss": 0.0653, "step": 10808 }, { "epoch": 9.871232876712329, "grad_norm": 0.03206254914402962, "learning_rate": 1.440892947742263e-07, "loss": 0.0001, "step": 10809 }, { "epoch": 9.872146118721462, "grad_norm": 0.010119383223354816, "learning_rate": 1.4307458143074584e-07, "loss": 0.0001, "step": 10810 }, { "epoch": 9.873059360730593, "grad_norm": 1.0345498323440552, "learning_rate": 1.4205986808726534e-07, "loss": 0.0063, "step": 10811 }, { "epoch": 9.873972602739727, "grad_norm": 0.41448071599006653, "learning_rate": 1.4104515474378488e-07, "loss": 0.0022, "step": 10812 }, { "epoch": 9.874885844748858, "grad_norm": 0.010352805256843567, "learning_rate": 1.4003044140030444e-07, "loss": 0.0001, "step": 10813 }, { "epoch": 9.875799086757992, "grad_norm": 0.606767475605011, "learning_rate": 1.3901572805682397e-07, "loss": 0.0027, "step": 10814 }, { "epoch": 9.876712328767123, "grad_norm": 0.37426239252090454, "learning_rate": 1.380010147133435e-07, "loss": 0.0018, "step": 10815 }, { "epoch": 9.877625570776257, "grad_norm": 0.24530582129955292, "learning_rate": 1.36986301369863e-07, "loss": 0.0008, "step": 10816 }, { "epoch": 9.878538812785388, "grad_norm": 0.04484809190034866, "learning_rate": 1.3597158802638254e-07, "loss": 0.0002, "step": 10817 }, { "epoch": 9.87945205479452, "grad_norm": 0.33681315183639526, "learning_rate": 1.3495687468290208e-07, "loss": 0.0013, "step": 10818 }, { "epoch": 9.880365296803653, "grad_norm": 17.335840225219727, "learning_rate": 1.3394216133942164e-07, "loss": 0.1023, "step": 10819 }, { "epoch": 9.881278538812785, "grad_norm": 0.18975305557250977, "learning_rate": 1.3292744799594117e-07, "loss": 0.0011, "step": 10820 }, { "epoch": 9.882191780821918, "grad_norm": 2.034639358520508, "learning_rate": 1.3191273465246068e-07, "loss": 0.0097, "step": 10821 }, { "epoch": 9.88310502283105, "grad_norm": 4.275416374206543, "learning_rate": 1.308980213089802e-07, "loss": 0.0274, "step": 10822 }, { "epoch": 9.884018264840183, "grad_norm": 0.17130409181118011, "learning_rate": 1.2988330796549974e-07, "loss": 0.0011, "step": 10823 }, { "epoch": 9.884931506849314, "grad_norm": 5.198991775512695, "learning_rate": 1.288685946220193e-07, "loss": 0.0278, "step": 10824 }, { "epoch": 9.885844748858448, "grad_norm": 0.1228628158569336, "learning_rate": 1.2785388127853884e-07, "loss": 0.0008, "step": 10825 }, { "epoch": 9.88675799086758, "grad_norm": 2.058152198791504, "learning_rate": 1.2683916793505834e-07, "loss": 0.0113, "step": 10826 }, { "epoch": 9.887671232876713, "grad_norm": 0.04874560609459877, "learning_rate": 1.2582445459157788e-07, "loss": 0.0002, "step": 10827 }, { "epoch": 9.888584474885844, "grad_norm": 0.042217448353767395, "learning_rate": 1.248097412480974e-07, "loss": 0.0002, "step": 10828 }, { "epoch": 9.889497716894978, "grad_norm": 3.286283254623413, "learning_rate": 1.2379502790461697e-07, "loss": 0.0157, "step": 10829 }, { "epoch": 9.89041095890411, "grad_norm": 1.4099513292312622, "learning_rate": 1.227803145611365e-07, "loss": 0.0079, "step": 10830 }, { "epoch": 9.891324200913242, "grad_norm": 0.12140121310949326, "learning_rate": 1.21765601217656e-07, "loss": 0.001, "step": 10831 }, { "epoch": 9.892237442922374, "grad_norm": 1.3207011222839355, "learning_rate": 1.2075088787417554e-07, "loss": 0.0115, "step": 10832 }, { "epoch": 9.893150684931507, "grad_norm": 0.5746867060661316, "learning_rate": 1.1973617453069508e-07, "loss": 0.0032, "step": 10833 }, { "epoch": 9.894063926940639, "grad_norm": 0.46525511145591736, "learning_rate": 1.1872146118721462e-07, "loss": 0.0025, "step": 10834 }, { "epoch": 9.894977168949772, "grad_norm": 5.862507343292236, "learning_rate": 1.1770674784373416e-07, "loss": 0.0386, "step": 10835 }, { "epoch": 9.895890410958904, "grad_norm": 5.147741794586182, "learning_rate": 1.1669203450025368e-07, "loss": 0.0272, "step": 10836 }, { "epoch": 9.896803652968037, "grad_norm": 1.1053335666656494, "learning_rate": 1.1567732115677322e-07, "loss": 0.0034, "step": 10837 }, { "epoch": 9.897716894977169, "grad_norm": 0.2054731249809265, "learning_rate": 1.1466260781329276e-07, "loss": 0.0011, "step": 10838 }, { "epoch": 9.898630136986302, "grad_norm": 0.15424305200576782, "learning_rate": 1.1364789446981229e-07, "loss": 0.0009, "step": 10839 }, { "epoch": 9.899543378995434, "grad_norm": 0.8419280052185059, "learning_rate": 1.1263318112633182e-07, "loss": 0.004, "step": 10840 }, { "epoch": 9.900456621004567, "grad_norm": 0.027864983305335045, "learning_rate": 1.1161846778285134e-07, "loss": 0.0001, "step": 10841 }, { "epoch": 9.901369863013699, "grad_norm": 0.9174602627754211, "learning_rate": 1.1060375443937089e-07, "loss": 0.0054, "step": 10842 }, { "epoch": 9.902283105022832, "grad_norm": 0.0517578199505806, "learning_rate": 1.0958904109589042e-07, "loss": 0.0003, "step": 10843 }, { "epoch": 9.903196347031963, "grad_norm": 0.5177721381187439, "learning_rate": 1.0857432775240996e-07, "loss": 0.0028, "step": 10844 }, { "epoch": 9.904109589041095, "grad_norm": 17.385684967041016, "learning_rate": 1.0755961440892949e-07, "loss": 0.0971, "step": 10845 }, { "epoch": 9.905022831050228, "grad_norm": 6.521652698516846, "learning_rate": 1.0654490106544901e-07, "loss": 0.0409, "step": 10846 }, { "epoch": 9.90593607305936, "grad_norm": 3.8196520805358887, "learning_rate": 1.0553018772196856e-07, "loss": 0.0234, "step": 10847 }, { "epoch": 9.906849315068493, "grad_norm": 0.6666029691696167, "learning_rate": 1.0451547437848809e-07, "loss": 0.0032, "step": 10848 }, { "epoch": 9.907762557077625, "grad_norm": 5.621994972229004, "learning_rate": 1.0350076103500762e-07, "loss": 0.0373, "step": 10849 }, { "epoch": 9.908675799086758, "grad_norm": 0.04121699556708336, "learning_rate": 1.0248604769152715e-07, "loss": 0.0002, "step": 10850 }, { "epoch": 9.90958904109589, "grad_norm": 0.08802593499422073, "learning_rate": 1.0147133434804667e-07, "loss": 0.0005, "step": 10851 }, { "epoch": 9.910502283105023, "grad_norm": 3.563390016555786, "learning_rate": 1.0045662100456622e-07, "loss": 0.0242, "step": 10852 }, { "epoch": 9.911415525114155, "grad_norm": 0.011217921040952206, "learning_rate": 9.944190766108575e-08, "loss": 0.0, "step": 10853 }, { "epoch": 9.912328767123288, "grad_norm": 0.028244255110621452, "learning_rate": 9.842719431760529e-08, "loss": 0.0002, "step": 10854 }, { "epoch": 9.91324200913242, "grad_norm": 5.226486682891846, "learning_rate": 9.741248097412482e-08, "loss": 0.0312, "step": 10855 }, { "epoch": 9.914155251141553, "grad_norm": 0.8695147037506104, "learning_rate": 9.639776763064434e-08, "loss": 0.007, "step": 10856 }, { "epoch": 9.915068493150685, "grad_norm": 4.459662437438965, "learning_rate": 9.538305428716389e-08, "loss": 0.0324, "step": 10857 }, { "epoch": 9.915981735159818, "grad_norm": 2.8077545166015625, "learning_rate": 9.436834094368342e-08, "loss": 0.0141, "step": 10858 }, { "epoch": 9.91689497716895, "grad_norm": 0.5789884328842163, "learning_rate": 9.335362760020295e-08, "loss": 0.0023, "step": 10859 }, { "epoch": 9.917808219178083, "grad_norm": 0.11626394093036652, "learning_rate": 9.233891425672247e-08, "loss": 0.0005, "step": 10860 }, { "epoch": 9.918721461187214, "grad_norm": 0.7677363157272339, "learning_rate": 9.132420091324201e-08, "loss": 0.0029, "step": 10861 }, { "epoch": 9.919634703196348, "grad_norm": 0.7886059880256653, "learning_rate": 9.030948756976155e-08, "loss": 0.0046, "step": 10862 }, { "epoch": 9.92054794520548, "grad_norm": 0.4431905150413513, "learning_rate": 8.929477422628109e-08, "loss": 0.0025, "step": 10863 }, { "epoch": 9.921461187214613, "grad_norm": 0.040784455835819244, "learning_rate": 8.828006088280062e-08, "loss": 0.0002, "step": 10864 }, { "epoch": 9.922374429223744, "grad_norm": 0.42725440859794617, "learning_rate": 8.726534753932014e-08, "loss": 0.0022, "step": 10865 }, { "epoch": 9.923287671232877, "grad_norm": 0.27632254362106323, "learning_rate": 8.625063419583967e-08, "loss": 0.0021, "step": 10866 }, { "epoch": 9.924200913242009, "grad_norm": 3.4444167613983154, "learning_rate": 8.523592085235922e-08, "loss": 0.0077, "step": 10867 }, { "epoch": 9.925114155251142, "grad_norm": 0.02786806784570217, "learning_rate": 8.422120750887875e-08, "loss": 0.0001, "step": 10868 }, { "epoch": 9.926027397260274, "grad_norm": 0.09418117254972458, "learning_rate": 8.320649416539829e-08, "loss": 0.0005, "step": 10869 }, { "epoch": 9.926940639269407, "grad_norm": 0.47530144453048706, "learning_rate": 8.219178082191781e-08, "loss": 0.0029, "step": 10870 }, { "epoch": 9.927853881278539, "grad_norm": 0.8458877801895142, "learning_rate": 8.117706747843734e-08, "loss": 0.0038, "step": 10871 }, { "epoch": 9.92876712328767, "grad_norm": 4.84132194519043, "learning_rate": 8.016235413495689e-08, "loss": 0.027, "step": 10872 }, { "epoch": 9.929680365296804, "grad_norm": 0.02853993885219097, "learning_rate": 7.914764079147642e-08, "loss": 0.0002, "step": 10873 }, { "epoch": 9.930593607305935, "grad_norm": 0.0034809086937457323, "learning_rate": 7.813292744799595e-08, "loss": 0.0, "step": 10874 }, { "epoch": 9.931506849315069, "grad_norm": 0.05723405256867409, "learning_rate": 7.711821410451549e-08, "loss": 0.0003, "step": 10875 }, { "epoch": 9.9324200913242, "grad_norm": 1.5156595706939697, "learning_rate": 7.6103500761035e-08, "loss": 0.012, "step": 10876 }, { "epoch": 9.933333333333334, "grad_norm": 0.27138882875442505, "learning_rate": 7.508878741755455e-08, "loss": 0.0014, "step": 10877 }, { "epoch": 9.934246575342465, "grad_norm": 35.412960052490234, "learning_rate": 7.407407407407409e-08, "loss": 0.5207, "step": 10878 }, { "epoch": 9.935159817351598, "grad_norm": 1.4494507312774658, "learning_rate": 7.30593607305936e-08, "loss": 0.0066, "step": 10879 }, { "epoch": 9.93607305936073, "grad_norm": 0.2974908947944641, "learning_rate": 7.204464738711315e-08, "loss": 0.0014, "step": 10880 }, { "epoch": 9.936986301369863, "grad_norm": 0.32772475481033325, "learning_rate": 7.102993404363267e-08, "loss": 0.0012, "step": 10881 }, { "epoch": 9.937899543378995, "grad_norm": 0.057385627180337906, "learning_rate": 7.001522070015222e-08, "loss": 0.0002, "step": 10882 }, { "epoch": 9.938812785388128, "grad_norm": 1.5838584899902344, "learning_rate": 6.900050735667175e-08, "loss": 0.0051, "step": 10883 }, { "epoch": 9.93972602739726, "grad_norm": 0.3043552339076996, "learning_rate": 6.798579401319127e-08, "loss": 0.002, "step": 10884 }, { "epoch": 9.940639269406393, "grad_norm": 3.2632994651794434, "learning_rate": 6.697108066971082e-08, "loss": 0.013, "step": 10885 }, { "epoch": 9.941552511415525, "grad_norm": 2.465306043624878, "learning_rate": 6.595636732623034e-08, "loss": 0.0127, "step": 10886 }, { "epoch": 9.942465753424658, "grad_norm": 0.017344841733574867, "learning_rate": 6.494165398274987e-08, "loss": 0.0001, "step": 10887 }, { "epoch": 9.94337899543379, "grad_norm": 0.00501107657328248, "learning_rate": 6.392694063926942e-08, "loss": 0.0, "step": 10888 }, { "epoch": 9.944292237442923, "grad_norm": 66.14179229736328, "learning_rate": 6.291222729578894e-08, "loss": 0.2645, "step": 10889 }, { "epoch": 9.945205479452055, "grad_norm": 0.311479389667511, "learning_rate": 6.189751395230848e-08, "loss": 0.0013, "step": 10890 }, { "epoch": 9.946118721461188, "grad_norm": 0.20408250391483307, "learning_rate": 6.0882800608828e-08, "loss": 0.0011, "step": 10891 }, { "epoch": 9.94703196347032, "grad_norm": 0.18015837669372559, "learning_rate": 5.986808726534754e-08, "loss": 0.001, "step": 10892 }, { "epoch": 9.947945205479453, "grad_norm": 0.09168709814548492, "learning_rate": 5.885337392186708e-08, "loss": 0.0007, "step": 10893 }, { "epoch": 9.948858447488584, "grad_norm": 0.04444999620318413, "learning_rate": 5.783866057838661e-08, "loss": 0.0002, "step": 10894 }, { "epoch": 9.949771689497716, "grad_norm": 0.03139285743236542, "learning_rate": 5.6823947234906145e-08, "loss": 0.0002, "step": 10895 }, { "epoch": 9.95068493150685, "grad_norm": 1.5820322036743164, "learning_rate": 5.580923389142567e-08, "loss": 0.0082, "step": 10896 }, { "epoch": 9.951598173515983, "grad_norm": 0.024753525853157043, "learning_rate": 5.479452054794521e-08, "loss": 0.0001, "step": 10897 }, { "epoch": 9.952511415525114, "grad_norm": 0.3783220052719116, "learning_rate": 5.3779807204464744e-08, "loss": 0.002, "step": 10898 }, { "epoch": 9.953424657534246, "grad_norm": 0.7790499925613403, "learning_rate": 5.276509386098428e-08, "loss": 0.0038, "step": 10899 }, { "epoch": 9.954337899543379, "grad_norm": 0.19117999076843262, "learning_rate": 5.175038051750381e-08, "loss": 0.0007, "step": 10900 }, { "epoch": 9.95525114155251, "grad_norm": 1.9861557483673096, "learning_rate": 5.073566717402334e-08, "loss": 0.0152, "step": 10901 }, { "epoch": 9.956164383561644, "grad_norm": 0.19140906631946564, "learning_rate": 4.972095383054288e-08, "loss": 0.0007, "step": 10902 }, { "epoch": 9.957077625570776, "grad_norm": 1.7070845365524292, "learning_rate": 4.870624048706241e-08, "loss": 0.0086, "step": 10903 }, { "epoch": 9.957990867579909, "grad_norm": 6.187711238861084, "learning_rate": 4.7691527143581944e-08, "loss": 0.0366, "step": 10904 }, { "epoch": 9.95890410958904, "grad_norm": 0.013727677054703236, "learning_rate": 4.667681380010148e-08, "loss": 0.0001, "step": 10905 }, { "epoch": 9.959817351598174, "grad_norm": 1.8960622549057007, "learning_rate": 4.5662100456621004e-08, "loss": 0.0082, "step": 10906 }, { "epoch": 9.960730593607305, "grad_norm": 5.47308349609375, "learning_rate": 4.4647387113140544e-08, "loss": 0.027, "step": 10907 }, { "epoch": 9.961643835616439, "grad_norm": 0.06840189546346664, "learning_rate": 4.363267376966007e-08, "loss": 0.0004, "step": 10908 }, { "epoch": 9.96255707762557, "grad_norm": 0.0757259652018547, "learning_rate": 4.261796042617961e-08, "loss": 0.0003, "step": 10909 }, { "epoch": 9.963470319634704, "grad_norm": 0.9995104074478149, "learning_rate": 4.1603247082699143e-08, "loss": 0.0048, "step": 10910 }, { "epoch": 9.964383561643835, "grad_norm": 1.826054573059082, "learning_rate": 4.058853373921867e-08, "loss": 0.0112, "step": 10911 }, { "epoch": 9.965296803652969, "grad_norm": 0.2701912820339203, "learning_rate": 3.957382039573821e-08, "loss": 0.0017, "step": 10912 }, { "epoch": 9.9662100456621, "grad_norm": 0.08543373644351959, "learning_rate": 3.855910705225774e-08, "loss": 0.0006, "step": 10913 }, { "epoch": 9.967123287671233, "grad_norm": 0.16258345544338226, "learning_rate": 3.7544393708777276e-08, "loss": 0.0007, "step": 10914 }, { "epoch": 9.968036529680365, "grad_norm": 0.5615471601486206, "learning_rate": 3.65296803652968e-08, "loss": 0.0022, "step": 10915 }, { "epoch": 9.968949771689498, "grad_norm": 0.25007665157318115, "learning_rate": 3.5514967021816336e-08, "loss": 0.0013, "step": 10916 }, { "epoch": 9.96986301369863, "grad_norm": 1.3555350303649902, "learning_rate": 3.4500253678335876e-08, "loss": 0.0074, "step": 10917 }, { "epoch": 9.970776255707763, "grad_norm": 1.4051167964935303, "learning_rate": 3.348554033485541e-08, "loss": 0.008, "step": 10918 }, { "epoch": 9.971689497716895, "grad_norm": 0.014578831382095814, "learning_rate": 3.2470826991374936e-08, "loss": 0.0001, "step": 10919 }, { "epoch": 9.972602739726028, "grad_norm": 1.2256345748901367, "learning_rate": 3.145611364789447e-08, "loss": 0.0073, "step": 10920 }, { "epoch": 9.97351598173516, "grad_norm": 1.1287100315093994, "learning_rate": 3.0441400304414e-08, "loss": 0.0057, "step": 10921 }, { "epoch": 9.974429223744291, "grad_norm": 1.048772931098938, "learning_rate": 2.942668696093354e-08, "loss": 0.0072, "step": 10922 }, { "epoch": 9.975342465753425, "grad_norm": 0.42891091108322144, "learning_rate": 2.8411973617453072e-08, "loss": 0.0016, "step": 10923 }, { "epoch": 9.976255707762558, "grad_norm": 0.7142228484153748, "learning_rate": 2.7397260273972606e-08, "loss": 0.004, "step": 10924 }, { "epoch": 9.97716894977169, "grad_norm": 0.5966784954071045, "learning_rate": 2.638254693049214e-08, "loss": 0.003, "step": 10925 }, { "epoch": 9.978082191780821, "grad_norm": 0.28838998079299927, "learning_rate": 2.536783358701167e-08, "loss": 0.0012, "step": 10926 }, { "epoch": 9.978995433789954, "grad_norm": 0.38860321044921875, "learning_rate": 2.4353120243531205e-08, "loss": 0.002, "step": 10927 }, { "epoch": 9.979908675799086, "grad_norm": 0.8097764253616333, "learning_rate": 2.333840690005074e-08, "loss": 0.005, "step": 10928 }, { "epoch": 9.98082191780822, "grad_norm": 0.020862853154540062, "learning_rate": 2.2323693556570272e-08, "loss": 0.0001, "step": 10929 }, { "epoch": 9.981735159817351, "grad_norm": 0.12467361986637115, "learning_rate": 2.1308980213089805e-08, "loss": 0.0006, "step": 10930 }, { "epoch": 9.982648401826484, "grad_norm": 0.04435146227478981, "learning_rate": 2.0294266869609335e-08, "loss": 0.0002, "step": 10931 }, { "epoch": 9.983561643835616, "grad_norm": 1.293745756149292, "learning_rate": 1.927955352612887e-08, "loss": 0.0073, "step": 10932 }, { "epoch": 9.98447488584475, "grad_norm": 0.3622720539569855, "learning_rate": 1.82648401826484e-08, "loss": 0.0019, "step": 10933 }, { "epoch": 9.98538812785388, "grad_norm": 1.357434868812561, "learning_rate": 1.7250126839167938e-08, "loss": 0.0072, "step": 10934 }, { "epoch": 9.986301369863014, "grad_norm": 0.3337659239768982, "learning_rate": 1.6235413495687468e-08, "loss": 0.002, "step": 10935 }, { "epoch": 9.987214611872146, "grad_norm": 0.3955340087413788, "learning_rate": 1.5220700152207e-08, "loss": 0.002, "step": 10936 }, { "epoch": 9.988127853881279, "grad_norm": 3.6347150802612305, "learning_rate": 1.4205986808726536e-08, "loss": 0.0233, "step": 10937 }, { "epoch": 9.98904109589041, "grad_norm": 2.3611559867858887, "learning_rate": 1.319127346524607e-08, "loss": 0.0137, "step": 10938 }, { "epoch": 9.989954337899544, "grad_norm": 18.2177677154541, "learning_rate": 1.2176560121765603e-08, "loss": 0.1016, "step": 10939 }, { "epoch": 9.990867579908675, "grad_norm": 0.13721176981925964, "learning_rate": 1.1161846778285136e-08, "loss": 0.001, "step": 10940 }, { "epoch": 9.991780821917809, "grad_norm": 0.08696836978197098, "learning_rate": 1.0147133434804667e-08, "loss": 0.0006, "step": 10941 }, { "epoch": 9.99269406392694, "grad_norm": 0.024994652718305588, "learning_rate": 9.1324200913242e-09, "loss": 0.0001, "step": 10942 }, { "epoch": 9.993607305936074, "grad_norm": 0.02151474542915821, "learning_rate": 8.117706747843734e-09, "loss": 0.0001, "step": 10943 }, { "epoch": 9.994520547945205, "grad_norm": 0.0183547455817461, "learning_rate": 7.102993404363268e-09, "loss": 0.0001, "step": 10944 }, { "epoch": 9.995433789954339, "grad_norm": 0.2755773365497589, "learning_rate": 6.088280060882801e-09, "loss": 0.0009, "step": 10945 }, { "epoch": 9.99634703196347, "grad_norm": 0.051379360258579254, "learning_rate": 5.073566717402334e-09, "loss": 0.0004, "step": 10946 }, { "epoch": 9.997260273972604, "grad_norm": 0.052370741963386536, "learning_rate": 4.058853373921867e-09, "loss": 0.0003, "step": 10947 }, { "epoch": 9.998173515981735, "grad_norm": 0.07074364274740219, "learning_rate": 3.0441400304414007e-09, "loss": 0.0004, "step": 10948 }, { "epoch": 9.999086757990867, "grad_norm": 0.08032790571451187, "learning_rate": 2.0294266869609335e-09, "loss": 0.0006, "step": 10949 }, { "epoch": 10.0, "grad_norm": 2.5664374828338623, "learning_rate": 1.0147133434804667e-09, "loss": 0.0104, "step": 10950 } ], "logging_steps": 1.0, "max_steps": 10950, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": -10950, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }