diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,148652 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.999952897953416, + "eval_steps": 500, + "global_step": 21230, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 9.420409316784815e-05, + "grad_norm": 11.339726448059082, + "learning_rate": 4.705882352941176e-08, + "loss": 1.4014, + "step": 1 + }, + { + "epoch": 0.0001884081863356963, + "grad_norm": 11.068122863769531, + "learning_rate": 9.411764705882353e-08, + "loss": 1.4854, + "step": 2 + }, + { + "epoch": 0.00028261227950354445, + "grad_norm": 12.699030876159668, + "learning_rate": 1.4117647058823532e-07, + "loss": 1.4708, + "step": 3 + }, + { + "epoch": 0.0003768163726713926, + "grad_norm": 11.916887283325195, + "learning_rate": 1.8823529411764705e-07, + "loss": 1.4012, + "step": 4 + }, + { + "epoch": 0.0004710204658392407, + "grad_norm": 11.894647598266602, + "learning_rate": 2.3529411764705883e-07, + "loss": 1.4301, + "step": 5 + }, + { + "epoch": 0.0005652245590070889, + "grad_norm": 11.18047046661377, + "learning_rate": 2.8235294117647064e-07, + "loss": 1.3958, + "step": 6 + }, + { + "epoch": 0.000659428652174937, + "grad_norm": 11.70823860168457, + "learning_rate": 3.294117647058824e-07, + "loss": 1.4063, + "step": 7 + }, + { + "epoch": 0.0007536327453427852, + "grad_norm": 10.865617752075195, + "learning_rate": 3.764705882352941e-07, + "loss": 1.3843, + "step": 8 + }, + { + "epoch": 0.0008478368385106332, + "grad_norm": 12.764007568359375, + "learning_rate": 4.235294117647059e-07, + "loss": 1.4925, + "step": 9 + }, + { + "epoch": 0.0009420409316784814, + "grad_norm": 11.885594367980957, + "learning_rate": 4.7058823529411767e-07, + "loss": 1.4427, + "step": 10 + }, + { + "epoch": 0.0010362450248463295, + "grad_norm": 10.534202575683594, + "learning_rate": 5.176470588235294e-07, + "loss": 1.4676, + "step": 11 + }, + { + "epoch": 0.0011304491180141778, + "grad_norm": 9.053123474121094, + "learning_rate": 5.647058823529413e-07, + "loss": 1.4124, + "step": 12 + }, + { + "epoch": 0.0012246532111820259, + "grad_norm": 9.09373664855957, + "learning_rate": 6.11764705882353e-07, + "loss": 1.2907, + "step": 13 + }, + { + "epoch": 0.001318857304349874, + "grad_norm": 10.74179744720459, + "learning_rate": 6.588235294117648e-07, + "loss": 1.4167, + "step": 14 + }, + { + "epoch": 0.001413061397517722, + "grad_norm": 8.731218338012695, + "learning_rate": 7.058823529411766e-07, + "loss": 1.3377, + "step": 15 + }, + { + "epoch": 0.0015072654906855703, + "grad_norm": 8.675665855407715, + "learning_rate": 7.529411764705882e-07, + "loss": 1.3483, + "step": 16 + }, + { + "epoch": 0.0016014695838534184, + "grad_norm": 8.480453491210938, + "learning_rate": 8.000000000000001e-07, + "loss": 1.3519, + "step": 17 + }, + { + "epoch": 0.0016956736770212665, + "grad_norm": 8.892087936401367, + "learning_rate": 8.470588235294118e-07, + "loss": 1.415, + "step": 18 + }, + { + "epoch": 0.0017898777701891148, + "grad_norm": 7.941059112548828, + "learning_rate": 8.941176470588237e-07, + "loss": 1.4556, + "step": 19 + }, + { + "epoch": 0.0018840818633569629, + "grad_norm": 7.159634590148926, + "learning_rate": 9.411764705882353e-07, + "loss": 1.1699, + "step": 20 + }, + { + "epoch": 0.001978285956524811, + "grad_norm": 7.125668048858643, + "learning_rate": 9.88235294117647e-07, + "loss": 1.224, + "step": 21 + }, + { + "epoch": 0.002072490049692659, + "grad_norm": 7.0029497146606445, + "learning_rate": 1.0352941176470589e-06, + "loss": 1.2028, + "step": 22 + }, + { + "epoch": 0.002166694142860507, + "grad_norm": 6.187039375305176, + "learning_rate": 1.0823529411764707e-06, + "loss": 1.1646, + "step": 23 + }, + { + "epoch": 0.0022608982360283556, + "grad_norm": 6.738177299499512, + "learning_rate": 1.1294117647058826e-06, + "loss": 1.1445, + "step": 24 + }, + { + "epoch": 0.0023551023291962037, + "grad_norm": 6.802069664001465, + "learning_rate": 1.1764705882352942e-06, + "loss": 1.1857, + "step": 25 + }, + { + "epoch": 0.0024493064223640518, + "grad_norm": 6.856109142303467, + "learning_rate": 1.223529411764706e-06, + "loss": 1.2034, + "step": 26 + }, + { + "epoch": 0.0025435105155319, + "grad_norm": 5.405038833618164, + "learning_rate": 1.2705882352941175e-06, + "loss": 1.0185, + "step": 27 + }, + { + "epoch": 0.002637714608699748, + "grad_norm": 4.919756889343262, + "learning_rate": 1.3176470588235296e-06, + "loss": 1.0102, + "step": 28 + }, + { + "epoch": 0.002731918701867596, + "grad_norm": 4.2810773849487305, + "learning_rate": 1.3647058823529413e-06, + "loss": 0.9412, + "step": 29 + }, + { + "epoch": 0.002826122795035444, + "grad_norm": 4.479569911956787, + "learning_rate": 1.4117647058823531e-06, + "loss": 0.9872, + "step": 30 + }, + { + "epoch": 0.0029203268882032926, + "grad_norm": 4.528472900390625, + "learning_rate": 1.4588235294117648e-06, + "loss": 0.9284, + "step": 31 + }, + { + "epoch": 0.0030145309813711407, + "grad_norm": 3.8819658756256104, + "learning_rate": 1.5058823529411764e-06, + "loss": 0.9209, + "step": 32 + }, + { + "epoch": 0.0031087350745389888, + "grad_norm": 4.198963642120361, + "learning_rate": 1.5529411764705885e-06, + "loss": 0.9574, + "step": 33 + }, + { + "epoch": 0.003202939167706837, + "grad_norm": 3.9804775714874268, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.9235, + "step": 34 + }, + { + "epoch": 0.003297143260874685, + "grad_norm": 4.031086444854736, + "learning_rate": 1.6470588235294118e-06, + "loss": 0.8726, + "step": 35 + }, + { + "epoch": 0.003391347354042533, + "grad_norm": 3.4946067333221436, + "learning_rate": 1.6941176470588237e-06, + "loss": 0.8327, + "step": 36 + }, + { + "epoch": 0.0034855514472103815, + "grad_norm": 3.3714377880096436, + "learning_rate": 1.7411764705882353e-06, + "loss": 0.9078, + "step": 37 + }, + { + "epoch": 0.0035797555403782296, + "grad_norm": 3.398184299468994, + "learning_rate": 1.7882352941176474e-06, + "loss": 0.8618, + "step": 38 + }, + { + "epoch": 0.0036739596335460777, + "grad_norm": 3.3403360843658447, + "learning_rate": 1.835294117647059e-06, + "loss": 0.8712, + "step": 39 + }, + { + "epoch": 0.0037681637267139257, + "grad_norm": 2.760078191757202, + "learning_rate": 1.8823529411764707e-06, + "loss": 0.8793, + "step": 40 + }, + { + "epoch": 0.003862367819881774, + "grad_norm": 2.781355619430542, + "learning_rate": 1.9294117647058825e-06, + "loss": 0.8844, + "step": 41 + }, + { + "epoch": 0.003956571913049622, + "grad_norm": 2.2941536903381348, + "learning_rate": 1.976470588235294e-06, + "loss": 0.7767, + "step": 42 + }, + { + "epoch": 0.00405077600621747, + "grad_norm": 2.1851463317871094, + "learning_rate": 2.0235294117647063e-06, + "loss": 0.7546, + "step": 43 + }, + { + "epoch": 0.004144980099385318, + "grad_norm": 2.916654109954834, + "learning_rate": 2.0705882352941177e-06, + "loss": 0.8267, + "step": 44 + }, + { + "epoch": 0.004239184192553166, + "grad_norm": 2.311211585998535, + "learning_rate": 2.1176470588235296e-06, + "loss": 0.7443, + "step": 45 + }, + { + "epoch": 0.004333388285721014, + "grad_norm": 2.1399497985839844, + "learning_rate": 2.1647058823529414e-06, + "loss": 0.7353, + "step": 46 + }, + { + "epoch": 0.004427592378888863, + "grad_norm": 2.1802375316619873, + "learning_rate": 2.2117647058823533e-06, + "loss": 0.7627, + "step": 47 + }, + { + "epoch": 0.004521796472056711, + "grad_norm": 2.017062187194824, + "learning_rate": 2.258823529411765e-06, + "loss": 0.7589, + "step": 48 + }, + { + "epoch": 0.004616000565224559, + "grad_norm": 2.568002700805664, + "learning_rate": 2.3058823529411766e-06, + "loss": 0.7944, + "step": 49 + }, + { + "epoch": 0.004710204658392407, + "grad_norm": 2.188521146774292, + "learning_rate": 2.3529411764705885e-06, + "loss": 0.8467, + "step": 50 + }, + { + "epoch": 0.0048044087515602555, + "grad_norm": 1.8111073970794678, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.752, + "step": 51 + }, + { + "epoch": 0.0048986128447281035, + "grad_norm": 1.927584171295166, + "learning_rate": 2.447058823529412e-06, + "loss": 0.7251, + "step": 52 + }, + { + "epoch": 0.004992816937895952, + "grad_norm": 1.8764548301696777, + "learning_rate": 2.4941176470588236e-06, + "loss": 0.7833, + "step": 53 + }, + { + "epoch": 0.0050870210310638, + "grad_norm": 1.7211430072784424, + "learning_rate": 2.541176470588235e-06, + "loss": 0.7421, + "step": 54 + }, + { + "epoch": 0.005181225124231648, + "grad_norm": 1.9138911962509155, + "learning_rate": 2.5882352941176473e-06, + "loss": 0.7632, + "step": 55 + }, + { + "epoch": 0.005275429217399496, + "grad_norm": 1.6788952350616455, + "learning_rate": 2.635294117647059e-06, + "loss": 0.7039, + "step": 56 + }, + { + "epoch": 0.005369633310567344, + "grad_norm": 1.7721251249313354, + "learning_rate": 2.682352941176471e-06, + "loss": 0.7089, + "step": 57 + }, + { + "epoch": 0.005463837403735192, + "grad_norm": 1.8875666856765747, + "learning_rate": 2.7294117647058825e-06, + "loss": 0.6769, + "step": 58 + }, + { + "epoch": 0.00555804149690304, + "grad_norm": 1.7260370254516602, + "learning_rate": 2.7764705882352944e-06, + "loss": 0.697, + "step": 59 + }, + { + "epoch": 0.005652245590070888, + "grad_norm": 1.7355884313583374, + "learning_rate": 2.8235294117647062e-06, + "loss": 0.6625, + "step": 60 + }, + { + "epoch": 0.005746449683238737, + "grad_norm": 1.7094358205795288, + "learning_rate": 2.8705882352941177e-06, + "loss": 0.6575, + "step": 61 + }, + { + "epoch": 0.005840653776406585, + "grad_norm": 1.5695163011550903, + "learning_rate": 2.9176470588235295e-06, + "loss": 0.6273, + "step": 62 + }, + { + "epoch": 0.005934857869574433, + "grad_norm": 2.2251622676849365, + "learning_rate": 2.9647058823529414e-06, + "loss": 0.7796, + "step": 63 + }, + { + "epoch": 0.006029061962742281, + "grad_norm": 1.889656901359558, + "learning_rate": 3.011764705882353e-06, + "loss": 0.6638, + "step": 64 + }, + { + "epoch": 0.006123266055910129, + "grad_norm": 1.8280856609344482, + "learning_rate": 3.058823529411765e-06, + "loss": 0.7595, + "step": 65 + }, + { + "epoch": 0.0062174701490779775, + "grad_norm": 1.7405308485031128, + "learning_rate": 3.105882352941177e-06, + "loss": 0.6778, + "step": 66 + }, + { + "epoch": 0.006311674242245826, + "grad_norm": 1.758604645729065, + "learning_rate": 3.1529411764705884e-06, + "loss": 0.6882, + "step": 67 + }, + { + "epoch": 0.006405878335413674, + "grad_norm": 1.6205955743789673, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.6489, + "step": 68 + }, + { + "epoch": 0.006500082428581522, + "grad_norm": 1.5852848291397095, + "learning_rate": 3.247058823529412e-06, + "loss": 0.698, + "step": 69 + }, + { + "epoch": 0.00659428652174937, + "grad_norm": 1.7549113035202026, + "learning_rate": 3.2941176470588236e-06, + "loss": 0.6203, + "step": 70 + }, + { + "epoch": 0.006688490614917218, + "grad_norm": 1.6429790258407593, + "learning_rate": 3.3411764705882354e-06, + "loss": 0.6237, + "step": 71 + }, + { + "epoch": 0.006782694708085066, + "grad_norm": 1.7007336616516113, + "learning_rate": 3.3882352941176473e-06, + "loss": 0.7162, + "step": 72 + }, + { + "epoch": 0.006876898801252914, + "grad_norm": 1.6153029203414917, + "learning_rate": 3.4352941176470587e-06, + "loss": 0.6938, + "step": 73 + }, + { + "epoch": 0.006971102894420763, + "grad_norm": 1.8349709510803223, + "learning_rate": 3.4823529411764706e-06, + "loss": 0.6437, + "step": 74 + }, + { + "epoch": 0.007065306987588611, + "grad_norm": 1.6667909622192383, + "learning_rate": 3.529411764705883e-06, + "loss": 0.6905, + "step": 75 + }, + { + "epoch": 0.007159511080756459, + "grad_norm": 1.966651201248169, + "learning_rate": 3.5764705882352948e-06, + "loss": 0.6857, + "step": 76 + }, + { + "epoch": 0.007253715173924307, + "grad_norm": 1.5965569019317627, + "learning_rate": 3.623529411764706e-06, + "loss": 0.5966, + "step": 77 + }, + { + "epoch": 0.007347919267092155, + "grad_norm": 1.6134767532348633, + "learning_rate": 3.670588235294118e-06, + "loss": 0.6519, + "step": 78 + }, + { + "epoch": 0.007442123360260003, + "grad_norm": 1.6604300737380981, + "learning_rate": 3.71764705882353e-06, + "loss": 0.6105, + "step": 79 + }, + { + "epoch": 0.0075363274534278515, + "grad_norm": 1.5630711317062378, + "learning_rate": 3.7647058823529414e-06, + "loss": 0.5989, + "step": 80 + }, + { + "epoch": 0.0076305315465956996, + "grad_norm": 1.7707780599594116, + "learning_rate": 3.8117647058823532e-06, + "loss": 0.705, + "step": 81 + }, + { + "epoch": 0.007724735639763548, + "grad_norm": 1.6089433431625366, + "learning_rate": 3.858823529411765e-06, + "loss": 0.6568, + "step": 82 + }, + { + "epoch": 0.007818939732931397, + "grad_norm": 1.6167658567428589, + "learning_rate": 3.905882352941177e-06, + "loss": 0.6381, + "step": 83 + }, + { + "epoch": 0.007913143826099244, + "grad_norm": 1.7541074752807617, + "learning_rate": 3.952941176470588e-06, + "loss": 0.7424, + "step": 84 + }, + { + "epoch": 0.008007347919267093, + "grad_norm": 1.5577178001403809, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6776, + "step": 85 + }, + { + "epoch": 0.00810155201243494, + "grad_norm": 1.5912531614303589, + "learning_rate": 4.0470588235294125e-06, + "loss": 0.5949, + "step": 86 + }, + { + "epoch": 0.008195756105602789, + "grad_norm": 1.5673514604568481, + "learning_rate": 4.094117647058824e-06, + "loss": 0.7048, + "step": 87 + }, + { + "epoch": 0.008289960198770636, + "grad_norm": 1.7601391077041626, + "learning_rate": 4.141176470588235e-06, + "loss": 0.6333, + "step": 88 + }, + { + "epoch": 0.008384164291938485, + "grad_norm": 1.598659634590149, + "learning_rate": 4.188235294117647e-06, + "loss": 0.6692, + "step": 89 + }, + { + "epoch": 0.008478368385106332, + "grad_norm": 1.6280843019485474, + "learning_rate": 4.235294117647059e-06, + "loss": 0.6205, + "step": 90 + }, + { + "epoch": 0.008572572478274181, + "grad_norm": 1.5283360481262207, + "learning_rate": 4.282352941176471e-06, + "loss": 0.6305, + "step": 91 + }, + { + "epoch": 0.008666776571442028, + "grad_norm": 1.6552764177322388, + "learning_rate": 4.329411764705883e-06, + "loss": 0.6371, + "step": 92 + }, + { + "epoch": 0.008760980664609877, + "grad_norm": 1.6012983322143555, + "learning_rate": 4.376470588235294e-06, + "loss": 0.6803, + "step": 93 + }, + { + "epoch": 0.008855184757777726, + "grad_norm": 1.5913658142089844, + "learning_rate": 4.423529411764707e-06, + "loss": 0.6094, + "step": 94 + }, + { + "epoch": 0.008949388850945574, + "grad_norm": 1.5518425703048706, + "learning_rate": 4.4705882352941184e-06, + "loss": 0.6185, + "step": 95 + }, + { + "epoch": 0.009043592944113422, + "grad_norm": 1.5862501859664917, + "learning_rate": 4.51764705882353e-06, + "loss": 0.6281, + "step": 96 + }, + { + "epoch": 0.00913779703728127, + "grad_norm": 1.6757488250732422, + "learning_rate": 4.564705882352941e-06, + "loss": 0.6599, + "step": 97 + }, + { + "epoch": 0.009232001130449119, + "grad_norm": 1.4505435228347778, + "learning_rate": 4.611764705882353e-06, + "loss": 0.6064, + "step": 98 + }, + { + "epoch": 0.009326205223616966, + "grad_norm": 1.4430831670761108, + "learning_rate": 4.658823529411765e-06, + "loss": 0.5856, + "step": 99 + }, + { + "epoch": 0.009420409316784815, + "grad_norm": 1.755677580833435, + "learning_rate": 4.705882352941177e-06, + "loss": 0.7195, + "step": 100 + }, + { + "epoch": 0.009514613409952662, + "grad_norm": 1.4079543352127075, + "learning_rate": 4.752941176470589e-06, + "loss": 0.5292, + "step": 101 + }, + { + "epoch": 0.009608817503120511, + "grad_norm": 1.7317286729812622, + "learning_rate": 4.800000000000001e-06, + "loss": 0.6123, + "step": 102 + }, + { + "epoch": 0.009703021596288358, + "grad_norm": 1.6857632398605347, + "learning_rate": 4.847058823529412e-06, + "loss": 0.6198, + "step": 103 + }, + { + "epoch": 0.009797225689456207, + "grad_norm": 1.7745858430862427, + "learning_rate": 4.894117647058824e-06, + "loss": 0.5912, + "step": 104 + }, + { + "epoch": 0.009891429782624054, + "grad_norm": 1.6427191495895386, + "learning_rate": 4.941176470588236e-06, + "loss": 0.5242, + "step": 105 + }, + { + "epoch": 0.009985633875791903, + "grad_norm": 1.5532233715057373, + "learning_rate": 4.988235294117647e-06, + "loss": 0.6075, + "step": 106 + }, + { + "epoch": 0.01007983796895975, + "grad_norm": 1.5551834106445312, + "learning_rate": 5.035294117647059e-06, + "loss": 0.588, + "step": 107 + }, + { + "epoch": 0.0101740420621276, + "grad_norm": 1.4823707342147827, + "learning_rate": 5.08235294117647e-06, + "loss": 0.6076, + "step": 108 + }, + { + "epoch": 0.010268246155295448, + "grad_norm": 1.7012345790863037, + "learning_rate": 5.129411764705883e-06, + "loss": 0.6516, + "step": 109 + }, + { + "epoch": 0.010362450248463296, + "grad_norm": 1.598178744316101, + "learning_rate": 5.176470588235295e-06, + "loss": 0.5925, + "step": 110 + }, + { + "epoch": 0.010456654341631145, + "grad_norm": 1.4713191986083984, + "learning_rate": 5.2235294117647065e-06, + "loss": 0.5498, + "step": 111 + }, + { + "epoch": 0.010550858434798992, + "grad_norm": 1.4787997007369995, + "learning_rate": 5.270588235294118e-06, + "loss": 0.5936, + "step": 112 + }, + { + "epoch": 0.01064506252796684, + "grad_norm": 1.4660656452178955, + "learning_rate": 5.317647058823529e-06, + "loss": 0.563, + "step": 113 + }, + { + "epoch": 0.010739266621134688, + "grad_norm": 1.5136562585830688, + "learning_rate": 5.364705882352942e-06, + "loss": 0.5717, + "step": 114 + }, + { + "epoch": 0.010833470714302537, + "grad_norm": 1.4835213422775269, + "learning_rate": 5.411764705882353e-06, + "loss": 0.5547, + "step": 115 + }, + { + "epoch": 0.010927674807470384, + "grad_norm": 1.4667916297912598, + "learning_rate": 5.458823529411765e-06, + "loss": 0.5683, + "step": 116 + }, + { + "epoch": 0.011021878900638233, + "grad_norm": 1.5353646278381348, + "learning_rate": 5.505882352941177e-06, + "loss": 0.5581, + "step": 117 + }, + { + "epoch": 0.01111608299380608, + "grad_norm": 1.4183745384216309, + "learning_rate": 5.552941176470589e-06, + "loss": 0.5556, + "step": 118 + }, + { + "epoch": 0.011210287086973929, + "grad_norm": 1.6031017303466797, + "learning_rate": 5.600000000000001e-06, + "loss": 0.6011, + "step": 119 + }, + { + "epoch": 0.011304491180141776, + "grad_norm": 1.57628333568573, + "learning_rate": 5.6470588235294125e-06, + "loss": 0.5728, + "step": 120 + }, + { + "epoch": 0.011398695273309625, + "grad_norm": 1.5039172172546387, + "learning_rate": 5.694117647058824e-06, + "loss": 0.6005, + "step": 121 + }, + { + "epoch": 0.011492899366477474, + "grad_norm": 1.4251632690429688, + "learning_rate": 5.741176470588235e-06, + "loss": 0.5717, + "step": 122 + }, + { + "epoch": 0.011587103459645321, + "grad_norm": 1.5075788497924805, + "learning_rate": 5.788235294117648e-06, + "loss": 0.6307, + "step": 123 + }, + { + "epoch": 0.01168130755281317, + "grad_norm": 1.4773801565170288, + "learning_rate": 5.835294117647059e-06, + "loss": 0.5513, + "step": 124 + }, + { + "epoch": 0.011775511645981018, + "grad_norm": 1.4166651964187622, + "learning_rate": 5.882352941176471e-06, + "loss": 0.6316, + "step": 125 + }, + { + "epoch": 0.011869715739148867, + "grad_norm": 1.4896390438079834, + "learning_rate": 5.929411764705883e-06, + "loss": 0.5854, + "step": 126 + }, + { + "epoch": 0.011963919832316714, + "grad_norm": 1.5579148530960083, + "learning_rate": 5.976470588235295e-06, + "loss": 0.6574, + "step": 127 + }, + { + "epoch": 0.012058123925484563, + "grad_norm": 1.5000208616256714, + "learning_rate": 6.023529411764706e-06, + "loss": 0.605, + "step": 128 + }, + { + "epoch": 0.01215232801865241, + "grad_norm": 1.848293423652649, + "learning_rate": 6.070588235294118e-06, + "loss": 0.5875, + "step": 129 + }, + { + "epoch": 0.012246532111820259, + "grad_norm": 1.6110161542892456, + "learning_rate": 6.11764705882353e-06, + "loss": 0.6874, + "step": 130 + }, + { + "epoch": 0.012340736204988106, + "grad_norm": 1.4359551668167114, + "learning_rate": 6.164705882352941e-06, + "loss": 0.5507, + "step": 131 + }, + { + "epoch": 0.012434940298155955, + "grad_norm": 1.3417483568191528, + "learning_rate": 6.211764705882354e-06, + "loss": 0.5285, + "step": 132 + }, + { + "epoch": 0.012529144391323802, + "grad_norm": 1.3211323022842407, + "learning_rate": 6.258823529411765e-06, + "loss": 0.57, + "step": 133 + }, + { + "epoch": 0.012623348484491651, + "grad_norm": 1.3868370056152344, + "learning_rate": 6.305882352941177e-06, + "loss": 0.5669, + "step": 134 + }, + { + "epoch": 0.0127175525776595, + "grad_norm": 1.4950891733169556, + "learning_rate": 6.352941176470589e-06, + "loss": 0.63, + "step": 135 + }, + { + "epoch": 0.012811756670827347, + "grad_norm": 1.4748525619506836, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.6375, + "step": 136 + }, + { + "epoch": 0.012905960763995196, + "grad_norm": 1.4417543411254883, + "learning_rate": 6.4470588235294116e-06, + "loss": 0.567, + "step": 137 + }, + { + "epoch": 0.013000164857163043, + "grad_norm": 1.3991072177886963, + "learning_rate": 6.494117647058824e-06, + "loss": 0.5565, + "step": 138 + }, + { + "epoch": 0.013094368950330892, + "grad_norm": 1.2583264112472534, + "learning_rate": 6.541176470588236e-06, + "loss": 0.5257, + "step": 139 + }, + { + "epoch": 0.01318857304349874, + "grad_norm": 1.4247316122055054, + "learning_rate": 6.588235294117647e-06, + "loss": 0.5586, + "step": 140 + }, + { + "epoch": 0.013282777136666589, + "grad_norm": 1.4726084470748901, + "learning_rate": 6.63529411764706e-06, + "loss": 0.5374, + "step": 141 + }, + { + "epoch": 0.013376981229834436, + "grad_norm": 1.5030871629714966, + "learning_rate": 6.682352941176471e-06, + "loss": 0.581, + "step": 142 + }, + { + "epoch": 0.013471185323002285, + "grad_norm": 1.4666228294372559, + "learning_rate": 6.729411764705884e-06, + "loss": 0.6157, + "step": 143 + }, + { + "epoch": 0.013565389416170132, + "grad_norm": 1.3930963277816772, + "learning_rate": 6.776470588235295e-06, + "loss": 0.5412, + "step": 144 + }, + { + "epoch": 0.013659593509337981, + "grad_norm": 1.4461039304733276, + "learning_rate": 6.8235294117647065e-06, + "loss": 0.5156, + "step": 145 + }, + { + "epoch": 0.013753797602505828, + "grad_norm": 1.3485381603240967, + "learning_rate": 6.8705882352941175e-06, + "loss": 0.6053, + "step": 146 + }, + { + "epoch": 0.013848001695673677, + "grad_norm": 1.4633151292800903, + "learning_rate": 6.91764705882353e-06, + "loss": 0.5936, + "step": 147 + }, + { + "epoch": 0.013942205788841526, + "grad_norm": 1.366125464439392, + "learning_rate": 6.964705882352941e-06, + "loss": 0.5454, + "step": 148 + }, + { + "epoch": 0.014036409882009373, + "grad_norm": 1.674149751663208, + "learning_rate": 7.011764705882353e-06, + "loss": 0.5619, + "step": 149 + }, + { + "epoch": 0.014130613975177222, + "grad_norm": 1.3823537826538086, + "learning_rate": 7.058823529411766e-06, + "loss": 0.5343, + "step": 150 + }, + { + "epoch": 0.01422481806834507, + "grad_norm": 1.4399135112762451, + "learning_rate": 7.105882352941177e-06, + "loss": 0.5469, + "step": 151 + }, + { + "epoch": 0.014319022161512918, + "grad_norm": 1.4885296821594238, + "learning_rate": 7.1529411764705895e-06, + "loss": 0.5506, + "step": 152 + }, + { + "epoch": 0.014413226254680766, + "grad_norm": 1.4054043292999268, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.5174, + "step": 153 + }, + { + "epoch": 0.014507430347848614, + "grad_norm": 1.6000251770019531, + "learning_rate": 7.247058823529412e-06, + "loss": 0.6446, + "step": 154 + }, + { + "epoch": 0.014601634441016462, + "grad_norm": 1.3519340753555298, + "learning_rate": 7.294117647058823e-06, + "loss": 0.4738, + "step": 155 + }, + { + "epoch": 0.01469583853418431, + "grad_norm": 1.4328761100769043, + "learning_rate": 7.341176470588236e-06, + "loss": 0.5681, + "step": 156 + }, + { + "epoch": 0.014790042627352158, + "grad_norm": 1.3303059339523315, + "learning_rate": 7.388235294117647e-06, + "loss": 0.5603, + "step": 157 + }, + { + "epoch": 0.014884246720520007, + "grad_norm": 1.3826634883880615, + "learning_rate": 7.43529411764706e-06, + "loss": 0.5284, + "step": 158 + }, + { + "epoch": 0.014978450813687854, + "grad_norm": 1.2612242698669434, + "learning_rate": 7.482352941176472e-06, + "loss": 0.5077, + "step": 159 + }, + { + "epoch": 0.015072654906855703, + "grad_norm": 1.3535127639770508, + "learning_rate": 7.529411764705883e-06, + "loss": 0.5265, + "step": 160 + }, + { + "epoch": 0.015166859000023552, + "grad_norm": 1.3785712718963623, + "learning_rate": 7.576470588235295e-06, + "loss": 0.4815, + "step": 161 + }, + { + "epoch": 0.015261063093191399, + "grad_norm": 1.4843330383300781, + "learning_rate": 7.6235294117647064e-06, + "loss": 0.5591, + "step": 162 + }, + { + "epoch": 0.015355267186359248, + "grad_norm": 1.5826313495635986, + "learning_rate": 7.670588235294119e-06, + "loss": 0.5464, + "step": 163 + }, + { + "epoch": 0.015449471279527095, + "grad_norm": 1.3580889701843262, + "learning_rate": 7.71764705882353e-06, + "loss": 0.5522, + "step": 164 + }, + { + "epoch": 0.015543675372694944, + "grad_norm": 1.3601776361465454, + "learning_rate": 7.764705882352941e-06, + "loss": 0.5444, + "step": 165 + }, + { + "epoch": 0.015637879465862793, + "grad_norm": 1.6120527982711792, + "learning_rate": 7.811764705882354e-06, + "loss": 0.568, + "step": 166 + }, + { + "epoch": 0.01573208355903064, + "grad_norm": 1.3571330308914185, + "learning_rate": 7.858823529411765e-06, + "loss": 0.581, + "step": 167 + }, + { + "epoch": 0.015826287652198488, + "grad_norm": 1.3575953245162964, + "learning_rate": 7.905882352941176e-06, + "loss": 0.4842, + "step": 168 + }, + { + "epoch": 0.015920491745366337, + "grad_norm": 1.4324496984481812, + "learning_rate": 7.952941176470589e-06, + "loss": 0.5788, + "step": 169 + }, + { + "epoch": 0.016014695838534185, + "grad_norm": 1.3828068971633911, + "learning_rate": 8.000000000000001e-06, + "loss": 0.5675, + "step": 170 + }, + { + "epoch": 0.01610889993170203, + "grad_norm": 1.3581621646881104, + "learning_rate": 8.047058823529412e-06, + "loss": 0.5055, + "step": 171 + }, + { + "epoch": 0.01620310402486988, + "grad_norm": 1.3914538621902466, + "learning_rate": 8.094117647058825e-06, + "loss": 0.5261, + "step": 172 + }, + { + "epoch": 0.01629730811803773, + "grad_norm": 1.4400177001953125, + "learning_rate": 8.141176470588236e-06, + "loss": 0.5764, + "step": 173 + }, + { + "epoch": 0.016391512211205578, + "grad_norm": 1.4791513681411743, + "learning_rate": 8.188235294117649e-06, + "loss": 0.5258, + "step": 174 + }, + { + "epoch": 0.016485716304373427, + "grad_norm": 1.4356977939605713, + "learning_rate": 8.23529411764706e-06, + "loss": 0.584, + "step": 175 + }, + { + "epoch": 0.016579920397541272, + "grad_norm": 1.3307058811187744, + "learning_rate": 8.28235294117647e-06, + "loss": 0.5009, + "step": 176 + }, + { + "epoch": 0.01667412449070912, + "grad_norm": 1.3605982065200806, + "learning_rate": 8.329411764705882e-06, + "loss": 0.5552, + "step": 177 + }, + { + "epoch": 0.01676832858387697, + "grad_norm": 1.4693851470947266, + "learning_rate": 8.376470588235295e-06, + "loss": 0.5796, + "step": 178 + }, + { + "epoch": 0.01686253267704482, + "grad_norm": 1.4685381650924683, + "learning_rate": 8.423529411764707e-06, + "loss": 0.6125, + "step": 179 + }, + { + "epoch": 0.016956736770212665, + "grad_norm": 1.2329943180084229, + "learning_rate": 8.470588235294118e-06, + "loss": 0.4956, + "step": 180 + }, + { + "epoch": 0.017050940863380513, + "grad_norm": 1.5707588195800781, + "learning_rate": 8.517647058823531e-06, + "loss": 0.5738, + "step": 181 + }, + { + "epoch": 0.017145144956548362, + "grad_norm": 1.6525170803070068, + "learning_rate": 8.564705882352942e-06, + "loss": 0.5336, + "step": 182 + }, + { + "epoch": 0.01723934904971621, + "grad_norm": 1.3461308479309082, + "learning_rate": 8.611764705882355e-06, + "loss": 0.5926, + "step": 183 + }, + { + "epoch": 0.017333553142884057, + "grad_norm": 1.3954113721847534, + "learning_rate": 8.658823529411766e-06, + "loss": 0.4952, + "step": 184 + }, + { + "epoch": 0.017427757236051906, + "grad_norm": 1.3195507526397705, + "learning_rate": 8.705882352941177e-06, + "loss": 0.5116, + "step": 185 + }, + { + "epoch": 0.017521961329219755, + "grad_norm": 1.3457469940185547, + "learning_rate": 8.752941176470588e-06, + "loss": 0.5605, + "step": 186 + }, + { + "epoch": 0.017616165422387604, + "grad_norm": 1.3406563997268677, + "learning_rate": 8.8e-06, + "loss": 0.5213, + "step": 187 + }, + { + "epoch": 0.017710369515555453, + "grad_norm": 1.3711072206497192, + "learning_rate": 8.847058823529413e-06, + "loss": 0.5083, + "step": 188 + }, + { + "epoch": 0.017804573608723298, + "grad_norm": 1.4641438722610474, + "learning_rate": 8.894117647058824e-06, + "loss": 0.4863, + "step": 189 + }, + { + "epoch": 0.017898777701891147, + "grad_norm": 1.3969779014587402, + "learning_rate": 8.941176470588237e-06, + "loss": 0.5855, + "step": 190 + }, + { + "epoch": 0.017992981795058996, + "grad_norm": 1.3376325368881226, + "learning_rate": 8.988235294117648e-06, + "loss": 0.5644, + "step": 191 + }, + { + "epoch": 0.018087185888226845, + "grad_norm": 1.3779734373092651, + "learning_rate": 9.03529411764706e-06, + "loss": 0.5194, + "step": 192 + }, + { + "epoch": 0.01818138998139469, + "grad_norm": 1.4868650436401367, + "learning_rate": 9.082352941176472e-06, + "loss": 0.5184, + "step": 193 + }, + { + "epoch": 0.01827559407456254, + "grad_norm": 1.3568449020385742, + "learning_rate": 9.129411764705883e-06, + "loss": 0.553, + "step": 194 + }, + { + "epoch": 0.01836979816773039, + "grad_norm": 1.3707945346832275, + "learning_rate": 9.176470588235294e-06, + "loss": 0.5774, + "step": 195 + }, + { + "epoch": 0.018464002260898237, + "grad_norm": 1.3641828298568726, + "learning_rate": 9.223529411764706e-06, + "loss": 0.5567, + "step": 196 + }, + { + "epoch": 0.018558206354066083, + "grad_norm": 1.3307080268859863, + "learning_rate": 9.270588235294117e-06, + "loss": 0.4996, + "step": 197 + }, + { + "epoch": 0.01865241044723393, + "grad_norm": 1.4373600482940674, + "learning_rate": 9.31764705882353e-06, + "loss": 0.4957, + "step": 198 + }, + { + "epoch": 0.01874661454040178, + "grad_norm": 1.361508846282959, + "learning_rate": 9.364705882352943e-06, + "loss": 0.5227, + "step": 199 + }, + { + "epoch": 0.01884081863356963, + "grad_norm": 1.3544015884399414, + "learning_rate": 9.411764705882354e-06, + "loss": 0.5122, + "step": 200 + }, + { + "epoch": 0.018935022726737475, + "grad_norm": 1.2734888792037964, + "learning_rate": 9.458823529411767e-06, + "loss": 0.5555, + "step": 201 + }, + { + "epoch": 0.019029226819905324, + "grad_norm": 1.4170773029327393, + "learning_rate": 9.505882352941178e-06, + "loss": 0.5838, + "step": 202 + }, + { + "epoch": 0.019123430913073173, + "grad_norm": 1.3663331270217896, + "learning_rate": 9.552941176470589e-06, + "loss": 0.5348, + "step": 203 + }, + { + "epoch": 0.019217635006241022, + "grad_norm": 1.4586267471313477, + "learning_rate": 9.600000000000001e-06, + "loss": 0.5567, + "step": 204 + }, + { + "epoch": 0.01931183909940887, + "grad_norm": 1.3278580904006958, + "learning_rate": 9.647058823529412e-06, + "loss": 0.5193, + "step": 205 + }, + { + "epoch": 0.019406043192576716, + "grad_norm": 1.3905582427978516, + "learning_rate": 9.694117647058823e-06, + "loss": 0.5318, + "step": 206 + }, + { + "epoch": 0.019500247285744565, + "grad_norm": 1.427916407585144, + "learning_rate": 9.741176470588236e-06, + "loss": 0.4598, + "step": 207 + }, + { + "epoch": 0.019594451378912414, + "grad_norm": 1.3491191864013672, + "learning_rate": 9.788235294117649e-06, + "loss": 0.505, + "step": 208 + }, + { + "epoch": 0.019688655472080263, + "grad_norm": 1.3706151247024536, + "learning_rate": 9.83529411764706e-06, + "loss": 0.5623, + "step": 209 + }, + { + "epoch": 0.01978285956524811, + "grad_norm": 1.4672727584838867, + "learning_rate": 9.882352941176472e-06, + "loss": 0.556, + "step": 210 + }, + { + "epoch": 0.019877063658415958, + "grad_norm": 1.283095121383667, + "learning_rate": 9.929411764705883e-06, + "loss": 0.5038, + "step": 211 + }, + { + "epoch": 0.019971267751583806, + "grad_norm": 1.3932231664657593, + "learning_rate": 9.976470588235294e-06, + "loss": 0.5366, + "step": 212 + }, + { + "epoch": 0.020065471844751655, + "grad_norm": 1.2726454734802246, + "learning_rate": 1.0023529411764707e-05, + "loss": 0.523, + "step": 213 + }, + { + "epoch": 0.0201596759379195, + "grad_norm": 1.360388994216919, + "learning_rate": 1.0070588235294118e-05, + "loss": 0.5856, + "step": 214 + }, + { + "epoch": 0.02025388003108735, + "grad_norm": 1.2967466115951538, + "learning_rate": 1.011764705882353e-05, + "loss": 0.499, + "step": 215 + }, + { + "epoch": 0.0203480841242552, + "grad_norm": 1.2784472703933716, + "learning_rate": 1.016470588235294e-05, + "loss": 0.5509, + "step": 216 + }, + { + "epoch": 0.020442288217423048, + "grad_norm": 1.389125108718872, + "learning_rate": 1.0211764705882355e-05, + "loss": 0.498, + "step": 217 + }, + { + "epoch": 0.020536492310590897, + "grad_norm": 1.286584734916687, + "learning_rate": 1.0258823529411766e-05, + "loss": 0.4908, + "step": 218 + }, + { + "epoch": 0.020630696403758742, + "grad_norm": 1.4463655948638916, + "learning_rate": 1.0305882352941177e-05, + "loss": 0.5232, + "step": 219 + }, + { + "epoch": 0.02072490049692659, + "grad_norm": 1.384580373764038, + "learning_rate": 1.035294117647059e-05, + "loss": 0.5515, + "step": 220 + }, + { + "epoch": 0.02081910459009444, + "grad_norm": 1.3646312952041626, + "learning_rate": 1.04e-05, + "loss": 0.5048, + "step": 221 + }, + { + "epoch": 0.02091330868326229, + "grad_norm": 1.398513674736023, + "learning_rate": 1.0447058823529413e-05, + "loss": 0.5485, + "step": 222 + }, + { + "epoch": 0.021007512776430134, + "grad_norm": 1.4606225490570068, + "learning_rate": 1.0494117647058824e-05, + "loss": 0.5682, + "step": 223 + }, + { + "epoch": 0.021101716869597983, + "grad_norm": 1.404390811920166, + "learning_rate": 1.0541176470588237e-05, + "loss": 0.5791, + "step": 224 + }, + { + "epoch": 0.021195920962765832, + "grad_norm": 1.3128252029418945, + "learning_rate": 1.0588235294117648e-05, + "loss": 0.5609, + "step": 225 + }, + { + "epoch": 0.02129012505593368, + "grad_norm": 1.289519190788269, + "learning_rate": 1.0635294117647059e-05, + "loss": 0.5223, + "step": 226 + }, + { + "epoch": 0.021384329149101527, + "grad_norm": 1.1863501071929932, + "learning_rate": 1.068235294117647e-05, + "loss": 0.4881, + "step": 227 + }, + { + "epoch": 0.021478533242269376, + "grad_norm": 1.232800006866455, + "learning_rate": 1.0729411764705884e-05, + "loss": 0.4874, + "step": 228 + }, + { + "epoch": 0.021572737335437225, + "grad_norm": 1.4398059844970703, + "learning_rate": 1.0776470588235295e-05, + "loss": 0.5385, + "step": 229 + }, + { + "epoch": 0.021666941428605074, + "grad_norm": 1.2320069074630737, + "learning_rate": 1.0823529411764706e-05, + "loss": 0.4659, + "step": 230 + }, + { + "epoch": 0.021761145521772923, + "grad_norm": 1.3715555667877197, + "learning_rate": 1.0870588235294119e-05, + "loss": 0.51, + "step": 231 + }, + { + "epoch": 0.021855349614940768, + "grad_norm": 1.4365489482879639, + "learning_rate": 1.091764705882353e-05, + "loss": 0.5078, + "step": 232 + }, + { + "epoch": 0.021949553708108617, + "grad_norm": 1.2546861171722412, + "learning_rate": 1.0964705882352941e-05, + "loss": 0.5661, + "step": 233 + }, + { + "epoch": 0.022043757801276466, + "grad_norm": 1.481594443321228, + "learning_rate": 1.1011764705882354e-05, + "loss": 0.6318, + "step": 234 + }, + { + "epoch": 0.022137961894444315, + "grad_norm": 1.4932091236114502, + "learning_rate": 1.1058823529411766e-05, + "loss": 0.5458, + "step": 235 + }, + { + "epoch": 0.02223216598761216, + "grad_norm": 1.3674851655960083, + "learning_rate": 1.1105882352941177e-05, + "loss": 0.4709, + "step": 236 + }, + { + "epoch": 0.02232637008078001, + "grad_norm": 1.3787115812301636, + "learning_rate": 1.1152941176470588e-05, + "loss": 0.5804, + "step": 237 + }, + { + "epoch": 0.022420574173947858, + "grad_norm": 1.3375911712646484, + "learning_rate": 1.1200000000000001e-05, + "loss": 0.4798, + "step": 238 + }, + { + "epoch": 0.022514778267115707, + "grad_norm": 1.4716728925704956, + "learning_rate": 1.1247058823529414e-05, + "loss": 0.5825, + "step": 239 + }, + { + "epoch": 0.022608982360283553, + "grad_norm": 1.3478437662124634, + "learning_rate": 1.1294117647058825e-05, + "loss": 0.5459, + "step": 240 + }, + { + "epoch": 0.0227031864534514, + "grad_norm": 1.2855398654937744, + "learning_rate": 1.1341176470588236e-05, + "loss": 0.5437, + "step": 241 + }, + { + "epoch": 0.02279739054661925, + "grad_norm": 1.4099336862564087, + "learning_rate": 1.1388235294117649e-05, + "loss": 0.5369, + "step": 242 + }, + { + "epoch": 0.0228915946397871, + "grad_norm": 1.4721664190292358, + "learning_rate": 1.143529411764706e-05, + "loss": 0.5232, + "step": 243 + }, + { + "epoch": 0.02298579873295495, + "grad_norm": 1.4028209447860718, + "learning_rate": 1.148235294117647e-05, + "loss": 0.5668, + "step": 244 + }, + { + "epoch": 0.023080002826122794, + "grad_norm": 1.2915841341018677, + "learning_rate": 1.1529411764705882e-05, + "loss": 0.5403, + "step": 245 + }, + { + "epoch": 0.023174206919290643, + "grad_norm": 1.385123372077942, + "learning_rate": 1.1576470588235296e-05, + "loss": 0.4851, + "step": 246 + }, + { + "epoch": 0.023268411012458492, + "grad_norm": 1.4114811420440674, + "learning_rate": 1.1623529411764707e-05, + "loss": 0.638, + "step": 247 + }, + { + "epoch": 0.02336261510562634, + "grad_norm": 1.2462477684020996, + "learning_rate": 1.1670588235294118e-05, + "loss": 0.4535, + "step": 248 + }, + { + "epoch": 0.023456819198794186, + "grad_norm": 1.4437800645828247, + "learning_rate": 1.171764705882353e-05, + "loss": 0.5457, + "step": 249 + }, + { + "epoch": 0.023551023291962035, + "grad_norm": 1.3583869934082031, + "learning_rate": 1.1764705882352942e-05, + "loss": 0.5556, + "step": 250 + }, + { + "epoch": 0.023645227385129884, + "grad_norm": 1.3165775537490845, + "learning_rate": 1.1811764705882353e-05, + "loss": 0.4812, + "step": 251 + }, + { + "epoch": 0.023739431478297733, + "grad_norm": 1.3515589237213135, + "learning_rate": 1.1858823529411766e-05, + "loss": 0.5272, + "step": 252 + }, + { + "epoch": 0.02383363557146558, + "grad_norm": 1.3042768239974976, + "learning_rate": 1.1905882352941178e-05, + "loss": 0.5391, + "step": 253 + }, + { + "epoch": 0.023927839664633428, + "grad_norm": 1.318557620048523, + "learning_rate": 1.195294117647059e-05, + "loss": 0.5077, + "step": 254 + }, + { + "epoch": 0.024022043757801276, + "grad_norm": 1.2530382871627808, + "learning_rate": 1.2e-05, + "loss": 0.5204, + "step": 255 + }, + { + "epoch": 0.024116247850969125, + "grad_norm": 1.3513764142990112, + "learning_rate": 1.2047058823529411e-05, + "loss": 0.5431, + "step": 256 + }, + { + "epoch": 0.024210451944136974, + "grad_norm": 1.1965149641036987, + "learning_rate": 1.2094117647058826e-05, + "loss": 0.4621, + "step": 257 + }, + { + "epoch": 0.02430465603730482, + "grad_norm": 1.5096133947372437, + "learning_rate": 1.2141176470588237e-05, + "loss": 0.5475, + "step": 258 + }, + { + "epoch": 0.02439886013047267, + "grad_norm": 1.3592801094055176, + "learning_rate": 1.2188235294117648e-05, + "loss": 0.4901, + "step": 259 + }, + { + "epoch": 0.024493064223640518, + "grad_norm": 1.318343162536621, + "learning_rate": 1.223529411764706e-05, + "loss": 0.5354, + "step": 260 + }, + { + "epoch": 0.024587268316808367, + "grad_norm": 1.2713096141815186, + "learning_rate": 1.2282352941176471e-05, + "loss": 0.5345, + "step": 261 + }, + { + "epoch": 0.024681472409976212, + "grad_norm": 1.2716429233551025, + "learning_rate": 1.2329411764705882e-05, + "loss": 0.4736, + "step": 262 + }, + { + "epoch": 0.02477567650314406, + "grad_norm": 1.2657898664474487, + "learning_rate": 1.2376470588235294e-05, + "loss": 0.522, + "step": 263 + }, + { + "epoch": 0.02486988059631191, + "grad_norm": 1.1999789476394653, + "learning_rate": 1.2423529411764708e-05, + "loss": 0.503, + "step": 264 + }, + { + "epoch": 0.02496408468947976, + "grad_norm": 1.2083820104599, + "learning_rate": 1.2470588235294119e-05, + "loss": 0.5141, + "step": 265 + }, + { + "epoch": 0.025058288782647604, + "grad_norm": 1.289919376373291, + "learning_rate": 1.251764705882353e-05, + "loss": 0.4781, + "step": 266 + }, + { + "epoch": 0.025152492875815453, + "grad_norm": 1.260485053062439, + "learning_rate": 1.2564705882352943e-05, + "loss": 0.5109, + "step": 267 + }, + { + "epoch": 0.025246696968983302, + "grad_norm": 1.3441667556762695, + "learning_rate": 1.2611764705882354e-05, + "loss": 0.523, + "step": 268 + }, + { + "epoch": 0.02534090106215115, + "grad_norm": 1.3340215682983398, + "learning_rate": 1.2658823529411766e-05, + "loss": 0.5996, + "step": 269 + }, + { + "epoch": 0.025435105155319, + "grad_norm": 1.1880944967269897, + "learning_rate": 1.2705882352941177e-05, + "loss": 0.5058, + "step": 270 + }, + { + "epoch": 0.025529309248486846, + "grad_norm": 1.326841950416565, + "learning_rate": 1.275294117647059e-05, + "loss": 0.5749, + "step": 271 + }, + { + "epoch": 0.025623513341654695, + "grad_norm": 1.1489343643188477, + "learning_rate": 1.2800000000000001e-05, + "loss": 0.4905, + "step": 272 + }, + { + "epoch": 0.025717717434822544, + "grad_norm": 1.2286577224731445, + "learning_rate": 1.2847058823529412e-05, + "loss": 0.4753, + "step": 273 + }, + { + "epoch": 0.025811921527990393, + "grad_norm": 1.2917677164077759, + "learning_rate": 1.2894117647058823e-05, + "loss": 0.4893, + "step": 274 + }, + { + "epoch": 0.025906125621158238, + "grad_norm": 1.211621642112732, + "learning_rate": 1.2941176470588238e-05, + "loss": 0.4962, + "step": 275 + }, + { + "epoch": 0.026000329714326087, + "grad_norm": 1.2378382682800293, + "learning_rate": 1.2988235294117649e-05, + "loss": 0.5133, + "step": 276 + }, + { + "epoch": 0.026094533807493936, + "grad_norm": 1.3860008716583252, + "learning_rate": 1.303529411764706e-05, + "loss": 0.5281, + "step": 277 + }, + { + "epoch": 0.026188737900661785, + "grad_norm": 1.2689138650894165, + "learning_rate": 1.3082352941176472e-05, + "loss": 0.4678, + "step": 278 + }, + { + "epoch": 0.02628294199382963, + "grad_norm": 1.2363132238388062, + "learning_rate": 1.3129411764705883e-05, + "loss": 0.4456, + "step": 279 + }, + { + "epoch": 0.02637714608699748, + "grad_norm": 1.2797064781188965, + "learning_rate": 1.3176470588235294e-05, + "loss": 0.4297, + "step": 280 + }, + { + "epoch": 0.026471350180165328, + "grad_norm": 1.382623314857483, + "learning_rate": 1.3223529411764705e-05, + "loss": 0.5486, + "step": 281 + }, + { + "epoch": 0.026565554273333177, + "grad_norm": 1.1852174997329712, + "learning_rate": 1.327058823529412e-05, + "loss": 0.4982, + "step": 282 + }, + { + "epoch": 0.026659758366501026, + "grad_norm": 1.3323570489883423, + "learning_rate": 1.331764705882353e-05, + "loss": 0.5272, + "step": 283 + }, + { + "epoch": 0.02675396245966887, + "grad_norm": 1.3760366439819336, + "learning_rate": 1.3364705882352942e-05, + "loss": 0.5075, + "step": 284 + }, + { + "epoch": 0.02684816655283672, + "grad_norm": 1.3552740812301636, + "learning_rate": 1.3411764705882353e-05, + "loss": 0.5689, + "step": 285 + }, + { + "epoch": 0.02694237064600457, + "grad_norm": 1.250741720199585, + "learning_rate": 1.3458823529411767e-05, + "loss": 0.4848, + "step": 286 + }, + { + "epoch": 0.02703657473917242, + "grad_norm": 1.3446913957595825, + "learning_rate": 1.3505882352941178e-05, + "loss": 0.4934, + "step": 287 + }, + { + "epoch": 0.027130778832340264, + "grad_norm": 1.26736319065094, + "learning_rate": 1.355294117647059e-05, + "loss": 0.4598, + "step": 288 + }, + { + "epoch": 0.027224982925508113, + "grad_norm": 1.2434927225112915, + "learning_rate": 1.3600000000000002e-05, + "loss": 0.5369, + "step": 289 + }, + { + "epoch": 0.027319187018675962, + "grad_norm": 1.2333719730377197, + "learning_rate": 1.3647058823529413e-05, + "loss": 0.5317, + "step": 290 + }, + { + "epoch": 0.02741339111184381, + "grad_norm": 1.21355140209198, + "learning_rate": 1.3694117647058824e-05, + "loss": 0.4773, + "step": 291 + }, + { + "epoch": 0.027507595205011656, + "grad_norm": 1.2929593324661255, + "learning_rate": 1.3741176470588235e-05, + "loss": 0.4926, + "step": 292 + }, + { + "epoch": 0.027601799298179505, + "grad_norm": 1.2802207469940186, + "learning_rate": 1.378823529411765e-05, + "loss": 0.4911, + "step": 293 + }, + { + "epoch": 0.027696003391347354, + "grad_norm": 1.466518759727478, + "learning_rate": 1.383529411764706e-05, + "loss": 0.5255, + "step": 294 + }, + { + "epoch": 0.027790207484515203, + "grad_norm": 1.3376758098602295, + "learning_rate": 1.3882352941176471e-05, + "loss": 0.5025, + "step": 295 + }, + { + "epoch": 0.027884411577683052, + "grad_norm": 1.6920759677886963, + "learning_rate": 1.3929411764705882e-05, + "loss": 0.5366, + "step": 296 + }, + { + "epoch": 0.027978615670850897, + "grad_norm": 1.1678169965744019, + "learning_rate": 1.3976470588235295e-05, + "loss": 0.4639, + "step": 297 + }, + { + "epoch": 0.028072819764018746, + "grad_norm": 1.1555671691894531, + "learning_rate": 1.4023529411764706e-05, + "loss": 0.4613, + "step": 298 + }, + { + "epoch": 0.028167023857186595, + "grad_norm": 1.3847774267196655, + "learning_rate": 1.4070588235294119e-05, + "loss": 0.5177, + "step": 299 + }, + { + "epoch": 0.028261227950354444, + "grad_norm": 1.3086796998977661, + "learning_rate": 1.4117647058823532e-05, + "loss": 0.5475, + "step": 300 + }, + { + "epoch": 0.02835543204352229, + "grad_norm": 1.4727612733840942, + "learning_rate": 1.4164705882352943e-05, + "loss": 0.5696, + "step": 301 + }, + { + "epoch": 0.02844963613669014, + "grad_norm": 1.166216492652893, + "learning_rate": 1.4211764705882354e-05, + "loss": 0.4991, + "step": 302 + }, + { + "epoch": 0.028543840229857988, + "grad_norm": 1.2508559226989746, + "learning_rate": 1.4258823529411765e-05, + "loss": 0.472, + "step": 303 + }, + { + "epoch": 0.028638044323025837, + "grad_norm": 1.341174602508545, + "learning_rate": 1.4305882352941179e-05, + "loss": 0.5393, + "step": 304 + }, + { + "epoch": 0.028732248416193682, + "grad_norm": 1.3411362171173096, + "learning_rate": 1.435294117647059e-05, + "loss": 0.49, + "step": 305 + }, + { + "epoch": 0.02882645250936153, + "grad_norm": 1.2923569679260254, + "learning_rate": 1.4400000000000001e-05, + "loss": 0.529, + "step": 306 + }, + { + "epoch": 0.02892065660252938, + "grad_norm": 1.364295482635498, + "learning_rate": 1.4447058823529414e-05, + "loss": 0.527, + "step": 307 + }, + { + "epoch": 0.02901486069569723, + "grad_norm": 1.2051934003829956, + "learning_rate": 1.4494117647058825e-05, + "loss": 0.4571, + "step": 308 + }, + { + "epoch": 0.029109064788865078, + "grad_norm": 1.301098346710205, + "learning_rate": 1.4541176470588236e-05, + "loss": 0.5431, + "step": 309 + }, + { + "epoch": 0.029203268882032923, + "grad_norm": 1.2850219011306763, + "learning_rate": 1.4588235294117647e-05, + "loss": 0.4506, + "step": 310 + }, + { + "epoch": 0.029297472975200772, + "grad_norm": 1.2272703647613525, + "learning_rate": 1.4635294117647061e-05, + "loss": 0.4411, + "step": 311 + }, + { + "epoch": 0.02939167706836862, + "grad_norm": 1.2605528831481934, + "learning_rate": 1.4682352941176472e-05, + "loss": 0.4766, + "step": 312 + }, + { + "epoch": 0.02948588116153647, + "grad_norm": 1.2444404363632202, + "learning_rate": 1.4729411764705883e-05, + "loss": 0.4705, + "step": 313 + }, + { + "epoch": 0.029580085254704316, + "grad_norm": 1.1735955476760864, + "learning_rate": 1.4776470588235294e-05, + "loss": 0.4668, + "step": 314 + }, + { + "epoch": 0.029674289347872165, + "grad_norm": 1.2663644552230835, + "learning_rate": 1.4823529411764707e-05, + "loss": 0.5078, + "step": 315 + }, + { + "epoch": 0.029768493441040014, + "grad_norm": 1.2020328044891357, + "learning_rate": 1.487058823529412e-05, + "loss": 0.5079, + "step": 316 + }, + { + "epoch": 0.029862697534207863, + "grad_norm": 1.1874061822891235, + "learning_rate": 1.491764705882353e-05, + "loss": 0.4708, + "step": 317 + }, + { + "epoch": 0.029956901627375708, + "grad_norm": 1.1729247570037842, + "learning_rate": 1.4964705882352943e-05, + "loss": 0.4968, + "step": 318 + }, + { + "epoch": 0.030051105720543557, + "grad_norm": 1.2035945653915405, + "learning_rate": 1.5011764705882354e-05, + "loss": 0.4761, + "step": 319 + }, + { + "epoch": 0.030145309813711406, + "grad_norm": 1.1717956066131592, + "learning_rate": 1.5058823529411765e-05, + "loss": 0.504, + "step": 320 + }, + { + "epoch": 0.030239513906879255, + "grad_norm": 1.1463146209716797, + "learning_rate": 1.5105882352941176e-05, + "loss": 0.4883, + "step": 321 + }, + { + "epoch": 0.030333718000047104, + "grad_norm": 1.2568846940994263, + "learning_rate": 1.515294117647059e-05, + "loss": 0.4818, + "step": 322 + }, + { + "epoch": 0.03042792209321495, + "grad_norm": 1.1344101428985596, + "learning_rate": 1.5200000000000002e-05, + "loss": 0.4452, + "step": 323 + }, + { + "epoch": 0.030522126186382798, + "grad_norm": 1.2352666854858398, + "learning_rate": 1.5247058823529413e-05, + "loss": 0.4619, + "step": 324 + }, + { + "epoch": 0.030616330279550647, + "grad_norm": 1.1882596015930176, + "learning_rate": 1.5294117647058822e-05, + "loss": 0.508, + "step": 325 + }, + { + "epoch": 0.030710534372718496, + "grad_norm": 1.341575264930725, + "learning_rate": 1.5341176470588238e-05, + "loss": 0.5217, + "step": 326 + }, + { + "epoch": 0.03080473846588634, + "grad_norm": 1.3158843517303467, + "learning_rate": 1.5388235294117648e-05, + "loss": 0.5428, + "step": 327 + }, + { + "epoch": 0.03089894255905419, + "grad_norm": 1.2857859134674072, + "learning_rate": 1.543529411764706e-05, + "loss": 0.5611, + "step": 328 + }, + { + "epoch": 0.03099314665222204, + "grad_norm": 1.1938344240188599, + "learning_rate": 1.5482352941176473e-05, + "loss": 0.4873, + "step": 329 + }, + { + "epoch": 0.03108735074538989, + "grad_norm": 1.1202720403671265, + "learning_rate": 1.5529411764705882e-05, + "loss": 0.4402, + "step": 330 + }, + { + "epoch": 0.031181554838557734, + "grad_norm": 1.1185312271118164, + "learning_rate": 1.5576470588235295e-05, + "loss": 0.5004, + "step": 331 + }, + { + "epoch": 0.031275758931725586, + "grad_norm": 1.2877079248428345, + "learning_rate": 1.5623529411764708e-05, + "loss": 0.4842, + "step": 332 + }, + { + "epoch": 0.03136996302489343, + "grad_norm": 1.2797932624816895, + "learning_rate": 1.567058823529412e-05, + "loss": 0.4949, + "step": 333 + }, + { + "epoch": 0.03146416711806128, + "grad_norm": 1.0597726106643677, + "learning_rate": 1.571764705882353e-05, + "loss": 0.4292, + "step": 334 + }, + { + "epoch": 0.031558371211229126, + "grad_norm": 1.1882489919662476, + "learning_rate": 1.5764705882352943e-05, + "loss": 0.4819, + "step": 335 + }, + { + "epoch": 0.031652575304396975, + "grad_norm": 1.305355429649353, + "learning_rate": 1.5811764705882352e-05, + "loss": 0.4368, + "step": 336 + }, + { + "epoch": 0.031746779397564824, + "grad_norm": 1.1395972967147827, + "learning_rate": 1.5858823529411768e-05, + "loss": 0.4541, + "step": 337 + }, + { + "epoch": 0.03184098349073267, + "grad_norm": 1.4120413064956665, + "learning_rate": 1.5905882352941177e-05, + "loss": 0.4797, + "step": 338 + }, + { + "epoch": 0.03193518758390052, + "grad_norm": 1.2574928998947144, + "learning_rate": 1.595294117647059e-05, + "loss": 0.4914, + "step": 339 + }, + { + "epoch": 0.03202939167706837, + "grad_norm": 1.1765894889831543, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.4123, + "step": 340 + }, + { + "epoch": 0.03212359577023622, + "grad_norm": 1.1035923957824707, + "learning_rate": 1.6047058823529412e-05, + "loss": 0.4594, + "step": 341 + }, + { + "epoch": 0.03221779986340406, + "grad_norm": 1.3358131647109985, + "learning_rate": 1.6094117647058825e-05, + "loss": 0.5147, + "step": 342 + }, + { + "epoch": 0.03231200395657191, + "grad_norm": 1.2328746318817139, + "learning_rate": 1.6141176470588234e-05, + "loss": 0.4826, + "step": 343 + }, + { + "epoch": 0.03240620804973976, + "grad_norm": 1.3128273487091064, + "learning_rate": 1.618823529411765e-05, + "loss": 0.4934, + "step": 344 + }, + { + "epoch": 0.03250041214290761, + "grad_norm": 1.1448085308074951, + "learning_rate": 1.623529411764706e-05, + "loss": 0.4152, + "step": 345 + }, + { + "epoch": 0.03259461623607546, + "grad_norm": 1.2045378684997559, + "learning_rate": 1.6282352941176472e-05, + "loss": 0.4433, + "step": 346 + }, + { + "epoch": 0.03268882032924331, + "grad_norm": 1.3228363990783691, + "learning_rate": 1.6329411764705885e-05, + "loss": 0.5286, + "step": 347 + }, + { + "epoch": 0.032783024422411156, + "grad_norm": 1.3525841236114502, + "learning_rate": 1.6376470588235298e-05, + "loss": 0.5279, + "step": 348 + }, + { + "epoch": 0.032877228515579004, + "grad_norm": 1.5093525648117065, + "learning_rate": 1.6423529411764707e-05, + "loss": 0.5161, + "step": 349 + }, + { + "epoch": 0.03297143260874685, + "grad_norm": 1.2169058322906494, + "learning_rate": 1.647058823529412e-05, + "loss": 0.4849, + "step": 350 + }, + { + "epoch": 0.033065636701914695, + "grad_norm": 1.2074980735778809, + "learning_rate": 1.6517647058823532e-05, + "loss": 0.4961, + "step": 351 + }, + { + "epoch": 0.033159840795082544, + "grad_norm": 1.128026008605957, + "learning_rate": 1.656470588235294e-05, + "loss": 0.4738, + "step": 352 + }, + { + "epoch": 0.03325404488825039, + "grad_norm": 1.2816585302352905, + "learning_rate": 1.6611764705882354e-05, + "loss": 0.4957, + "step": 353 + }, + { + "epoch": 0.03334824898141824, + "grad_norm": 1.2620776891708374, + "learning_rate": 1.6658823529411764e-05, + "loss": 0.549, + "step": 354 + }, + { + "epoch": 0.03344245307458609, + "grad_norm": 1.2972233295440674, + "learning_rate": 1.670588235294118e-05, + "loss": 0.5037, + "step": 355 + }, + { + "epoch": 0.03353665716775394, + "grad_norm": 1.1903115510940552, + "learning_rate": 1.675294117647059e-05, + "loss": 0.5025, + "step": 356 + }, + { + "epoch": 0.03363086126092179, + "grad_norm": 1.0767686367034912, + "learning_rate": 1.6800000000000002e-05, + "loss": 0.4487, + "step": 357 + }, + { + "epoch": 0.03372506535408964, + "grad_norm": 1.1257960796356201, + "learning_rate": 1.6847058823529414e-05, + "loss": 0.4863, + "step": 358 + }, + { + "epoch": 0.03381926944725748, + "grad_norm": 1.2453886270523071, + "learning_rate": 1.6894117647058824e-05, + "loss": 0.4671, + "step": 359 + }, + { + "epoch": 0.03391347354042533, + "grad_norm": 1.2498379945755005, + "learning_rate": 1.6941176470588237e-05, + "loss": 0.5045, + "step": 360 + }, + { + "epoch": 0.03400767763359318, + "grad_norm": 1.1781715154647827, + "learning_rate": 1.698823529411765e-05, + "loss": 0.4638, + "step": 361 + }, + { + "epoch": 0.03410188172676103, + "grad_norm": 1.3448647260665894, + "learning_rate": 1.7035294117647062e-05, + "loss": 0.5319, + "step": 362 + }, + { + "epoch": 0.034196085819928876, + "grad_norm": 1.0600744485855103, + "learning_rate": 1.708235294117647e-05, + "loss": 0.4685, + "step": 363 + }, + { + "epoch": 0.034290289913096725, + "grad_norm": 1.1628053188323975, + "learning_rate": 1.7129411764705884e-05, + "loss": 0.466, + "step": 364 + }, + { + "epoch": 0.034384494006264574, + "grad_norm": 1.249374270439148, + "learning_rate": 1.7176470588235293e-05, + "loss": 0.4751, + "step": 365 + }, + { + "epoch": 0.03447869809943242, + "grad_norm": 1.2262009382247925, + "learning_rate": 1.722352941176471e-05, + "loss": 0.5231, + "step": 366 + }, + { + "epoch": 0.03457290219260027, + "grad_norm": 1.08585786819458, + "learning_rate": 1.727058823529412e-05, + "loss": 0.4549, + "step": 367 + }, + { + "epoch": 0.034667106285768114, + "grad_norm": 1.250612497329712, + "learning_rate": 1.731764705882353e-05, + "loss": 0.5023, + "step": 368 + }, + { + "epoch": 0.03476131037893596, + "grad_norm": 1.2346527576446533, + "learning_rate": 1.7364705882352944e-05, + "loss": 0.4137, + "step": 369 + }, + { + "epoch": 0.03485551447210381, + "grad_norm": 1.1174893379211426, + "learning_rate": 1.7411764705882353e-05, + "loss": 0.4631, + "step": 370 + }, + { + "epoch": 0.03494971856527166, + "grad_norm": 1.193302869796753, + "learning_rate": 1.7458823529411766e-05, + "loss": 0.4284, + "step": 371 + }, + { + "epoch": 0.03504392265843951, + "grad_norm": 1.3517142534255981, + "learning_rate": 1.7505882352941175e-05, + "loss": 0.4327, + "step": 372 + }, + { + "epoch": 0.03513812675160736, + "grad_norm": 1.2797682285308838, + "learning_rate": 1.755294117647059e-05, + "loss": 0.5303, + "step": 373 + }, + { + "epoch": 0.03523233084477521, + "grad_norm": 1.0743054151535034, + "learning_rate": 1.76e-05, + "loss": 0.385, + "step": 374 + }, + { + "epoch": 0.035326534937943056, + "grad_norm": 1.3105340003967285, + "learning_rate": 1.7647058823529414e-05, + "loss": 0.4938, + "step": 375 + }, + { + "epoch": 0.035420739031110905, + "grad_norm": 1.1594735383987427, + "learning_rate": 1.7694117647058826e-05, + "loss": 0.4668, + "step": 376 + }, + { + "epoch": 0.03551494312427875, + "grad_norm": 1.1395872831344604, + "learning_rate": 1.7741176470588236e-05, + "loss": 0.4816, + "step": 377 + }, + { + "epoch": 0.035609147217446596, + "grad_norm": 1.2569265365600586, + "learning_rate": 1.778823529411765e-05, + "loss": 0.5287, + "step": 378 + }, + { + "epoch": 0.035703351310614445, + "grad_norm": 1.1969325542449951, + "learning_rate": 1.783529411764706e-05, + "loss": 0.4791, + "step": 379 + }, + { + "epoch": 0.035797555403782294, + "grad_norm": 1.286061406135559, + "learning_rate": 1.7882352941176474e-05, + "loss": 0.4755, + "step": 380 + }, + { + "epoch": 0.03589175949695014, + "grad_norm": 1.2607399225234985, + "learning_rate": 1.7929411764705883e-05, + "loss": 0.513, + "step": 381 + }, + { + "epoch": 0.03598596359011799, + "grad_norm": 1.1393650770187378, + "learning_rate": 1.7976470588235296e-05, + "loss": 0.5092, + "step": 382 + }, + { + "epoch": 0.03608016768328584, + "grad_norm": 1.2013413906097412, + "learning_rate": 1.8023529411764705e-05, + "loss": 0.4752, + "step": 383 + }, + { + "epoch": 0.03617437177645369, + "grad_norm": 1.0431921482086182, + "learning_rate": 1.807058823529412e-05, + "loss": 0.4307, + "step": 384 + }, + { + "epoch": 0.03626857586962153, + "grad_norm": 1.1883023977279663, + "learning_rate": 1.811764705882353e-05, + "loss": 0.4929, + "step": 385 + }, + { + "epoch": 0.03636277996278938, + "grad_norm": 1.215178370475769, + "learning_rate": 1.8164705882352943e-05, + "loss": 0.4872, + "step": 386 + }, + { + "epoch": 0.03645698405595723, + "grad_norm": 1.2737501859664917, + "learning_rate": 1.8211764705882356e-05, + "loss": 0.4534, + "step": 387 + }, + { + "epoch": 0.03655118814912508, + "grad_norm": 1.1616137027740479, + "learning_rate": 1.8258823529411765e-05, + "loss": 0.4926, + "step": 388 + }, + { + "epoch": 0.03664539224229293, + "grad_norm": 1.0881837606430054, + "learning_rate": 1.8305882352941178e-05, + "loss": 0.432, + "step": 389 + }, + { + "epoch": 0.03673959633546078, + "grad_norm": 1.3107651472091675, + "learning_rate": 1.8352941176470587e-05, + "loss": 0.5049, + "step": 390 + }, + { + "epoch": 0.036833800428628626, + "grad_norm": 1.1299723386764526, + "learning_rate": 1.8400000000000003e-05, + "loss": 0.4903, + "step": 391 + }, + { + "epoch": 0.036928004521796474, + "grad_norm": 1.2426947355270386, + "learning_rate": 1.8447058823529413e-05, + "loss": 0.5486, + "step": 392 + }, + { + "epoch": 0.03702220861496432, + "grad_norm": 1.1760404109954834, + "learning_rate": 1.8494117647058825e-05, + "loss": 0.558, + "step": 393 + }, + { + "epoch": 0.037116412708132165, + "grad_norm": 1.048372745513916, + "learning_rate": 1.8541176470588235e-05, + "loss": 0.4148, + "step": 394 + }, + { + "epoch": 0.037210616801300014, + "grad_norm": 1.1490461826324463, + "learning_rate": 1.8588235294117647e-05, + "loss": 0.4577, + "step": 395 + }, + { + "epoch": 0.03730482089446786, + "grad_norm": 1.167213797569275, + "learning_rate": 1.863529411764706e-05, + "loss": 0.5022, + "step": 396 + }, + { + "epoch": 0.03739902498763571, + "grad_norm": 1.1789714097976685, + "learning_rate": 1.8682352941176473e-05, + "loss": 0.4654, + "step": 397 + }, + { + "epoch": 0.03749322908080356, + "grad_norm": 1.1918264627456665, + "learning_rate": 1.8729411764705886e-05, + "loss": 0.5157, + "step": 398 + }, + { + "epoch": 0.03758743317397141, + "grad_norm": 1.4549074172973633, + "learning_rate": 1.8776470588235295e-05, + "loss": 0.5104, + "step": 399 + }, + { + "epoch": 0.03768163726713926, + "grad_norm": 1.2526028156280518, + "learning_rate": 1.8823529411764708e-05, + "loss": 0.5051, + "step": 400 + }, + { + "epoch": 0.03777584136030711, + "grad_norm": 1.1502976417541504, + "learning_rate": 1.8870588235294117e-05, + "loss": 0.5033, + "step": 401 + }, + { + "epoch": 0.03787004545347495, + "grad_norm": 1.1888201236724854, + "learning_rate": 1.8917647058823533e-05, + "loss": 0.4586, + "step": 402 + }, + { + "epoch": 0.0379642495466428, + "grad_norm": 1.2222927808761597, + "learning_rate": 1.8964705882352942e-05, + "loss": 0.5019, + "step": 403 + }, + { + "epoch": 0.03805845363981065, + "grad_norm": 1.211268663406372, + "learning_rate": 1.9011764705882355e-05, + "loss": 0.4384, + "step": 404 + }, + { + "epoch": 0.0381526577329785, + "grad_norm": 1.079023838043213, + "learning_rate": 1.9058823529411764e-05, + "loss": 0.4863, + "step": 405 + }, + { + "epoch": 0.038246861826146346, + "grad_norm": 1.2167657613754272, + "learning_rate": 1.9105882352941177e-05, + "loss": 0.4722, + "step": 406 + }, + { + "epoch": 0.038341065919314195, + "grad_norm": 1.325701117515564, + "learning_rate": 1.915294117647059e-05, + "loss": 0.4828, + "step": 407 + }, + { + "epoch": 0.038435270012482044, + "grad_norm": 1.3329956531524658, + "learning_rate": 1.9200000000000003e-05, + "loss": 0.4434, + "step": 408 + }, + { + "epoch": 0.03852947410564989, + "grad_norm": 1.2114311456680298, + "learning_rate": 1.9247058823529415e-05, + "loss": 0.5059, + "step": 409 + }, + { + "epoch": 0.03862367819881774, + "grad_norm": 1.188414454460144, + "learning_rate": 1.9294117647058825e-05, + "loss": 0.4929, + "step": 410 + }, + { + "epoch": 0.038717882291985584, + "grad_norm": 1.505661129951477, + "learning_rate": 1.9341176470588237e-05, + "loss": 0.4896, + "step": 411 + }, + { + "epoch": 0.03881208638515343, + "grad_norm": 1.4131497144699097, + "learning_rate": 1.9388235294117647e-05, + "loss": 0.506, + "step": 412 + }, + { + "epoch": 0.03890629047832128, + "grad_norm": 1.2669509649276733, + "learning_rate": 1.9435294117647063e-05, + "loss": 0.5385, + "step": 413 + }, + { + "epoch": 0.03900049457148913, + "grad_norm": 1.2937191724777222, + "learning_rate": 1.9482352941176472e-05, + "loss": 0.4741, + "step": 414 + }, + { + "epoch": 0.03909469866465698, + "grad_norm": 1.3381024599075317, + "learning_rate": 1.9529411764705885e-05, + "loss": 0.4952, + "step": 415 + }, + { + "epoch": 0.03918890275782483, + "grad_norm": 1.2062170505523682, + "learning_rate": 1.9576470588235297e-05, + "loss": 0.4666, + "step": 416 + }, + { + "epoch": 0.03928310685099268, + "grad_norm": 1.2051784992218018, + "learning_rate": 1.9623529411764707e-05, + "loss": 0.5123, + "step": 417 + }, + { + "epoch": 0.039377310944160526, + "grad_norm": 1.2372658252716064, + "learning_rate": 1.967058823529412e-05, + "loss": 0.5203, + "step": 418 + }, + { + "epoch": 0.039471515037328375, + "grad_norm": 1.284125804901123, + "learning_rate": 1.971764705882353e-05, + "loss": 0.5003, + "step": 419 + }, + { + "epoch": 0.03956571913049622, + "grad_norm": 1.290552020072937, + "learning_rate": 1.9764705882352945e-05, + "loss": 0.477, + "step": 420 + }, + { + "epoch": 0.039659923223664066, + "grad_norm": 1.0635818243026733, + "learning_rate": 1.9811764705882354e-05, + "loss": 0.4348, + "step": 421 + }, + { + "epoch": 0.039754127316831915, + "grad_norm": 1.3787859678268433, + "learning_rate": 1.9858823529411767e-05, + "loss": 0.4778, + "step": 422 + }, + { + "epoch": 0.039848331409999764, + "grad_norm": 1.476486325263977, + "learning_rate": 1.9905882352941176e-05, + "loss": 0.5057, + "step": 423 + }, + { + "epoch": 0.03994253550316761, + "grad_norm": 1.2540762424468994, + "learning_rate": 1.995294117647059e-05, + "loss": 0.4737, + "step": 424 + }, + { + "epoch": 0.04003673959633546, + "grad_norm": 1.1084176301956177, + "learning_rate": 2e-05, + "loss": 0.4288, + "step": 425 + }, + { + "epoch": 0.04013094368950331, + "grad_norm": 1.2009215354919434, + "learning_rate": 1.999999988599227e-05, + "loss": 0.4826, + "step": 426 + }, + { + "epoch": 0.04022514778267116, + "grad_norm": 1.214808464050293, + "learning_rate": 1.9999999543969074e-05, + "loss": 0.4561, + "step": 427 + }, + { + "epoch": 0.040319351875839, + "grad_norm": 1.221494436264038, + "learning_rate": 1.9999998973930422e-05, + "loss": 0.5272, + "step": 428 + }, + { + "epoch": 0.04041355596900685, + "grad_norm": 1.1077792644500732, + "learning_rate": 1.999999817587633e-05, + "loss": 0.455, + "step": 429 + }, + { + "epoch": 0.0405077600621747, + "grad_norm": 1.235834002494812, + "learning_rate": 1.9999997149806816e-05, + "loss": 0.4743, + "step": 430 + }, + { + "epoch": 0.04060196415534255, + "grad_norm": 1.2237988710403442, + "learning_rate": 1.9999995895721897e-05, + "loss": 0.4966, + "step": 431 + }, + { + "epoch": 0.0406961682485104, + "grad_norm": 1.158136248588562, + "learning_rate": 1.999999441362161e-05, + "loss": 0.5112, + "step": 432 + }, + { + "epoch": 0.04079037234167825, + "grad_norm": 1.1336339712142944, + "learning_rate": 1.9999992703505986e-05, + "loss": 0.5015, + "step": 433 + }, + { + "epoch": 0.040884576434846095, + "grad_norm": 1.130056381225586, + "learning_rate": 1.9999990765375062e-05, + "loss": 0.4514, + "step": 434 + }, + { + "epoch": 0.040978780528013944, + "grad_norm": 1.162482738494873, + "learning_rate": 1.999998859922888e-05, + "loss": 0.469, + "step": 435 + }, + { + "epoch": 0.04107298462118179, + "grad_norm": 1.1694494485855103, + "learning_rate": 1.9999986205067496e-05, + "loss": 0.4799, + "step": 436 + }, + { + "epoch": 0.041167188714349635, + "grad_norm": 0.9968825578689575, + "learning_rate": 1.999998358289096e-05, + "loss": 0.4409, + "step": 437 + }, + { + "epoch": 0.041261392807517484, + "grad_norm": 1.028335452079773, + "learning_rate": 1.9999980732699336e-05, + "loss": 0.4539, + "step": 438 + }, + { + "epoch": 0.04135559690068533, + "grad_norm": 1.2932831048965454, + "learning_rate": 1.999997765449268e-05, + "loss": 0.492, + "step": 439 + }, + { + "epoch": 0.04144980099385318, + "grad_norm": 1.2190347909927368, + "learning_rate": 1.9999974348271075e-05, + "loss": 0.4841, + "step": 440 + }, + { + "epoch": 0.04154400508702103, + "grad_norm": 1.1261992454528809, + "learning_rate": 1.9999970814034583e-05, + "loss": 0.4339, + "step": 441 + }, + { + "epoch": 0.04163820918018888, + "grad_norm": 1.215971827507019, + "learning_rate": 1.9999967051783296e-05, + "loss": 0.5165, + "step": 442 + }, + { + "epoch": 0.04173241327335673, + "grad_norm": 1.1952204704284668, + "learning_rate": 1.9999963061517293e-05, + "loss": 0.5068, + "step": 443 + }, + { + "epoch": 0.04182661736652458, + "grad_norm": 1.1849550008773804, + "learning_rate": 1.9999958843236668e-05, + "loss": 0.4797, + "step": 444 + }, + { + "epoch": 0.04192082145969243, + "grad_norm": 1.0811119079589844, + "learning_rate": 1.9999954396941515e-05, + "loss": 0.4631, + "step": 445 + }, + { + "epoch": 0.04201502555286027, + "grad_norm": 1.1640421152114868, + "learning_rate": 1.9999949722631938e-05, + "loss": 0.473, + "step": 446 + }, + { + "epoch": 0.04210922964602812, + "grad_norm": 1.1983455419540405, + "learning_rate": 1.9999944820308043e-05, + "loss": 0.5546, + "step": 447 + }, + { + "epoch": 0.04220343373919597, + "grad_norm": 1.105420708656311, + "learning_rate": 1.9999939689969937e-05, + "loss": 0.4665, + "step": 448 + }, + { + "epoch": 0.042297637832363816, + "grad_norm": 1.109999418258667, + "learning_rate": 1.9999934331617747e-05, + "loss": 0.481, + "step": 449 + }, + { + "epoch": 0.042391841925531665, + "grad_norm": 1.1446915864944458, + "learning_rate": 1.999992874525158e-05, + "loss": 0.5105, + "step": 450 + }, + { + "epoch": 0.042486046018699514, + "grad_norm": 1.1875211000442505, + "learning_rate": 1.999992293087158e-05, + "loss": 0.5124, + "step": 451 + }, + { + "epoch": 0.04258025011186736, + "grad_norm": 1.2004013061523438, + "learning_rate": 1.9999916888477868e-05, + "loss": 0.4798, + "step": 452 + }, + { + "epoch": 0.04267445420503521, + "grad_norm": 1.2664998769760132, + "learning_rate": 1.999991061807059e-05, + "loss": 0.4603, + "step": 453 + }, + { + "epoch": 0.042768658298203054, + "grad_norm": 1.1395792961120605, + "learning_rate": 1.999990411964988e-05, + "loss": 0.5195, + "step": 454 + }, + { + "epoch": 0.0428628623913709, + "grad_norm": 1.1454837322235107, + "learning_rate": 1.9999897393215893e-05, + "loss": 0.512, + "step": 455 + }, + { + "epoch": 0.04295706648453875, + "grad_norm": 1.1613349914550781, + "learning_rate": 1.999989043876878e-05, + "loss": 0.5386, + "step": 456 + }, + { + "epoch": 0.0430512705777066, + "grad_norm": 1.0763133764266968, + "learning_rate": 1.9999883256308702e-05, + "loss": 0.437, + "step": 457 + }, + { + "epoch": 0.04314547467087445, + "grad_norm": 1.0761719942092896, + "learning_rate": 1.999987584583582e-05, + "loss": 0.4538, + "step": 458 + }, + { + "epoch": 0.0432396787640423, + "grad_norm": 1.1660627126693726, + "learning_rate": 1.9999868207350298e-05, + "loss": 0.5102, + "step": 459 + }, + { + "epoch": 0.04333388285721015, + "grad_norm": 1.187119483947754, + "learning_rate": 1.9999860340852318e-05, + "loss": 0.5497, + "step": 460 + }, + { + "epoch": 0.043428086950377996, + "grad_norm": 1.0927040576934814, + "learning_rate": 1.9999852246342064e-05, + "loss": 0.4906, + "step": 461 + }, + { + "epoch": 0.043522291043545845, + "grad_norm": 1.19398832321167, + "learning_rate": 1.9999843923819705e-05, + "loss": 0.461, + "step": 462 + }, + { + "epoch": 0.04361649513671369, + "grad_norm": 1.0197585821151733, + "learning_rate": 1.9999835373285445e-05, + "loss": 0.4999, + "step": 463 + }, + { + "epoch": 0.043710699229881536, + "grad_norm": 1.040513277053833, + "learning_rate": 1.9999826594739473e-05, + "loss": 0.4405, + "step": 464 + }, + { + "epoch": 0.043804903323049385, + "grad_norm": 1.1803407669067383, + "learning_rate": 1.9999817588181987e-05, + "loss": 0.4917, + "step": 465 + }, + { + "epoch": 0.043899107416217234, + "grad_norm": 1.0444204807281494, + "learning_rate": 1.9999808353613196e-05, + "loss": 0.4894, + "step": 466 + }, + { + "epoch": 0.04399331150938508, + "grad_norm": 1.1150667667388916, + "learning_rate": 1.999979889103331e-05, + "loss": 0.4987, + "step": 467 + }, + { + "epoch": 0.04408751560255293, + "grad_norm": 1.2186847925186157, + "learning_rate": 1.9999789200442545e-05, + "loss": 0.4796, + "step": 468 + }, + { + "epoch": 0.04418171969572078, + "grad_norm": 1.144182562828064, + "learning_rate": 1.999977928184112e-05, + "loss": 0.5063, + "step": 469 + }, + { + "epoch": 0.04427592378888863, + "grad_norm": 1.0240905284881592, + "learning_rate": 1.9999769135229267e-05, + "loss": 0.5001, + "step": 470 + }, + { + "epoch": 0.04437012788205648, + "grad_norm": 1.119954228401184, + "learning_rate": 1.9999758760607207e-05, + "loss": 0.4732, + "step": 471 + }, + { + "epoch": 0.04446433197522432, + "grad_norm": 1.1916732788085938, + "learning_rate": 1.9999748157975184e-05, + "loss": 0.5002, + "step": 472 + }, + { + "epoch": 0.04455853606839217, + "grad_norm": 1.103894829750061, + "learning_rate": 1.999973732733344e-05, + "loss": 0.4516, + "step": 473 + }, + { + "epoch": 0.04465274016156002, + "grad_norm": 1.1423009634017944, + "learning_rate": 1.9999726268682217e-05, + "loss": 0.4931, + "step": 474 + }, + { + "epoch": 0.04474694425472787, + "grad_norm": 1.1311986446380615, + "learning_rate": 1.999971498202177e-05, + "loss": 0.4436, + "step": 475 + }, + { + "epoch": 0.044841148347895717, + "grad_norm": 1.1499707698822021, + "learning_rate": 1.9999703467352356e-05, + "loss": 0.4663, + "step": 476 + }, + { + "epoch": 0.044935352441063565, + "grad_norm": 1.1432238817214966, + "learning_rate": 1.999969172467424e-05, + "loss": 0.4648, + "step": 477 + }, + { + "epoch": 0.045029556534231414, + "grad_norm": 1.2627131938934326, + "learning_rate": 1.999967975398769e-05, + "loss": 0.5371, + "step": 478 + }, + { + "epoch": 0.04512376062739926, + "grad_norm": 1.1618503332138062, + "learning_rate": 1.9999667555292975e-05, + "loss": 0.4155, + "step": 479 + }, + { + "epoch": 0.045217964720567105, + "grad_norm": 1.1504175662994385, + "learning_rate": 1.9999655128590373e-05, + "loss": 0.4705, + "step": 480 + }, + { + "epoch": 0.045312168813734954, + "grad_norm": 1.2788769006729126, + "learning_rate": 1.999964247388017e-05, + "loss": 0.5294, + "step": 481 + }, + { + "epoch": 0.0454063729069028, + "grad_norm": 1.2049305438995361, + "learning_rate": 1.9999629591162658e-05, + "loss": 0.5037, + "step": 482 + }, + { + "epoch": 0.04550057700007065, + "grad_norm": 1.1303664445877075, + "learning_rate": 1.9999616480438122e-05, + "loss": 0.48, + "step": 483 + }, + { + "epoch": 0.0455947810932385, + "grad_norm": 1.032772183418274, + "learning_rate": 1.9999603141706866e-05, + "loss": 0.4712, + "step": 484 + }, + { + "epoch": 0.04568898518640635, + "grad_norm": 1.1010346412658691, + "learning_rate": 1.9999589574969198e-05, + "loss": 0.452, + "step": 485 + }, + { + "epoch": 0.0457831892795742, + "grad_norm": 1.2685179710388184, + "learning_rate": 1.9999575780225418e-05, + "loss": 0.4567, + "step": 486 + }, + { + "epoch": 0.04587739337274205, + "grad_norm": 1.0697119235992432, + "learning_rate": 1.9999561757475846e-05, + "loss": 0.4574, + "step": 487 + }, + { + "epoch": 0.0459715974659099, + "grad_norm": 1.0218472480773926, + "learning_rate": 1.9999547506720804e-05, + "loss": 0.4656, + "step": 488 + }, + { + "epoch": 0.04606580155907774, + "grad_norm": 1.01448655128479, + "learning_rate": 1.9999533027960613e-05, + "loss": 0.4836, + "step": 489 + }, + { + "epoch": 0.04616000565224559, + "grad_norm": 1.0936620235443115, + "learning_rate": 1.9999518321195605e-05, + "loss": 0.4991, + "step": 490 + }, + { + "epoch": 0.04625420974541344, + "grad_norm": 1.1408958435058594, + "learning_rate": 1.9999503386426113e-05, + "loss": 0.5001, + "step": 491 + }, + { + "epoch": 0.046348413838581286, + "grad_norm": 0.9541551470756531, + "learning_rate": 1.9999488223652478e-05, + "loss": 0.4713, + "step": 492 + }, + { + "epoch": 0.046442617931749135, + "grad_norm": 1.0181952714920044, + "learning_rate": 1.999947283287505e-05, + "loss": 0.4461, + "step": 493 + }, + { + "epoch": 0.046536822024916984, + "grad_norm": 0.9244551658630371, + "learning_rate": 1.9999457214094177e-05, + "loss": 0.3794, + "step": 494 + }, + { + "epoch": 0.04663102611808483, + "grad_norm": 1.0513863563537598, + "learning_rate": 1.9999441367310216e-05, + "loss": 0.4641, + "step": 495 + }, + { + "epoch": 0.04672523021125268, + "grad_norm": 1.0445611476898193, + "learning_rate": 1.999942529252352e-05, + "loss": 0.4759, + "step": 496 + }, + { + "epoch": 0.04681943430442053, + "grad_norm": 1.1633409261703491, + "learning_rate": 1.9999408989734474e-05, + "loss": 0.4455, + "step": 497 + }, + { + "epoch": 0.04691363839758837, + "grad_norm": 1.0402541160583496, + "learning_rate": 1.9999392458943432e-05, + "loss": 0.4627, + "step": 498 + }, + { + "epoch": 0.04700784249075622, + "grad_norm": 1.0239955186843872, + "learning_rate": 1.999937570015078e-05, + "loss": 0.4638, + "step": 499 + }, + { + "epoch": 0.04710204658392407, + "grad_norm": 1.2518709897994995, + "learning_rate": 1.9999358713356893e-05, + "loss": 0.5401, + "step": 500 + }, + { + "epoch": 0.04719625067709192, + "grad_norm": 1.000349998474121, + "learning_rate": 1.999934149856217e-05, + "loss": 0.4629, + "step": 501 + }, + { + "epoch": 0.04729045477025977, + "grad_norm": 1.187333345413208, + "learning_rate": 1.999932405576699e-05, + "loss": 0.5195, + "step": 502 + }, + { + "epoch": 0.04738465886342762, + "grad_norm": 1.1107083559036255, + "learning_rate": 1.999930638497176e-05, + "loss": 0.4893, + "step": 503 + }, + { + "epoch": 0.047478862956595466, + "grad_norm": 0.9216952323913574, + "learning_rate": 1.9999288486176882e-05, + "loss": 0.4263, + "step": 504 + }, + { + "epoch": 0.047573067049763315, + "grad_norm": 0.9864377379417419, + "learning_rate": 1.9999270359382762e-05, + "loss": 0.4538, + "step": 505 + }, + { + "epoch": 0.04766727114293116, + "grad_norm": 1.0480002164840698, + "learning_rate": 1.999925200458981e-05, + "loss": 0.4523, + "step": 506 + }, + { + "epoch": 0.047761475236099006, + "grad_norm": 1.1025638580322266, + "learning_rate": 1.999923342179845e-05, + "loss": 0.4599, + "step": 507 + }, + { + "epoch": 0.047855679329266855, + "grad_norm": 1.1831530332565308, + "learning_rate": 1.9999214611009105e-05, + "loss": 0.4961, + "step": 508 + }, + { + "epoch": 0.047949883422434704, + "grad_norm": 1.1201529502868652, + "learning_rate": 1.9999195572222204e-05, + "loss": 0.4841, + "step": 509 + }, + { + "epoch": 0.04804408751560255, + "grad_norm": 1.1097553968429565, + "learning_rate": 1.9999176305438178e-05, + "loss": 0.4252, + "step": 510 + }, + { + "epoch": 0.0481382916087704, + "grad_norm": 1.1436023712158203, + "learning_rate": 1.9999156810657466e-05, + "loss": 0.5302, + "step": 511 + }, + { + "epoch": 0.04823249570193825, + "grad_norm": 1.158348560333252, + "learning_rate": 1.999913708788052e-05, + "loss": 0.5032, + "step": 512 + }, + { + "epoch": 0.0483266997951061, + "grad_norm": 0.9677159190177917, + "learning_rate": 1.9999117137107783e-05, + "loss": 0.4697, + "step": 513 + }, + { + "epoch": 0.04842090388827395, + "grad_norm": 1.1214277744293213, + "learning_rate": 1.999909695833971e-05, + "loss": 0.4967, + "step": 514 + }, + { + "epoch": 0.04851510798144179, + "grad_norm": 1.142343282699585, + "learning_rate": 1.999907655157676e-05, + "loss": 0.473, + "step": 515 + }, + { + "epoch": 0.04860931207460964, + "grad_norm": 0.9969754219055176, + "learning_rate": 1.9999055916819402e-05, + "loss": 0.4535, + "step": 516 + }, + { + "epoch": 0.04870351616777749, + "grad_norm": 1.237532377243042, + "learning_rate": 1.9999035054068107e-05, + "loss": 0.5169, + "step": 517 + }, + { + "epoch": 0.04879772026094534, + "grad_norm": 1.1392680406570435, + "learning_rate": 1.999901396332335e-05, + "loss": 0.4856, + "step": 518 + }, + { + "epoch": 0.048891924354113186, + "grad_norm": 1.0848746299743652, + "learning_rate": 1.9998992644585606e-05, + "loss": 0.4951, + "step": 519 + }, + { + "epoch": 0.048986128447281035, + "grad_norm": 1.013777256011963, + "learning_rate": 1.9998971097855372e-05, + "loss": 0.4695, + "step": 520 + }, + { + "epoch": 0.049080332540448884, + "grad_norm": 1.1787139177322388, + "learning_rate": 1.999894932313313e-05, + "loss": 0.5033, + "step": 521 + }, + { + "epoch": 0.04917453663361673, + "grad_norm": 0.9743266701698303, + "learning_rate": 1.999892732041938e-05, + "loss": 0.3751, + "step": 522 + }, + { + "epoch": 0.049268740726784575, + "grad_norm": 1.13385009765625, + "learning_rate": 1.9998905089714622e-05, + "loss": 0.4557, + "step": 523 + }, + { + "epoch": 0.049362944819952424, + "grad_norm": 1.087335467338562, + "learning_rate": 1.9998882631019366e-05, + "loss": 0.4222, + "step": 524 + }, + { + "epoch": 0.04945714891312027, + "grad_norm": 1.0518101453781128, + "learning_rate": 1.9998859944334123e-05, + "loss": 0.4415, + "step": 525 + }, + { + "epoch": 0.04955135300628812, + "grad_norm": 1.0453583002090454, + "learning_rate": 1.9998837029659408e-05, + "loss": 0.4615, + "step": 526 + }, + { + "epoch": 0.04964555709945597, + "grad_norm": 1.1417620182037354, + "learning_rate": 1.9998813886995746e-05, + "loss": 0.4675, + "step": 527 + }, + { + "epoch": 0.04973976119262382, + "grad_norm": 1.0332422256469727, + "learning_rate": 1.9998790516343666e-05, + "loss": 0.4849, + "step": 528 + }, + { + "epoch": 0.04983396528579167, + "grad_norm": 1.053110122680664, + "learning_rate": 1.9998766917703697e-05, + "loss": 0.5184, + "step": 529 + }, + { + "epoch": 0.04992816937895952, + "grad_norm": 1.0806505680084229, + "learning_rate": 1.999874309107638e-05, + "loss": 0.4514, + "step": 530 + }, + { + "epoch": 0.05002237347212737, + "grad_norm": 1.0935556888580322, + "learning_rate": 1.9998719036462255e-05, + "loss": 0.5339, + "step": 531 + }, + { + "epoch": 0.05011657756529521, + "grad_norm": 0.9729276895523071, + "learning_rate": 1.9998694753861873e-05, + "loss": 0.445, + "step": 532 + }, + { + "epoch": 0.05021078165846306, + "grad_norm": 1.0580323934555054, + "learning_rate": 1.9998670243275787e-05, + "loss": 0.5206, + "step": 533 + }, + { + "epoch": 0.05030498575163091, + "grad_norm": 1.11548912525177, + "learning_rate": 1.999864550470456e-05, + "loss": 0.5121, + "step": 534 + }, + { + "epoch": 0.050399189844798756, + "grad_norm": 1.0575588941574097, + "learning_rate": 1.999862053814875e-05, + "loss": 0.4532, + "step": 535 + }, + { + "epoch": 0.050493393937966605, + "grad_norm": 1.0175671577453613, + "learning_rate": 1.999859534360893e-05, + "loss": 0.4673, + "step": 536 + }, + { + "epoch": 0.050587598031134454, + "grad_norm": 1.1149822473526, + "learning_rate": 1.9998569921085667e-05, + "loss": 0.519, + "step": 537 + }, + { + "epoch": 0.0506818021243023, + "grad_norm": 1.0861210823059082, + "learning_rate": 1.999854427057955e-05, + "loss": 0.4713, + "step": 538 + }, + { + "epoch": 0.05077600621747015, + "grad_norm": 1.114902377128601, + "learning_rate": 1.9998518392091163e-05, + "loss": 0.5145, + "step": 539 + }, + { + "epoch": 0.050870210310638, + "grad_norm": 0.9747793674468994, + "learning_rate": 1.9998492285621092e-05, + "loss": 0.461, + "step": 540 + }, + { + "epoch": 0.05096441440380584, + "grad_norm": 1.0336741209030151, + "learning_rate": 1.9998465951169935e-05, + "loss": 0.5079, + "step": 541 + }, + { + "epoch": 0.05105861849697369, + "grad_norm": 1.056095838546753, + "learning_rate": 1.999843938873829e-05, + "loss": 0.4316, + "step": 542 + }, + { + "epoch": 0.05115282259014154, + "grad_norm": 1.0102380514144897, + "learning_rate": 1.9998412598326765e-05, + "loss": 0.5051, + "step": 543 + }, + { + "epoch": 0.05124702668330939, + "grad_norm": 1.142443299293518, + "learning_rate": 1.9998385579935968e-05, + "loss": 0.4933, + "step": 544 + }, + { + "epoch": 0.05134123077647724, + "grad_norm": 1.115990161895752, + "learning_rate": 1.9998358333566518e-05, + "loss": 0.5089, + "step": 545 + }, + { + "epoch": 0.05143543486964509, + "grad_norm": 1.1316367387771606, + "learning_rate": 1.9998330859219037e-05, + "loss": 0.4447, + "step": 546 + }, + { + "epoch": 0.051529638962812936, + "grad_norm": 1.1623283624649048, + "learning_rate": 1.999830315689415e-05, + "loss": 0.4728, + "step": 547 + }, + { + "epoch": 0.051623843055980785, + "grad_norm": 1.0471458435058594, + "learning_rate": 1.9998275226592487e-05, + "loss": 0.4345, + "step": 548 + }, + { + "epoch": 0.05171804714914863, + "grad_norm": 0.9686200022697449, + "learning_rate": 1.9998247068314684e-05, + "loss": 0.4044, + "step": 549 + }, + { + "epoch": 0.051812251242316476, + "grad_norm": 1.1127114295959473, + "learning_rate": 1.999821868206139e-05, + "loss": 0.448, + "step": 550 + }, + { + "epoch": 0.051906455335484325, + "grad_norm": 0.9898568987846375, + "learning_rate": 1.999819006783324e-05, + "loss": 0.425, + "step": 551 + }, + { + "epoch": 0.052000659428652174, + "grad_norm": 1.0763282775878906, + "learning_rate": 1.99981612256309e-05, + "loss": 0.4433, + "step": 552 + }, + { + "epoch": 0.05209486352182002, + "grad_norm": 1.0753988027572632, + "learning_rate": 1.999813215545502e-05, + "loss": 0.4463, + "step": 553 + }, + { + "epoch": 0.05218906761498787, + "grad_norm": 1.0426037311553955, + "learning_rate": 1.9998102857306264e-05, + "loss": 0.394, + "step": 554 + }, + { + "epoch": 0.05228327170815572, + "grad_norm": 0.9977891445159912, + "learning_rate": 1.9998073331185305e-05, + "loss": 0.4555, + "step": 555 + }, + { + "epoch": 0.05237747580132357, + "grad_norm": 1.111856460571289, + "learning_rate": 1.999804357709281e-05, + "loss": 0.4504, + "step": 556 + }, + { + "epoch": 0.05247167989449142, + "grad_norm": 1.0742316246032715, + "learning_rate": 1.9998013595029454e-05, + "loss": 0.4027, + "step": 557 + }, + { + "epoch": 0.05256588398765926, + "grad_norm": 1.355398178100586, + "learning_rate": 1.9997983384995927e-05, + "loss": 0.5059, + "step": 558 + }, + { + "epoch": 0.05266008808082711, + "grad_norm": 1.0624873638153076, + "learning_rate": 1.999795294699292e-05, + "loss": 0.4277, + "step": 559 + }, + { + "epoch": 0.05275429217399496, + "grad_norm": 1.124698281288147, + "learning_rate": 1.999792228102112e-05, + "loss": 0.434, + "step": 560 + }, + { + "epoch": 0.05284849626716281, + "grad_norm": 0.9644001722335815, + "learning_rate": 1.9997891387081235e-05, + "loss": 0.3996, + "step": 561 + }, + { + "epoch": 0.052942700360330656, + "grad_norm": 1.0603997707366943, + "learning_rate": 1.999786026517396e-05, + "loss": 0.4611, + "step": 562 + }, + { + "epoch": 0.053036904453498505, + "grad_norm": 1.1700316667556763, + "learning_rate": 1.999782891530001e-05, + "loss": 0.4712, + "step": 563 + }, + { + "epoch": 0.053131108546666354, + "grad_norm": 1.2461744546890259, + "learning_rate": 1.99977973374601e-05, + "loss": 0.4314, + "step": 564 + }, + { + "epoch": 0.0532253126398342, + "grad_norm": 1.1271206140518188, + "learning_rate": 1.9997765531654945e-05, + "loss": 0.4393, + "step": 565 + }, + { + "epoch": 0.05331951673300205, + "grad_norm": 1.1461455821990967, + "learning_rate": 1.999773349788528e-05, + "loss": 0.4805, + "step": 566 + }, + { + "epoch": 0.053413720826169894, + "grad_norm": 0.9542377591133118, + "learning_rate": 1.9997701236151826e-05, + "loss": 0.4144, + "step": 567 + }, + { + "epoch": 0.05350792491933774, + "grad_norm": 1.0481843948364258, + "learning_rate": 1.9997668746455322e-05, + "loss": 0.44, + "step": 568 + }, + { + "epoch": 0.05360212901250559, + "grad_norm": 1.0989118814468384, + "learning_rate": 1.999763602879651e-05, + "loss": 0.469, + "step": 569 + }, + { + "epoch": 0.05369633310567344, + "grad_norm": 0.987362802028656, + "learning_rate": 1.9997603083176136e-05, + "loss": 0.4572, + "step": 570 + }, + { + "epoch": 0.05379053719884129, + "grad_norm": 0.9620579481124878, + "learning_rate": 1.9997569909594948e-05, + "loss": 0.4203, + "step": 571 + }, + { + "epoch": 0.05388474129200914, + "grad_norm": 1.0838489532470703, + "learning_rate": 1.9997536508053704e-05, + "loss": 0.4737, + "step": 572 + }, + { + "epoch": 0.05397894538517699, + "grad_norm": 0.9939470291137695, + "learning_rate": 1.999750287855317e-05, + "loss": 0.4097, + "step": 573 + }, + { + "epoch": 0.05407314947834484, + "grad_norm": 0.970329999923706, + "learning_rate": 1.9997469021094103e-05, + "loss": 0.422, + "step": 574 + }, + { + "epoch": 0.05416735357151268, + "grad_norm": 1.0482878684997559, + "learning_rate": 1.9997434935677285e-05, + "loss": 0.4607, + "step": 575 + }, + { + "epoch": 0.05426155766468053, + "grad_norm": 1.0672898292541504, + "learning_rate": 1.9997400622303488e-05, + "loss": 0.4587, + "step": 576 + }, + { + "epoch": 0.05435576175784838, + "grad_norm": 1.1420453786849976, + "learning_rate": 1.9997366080973493e-05, + "loss": 0.5164, + "step": 577 + }, + { + "epoch": 0.054449965851016226, + "grad_norm": 1.1007829904556274, + "learning_rate": 1.9997331311688095e-05, + "loss": 0.5201, + "step": 578 + }, + { + "epoch": 0.054544169944184075, + "grad_norm": 0.9502580761909485, + "learning_rate": 1.999729631444808e-05, + "loss": 0.4637, + "step": 579 + }, + { + "epoch": 0.054638374037351924, + "grad_norm": 1.0256754159927368, + "learning_rate": 1.9997261089254246e-05, + "loss": 0.431, + "step": 580 + }, + { + "epoch": 0.05473257813051977, + "grad_norm": 1.1105985641479492, + "learning_rate": 1.9997225636107402e-05, + "loss": 0.479, + "step": 581 + }, + { + "epoch": 0.05482678222368762, + "grad_norm": 1.2762510776519775, + "learning_rate": 1.999718995500835e-05, + "loss": 0.5189, + "step": 582 + }, + { + "epoch": 0.05492098631685547, + "grad_norm": 1.0495827198028564, + "learning_rate": 1.9997154045957903e-05, + "loss": 0.4846, + "step": 583 + }, + { + "epoch": 0.05501519041002331, + "grad_norm": 1.1202806234359741, + "learning_rate": 1.999711790895689e-05, + "loss": 0.4902, + "step": 584 + }, + { + "epoch": 0.05510939450319116, + "grad_norm": 1.0130805969238281, + "learning_rate": 1.9997081544006124e-05, + "loss": 0.4476, + "step": 585 + }, + { + "epoch": 0.05520359859635901, + "grad_norm": 1.071753740310669, + "learning_rate": 1.9997044951106438e-05, + "loss": 0.473, + "step": 586 + }, + { + "epoch": 0.05529780268952686, + "grad_norm": 1.1382759809494019, + "learning_rate": 1.9997008130258665e-05, + "loss": 0.4777, + "step": 587 + }, + { + "epoch": 0.05539200678269471, + "grad_norm": 1.0548356771469116, + "learning_rate": 1.9996971081463647e-05, + "loss": 0.445, + "step": 588 + }, + { + "epoch": 0.05548621087586256, + "grad_norm": 1.115378975868225, + "learning_rate": 1.999693380472223e-05, + "loss": 0.4967, + "step": 589 + }, + { + "epoch": 0.055580414969030406, + "grad_norm": 1.1774591207504272, + "learning_rate": 1.999689630003526e-05, + "loss": 0.4609, + "step": 590 + }, + { + "epoch": 0.055674619062198255, + "grad_norm": 1.0086830854415894, + "learning_rate": 1.9996858567403593e-05, + "loss": 0.4331, + "step": 591 + }, + { + "epoch": 0.055768823155366104, + "grad_norm": 0.9901698231697083, + "learning_rate": 1.999682060682809e-05, + "loss": 0.4519, + "step": 592 + }, + { + "epoch": 0.055863027248533946, + "grad_norm": 1.1610777378082275, + "learning_rate": 1.999678241830962e-05, + "loss": 0.4579, + "step": 593 + }, + { + "epoch": 0.055957231341701795, + "grad_norm": 1.0401917695999146, + "learning_rate": 1.9996744001849047e-05, + "loss": 0.4461, + "step": 594 + }, + { + "epoch": 0.056051435434869644, + "grad_norm": 1.081620216369629, + "learning_rate": 1.999670535744725e-05, + "loss": 0.4835, + "step": 595 + }, + { + "epoch": 0.05614563952803749, + "grad_norm": 1.0446057319641113, + "learning_rate": 1.9996666485105115e-05, + "loss": 0.456, + "step": 596 + }, + { + "epoch": 0.05623984362120534, + "grad_norm": 1.1298969984054565, + "learning_rate": 1.9996627384823522e-05, + "loss": 0.4787, + "step": 597 + }, + { + "epoch": 0.05633404771437319, + "grad_norm": 1.162839651107788, + "learning_rate": 1.999658805660336e-05, + "loss": 0.3753, + "step": 598 + }, + { + "epoch": 0.05642825180754104, + "grad_norm": 1.0797040462493896, + "learning_rate": 1.9996548500445536e-05, + "loss": 0.4798, + "step": 599 + }, + { + "epoch": 0.05652245590070889, + "grad_norm": 0.9650039076805115, + "learning_rate": 1.9996508716350945e-05, + "loss": 0.498, + "step": 600 + }, + { + "epoch": 0.05661665999387673, + "grad_norm": 1.0434175729751587, + "learning_rate": 1.9996468704320496e-05, + "loss": 0.4426, + "step": 601 + }, + { + "epoch": 0.05671086408704458, + "grad_norm": 1.0903749465942383, + "learning_rate": 1.9996428464355097e-05, + "loss": 0.4484, + "step": 602 + }, + { + "epoch": 0.05680506818021243, + "grad_norm": 1.168329119682312, + "learning_rate": 1.9996387996455673e-05, + "loss": 0.4613, + "step": 603 + }, + { + "epoch": 0.05689927227338028, + "grad_norm": 1.1196221113204956, + "learning_rate": 1.9996347300623142e-05, + "loss": 0.4536, + "step": 604 + }, + { + "epoch": 0.056993476366548126, + "grad_norm": 1.053094506263733, + "learning_rate": 1.9996306376858433e-05, + "loss": 0.4514, + "step": 605 + }, + { + "epoch": 0.057087680459715975, + "grad_norm": 0.9870381951332092, + "learning_rate": 1.999626522516248e-05, + "loss": 0.4702, + "step": 606 + }, + { + "epoch": 0.057181884552883824, + "grad_norm": 1.0807932615280151, + "learning_rate": 1.9996223845536216e-05, + "loss": 0.4557, + "step": 607 + }, + { + "epoch": 0.05727608864605167, + "grad_norm": 1.026733160018921, + "learning_rate": 1.9996182237980592e-05, + "loss": 0.4836, + "step": 608 + }, + { + "epoch": 0.05737029273921952, + "grad_norm": 1.0263826847076416, + "learning_rate": 1.9996140402496554e-05, + "loss": 0.4547, + "step": 609 + }, + { + "epoch": 0.057464496832387364, + "grad_norm": 1.1398074626922607, + "learning_rate": 1.9996098339085054e-05, + "loss": 0.4958, + "step": 610 + }, + { + "epoch": 0.05755870092555521, + "grad_norm": 1.3231003284454346, + "learning_rate": 1.9996056047747054e-05, + "loss": 0.4774, + "step": 611 + }, + { + "epoch": 0.05765290501872306, + "grad_norm": 0.9639459848403931, + "learning_rate": 1.999601352848352e-05, + "loss": 0.3985, + "step": 612 + }, + { + "epoch": 0.05774710911189091, + "grad_norm": 1.007828950881958, + "learning_rate": 1.9995970781295412e-05, + "loss": 0.3997, + "step": 613 + }, + { + "epoch": 0.05784131320505876, + "grad_norm": 1.2313796281814575, + "learning_rate": 1.9995927806183713e-05, + "loss": 0.4611, + "step": 614 + }, + { + "epoch": 0.05793551729822661, + "grad_norm": 1.1767758131027222, + "learning_rate": 1.9995884603149403e-05, + "loss": 0.4646, + "step": 615 + }, + { + "epoch": 0.05802972139139446, + "grad_norm": 1.0385993719100952, + "learning_rate": 1.9995841172193465e-05, + "loss": 0.481, + "step": 616 + }, + { + "epoch": 0.05812392548456231, + "grad_norm": 1.0699119567871094, + "learning_rate": 1.999579751331689e-05, + "loss": 0.4517, + "step": 617 + }, + { + "epoch": 0.058218129577730156, + "grad_norm": 1.3174458742141724, + "learning_rate": 1.999575362652067e-05, + "loss": 0.5169, + "step": 618 + }, + { + "epoch": 0.058312333670898, + "grad_norm": 1.0863970518112183, + "learning_rate": 1.999570951180581e-05, + "loss": 0.4659, + "step": 619 + }, + { + "epoch": 0.05840653776406585, + "grad_norm": 1.1128531694412231, + "learning_rate": 1.9995665169173313e-05, + "loss": 0.4469, + "step": 620 + }, + { + "epoch": 0.058500741857233696, + "grad_norm": 1.2424368858337402, + "learning_rate": 1.999562059862419e-05, + "loss": 0.4681, + "step": 621 + }, + { + "epoch": 0.058594945950401545, + "grad_norm": 0.9890236854553223, + "learning_rate": 1.9995575800159462e-05, + "loss": 0.3875, + "step": 622 + }, + { + "epoch": 0.058689150043569394, + "grad_norm": 1.1729191541671753, + "learning_rate": 1.999553077378015e-05, + "loss": 0.4408, + "step": 623 + }, + { + "epoch": 0.05878335413673724, + "grad_norm": 1.2006512880325317, + "learning_rate": 1.9995485519487276e-05, + "loss": 0.4509, + "step": 624 + }, + { + "epoch": 0.05887755822990509, + "grad_norm": 0.9693123698234558, + "learning_rate": 1.9995440037281872e-05, + "loss": 0.3543, + "step": 625 + }, + { + "epoch": 0.05897176232307294, + "grad_norm": 1.1058598756790161, + "learning_rate": 1.999539432716498e-05, + "loss": 0.4959, + "step": 626 + }, + { + "epoch": 0.05906596641624078, + "grad_norm": 1.1529923677444458, + "learning_rate": 1.9995348389137635e-05, + "loss": 0.4325, + "step": 627 + }, + { + "epoch": 0.05916017050940863, + "grad_norm": 1.079768180847168, + "learning_rate": 1.9995302223200893e-05, + "loss": 0.4196, + "step": 628 + }, + { + "epoch": 0.05925437460257648, + "grad_norm": 1.0184563398361206, + "learning_rate": 1.9995255829355798e-05, + "loss": 0.4032, + "step": 629 + }, + { + "epoch": 0.05934857869574433, + "grad_norm": 1.1753013134002686, + "learning_rate": 1.9995209207603414e-05, + "loss": 0.4522, + "step": 630 + }, + { + "epoch": 0.05944278278891218, + "grad_norm": 1.1616015434265137, + "learning_rate": 1.99951623579448e-05, + "loss": 0.4814, + "step": 631 + }, + { + "epoch": 0.05953698688208003, + "grad_norm": 1.0196958780288696, + "learning_rate": 1.999511528038103e-05, + "loss": 0.3906, + "step": 632 + }, + { + "epoch": 0.059631190975247876, + "grad_norm": 1.0535567998886108, + "learning_rate": 1.9995067974913175e-05, + "loss": 0.474, + "step": 633 + }, + { + "epoch": 0.059725395068415725, + "grad_norm": 1.2080250978469849, + "learning_rate": 1.999502044154231e-05, + "loss": 0.499, + "step": 634 + }, + { + "epoch": 0.059819599161583574, + "grad_norm": 1.0950868129730225, + "learning_rate": 1.9994972680269518e-05, + "loss": 0.4774, + "step": 635 + }, + { + "epoch": 0.059913803254751416, + "grad_norm": 0.9520348310470581, + "learning_rate": 1.9994924691095896e-05, + "loss": 0.4219, + "step": 636 + }, + { + "epoch": 0.060008007347919265, + "grad_norm": 1.1531012058258057, + "learning_rate": 1.9994876474022533e-05, + "loss": 0.5586, + "step": 637 + }, + { + "epoch": 0.060102211441087114, + "grad_norm": 1.3068886995315552, + "learning_rate": 1.9994828029050527e-05, + "loss": 0.5327, + "step": 638 + }, + { + "epoch": 0.06019641553425496, + "grad_norm": 0.9474340677261353, + "learning_rate": 1.9994779356180986e-05, + "loss": 0.3797, + "step": 639 + }, + { + "epoch": 0.06029061962742281, + "grad_norm": 1.1141210794448853, + "learning_rate": 1.999473045541502e-05, + "loss": 0.429, + "step": 640 + }, + { + "epoch": 0.06038482372059066, + "grad_norm": 0.9722087383270264, + "learning_rate": 1.999468132675374e-05, + "loss": 0.4279, + "step": 641 + }, + { + "epoch": 0.06047902781375851, + "grad_norm": 0.8949450254440308, + "learning_rate": 1.9994631970198268e-05, + "loss": 0.3797, + "step": 642 + }, + { + "epoch": 0.06057323190692636, + "grad_norm": 1.0000451803207397, + "learning_rate": 1.9994582385749735e-05, + "loss": 0.4243, + "step": 643 + }, + { + "epoch": 0.06066743600009421, + "grad_norm": 0.9662757515907288, + "learning_rate": 1.999453257340926e-05, + "loss": 0.4269, + "step": 644 + }, + { + "epoch": 0.06076164009326205, + "grad_norm": 1.0271151065826416, + "learning_rate": 1.999448253317799e-05, + "loss": 0.4215, + "step": 645 + }, + { + "epoch": 0.0608558441864299, + "grad_norm": 1.0444293022155762, + "learning_rate": 1.9994432265057064e-05, + "loss": 0.4495, + "step": 646 + }, + { + "epoch": 0.06095004827959775, + "grad_norm": 0.9824338555335999, + "learning_rate": 1.999438176904762e-05, + "loss": 0.442, + "step": 647 + }, + { + "epoch": 0.061044252372765596, + "grad_norm": 1.0066092014312744, + "learning_rate": 1.9994331045150814e-05, + "loss": 0.4283, + "step": 648 + }, + { + "epoch": 0.061138456465933445, + "grad_norm": 1.1279113292694092, + "learning_rate": 1.999428009336781e-05, + "loss": 0.5208, + "step": 649 + }, + { + "epoch": 0.061232660559101294, + "grad_norm": 1.0053372383117676, + "learning_rate": 1.999422891369976e-05, + "loss": 0.4212, + "step": 650 + }, + { + "epoch": 0.06132686465226914, + "grad_norm": 1.0733355283737183, + "learning_rate": 1.9994177506147834e-05, + "loss": 0.461, + "step": 651 + }, + { + "epoch": 0.06142106874543699, + "grad_norm": 1.1090284585952759, + "learning_rate": 1.9994125870713207e-05, + "loss": 0.4763, + "step": 652 + }, + { + "epoch": 0.061515272838604834, + "grad_norm": 1.0557955503463745, + "learning_rate": 1.999407400739705e-05, + "loss": 0.4282, + "step": 653 + }, + { + "epoch": 0.06160947693177268, + "grad_norm": 0.9294447898864746, + "learning_rate": 1.999402191620055e-05, + "loss": 0.4095, + "step": 654 + }, + { + "epoch": 0.06170368102494053, + "grad_norm": 1.004593014717102, + "learning_rate": 1.9993969597124896e-05, + "loss": 0.4279, + "step": 655 + }, + { + "epoch": 0.06179788511810838, + "grad_norm": 1.2182408571243286, + "learning_rate": 1.999391705017128e-05, + "loss": 0.4886, + "step": 656 + }, + { + "epoch": 0.06189208921127623, + "grad_norm": 1.0181046724319458, + "learning_rate": 1.99938642753409e-05, + "loss": 0.4573, + "step": 657 + }, + { + "epoch": 0.06198629330444408, + "grad_norm": 1.008346676826477, + "learning_rate": 1.9993811272634954e-05, + "loss": 0.4597, + "step": 658 + }, + { + "epoch": 0.06208049739761193, + "grad_norm": 1.1677072048187256, + "learning_rate": 1.999375804205466e-05, + "loss": 0.4695, + "step": 659 + }, + { + "epoch": 0.06217470149077978, + "grad_norm": 0.9776001572608948, + "learning_rate": 1.9993704583601224e-05, + "loss": 0.4482, + "step": 660 + }, + { + "epoch": 0.062268905583947626, + "grad_norm": 0.99480140209198, + "learning_rate": 1.999365089727587e-05, + "loss": 0.4439, + "step": 661 + }, + { + "epoch": 0.06236310967711547, + "grad_norm": 0.997078537940979, + "learning_rate": 1.999359698307982e-05, + "loss": 0.4439, + "step": 662 + }, + { + "epoch": 0.06245731377028332, + "grad_norm": 1.0121581554412842, + "learning_rate": 1.9993542841014303e-05, + "loss": 0.4432, + "step": 663 + }, + { + "epoch": 0.06255151786345117, + "grad_norm": 1.0929957628250122, + "learning_rate": 1.9993488471080553e-05, + "loss": 0.4819, + "step": 664 + }, + { + "epoch": 0.06264572195661902, + "grad_norm": 1.1690179109573364, + "learning_rate": 1.999343387327981e-05, + "loss": 0.4252, + "step": 665 + }, + { + "epoch": 0.06273992604978686, + "grad_norm": 0.9591436386108398, + "learning_rate": 1.999337904761332e-05, + "loss": 0.438, + "step": 666 + }, + { + "epoch": 0.0628341301429547, + "grad_norm": 0.9770983457565308, + "learning_rate": 1.9993323994082336e-05, + "loss": 0.469, + "step": 667 + }, + { + "epoch": 0.06292833423612255, + "grad_norm": 1.049344539642334, + "learning_rate": 1.9993268712688104e-05, + "loss": 0.4646, + "step": 668 + }, + { + "epoch": 0.0630225383292904, + "grad_norm": 1.0194511413574219, + "learning_rate": 1.9993213203431895e-05, + "loss": 0.4682, + "step": 669 + }, + { + "epoch": 0.06311674242245825, + "grad_norm": 0.9602471590042114, + "learning_rate": 1.999315746631497e-05, + "loss": 0.4507, + "step": 670 + }, + { + "epoch": 0.0632109465156261, + "grad_norm": 1.0650478601455688, + "learning_rate": 1.99931015013386e-05, + "loss": 0.5069, + "step": 671 + }, + { + "epoch": 0.06330515060879395, + "grad_norm": 0.924329936504364, + "learning_rate": 1.999304530850406e-05, + "loss": 0.4437, + "step": 672 + }, + { + "epoch": 0.0633993547019618, + "grad_norm": 1.0018926858901978, + "learning_rate": 1.999298888781263e-05, + "loss": 0.45, + "step": 673 + }, + { + "epoch": 0.06349355879512965, + "grad_norm": 0.9459987878799438, + "learning_rate": 1.9992932239265602e-05, + "loss": 0.4066, + "step": 674 + }, + { + "epoch": 0.0635877628882975, + "grad_norm": 0.9754473567008972, + "learning_rate": 1.9992875362864267e-05, + "loss": 0.4299, + "step": 675 + }, + { + "epoch": 0.06368196698146535, + "grad_norm": 1.0223201513290405, + "learning_rate": 1.9992818258609915e-05, + "loss": 0.4062, + "step": 676 + }, + { + "epoch": 0.0637761710746332, + "grad_norm": 0.9518370628356934, + "learning_rate": 1.9992760926503855e-05, + "loss": 0.4693, + "step": 677 + }, + { + "epoch": 0.06387037516780104, + "grad_norm": 1.127358078956604, + "learning_rate": 1.999270336654739e-05, + "loss": 0.4657, + "step": 678 + }, + { + "epoch": 0.06396457926096889, + "grad_norm": 0.9908960461616516, + "learning_rate": 1.9992645578741836e-05, + "loss": 0.4431, + "step": 679 + }, + { + "epoch": 0.06405878335413674, + "grad_norm": 0.9333703517913818, + "learning_rate": 1.999258756308851e-05, + "loss": 0.4195, + "step": 680 + }, + { + "epoch": 0.06415298744730459, + "grad_norm": 1.0626039505004883, + "learning_rate": 1.999252931958873e-05, + "loss": 0.5016, + "step": 681 + }, + { + "epoch": 0.06424719154047244, + "grad_norm": 1.1983187198638916, + "learning_rate": 1.999247084824383e-05, + "loss": 0.5111, + "step": 682 + }, + { + "epoch": 0.06434139563364027, + "grad_norm": 1.1836464405059814, + "learning_rate": 1.999241214905514e-05, + "loss": 0.4823, + "step": 683 + }, + { + "epoch": 0.06443559972680812, + "grad_norm": 1.056818962097168, + "learning_rate": 1.9992353222023998e-05, + "loss": 0.4278, + "step": 684 + }, + { + "epoch": 0.06452980381997597, + "grad_norm": 0.8685744404792786, + "learning_rate": 1.999229406715175e-05, + "loss": 0.4094, + "step": 685 + }, + { + "epoch": 0.06462400791314382, + "grad_norm": 1.0641740560531616, + "learning_rate": 1.9992234684439746e-05, + "loss": 0.4378, + "step": 686 + }, + { + "epoch": 0.06471821200631167, + "grad_norm": 0.9946682453155518, + "learning_rate": 1.999217507388934e-05, + "loss": 0.4648, + "step": 687 + }, + { + "epoch": 0.06481241609947952, + "grad_norm": 1.04335355758667, + "learning_rate": 1.9992115235501884e-05, + "loss": 0.4465, + "step": 688 + }, + { + "epoch": 0.06490662019264737, + "grad_norm": 0.8747390508651733, + "learning_rate": 1.9992055169278747e-05, + "loss": 0.4335, + "step": 689 + }, + { + "epoch": 0.06500082428581522, + "grad_norm": 0.9814894795417786, + "learning_rate": 1.9991994875221304e-05, + "loss": 0.4501, + "step": 690 + }, + { + "epoch": 0.06509502837898307, + "grad_norm": 1.0981602668762207, + "learning_rate": 1.9991934353330924e-05, + "loss": 0.5084, + "step": 691 + }, + { + "epoch": 0.06518923247215092, + "grad_norm": 0.9652166962623596, + "learning_rate": 1.9991873603608984e-05, + "loss": 0.4426, + "step": 692 + }, + { + "epoch": 0.06528343656531876, + "grad_norm": 0.9243811964988708, + "learning_rate": 1.9991812626056878e-05, + "loss": 0.4351, + "step": 693 + }, + { + "epoch": 0.06537764065848661, + "grad_norm": 1.0646644830703735, + "learning_rate": 1.999175142067599e-05, + "loss": 0.4961, + "step": 694 + }, + { + "epoch": 0.06547184475165446, + "grad_norm": 0.9408712387084961, + "learning_rate": 1.9991689987467714e-05, + "loss": 0.4422, + "step": 695 + }, + { + "epoch": 0.06556604884482231, + "grad_norm": 0.9736141562461853, + "learning_rate": 1.9991628326433457e-05, + "loss": 0.4093, + "step": 696 + }, + { + "epoch": 0.06566025293799016, + "grad_norm": 1.0736565589904785, + "learning_rate": 1.999156643757462e-05, + "loss": 0.4323, + "step": 697 + }, + { + "epoch": 0.06575445703115801, + "grad_norm": 1.196263313293457, + "learning_rate": 1.9991504320892616e-05, + "loss": 0.3956, + "step": 698 + }, + { + "epoch": 0.06584866112432586, + "grad_norm": 0.8757830858230591, + "learning_rate": 1.999144197638886e-05, + "loss": 0.388, + "step": 699 + }, + { + "epoch": 0.0659428652174937, + "grad_norm": 0.9341623187065125, + "learning_rate": 1.9991379404064778e-05, + "loss": 0.3855, + "step": 700 + }, + { + "epoch": 0.06603706931066154, + "grad_norm": 0.9324865341186523, + "learning_rate": 1.9991316603921793e-05, + "loss": 0.3909, + "step": 701 + }, + { + "epoch": 0.06613127340382939, + "grad_norm": 1.286978840827942, + "learning_rate": 1.9991253575961338e-05, + "loss": 0.4446, + "step": 702 + }, + { + "epoch": 0.06622547749699724, + "grad_norm": 1.165391445159912, + "learning_rate": 1.999119032018485e-05, + "loss": 0.4551, + "step": 703 + }, + { + "epoch": 0.06631968159016509, + "grad_norm": 1.060238003730774, + "learning_rate": 1.9991126836593768e-05, + "loss": 0.497, + "step": 704 + }, + { + "epoch": 0.06641388568333294, + "grad_norm": 1.016146183013916, + "learning_rate": 1.9991063125189546e-05, + "loss": 0.4318, + "step": 705 + }, + { + "epoch": 0.06650808977650079, + "grad_norm": 1.0532314777374268, + "learning_rate": 1.999099918597363e-05, + "loss": 0.4218, + "step": 706 + }, + { + "epoch": 0.06660229386966864, + "grad_norm": 1.1155513525009155, + "learning_rate": 1.9990935018947484e-05, + "loss": 0.4937, + "step": 707 + }, + { + "epoch": 0.06669649796283648, + "grad_norm": 1.0174504518508911, + "learning_rate": 1.9990870624112564e-05, + "loss": 0.4627, + "step": 708 + }, + { + "epoch": 0.06679070205600433, + "grad_norm": 0.975045919418335, + "learning_rate": 1.9990806001470346e-05, + "loss": 0.4542, + "step": 709 + }, + { + "epoch": 0.06688490614917218, + "grad_norm": 0.984332799911499, + "learning_rate": 1.9990741151022302e-05, + "loss": 0.429, + "step": 710 + }, + { + "epoch": 0.06697911024234003, + "grad_norm": 1.034583330154419, + "learning_rate": 1.9990676072769904e-05, + "loss": 0.4532, + "step": 711 + }, + { + "epoch": 0.06707331433550788, + "grad_norm": 1.0927097797393799, + "learning_rate": 1.9990610766714646e-05, + "loss": 0.4634, + "step": 712 + }, + { + "epoch": 0.06716751842867573, + "grad_norm": 1.004805088043213, + "learning_rate": 1.9990545232858008e-05, + "loss": 0.492, + "step": 713 + }, + { + "epoch": 0.06726172252184358, + "grad_norm": 1.055342674255371, + "learning_rate": 1.9990479471201488e-05, + "loss": 0.4621, + "step": 714 + }, + { + "epoch": 0.06735592661501143, + "grad_norm": 1.1024476289749146, + "learning_rate": 1.9990413481746587e-05, + "loss": 0.4413, + "step": 715 + }, + { + "epoch": 0.06745013070817928, + "grad_norm": 0.9659948945045471, + "learning_rate": 1.9990347264494806e-05, + "loss": 0.425, + "step": 716 + }, + { + "epoch": 0.06754433480134713, + "grad_norm": 0.9982300400733948, + "learning_rate": 1.9990280819447662e-05, + "loss": 0.4722, + "step": 717 + }, + { + "epoch": 0.06763853889451496, + "grad_norm": 1.2878872156143188, + "learning_rate": 1.999021414660666e-05, + "loss": 0.4372, + "step": 718 + }, + { + "epoch": 0.06773274298768281, + "grad_norm": 1.1230915784835815, + "learning_rate": 1.999014724597333e-05, + "loss": 0.5018, + "step": 719 + }, + { + "epoch": 0.06782694708085066, + "grad_norm": 1.0017739534378052, + "learning_rate": 1.9990080117549188e-05, + "loss": 0.4873, + "step": 720 + }, + { + "epoch": 0.06792115117401851, + "grad_norm": 1.1177877187728882, + "learning_rate": 1.999001276133577e-05, + "loss": 0.5517, + "step": 721 + }, + { + "epoch": 0.06801535526718636, + "grad_norm": 1.0998467206954956, + "learning_rate": 1.9989945177334614e-05, + "loss": 0.5142, + "step": 722 + }, + { + "epoch": 0.0681095593603542, + "grad_norm": 0.9907771944999695, + "learning_rate": 1.9989877365547253e-05, + "loss": 0.4245, + "step": 723 + }, + { + "epoch": 0.06820376345352205, + "grad_norm": 1.0315520763397217, + "learning_rate": 1.9989809325975244e-05, + "loss": 0.4571, + "step": 724 + }, + { + "epoch": 0.0682979675466899, + "grad_norm": 0.9746618270874023, + "learning_rate": 1.998974105862013e-05, + "loss": 0.4492, + "step": 725 + }, + { + "epoch": 0.06839217163985775, + "grad_norm": 0.9446602463722229, + "learning_rate": 1.998967256348347e-05, + "loss": 0.3837, + "step": 726 + }, + { + "epoch": 0.0684863757330256, + "grad_norm": 1.0078535079956055, + "learning_rate": 1.998960384056683e-05, + "loss": 0.39, + "step": 727 + }, + { + "epoch": 0.06858057982619345, + "grad_norm": 1.0134851932525635, + "learning_rate": 1.998953488987177e-05, + "loss": 0.4689, + "step": 728 + }, + { + "epoch": 0.0686747839193613, + "grad_norm": 1.0376334190368652, + "learning_rate": 1.9989465711399865e-05, + "loss": 0.4566, + "step": 729 + }, + { + "epoch": 0.06876898801252915, + "grad_norm": 1.0177689790725708, + "learning_rate": 1.9989396305152694e-05, + "loss": 0.4607, + "step": 730 + }, + { + "epoch": 0.068863192105697, + "grad_norm": 1.0617927312850952, + "learning_rate": 1.9989326671131837e-05, + "loss": 0.4368, + "step": 731 + }, + { + "epoch": 0.06895739619886485, + "grad_norm": 1.0075178146362305, + "learning_rate": 1.9989256809338885e-05, + "loss": 0.4401, + "step": 732 + }, + { + "epoch": 0.0690516002920327, + "grad_norm": 1.0294811725616455, + "learning_rate": 1.998918671977543e-05, + "loss": 0.4661, + "step": 733 + }, + { + "epoch": 0.06914580438520054, + "grad_norm": 1.0549850463867188, + "learning_rate": 1.9989116402443068e-05, + "loss": 0.4234, + "step": 734 + }, + { + "epoch": 0.06924000847836838, + "grad_norm": 1.123194694519043, + "learning_rate": 1.9989045857343403e-05, + "loss": 0.4594, + "step": 735 + }, + { + "epoch": 0.06933421257153623, + "grad_norm": 1.0410248041152954, + "learning_rate": 1.9988975084478044e-05, + "loss": 0.3936, + "step": 736 + }, + { + "epoch": 0.06942841666470408, + "grad_norm": 0.995914101600647, + "learning_rate": 1.9988904083848603e-05, + "loss": 0.4776, + "step": 737 + }, + { + "epoch": 0.06952262075787193, + "grad_norm": 1.1929165124893188, + "learning_rate": 1.9988832855456705e-05, + "loss": 0.4914, + "step": 738 + }, + { + "epoch": 0.06961682485103977, + "grad_norm": 1.121276617050171, + "learning_rate": 1.9988761399303966e-05, + "loss": 0.502, + "step": 739 + }, + { + "epoch": 0.06971102894420762, + "grad_norm": 0.9790893793106079, + "learning_rate": 1.998868971539202e-05, + "loss": 0.4299, + "step": 740 + }, + { + "epoch": 0.06980523303737547, + "grad_norm": 0.9773557782173157, + "learning_rate": 1.9988617803722503e-05, + "loss": 0.4416, + "step": 741 + }, + { + "epoch": 0.06989943713054332, + "grad_norm": 0.9014050364494324, + "learning_rate": 1.998854566429705e-05, + "loss": 0.4372, + "step": 742 + }, + { + "epoch": 0.06999364122371117, + "grad_norm": 1.0881351232528687, + "learning_rate": 1.998847329711731e-05, + "loss": 0.4556, + "step": 743 + }, + { + "epoch": 0.07008784531687902, + "grad_norm": 1.1250412464141846, + "learning_rate": 1.998840070218493e-05, + "loss": 0.46, + "step": 744 + }, + { + "epoch": 0.07018204941004687, + "grad_norm": 0.9465177059173584, + "learning_rate": 1.9988327879501567e-05, + "loss": 0.4083, + "step": 745 + }, + { + "epoch": 0.07027625350321472, + "grad_norm": 0.93609219789505, + "learning_rate": 1.998825482906888e-05, + "loss": 0.4355, + "step": 746 + }, + { + "epoch": 0.07037045759638257, + "grad_norm": 1.0855962038040161, + "learning_rate": 1.998818155088854e-05, + "loss": 0.4833, + "step": 747 + }, + { + "epoch": 0.07046466168955041, + "grad_norm": 1.063692569732666, + "learning_rate": 1.9988108044962207e-05, + "loss": 0.4584, + "step": 748 + }, + { + "epoch": 0.07055886578271826, + "grad_norm": 1.0598742961883545, + "learning_rate": 1.998803431129157e-05, + "loss": 0.4934, + "step": 749 + }, + { + "epoch": 0.07065306987588611, + "grad_norm": 0.9979044795036316, + "learning_rate": 1.9987960349878302e-05, + "loss": 0.4834, + "step": 750 + }, + { + "epoch": 0.07074727396905396, + "grad_norm": 0.9993002414703369, + "learning_rate": 1.998788616072409e-05, + "loss": 0.3989, + "step": 751 + }, + { + "epoch": 0.07084147806222181, + "grad_norm": 1.0453739166259766, + "learning_rate": 1.9987811743830624e-05, + "loss": 0.4515, + "step": 752 + }, + { + "epoch": 0.07093568215538965, + "grad_norm": 1.0170127153396606, + "learning_rate": 1.998773709919961e-05, + "loss": 0.4276, + "step": 753 + }, + { + "epoch": 0.0710298862485575, + "grad_norm": 0.9998757243156433, + "learning_rate": 1.998766222683274e-05, + "loss": 0.4545, + "step": 754 + }, + { + "epoch": 0.07112409034172534, + "grad_norm": 1.049187183380127, + "learning_rate": 1.9987587126731727e-05, + "loss": 0.4716, + "step": 755 + }, + { + "epoch": 0.07121829443489319, + "grad_norm": 1.0449811220169067, + "learning_rate": 1.998751179889828e-05, + "loss": 0.4916, + "step": 756 + }, + { + "epoch": 0.07131249852806104, + "grad_norm": 0.9422582983970642, + "learning_rate": 1.998743624333412e-05, + "loss": 0.378, + "step": 757 + }, + { + "epoch": 0.07140670262122889, + "grad_norm": 0.9875731468200684, + "learning_rate": 1.9987360460040963e-05, + "loss": 0.3882, + "step": 758 + }, + { + "epoch": 0.07150090671439674, + "grad_norm": 1.00863778591156, + "learning_rate": 1.9987284449020548e-05, + "loss": 0.4507, + "step": 759 + }, + { + "epoch": 0.07159511080756459, + "grad_norm": 1.0163146257400513, + "learning_rate": 1.9987208210274597e-05, + "loss": 0.433, + "step": 760 + }, + { + "epoch": 0.07168931490073244, + "grad_norm": 0.9799811244010925, + "learning_rate": 1.9987131743804858e-05, + "loss": 0.452, + "step": 761 + }, + { + "epoch": 0.07178351899390029, + "grad_norm": 1.0506083965301514, + "learning_rate": 1.9987055049613065e-05, + "loss": 0.4551, + "step": 762 + }, + { + "epoch": 0.07187772308706813, + "grad_norm": 1.1049585342407227, + "learning_rate": 1.9986978127700974e-05, + "loss": 0.4713, + "step": 763 + }, + { + "epoch": 0.07197192718023598, + "grad_norm": 0.8913968801498413, + "learning_rate": 1.9986900978070337e-05, + "loss": 0.3691, + "step": 764 + }, + { + "epoch": 0.07206613127340383, + "grad_norm": 1.0600327253341675, + "learning_rate": 1.998682360072291e-05, + "loss": 0.5035, + "step": 765 + }, + { + "epoch": 0.07216033536657168, + "grad_norm": 0.9857485294342041, + "learning_rate": 1.9986745995660463e-05, + "loss": 0.4404, + "step": 766 + }, + { + "epoch": 0.07225453945973953, + "grad_norm": 1.1809180974960327, + "learning_rate": 1.9986668162884763e-05, + "loss": 0.4649, + "step": 767 + }, + { + "epoch": 0.07234874355290738, + "grad_norm": 0.9589232206344604, + "learning_rate": 1.998659010239758e-05, + "loss": 0.4272, + "step": 768 + }, + { + "epoch": 0.07244294764607523, + "grad_norm": 0.9995101690292358, + "learning_rate": 1.99865118142007e-05, + "loss": 0.4669, + "step": 769 + }, + { + "epoch": 0.07253715173924306, + "grad_norm": 0.9336907863616943, + "learning_rate": 1.998643329829591e-05, + "loss": 0.433, + "step": 770 + }, + { + "epoch": 0.07263135583241091, + "grad_norm": 1.0039317607879639, + "learning_rate": 1.9986354554684994e-05, + "loss": 0.4497, + "step": 771 + }, + { + "epoch": 0.07272555992557876, + "grad_norm": 1.0792837142944336, + "learning_rate": 1.9986275583369745e-05, + "loss": 0.4448, + "step": 772 + }, + { + "epoch": 0.07281976401874661, + "grad_norm": 0.9362404346466064, + "learning_rate": 1.9986196384351975e-05, + "loss": 0.3992, + "step": 773 + }, + { + "epoch": 0.07291396811191446, + "grad_norm": 1.0516244173049927, + "learning_rate": 1.998611695763348e-05, + "loss": 0.4449, + "step": 774 + }, + { + "epoch": 0.07300817220508231, + "grad_norm": 1.085710883140564, + "learning_rate": 1.9986037303216076e-05, + "loss": 0.4198, + "step": 775 + }, + { + "epoch": 0.07310237629825016, + "grad_norm": 1.0311400890350342, + "learning_rate": 1.998595742110158e-05, + "loss": 0.431, + "step": 776 + }, + { + "epoch": 0.073196580391418, + "grad_norm": 1.0214941501617432, + "learning_rate": 1.998587731129181e-05, + "loss": 0.402, + "step": 777 + }, + { + "epoch": 0.07329078448458586, + "grad_norm": 1.0891741514205933, + "learning_rate": 1.9985796973788592e-05, + "loss": 0.4059, + "step": 778 + }, + { + "epoch": 0.0733849885777537, + "grad_norm": 1.0049872398376465, + "learning_rate": 1.998571640859376e-05, + "loss": 0.401, + "step": 779 + }, + { + "epoch": 0.07347919267092155, + "grad_norm": 1.0176184177398682, + "learning_rate": 1.998563561570915e-05, + "loss": 0.3997, + "step": 780 + }, + { + "epoch": 0.0735733967640894, + "grad_norm": 1.0183604955673218, + "learning_rate": 1.9985554595136606e-05, + "loss": 0.4529, + "step": 781 + }, + { + "epoch": 0.07366760085725725, + "grad_norm": 1.0522005558013916, + "learning_rate": 1.9985473346877976e-05, + "loss": 0.452, + "step": 782 + }, + { + "epoch": 0.0737618049504251, + "grad_norm": 0.9236129522323608, + "learning_rate": 1.9985391870935108e-05, + "loss": 0.4518, + "step": 783 + }, + { + "epoch": 0.07385600904359295, + "grad_norm": 1.044931411743164, + "learning_rate": 1.9985310167309865e-05, + "loss": 0.4967, + "step": 784 + }, + { + "epoch": 0.0739502131367608, + "grad_norm": 1.0424778461456299, + "learning_rate": 1.9985228236004107e-05, + "loss": 0.4418, + "step": 785 + }, + { + "epoch": 0.07404441722992865, + "grad_norm": 1.0401757955551147, + "learning_rate": 1.9985146077019698e-05, + "loss": 0.4709, + "step": 786 + }, + { + "epoch": 0.07413862132309648, + "grad_norm": 0.90964674949646, + "learning_rate": 1.998506369035852e-05, + "loss": 0.3972, + "step": 787 + }, + { + "epoch": 0.07423282541626433, + "grad_norm": 0.9477034211158752, + "learning_rate": 1.998498107602245e-05, + "loss": 0.3349, + "step": 788 + }, + { + "epoch": 0.07432702950943218, + "grad_norm": 1.0175570249557495, + "learning_rate": 1.9984898234013367e-05, + "loss": 0.4212, + "step": 789 + }, + { + "epoch": 0.07442123360260003, + "grad_norm": 0.9965648651123047, + "learning_rate": 1.9984815164333163e-05, + "loss": 0.4262, + "step": 790 + }, + { + "epoch": 0.07451543769576788, + "grad_norm": 1.0520919561386108, + "learning_rate": 1.998473186698373e-05, + "loss": 0.4884, + "step": 791 + }, + { + "epoch": 0.07460964178893573, + "grad_norm": 0.9975264072418213, + "learning_rate": 1.9984648341966974e-05, + "loss": 0.4992, + "step": 792 + }, + { + "epoch": 0.07470384588210358, + "grad_norm": 1.0246936082839966, + "learning_rate": 1.998456458928479e-05, + "loss": 0.4453, + "step": 793 + }, + { + "epoch": 0.07479804997527142, + "grad_norm": 1.0155010223388672, + "learning_rate": 1.998448060893909e-05, + "loss": 0.4551, + "step": 794 + }, + { + "epoch": 0.07489225406843927, + "grad_norm": 0.9694147109985352, + "learning_rate": 1.9984396400931794e-05, + "loss": 0.4338, + "step": 795 + }, + { + "epoch": 0.07498645816160712, + "grad_norm": 1.0100959539413452, + "learning_rate": 1.9984311965264816e-05, + "loss": 0.4345, + "step": 796 + }, + { + "epoch": 0.07508066225477497, + "grad_norm": 1.0546627044677734, + "learning_rate": 1.9984227301940088e-05, + "loss": 0.5144, + "step": 797 + }, + { + "epoch": 0.07517486634794282, + "grad_norm": 1.2081952095031738, + "learning_rate": 1.9984142410959534e-05, + "loss": 0.3909, + "step": 798 + }, + { + "epoch": 0.07526907044111067, + "grad_norm": 1.051641583442688, + "learning_rate": 1.9984057292325093e-05, + "loss": 0.4785, + "step": 799 + }, + { + "epoch": 0.07536327453427852, + "grad_norm": 1.1095900535583496, + "learning_rate": 1.9983971946038703e-05, + "loss": 0.4382, + "step": 800 + }, + { + "epoch": 0.07545747862744637, + "grad_norm": 1.0389223098754883, + "learning_rate": 1.9983886372102314e-05, + "loss": 0.4288, + "step": 801 + }, + { + "epoch": 0.07555168272061422, + "grad_norm": 0.965965747833252, + "learning_rate": 1.9983800570517876e-05, + "loss": 0.417, + "step": 802 + }, + { + "epoch": 0.07564588681378207, + "grad_norm": 0.9707470536231995, + "learning_rate": 1.9983714541287343e-05, + "loss": 0.3798, + "step": 803 + }, + { + "epoch": 0.0757400909069499, + "grad_norm": 0.9705069065093994, + "learning_rate": 1.998362828441268e-05, + "loss": 0.4492, + "step": 804 + }, + { + "epoch": 0.07583429500011775, + "grad_norm": 1.0146031379699707, + "learning_rate": 1.998354179989585e-05, + "loss": 0.4154, + "step": 805 + }, + { + "epoch": 0.0759284990932856, + "grad_norm": 1.011963963508606, + "learning_rate": 1.9983455087738833e-05, + "loss": 0.4724, + "step": 806 + }, + { + "epoch": 0.07602270318645345, + "grad_norm": 0.9977352023124695, + "learning_rate": 1.9983368147943593e-05, + "loss": 0.4739, + "step": 807 + }, + { + "epoch": 0.0761169072796213, + "grad_norm": 0.9591209888458252, + "learning_rate": 1.9983280980512127e-05, + "loss": 0.397, + "step": 808 + }, + { + "epoch": 0.07621111137278914, + "grad_norm": 1.0379873514175415, + "learning_rate": 1.9983193585446408e-05, + "loss": 0.4637, + "step": 809 + }, + { + "epoch": 0.076305315465957, + "grad_norm": 0.9633691906929016, + "learning_rate": 1.9983105962748438e-05, + "loss": 0.3821, + "step": 810 + }, + { + "epoch": 0.07639951955912484, + "grad_norm": 1.0823371410369873, + "learning_rate": 1.9983018112420213e-05, + "loss": 0.4669, + "step": 811 + }, + { + "epoch": 0.07649372365229269, + "grad_norm": 1.0636144876480103, + "learning_rate": 1.9982930034463738e-05, + "loss": 0.4933, + "step": 812 + }, + { + "epoch": 0.07658792774546054, + "grad_norm": 1.081931233406067, + "learning_rate": 1.9982841728881016e-05, + "loss": 0.4548, + "step": 813 + }, + { + "epoch": 0.07668213183862839, + "grad_norm": 1.0330334901809692, + "learning_rate": 1.998275319567407e-05, + "loss": 0.4312, + "step": 814 + }, + { + "epoch": 0.07677633593179624, + "grad_norm": 1.0809640884399414, + "learning_rate": 1.9982664434844908e-05, + "loss": 0.4582, + "step": 815 + }, + { + "epoch": 0.07687054002496409, + "grad_norm": 0.9635642766952515, + "learning_rate": 1.9982575446395554e-05, + "loss": 0.4516, + "step": 816 + }, + { + "epoch": 0.07696474411813194, + "grad_norm": 0.9754523038864136, + "learning_rate": 1.9982486230328047e-05, + "loss": 0.4632, + "step": 817 + }, + { + "epoch": 0.07705894821129979, + "grad_norm": 1.0695689916610718, + "learning_rate": 1.9982396786644417e-05, + "loss": 0.4215, + "step": 818 + }, + { + "epoch": 0.07715315230446763, + "grad_norm": 0.9689046740531921, + "learning_rate": 1.99823071153467e-05, + "loss": 0.4484, + "step": 819 + }, + { + "epoch": 0.07724735639763548, + "grad_norm": 1.0402729511260986, + "learning_rate": 1.998221721643694e-05, + "loss": 0.5115, + "step": 820 + }, + { + "epoch": 0.07734156049080333, + "grad_norm": 1.0786657333374023, + "learning_rate": 1.9982127089917196e-05, + "loss": 0.5123, + "step": 821 + }, + { + "epoch": 0.07743576458397117, + "grad_norm": 0.9543944001197815, + "learning_rate": 1.9982036735789513e-05, + "loss": 0.4345, + "step": 822 + }, + { + "epoch": 0.07752996867713902, + "grad_norm": 0.9859232902526855, + "learning_rate": 1.9981946154055955e-05, + "loss": 0.4569, + "step": 823 + }, + { + "epoch": 0.07762417277030687, + "grad_norm": 0.9364791512489319, + "learning_rate": 1.9981855344718587e-05, + "loss": 0.4085, + "step": 824 + }, + { + "epoch": 0.07771837686347471, + "grad_norm": 0.960197925567627, + "learning_rate": 1.998176430777948e-05, + "loss": 0.3889, + "step": 825 + }, + { + "epoch": 0.07781258095664256, + "grad_norm": 0.8794482350349426, + "learning_rate": 1.9981673043240712e-05, + "loss": 0.3551, + "step": 826 + }, + { + "epoch": 0.07790678504981041, + "grad_norm": 1.0110254287719727, + "learning_rate": 1.998158155110436e-05, + "loss": 0.4543, + "step": 827 + }, + { + "epoch": 0.07800098914297826, + "grad_norm": 1.0137994289398193, + "learning_rate": 1.9981489831372512e-05, + "loss": 0.4326, + "step": 828 + }, + { + "epoch": 0.07809519323614611, + "grad_norm": 0.9920204281806946, + "learning_rate": 1.9981397884047257e-05, + "loss": 0.4465, + "step": 829 + }, + { + "epoch": 0.07818939732931396, + "grad_norm": 1.0545607805252075, + "learning_rate": 1.9981305709130696e-05, + "loss": 0.4999, + "step": 830 + }, + { + "epoch": 0.07828360142248181, + "grad_norm": 1.0289400815963745, + "learning_rate": 1.998121330662493e-05, + "loss": 0.4756, + "step": 831 + }, + { + "epoch": 0.07837780551564966, + "grad_norm": 1.1255810260772705, + "learning_rate": 1.998112067653206e-05, + "loss": 0.5224, + "step": 832 + }, + { + "epoch": 0.0784720096088175, + "grad_norm": 0.9349611401557922, + "learning_rate": 1.9981027818854205e-05, + "loss": 0.4198, + "step": 833 + }, + { + "epoch": 0.07856621370198535, + "grad_norm": 1.0145128965377808, + "learning_rate": 1.998093473359348e-05, + "loss": 0.404, + "step": 834 + }, + { + "epoch": 0.0786604177951532, + "grad_norm": 1.003151297569275, + "learning_rate": 1.9980841420752008e-05, + "loss": 0.4209, + "step": 835 + }, + { + "epoch": 0.07875462188832105, + "grad_norm": 1.01313316822052, + "learning_rate": 1.998074788033191e-05, + "loss": 0.4449, + "step": 836 + }, + { + "epoch": 0.0788488259814889, + "grad_norm": 1.096969723701477, + "learning_rate": 1.9980654112335333e-05, + "loss": 0.5094, + "step": 837 + }, + { + "epoch": 0.07894303007465675, + "grad_norm": 1.038501501083374, + "learning_rate": 1.9980560116764404e-05, + "loss": 0.4482, + "step": 838 + }, + { + "epoch": 0.07903723416782459, + "grad_norm": 0.9726386070251465, + "learning_rate": 1.9980465893621268e-05, + "loss": 0.407, + "step": 839 + }, + { + "epoch": 0.07913143826099243, + "grad_norm": 0.9721311330795288, + "learning_rate": 1.9980371442908077e-05, + "loss": 0.4037, + "step": 840 + }, + { + "epoch": 0.07922564235416028, + "grad_norm": 1.0406038761138916, + "learning_rate": 1.998027676462698e-05, + "loss": 0.4435, + "step": 841 + }, + { + "epoch": 0.07931984644732813, + "grad_norm": 0.9423885941505432, + "learning_rate": 1.9980181858780136e-05, + "loss": 0.4365, + "step": 842 + }, + { + "epoch": 0.07941405054049598, + "grad_norm": 1.131834864616394, + "learning_rate": 1.9980086725369712e-05, + "loss": 0.471, + "step": 843 + }, + { + "epoch": 0.07950825463366383, + "grad_norm": 0.9440787434577942, + "learning_rate": 1.997999136439788e-05, + "loss": 0.4208, + "step": 844 + }, + { + "epoch": 0.07960245872683168, + "grad_norm": 1.0020167827606201, + "learning_rate": 1.997989577586681e-05, + "loss": 0.4368, + "step": 845 + }, + { + "epoch": 0.07969666281999953, + "grad_norm": 0.9412116408348083, + "learning_rate": 1.997979995977868e-05, + "loss": 0.4033, + "step": 846 + }, + { + "epoch": 0.07979086691316738, + "grad_norm": 0.9708133339881897, + "learning_rate": 1.9979703916135677e-05, + "loss": 0.4364, + "step": 847 + }, + { + "epoch": 0.07988507100633523, + "grad_norm": 0.9239591360092163, + "learning_rate": 1.997960764493999e-05, + "loss": 0.4404, + "step": 848 + }, + { + "epoch": 0.07997927509950307, + "grad_norm": 1.0096538066864014, + "learning_rate": 1.997951114619381e-05, + "loss": 0.4062, + "step": 849 + }, + { + "epoch": 0.08007347919267092, + "grad_norm": 0.858192503452301, + "learning_rate": 1.997941441989935e-05, + "loss": 0.3975, + "step": 850 + }, + { + "epoch": 0.08016768328583877, + "grad_norm": 1.05099356174469, + "learning_rate": 1.997931746605881e-05, + "loss": 0.4651, + "step": 851 + }, + { + "epoch": 0.08026188737900662, + "grad_norm": 1.252518653869629, + "learning_rate": 1.9979220284674392e-05, + "loss": 0.4671, + "step": 852 + }, + { + "epoch": 0.08035609147217447, + "grad_norm": 0.9368758201599121, + "learning_rate": 1.997912287574832e-05, + "loss": 0.4046, + "step": 853 + }, + { + "epoch": 0.08045029556534232, + "grad_norm": 0.9597492814064026, + "learning_rate": 1.997902523928281e-05, + "loss": 0.4595, + "step": 854 + }, + { + "epoch": 0.08054449965851017, + "grad_norm": 0.9732673168182373, + "learning_rate": 1.99789273752801e-05, + "loss": 0.4022, + "step": 855 + }, + { + "epoch": 0.080638703751678, + "grad_norm": 1.036406397819519, + "learning_rate": 1.9978829283742405e-05, + "loss": 0.4246, + "step": 856 + }, + { + "epoch": 0.08073290784484585, + "grad_norm": 0.9169813990592957, + "learning_rate": 1.9978730964671977e-05, + "loss": 0.4408, + "step": 857 + }, + { + "epoch": 0.0808271119380137, + "grad_norm": 1.088754415512085, + "learning_rate": 1.9978632418071044e-05, + "loss": 0.4319, + "step": 858 + }, + { + "epoch": 0.08092131603118155, + "grad_norm": 0.9790443778038025, + "learning_rate": 1.9978533643941865e-05, + "loss": 0.4223, + "step": 859 + }, + { + "epoch": 0.0810155201243494, + "grad_norm": 0.9643515944480896, + "learning_rate": 1.9978434642286684e-05, + "loss": 0.4287, + "step": 860 + }, + { + "epoch": 0.08110972421751725, + "grad_norm": 0.8812493681907654, + "learning_rate": 1.9978335413107764e-05, + "loss": 0.3915, + "step": 861 + }, + { + "epoch": 0.0812039283106851, + "grad_norm": 0.9891970753669739, + "learning_rate": 1.9978235956407358e-05, + "loss": 0.4435, + "step": 862 + }, + { + "epoch": 0.08129813240385295, + "grad_norm": 0.9829233884811401, + "learning_rate": 1.9978136272187745e-05, + "loss": 0.4052, + "step": 863 + }, + { + "epoch": 0.0813923364970208, + "grad_norm": 0.9916782975196838, + "learning_rate": 1.9978036360451197e-05, + "loss": 0.4325, + "step": 864 + }, + { + "epoch": 0.08148654059018864, + "grad_norm": 1.0053255558013916, + "learning_rate": 1.9977936221199983e-05, + "loss": 0.4686, + "step": 865 + }, + { + "epoch": 0.0815807446833565, + "grad_norm": 0.9676094651222229, + "learning_rate": 1.9977835854436398e-05, + "loss": 0.4224, + "step": 866 + }, + { + "epoch": 0.08167494877652434, + "grad_norm": 1.0051491260528564, + "learning_rate": 1.997773526016272e-05, + "loss": 0.4731, + "step": 867 + }, + { + "epoch": 0.08176915286969219, + "grad_norm": 0.9441819787025452, + "learning_rate": 1.9977634438381248e-05, + "loss": 0.4386, + "step": 868 + }, + { + "epoch": 0.08186335696286004, + "grad_norm": 1.0373190641403198, + "learning_rate": 1.9977533389094278e-05, + "loss": 0.4276, + "step": 869 + }, + { + "epoch": 0.08195756105602789, + "grad_norm": 0.9964975118637085, + "learning_rate": 1.9977432112304118e-05, + "loss": 0.4686, + "step": 870 + }, + { + "epoch": 0.08205176514919574, + "grad_norm": 0.9844152331352234, + "learning_rate": 1.9977330608013075e-05, + "loss": 0.4184, + "step": 871 + }, + { + "epoch": 0.08214596924236359, + "grad_norm": 0.9310718178749084, + "learning_rate": 1.9977228876223463e-05, + "loss": 0.382, + "step": 872 + }, + { + "epoch": 0.08224017333553144, + "grad_norm": 0.8855440020561218, + "learning_rate": 1.9977126916937607e-05, + "loss": 0.387, + "step": 873 + }, + { + "epoch": 0.08233437742869927, + "grad_norm": 0.9412824511528015, + "learning_rate": 1.9977024730157824e-05, + "loss": 0.4016, + "step": 874 + }, + { + "epoch": 0.08242858152186712, + "grad_norm": 1.117919683456421, + "learning_rate": 1.9976922315886445e-05, + "loss": 0.4676, + "step": 875 + }, + { + "epoch": 0.08252278561503497, + "grad_norm": 0.9602299332618713, + "learning_rate": 1.9976819674125815e-05, + "loss": 0.4452, + "step": 876 + }, + { + "epoch": 0.08261698970820282, + "grad_norm": 0.8984887003898621, + "learning_rate": 1.997671680487826e-05, + "loss": 0.4055, + "step": 877 + }, + { + "epoch": 0.08271119380137067, + "grad_norm": 1.0107239484786987, + "learning_rate": 1.9976613708146134e-05, + "loss": 0.4739, + "step": 878 + }, + { + "epoch": 0.08280539789453852, + "grad_norm": 0.9731579422950745, + "learning_rate": 1.997651038393179e-05, + "loss": 0.375, + "step": 879 + }, + { + "epoch": 0.08289960198770636, + "grad_norm": 1.032165765762329, + "learning_rate": 1.9976406832237576e-05, + "loss": 0.4491, + "step": 880 + }, + { + "epoch": 0.08299380608087421, + "grad_norm": 0.9235690236091614, + "learning_rate": 1.997630305306586e-05, + "loss": 0.4361, + "step": 881 + }, + { + "epoch": 0.08308801017404206, + "grad_norm": 0.9779563546180725, + "learning_rate": 1.9976199046419006e-05, + "loss": 0.5025, + "step": 882 + }, + { + "epoch": 0.08318221426720991, + "grad_norm": 0.8626536130905151, + "learning_rate": 1.997609481229938e-05, + "loss": 0.386, + "step": 883 + }, + { + "epoch": 0.08327641836037776, + "grad_norm": 1.018281102180481, + "learning_rate": 1.997599035070937e-05, + "loss": 0.4295, + "step": 884 + }, + { + "epoch": 0.08337062245354561, + "grad_norm": 1.0163942575454712, + "learning_rate": 1.997588566165135e-05, + "loss": 0.4134, + "step": 885 + }, + { + "epoch": 0.08346482654671346, + "grad_norm": 0.8940512537956238, + "learning_rate": 1.9975780745127706e-05, + "loss": 0.4126, + "step": 886 + }, + { + "epoch": 0.08355903063988131, + "grad_norm": 0.978265106678009, + "learning_rate": 1.997567560114084e-05, + "loss": 0.468, + "step": 887 + }, + { + "epoch": 0.08365323473304916, + "grad_norm": 1.018894076347351, + "learning_rate": 1.9975570229693137e-05, + "loss": 0.443, + "step": 888 + }, + { + "epoch": 0.083747438826217, + "grad_norm": 0.9922093152999878, + "learning_rate": 1.9975464630787008e-05, + "loss": 0.4468, + "step": 889 + }, + { + "epoch": 0.08384164291938485, + "grad_norm": 0.8547562956809998, + "learning_rate": 1.9975358804424853e-05, + "loss": 0.3887, + "step": 890 + }, + { + "epoch": 0.08393584701255269, + "grad_norm": 0.9433829188346863, + "learning_rate": 1.9975252750609095e-05, + "loss": 0.4038, + "step": 891 + }, + { + "epoch": 0.08403005110572054, + "grad_norm": 0.9846193194389343, + "learning_rate": 1.9975146469342146e-05, + "loss": 0.4322, + "step": 892 + }, + { + "epoch": 0.08412425519888839, + "grad_norm": 0.9687615633010864, + "learning_rate": 1.9975039960626433e-05, + "loss": 0.4211, + "step": 893 + }, + { + "epoch": 0.08421845929205624, + "grad_norm": 1.0523276329040527, + "learning_rate": 1.9974933224464376e-05, + "loss": 0.396, + "step": 894 + }, + { + "epoch": 0.08431266338522408, + "grad_norm": 0.9216560125350952, + "learning_rate": 1.997482626085842e-05, + "loss": 0.4138, + "step": 895 + }, + { + "epoch": 0.08440686747839193, + "grad_norm": 1.0367227792739868, + "learning_rate": 1.9974719069810998e-05, + "loss": 0.485, + "step": 896 + }, + { + "epoch": 0.08450107157155978, + "grad_norm": 0.9510256052017212, + "learning_rate": 1.9974611651324555e-05, + "loss": 0.3629, + "step": 897 + }, + { + "epoch": 0.08459527566472763, + "grad_norm": 0.9600812196731567, + "learning_rate": 1.997450400540154e-05, + "loss": 0.4321, + "step": 898 + }, + { + "epoch": 0.08468947975789548, + "grad_norm": 0.9327564239501953, + "learning_rate": 1.9974396132044405e-05, + "loss": 0.3761, + "step": 899 + }, + { + "epoch": 0.08478368385106333, + "grad_norm": 1.0322734117507935, + "learning_rate": 1.997428803125562e-05, + "loss": 0.4314, + "step": 900 + }, + { + "epoch": 0.08487788794423118, + "grad_norm": 0.9740006923675537, + "learning_rate": 1.9974179703037636e-05, + "loss": 0.4769, + "step": 901 + }, + { + "epoch": 0.08497209203739903, + "grad_norm": 0.9764218926429749, + "learning_rate": 1.997407114739293e-05, + "loss": 0.4944, + "step": 902 + }, + { + "epoch": 0.08506629613056688, + "grad_norm": 0.9930822849273682, + "learning_rate": 1.997396236432398e-05, + "loss": 0.428, + "step": 903 + }, + { + "epoch": 0.08516050022373473, + "grad_norm": 0.9698862433433533, + "learning_rate": 1.9973853353833262e-05, + "loss": 0.4313, + "step": 904 + }, + { + "epoch": 0.08525470431690257, + "grad_norm": 0.9097919464111328, + "learning_rate": 1.997374411592326e-05, + "loss": 0.3919, + "step": 905 + }, + { + "epoch": 0.08534890841007042, + "grad_norm": 0.9172612428665161, + "learning_rate": 1.997363465059647e-05, + "loss": 0.4214, + "step": 906 + }, + { + "epoch": 0.08544311250323827, + "grad_norm": 0.9216505885124207, + "learning_rate": 1.9973524957855384e-05, + "loss": 0.4232, + "step": 907 + }, + { + "epoch": 0.08553731659640611, + "grad_norm": 1.072882056236267, + "learning_rate": 1.9973415037702502e-05, + "loss": 0.4435, + "step": 908 + }, + { + "epoch": 0.08563152068957396, + "grad_norm": 0.9569485783576965, + "learning_rate": 1.9973304890140336e-05, + "loss": 0.435, + "step": 909 + }, + { + "epoch": 0.0857257247827418, + "grad_norm": 0.9770811796188354, + "learning_rate": 1.9973194515171396e-05, + "loss": 0.4541, + "step": 910 + }, + { + "epoch": 0.08581992887590965, + "grad_norm": 0.9245586395263672, + "learning_rate": 1.99730839127982e-05, + "loss": 0.4335, + "step": 911 + }, + { + "epoch": 0.0859141329690775, + "grad_norm": 0.8607360124588013, + "learning_rate": 1.997297308302326e-05, + "loss": 0.4122, + "step": 912 + }, + { + "epoch": 0.08600833706224535, + "grad_norm": 1.036927342414856, + "learning_rate": 1.997286202584911e-05, + "loss": 0.404, + "step": 913 + }, + { + "epoch": 0.0861025411554132, + "grad_norm": 0.9474027752876282, + "learning_rate": 1.9972750741278285e-05, + "loss": 0.4708, + "step": 914 + }, + { + "epoch": 0.08619674524858105, + "grad_norm": 1.0663697719573975, + "learning_rate": 1.9972639229313322e-05, + "loss": 0.4629, + "step": 915 + }, + { + "epoch": 0.0862909493417489, + "grad_norm": 0.9513779878616333, + "learning_rate": 1.9972527489956762e-05, + "loss": 0.4099, + "step": 916 + }, + { + "epoch": 0.08638515343491675, + "grad_norm": 1.0742497444152832, + "learning_rate": 1.997241552321115e-05, + "loss": 0.397, + "step": 917 + }, + { + "epoch": 0.0864793575280846, + "grad_norm": 1.0032830238342285, + "learning_rate": 1.9972303329079042e-05, + "loss": 0.4249, + "step": 918 + }, + { + "epoch": 0.08657356162125245, + "grad_norm": 1.023820400238037, + "learning_rate": 1.9972190907562993e-05, + "loss": 0.446, + "step": 919 + }, + { + "epoch": 0.0866677657144203, + "grad_norm": 0.9125425815582275, + "learning_rate": 1.9972078258665574e-05, + "loss": 0.4342, + "step": 920 + }, + { + "epoch": 0.08676196980758814, + "grad_norm": 0.8609833717346191, + "learning_rate": 1.9971965382389347e-05, + "loss": 0.3637, + "step": 921 + }, + { + "epoch": 0.08685617390075599, + "grad_norm": 0.9849636554718018, + "learning_rate": 1.9971852278736886e-05, + "loss": 0.4722, + "step": 922 + }, + { + "epoch": 0.08695037799392384, + "grad_norm": 0.9862493872642517, + "learning_rate": 1.997173894771077e-05, + "loss": 0.3878, + "step": 923 + }, + { + "epoch": 0.08704458208709169, + "grad_norm": 1.003157138824463, + "learning_rate": 1.9971625389313587e-05, + "loss": 0.4448, + "step": 924 + }, + { + "epoch": 0.08713878618025953, + "grad_norm": 0.9130213856697083, + "learning_rate": 1.9971511603547923e-05, + "loss": 0.4192, + "step": 925 + }, + { + "epoch": 0.08723299027342737, + "grad_norm": 0.972983717918396, + "learning_rate": 1.9971397590416372e-05, + "loss": 0.4194, + "step": 926 + }, + { + "epoch": 0.08732719436659522, + "grad_norm": 1.125463604927063, + "learning_rate": 1.9971283349921538e-05, + "loss": 0.4699, + "step": 927 + }, + { + "epoch": 0.08742139845976307, + "grad_norm": 0.8822147250175476, + "learning_rate": 1.997116888206602e-05, + "loss": 0.3593, + "step": 928 + }, + { + "epoch": 0.08751560255293092, + "grad_norm": 0.9502570033073425, + "learning_rate": 1.997105418685243e-05, + "loss": 0.433, + "step": 929 + }, + { + "epoch": 0.08760980664609877, + "grad_norm": 0.9492387175559998, + "learning_rate": 1.9970939264283386e-05, + "loss": 0.447, + "step": 930 + }, + { + "epoch": 0.08770401073926662, + "grad_norm": 0.9000892043113708, + "learning_rate": 1.9970824114361507e-05, + "loss": 0.4475, + "step": 931 + }, + { + "epoch": 0.08779821483243447, + "grad_norm": 0.9955626726150513, + "learning_rate": 1.9970708737089416e-05, + "loss": 0.4828, + "step": 932 + }, + { + "epoch": 0.08789241892560232, + "grad_norm": 0.8077518939971924, + "learning_rate": 1.9970593132469748e-05, + "loss": 0.397, + "step": 933 + }, + { + "epoch": 0.08798662301877017, + "grad_norm": 0.8611214756965637, + "learning_rate": 1.9970477300505133e-05, + "loss": 0.4351, + "step": 934 + }, + { + "epoch": 0.08808082711193801, + "grad_norm": 1.0117332935333252, + "learning_rate": 1.997036124119822e-05, + "loss": 0.46, + "step": 935 + }, + { + "epoch": 0.08817503120510586, + "grad_norm": 1.0115711688995361, + "learning_rate": 1.9970244954551648e-05, + "loss": 0.4724, + "step": 936 + }, + { + "epoch": 0.08826923529827371, + "grad_norm": 0.8961659669876099, + "learning_rate": 1.9970128440568074e-05, + "loss": 0.4178, + "step": 937 + }, + { + "epoch": 0.08836343939144156, + "grad_norm": 0.9423846006393433, + "learning_rate": 1.997001169925015e-05, + "loss": 0.4318, + "step": 938 + }, + { + "epoch": 0.08845764348460941, + "grad_norm": 1.0483758449554443, + "learning_rate": 1.9969894730600544e-05, + "loss": 0.3863, + "step": 939 + }, + { + "epoch": 0.08855184757777726, + "grad_norm": 0.9910538792610168, + "learning_rate": 1.9969777534621918e-05, + "loss": 0.4692, + "step": 940 + }, + { + "epoch": 0.08864605167094511, + "grad_norm": 0.8945648074150085, + "learning_rate": 1.9969660111316945e-05, + "loss": 0.4039, + "step": 941 + }, + { + "epoch": 0.08874025576411296, + "grad_norm": 0.867096483707428, + "learning_rate": 1.9969542460688305e-05, + "loss": 0.4212, + "step": 942 + }, + { + "epoch": 0.08883445985728079, + "grad_norm": 0.9797136187553406, + "learning_rate": 1.9969424582738676e-05, + "loss": 0.4278, + "step": 943 + }, + { + "epoch": 0.08892866395044864, + "grad_norm": 0.9553640484809875, + "learning_rate": 1.996930647747075e-05, + "loss": 0.4082, + "step": 944 + }, + { + "epoch": 0.08902286804361649, + "grad_norm": 0.9790606498718262, + "learning_rate": 1.9969188144887217e-05, + "loss": 0.4145, + "step": 945 + }, + { + "epoch": 0.08911707213678434, + "grad_norm": 0.8860597014427185, + "learning_rate": 1.9969069584990776e-05, + "loss": 0.3856, + "step": 946 + }, + { + "epoch": 0.08921127622995219, + "grad_norm": 1.0481891632080078, + "learning_rate": 1.9968950797784136e-05, + "loss": 0.4086, + "step": 947 + }, + { + "epoch": 0.08930548032312004, + "grad_norm": 1.065038800239563, + "learning_rate": 1.9968831783269997e-05, + "loss": 0.4608, + "step": 948 + }, + { + "epoch": 0.08939968441628789, + "grad_norm": 1.0611155033111572, + "learning_rate": 1.9968712541451073e-05, + "loss": 0.4528, + "step": 949 + }, + { + "epoch": 0.08949388850945574, + "grad_norm": 0.8630752563476562, + "learning_rate": 1.9968593072330093e-05, + "loss": 0.4143, + "step": 950 + }, + { + "epoch": 0.08958809260262358, + "grad_norm": 0.9093382954597473, + "learning_rate": 1.996847337590977e-05, + "loss": 0.4257, + "step": 951 + }, + { + "epoch": 0.08968229669579143, + "grad_norm": 0.9018155336380005, + "learning_rate": 1.996835345219284e-05, + "loss": 0.3865, + "step": 952 + }, + { + "epoch": 0.08977650078895928, + "grad_norm": 1.040109395980835, + "learning_rate": 1.9968233301182033e-05, + "loss": 0.3863, + "step": 953 + }, + { + "epoch": 0.08987070488212713, + "grad_norm": 1.0499516725540161, + "learning_rate": 1.9968112922880088e-05, + "loss": 0.4221, + "step": 954 + }, + { + "epoch": 0.08996490897529498, + "grad_norm": 0.9960455894470215, + "learning_rate": 1.9967992317289754e-05, + "loss": 0.4535, + "step": 955 + }, + { + "epoch": 0.09005911306846283, + "grad_norm": 0.992347002029419, + "learning_rate": 1.9967871484413782e-05, + "loss": 0.4323, + "step": 956 + }, + { + "epoch": 0.09015331716163068, + "grad_norm": 1.0884090662002563, + "learning_rate": 1.9967750424254922e-05, + "loss": 0.4741, + "step": 957 + }, + { + "epoch": 0.09024752125479853, + "grad_norm": 0.9774399399757385, + "learning_rate": 1.996762913681594e-05, + "loss": 0.419, + "step": 958 + }, + { + "epoch": 0.09034172534796638, + "grad_norm": 1.0485435724258423, + "learning_rate": 1.9967507622099595e-05, + "loss": 0.4387, + "step": 959 + }, + { + "epoch": 0.09043592944113421, + "grad_norm": 1.0433429479599, + "learning_rate": 1.9967385880108663e-05, + "loss": 0.4861, + "step": 960 + }, + { + "epoch": 0.09053013353430206, + "grad_norm": 0.8936190605163574, + "learning_rate": 1.996726391084592e-05, + "loss": 0.4286, + "step": 961 + }, + { + "epoch": 0.09062433762746991, + "grad_norm": 1.089770793914795, + "learning_rate": 1.996714171431414e-05, + "loss": 0.5094, + "step": 962 + }, + { + "epoch": 0.09071854172063776, + "grad_norm": 0.8957109451293945, + "learning_rate": 1.9967019290516115e-05, + "loss": 0.4038, + "step": 963 + }, + { + "epoch": 0.0908127458138056, + "grad_norm": 1.009662389755249, + "learning_rate": 1.996689663945464e-05, + "loss": 0.493, + "step": 964 + }, + { + "epoch": 0.09090694990697346, + "grad_norm": 0.9156802892684937, + "learning_rate": 1.9966773761132506e-05, + "loss": 0.3813, + "step": 965 + }, + { + "epoch": 0.0910011540001413, + "grad_norm": 0.8849820494651794, + "learning_rate": 1.9966650655552516e-05, + "loss": 0.3967, + "step": 966 + }, + { + "epoch": 0.09109535809330915, + "grad_norm": 0.9695305228233337, + "learning_rate": 1.996652732271748e-05, + "loss": 0.4175, + "step": 967 + }, + { + "epoch": 0.091189562186477, + "grad_norm": 1.0623234510421753, + "learning_rate": 1.99664037626302e-05, + "loss": 0.435, + "step": 968 + }, + { + "epoch": 0.09128376627964485, + "grad_norm": 1.0353437662124634, + "learning_rate": 1.996627997529351e-05, + "loss": 0.4506, + "step": 969 + }, + { + "epoch": 0.0913779703728127, + "grad_norm": 0.906459629535675, + "learning_rate": 1.996615596071022e-05, + "loss": 0.4007, + "step": 970 + }, + { + "epoch": 0.09147217446598055, + "grad_norm": 0.9056885242462158, + "learning_rate": 1.9966031718883157e-05, + "loss": 0.392, + "step": 971 + }, + { + "epoch": 0.0915663785591484, + "grad_norm": 0.906808078289032, + "learning_rate": 1.9965907249815163e-05, + "loss": 0.4303, + "step": 972 + }, + { + "epoch": 0.09166058265231625, + "grad_norm": 0.8993813991546631, + "learning_rate": 1.9965782553509067e-05, + "loss": 0.4355, + "step": 973 + }, + { + "epoch": 0.0917547867454841, + "grad_norm": 0.9064143300056458, + "learning_rate": 1.996565762996772e-05, + "loss": 0.3948, + "step": 974 + }, + { + "epoch": 0.09184899083865194, + "grad_norm": 0.9879948496818542, + "learning_rate": 1.9965532479193967e-05, + "loss": 0.4619, + "step": 975 + }, + { + "epoch": 0.0919431949318198, + "grad_norm": 1.0842314958572388, + "learning_rate": 1.996540710119066e-05, + "loss": 0.4527, + "step": 976 + }, + { + "epoch": 0.09203739902498763, + "grad_norm": 1.0031143426895142, + "learning_rate": 1.996528149596066e-05, + "loss": 0.4272, + "step": 977 + }, + { + "epoch": 0.09213160311815548, + "grad_norm": 0.9392336010932922, + "learning_rate": 1.996515566350683e-05, + "loss": 0.4007, + "step": 978 + }, + { + "epoch": 0.09222580721132333, + "grad_norm": 1.0424503087997437, + "learning_rate": 1.9965029603832036e-05, + "loss": 0.4618, + "step": 979 + }, + { + "epoch": 0.09232001130449118, + "grad_norm": 0.9735755324363708, + "learning_rate": 1.996490331693916e-05, + "loss": 0.4586, + "step": 980 + }, + { + "epoch": 0.09241421539765902, + "grad_norm": 0.9512478113174438, + "learning_rate": 1.996477680283108e-05, + "loss": 0.4286, + "step": 981 + }, + { + "epoch": 0.09250841949082687, + "grad_norm": 0.8885504603385925, + "learning_rate": 1.996465006151067e-05, + "loss": 0.4222, + "step": 982 + }, + { + "epoch": 0.09260262358399472, + "grad_norm": 0.9333905577659607, + "learning_rate": 1.9964523092980834e-05, + "loss": 0.4208, + "step": 983 + }, + { + "epoch": 0.09269682767716257, + "grad_norm": 0.8582695126533508, + "learning_rate": 1.996439589724446e-05, + "loss": 0.3992, + "step": 984 + }, + { + "epoch": 0.09279103177033042, + "grad_norm": 0.9202725291252136, + "learning_rate": 1.9964268474304448e-05, + "loss": 0.3822, + "step": 985 + }, + { + "epoch": 0.09288523586349827, + "grad_norm": 0.9933639168739319, + "learning_rate": 1.9964140824163705e-05, + "loss": 0.4417, + "step": 986 + }, + { + "epoch": 0.09297943995666612, + "grad_norm": 0.9218569397926331, + "learning_rate": 1.996401294682514e-05, + "loss": 0.3977, + "step": 987 + }, + { + "epoch": 0.09307364404983397, + "grad_norm": 1.0288808345794678, + "learning_rate": 1.9963884842291677e-05, + "loss": 0.5045, + "step": 988 + }, + { + "epoch": 0.09316784814300182, + "grad_norm": 0.980198860168457, + "learning_rate": 1.9963756510566222e-05, + "loss": 0.467, + "step": 989 + }, + { + "epoch": 0.09326205223616967, + "grad_norm": 1.0243239402770996, + "learning_rate": 1.9963627951651715e-05, + "loss": 0.4765, + "step": 990 + }, + { + "epoch": 0.09335625632933751, + "grad_norm": 0.9720578193664551, + "learning_rate": 1.996349916555108e-05, + "loss": 0.432, + "step": 991 + }, + { + "epoch": 0.09345046042250536, + "grad_norm": 0.922471284866333, + "learning_rate": 1.996337015226725e-05, + "loss": 0.4131, + "step": 992 + }, + { + "epoch": 0.09354466451567321, + "grad_norm": 0.9350531697273254, + "learning_rate": 1.996324091180318e-05, + "loss": 0.4721, + "step": 993 + }, + { + "epoch": 0.09363886860884106, + "grad_norm": 0.9498339295387268, + "learning_rate": 1.9963111444161806e-05, + "loss": 0.4494, + "step": 994 + }, + { + "epoch": 0.0937330727020089, + "grad_norm": 1.0895516872406006, + "learning_rate": 1.996298174934608e-05, + "loss": 0.4975, + "step": 995 + }, + { + "epoch": 0.09382727679517675, + "grad_norm": 0.939751386642456, + "learning_rate": 1.996285182735896e-05, + "loss": 0.4139, + "step": 996 + }, + { + "epoch": 0.0939214808883446, + "grad_norm": 1.0181207656860352, + "learning_rate": 1.9962721678203416e-05, + "loss": 0.4607, + "step": 997 + }, + { + "epoch": 0.09401568498151244, + "grad_norm": 0.9602494835853577, + "learning_rate": 1.996259130188241e-05, + "loss": 0.3947, + "step": 998 + }, + { + "epoch": 0.09410988907468029, + "grad_norm": 0.902991771697998, + "learning_rate": 1.9962460698398914e-05, + "loss": 0.4085, + "step": 999 + }, + { + "epoch": 0.09420409316784814, + "grad_norm": 0.9918155074119568, + "learning_rate": 1.9962329867755906e-05, + "loss": 0.4562, + "step": 1000 + }, + { + "epoch": 0.09429829726101599, + "grad_norm": 1.0148743391036987, + "learning_rate": 1.996219880995637e-05, + "loss": 0.4875, + "step": 1001 + }, + { + "epoch": 0.09439250135418384, + "grad_norm": 0.9490169286727905, + "learning_rate": 1.9962067525003295e-05, + "loss": 0.4258, + "step": 1002 + }, + { + "epoch": 0.09448670544735169, + "grad_norm": 1.0962616205215454, + "learning_rate": 1.9961936012899673e-05, + "loss": 0.4985, + "step": 1003 + }, + { + "epoch": 0.09458090954051954, + "grad_norm": 1.0087071657180786, + "learning_rate": 1.9961804273648502e-05, + "loss": 0.4723, + "step": 1004 + }, + { + "epoch": 0.09467511363368739, + "grad_norm": 0.9063972234725952, + "learning_rate": 1.996167230725279e-05, + "loss": 0.3771, + "step": 1005 + }, + { + "epoch": 0.09476931772685523, + "grad_norm": 1.0970993041992188, + "learning_rate": 1.9961540113715543e-05, + "loss": 0.4679, + "step": 1006 + }, + { + "epoch": 0.09486352182002308, + "grad_norm": 0.9199796319007874, + "learning_rate": 1.9961407693039777e-05, + "loss": 0.4201, + "step": 1007 + }, + { + "epoch": 0.09495772591319093, + "grad_norm": 0.9776504635810852, + "learning_rate": 1.9961275045228506e-05, + "loss": 0.4448, + "step": 1008 + }, + { + "epoch": 0.09505193000635878, + "grad_norm": 0.9351235628128052, + "learning_rate": 1.9961142170284762e-05, + "loss": 0.3965, + "step": 1009 + }, + { + "epoch": 0.09514613409952663, + "grad_norm": 0.9779771566390991, + "learning_rate": 1.996100906821157e-05, + "loss": 0.3929, + "step": 1010 + }, + { + "epoch": 0.09524033819269448, + "grad_norm": 0.9306912422180176, + "learning_rate": 1.9960875739011966e-05, + "loss": 0.4565, + "step": 1011 + }, + { + "epoch": 0.09533454228586231, + "grad_norm": 1.103110909461975, + "learning_rate": 1.996074218268899e-05, + "loss": 0.4952, + "step": 1012 + }, + { + "epoch": 0.09542874637903016, + "grad_norm": 0.8756340146064758, + "learning_rate": 1.9960608399245688e-05, + "loss": 0.4193, + "step": 1013 + }, + { + "epoch": 0.09552295047219801, + "grad_norm": 0.9869973659515381, + "learning_rate": 1.996047438868511e-05, + "loss": 0.4303, + "step": 1014 + }, + { + "epoch": 0.09561715456536586, + "grad_norm": 0.9271620512008667, + "learning_rate": 1.996034015101031e-05, + "loss": 0.43, + "step": 1015 + }, + { + "epoch": 0.09571135865853371, + "grad_norm": 0.9635888338088989, + "learning_rate": 1.9960205686224355e-05, + "loss": 0.394, + "step": 1016 + }, + { + "epoch": 0.09580556275170156, + "grad_norm": 1.020485281944275, + "learning_rate": 1.9960070994330307e-05, + "loss": 0.4472, + "step": 1017 + }, + { + "epoch": 0.09589976684486941, + "grad_norm": 1.0038342475891113, + "learning_rate": 1.995993607533123e-05, + "loss": 0.4227, + "step": 1018 + }, + { + "epoch": 0.09599397093803726, + "grad_norm": 0.8755595088005066, + "learning_rate": 1.9959800929230212e-05, + "loss": 0.3637, + "step": 1019 + }, + { + "epoch": 0.0960881750312051, + "grad_norm": 0.9497532248497009, + "learning_rate": 1.995966555603033e-05, + "loss": 0.4353, + "step": 1020 + }, + { + "epoch": 0.09618237912437295, + "grad_norm": 0.9116272926330566, + "learning_rate": 1.9959529955734668e-05, + "loss": 0.4417, + "step": 1021 + }, + { + "epoch": 0.0962765832175408, + "grad_norm": 0.9214377999305725, + "learning_rate": 1.9959394128346323e-05, + "loss": 0.4261, + "step": 1022 + }, + { + "epoch": 0.09637078731070865, + "grad_norm": 0.8842349648475647, + "learning_rate": 1.9959258073868387e-05, + "loss": 0.4008, + "step": 1023 + }, + { + "epoch": 0.0964649914038765, + "grad_norm": 0.983285665512085, + "learning_rate": 1.9959121792303967e-05, + "loss": 0.384, + "step": 1024 + }, + { + "epoch": 0.09655919549704435, + "grad_norm": 1.0836387872695923, + "learning_rate": 1.9958985283656164e-05, + "loss": 0.4158, + "step": 1025 + }, + { + "epoch": 0.0966533995902122, + "grad_norm": 0.8068239092826843, + "learning_rate": 1.9958848547928098e-05, + "loss": 0.363, + "step": 1026 + }, + { + "epoch": 0.09674760368338005, + "grad_norm": 0.9302456974983215, + "learning_rate": 1.995871158512288e-05, + "loss": 0.4719, + "step": 1027 + }, + { + "epoch": 0.0968418077765479, + "grad_norm": 0.9990631341934204, + "learning_rate": 1.9958574395243643e-05, + "loss": 0.4119, + "step": 1028 + }, + { + "epoch": 0.09693601186971573, + "grad_norm": 0.9791685938835144, + "learning_rate": 1.9958436978293503e-05, + "loss": 0.4358, + "step": 1029 + }, + { + "epoch": 0.09703021596288358, + "grad_norm": 0.9006029367446899, + "learning_rate": 1.9958299334275602e-05, + "loss": 0.376, + "step": 1030 + }, + { + "epoch": 0.09712442005605143, + "grad_norm": 1.055158019065857, + "learning_rate": 1.9958161463193074e-05, + "loss": 0.4614, + "step": 1031 + }, + { + "epoch": 0.09721862414921928, + "grad_norm": 1.053442358970642, + "learning_rate": 1.9958023365049063e-05, + "loss": 0.4223, + "step": 1032 + }, + { + "epoch": 0.09731282824238713, + "grad_norm": 0.9021908044815063, + "learning_rate": 1.995788503984672e-05, + "loss": 0.3829, + "step": 1033 + }, + { + "epoch": 0.09740703233555498, + "grad_norm": 0.9552052617073059, + "learning_rate": 1.99577464875892e-05, + "loss": 0.4522, + "step": 1034 + }, + { + "epoch": 0.09750123642872283, + "grad_norm": 0.9647678732872009, + "learning_rate": 1.9957607708279656e-05, + "loss": 0.3969, + "step": 1035 + }, + { + "epoch": 0.09759544052189068, + "grad_norm": 0.9882518649101257, + "learning_rate": 1.9957468701921257e-05, + "loss": 0.4226, + "step": 1036 + }, + { + "epoch": 0.09768964461505852, + "grad_norm": 1.0046566724777222, + "learning_rate": 1.9957329468517175e-05, + "loss": 0.4545, + "step": 1037 + }, + { + "epoch": 0.09778384870822637, + "grad_norm": 0.9556362628936768, + "learning_rate": 1.995719000807058e-05, + "loss": 0.4151, + "step": 1038 + }, + { + "epoch": 0.09787805280139422, + "grad_norm": 0.9986971020698547, + "learning_rate": 1.9957050320584653e-05, + "loss": 0.3908, + "step": 1039 + }, + { + "epoch": 0.09797225689456207, + "grad_norm": 0.975786566734314, + "learning_rate": 1.9956910406062583e-05, + "loss": 0.4215, + "step": 1040 + }, + { + "epoch": 0.09806646098772992, + "grad_norm": 0.9671193361282349, + "learning_rate": 1.9956770264507555e-05, + "loss": 0.447, + "step": 1041 + }, + { + "epoch": 0.09816066508089777, + "grad_norm": 0.8227758407592773, + "learning_rate": 1.9956629895922765e-05, + "loss": 0.397, + "step": 1042 + }, + { + "epoch": 0.09825486917406562, + "grad_norm": 0.9373804926872253, + "learning_rate": 1.9956489300311416e-05, + "loss": 0.4022, + "step": 1043 + }, + { + "epoch": 0.09834907326723347, + "grad_norm": 0.9801133275032043, + "learning_rate": 1.9956348477676714e-05, + "loss": 0.4443, + "step": 1044 + }, + { + "epoch": 0.09844327736040132, + "grad_norm": 0.975909411907196, + "learning_rate": 1.995620742802187e-05, + "loss": 0.4358, + "step": 1045 + }, + { + "epoch": 0.09853748145356915, + "grad_norm": 1.0563409328460693, + "learning_rate": 1.9956066151350097e-05, + "loss": 0.4765, + "step": 1046 + }, + { + "epoch": 0.098631685546737, + "grad_norm": 1.0944769382476807, + "learning_rate": 1.995592464766462e-05, + "loss": 0.4237, + "step": 1047 + }, + { + "epoch": 0.09872588963990485, + "grad_norm": 1.036834478378296, + "learning_rate": 1.9955782916968663e-05, + "loss": 0.3851, + "step": 1048 + }, + { + "epoch": 0.0988200937330727, + "grad_norm": 0.9747660160064697, + "learning_rate": 1.995564095926546e-05, + "loss": 0.4205, + "step": 1049 + }, + { + "epoch": 0.09891429782624055, + "grad_norm": 0.9076653122901917, + "learning_rate": 1.995549877455824e-05, + "loss": 0.4159, + "step": 1050 + }, + { + "epoch": 0.0990085019194084, + "grad_norm": 1.1281238794326782, + "learning_rate": 1.995535636285026e-05, + "loss": 0.5221, + "step": 1051 + }, + { + "epoch": 0.09910270601257624, + "grad_norm": 0.964013934135437, + "learning_rate": 1.9955213724144754e-05, + "loss": 0.3878, + "step": 1052 + }, + { + "epoch": 0.0991969101057441, + "grad_norm": 1.1707106828689575, + "learning_rate": 1.995507085844498e-05, + "loss": 0.4516, + "step": 1053 + }, + { + "epoch": 0.09929111419891194, + "grad_norm": 1.0508333444595337, + "learning_rate": 1.9954927765754195e-05, + "loss": 0.4217, + "step": 1054 + }, + { + "epoch": 0.09938531829207979, + "grad_norm": 0.9513484239578247, + "learning_rate": 1.995478444607566e-05, + "loss": 0.3977, + "step": 1055 + }, + { + "epoch": 0.09947952238524764, + "grad_norm": 1.0911073684692383, + "learning_rate": 1.9954640899412645e-05, + "loss": 0.4627, + "step": 1056 + }, + { + "epoch": 0.09957372647841549, + "grad_norm": 1.0177876949310303, + "learning_rate": 1.9954497125768423e-05, + "loss": 0.4408, + "step": 1057 + }, + { + "epoch": 0.09966793057158334, + "grad_norm": 0.9709005951881409, + "learning_rate": 1.9954353125146273e-05, + "loss": 0.453, + "step": 1058 + }, + { + "epoch": 0.09976213466475119, + "grad_norm": 0.9255178570747375, + "learning_rate": 1.9954208897549475e-05, + "loss": 0.3988, + "step": 1059 + }, + { + "epoch": 0.09985633875791904, + "grad_norm": 0.8608093857765198, + "learning_rate": 1.995406444298132e-05, + "loss": 0.3715, + "step": 1060 + }, + { + "epoch": 0.09995054285108688, + "grad_norm": 1.0226445198059082, + "learning_rate": 1.99539197614451e-05, + "loss": 0.3748, + "step": 1061 + }, + { + "epoch": 0.10004474694425473, + "grad_norm": 1.0217959880828857, + "learning_rate": 1.995377485294412e-05, + "loss": 0.4088, + "step": 1062 + }, + { + "epoch": 0.10013895103742258, + "grad_norm": 1.042251467704773, + "learning_rate": 1.9953629717481675e-05, + "loss": 0.4356, + "step": 1063 + }, + { + "epoch": 0.10023315513059042, + "grad_norm": 0.9148966670036316, + "learning_rate": 1.9953484355061078e-05, + "loss": 0.358, + "step": 1064 + }, + { + "epoch": 0.10032735922375827, + "grad_norm": 0.8217335343360901, + "learning_rate": 1.995333876568565e-05, + "loss": 0.3864, + "step": 1065 + }, + { + "epoch": 0.10042156331692612, + "grad_norm": 1.1984763145446777, + "learning_rate": 1.99531929493587e-05, + "loss": 0.4146, + "step": 1066 + }, + { + "epoch": 0.10051576741009396, + "grad_norm": 1.2049845457077026, + "learning_rate": 1.995304690608356e-05, + "loss": 0.4345, + "step": 1067 + }, + { + "epoch": 0.10060997150326181, + "grad_norm": 1.0960497856140137, + "learning_rate": 1.9952900635863558e-05, + "loss": 0.4823, + "step": 1068 + }, + { + "epoch": 0.10070417559642966, + "grad_norm": 0.9735412001609802, + "learning_rate": 1.9952754138702025e-05, + "loss": 0.4458, + "step": 1069 + }, + { + "epoch": 0.10079837968959751, + "grad_norm": 1.1528902053833008, + "learning_rate": 1.995260741460231e-05, + "loss": 0.4443, + "step": 1070 + }, + { + "epoch": 0.10089258378276536, + "grad_norm": 0.9167768359184265, + "learning_rate": 1.9952460463567752e-05, + "loss": 0.379, + "step": 1071 + }, + { + "epoch": 0.10098678787593321, + "grad_norm": 1.1124496459960938, + "learning_rate": 1.9952313285601706e-05, + "loss": 0.4275, + "step": 1072 + }, + { + "epoch": 0.10108099196910106, + "grad_norm": 0.8633842468261719, + "learning_rate": 1.9952165880707524e-05, + "loss": 0.3647, + "step": 1073 + }, + { + "epoch": 0.10117519606226891, + "grad_norm": 0.9196866750717163, + "learning_rate": 1.9952018248888567e-05, + "loss": 0.3929, + "step": 1074 + }, + { + "epoch": 0.10126940015543676, + "grad_norm": 0.9396743774414062, + "learning_rate": 1.9951870390148206e-05, + "loss": 0.4321, + "step": 1075 + }, + { + "epoch": 0.1013636042486046, + "grad_norm": 1.0641164779663086, + "learning_rate": 1.9951722304489806e-05, + "loss": 0.4499, + "step": 1076 + }, + { + "epoch": 0.10145780834177245, + "grad_norm": 0.9184996485710144, + "learning_rate": 1.9951573991916747e-05, + "loss": 0.4231, + "step": 1077 + }, + { + "epoch": 0.1015520124349403, + "grad_norm": 1.0018250942230225, + "learning_rate": 1.9951425452432415e-05, + "loss": 0.393, + "step": 1078 + }, + { + "epoch": 0.10164621652810815, + "grad_norm": 1.0571403503417969, + "learning_rate": 1.9951276686040188e-05, + "loss": 0.402, + "step": 1079 + }, + { + "epoch": 0.101740420621276, + "grad_norm": 0.9825863242149353, + "learning_rate": 1.9951127692743463e-05, + "loss": 0.4146, + "step": 1080 + }, + { + "epoch": 0.10183462471444384, + "grad_norm": 0.8631836771965027, + "learning_rate": 1.995097847254564e-05, + "loss": 0.3609, + "step": 1081 + }, + { + "epoch": 0.10192882880761168, + "grad_norm": 0.9502719044685364, + "learning_rate": 1.9950829025450116e-05, + "loss": 0.4387, + "step": 1082 + }, + { + "epoch": 0.10202303290077953, + "grad_norm": 1.0838639736175537, + "learning_rate": 1.9950679351460304e-05, + "loss": 0.4284, + "step": 1083 + }, + { + "epoch": 0.10211723699394738, + "grad_norm": 1.0188603401184082, + "learning_rate": 1.9950529450579607e-05, + "loss": 0.466, + "step": 1084 + }, + { + "epoch": 0.10221144108711523, + "grad_norm": 0.9595416188240051, + "learning_rate": 1.9950379322811456e-05, + "loss": 0.4428, + "step": 1085 + }, + { + "epoch": 0.10230564518028308, + "grad_norm": 0.9125534296035767, + "learning_rate": 1.9950228968159263e-05, + "loss": 0.3809, + "step": 1086 + }, + { + "epoch": 0.10239984927345093, + "grad_norm": 1.0977569818496704, + "learning_rate": 1.9950078386626465e-05, + "loss": 0.4142, + "step": 1087 + }, + { + "epoch": 0.10249405336661878, + "grad_norm": 1.0646089315414429, + "learning_rate": 1.994992757821649e-05, + "loss": 0.3855, + "step": 1088 + }, + { + "epoch": 0.10258825745978663, + "grad_norm": 1.055989146232605, + "learning_rate": 1.994977654293278e-05, + "loss": 0.4116, + "step": 1089 + }, + { + "epoch": 0.10268246155295448, + "grad_norm": 1.0687748193740845, + "learning_rate": 1.994962528077878e-05, + "loss": 0.4367, + "step": 1090 + }, + { + "epoch": 0.10277666564612233, + "grad_norm": 0.9160301089286804, + "learning_rate": 1.994947379175793e-05, + "loss": 0.4101, + "step": 1091 + }, + { + "epoch": 0.10287086973929017, + "grad_norm": 0.9010409116744995, + "learning_rate": 1.994932207587369e-05, + "loss": 0.4078, + "step": 1092 + }, + { + "epoch": 0.10296507383245802, + "grad_norm": 1.0180050134658813, + "learning_rate": 1.9949170133129524e-05, + "loss": 0.4309, + "step": 1093 + }, + { + "epoch": 0.10305927792562587, + "grad_norm": 1.0494129657745361, + "learning_rate": 1.9949017963528893e-05, + "loss": 0.419, + "step": 1094 + }, + { + "epoch": 0.10315348201879372, + "grad_norm": 0.9077717661857605, + "learning_rate": 1.9948865567075262e-05, + "loss": 0.4143, + "step": 1095 + }, + { + "epoch": 0.10324768611196157, + "grad_norm": 1.0535287857055664, + "learning_rate": 1.994871294377211e-05, + "loss": 0.436, + "step": 1096 + }, + { + "epoch": 0.10334189020512942, + "grad_norm": 0.9463797807693481, + "learning_rate": 1.994856009362292e-05, + "loss": 0.4422, + "step": 1097 + }, + { + "epoch": 0.10343609429829725, + "grad_norm": 0.9908978343009949, + "learning_rate": 1.994840701663117e-05, + "loss": 0.4177, + "step": 1098 + }, + { + "epoch": 0.1035302983914651, + "grad_norm": 0.9611839652061462, + "learning_rate": 1.9948253712800358e-05, + "loss": 0.3773, + "step": 1099 + }, + { + "epoch": 0.10362450248463295, + "grad_norm": 1.0818508863449097, + "learning_rate": 1.9948100182133977e-05, + "loss": 0.4429, + "step": 1100 + }, + { + "epoch": 0.1037187065778008, + "grad_norm": 0.9528419375419617, + "learning_rate": 1.9947946424635524e-05, + "loss": 0.3939, + "step": 1101 + }, + { + "epoch": 0.10381291067096865, + "grad_norm": 1.0397058725357056, + "learning_rate": 1.994779244030851e-05, + "loss": 0.4457, + "step": 1102 + }, + { + "epoch": 0.1039071147641365, + "grad_norm": 0.9053099155426025, + "learning_rate": 1.9947638229156442e-05, + "loss": 0.4193, + "step": 1103 + }, + { + "epoch": 0.10400131885730435, + "grad_norm": 0.8966038823127747, + "learning_rate": 1.994748379118284e-05, + "loss": 0.41, + "step": 1104 + }, + { + "epoch": 0.1040955229504722, + "grad_norm": 1.0802412033081055, + "learning_rate": 1.9947329126391218e-05, + "loss": 0.442, + "step": 1105 + }, + { + "epoch": 0.10418972704364005, + "grad_norm": 0.9492148756980896, + "learning_rate": 1.9947174234785115e-05, + "loss": 0.393, + "step": 1106 + }, + { + "epoch": 0.1042839311368079, + "grad_norm": 0.9896593689918518, + "learning_rate": 1.9947019116368052e-05, + "loss": 0.4653, + "step": 1107 + }, + { + "epoch": 0.10437813522997574, + "grad_norm": 1.1006875038146973, + "learning_rate": 1.9946863771143568e-05, + "loss": 0.3914, + "step": 1108 + }, + { + "epoch": 0.10447233932314359, + "grad_norm": 0.8899286389350891, + "learning_rate": 1.994670819911521e-05, + "loss": 0.3807, + "step": 1109 + }, + { + "epoch": 0.10456654341631144, + "grad_norm": 0.9169561266899109, + "learning_rate": 1.9946552400286526e-05, + "loss": 0.3909, + "step": 1110 + }, + { + "epoch": 0.10466074750947929, + "grad_norm": 0.9260783791542053, + "learning_rate": 1.994639637466106e-05, + "loss": 0.4816, + "step": 1111 + }, + { + "epoch": 0.10475495160264714, + "grad_norm": 0.8580223321914673, + "learning_rate": 1.9946240122242374e-05, + "loss": 0.3875, + "step": 1112 + }, + { + "epoch": 0.10484915569581499, + "grad_norm": 0.9351666569709778, + "learning_rate": 1.9946083643034032e-05, + "loss": 0.4119, + "step": 1113 + }, + { + "epoch": 0.10494335978898284, + "grad_norm": 0.96866774559021, + "learning_rate": 1.9945926937039603e-05, + "loss": 0.4657, + "step": 1114 + }, + { + "epoch": 0.10503756388215069, + "grad_norm": 0.9898170232772827, + "learning_rate": 1.9945770004262655e-05, + "loss": 0.4932, + "step": 1115 + }, + { + "epoch": 0.10513176797531852, + "grad_norm": 0.8722140789031982, + "learning_rate": 1.9945612844706768e-05, + "loss": 0.3971, + "step": 1116 + }, + { + "epoch": 0.10522597206848637, + "grad_norm": 0.9951531887054443, + "learning_rate": 1.9945455458375533e-05, + "loss": 0.4647, + "step": 1117 + }, + { + "epoch": 0.10532017616165422, + "grad_norm": 0.9879446625709534, + "learning_rate": 1.9945297845272527e-05, + "loss": 0.4687, + "step": 1118 + }, + { + "epoch": 0.10541438025482207, + "grad_norm": 0.9516083598136902, + "learning_rate": 1.9945140005401352e-05, + "loss": 0.4285, + "step": 1119 + }, + { + "epoch": 0.10550858434798992, + "grad_norm": 1.0435234308242798, + "learning_rate": 1.9944981938765603e-05, + "loss": 0.4443, + "step": 1120 + }, + { + "epoch": 0.10560278844115777, + "grad_norm": 0.9984058141708374, + "learning_rate": 1.9944823645368886e-05, + "loss": 0.3867, + "step": 1121 + }, + { + "epoch": 0.10569699253432562, + "grad_norm": 0.9314647912979126, + "learning_rate": 1.994466512521481e-05, + "loss": 0.382, + "step": 1122 + }, + { + "epoch": 0.10579119662749346, + "grad_norm": 1.0670300722122192, + "learning_rate": 1.9944506378306993e-05, + "loss": 0.4376, + "step": 1123 + }, + { + "epoch": 0.10588540072066131, + "grad_norm": 1.037182092666626, + "learning_rate": 1.9944347404649045e-05, + "loss": 0.4423, + "step": 1124 + }, + { + "epoch": 0.10597960481382916, + "grad_norm": 0.9391227960586548, + "learning_rate": 1.9944188204244602e-05, + "loss": 0.3961, + "step": 1125 + }, + { + "epoch": 0.10607380890699701, + "grad_norm": 0.9991692900657654, + "learning_rate": 1.9944028777097286e-05, + "loss": 0.4452, + "step": 1126 + }, + { + "epoch": 0.10616801300016486, + "grad_norm": 1.015335202217102, + "learning_rate": 1.9943869123210736e-05, + "loss": 0.4103, + "step": 1127 + }, + { + "epoch": 0.10626221709333271, + "grad_norm": 0.8520253300666809, + "learning_rate": 1.9943709242588588e-05, + "loss": 0.3964, + "step": 1128 + }, + { + "epoch": 0.10635642118650056, + "grad_norm": 0.991342306137085, + "learning_rate": 1.9943549135234496e-05, + "loss": 0.389, + "step": 1129 + }, + { + "epoch": 0.1064506252796684, + "grad_norm": 1.0495084524154663, + "learning_rate": 1.9943388801152107e-05, + "loss": 0.3862, + "step": 1130 + }, + { + "epoch": 0.10654482937283626, + "grad_norm": 0.965828001499176, + "learning_rate": 1.994322824034507e-05, + "loss": 0.449, + "step": 1131 + }, + { + "epoch": 0.1066390334660041, + "grad_norm": 0.9767307639122009, + "learning_rate": 1.9943067452817056e-05, + "loss": 0.4644, + "step": 1132 + }, + { + "epoch": 0.10673323755917194, + "grad_norm": 1.0747474431991577, + "learning_rate": 1.9942906438571727e-05, + "loss": 0.4394, + "step": 1133 + }, + { + "epoch": 0.10682744165233979, + "grad_norm": 0.8223479390144348, + "learning_rate": 1.994274519761275e-05, + "loss": 0.3783, + "step": 1134 + }, + { + "epoch": 0.10692164574550764, + "grad_norm": 0.959511399269104, + "learning_rate": 1.9942583729943806e-05, + "loss": 0.4156, + "step": 1135 + }, + { + "epoch": 0.10701584983867549, + "grad_norm": 1.038598895072937, + "learning_rate": 1.994242203556858e-05, + "loss": 0.4593, + "step": 1136 + }, + { + "epoch": 0.10711005393184334, + "grad_norm": 0.9544926285743713, + "learning_rate": 1.9942260114490754e-05, + "loss": 0.4334, + "step": 1137 + }, + { + "epoch": 0.10720425802501118, + "grad_norm": 0.9143279194831848, + "learning_rate": 1.9942097966714022e-05, + "loss": 0.392, + "step": 1138 + }, + { + "epoch": 0.10729846211817903, + "grad_norm": 0.9807520508766174, + "learning_rate": 1.994193559224208e-05, + "loss": 0.4353, + "step": 1139 + }, + { + "epoch": 0.10739266621134688, + "grad_norm": 0.9121593832969666, + "learning_rate": 1.994177299107863e-05, + "loss": 0.4215, + "step": 1140 + }, + { + "epoch": 0.10748687030451473, + "grad_norm": 0.9819267988204956, + "learning_rate": 1.9941610163227382e-05, + "loss": 0.436, + "step": 1141 + }, + { + "epoch": 0.10758107439768258, + "grad_norm": 0.9428817629814148, + "learning_rate": 1.9941447108692047e-05, + "loss": 0.4061, + "step": 1142 + }, + { + "epoch": 0.10767527849085043, + "grad_norm": 1.1399716138839722, + "learning_rate": 1.9941283827476344e-05, + "loss": 0.4847, + "step": 1143 + }, + { + "epoch": 0.10776948258401828, + "grad_norm": 0.9605879187583923, + "learning_rate": 1.9941120319583995e-05, + "loss": 0.4232, + "step": 1144 + }, + { + "epoch": 0.10786368667718613, + "grad_norm": 0.9342689514160156, + "learning_rate": 1.994095658501873e-05, + "loss": 0.3978, + "step": 1145 + }, + { + "epoch": 0.10795789077035398, + "grad_norm": 0.903514564037323, + "learning_rate": 1.9940792623784277e-05, + "loss": 0.3758, + "step": 1146 + }, + { + "epoch": 0.10805209486352182, + "grad_norm": 0.8668076395988464, + "learning_rate": 1.9940628435884378e-05, + "loss": 0.3828, + "step": 1147 + }, + { + "epoch": 0.10814629895668967, + "grad_norm": 0.8557491898536682, + "learning_rate": 1.994046402132278e-05, + "loss": 0.4151, + "step": 1148 + }, + { + "epoch": 0.10824050304985752, + "grad_norm": 0.991773784160614, + "learning_rate": 1.9940299380103226e-05, + "loss": 0.4073, + "step": 1149 + }, + { + "epoch": 0.10833470714302536, + "grad_norm": 0.9056696891784668, + "learning_rate": 1.994013451222948e-05, + "loss": 0.3922, + "step": 1150 + }, + { + "epoch": 0.1084289112361932, + "grad_norm": 0.8937355279922485, + "learning_rate": 1.9939969417705286e-05, + "loss": 0.452, + "step": 1151 + }, + { + "epoch": 0.10852311532936106, + "grad_norm": 0.9577783942222595, + "learning_rate": 1.993980409653442e-05, + "loss": 0.446, + "step": 1152 + }, + { + "epoch": 0.1086173194225289, + "grad_norm": 0.8804211616516113, + "learning_rate": 1.993963854872065e-05, + "loss": 0.3511, + "step": 1153 + }, + { + "epoch": 0.10871152351569675, + "grad_norm": 0.9269177913665771, + "learning_rate": 1.993947277426775e-05, + "loss": 0.448, + "step": 1154 + }, + { + "epoch": 0.1088057276088646, + "grad_norm": 0.934552788734436, + "learning_rate": 1.9939306773179498e-05, + "loss": 0.4598, + "step": 1155 + }, + { + "epoch": 0.10889993170203245, + "grad_norm": 0.8715839385986328, + "learning_rate": 1.9939140545459677e-05, + "loss": 0.4361, + "step": 1156 + }, + { + "epoch": 0.1089941357952003, + "grad_norm": 0.9728548526763916, + "learning_rate": 1.9938974091112084e-05, + "loss": 0.468, + "step": 1157 + }, + { + "epoch": 0.10908833988836815, + "grad_norm": 1.1369495391845703, + "learning_rate": 1.993880741014051e-05, + "loss": 0.4311, + "step": 1158 + }, + { + "epoch": 0.109182543981536, + "grad_norm": 0.9297184944152832, + "learning_rate": 1.9938640502548753e-05, + "loss": 0.417, + "step": 1159 + }, + { + "epoch": 0.10927674807470385, + "grad_norm": 1.0212570428848267, + "learning_rate": 1.9938473368340627e-05, + "loss": 0.4747, + "step": 1160 + }, + { + "epoch": 0.1093709521678717, + "grad_norm": 0.9136896133422852, + "learning_rate": 1.9938306007519936e-05, + "loss": 0.4211, + "step": 1161 + }, + { + "epoch": 0.10946515626103955, + "grad_norm": 1.049403429031372, + "learning_rate": 1.9938138420090502e-05, + "loss": 0.4049, + "step": 1162 + }, + { + "epoch": 0.1095593603542074, + "grad_norm": 0.8913701772689819, + "learning_rate": 1.9937970606056135e-05, + "loss": 0.3839, + "step": 1163 + }, + { + "epoch": 0.10965356444737524, + "grad_norm": 0.8904287219047546, + "learning_rate": 1.9937802565420675e-05, + "loss": 0.4361, + "step": 1164 + }, + { + "epoch": 0.10974776854054309, + "grad_norm": 0.9403328895568848, + "learning_rate": 1.9937634298187944e-05, + "loss": 0.4217, + "step": 1165 + }, + { + "epoch": 0.10984197263371094, + "grad_norm": 0.924608051776886, + "learning_rate": 1.9937465804361783e-05, + "loss": 0.3998, + "step": 1166 + }, + { + "epoch": 0.10993617672687878, + "grad_norm": 0.9521637558937073, + "learning_rate": 1.9937297083946032e-05, + "loss": 0.4222, + "step": 1167 + }, + { + "epoch": 0.11003038082004662, + "grad_norm": 0.9766367077827454, + "learning_rate": 1.9937128136944542e-05, + "loss": 0.4144, + "step": 1168 + }, + { + "epoch": 0.11012458491321447, + "grad_norm": 0.804704487323761, + "learning_rate": 1.993695896336116e-05, + "loss": 0.4075, + "step": 1169 + }, + { + "epoch": 0.11021878900638232, + "grad_norm": 0.9180735349655151, + "learning_rate": 1.9936789563199747e-05, + "loss": 0.422, + "step": 1170 + }, + { + "epoch": 0.11031299309955017, + "grad_norm": 1.005083441734314, + "learning_rate": 1.9936619936464163e-05, + "loss": 0.4462, + "step": 1171 + }, + { + "epoch": 0.11040719719271802, + "grad_norm": 0.8922913074493408, + "learning_rate": 1.9936450083158277e-05, + "loss": 0.3895, + "step": 1172 + }, + { + "epoch": 0.11050140128588587, + "grad_norm": 0.9868902564048767, + "learning_rate": 1.9936280003285966e-05, + "loss": 0.4011, + "step": 1173 + }, + { + "epoch": 0.11059560537905372, + "grad_norm": 1.131201148033142, + "learning_rate": 1.99361096968511e-05, + "loss": 0.4347, + "step": 1174 + }, + { + "epoch": 0.11068980947222157, + "grad_norm": 0.8656303882598877, + "learning_rate": 1.9935939163857568e-05, + "loss": 0.3485, + "step": 1175 + }, + { + "epoch": 0.11078401356538942, + "grad_norm": 0.9164494276046753, + "learning_rate": 1.993576840430926e-05, + "loss": 0.3497, + "step": 1176 + }, + { + "epoch": 0.11087821765855727, + "grad_norm": 0.8910269737243652, + "learning_rate": 1.993559741821006e-05, + "loss": 0.3892, + "step": 1177 + }, + { + "epoch": 0.11097242175172511, + "grad_norm": 1.0199134349822998, + "learning_rate": 1.9935426205563878e-05, + "loss": 0.4074, + "step": 1178 + }, + { + "epoch": 0.11106662584489296, + "grad_norm": 0.9422849416732788, + "learning_rate": 1.993525476637461e-05, + "loss": 0.4027, + "step": 1179 + }, + { + "epoch": 0.11116082993806081, + "grad_norm": 0.8377765417098999, + "learning_rate": 1.993508310064617e-05, + "loss": 0.3536, + "step": 1180 + }, + { + "epoch": 0.11125503403122866, + "grad_norm": 0.998676598072052, + "learning_rate": 1.993491120838247e-05, + "loss": 0.3833, + "step": 1181 + }, + { + "epoch": 0.11134923812439651, + "grad_norm": 1.2441054582595825, + "learning_rate": 1.993473908958743e-05, + "loss": 0.3946, + "step": 1182 + }, + { + "epoch": 0.11144344221756436, + "grad_norm": 1.2046563625335693, + "learning_rate": 1.9934566744264975e-05, + "loss": 0.4182, + "step": 1183 + }, + { + "epoch": 0.11153764631073221, + "grad_norm": 0.8576118350028992, + "learning_rate": 1.9934394172419032e-05, + "loss": 0.382, + "step": 1184 + }, + { + "epoch": 0.11163185040390004, + "grad_norm": 0.9637244343757629, + "learning_rate": 1.9934221374053538e-05, + "loss": 0.4097, + "step": 1185 + }, + { + "epoch": 0.11172605449706789, + "grad_norm": 1.0179036855697632, + "learning_rate": 1.9934048349172433e-05, + "loss": 0.4023, + "step": 1186 + }, + { + "epoch": 0.11182025859023574, + "grad_norm": 1.0819107294082642, + "learning_rate": 1.9933875097779665e-05, + "loss": 0.4271, + "step": 1187 + }, + { + "epoch": 0.11191446268340359, + "grad_norm": 0.960874617099762, + "learning_rate": 1.9933701619879183e-05, + "loss": 0.3348, + "step": 1188 + }, + { + "epoch": 0.11200866677657144, + "grad_norm": 0.9459748268127441, + "learning_rate": 1.9933527915474936e-05, + "loss": 0.4009, + "step": 1189 + }, + { + "epoch": 0.11210287086973929, + "grad_norm": 0.941529393196106, + "learning_rate": 1.9933353984570894e-05, + "loss": 0.3894, + "step": 1190 + }, + { + "epoch": 0.11219707496290714, + "grad_norm": 0.9069638252258301, + "learning_rate": 1.9933179827171017e-05, + "loss": 0.4032, + "step": 1191 + }, + { + "epoch": 0.11229127905607499, + "grad_norm": 0.8663362264633179, + "learning_rate": 1.9933005443279278e-05, + "loss": 0.3971, + "step": 1192 + }, + { + "epoch": 0.11238548314924283, + "grad_norm": 1.1562236547470093, + "learning_rate": 1.9932830832899656e-05, + "loss": 0.4631, + "step": 1193 + }, + { + "epoch": 0.11247968724241068, + "grad_norm": 0.8705164790153503, + "learning_rate": 1.993265599603613e-05, + "loss": 0.4337, + "step": 1194 + }, + { + "epoch": 0.11257389133557853, + "grad_norm": 1.0312925577163696, + "learning_rate": 1.9932480932692682e-05, + "loss": 0.4298, + "step": 1195 + }, + { + "epoch": 0.11266809542874638, + "grad_norm": 0.9345729947090149, + "learning_rate": 1.993230564287331e-05, + "loss": 0.3955, + "step": 1196 + }, + { + "epoch": 0.11276229952191423, + "grad_norm": 0.9218775629997253, + "learning_rate": 1.9932130126582007e-05, + "loss": 0.4478, + "step": 1197 + }, + { + "epoch": 0.11285650361508208, + "grad_norm": 0.8155993223190308, + "learning_rate": 1.9931954383822777e-05, + "loss": 0.358, + "step": 1198 + }, + { + "epoch": 0.11295070770824993, + "grad_norm": 0.9408160448074341, + "learning_rate": 1.993177841459963e-05, + "loss": 0.4146, + "step": 1199 + }, + { + "epoch": 0.11304491180141778, + "grad_norm": 0.8760764598846436, + "learning_rate": 1.9931602218916573e-05, + "loss": 0.4375, + "step": 1200 + }, + { + "epoch": 0.11313911589458563, + "grad_norm": 0.8879175782203674, + "learning_rate": 1.9931425796777627e-05, + "loss": 0.4248, + "step": 1201 + }, + { + "epoch": 0.11323331998775346, + "grad_norm": 1.0035958290100098, + "learning_rate": 1.9931249148186812e-05, + "loss": 0.4354, + "step": 1202 + }, + { + "epoch": 0.11332752408092131, + "grad_norm": 1.1458284854888916, + "learning_rate": 1.993107227314816e-05, + "loss": 0.4596, + "step": 1203 + }, + { + "epoch": 0.11342172817408916, + "grad_norm": 0.9176868796348572, + "learning_rate": 1.9930895171665696e-05, + "loss": 0.425, + "step": 1204 + }, + { + "epoch": 0.11351593226725701, + "grad_norm": 1.0198174715042114, + "learning_rate": 1.993071784374347e-05, + "loss": 0.4721, + "step": 1205 + }, + { + "epoch": 0.11361013636042486, + "grad_norm": 0.8922477960586548, + "learning_rate": 1.9930540289385518e-05, + "loss": 0.405, + "step": 1206 + }, + { + "epoch": 0.1137043404535927, + "grad_norm": 0.8916482329368591, + "learning_rate": 1.9930362508595886e-05, + "loss": 0.3866, + "step": 1207 + }, + { + "epoch": 0.11379854454676055, + "grad_norm": 0.8731459379196167, + "learning_rate": 1.9930184501378633e-05, + "loss": 0.4242, + "step": 1208 + }, + { + "epoch": 0.1138927486399284, + "grad_norm": 1.0246661901474, + "learning_rate": 1.9930006267737815e-05, + "loss": 0.4246, + "step": 1209 + }, + { + "epoch": 0.11398695273309625, + "grad_norm": 0.9761861562728882, + "learning_rate": 1.99298278076775e-05, + "loss": 0.4086, + "step": 1210 + }, + { + "epoch": 0.1140811568262641, + "grad_norm": 0.8512738347053528, + "learning_rate": 1.9929649121201752e-05, + "loss": 0.3949, + "step": 1211 + }, + { + "epoch": 0.11417536091943195, + "grad_norm": 0.998048722743988, + "learning_rate": 1.992947020831465e-05, + "loss": 0.422, + "step": 1212 + }, + { + "epoch": 0.1142695650125998, + "grad_norm": 0.9329831004142761, + "learning_rate": 1.9929291069020267e-05, + "loss": 0.3685, + "step": 1213 + }, + { + "epoch": 0.11436376910576765, + "grad_norm": 0.9052364230155945, + "learning_rate": 1.9929111703322693e-05, + "loss": 0.3486, + "step": 1214 + }, + { + "epoch": 0.1144579731989355, + "grad_norm": 1.0009113550186157, + "learning_rate": 1.992893211122602e-05, + "loss": 0.4671, + "step": 1215 + }, + { + "epoch": 0.11455217729210335, + "grad_norm": 0.9874580502510071, + "learning_rate": 1.9928752292734336e-05, + "loss": 0.4029, + "step": 1216 + }, + { + "epoch": 0.1146463813852712, + "grad_norm": 0.9014765024185181, + "learning_rate": 1.9928572247851745e-05, + "loss": 0.4142, + "step": 1217 + }, + { + "epoch": 0.11474058547843904, + "grad_norm": 0.9365810751914978, + "learning_rate": 1.992839197658235e-05, + "loss": 0.446, + "step": 1218 + }, + { + "epoch": 0.11483478957160688, + "grad_norm": 0.863187849521637, + "learning_rate": 1.9928211478930267e-05, + "loss": 0.3845, + "step": 1219 + }, + { + "epoch": 0.11492899366477473, + "grad_norm": 0.8800891637802124, + "learning_rate": 1.9928030754899607e-05, + "loss": 0.3981, + "step": 1220 + }, + { + "epoch": 0.11502319775794258, + "grad_norm": 0.8876266479492188, + "learning_rate": 1.9927849804494492e-05, + "loss": 0.3657, + "step": 1221 + }, + { + "epoch": 0.11511740185111043, + "grad_norm": 0.8542340397834778, + "learning_rate": 1.992766862771905e-05, + "loss": 0.3308, + "step": 1222 + }, + { + "epoch": 0.11521160594427828, + "grad_norm": 1.0105453729629517, + "learning_rate": 1.9927487224577402e-05, + "loss": 0.4347, + "step": 1223 + }, + { + "epoch": 0.11530581003744612, + "grad_norm": 0.885009765625, + "learning_rate": 1.99273055950737e-05, + "loss": 0.4144, + "step": 1224 + }, + { + "epoch": 0.11540001413061397, + "grad_norm": 0.9243080615997314, + "learning_rate": 1.9927123739212074e-05, + "loss": 0.3689, + "step": 1225 + }, + { + "epoch": 0.11549421822378182, + "grad_norm": 0.9319286346435547, + "learning_rate": 1.9926941656996673e-05, + "loss": 0.3817, + "step": 1226 + }, + { + "epoch": 0.11558842231694967, + "grad_norm": 0.8860958218574524, + "learning_rate": 1.9926759348431653e-05, + "loss": 0.4033, + "step": 1227 + }, + { + "epoch": 0.11568262641011752, + "grad_norm": 0.8765958547592163, + "learning_rate": 1.9926576813521167e-05, + "loss": 0.389, + "step": 1228 + }, + { + "epoch": 0.11577683050328537, + "grad_norm": 0.9463694095611572, + "learning_rate": 1.9926394052269376e-05, + "loss": 0.4109, + "step": 1229 + }, + { + "epoch": 0.11587103459645322, + "grad_norm": 0.9759585857391357, + "learning_rate": 1.992621106468045e-05, + "loss": 0.4117, + "step": 1230 + }, + { + "epoch": 0.11596523868962107, + "grad_norm": 0.9568681716918945, + "learning_rate": 1.9926027850758563e-05, + "loss": 0.4413, + "step": 1231 + }, + { + "epoch": 0.11605944278278892, + "grad_norm": 0.9808611273765564, + "learning_rate": 1.992584441050789e-05, + "loss": 0.4286, + "step": 1232 + }, + { + "epoch": 0.11615364687595676, + "grad_norm": 0.9273642301559448, + "learning_rate": 1.992566074393261e-05, + "loss": 0.4147, + "step": 1233 + }, + { + "epoch": 0.11624785096912461, + "grad_norm": 0.9518542289733887, + "learning_rate": 1.9925476851036918e-05, + "loss": 0.4341, + "step": 1234 + }, + { + "epoch": 0.11634205506229246, + "grad_norm": 0.9704899191856384, + "learning_rate": 1.9925292731825e-05, + "loss": 0.485, + "step": 1235 + }, + { + "epoch": 0.11643625915546031, + "grad_norm": 0.8993477821350098, + "learning_rate": 1.9925108386301063e-05, + "loss": 0.4007, + "step": 1236 + }, + { + "epoch": 0.11653046324862815, + "grad_norm": 1.0151928663253784, + "learning_rate": 1.99249238144693e-05, + "loss": 0.4211, + "step": 1237 + }, + { + "epoch": 0.116624667341796, + "grad_norm": 1.005807876586914, + "learning_rate": 1.992473901633393e-05, + "loss": 0.4874, + "step": 1238 + }, + { + "epoch": 0.11671887143496384, + "grad_norm": 1.0199459791183472, + "learning_rate": 1.992455399189916e-05, + "loss": 0.4673, + "step": 1239 + }, + { + "epoch": 0.1168130755281317, + "grad_norm": 0.9317260980606079, + "learning_rate": 1.992436874116921e-05, + "loss": 0.4715, + "step": 1240 + }, + { + "epoch": 0.11690727962129954, + "grad_norm": 0.9165276885032654, + "learning_rate": 1.9924183264148304e-05, + "loss": 0.3305, + "step": 1241 + }, + { + "epoch": 0.11700148371446739, + "grad_norm": 0.8516448140144348, + "learning_rate": 1.992399756084067e-05, + "loss": 0.3999, + "step": 1242 + }, + { + "epoch": 0.11709568780763524, + "grad_norm": 0.888482391834259, + "learning_rate": 1.9923811631250546e-05, + "loss": 0.4148, + "step": 1243 + }, + { + "epoch": 0.11718989190080309, + "grad_norm": 1.3115497827529907, + "learning_rate": 1.9923625475382166e-05, + "loss": 0.4154, + "step": 1244 + }, + { + "epoch": 0.11728409599397094, + "grad_norm": 0.8743381500244141, + "learning_rate": 1.9923439093239784e-05, + "loss": 0.3939, + "step": 1245 + }, + { + "epoch": 0.11737830008713879, + "grad_norm": 0.9198361039161682, + "learning_rate": 1.992325248482764e-05, + "loss": 0.4033, + "step": 1246 + }, + { + "epoch": 0.11747250418030664, + "grad_norm": 1.0421661138534546, + "learning_rate": 1.9923065650149995e-05, + "loss": 0.4425, + "step": 1247 + }, + { + "epoch": 0.11756670827347449, + "grad_norm": 0.9926596879959106, + "learning_rate": 1.9922878589211102e-05, + "loss": 0.4102, + "step": 1248 + }, + { + "epoch": 0.11766091236664233, + "grad_norm": 0.9322739243507385, + "learning_rate": 1.9922691302015232e-05, + "loss": 0.4311, + "step": 1249 + }, + { + "epoch": 0.11775511645981018, + "grad_norm": 0.9065204858779907, + "learning_rate": 1.992250378856666e-05, + "loss": 0.43, + "step": 1250 + }, + { + "epoch": 0.11784932055297803, + "grad_norm": 0.8456649780273438, + "learning_rate": 1.9922316048869652e-05, + "loss": 0.3513, + "step": 1251 + }, + { + "epoch": 0.11794352464614588, + "grad_norm": 0.891179621219635, + "learning_rate": 1.9922128082928497e-05, + "loss": 0.3792, + "step": 1252 + }, + { + "epoch": 0.11803772873931373, + "grad_norm": 0.9345036745071411, + "learning_rate": 1.992193989074747e-05, + "loss": 0.4186, + "step": 1253 + }, + { + "epoch": 0.11813193283248156, + "grad_norm": 0.9004818201065063, + "learning_rate": 1.9921751472330873e-05, + "loss": 0.433, + "step": 1254 + }, + { + "epoch": 0.11822613692564941, + "grad_norm": 0.91129070520401, + "learning_rate": 1.9921562827683e-05, + "loss": 0.3933, + "step": 1255 + }, + { + "epoch": 0.11832034101881726, + "grad_norm": 0.9426426887512207, + "learning_rate": 1.9921373956808144e-05, + "loss": 0.4362, + "step": 1256 + }, + { + "epoch": 0.11841454511198511, + "grad_norm": 1.0422171354293823, + "learning_rate": 1.9921184859710626e-05, + "loss": 0.4173, + "step": 1257 + }, + { + "epoch": 0.11850874920515296, + "grad_norm": 1.0042086839675903, + "learning_rate": 1.9920995536394745e-05, + "loss": 0.4088, + "step": 1258 + }, + { + "epoch": 0.11860295329832081, + "grad_norm": 1.1132287979125977, + "learning_rate": 1.9920805986864823e-05, + "loss": 0.3819, + "step": 1259 + }, + { + "epoch": 0.11869715739148866, + "grad_norm": 1.0502781867980957, + "learning_rate": 1.9920616211125185e-05, + "loss": 0.4934, + "step": 1260 + }, + { + "epoch": 0.11879136148465651, + "grad_norm": 0.9304486513137817, + "learning_rate": 1.992042620918015e-05, + "loss": 0.3992, + "step": 1261 + }, + { + "epoch": 0.11888556557782436, + "grad_norm": 0.871278703212738, + "learning_rate": 1.9920235981034056e-05, + "loss": 0.3653, + "step": 1262 + }, + { + "epoch": 0.1189797696709922, + "grad_norm": 0.9891937375068665, + "learning_rate": 1.9920045526691245e-05, + "loss": 0.4249, + "step": 1263 + }, + { + "epoch": 0.11907397376416005, + "grad_norm": 0.8685119152069092, + "learning_rate": 1.9919854846156048e-05, + "loss": 0.3826, + "step": 1264 + }, + { + "epoch": 0.1191681778573279, + "grad_norm": 0.8814318180084229, + "learning_rate": 1.9919663939432824e-05, + "loss": 0.3882, + "step": 1265 + }, + { + "epoch": 0.11926238195049575, + "grad_norm": 0.7924103140830994, + "learning_rate": 1.9919472806525915e-05, + "loss": 0.3739, + "step": 1266 + }, + { + "epoch": 0.1193565860436636, + "grad_norm": 1.0052324533462524, + "learning_rate": 1.991928144743969e-05, + "loss": 0.4225, + "step": 1267 + }, + { + "epoch": 0.11945079013683145, + "grad_norm": 0.9484582543373108, + "learning_rate": 1.991908986217851e-05, + "loss": 0.4116, + "step": 1268 + }, + { + "epoch": 0.1195449942299993, + "grad_norm": 1.009057879447937, + "learning_rate": 1.9918898050746738e-05, + "loss": 0.4072, + "step": 1269 + }, + { + "epoch": 0.11963919832316715, + "grad_norm": 1.0325987339019775, + "learning_rate": 1.991870601314875e-05, + "loss": 0.4395, + "step": 1270 + }, + { + "epoch": 0.11973340241633498, + "grad_norm": 0.7867721915245056, + "learning_rate": 1.9918513749388925e-05, + "loss": 0.3657, + "step": 1271 + }, + { + "epoch": 0.11982760650950283, + "grad_norm": 0.9199879169464111, + "learning_rate": 1.991832125947165e-05, + "loss": 0.4368, + "step": 1272 + }, + { + "epoch": 0.11992181060267068, + "grad_norm": 0.9976329803466797, + "learning_rate": 1.9918128543401307e-05, + "loss": 0.4094, + "step": 1273 + }, + { + "epoch": 0.12001601469583853, + "grad_norm": 0.886244535446167, + "learning_rate": 1.9917935601182295e-05, + "loss": 0.3622, + "step": 1274 + }, + { + "epoch": 0.12011021878900638, + "grad_norm": 1.0019792318344116, + "learning_rate": 1.9917742432819015e-05, + "loss": 0.4494, + "step": 1275 + }, + { + "epoch": 0.12020442288217423, + "grad_norm": 1.1114577054977417, + "learning_rate": 1.991754903831587e-05, + "loss": 0.524, + "step": 1276 + }, + { + "epoch": 0.12029862697534208, + "grad_norm": 0.8961803913116455, + "learning_rate": 1.9917355417677266e-05, + "loss": 0.3931, + "step": 1277 + }, + { + "epoch": 0.12039283106850993, + "grad_norm": 1.096078634262085, + "learning_rate": 1.9917161570907626e-05, + "loss": 0.4423, + "step": 1278 + }, + { + "epoch": 0.12048703516167777, + "grad_norm": 0.958937406539917, + "learning_rate": 1.991696749801136e-05, + "loss": 0.4306, + "step": 1279 + }, + { + "epoch": 0.12058123925484562, + "grad_norm": 0.8562394380569458, + "learning_rate": 1.99167731989929e-05, + "loss": 0.3854, + "step": 1280 + }, + { + "epoch": 0.12067544334801347, + "grad_norm": 0.9827834367752075, + "learning_rate": 1.9916578673856676e-05, + "loss": 0.3733, + "step": 1281 + }, + { + "epoch": 0.12076964744118132, + "grad_norm": 0.9177426695823669, + "learning_rate": 1.9916383922607122e-05, + "loss": 0.3531, + "step": 1282 + }, + { + "epoch": 0.12086385153434917, + "grad_norm": 1.0187934637069702, + "learning_rate": 1.9916188945248675e-05, + "loss": 0.4429, + "step": 1283 + }, + { + "epoch": 0.12095805562751702, + "grad_norm": 0.9321007132530212, + "learning_rate": 1.9915993741785788e-05, + "loss": 0.3826, + "step": 1284 + }, + { + "epoch": 0.12105225972068487, + "grad_norm": 1.0457597970962524, + "learning_rate": 1.991579831222291e-05, + "loss": 0.4454, + "step": 1285 + }, + { + "epoch": 0.12114646381385272, + "grad_norm": 0.8614075183868408, + "learning_rate": 1.991560265656449e-05, + "loss": 0.3726, + "step": 1286 + }, + { + "epoch": 0.12124066790702057, + "grad_norm": 0.9820986390113831, + "learning_rate": 1.9915406774814995e-05, + "loss": 0.4007, + "step": 1287 + }, + { + "epoch": 0.12133487200018842, + "grad_norm": 0.7843329310417175, + "learning_rate": 1.9915210666978896e-05, + "loss": 0.3486, + "step": 1288 + }, + { + "epoch": 0.12142907609335625, + "grad_norm": 0.8918038010597229, + "learning_rate": 1.9915014333060653e-05, + "loss": 0.4224, + "step": 1289 + }, + { + "epoch": 0.1215232801865241, + "grad_norm": 0.9657085537910461, + "learning_rate": 1.9914817773064756e-05, + "loss": 0.4088, + "step": 1290 + }, + { + "epoch": 0.12161748427969195, + "grad_norm": 0.9171475172042847, + "learning_rate": 1.9914620986995677e-05, + "loss": 0.422, + "step": 1291 + }, + { + "epoch": 0.1217116883728598, + "grad_norm": 0.9751061797142029, + "learning_rate": 1.9914423974857907e-05, + "loss": 0.4084, + "step": 1292 + }, + { + "epoch": 0.12180589246602765, + "grad_norm": 0.8386781811714172, + "learning_rate": 1.9914226736655936e-05, + "loss": 0.368, + "step": 1293 + }, + { + "epoch": 0.1219000965591955, + "grad_norm": 0.8160566091537476, + "learning_rate": 1.9914029272394265e-05, + "loss": 0.4005, + "step": 1294 + }, + { + "epoch": 0.12199430065236334, + "grad_norm": 0.8406798839569092, + "learning_rate": 1.9913831582077393e-05, + "loss": 0.4013, + "step": 1295 + }, + { + "epoch": 0.12208850474553119, + "grad_norm": 0.9921380877494812, + "learning_rate": 1.991363366570983e-05, + "loss": 0.4594, + "step": 1296 + }, + { + "epoch": 0.12218270883869904, + "grad_norm": 0.8436982035636902, + "learning_rate": 1.9913435523296085e-05, + "loss": 0.3873, + "step": 1297 + }, + { + "epoch": 0.12227691293186689, + "grad_norm": 0.9785180687904358, + "learning_rate": 1.991323715484068e-05, + "loss": 0.4451, + "step": 1298 + }, + { + "epoch": 0.12237111702503474, + "grad_norm": 0.8816764950752258, + "learning_rate": 1.9913038560348135e-05, + "loss": 0.3974, + "step": 1299 + }, + { + "epoch": 0.12246532111820259, + "grad_norm": 0.9945760369300842, + "learning_rate": 1.991283973982298e-05, + "loss": 0.433, + "step": 1300 + }, + { + "epoch": 0.12255952521137044, + "grad_norm": 1.4750216007232666, + "learning_rate": 1.9912640693269754e-05, + "loss": 0.4598, + "step": 1301 + }, + { + "epoch": 0.12265372930453829, + "grad_norm": 0.9504375457763672, + "learning_rate": 1.9912441420692986e-05, + "loss": 0.4003, + "step": 1302 + }, + { + "epoch": 0.12274793339770614, + "grad_norm": 1.042178988456726, + "learning_rate": 1.9912241922097225e-05, + "loss": 0.3946, + "step": 1303 + }, + { + "epoch": 0.12284213749087398, + "grad_norm": 1.08685302734375, + "learning_rate": 1.991204219748702e-05, + "loss": 0.4636, + "step": 1304 + }, + { + "epoch": 0.12293634158404183, + "grad_norm": 0.8524930477142334, + "learning_rate": 1.991184224686692e-05, + "loss": 0.3845, + "step": 1305 + }, + { + "epoch": 0.12303054567720967, + "grad_norm": 0.9859117269515991, + "learning_rate": 1.9911642070241487e-05, + "loss": 0.4568, + "step": 1306 + }, + { + "epoch": 0.12312474977037752, + "grad_norm": 0.8629952669143677, + "learning_rate": 1.991144166761529e-05, + "loss": 0.3693, + "step": 1307 + }, + { + "epoch": 0.12321895386354537, + "grad_norm": 0.9870200157165527, + "learning_rate": 1.9911241038992893e-05, + "loss": 0.4597, + "step": 1308 + }, + { + "epoch": 0.12331315795671322, + "grad_norm": 0.8797679543495178, + "learning_rate": 1.991104018437887e-05, + "loss": 0.3981, + "step": 1309 + }, + { + "epoch": 0.12340736204988106, + "grad_norm": 0.9802530407905579, + "learning_rate": 1.9910839103777805e-05, + "loss": 0.3633, + "step": 1310 + }, + { + "epoch": 0.12350156614304891, + "grad_norm": 0.8832137584686279, + "learning_rate": 1.9910637797194284e-05, + "loss": 0.4296, + "step": 1311 + }, + { + "epoch": 0.12359577023621676, + "grad_norm": 0.992841899394989, + "learning_rate": 1.991043626463289e-05, + "loss": 0.4085, + "step": 1312 + }, + { + "epoch": 0.12368997432938461, + "grad_norm": 1.0332484245300293, + "learning_rate": 1.9910234506098223e-05, + "loss": 0.3891, + "step": 1313 + }, + { + "epoch": 0.12378417842255246, + "grad_norm": 0.883171558380127, + "learning_rate": 1.9910032521594884e-05, + "loss": 0.3858, + "step": 1314 + }, + { + "epoch": 0.12387838251572031, + "grad_norm": 0.92570960521698, + "learning_rate": 1.9909830311127476e-05, + "loss": 0.3472, + "step": 1315 + }, + { + "epoch": 0.12397258660888816, + "grad_norm": 0.9907909035682678, + "learning_rate": 1.9909627874700615e-05, + "loss": 0.4656, + "step": 1316 + }, + { + "epoch": 0.124066790702056, + "grad_norm": 0.9065894484519958, + "learning_rate": 1.990942521231891e-05, + "loss": 0.4012, + "step": 1317 + }, + { + "epoch": 0.12416099479522386, + "grad_norm": 0.8376958966255188, + "learning_rate": 1.9909222323986984e-05, + "loss": 0.4036, + "step": 1318 + }, + { + "epoch": 0.1242551988883917, + "grad_norm": 0.966644287109375, + "learning_rate": 1.9909019209709465e-05, + "loss": 0.4196, + "step": 1319 + }, + { + "epoch": 0.12434940298155955, + "grad_norm": 0.819900393486023, + "learning_rate": 1.990881586949098e-05, + "loss": 0.3805, + "step": 1320 + }, + { + "epoch": 0.1244436070747274, + "grad_norm": 1.0244163274765015, + "learning_rate": 1.9908612303336174e-05, + "loss": 0.475, + "step": 1321 + }, + { + "epoch": 0.12453781116789525, + "grad_norm": 0.8884019255638123, + "learning_rate": 1.9908408511249682e-05, + "loss": 0.3703, + "step": 1322 + }, + { + "epoch": 0.12463201526106309, + "grad_norm": 0.9183163642883301, + "learning_rate": 1.9908204493236153e-05, + "loss": 0.4034, + "step": 1323 + }, + { + "epoch": 0.12472621935423094, + "grad_norm": 0.8690988421440125, + "learning_rate": 1.9908000249300238e-05, + "loss": 0.3972, + "step": 1324 + }, + { + "epoch": 0.12482042344739878, + "grad_norm": 0.9924200177192688, + "learning_rate": 1.990779577944659e-05, + "loss": 0.4268, + "step": 1325 + }, + { + "epoch": 0.12491462754056663, + "grad_norm": 0.9585636854171753, + "learning_rate": 1.9907591083679883e-05, + "loss": 0.3632, + "step": 1326 + }, + { + "epoch": 0.1250088316337345, + "grad_norm": 0.8694257736206055, + "learning_rate": 1.9907386162004775e-05, + "loss": 0.3931, + "step": 1327 + }, + { + "epoch": 0.12510303572690235, + "grad_norm": 0.9315399527549744, + "learning_rate": 1.9907181014425936e-05, + "loss": 0.41, + "step": 1328 + }, + { + "epoch": 0.1251972398200702, + "grad_norm": 0.9038522243499756, + "learning_rate": 1.990697564094805e-05, + "loss": 0.415, + "step": 1329 + }, + { + "epoch": 0.12529144391323804, + "grad_norm": 0.974467933177948, + "learning_rate": 1.99067700415758e-05, + "loss": 0.3798, + "step": 1330 + }, + { + "epoch": 0.1253856480064059, + "grad_norm": 0.9763962030410767, + "learning_rate": 1.990656421631387e-05, + "loss": 0.4399, + "step": 1331 + }, + { + "epoch": 0.1254798520995737, + "grad_norm": 0.8021870851516724, + "learning_rate": 1.9906358165166954e-05, + "loss": 0.3525, + "step": 1332 + }, + { + "epoch": 0.12557405619274156, + "grad_norm": 0.9852421879768372, + "learning_rate": 1.9906151888139753e-05, + "loss": 0.4017, + "step": 1333 + }, + { + "epoch": 0.1256682602859094, + "grad_norm": 0.8635556697845459, + "learning_rate": 1.990594538523697e-05, + "loss": 0.3696, + "step": 1334 + }, + { + "epoch": 0.12576246437907726, + "grad_norm": 0.9297789931297302, + "learning_rate": 1.9905738656463313e-05, + "loss": 0.3839, + "step": 1335 + }, + { + "epoch": 0.1258566684722451, + "grad_norm": 0.8907140493392944, + "learning_rate": 1.990553170182349e-05, + "loss": 0.4279, + "step": 1336 + }, + { + "epoch": 0.12595087256541296, + "grad_norm": 0.9584831595420837, + "learning_rate": 1.990532452132223e-05, + "loss": 0.3897, + "step": 1337 + }, + { + "epoch": 0.1260450766585808, + "grad_norm": 0.9200226664543152, + "learning_rate": 1.990511711496425e-05, + "loss": 0.3503, + "step": 1338 + }, + { + "epoch": 0.12613928075174866, + "grad_norm": 1.0291590690612793, + "learning_rate": 1.9904909482754283e-05, + "loss": 0.4569, + "step": 1339 + }, + { + "epoch": 0.1262334848449165, + "grad_norm": 0.8867980241775513, + "learning_rate": 1.990470162469706e-05, + "loss": 0.3825, + "step": 1340 + }, + { + "epoch": 0.12632768893808435, + "grad_norm": 0.9662911295890808, + "learning_rate": 1.990449354079732e-05, + "loss": 0.4518, + "step": 1341 + }, + { + "epoch": 0.1264218930312522, + "grad_norm": 0.9172998666763306, + "learning_rate": 1.990428523105981e-05, + "loss": 0.4395, + "step": 1342 + }, + { + "epoch": 0.12651609712442005, + "grad_norm": 1.0754560232162476, + "learning_rate": 1.9904076695489282e-05, + "loss": 0.4464, + "step": 1343 + }, + { + "epoch": 0.1266103012175879, + "grad_norm": 0.8089383244514465, + "learning_rate": 1.990386793409049e-05, + "loss": 0.3422, + "step": 1344 + }, + { + "epoch": 0.12670450531075575, + "grad_norm": 0.9732319116592407, + "learning_rate": 1.990365894686819e-05, + "loss": 0.4273, + "step": 1345 + }, + { + "epoch": 0.1267987094039236, + "grad_norm": 0.862170398235321, + "learning_rate": 1.990344973382715e-05, + "loss": 0.3798, + "step": 1346 + }, + { + "epoch": 0.12689291349709145, + "grad_norm": 0.8946678638458252, + "learning_rate": 1.9903240294972138e-05, + "loss": 0.4353, + "step": 1347 + }, + { + "epoch": 0.1269871175902593, + "grad_norm": 1.014513373374939, + "learning_rate": 1.9903030630307937e-05, + "loss": 0.4471, + "step": 1348 + }, + { + "epoch": 0.12708132168342715, + "grad_norm": 1.0029696226119995, + "learning_rate": 1.990282073983932e-05, + "loss": 0.4232, + "step": 1349 + }, + { + "epoch": 0.127175525776595, + "grad_norm": 0.8123729825019836, + "learning_rate": 1.990261062357107e-05, + "loss": 0.3467, + "step": 1350 + }, + { + "epoch": 0.12726972986976284, + "grad_norm": 0.9378588795661926, + "learning_rate": 1.9902400281507986e-05, + "loss": 0.3825, + "step": 1351 + }, + { + "epoch": 0.1273639339629307, + "grad_norm": 0.8797309398651123, + "learning_rate": 1.9902189713654864e-05, + "loss": 0.3793, + "step": 1352 + }, + { + "epoch": 0.12745813805609854, + "grad_norm": 0.9510934948921204, + "learning_rate": 1.99019789200165e-05, + "loss": 0.4268, + "step": 1353 + }, + { + "epoch": 0.1275523421492664, + "grad_norm": 1.120252013206482, + "learning_rate": 1.9901767900597704e-05, + "loss": 0.395, + "step": 1354 + }, + { + "epoch": 0.12764654624243424, + "grad_norm": 0.8084336519241333, + "learning_rate": 1.9901556655403285e-05, + "loss": 0.3601, + "step": 1355 + }, + { + "epoch": 0.1277407503356021, + "grad_norm": 0.8959628939628601, + "learning_rate": 1.9901345184438065e-05, + "loss": 0.4102, + "step": 1356 + }, + { + "epoch": 0.12783495442876994, + "grad_norm": 0.9366346001625061, + "learning_rate": 1.9901133487706858e-05, + "loss": 0.3752, + "step": 1357 + }, + { + "epoch": 0.12792915852193779, + "grad_norm": 1.0087013244628906, + "learning_rate": 1.9900921565214496e-05, + "loss": 0.4343, + "step": 1358 + }, + { + "epoch": 0.12802336261510563, + "grad_norm": 0.9205380082130432, + "learning_rate": 1.990070941696581e-05, + "loss": 0.3817, + "step": 1359 + }, + { + "epoch": 0.12811756670827348, + "grad_norm": 0.9430442452430725, + "learning_rate": 1.990049704296564e-05, + "loss": 0.4224, + "step": 1360 + }, + { + "epoch": 0.12821177080144133, + "grad_norm": 0.9510387778282166, + "learning_rate": 1.9900284443218825e-05, + "loss": 0.3808, + "step": 1361 + }, + { + "epoch": 0.12830597489460918, + "grad_norm": 0.7826544046401978, + "learning_rate": 1.9900071617730212e-05, + "loss": 0.3318, + "step": 1362 + }, + { + "epoch": 0.12840017898777703, + "grad_norm": 0.8948149681091309, + "learning_rate": 1.989985856650466e-05, + "loss": 0.3918, + "step": 1363 + }, + { + "epoch": 0.12849438308094488, + "grad_norm": 0.8969107866287231, + "learning_rate": 1.9899645289547017e-05, + "loss": 0.4408, + "step": 1364 + }, + { + "epoch": 0.12858858717411273, + "grad_norm": 0.9934526085853577, + "learning_rate": 1.9899431786862152e-05, + "loss": 0.4391, + "step": 1365 + }, + { + "epoch": 0.12868279126728055, + "grad_norm": 1.0301306247711182, + "learning_rate": 1.9899218058454933e-05, + "loss": 0.4187, + "step": 1366 + }, + { + "epoch": 0.1287769953604484, + "grad_norm": 0.9641363620758057, + "learning_rate": 1.9899004104330232e-05, + "loss": 0.4363, + "step": 1367 + }, + { + "epoch": 0.12887119945361625, + "grad_norm": 0.9025760293006897, + "learning_rate": 1.989878992449293e-05, + "loss": 0.389, + "step": 1368 + }, + { + "epoch": 0.1289654035467841, + "grad_norm": 0.8414632678031921, + "learning_rate": 1.989857551894791e-05, + "loss": 0.3644, + "step": 1369 + }, + { + "epoch": 0.12905960763995195, + "grad_norm": 0.9714823365211487, + "learning_rate": 1.9898360887700056e-05, + "loss": 0.4002, + "step": 1370 + }, + { + "epoch": 0.1291538117331198, + "grad_norm": 0.9647928476333618, + "learning_rate": 1.989814603075427e-05, + "loss": 0.3831, + "step": 1371 + }, + { + "epoch": 0.12924801582628764, + "grad_norm": 0.9595774412155151, + "learning_rate": 1.9897930948115444e-05, + "loss": 0.4624, + "step": 1372 + }, + { + "epoch": 0.1293422199194555, + "grad_norm": 0.9072070717811584, + "learning_rate": 1.9897715639788483e-05, + "loss": 0.4342, + "step": 1373 + }, + { + "epoch": 0.12943642401262334, + "grad_norm": 0.9162281155586243, + "learning_rate": 1.98975001057783e-05, + "loss": 0.4131, + "step": 1374 + }, + { + "epoch": 0.1295306281057912, + "grad_norm": 0.9252980947494507, + "learning_rate": 1.989728434608981e-05, + "loss": 0.3638, + "step": 1375 + }, + { + "epoch": 0.12962483219895904, + "grad_norm": 1.0374664068222046, + "learning_rate": 1.9897068360727933e-05, + "loss": 0.4535, + "step": 1376 + }, + { + "epoch": 0.1297190362921269, + "grad_norm": 0.9338465929031372, + "learning_rate": 1.9896852149697584e-05, + "loss": 0.3735, + "step": 1377 + }, + { + "epoch": 0.12981324038529474, + "grad_norm": 0.9684034585952759, + "learning_rate": 1.9896635713003706e-05, + "loss": 0.3973, + "step": 1378 + }, + { + "epoch": 0.12990744447846259, + "grad_norm": 0.8919799327850342, + "learning_rate": 1.9896419050651222e-05, + "loss": 0.4, + "step": 1379 + }, + { + "epoch": 0.13000164857163043, + "grad_norm": 0.9550321102142334, + "learning_rate": 1.9896202162645088e-05, + "loss": 0.4329, + "step": 1380 + }, + { + "epoch": 0.13009585266479828, + "grad_norm": 0.9291896224021912, + "learning_rate": 1.989598504899023e-05, + "loss": 0.4063, + "step": 1381 + }, + { + "epoch": 0.13019005675796613, + "grad_norm": 1.0140936374664307, + "learning_rate": 1.9895767709691617e-05, + "loss": 0.4293, + "step": 1382 + }, + { + "epoch": 0.13028426085113398, + "grad_norm": 0.8429473042488098, + "learning_rate": 1.989555014475419e-05, + "loss": 0.3547, + "step": 1383 + }, + { + "epoch": 0.13037846494430183, + "grad_norm": 0.9772353768348694, + "learning_rate": 1.9895332354182917e-05, + "loss": 0.3938, + "step": 1384 + }, + { + "epoch": 0.13047266903746968, + "grad_norm": 0.877116322517395, + "learning_rate": 1.9895114337982765e-05, + "loss": 0.3986, + "step": 1385 + }, + { + "epoch": 0.13056687313063753, + "grad_norm": 0.929966151714325, + "learning_rate": 1.98948960961587e-05, + "loss": 0.4453, + "step": 1386 + }, + { + "epoch": 0.13066107722380538, + "grad_norm": 0.831558108329773, + "learning_rate": 1.9894677628715706e-05, + "loss": 0.3872, + "step": 1387 + }, + { + "epoch": 0.13075528131697323, + "grad_norm": 0.9165502190589905, + "learning_rate": 1.9894458935658752e-05, + "loss": 0.3369, + "step": 1388 + }, + { + "epoch": 0.13084948541014108, + "grad_norm": 0.9749155640602112, + "learning_rate": 1.989424001699284e-05, + "loss": 0.4507, + "step": 1389 + }, + { + "epoch": 0.13094368950330892, + "grad_norm": 0.9584722518920898, + "learning_rate": 1.989402087272295e-05, + "loss": 0.415, + "step": 1390 + }, + { + "epoch": 0.13103789359647677, + "grad_norm": 0.9711076617240906, + "learning_rate": 1.9893801502854084e-05, + "loss": 0.4496, + "step": 1391 + }, + { + "epoch": 0.13113209768964462, + "grad_norm": 0.9657142758369446, + "learning_rate": 1.989358190739124e-05, + "loss": 0.4149, + "step": 1392 + }, + { + "epoch": 0.13122630178281247, + "grad_norm": 0.9022928476333618, + "learning_rate": 1.9893362086339428e-05, + "loss": 0.4107, + "step": 1393 + }, + { + "epoch": 0.13132050587598032, + "grad_norm": 0.9422235488891602, + "learning_rate": 1.9893142039703662e-05, + "loss": 0.4029, + "step": 1394 + }, + { + "epoch": 0.13141470996914817, + "grad_norm": 0.9564428329467773, + "learning_rate": 1.989292176748896e-05, + "loss": 0.3828, + "step": 1395 + }, + { + "epoch": 0.13150891406231602, + "grad_norm": 0.9131461381912231, + "learning_rate": 1.989270126970034e-05, + "loss": 0.4382, + "step": 1396 + }, + { + "epoch": 0.13160311815548387, + "grad_norm": 0.9364377856254578, + "learning_rate": 1.989248054634283e-05, + "loss": 0.3859, + "step": 1397 + }, + { + "epoch": 0.13169732224865172, + "grad_norm": 1.1112204790115356, + "learning_rate": 1.9892259597421466e-05, + "loss": 0.4799, + "step": 1398 + }, + { + "epoch": 0.13179152634181956, + "grad_norm": 0.9870941042900085, + "learning_rate": 1.9892038422941283e-05, + "loss": 0.3575, + "step": 1399 + }, + { + "epoch": 0.1318857304349874, + "grad_norm": 1.0119566917419434, + "learning_rate": 1.9891817022907326e-05, + "loss": 0.4426, + "step": 1400 + }, + { + "epoch": 0.13197993452815523, + "grad_norm": 0.8759822845458984, + "learning_rate": 1.9891595397324647e-05, + "loss": 0.3548, + "step": 1401 + }, + { + "epoch": 0.13207413862132308, + "grad_norm": 0.8948006629943848, + "learning_rate": 1.9891373546198293e-05, + "loss": 0.3958, + "step": 1402 + }, + { + "epoch": 0.13216834271449093, + "grad_norm": 1.0464363098144531, + "learning_rate": 1.9891151469533324e-05, + "loss": 0.4386, + "step": 1403 + }, + { + "epoch": 0.13226254680765878, + "grad_norm": 0.8876453638076782, + "learning_rate": 1.9890929167334803e-05, + "loss": 0.4215, + "step": 1404 + }, + { + "epoch": 0.13235675090082663, + "grad_norm": 0.9584650993347168, + "learning_rate": 1.98907066396078e-05, + "loss": 0.4056, + "step": 1405 + }, + { + "epoch": 0.13245095499399448, + "grad_norm": 0.952660083770752, + "learning_rate": 1.9890483886357393e-05, + "loss": 0.4232, + "step": 1406 + }, + { + "epoch": 0.13254515908716233, + "grad_norm": 0.9979923963546753, + "learning_rate": 1.9890260907588653e-05, + "loss": 0.4204, + "step": 1407 + }, + { + "epoch": 0.13263936318033018, + "grad_norm": 0.8783887028694153, + "learning_rate": 1.989003770330667e-05, + "loss": 0.3743, + "step": 1408 + }, + { + "epoch": 0.13273356727349803, + "grad_norm": 0.8242788910865784, + "learning_rate": 1.9889814273516536e-05, + "loss": 0.3699, + "step": 1409 + }, + { + "epoch": 0.13282777136666588, + "grad_norm": 0.8861004710197449, + "learning_rate": 1.988959061822334e-05, + "loss": 0.4144, + "step": 1410 + }, + { + "epoch": 0.13292197545983372, + "grad_norm": 0.8475175499916077, + "learning_rate": 1.988936673743218e-05, + "loss": 0.3934, + "step": 1411 + }, + { + "epoch": 0.13301617955300157, + "grad_norm": 0.9742621183395386, + "learning_rate": 1.988914263114817e-05, + "loss": 0.4338, + "step": 1412 + }, + { + "epoch": 0.13311038364616942, + "grad_norm": 0.8204099535942078, + "learning_rate": 1.9888918299376407e-05, + "loss": 0.3724, + "step": 1413 + }, + { + "epoch": 0.13320458773933727, + "grad_norm": 0.8778209090232849, + "learning_rate": 1.9888693742122017e-05, + "loss": 0.4136, + "step": 1414 + }, + { + "epoch": 0.13329879183250512, + "grad_norm": 0.9400804042816162, + "learning_rate": 1.9888468959390116e-05, + "loss": 0.4077, + "step": 1415 + }, + { + "epoch": 0.13339299592567297, + "grad_norm": 0.9618299603462219, + "learning_rate": 1.9888243951185834e-05, + "loss": 0.4257, + "step": 1416 + }, + { + "epoch": 0.13348720001884082, + "grad_norm": 0.9355471134185791, + "learning_rate": 1.9888018717514294e-05, + "loss": 0.3974, + "step": 1417 + }, + { + "epoch": 0.13358140411200867, + "grad_norm": 0.8644047379493713, + "learning_rate": 1.9887793258380635e-05, + "loss": 0.397, + "step": 1418 + }, + { + "epoch": 0.13367560820517652, + "grad_norm": 0.8545727729797363, + "learning_rate": 1.9887567573789997e-05, + "loss": 0.3949, + "step": 1419 + }, + { + "epoch": 0.13376981229834436, + "grad_norm": 0.9894800186157227, + "learning_rate": 1.9887341663747527e-05, + "loss": 0.3858, + "step": 1420 + }, + { + "epoch": 0.1338640163915122, + "grad_norm": 0.9499911665916443, + "learning_rate": 1.9887115528258375e-05, + "loss": 0.429, + "step": 1421 + }, + { + "epoch": 0.13395822048468006, + "grad_norm": 0.8847710490226746, + "learning_rate": 1.98868891673277e-05, + "loss": 0.4077, + "step": 1422 + }, + { + "epoch": 0.1340524245778479, + "grad_norm": 0.9346426129341125, + "learning_rate": 1.9886662580960664e-05, + "loss": 0.4167, + "step": 1423 + }, + { + "epoch": 0.13414662867101576, + "grad_norm": 0.8743436336517334, + "learning_rate": 1.988643576916243e-05, + "loss": 0.3943, + "step": 1424 + }, + { + "epoch": 0.1342408327641836, + "grad_norm": 0.8709068894386292, + "learning_rate": 1.988620873193817e-05, + "loss": 0.3863, + "step": 1425 + }, + { + "epoch": 0.13433503685735146, + "grad_norm": 0.876372754573822, + "learning_rate": 1.988598146929306e-05, + "loss": 0.4137, + "step": 1426 + }, + { + "epoch": 0.1344292409505193, + "grad_norm": 0.8295745253562927, + "learning_rate": 1.9885753981232284e-05, + "loss": 0.4034, + "step": 1427 + }, + { + "epoch": 0.13452344504368716, + "grad_norm": 0.9980677962303162, + "learning_rate": 1.9885526267761032e-05, + "loss": 0.4629, + "step": 1428 + }, + { + "epoch": 0.134617649136855, + "grad_norm": 0.8880258798599243, + "learning_rate": 1.9885298328884488e-05, + "loss": 0.3946, + "step": 1429 + }, + { + "epoch": 0.13471185323002285, + "grad_norm": 0.8771301507949829, + "learning_rate": 1.9885070164607855e-05, + "loss": 0.4202, + "step": 1430 + }, + { + "epoch": 0.1348060573231907, + "grad_norm": 0.91556316614151, + "learning_rate": 1.9884841774936337e-05, + "loss": 0.4194, + "step": 1431 + }, + { + "epoch": 0.13490026141635855, + "grad_norm": 0.8650923371315002, + "learning_rate": 1.988461315987514e-05, + "loss": 0.3744, + "step": 1432 + }, + { + "epoch": 0.1349944655095264, + "grad_norm": 0.9072087407112122, + "learning_rate": 1.9884384319429472e-05, + "loss": 0.3709, + "step": 1433 + }, + { + "epoch": 0.13508866960269425, + "grad_norm": 0.9184080958366394, + "learning_rate": 1.988415525360456e-05, + "loss": 0.3984, + "step": 1434 + }, + { + "epoch": 0.13518287369586207, + "grad_norm": 0.8926185369491577, + "learning_rate": 1.988392596240562e-05, + "loss": 0.367, + "step": 1435 + }, + { + "epoch": 0.13527707778902992, + "grad_norm": 0.9695875644683838, + "learning_rate": 1.988369644583788e-05, + "loss": 0.3913, + "step": 1436 + }, + { + "epoch": 0.13537128188219777, + "grad_norm": 0.9078540205955505, + "learning_rate": 1.988346670390658e-05, + "loss": 0.3695, + "step": 1437 + }, + { + "epoch": 0.13546548597536562, + "grad_norm": 0.922349750995636, + "learning_rate": 1.988323673661695e-05, + "loss": 0.3906, + "step": 1438 + }, + { + "epoch": 0.13555969006853347, + "grad_norm": 0.9601535797119141, + "learning_rate": 1.9883006543974238e-05, + "loss": 0.4213, + "step": 1439 + }, + { + "epoch": 0.13565389416170132, + "grad_norm": 0.9403246641159058, + "learning_rate": 1.9882776125983696e-05, + "loss": 0.4118, + "step": 1440 + }, + { + "epoch": 0.13574809825486917, + "grad_norm": 0.9881746768951416, + "learning_rate": 1.988254548265057e-05, + "loss": 0.4009, + "step": 1441 + }, + { + "epoch": 0.13584230234803701, + "grad_norm": 0.9344496130943298, + "learning_rate": 1.988231461398013e-05, + "loss": 0.4319, + "step": 1442 + }, + { + "epoch": 0.13593650644120486, + "grad_norm": 0.8761599063873291, + "learning_rate": 1.9882083519977623e-05, + "loss": 0.3881, + "step": 1443 + }, + { + "epoch": 0.1360307105343727, + "grad_norm": 0.921055793762207, + "learning_rate": 1.9881852200648338e-05, + "loss": 0.4169, + "step": 1444 + }, + { + "epoch": 0.13612491462754056, + "grad_norm": 0.8214818239212036, + "learning_rate": 1.9881620655997535e-05, + "loss": 0.3779, + "step": 1445 + }, + { + "epoch": 0.1362191187207084, + "grad_norm": 0.8684057593345642, + "learning_rate": 1.9881388886030503e-05, + "loss": 0.4157, + "step": 1446 + }, + { + "epoch": 0.13631332281387626, + "grad_norm": 0.9184609651565552, + "learning_rate": 1.9881156890752517e-05, + "loss": 0.3879, + "step": 1447 + }, + { + "epoch": 0.1364075269070441, + "grad_norm": 1.0127325057983398, + "learning_rate": 1.9880924670168877e-05, + "loss": 0.4588, + "step": 1448 + }, + { + "epoch": 0.13650173100021196, + "grad_norm": 0.8852851986885071, + "learning_rate": 1.988069222428487e-05, + "loss": 0.3958, + "step": 1449 + }, + { + "epoch": 0.1365959350933798, + "grad_norm": 0.9741384387016296, + "learning_rate": 1.9880459553105804e-05, + "loss": 0.4393, + "step": 1450 + }, + { + "epoch": 0.13669013918654765, + "grad_norm": 0.8674829602241516, + "learning_rate": 1.9880226656636977e-05, + "loss": 0.3795, + "step": 1451 + }, + { + "epoch": 0.1367843432797155, + "grad_norm": 0.8313913345336914, + "learning_rate": 1.9879993534883702e-05, + "loss": 0.337, + "step": 1452 + }, + { + "epoch": 0.13687854737288335, + "grad_norm": 1.0854746103286743, + "learning_rate": 1.9879760187851297e-05, + "loss": 0.411, + "step": 1453 + }, + { + "epoch": 0.1369727514660512, + "grad_norm": 0.9327601194381714, + "learning_rate": 1.9879526615545076e-05, + "loss": 0.4311, + "step": 1454 + }, + { + "epoch": 0.13706695555921905, + "grad_norm": 0.8348062038421631, + "learning_rate": 1.9879292817970372e-05, + "loss": 0.3995, + "step": 1455 + }, + { + "epoch": 0.1371611596523869, + "grad_norm": 0.8668566942214966, + "learning_rate": 1.9879058795132514e-05, + "loss": 0.3749, + "step": 1456 + }, + { + "epoch": 0.13725536374555475, + "grad_norm": 0.9548278450965881, + "learning_rate": 1.9878824547036838e-05, + "loss": 0.3863, + "step": 1457 + }, + { + "epoch": 0.1373495678387226, + "grad_norm": 1.0754507780075073, + "learning_rate": 1.987859007368868e-05, + "loss": 0.3791, + "step": 1458 + }, + { + "epoch": 0.13744377193189045, + "grad_norm": 0.8589168787002563, + "learning_rate": 1.9878355375093395e-05, + "loss": 0.4323, + "step": 1459 + }, + { + "epoch": 0.1375379760250583, + "grad_norm": 0.7914716005325317, + "learning_rate": 1.9878120451256325e-05, + "loss": 0.3678, + "step": 1460 + }, + { + "epoch": 0.13763218011822614, + "grad_norm": 0.9526810050010681, + "learning_rate": 1.9877885302182836e-05, + "loss": 0.3896, + "step": 1461 + }, + { + "epoch": 0.137726384211394, + "grad_norm": 0.8691291213035583, + "learning_rate": 1.987764992787829e-05, + "loss": 0.3984, + "step": 1462 + }, + { + "epoch": 0.13782058830456184, + "grad_norm": 1.0639795064926147, + "learning_rate": 1.9877414328348045e-05, + "loss": 0.3786, + "step": 1463 + }, + { + "epoch": 0.1379147923977297, + "grad_norm": 0.9380475878715515, + "learning_rate": 1.9877178503597476e-05, + "loss": 0.4111, + "step": 1464 + }, + { + "epoch": 0.13800899649089754, + "grad_norm": 0.9449976086616516, + "learning_rate": 1.9876942453631962e-05, + "loss": 0.3952, + "step": 1465 + }, + { + "epoch": 0.1381032005840654, + "grad_norm": 0.9195828437805176, + "learning_rate": 1.9876706178456884e-05, + "loss": 0.4033, + "step": 1466 + }, + { + "epoch": 0.13819740467723324, + "grad_norm": 0.807725727558136, + "learning_rate": 1.9876469678077634e-05, + "loss": 0.3879, + "step": 1467 + }, + { + "epoch": 0.1382916087704011, + "grad_norm": 0.9038071632385254, + "learning_rate": 1.98762329524996e-05, + "loss": 0.4214, + "step": 1468 + }, + { + "epoch": 0.13838581286356894, + "grad_norm": 0.9691506028175354, + "learning_rate": 1.987599600172818e-05, + "loss": 0.3749, + "step": 1469 + }, + { + "epoch": 0.13848001695673676, + "grad_norm": 0.8957289457321167, + "learning_rate": 1.987575882576878e-05, + "loss": 0.4419, + "step": 1470 + }, + { + "epoch": 0.1385742210499046, + "grad_norm": 1.027971863746643, + "learning_rate": 1.9875521424626802e-05, + "loss": 0.4452, + "step": 1471 + }, + { + "epoch": 0.13866842514307245, + "grad_norm": 1.001281499862671, + "learning_rate": 1.9875283798307664e-05, + "loss": 0.469, + "step": 1472 + }, + { + "epoch": 0.1387626292362403, + "grad_norm": 0.8806446194648743, + "learning_rate": 1.9875045946816784e-05, + "loss": 0.3987, + "step": 1473 + }, + { + "epoch": 0.13885683332940815, + "grad_norm": 0.9220328330993652, + "learning_rate": 1.9874807870159583e-05, + "loss": 0.3969, + "step": 1474 + }, + { + "epoch": 0.138951037422576, + "grad_norm": 0.9270832538604736, + "learning_rate": 1.9874569568341492e-05, + "loss": 0.3844, + "step": 1475 + }, + { + "epoch": 0.13904524151574385, + "grad_norm": 0.8893280029296875, + "learning_rate": 1.9874331041367946e-05, + "loss": 0.4448, + "step": 1476 + }, + { + "epoch": 0.1391394456089117, + "grad_norm": 0.8257045149803162, + "learning_rate": 1.987409228924438e-05, + "loss": 0.3667, + "step": 1477 + }, + { + "epoch": 0.13923364970207955, + "grad_norm": 0.930780291557312, + "learning_rate": 1.9873853311976235e-05, + "loss": 0.3743, + "step": 1478 + }, + { + "epoch": 0.1393278537952474, + "grad_norm": 0.9231829643249512, + "learning_rate": 1.9873614109568967e-05, + "loss": 0.3672, + "step": 1479 + }, + { + "epoch": 0.13942205788841525, + "grad_norm": 0.9927639961242676, + "learning_rate": 1.987337468202803e-05, + "loss": 0.388, + "step": 1480 + }, + { + "epoch": 0.1395162619815831, + "grad_norm": 0.8849180936813354, + "learning_rate": 1.9873135029358877e-05, + "loss": 0.4029, + "step": 1481 + }, + { + "epoch": 0.13961046607475094, + "grad_norm": 1.0194668769836426, + "learning_rate": 1.9872895151566975e-05, + "loss": 0.4712, + "step": 1482 + }, + { + "epoch": 0.1397046701679188, + "grad_norm": 0.9328537583351135, + "learning_rate": 1.9872655048657798e-05, + "loss": 0.4131, + "step": 1483 + }, + { + "epoch": 0.13979887426108664, + "grad_norm": 0.8719202280044556, + "learning_rate": 1.9872414720636815e-05, + "loss": 0.4055, + "step": 1484 + }, + { + "epoch": 0.1398930783542545, + "grad_norm": 0.817643940448761, + "learning_rate": 1.9872174167509515e-05, + "loss": 0.3533, + "step": 1485 + }, + { + "epoch": 0.13998728244742234, + "grad_norm": 0.9272326231002808, + "learning_rate": 1.987193338928137e-05, + "loss": 0.4082, + "step": 1486 + }, + { + "epoch": 0.1400814865405902, + "grad_norm": 0.9619669914245605, + "learning_rate": 1.987169238595788e-05, + "loss": 0.4678, + "step": 1487 + }, + { + "epoch": 0.14017569063375804, + "grad_norm": 0.9963866472244263, + "learning_rate": 1.9871451157544534e-05, + "loss": 0.4112, + "step": 1488 + }, + { + "epoch": 0.1402698947269259, + "grad_norm": 0.9173933863639832, + "learning_rate": 1.9871209704046835e-05, + "loss": 0.4102, + "step": 1489 + }, + { + "epoch": 0.14036409882009374, + "grad_norm": 0.9025501608848572, + "learning_rate": 1.9870968025470293e-05, + "loss": 0.4033, + "step": 1490 + }, + { + "epoch": 0.14045830291326158, + "grad_norm": 0.838789165019989, + "learning_rate": 1.9870726121820408e-05, + "loss": 0.3815, + "step": 1491 + }, + { + "epoch": 0.14055250700642943, + "grad_norm": 0.7602670788764954, + "learning_rate": 1.9870483993102704e-05, + "loss": 0.3658, + "step": 1492 + }, + { + "epoch": 0.14064671109959728, + "grad_norm": 0.8975557684898376, + "learning_rate": 1.98702416393227e-05, + "loss": 0.379, + "step": 1493 + }, + { + "epoch": 0.14074091519276513, + "grad_norm": 0.9210837483406067, + "learning_rate": 1.9869999060485927e-05, + "loss": 0.3882, + "step": 1494 + }, + { + "epoch": 0.14083511928593298, + "grad_norm": 0.8833448886871338, + "learning_rate": 1.9869756256597905e-05, + "loss": 0.3674, + "step": 1495 + }, + { + "epoch": 0.14092932337910083, + "grad_norm": 0.7752265334129333, + "learning_rate": 1.986951322766418e-05, + "loss": 0.3241, + "step": 1496 + }, + { + "epoch": 0.14102352747226868, + "grad_norm": 1.0192210674285889, + "learning_rate": 1.9869269973690287e-05, + "loss": 0.4657, + "step": 1497 + }, + { + "epoch": 0.14111773156543653, + "grad_norm": 1.0888237953186035, + "learning_rate": 1.9869026494681776e-05, + "loss": 0.3868, + "step": 1498 + }, + { + "epoch": 0.14121193565860438, + "grad_norm": 0.8453440070152283, + "learning_rate": 1.98687827906442e-05, + "loss": 0.3684, + "step": 1499 + }, + { + "epoch": 0.14130613975177223, + "grad_norm": 0.9561695456504822, + "learning_rate": 1.9868538861583112e-05, + "loss": 0.4303, + "step": 1500 + }, + { + "epoch": 0.14140034384494007, + "grad_norm": 0.8988255858421326, + "learning_rate": 1.9868294707504077e-05, + "loss": 0.3689, + "step": 1501 + }, + { + "epoch": 0.14149454793810792, + "grad_norm": 0.8484389781951904, + "learning_rate": 1.986805032841266e-05, + "loss": 0.3691, + "step": 1502 + }, + { + "epoch": 0.14158875203127577, + "grad_norm": 0.9466550946235657, + "learning_rate": 1.9867805724314438e-05, + "loss": 0.4252, + "step": 1503 + }, + { + "epoch": 0.14168295612444362, + "grad_norm": 0.9282593727111816, + "learning_rate": 1.986756089521498e-05, + "loss": 0.4692, + "step": 1504 + }, + { + "epoch": 0.14177716021761144, + "grad_norm": 0.8028031587600708, + "learning_rate": 1.9867315841119878e-05, + "loss": 0.3777, + "step": 1505 + }, + { + "epoch": 0.1418713643107793, + "grad_norm": 0.8507390022277832, + "learning_rate": 1.9867070562034712e-05, + "loss": 0.3991, + "step": 1506 + }, + { + "epoch": 0.14196556840394714, + "grad_norm": 0.9004268646240234, + "learning_rate": 1.986682505796508e-05, + "loss": 0.3723, + "step": 1507 + }, + { + "epoch": 0.142059772497115, + "grad_norm": 0.9132339954376221, + "learning_rate": 1.986657932891657e-05, + "loss": 0.4161, + "step": 1508 + }, + { + "epoch": 0.14215397659028284, + "grad_norm": 0.9097040891647339, + "learning_rate": 1.98663333748948e-05, + "loss": 0.3664, + "step": 1509 + }, + { + "epoch": 0.1422481806834507, + "grad_norm": 0.8887312412261963, + "learning_rate": 1.9866087195905365e-05, + "loss": 0.4225, + "step": 1510 + }, + { + "epoch": 0.14234238477661854, + "grad_norm": 0.8952493071556091, + "learning_rate": 1.9865840791953886e-05, + "loss": 0.406, + "step": 1511 + }, + { + "epoch": 0.14243658886978638, + "grad_norm": 0.9774636626243591, + "learning_rate": 1.9865594163045984e-05, + "loss": 0.4049, + "step": 1512 + }, + { + "epoch": 0.14253079296295423, + "grad_norm": 0.9388694167137146, + "learning_rate": 1.986534730918727e-05, + "loss": 0.3929, + "step": 1513 + }, + { + "epoch": 0.14262499705612208, + "grad_norm": 0.841415524482727, + "learning_rate": 1.9865100230383384e-05, + "loss": 0.3623, + "step": 1514 + }, + { + "epoch": 0.14271920114928993, + "grad_norm": 0.9300689697265625, + "learning_rate": 1.9864852926639955e-05, + "loss": 0.4353, + "step": 1515 + }, + { + "epoch": 0.14281340524245778, + "grad_norm": 0.9600360989570618, + "learning_rate": 1.9864605397962624e-05, + "loss": 0.4265, + "step": 1516 + }, + { + "epoch": 0.14290760933562563, + "grad_norm": 0.8746339678764343, + "learning_rate": 1.9864357644357036e-05, + "loss": 0.3741, + "step": 1517 + }, + { + "epoch": 0.14300181342879348, + "grad_norm": 1.224995493888855, + "learning_rate": 1.9864109665828835e-05, + "loss": 0.3732, + "step": 1518 + }, + { + "epoch": 0.14309601752196133, + "grad_norm": 0.9189375638961792, + "learning_rate": 1.986386146238368e-05, + "loss": 0.4001, + "step": 1519 + }, + { + "epoch": 0.14319022161512918, + "grad_norm": 0.8859357237815857, + "learning_rate": 1.9863613034027224e-05, + "loss": 0.4363, + "step": 1520 + }, + { + "epoch": 0.14328442570829703, + "grad_norm": 0.9655970931053162, + "learning_rate": 1.9863364380765144e-05, + "loss": 0.4381, + "step": 1521 + }, + { + "epoch": 0.14337862980146487, + "grad_norm": 0.9548221826553345, + "learning_rate": 1.9863115502603097e-05, + "loss": 0.3759, + "step": 1522 + }, + { + "epoch": 0.14347283389463272, + "grad_norm": 0.9607380032539368, + "learning_rate": 1.9862866399546762e-05, + "loss": 0.422, + "step": 1523 + }, + { + "epoch": 0.14356703798780057, + "grad_norm": 0.8558836579322815, + "learning_rate": 1.9862617071601825e-05, + "loss": 0.3665, + "step": 1524 + }, + { + "epoch": 0.14366124208096842, + "grad_norm": 0.9772205948829651, + "learning_rate": 1.9862367518773963e-05, + "loss": 0.4648, + "step": 1525 + }, + { + "epoch": 0.14375544617413627, + "grad_norm": 1.015708565711975, + "learning_rate": 1.986211774106887e-05, + "loss": 0.3942, + "step": 1526 + }, + { + "epoch": 0.14384965026730412, + "grad_norm": 1.1574227809906006, + "learning_rate": 1.986186773849224e-05, + "loss": 0.3912, + "step": 1527 + }, + { + "epoch": 0.14394385436047197, + "grad_norm": 0.8729410171508789, + "learning_rate": 1.9861617511049773e-05, + "loss": 0.4035, + "step": 1528 + }, + { + "epoch": 0.14403805845363982, + "grad_norm": 0.9859470725059509, + "learning_rate": 1.9861367058747175e-05, + "loss": 0.4218, + "step": 1529 + }, + { + "epoch": 0.14413226254680767, + "grad_norm": 0.9877438545227051, + "learning_rate": 1.986111638159016e-05, + "loss": 0.4801, + "step": 1530 + }, + { + "epoch": 0.14422646663997551, + "grad_norm": 0.8367306590080261, + "learning_rate": 1.986086547958444e-05, + "loss": 0.3492, + "step": 1531 + }, + { + "epoch": 0.14432067073314336, + "grad_norm": 0.8954234719276428, + "learning_rate": 1.9860614352735737e-05, + "loss": 0.421, + "step": 1532 + }, + { + "epoch": 0.1444148748263112, + "grad_norm": 1.0337097644805908, + "learning_rate": 1.9860363001049775e-05, + "loss": 0.4151, + "step": 1533 + }, + { + "epoch": 0.14450907891947906, + "grad_norm": 0.8945963382720947, + "learning_rate": 1.986011142453229e-05, + "loss": 0.3675, + "step": 1534 + }, + { + "epoch": 0.1446032830126469, + "grad_norm": 0.9584569931030273, + "learning_rate": 1.9859859623189015e-05, + "loss": 0.3832, + "step": 1535 + }, + { + "epoch": 0.14469748710581476, + "grad_norm": 0.9371083974838257, + "learning_rate": 1.985960759702569e-05, + "loss": 0.4199, + "step": 1536 + }, + { + "epoch": 0.1447916911989826, + "grad_norm": 0.967965841293335, + "learning_rate": 1.9859355346048065e-05, + "loss": 0.3832, + "step": 1537 + }, + { + "epoch": 0.14488589529215046, + "grad_norm": 0.859764814376831, + "learning_rate": 1.9859102870261887e-05, + "loss": 0.3217, + "step": 1538 + }, + { + "epoch": 0.14498009938531828, + "grad_norm": 0.9350664615631104, + "learning_rate": 1.9858850169672924e-05, + "loss": 0.4131, + "step": 1539 + }, + { + "epoch": 0.14507430347848613, + "grad_norm": 0.9000515341758728, + "learning_rate": 1.9858597244286923e-05, + "loss": 0.4084, + "step": 1540 + }, + { + "epoch": 0.14516850757165398, + "grad_norm": 0.9772770404815674, + "learning_rate": 1.985834409410966e-05, + "loss": 0.4033, + "step": 1541 + }, + { + "epoch": 0.14526271166482183, + "grad_norm": 0.8754479289054871, + "learning_rate": 1.9858090719146908e-05, + "loss": 0.4101, + "step": 1542 + }, + { + "epoch": 0.14535691575798967, + "grad_norm": 0.8851545453071594, + "learning_rate": 1.9857837119404438e-05, + "loss": 0.4137, + "step": 1543 + }, + { + "epoch": 0.14545111985115752, + "grad_norm": 1.01539945602417, + "learning_rate": 1.985758329488804e-05, + "loss": 0.4516, + "step": 1544 + }, + { + "epoch": 0.14554532394432537, + "grad_norm": 0.85793536901474, + "learning_rate": 1.9857329245603495e-05, + "loss": 0.3974, + "step": 1545 + }, + { + "epoch": 0.14563952803749322, + "grad_norm": 0.9479626417160034, + "learning_rate": 1.98570749715566e-05, + "loss": 0.4112, + "step": 1546 + }, + { + "epoch": 0.14573373213066107, + "grad_norm": 1.0737402439117432, + "learning_rate": 1.985682047275315e-05, + "loss": 0.3961, + "step": 1547 + }, + { + "epoch": 0.14582793622382892, + "grad_norm": 0.9221171140670776, + "learning_rate": 1.985656574919895e-05, + "loss": 0.3951, + "step": 1548 + }, + { + "epoch": 0.14592214031699677, + "grad_norm": 0.9689660668373108, + "learning_rate": 1.985631080089981e-05, + "loss": 0.3869, + "step": 1549 + }, + { + "epoch": 0.14601634441016462, + "grad_norm": 0.93543940782547, + "learning_rate": 1.9856055627861537e-05, + "loss": 0.3848, + "step": 1550 + }, + { + "epoch": 0.14611054850333247, + "grad_norm": 0.9185600876808167, + "learning_rate": 1.9855800230089955e-05, + "loss": 0.4093, + "step": 1551 + }, + { + "epoch": 0.14620475259650031, + "grad_norm": 0.8836493492126465, + "learning_rate": 1.9855544607590886e-05, + "loss": 0.3893, + "step": 1552 + }, + { + "epoch": 0.14629895668966816, + "grad_norm": 0.8601776361465454, + "learning_rate": 1.9855288760370154e-05, + "loss": 0.3919, + "step": 1553 + }, + { + "epoch": 0.146393160782836, + "grad_norm": 0.9663978219032288, + "learning_rate": 1.9855032688433603e-05, + "loss": 0.4357, + "step": 1554 + }, + { + "epoch": 0.14648736487600386, + "grad_norm": 0.8969776630401611, + "learning_rate": 1.985477639178706e-05, + "loss": 0.4259, + "step": 1555 + }, + { + "epoch": 0.1465815689691717, + "grad_norm": 0.9066229462623596, + "learning_rate": 1.9854519870436376e-05, + "loss": 0.3964, + "step": 1556 + }, + { + "epoch": 0.14667577306233956, + "grad_norm": 0.8684629797935486, + "learning_rate": 1.98542631243874e-05, + "loss": 0.3706, + "step": 1557 + }, + { + "epoch": 0.1467699771555074, + "grad_norm": 0.9415647387504578, + "learning_rate": 1.9854006153645983e-05, + "loss": 0.3839, + "step": 1558 + }, + { + "epoch": 0.14686418124867526, + "grad_norm": 0.8227822184562683, + "learning_rate": 1.985374895821799e-05, + "loss": 0.3556, + "step": 1559 + }, + { + "epoch": 0.1469583853418431, + "grad_norm": 0.9608584642410278, + "learning_rate": 1.985349153810928e-05, + "loss": 0.4359, + "step": 1560 + }, + { + "epoch": 0.14705258943501096, + "grad_norm": 0.9011355042457581, + "learning_rate": 1.9853233893325722e-05, + "loss": 0.4004, + "step": 1561 + }, + { + "epoch": 0.1471467935281788, + "grad_norm": 0.8800787925720215, + "learning_rate": 1.9852976023873196e-05, + "loss": 0.4057, + "step": 1562 + }, + { + "epoch": 0.14724099762134665, + "grad_norm": 1.0086630582809448, + "learning_rate": 1.9852717929757573e-05, + "loss": 0.452, + "step": 1563 + }, + { + "epoch": 0.1473352017145145, + "grad_norm": 0.927713930606842, + "learning_rate": 1.985245961098475e-05, + "loss": 0.395, + "step": 1564 + }, + { + "epoch": 0.14742940580768235, + "grad_norm": 0.7828457355499268, + "learning_rate": 1.9852201067560607e-05, + "loss": 0.3728, + "step": 1565 + }, + { + "epoch": 0.1475236099008502, + "grad_norm": 0.8762266635894775, + "learning_rate": 1.9851942299491043e-05, + "loss": 0.3737, + "step": 1566 + }, + { + "epoch": 0.14761781399401805, + "grad_norm": 0.967050313949585, + "learning_rate": 1.9851683306781962e-05, + "loss": 0.424, + "step": 1567 + }, + { + "epoch": 0.1477120180871859, + "grad_norm": 1.0238690376281738, + "learning_rate": 1.9851424089439263e-05, + "loss": 0.4298, + "step": 1568 + }, + { + "epoch": 0.14780622218035375, + "grad_norm": 0.8824076652526855, + "learning_rate": 1.985116464746886e-05, + "loss": 0.389, + "step": 1569 + }, + { + "epoch": 0.1479004262735216, + "grad_norm": 0.900592565536499, + "learning_rate": 1.985090498087667e-05, + "loss": 0.3649, + "step": 1570 + }, + { + "epoch": 0.14799463036668944, + "grad_norm": 0.9643316268920898, + "learning_rate": 1.9850645089668608e-05, + "loss": 0.4403, + "step": 1571 + }, + { + "epoch": 0.1480888344598573, + "grad_norm": 0.9956216216087341, + "learning_rate": 1.9850384973850603e-05, + "loss": 0.3818, + "step": 1572 + }, + { + "epoch": 0.14818303855302514, + "grad_norm": 1.0450785160064697, + "learning_rate": 1.985012463342859e-05, + "loss": 0.3986, + "step": 1573 + }, + { + "epoch": 0.14827724264619296, + "grad_norm": 0.8111163973808289, + "learning_rate": 1.98498640684085e-05, + "loss": 0.367, + "step": 1574 + }, + { + "epoch": 0.1483714467393608, + "grad_norm": 1.066404104232788, + "learning_rate": 1.9849603278796275e-05, + "loss": 0.4107, + "step": 1575 + }, + { + "epoch": 0.14846565083252866, + "grad_norm": 0.8669083714485168, + "learning_rate": 1.9849342264597864e-05, + "loss": 0.3758, + "step": 1576 + }, + { + "epoch": 0.1485598549256965, + "grad_norm": 0.8758968114852905, + "learning_rate": 1.984908102581922e-05, + "loss": 0.3903, + "step": 1577 + }, + { + "epoch": 0.14865405901886436, + "grad_norm": 0.8721821904182434, + "learning_rate": 1.9848819562466293e-05, + "loss": 0.3796, + "step": 1578 + }, + { + "epoch": 0.1487482631120322, + "grad_norm": 0.9264764785766602, + "learning_rate": 1.984855787454505e-05, + "loss": 0.4283, + "step": 1579 + }, + { + "epoch": 0.14884246720520006, + "grad_norm": 0.8868725299835205, + "learning_rate": 1.9848295962061455e-05, + "loss": 0.4002, + "step": 1580 + }, + { + "epoch": 0.1489366712983679, + "grad_norm": 0.897774338722229, + "learning_rate": 1.9848033825021482e-05, + "loss": 0.3563, + "step": 1581 + }, + { + "epoch": 0.14903087539153576, + "grad_norm": 0.9838619828224182, + "learning_rate": 1.9847771463431106e-05, + "loss": 0.4587, + "step": 1582 + }, + { + "epoch": 0.1491250794847036, + "grad_norm": 1.0027905702590942, + "learning_rate": 1.9847508877296314e-05, + "loss": 0.4377, + "step": 1583 + }, + { + "epoch": 0.14921928357787145, + "grad_norm": 0.868817150592804, + "learning_rate": 1.9847246066623086e-05, + "loss": 0.3713, + "step": 1584 + }, + { + "epoch": 0.1493134876710393, + "grad_norm": 0.870667576789856, + "learning_rate": 1.9846983031417423e-05, + "loss": 0.4033, + "step": 1585 + }, + { + "epoch": 0.14940769176420715, + "grad_norm": 0.9443947076797485, + "learning_rate": 1.9846719771685317e-05, + "loss": 0.3993, + "step": 1586 + }, + { + "epoch": 0.149501895857375, + "grad_norm": 1.0219776630401611, + "learning_rate": 1.984645628743277e-05, + "loss": 0.416, + "step": 1587 + }, + { + "epoch": 0.14959609995054285, + "grad_norm": 0.8650853633880615, + "learning_rate": 1.9846192578665792e-05, + "loss": 0.4026, + "step": 1588 + }, + { + "epoch": 0.1496903040437107, + "grad_norm": 0.9072234034538269, + "learning_rate": 1.9845928645390397e-05, + "loss": 0.3898, + "step": 1589 + }, + { + "epoch": 0.14978450813687855, + "grad_norm": 0.8843415975570679, + "learning_rate": 1.9845664487612602e-05, + "loss": 0.3917, + "step": 1590 + }, + { + "epoch": 0.1498787122300464, + "grad_norm": 1.0103051662445068, + "learning_rate": 1.984540010533843e-05, + "loss": 0.3474, + "step": 1591 + }, + { + "epoch": 0.14997291632321424, + "grad_norm": 0.7909182906150818, + "learning_rate": 1.984513549857391e-05, + "loss": 0.3542, + "step": 1592 + }, + { + "epoch": 0.1500671204163821, + "grad_norm": 0.9175459742546082, + "learning_rate": 1.9844870667325073e-05, + "loss": 0.4226, + "step": 1593 + }, + { + "epoch": 0.15016132450954994, + "grad_norm": 0.9356094002723694, + "learning_rate": 1.984460561159796e-05, + "loss": 0.4256, + "step": 1594 + }, + { + "epoch": 0.1502555286027178, + "grad_norm": 0.9353117346763611, + "learning_rate": 1.9844340331398613e-05, + "loss": 0.4015, + "step": 1595 + }, + { + "epoch": 0.15034973269588564, + "grad_norm": 0.834352433681488, + "learning_rate": 1.9844074826733085e-05, + "loss": 0.3815, + "step": 1596 + }, + { + "epoch": 0.1504439367890535, + "grad_norm": 0.9325944185256958, + "learning_rate": 1.9843809097607428e-05, + "loss": 0.4054, + "step": 1597 + }, + { + "epoch": 0.15053814088222134, + "grad_norm": 0.9160271286964417, + "learning_rate": 1.9843543144027695e-05, + "loss": 0.3888, + "step": 1598 + }, + { + "epoch": 0.1506323449753892, + "grad_norm": 0.8725243806838989, + "learning_rate": 1.9843276965999956e-05, + "loss": 0.3732, + "step": 1599 + }, + { + "epoch": 0.15072654906855704, + "grad_norm": 0.745169997215271, + "learning_rate": 1.984301056353028e-05, + "loss": 0.3401, + "step": 1600 + }, + { + "epoch": 0.15082075316172489, + "grad_norm": 0.823818027973175, + "learning_rate": 1.9842743936624743e-05, + "loss": 0.3884, + "step": 1601 + }, + { + "epoch": 0.15091495725489273, + "grad_norm": 0.8428900241851807, + "learning_rate": 1.9842477085289417e-05, + "loss": 0.3873, + "step": 1602 + }, + { + "epoch": 0.15100916134806058, + "grad_norm": 0.8186632394790649, + "learning_rate": 1.98422100095304e-05, + "loss": 0.3721, + "step": 1603 + }, + { + "epoch": 0.15110336544122843, + "grad_norm": 0.9110180139541626, + "learning_rate": 1.9841942709353765e-05, + "loss": 0.387, + "step": 1604 + }, + { + "epoch": 0.15119756953439628, + "grad_norm": 0.8393080234527588, + "learning_rate": 1.984167518476562e-05, + "loss": 0.3652, + "step": 1605 + }, + { + "epoch": 0.15129177362756413, + "grad_norm": 0.9034487009048462, + "learning_rate": 1.9841407435772056e-05, + "loss": 0.4181, + "step": 1606 + }, + { + "epoch": 0.15138597772073198, + "grad_norm": 0.8732578158378601, + "learning_rate": 1.9841139462379188e-05, + "loss": 0.4112, + "step": 1607 + }, + { + "epoch": 0.1514801818138998, + "grad_norm": 0.8301663398742676, + "learning_rate": 1.984087126459312e-05, + "loss": 0.3851, + "step": 1608 + }, + { + "epoch": 0.15157438590706765, + "grad_norm": 0.7910391688346863, + "learning_rate": 1.984060284241996e-05, + "loss": 0.3436, + "step": 1609 + }, + { + "epoch": 0.1516685900002355, + "grad_norm": 0.9603357315063477, + "learning_rate": 1.9840334195865846e-05, + "loss": 0.4142, + "step": 1610 + }, + { + "epoch": 0.15176279409340335, + "grad_norm": 0.8982900977134705, + "learning_rate": 1.984006532493689e-05, + "loss": 0.401, + "step": 1611 + }, + { + "epoch": 0.1518569981865712, + "grad_norm": 0.8096650838851929, + "learning_rate": 1.9839796229639226e-05, + "loss": 0.4028, + "step": 1612 + }, + { + "epoch": 0.15195120227973904, + "grad_norm": 0.8454784750938416, + "learning_rate": 1.9839526909978994e-05, + "loss": 0.3682, + "step": 1613 + }, + { + "epoch": 0.1520454063729069, + "grad_norm": 0.847681999206543, + "learning_rate": 1.9839257365962327e-05, + "loss": 0.3267, + "step": 1614 + }, + { + "epoch": 0.15213961046607474, + "grad_norm": 0.8493847250938416, + "learning_rate": 1.9838987597595377e-05, + "loss": 0.3922, + "step": 1615 + }, + { + "epoch": 0.1522338145592426, + "grad_norm": 0.8904460072517395, + "learning_rate": 1.9838717604884293e-05, + "loss": 0.3803, + "step": 1616 + }, + { + "epoch": 0.15232801865241044, + "grad_norm": 0.8912856578826904, + "learning_rate": 1.9838447387835233e-05, + "loss": 0.3799, + "step": 1617 + }, + { + "epoch": 0.1524222227455783, + "grad_norm": 0.8856900334358215, + "learning_rate": 1.9838176946454358e-05, + "loss": 0.3947, + "step": 1618 + }, + { + "epoch": 0.15251642683874614, + "grad_norm": 0.9876874685287476, + "learning_rate": 1.9837906280747832e-05, + "loss": 0.4121, + "step": 1619 + }, + { + "epoch": 0.152610630931914, + "grad_norm": 1.1396756172180176, + "learning_rate": 1.9837635390721828e-05, + "loss": 0.3945, + "step": 1620 + }, + { + "epoch": 0.15270483502508184, + "grad_norm": 1.0195603370666504, + "learning_rate": 1.9837364276382523e-05, + "loss": 0.4291, + "step": 1621 + }, + { + "epoch": 0.15279903911824969, + "grad_norm": 0.8799949884414673, + "learning_rate": 1.98370929377361e-05, + "loss": 0.4075, + "step": 1622 + }, + { + "epoch": 0.15289324321141753, + "grad_norm": 0.9101582765579224, + "learning_rate": 1.9836821374788742e-05, + "loss": 0.4404, + "step": 1623 + }, + { + "epoch": 0.15298744730458538, + "grad_norm": 0.9574260711669922, + "learning_rate": 1.9836549587546646e-05, + "loss": 0.4033, + "step": 1624 + }, + { + "epoch": 0.15308165139775323, + "grad_norm": 0.8660268783569336, + "learning_rate": 1.9836277576016006e-05, + "loss": 0.3577, + "step": 1625 + }, + { + "epoch": 0.15317585549092108, + "grad_norm": 1.075222373008728, + "learning_rate": 1.9836005340203026e-05, + "loss": 0.4407, + "step": 1626 + }, + { + "epoch": 0.15327005958408893, + "grad_norm": 0.9746554493904114, + "learning_rate": 1.9835732880113912e-05, + "loss": 0.4519, + "step": 1627 + }, + { + "epoch": 0.15336426367725678, + "grad_norm": 0.9732099771499634, + "learning_rate": 1.9835460195754878e-05, + "loss": 0.4228, + "step": 1628 + }, + { + "epoch": 0.15345846777042463, + "grad_norm": 0.9029169082641602, + "learning_rate": 1.983518728713214e-05, + "loss": 0.3808, + "step": 1629 + }, + { + "epoch": 0.15355267186359248, + "grad_norm": 0.8966045379638672, + "learning_rate": 1.983491415425192e-05, + "loss": 0.4243, + "step": 1630 + }, + { + "epoch": 0.15364687595676033, + "grad_norm": 0.9925733208656311, + "learning_rate": 1.9834640797120448e-05, + "loss": 0.4396, + "step": 1631 + }, + { + "epoch": 0.15374108004992817, + "grad_norm": 0.9034045934677124, + "learning_rate": 1.9834367215743958e-05, + "loss": 0.4096, + "step": 1632 + }, + { + "epoch": 0.15383528414309602, + "grad_norm": 0.8905357122421265, + "learning_rate": 1.9834093410128682e-05, + "loss": 0.3721, + "step": 1633 + }, + { + "epoch": 0.15392948823626387, + "grad_norm": 0.9021356701850891, + "learning_rate": 1.9833819380280875e-05, + "loss": 0.3734, + "step": 1634 + }, + { + "epoch": 0.15402369232943172, + "grad_norm": 0.8837315440177917, + "learning_rate": 1.983354512620677e-05, + "loss": 0.3884, + "step": 1635 + }, + { + "epoch": 0.15411789642259957, + "grad_norm": 0.9213358163833618, + "learning_rate": 1.983327064791263e-05, + "loss": 0.4331, + "step": 1636 + }, + { + "epoch": 0.15421210051576742, + "grad_norm": 0.8524973392486572, + "learning_rate": 1.9832995945404715e-05, + "loss": 0.342, + "step": 1637 + }, + { + "epoch": 0.15430630460893527, + "grad_norm": 0.8630796074867249, + "learning_rate": 1.9832721018689285e-05, + "loss": 0.3689, + "step": 1638 + }, + { + "epoch": 0.15440050870210312, + "grad_norm": 1.0397515296936035, + "learning_rate": 1.9832445867772606e-05, + "loss": 0.4243, + "step": 1639 + }, + { + "epoch": 0.15449471279527097, + "grad_norm": 0.9372056722640991, + "learning_rate": 1.983217049266096e-05, + "loss": 0.422, + "step": 1640 + }, + { + "epoch": 0.15458891688843882, + "grad_norm": 0.8932582139968872, + "learning_rate": 1.9831894893360617e-05, + "loss": 0.3874, + "step": 1641 + }, + { + "epoch": 0.15468312098160666, + "grad_norm": 0.9698359966278076, + "learning_rate": 1.9831619069877867e-05, + "loss": 0.3943, + "step": 1642 + }, + { + "epoch": 0.15477732507477449, + "grad_norm": 0.7887962460517883, + "learning_rate": 1.9831343022218998e-05, + "loss": 0.327, + "step": 1643 + }, + { + "epoch": 0.15487152916794233, + "grad_norm": 0.9296134114265442, + "learning_rate": 1.98310667503903e-05, + "loss": 0.3722, + "step": 1644 + }, + { + "epoch": 0.15496573326111018, + "grad_norm": 0.9402286410331726, + "learning_rate": 1.9830790254398078e-05, + "loss": 0.4177, + "step": 1645 + }, + { + "epoch": 0.15505993735427803, + "grad_norm": 0.8214709162712097, + "learning_rate": 1.9830513534248635e-05, + "loss": 0.3799, + "step": 1646 + }, + { + "epoch": 0.15515414144744588, + "grad_norm": 0.8556320071220398, + "learning_rate": 1.983023658994828e-05, + "loss": 0.3611, + "step": 1647 + }, + { + "epoch": 0.15524834554061373, + "grad_norm": 0.8597833514213562, + "learning_rate": 1.982995942150333e-05, + "loss": 0.3714, + "step": 1648 + }, + { + "epoch": 0.15534254963378158, + "grad_norm": 0.9170204401016235, + "learning_rate": 1.9829682028920102e-05, + "loss": 0.4182, + "step": 1649 + }, + { + "epoch": 0.15543675372694943, + "grad_norm": 0.840418815612793, + "learning_rate": 1.982940441220492e-05, + "loss": 0.3977, + "step": 1650 + }, + { + "epoch": 0.15553095782011728, + "grad_norm": 1.011001467704773, + "learning_rate": 1.9829126571364114e-05, + "loss": 0.4434, + "step": 1651 + }, + { + "epoch": 0.15562516191328513, + "grad_norm": 0.782615065574646, + "learning_rate": 1.9828848506404025e-05, + "loss": 0.3729, + "step": 1652 + }, + { + "epoch": 0.15571936600645297, + "grad_norm": 0.8115490674972534, + "learning_rate": 1.982857021733099e-05, + "loss": 0.3722, + "step": 1653 + }, + { + "epoch": 0.15581357009962082, + "grad_norm": 0.9715111255645752, + "learning_rate": 1.982829170415135e-05, + "loss": 0.3962, + "step": 1654 + }, + { + "epoch": 0.15590777419278867, + "grad_norm": 1.00404691696167, + "learning_rate": 1.9828012966871463e-05, + "loss": 0.4373, + "step": 1655 + }, + { + "epoch": 0.15600197828595652, + "grad_norm": 0.8605353832244873, + "learning_rate": 1.9827734005497677e-05, + "loss": 0.401, + "step": 1656 + }, + { + "epoch": 0.15609618237912437, + "grad_norm": 0.8684647679328918, + "learning_rate": 1.982745482003636e-05, + "loss": 0.3755, + "step": 1657 + }, + { + "epoch": 0.15619038647229222, + "grad_norm": 1.017825961112976, + "learning_rate": 1.9827175410493874e-05, + "loss": 0.3973, + "step": 1658 + }, + { + "epoch": 0.15628459056546007, + "grad_norm": 0.8150186538696289, + "learning_rate": 1.9826895776876594e-05, + "loss": 0.3319, + "step": 1659 + }, + { + "epoch": 0.15637879465862792, + "grad_norm": 0.8719780445098877, + "learning_rate": 1.9826615919190886e-05, + "loss": 0.3668, + "step": 1660 + }, + { + "epoch": 0.15647299875179577, + "grad_norm": 1.0488592386245728, + "learning_rate": 1.982633583744314e-05, + "loss": 0.414, + "step": 1661 + }, + { + "epoch": 0.15656720284496362, + "grad_norm": 0.8532881736755371, + "learning_rate": 1.982605553163974e-05, + "loss": 0.3757, + "step": 1662 + }, + { + "epoch": 0.15666140693813146, + "grad_norm": 0.9295089840888977, + "learning_rate": 1.9825775001787084e-05, + "loss": 0.3953, + "step": 1663 + }, + { + "epoch": 0.1567556110312993, + "grad_norm": 1.0750173330307007, + "learning_rate": 1.9825494247891557e-05, + "loss": 0.4015, + "step": 1664 + }, + { + "epoch": 0.15684981512446716, + "grad_norm": 0.8728357553482056, + "learning_rate": 1.9825213269959565e-05, + "loss": 0.3892, + "step": 1665 + }, + { + "epoch": 0.156944019217635, + "grad_norm": 0.9398319125175476, + "learning_rate": 1.9824932067997516e-05, + "loss": 0.4319, + "step": 1666 + }, + { + "epoch": 0.15703822331080286, + "grad_norm": 0.9120649099349976, + "learning_rate": 1.982465064201182e-05, + "loss": 0.4217, + "step": 1667 + }, + { + "epoch": 0.1571324274039707, + "grad_norm": 0.934065580368042, + "learning_rate": 1.98243689920089e-05, + "loss": 0.3464, + "step": 1668 + }, + { + "epoch": 0.15722663149713856, + "grad_norm": 0.8361982107162476, + "learning_rate": 1.982408711799517e-05, + "loss": 0.3657, + "step": 1669 + }, + { + "epoch": 0.1573208355903064, + "grad_norm": 0.9220681190490723, + "learning_rate": 1.982380501997706e-05, + "loss": 0.3386, + "step": 1670 + }, + { + "epoch": 0.15741503968347426, + "grad_norm": 0.8694973587989807, + "learning_rate": 1.9823522697961004e-05, + "loss": 0.3802, + "step": 1671 + }, + { + "epoch": 0.1575092437766421, + "grad_norm": 0.797346830368042, + "learning_rate": 1.9823240151953435e-05, + "loss": 0.3777, + "step": 1672 + }, + { + "epoch": 0.15760344786980995, + "grad_norm": 1.0393630266189575, + "learning_rate": 1.9822957381960802e-05, + "loss": 0.4052, + "step": 1673 + }, + { + "epoch": 0.1576976519629778, + "grad_norm": 1.0109260082244873, + "learning_rate": 1.9822674387989548e-05, + "loss": 0.4404, + "step": 1674 + }, + { + "epoch": 0.15779185605614565, + "grad_norm": 1.0433164834976196, + "learning_rate": 1.9822391170046127e-05, + "loss": 0.3708, + "step": 1675 + }, + { + "epoch": 0.1578860601493135, + "grad_norm": 0.8799563646316528, + "learning_rate": 1.9822107728137e-05, + "loss": 0.3827, + "step": 1676 + }, + { + "epoch": 0.15798026424248135, + "grad_norm": 0.9168888926506042, + "learning_rate": 1.982182406226862e-05, + "loss": 0.3589, + "step": 1677 + }, + { + "epoch": 0.15807446833564917, + "grad_norm": 0.9474049210548401, + "learning_rate": 1.9821540172447468e-05, + "loss": 0.3881, + "step": 1678 + }, + { + "epoch": 0.15816867242881702, + "grad_norm": 0.820391833782196, + "learning_rate": 1.982125605868001e-05, + "loss": 0.4008, + "step": 1679 + }, + { + "epoch": 0.15826287652198487, + "grad_norm": 0.857425332069397, + "learning_rate": 1.9820971720972723e-05, + "loss": 0.3829, + "step": 1680 + }, + { + "epoch": 0.15835708061515272, + "grad_norm": 0.8522971868515015, + "learning_rate": 1.9820687159332087e-05, + "loss": 0.4139, + "step": 1681 + }, + { + "epoch": 0.15845128470832057, + "grad_norm": 0.9846909642219543, + "learning_rate": 1.9820402373764604e-05, + "loss": 0.3765, + "step": 1682 + }, + { + "epoch": 0.15854548880148842, + "grad_norm": 0.9963827729225159, + "learning_rate": 1.9820117364276755e-05, + "loss": 0.4687, + "step": 1683 + }, + { + "epoch": 0.15863969289465626, + "grad_norm": 0.8635364770889282, + "learning_rate": 1.9819832130875044e-05, + "loss": 0.377, + "step": 1684 + }, + { + "epoch": 0.1587338969878241, + "grad_norm": 0.9445222616195679, + "learning_rate": 1.9819546673565975e-05, + "loss": 0.3951, + "step": 1685 + }, + { + "epoch": 0.15882810108099196, + "grad_norm": 0.8922547101974487, + "learning_rate": 1.9819260992356055e-05, + "loss": 0.3762, + "step": 1686 + }, + { + "epoch": 0.1589223051741598, + "grad_norm": 0.8504922389984131, + "learning_rate": 1.98189750872518e-05, + "loss": 0.404, + "step": 1687 + }, + { + "epoch": 0.15901650926732766, + "grad_norm": 0.9565543532371521, + "learning_rate": 1.9818688958259724e-05, + "loss": 0.4007, + "step": 1688 + }, + { + "epoch": 0.1591107133604955, + "grad_norm": 0.9205008149147034, + "learning_rate": 1.981840260538636e-05, + "loss": 0.3954, + "step": 1689 + }, + { + "epoch": 0.15920491745366336, + "grad_norm": 0.8571425080299377, + "learning_rate": 1.9818116028638224e-05, + "loss": 0.3808, + "step": 1690 + }, + { + "epoch": 0.1592991215468312, + "grad_norm": 1.0957655906677246, + "learning_rate": 1.9817829228021867e-05, + "loss": 0.4422, + "step": 1691 + }, + { + "epoch": 0.15939332563999906, + "grad_norm": 0.917987048625946, + "learning_rate": 1.9817542203543816e-05, + "loss": 0.3664, + "step": 1692 + }, + { + "epoch": 0.1594875297331669, + "grad_norm": 0.8339284658432007, + "learning_rate": 1.981725495521062e-05, + "loss": 0.3904, + "step": 1693 + }, + { + "epoch": 0.15958173382633475, + "grad_norm": 0.9408522248268127, + "learning_rate": 1.981696748302883e-05, + "loss": 0.4107, + "step": 1694 + }, + { + "epoch": 0.1596759379195026, + "grad_norm": 0.914353609085083, + "learning_rate": 1.9816679787005e-05, + "loss": 0.3887, + "step": 1695 + }, + { + "epoch": 0.15977014201267045, + "grad_norm": 0.859802782535553, + "learning_rate": 1.9816391867145685e-05, + "loss": 0.3477, + "step": 1696 + }, + { + "epoch": 0.1598643461058383, + "grad_norm": 0.9680219888687134, + "learning_rate": 1.9816103723457454e-05, + "loss": 0.3448, + "step": 1697 + }, + { + "epoch": 0.15995855019900615, + "grad_norm": 1.1162104606628418, + "learning_rate": 1.981581535594688e-05, + "loss": 0.3909, + "step": 1698 + }, + { + "epoch": 0.160052754292174, + "grad_norm": 1.0936061143875122, + "learning_rate": 1.9815526764620532e-05, + "loss": 0.4501, + "step": 1699 + }, + { + "epoch": 0.16014695838534185, + "grad_norm": 0.8204500079154968, + "learning_rate": 1.9815237949484998e-05, + "loss": 0.4177, + "step": 1700 + }, + { + "epoch": 0.1602411624785097, + "grad_norm": 0.8426515460014343, + "learning_rate": 1.981494891054686e-05, + "loss": 0.4003, + "step": 1701 + }, + { + "epoch": 0.16033536657167755, + "grad_norm": 0.8222750425338745, + "learning_rate": 1.9814659647812702e-05, + "loss": 0.3544, + "step": 1702 + }, + { + "epoch": 0.1604295706648454, + "grad_norm": 1.068662405014038, + "learning_rate": 1.981437016128913e-05, + "loss": 0.4428, + "step": 1703 + }, + { + "epoch": 0.16052377475801324, + "grad_norm": 0.8300691246986389, + "learning_rate": 1.981408045098274e-05, + "loss": 0.3453, + "step": 1704 + }, + { + "epoch": 0.1606179788511811, + "grad_norm": 0.835858941078186, + "learning_rate": 1.9813790516900134e-05, + "loss": 0.3334, + "step": 1705 + }, + { + "epoch": 0.16071218294434894, + "grad_norm": 0.8602931499481201, + "learning_rate": 1.981350035904793e-05, + "loss": 0.3839, + "step": 1706 + }, + { + "epoch": 0.1608063870375168, + "grad_norm": 0.9044490456581116, + "learning_rate": 1.981320997743274e-05, + "loss": 0.4032, + "step": 1707 + }, + { + "epoch": 0.16090059113068464, + "grad_norm": 0.8686575293540955, + "learning_rate": 1.9812919372061187e-05, + "loss": 0.398, + "step": 1708 + }, + { + "epoch": 0.1609947952238525, + "grad_norm": 0.7816247344017029, + "learning_rate": 1.9812628542939897e-05, + "loss": 0.3386, + "step": 1709 + }, + { + "epoch": 0.16108899931702034, + "grad_norm": 0.9940059781074524, + "learning_rate": 1.98123374900755e-05, + "loss": 0.3753, + "step": 1710 + }, + { + "epoch": 0.16118320341018819, + "grad_norm": 0.9607025980949402, + "learning_rate": 1.9812046213474632e-05, + "loss": 0.4175, + "step": 1711 + }, + { + "epoch": 0.161277407503356, + "grad_norm": 0.993575930595398, + "learning_rate": 1.9811754713143936e-05, + "loss": 0.4561, + "step": 1712 + }, + { + "epoch": 0.16137161159652386, + "grad_norm": 0.966627299785614, + "learning_rate": 1.981146298909006e-05, + "loss": 0.3753, + "step": 1713 + }, + { + "epoch": 0.1614658156896917, + "grad_norm": 0.9251071810722351, + "learning_rate": 1.981117104131965e-05, + "loss": 0.369, + "step": 1714 + }, + { + "epoch": 0.16156001978285955, + "grad_norm": 1.022159457206726, + "learning_rate": 1.981087886983937e-05, + "loss": 0.4351, + "step": 1715 + }, + { + "epoch": 0.1616542238760274, + "grad_norm": 0.9397159814834595, + "learning_rate": 1.981058647465588e-05, + "loss": 0.4274, + "step": 1716 + }, + { + "epoch": 0.16174842796919525, + "grad_norm": 0.9194698333740234, + "learning_rate": 1.9810293855775845e-05, + "loss": 0.416, + "step": 1717 + }, + { + "epoch": 0.1618426320623631, + "grad_norm": 0.848105251789093, + "learning_rate": 1.9810001013205936e-05, + "loss": 0.3991, + "step": 1718 + }, + { + "epoch": 0.16193683615553095, + "grad_norm": 0.9086849689483643, + "learning_rate": 1.9809707946952837e-05, + "loss": 0.3805, + "step": 1719 + }, + { + "epoch": 0.1620310402486988, + "grad_norm": 0.9052044153213501, + "learning_rate": 1.9809414657023222e-05, + "loss": 0.3928, + "step": 1720 + }, + { + "epoch": 0.16212524434186665, + "grad_norm": 0.8989402055740356, + "learning_rate": 1.9809121143423783e-05, + "loss": 0.3895, + "step": 1721 + }, + { + "epoch": 0.1622194484350345, + "grad_norm": 0.909193217754364, + "learning_rate": 1.9808827406161215e-05, + "loss": 0.3809, + "step": 1722 + }, + { + "epoch": 0.16231365252820235, + "grad_norm": 0.886569082736969, + "learning_rate": 1.980853344524221e-05, + "loss": 0.4072, + "step": 1723 + }, + { + "epoch": 0.1624078566213702, + "grad_norm": 0.8686087727546692, + "learning_rate": 1.9808239260673473e-05, + "loss": 0.356, + "step": 1724 + }, + { + "epoch": 0.16250206071453804, + "grad_norm": 0.8799514174461365, + "learning_rate": 1.9807944852461714e-05, + "loss": 0.4252, + "step": 1725 + }, + { + "epoch": 0.1625962648077059, + "grad_norm": 0.9682909250259399, + "learning_rate": 1.980765022061364e-05, + "loss": 0.3522, + "step": 1726 + }, + { + "epoch": 0.16269046890087374, + "grad_norm": 0.9706382155418396, + "learning_rate": 1.9807355365135978e-05, + "loss": 0.3659, + "step": 1727 + }, + { + "epoch": 0.1627846729940416, + "grad_norm": 1.0148420333862305, + "learning_rate": 1.9807060286035443e-05, + "loss": 0.4255, + "step": 1728 + }, + { + "epoch": 0.16287887708720944, + "grad_norm": 0.8712765574455261, + "learning_rate": 1.9806764983318766e-05, + "loss": 0.3685, + "step": 1729 + }, + { + "epoch": 0.1629730811803773, + "grad_norm": 0.9349738955497742, + "learning_rate": 1.9806469456992682e-05, + "loss": 0.4037, + "step": 1730 + }, + { + "epoch": 0.16306728527354514, + "grad_norm": 0.9326770901679993, + "learning_rate": 1.980617370706393e-05, + "loss": 0.409, + "step": 1731 + }, + { + "epoch": 0.163161489366713, + "grad_norm": 0.8601573705673218, + "learning_rate": 1.980587773353925e-05, + "loss": 0.3334, + "step": 1732 + }, + { + "epoch": 0.16325569345988084, + "grad_norm": 0.8509078025817871, + "learning_rate": 1.9805581536425393e-05, + "loss": 0.3676, + "step": 1733 + }, + { + "epoch": 0.16334989755304868, + "grad_norm": 0.934744656085968, + "learning_rate": 1.9805285115729113e-05, + "loss": 0.3749, + "step": 1734 + }, + { + "epoch": 0.16344410164621653, + "grad_norm": 0.875711977481842, + "learning_rate": 1.9804988471457168e-05, + "loss": 0.3843, + "step": 1735 + }, + { + "epoch": 0.16353830573938438, + "grad_norm": 0.8410339951515198, + "learning_rate": 1.9804691603616324e-05, + "loss": 0.3807, + "step": 1736 + }, + { + "epoch": 0.16363250983255223, + "grad_norm": 0.8422248363494873, + "learning_rate": 1.9804394512213342e-05, + "loss": 0.3945, + "step": 1737 + }, + { + "epoch": 0.16372671392572008, + "grad_norm": 0.8934512734413147, + "learning_rate": 1.980409719725501e-05, + "loss": 0.3858, + "step": 1738 + }, + { + "epoch": 0.16382091801888793, + "grad_norm": 0.8517009615898132, + "learning_rate": 1.9803799658748096e-05, + "loss": 0.3946, + "step": 1739 + }, + { + "epoch": 0.16391512211205578, + "grad_norm": 0.988314151763916, + "learning_rate": 1.9803501896699385e-05, + "loss": 0.417, + "step": 1740 + }, + { + "epoch": 0.16400932620522363, + "grad_norm": 0.994186282157898, + "learning_rate": 1.9803203911115677e-05, + "loss": 0.4031, + "step": 1741 + }, + { + "epoch": 0.16410353029839148, + "grad_norm": 0.8861984014511108, + "learning_rate": 1.9802905702003753e-05, + "loss": 0.4167, + "step": 1742 + }, + { + "epoch": 0.16419773439155932, + "grad_norm": 0.9142113327980042, + "learning_rate": 1.9802607269370418e-05, + "loss": 0.4387, + "step": 1743 + }, + { + "epoch": 0.16429193848472717, + "grad_norm": 0.8992695212364197, + "learning_rate": 1.980230861322248e-05, + "loss": 0.3591, + "step": 1744 + }, + { + "epoch": 0.16438614257789502, + "grad_norm": 0.9266336560249329, + "learning_rate": 1.9802009733566744e-05, + "loss": 0.4222, + "step": 1745 + }, + { + "epoch": 0.16448034667106287, + "grad_norm": 0.8919268250465393, + "learning_rate": 1.980171063041003e-05, + "loss": 0.3914, + "step": 1746 + }, + { + "epoch": 0.1645745507642307, + "grad_norm": 0.796708345413208, + "learning_rate": 1.9801411303759154e-05, + "loss": 0.3747, + "step": 1747 + }, + { + "epoch": 0.16466875485739854, + "grad_norm": 0.9004980325698853, + "learning_rate": 1.980111175362094e-05, + "loss": 0.386, + "step": 1748 + }, + { + "epoch": 0.1647629589505664, + "grad_norm": 0.9157950282096863, + "learning_rate": 1.9800811980002218e-05, + "loss": 0.3937, + "step": 1749 + }, + { + "epoch": 0.16485716304373424, + "grad_norm": 0.9503116011619568, + "learning_rate": 1.980051198290983e-05, + "loss": 0.4312, + "step": 1750 + }, + { + "epoch": 0.1649513671369021, + "grad_norm": 0.7941295504570007, + "learning_rate": 1.9800211762350612e-05, + "loss": 0.3391, + "step": 1751 + }, + { + "epoch": 0.16504557123006994, + "grad_norm": 1.0035288333892822, + "learning_rate": 1.9799911318331407e-05, + "loss": 0.4451, + "step": 1752 + }, + { + "epoch": 0.1651397753232378, + "grad_norm": 0.8477427959442139, + "learning_rate": 1.979961065085907e-05, + "loss": 0.3396, + "step": 1753 + }, + { + "epoch": 0.16523397941640564, + "grad_norm": 0.8529664874076843, + "learning_rate": 1.9799309759940457e-05, + "loss": 0.4083, + "step": 1754 + }, + { + "epoch": 0.16532818350957348, + "grad_norm": 0.9405017495155334, + "learning_rate": 1.9799008645582424e-05, + "loss": 0.4651, + "step": 1755 + }, + { + "epoch": 0.16542238760274133, + "grad_norm": 0.8225582242012024, + "learning_rate": 1.9798707307791837e-05, + "loss": 0.3528, + "step": 1756 + }, + { + "epoch": 0.16551659169590918, + "grad_norm": 0.8793336153030396, + "learning_rate": 1.9798405746575572e-05, + "loss": 0.3746, + "step": 1757 + }, + { + "epoch": 0.16561079578907703, + "grad_norm": 0.9157631397247314, + "learning_rate": 1.9798103961940503e-05, + "loss": 0.3768, + "step": 1758 + }, + { + "epoch": 0.16570499988224488, + "grad_norm": 0.8963596224784851, + "learning_rate": 1.979780195389351e-05, + "loss": 0.3725, + "step": 1759 + }, + { + "epoch": 0.16579920397541273, + "grad_norm": 1.0575075149536133, + "learning_rate": 1.979749972244148e-05, + "loss": 0.4207, + "step": 1760 + }, + { + "epoch": 0.16589340806858058, + "grad_norm": 0.8952525854110718, + "learning_rate": 1.9797197267591304e-05, + "loss": 0.3782, + "step": 1761 + }, + { + "epoch": 0.16598761216174843, + "grad_norm": 0.9012593030929565, + "learning_rate": 1.979689458934988e-05, + "loss": 0.3595, + "step": 1762 + }, + { + "epoch": 0.16608181625491628, + "grad_norm": 0.7629408240318298, + "learning_rate": 1.9796591687724103e-05, + "loss": 0.3153, + "step": 1763 + }, + { + "epoch": 0.16617602034808412, + "grad_norm": 1.0397006273269653, + "learning_rate": 1.979628856272089e-05, + "loss": 0.4345, + "step": 1764 + }, + { + "epoch": 0.16627022444125197, + "grad_norm": 0.9040034413337708, + "learning_rate": 1.9795985214347146e-05, + "loss": 0.3802, + "step": 1765 + }, + { + "epoch": 0.16636442853441982, + "grad_norm": 0.8815705180168152, + "learning_rate": 1.979568164260979e-05, + "loss": 0.3452, + "step": 1766 + }, + { + "epoch": 0.16645863262758767, + "grad_norm": 0.9503600001335144, + "learning_rate": 1.9795377847515743e-05, + "loss": 0.3307, + "step": 1767 + }, + { + "epoch": 0.16655283672075552, + "grad_norm": 0.9405672550201416, + "learning_rate": 1.979507382907193e-05, + "loss": 0.4515, + "step": 1768 + }, + { + "epoch": 0.16664704081392337, + "grad_norm": 1.1875604391098022, + "learning_rate": 1.9794769587285287e-05, + "loss": 0.5184, + "step": 1769 + }, + { + "epoch": 0.16674124490709122, + "grad_norm": 0.998553991317749, + "learning_rate": 1.979446512216275e-05, + "loss": 0.41, + "step": 1770 + }, + { + "epoch": 0.16683544900025907, + "grad_norm": 0.8785516023635864, + "learning_rate": 1.979416043371126e-05, + "loss": 0.3665, + "step": 1771 + }, + { + "epoch": 0.16692965309342692, + "grad_norm": 0.8816999197006226, + "learning_rate": 1.9793855521937766e-05, + "loss": 0.3452, + "step": 1772 + }, + { + "epoch": 0.16702385718659477, + "grad_norm": 0.8487377762794495, + "learning_rate": 1.979355038684922e-05, + "loss": 0.4154, + "step": 1773 + }, + { + "epoch": 0.16711806127976261, + "grad_norm": 0.9029588103294373, + "learning_rate": 1.9793245028452577e-05, + "loss": 0.3713, + "step": 1774 + }, + { + "epoch": 0.16721226537293046, + "grad_norm": 0.8425957560539246, + "learning_rate": 1.9792939446754804e-05, + "loss": 0.3727, + "step": 1775 + }, + { + "epoch": 0.1673064694660983, + "grad_norm": 0.83709317445755, + "learning_rate": 1.9792633641762865e-05, + "loss": 0.3617, + "step": 1776 + }, + { + "epoch": 0.16740067355926616, + "grad_norm": 0.8353833556175232, + "learning_rate": 1.9792327613483735e-05, + "loss": 0.3563, + "step": 1777 + }, + { + "epoch": 0.167494877652434, + "grad_norm": 0.9072156548500061, + "learning_rate": 1.9792021361924392e-05, + "loss": 0.3671, + "step": 1778 + }, + { + "epoch": 0.16758908174560186, + "grad_norm": 1.0261179208755493, + "learning_rate": 1.9791714887091816e-05, + "loss": 0.3652, + "step": 1779 + }, + { + "epoch": 0.1676832858387697, + "grad_norm": 0.9820303320884705, + "learning_rate": 1.9791408188993003e-05, + "loss": 0.4634, + "step": 1780 + }, + { + "epoch": 0.16777748993193753, + "grad_norm": 0.8858616352081299, + "learning_rate": 1.9791101267634937e-05, + "loss": 0.3631, + "step": 1781 + }, + { + "epoch": 0.16787169402510538, + "grad_norm": 0.8245965838432312, + "learning_rate": 1.9790794123024618e-05, + "loss": 0.3997, + "step": 1782 + }, + { + "epoch": 0.16796589811827323, + "grad_norm": 0.9756718873977661, + "learning_rate": 1.979048675516905e-05, + "loss": 0.467, + "step": 1783 + }, + { + "epoch": 0.16806010221144108, + "grad_norm": 0.9283931851387024, + "learning_rate": 1.9790179164075247e-05, + "loss": 0.4251, + "step": 1784 + }, + { + "epoch": 0.16815430630460892, + "grad_norm": 1.0229156017303467, + "learning_rate": 1.9789871349750216e-05, + "loss": 0.3911, + "step": 1785 + }, + { + "epoch": 0.16824851039777677, + "grad_norm": 0.9102269411087036, + "learning_rate": 1.978956331220098e-05, + "loss": 0.4388, + "step": 1786 + }, + { + "epoch": 0.16834271449094462, + "grad_norm": 0.8671599626541138, + "learning_rate": 1.978925505143456e-05, + "loss": 0.3785, + "step": 1787 + }, + { + "epoch": 0.16843691858411247, + "grad_norm": 0.7886928915977478, + "learning_rate": 1.9788946567457982e-05, + "loss": 0.3627, + "step": 1788 + }, + { + "epoch": 0.16853112267728032, + "grad_norm": 0.8896774053573608, + "learning_rate": 1.978863786027829e-05, + "loss": 0.3877, + "step": 1789 + }, + { + "epoch": 0.16862532677044817, + "grad_norm": 0.873624861240387, + "learning_rate": 1.978832892990251e-05, + "loss": 0.4248, + "step": 1790 + }, + { + "epoch": 0.16871953086361602, + "grad_norm": 0.9539051055908203, + "learning_rate": 1.9788019776337693e-05, + "loss": 0.4488, + "step": 1791 + }, + { + "epoch": 0.16881373495678387, + "grad_norm": 0.7920552492141724, + "learning_rate": 1.978771039959089e-05, + "loss": 0.3748, + "step": 1792 + }, + { + "epoch": 0.16890793904995172, + "grad_norm": 0.8468660712242126, + "learning_rate": 1.9787400799669155e-05, + "loss": 0.3353, + "step": 1793 + }, + { + "epoch": 0.16900214314311957, + "grad_norm": 1.2447292804718018, + "learning_rate": 1.978709097657954e-05, + "loss": 0.4093, + "step": 1794 + }, + { + "epoch": 0.16909634723628741, + "grad_norm": 0.9498482346534729, + "learning_rate": 1.978678093032912e-05, + "loss": 0.4097, + "step": 1795 + }, + { + "epoch": 0.16919055132945526, + "grad_norm": 0.827974259853363, + "learning_rate": 1.9786470660924958e-05, + "loss": 0.3704, + "step": 1796 + }, + { + "epoch": 0.1692847554226231, + "grad_norm": 0.8106819987297058, + "learning_rate": 1.9786160168374125e-05, + "loss": 0.3429, + "step": 1797 + }, + { + "epoch": 0.16937895951579096, + "grad_norm": 0.8402098417282104, + "learning_rate": 1.978584945268371e-05, + "loss": 0.3797, + "step": 1798 + }, + { + "epoch": 0.1694731636089588, + "grad_norm": 0.8330712914466858, + "learning_rate": 1.9785538513860794e-05, + "loss": 0.3266, + "step": 1799 + }, + { + "epoch": 0.16956736770212666, + "grad_norm": 0.9236308336257935, + "learning_rate": 1.978522735191246e-05, + "loss": 0.3974, + "step": 1800 + }, + { + "epoch": 0.1696615717952945, + "grad_norm": 1.0042564868927002, + "learning_rate": 1.9784915966845817e-05, + "loss": 0.3724, + "step": 1801 + }, + { + "epoch": 0.16975577588846236, + "grad_norm": 0.9548354148864746, + "learning_rate": 1.9784604358667954e-05, + "loss": 0.3869, + "step": 1802 + }, + { + "epoch": 0.1698499799816302, + "grad_norm": 0.8802642226219177, + "learning_rate": 1.978429252738598e-05, + "loss": 0.4395, + "step": 1803 + }, + { + "epoch": 0.16994418407479805, + "grad_norm": 0.8193650245666504, + "learning_rate": 1.9783980473007004e-05, + "loss": 0.3548, + "step": 1804 + }, + { + "epoch": 0.1700383881679659, + "grad_norm": 0.9388848543167114, + "learning_rate": 1.9783668195538143e-05, + "loss": 0.3968, + "step": 1805 + }, + { + "epoch": 0.17013259226113375, + "grad_norm": 0.8756192922592163, + "learning_rate": 1.9783355694986516e-05, + "loss": 0.4299, + "step": 1806 + }, + { + "epoch": 0.1702267963543016, + "grad_norm": 0.9549397230148315, + "learning_rate": 1.978304297135925e-05, + "loss": 0.4035, + "step": 1807 + }, + { + "epoch": 0.17032100044746945, + "grad_norm": 0.8104063272476196, + "learning_rate": 1.978273002466347e-05, + "loss": 0.3794, + "step": 1808 + }, + { + "epoch": 0.1704152045406373, + "grad_norm": 0.84712815284729, + "learning_rate": 1.978241685490632e-05, + "loss": 0.3785, + "step": 1809 + }, + { + "epoch": 0.17050940863380515, + "grad_norm": 0.9174196720123291, + "learning_rate": 1.9782103462094935e-05, + "loss": 0.4004, + "step": 1810 + }, + { + "epoch": 0.170603612726973, + "grad_norm": 0.8831159472465515, + "learning_rate": 1.9781789846236466e-05, + "loss": 0.3719, + "step": 1811 + }, + { + "epoch": 0.17069781682014085, + "grad_norm": 0.8972384333610535, + "learning_rate": 1.9781476007338058e-05, + "loss": 0.3865, + "step": 1812 + }, + { + "epoch": 0.1707920209133087, + "grad_norm": 0.947032630443573, + "learning_rate": 1.978116194540687e-05, + "loss": 0.3844, + "step": 1813 + }, + { + "epoch": 0.17088622500647654, + "grad_norm": 0.9138824343681335, + "learning_rate": 1.9780847660450062e-05, + "loss": 0.3505, + "step": 1814 + }, + { + "epoch": 0.1709804290996444, + "grad_norm": 0.9780575633049011, + "learning_rate": 1.9780533152474802e-05, + "loss": 0.3467, + "step": 1815 + }, + { + "epoch": 0.17107463319281221, + "grad_norm": 0.9022666215896606, + "learning_rate": 1.9780218421488263e-05, + "loss": 0.3679, + "step": 1816 + }, + { + "epoch": 0.17116883728598006, + "grad_norm": 0.9314306974411011, + "learning_rate": 1.9779903467497614e-05, + "loss": 0.4166, + "step": 1817 + }, + { + "epoch": 0.1712630413791479, + "grad_norm": 0.9205523133277893, + "learning_rate": 1.9779588290510044e-05, + "loss": 0.3931, + "step": 1818 + }, + { + "epoch": 0.17135724547231576, + "grad_norm": 1.0727566480636597, + "learning_rate": 1.9779272890532733e-05, + "loss": 0.4415, + "step": 1819 + }, + { + "epoch": 0.1714514495654836, + "grad_norm": 1.0124160051345825, + "learning_rate": 1.977895726757288e-05, + "loss": 0.4145, + "step": 1820 + }, + { + "epoch": 0.17154565365865146, + "grad_norm": 1.020991563796997, + "learning_rate": 1.977864142163768e-05, + "loss": 0.4205, + "step": 1821 + }, + { + "epoch": 0.1716398577518193, + "grad_norm": 0.8599303364753723, + "learning_rate": 1.9778325352734326e-05, + "loss": 0.3583, + "step": 1822 + }, + { + "epoch": 0.17173406184498716, + "grad_norm": 0.9480980038642883, + "learning_rate": 1.9778009060870035e-05, + "loss": 0.4539, + "step": 1823 + }, + { + "epoch": 0.171828265938155, + "grad_norm": 0.9444804191589355, + "learning_rate": 1.9777692546052014e-05, + "loss": 0.4561, + "step": 1824 + }, + { + "epoch": 0.17192247003132285, + "grad_norm": 1.0277150869369507, + "learning_rate": 1.977737580828748e-05, + "loss": 0.4486, + "step": 1825 + }, + { + "epoch": 0.1720166741244907, + "grad_norm": 0.7958475351333618, + "learning_rate": 1.977705884758366e-05, + "loss": 0.3596, + "step": 1826 + }, + { + "epoch": 0.17211087821765855, + "grad_norm": 0.8301064968109131, + "learning_rate": 1.977674166394778e-05, + "loss": 0.3664, + "step": 1827 + }, + { + "epoch": 0.1722050823108264, + "grad_norm": 1.009836196899414, + "learning_rate": 1.9776424257387066e-05, + "loss": 0.418, + "step": 1828 + }, + { + "epoch": 0.17229928640399425, + "grad_norm": 0.8978395462036133, + "learning_rate": 1.977610662790876e-05, + "loss": 0.3601, + "step": 1829 + }, + { + "epoch": 0.1723934904971621, + "grad_norm": 0.9540097713470459, + "learning_rate": 1.9775788775520105e-05, + "loss": 0.4594, + "step": 1830 + }, + { + "epoch": 0.17248769459032995, + "grad_norm": 0.8026547431945801, + "learning_rate": 1.9775470700228346e-05, + "loss": 0.3616, + "step": 1831 + }, + { + "epoch": 0.1725818986834978, + "grad_norm": 0.9324086904525757, + "learning_rate": 1.9775152402040735e-05, + "loss": 0.4165, + "step": 1832 + }, + { + "epoch": 0.17267610277666565, + "grad_norm": 0.9284736514091492, + "learning_rate": 1.9774833880964538e-05, + "loss": 0.4116, + "step": 1833 + }, + { + "epoch": 0.1727703068698335, + "grad_norm": 0.8991754651069641, + "learning_rate": 1.9774515137007006e-05, + "loss": 0.3435, + "step": 1834 + }, + { + "epoch": 0.17286451096300134, + "grad_norm": 1.0078505277633667, + "learning_rate": 1.9774196170175414e-05, + "loss": 0.4604, + "step": 1835 + }, + { + "epoch": 0.1729587150561692, + "grad_norm": 0.9617123007774353, + "learning_rate": 1.9773876980477033e-05, + "loss": 0.3642, + "step": 1836 + }, + { + "epoch": 0.17305291914933704, + "grad_norm": 0.9242467880249023, + "learning_rate": 1.977355756791914e-05, + "loss": 0.3946, + "step": 1837 + }, + { + "epoch": 0.1731471232425049, + "grad_norm": 0.7867587804794312, + "learning_rate": 1.977323793250902e-05, + "loss": 0.3754, + "step": 1838 + }, + { + "epoch": 0.17324132733567274, + "grad_norm": 0.8909556865692139, + "learning_rate": 1.9772918074253965e-05, + "loss": 0.4012, + "step": 1839 + }, + { + "epoch": 0.1733355314288406, + "grad_norm": 0.9703721404075623, + "learning_rate": 1.977259799316126e-05, + "loss": 0.3578, + "step": 1840 + }, + { + "epoch": 0.17342973552200844, + "grad_norm": 0.8696607351303101, + "learning_rate": 1.9772277689238205e-05, + "loss": 0.3712, + "step": 1841 + }, + { + "epoch": 0.1735239396151763, + "grad_norm": 0.9312201142311096, + "learning_rate": 1.977195716249211e-05, + "loss": 0.4205, + "step": 1842 + }, + { + "epoch": 0.17361814370834414, + "grad_norm": 0.951821506023407, + "learning_rate": 1.9771636412930274e-05, + "loss": 0.4594, + "step": 1843 + }, + { + "epoch": 0.17371234780151198, + "grad_norm": 0.900589644908905, + "learning_rate": 1.977131544056002e-05, + "loss": 0.3755, + "step": 1844 + }, + { + "epoch": 0.17380655189467983, + "grad_norm": 0.8959382772445679, + "learning_rate": 1.9770994245388658e-05, + "loss": 0.3854, + "step": 1845 + }, + { + "epoch": 0.17390075598784768, + "grad_norm": 0.84087073802948, + "learning_rate": 1.977067282742352e-05, + "loss": 0.41, + "step": 1846 + }, + { + "epoch": 0.17399496008101553, + "grad_norm": 0.7898133993148804, + "learning_rate": 1.977035118667193e-05, + "loss": 0.393, + "step": 1847 + }, + { + "epoch": 0.17408916417418338, + "grad_norm": 0.8709238171577454, + "learning_rate": 1.9770029323141224e-05, + "loss": 0.4058, + "step": 1848 + }, + { + "epoch": 0.17418336826735123, + "grad_norm": 0.931500256061554, + "learning_rate": 1.9769707236838737e-05, + "loss": 0.4019, + "step": 1849 + }, + { + "epoch": 0.17427757236051905, + "grad_norm": 0.7755554914474487, + "learning_rate": 1.976938492777182e-05, + "loss": 0.3452, + "step": 1850 + }, + { + "epoch": 0.1743717764536869, + "grad_norm": 0.8617866635322571, + "learning_rate": 1.9769062395947814e-05, + "loss": 0.4383, + "step": 1851 + }, + { + "epoch": 0.17446598054685475, + "grad_norm": 0.865623414516449, + "learning_rate": 1.976873964137408e-05, + "loss": 0.4186, + "step": 1852 + }, + { + "epoch": 0.1745601846400226, + "grad_norm": 1.092434287071228, + "learning_rate": 1.9768416664057973e-05, + "loss": 0.4365, + "step": 1853 + }, + { + "epoch": 0.17465438873319045, + "grad_norm": 0.874930739402771, + "learning_rate": 1.9768093464006856e-05, + "loss": 0.3717, + "step": 1854 + }, + { + "epoch": 0.1747485928263583, + "grad_norm": 0.8707634210586548, + "learning_rate": 1.9767770041228104e-05, + "loss": 0.4036, + "step": 1855 + }, + { + "epoch": 0.17484279691952614, + "grad_norm": 0.9189536571502686, + "learning_rate": 1.976744639572909e-05, + "loss": 0.4176, + "step": 1856 + }, + { + "epoch": 0.174937001012694, + "grad_norm": 0.8922774791717529, + "learning_rate": 1.9767122527517194e-05, + "loss": 0.4004, + "step": 1857 + }, + { + "epoch": 0.17503120510586184, + "grad_norm": 0.8719164729118347, + "learning_rate": 1.9766798436599795e-05, + "loss": 0.3678, + "step": 1858 + }, + { + "epoch": 0.1751254091990297, + "grad_norm": 0.9028275012969971, + "learning_rate": 1.9766474122984288e-05, + "loss": 0.4334, + "step": 1859 + }, + { + "epoch": 0.17521961329219754, + "grad_norm": 0.8829405903816223, + "learning_rate": 1.976614958667807e-05, + "loss": 0.4126, + "step": 1860 + }, + { + "epoch": 0.1753138173853654, + "grad_norm": 1.028039813041687, + "learning_rate": 1.9765824827688536e-05, + "loss": 0.4269, + "step": 1861 + }, + { + "epoch": 0.17540802147853324, + "grad_norm": 0.89798504114151, + "learning_rate": 1.976549984602309e-05, + "loss": 0.406, + "step": 1862 + }, + { + "epoch": 0.1755022255717011, + "grad_norm": 0.9848604202270508, + "learning_rate": 1.976517464168915e-05, + "loss": 0.3874, + "step": 1863 + }, + { + "epoch": 0.17559642966486894, + "grad_norm": 0.9031936526298523, + "learning_rate": 1.9764849214694122e-05, + "loss": 0.4119, + "step": 1864 + }, + { + "epoch": 0.17569063375803678, + "grad_norm": 1.0898340940475464, + "learning_rate": 1.976452356504543e-05, + "loss": 0.3884, + "step": 1865 + }, + { + "epoch": 0.17578483785120463, + "grad_norm": 0.8496800661087036, + "learning_rate": 1.9764197692750502e-05, + "loss": 0.3472, + "step": 1866 + }, + { + "epoch": 0.17587904194437248, + "grad_norm": 0.9799139499664307, + "learning_rate": 1.9763871597816765e-05, + "loss": 0.4278, + "step": 1867 + }, + { + "epoch": 0.17597324603754033, + "grad_norm": 0.9434148073196411, + "learning_rate": 1.9763545280251657e-05, + "loss": 0.3745, + "step": 1868 + }, + { + "epoch": 0.17606745013070818, + "grad_norm": 0.9892056584358215, + "learning_rate": 1.9763218740062613e-05, + "loss": 0.4429, + "step": 1869 + }, + { + "epoch": 0.17616165422387603, + "grad_norm": 0.8674705624580383, + "learning_rate": 1.976289197725709e-05, + "loss": 0.3697, + "step": 1870 + }, + { + "epoch": 0.17625585831704388, + "grad_norm": 0.8550935983657837, + "learning_rate": 1.9762564991842524e-05, + "loss": 0.4158, + "step": 1871 + }, + { + "epoch": 0.17635006241021173, + "grad_norm": 0.9619081020355225, + "learning_rate": 1.9762237783826383e-05, + "loss": 0.4016, + "step": 1872 + }, + { + "epoch": 0.17644426650337958, + "grad_norm": 0.9321436882019043, + "learning_rate": 1.9761910353216116e-05, + "loss": 0.363, + "step": 1873 + }, + { + "epoch": 0.17653847059654743, + "grad_norm": 0.965522289276123, + "learning_rate": 1.9761582700019203e-05, + "loss": 0.4403, + "step": 1874 + }, + { + "epoch": 0.17663267468971527, + "grad_norm": 0.8548703193664551, + "learning_rate": 1.9761254824243107e-05, + "loss": 0.3806, + "step": 1875 + }, + { + "epoch": 0.17672687878288312, + "grad_norm": 0.9804152846336365, + "learning_rate": 1.97609267258953e-05, + "loss": 0.3895, + "step": 1876 + }, + { + "epoch": 0.17682108287605097, + "grad_norm": 1.0053352117538452, + "learning_rate": 1.9760598404983274e-05, + "loss": 0.414, + "step": 1877 + }, + { + "epoch": 0.17691528696921882, + "grad_norm": 0.9046112895011902, + "learning_rate": 1.9760269861514507e-05, + "loss": 0.4142, + "step": 1878 + }, + { + "epoch": 0.17700949106238667, + "grad_norm": 0.7706135511398315, + "learning_rate": 1.9759941095496493e-05, + "loss": 0.3311, + "step": 1879 + }, + { + "epoch": 0.17710369515555452, + "grad_norm": 0.9193723797798157, + "learning_rate": 1.9759612106936723e-05, + "loss": 0.3733, + "step": 1880 + }, + { + "epoch": 0.17719789924872237, + "grad_norm": 0.8780800104141235, + "learning_rate": 1.975928289584271e-05, + "loss": 0.379, + "step": 1881 + }, + { + "epoch": 0.17729210334189022, + "grad_norm": 0.8904425501823425, + "learning_rate": 1.975895346222195e-05, + "loss": 0.4162, + "step": 1882 + }, + { + "epoch": 0.17738630743505807, + "grad_norm": 0.8909766674041748, + "learning_rate": 1.975862380608196e-05, + "loss": 0.3837, + "step": 1883 + }, + { + "epoch": 0.17748051152822591, + "grad_norm": 1.136483073234558, + "learning_rate": 1.9758293927430253e-05, + "loss": 0.3665, + "step": 1884 + }, + { + "epoch": 0.17757471562139374, + "grad_norm": 0.8756337761878967, + "learning_rate": 1.9757963826274357e-05, + "loss": 0.3776, + "step": 1885 + }, + { + "epoch": 0.17766891971456159, + "grad_norm": 1.091659665107727, + "learning_rate": 1.9757633502621794e-05, + "loss": 0.4115, + "step": 1886 + }, + { + "epoch": 0.17776312380772943, + "grad_norm": 0.9444653391838074, + "learning_rate": 1.9757302956480094e-05, + "loss": 0.3877, + "step": 1887 + }, + { + "epoch": 0.17785732790089728, + "grad_norm": 3.0061488151550293, + "learning_rate": 1.97569721878568e-05, + "loss": 0.3707, + "step": 1888 + }, + { + "epoch": 0.17795153199406513, + "grad_norm": 0.8458983302116394, + "learning_rate": 1.975664119675945e-05, + "loss": 0.406, + "step": 1889 + }, + { + "epoch": 0.17804573608723298, + "grad_norm": 0.9263994693756104, + "learning_rate": 1.9756309983195588e-05, + "loss": 0.4169, + "step": 1890 + }, + { + "epoch": 0.17813994018040083, + "grad_norm": 0.7784213423728943, + "learning_rate": 1.9755978547172776e-05, + "loss": 0.3559, + "step": 1891 + }, + { + "epoch": 0.17823414427356868, + "grad_norm": 0.9472523927688599, + "learning_rate": 1.9755646888698563e-05, + "loss": 0.3956, + "step": 1892 + }, + { + "epoch": 0.17832834836673653, + "grad_norm": 1.0705050230026245, + "learning_rate": 1.9755315007780507e-05, + "loss": 0.4067, + "step": 1893 + }, + { + "epoch": 0.17842255245990438, + "grad_norm": 0.9809260964393616, + "learning_rate": 1.975498290442619e-05, + "loss": 0.3695, + "step": 1894 + }, + { + "epoch": 0.17851675655307223, + "grad_norm": 0.8297300338745117, + "learning_rate": 1.9754650578643174e-05, + "loss": 0.3607, + "step": 1895 + }, + { + "epoch": 0.17861096064624007, + "grad_norm": 0.9693791270256042, + "learning_rate": 1.9754318030439038e-05, + "loss": 0.4735, + "step": 1896 + }, + { + "epoch": 0.17870516473940792, + "grad_norm": 0.8396703600883484, + "learning_rate": 1.975398525982137e-05, + "loss": 0.372, + "step": 1897 + }, + { + "epoch": 0.17879936883257577, + "grad_norm": 0.9750856161117554, + "learning_rate": 1.9753652266797746e-05, + "loss": 0.383, + "step": 1898 + }, + { + "epoch": 0.17889357292574362, + "grad_norm": 0.942491888999939, + "learning_rate": 1.9753319051375772e-05, + "loss": 0.3777, + "step": 1899 + }, + { + "epoch": 0.17898777701891147, + "grad_norm": 0.9724046587944031, + "learning_rate": 1.9752985613563038e-05, + "loss": 0.4327, + "step": 1900 + }, + { + "epoch": 0.17908198111207932, + "grad_norm": 1.0240174531936646, + "learning_rate": 1.9752651953367147e-05, + "loss": 0.4255, + "step": 1901 + }, + { + "epoch": 0.17917618520524717, + "grad_norm": 0.8948530554771423, + "learning_rate": 1.975231807079571e-05, + "loss": 0.3802, + "step": 1902 + }, + { + "epoch": 0.17927038929841502, + "grad_norm": 1.09205162525177, + "learning_rate": 1.9751983965856342e-05, + "loss": 0.4074, + "step": 1903 + }, + { + "epoch": 0.17936459339158287, + "grad_norm": 1.0225067138671875, + "learning_rate": 1.975164963855665e-05, + "loss": 0.3375, + "step": 1904 + }, + { + "epoch": 0.17945879748475072, + "grad_norm": 0.8763222694396973, + "learning_rate": 1.975131508890427e-05, + "loss": 0.3723, + "step": 1905 + }, + { + "epoch": 0.17955300157791856, + "grad_norm": 0.9270104169845581, + "learning_rate": 1.9750980316906826e-05, + "loss": 0.3886, + "step": 1906 + }, + { + "epoch": 0.1796472056710864, + "grad_norm": 1.0478739738464355, + "learning_rate": 1.9750645322571952e-05, + "loss": 0.348, + "step": 1907 + }, + { + "epoch": 0.17974140976425426, + "grad_norm": 1.2006361484527588, + "learning_rate": 1.975031010590728e-05, + "loss": 0.3926, + "step": 1908 + }, + { + "epoch": 0.1798356138574221, + "grad_norm": 1.0440820455551147, + "learning_rate": 1.9749974666920462e-05, + "loss": 0.4468, + "step": 1909 + }, + { + "epoch": 0.17992981795058996, + "grad_norm": 0.8804754614830017, + "learning_rate": 1.974963900561914e-05, + "loss": 0.4056, + "step": 1910 + }, + { + "epoch": 0.1800240220437578, + "grad_norm": 0.8772451281547546, + "learning_rate": 1.974930312201097e-05, + "loss": 0.3966, + "step": 1911 + }, + { + "epoch": 0.18011822613692566, + "grad_norm": 0.98329758644104, + "learning_rate": 1.9748967016103617e-05, + "loss": 0.4242, + "step": 1912 + }, + { + "epoch": 0.1802124302300935, + "grad_norm": 0.9673380255699158, + "learning_rate": 1.9748630687904735e-05, + "loss": 0.4327, + "step": 1913 + }, + { + "epoch": 0.18030663432326136, + "grad_norm": 0.9209880828857422, + "learning_rate": 1.9748294137421997e-05, + "loss": 0.3492, + "step": 1914 + }, + { + "epoch": 0.1804008384164292, + "grad_norm": 0.8868668675422668, + "learning_rate": 1.9747957364663076e-05, + "loss": 0.3949, + "step": 1915 + }, + { + "epoch": 0.18049504250959705, + "grad_norm": 0.834490954875946, + "learning_rate": 1.9747620369635653e-05, + "loss": 0.3812, + "step": 1916 + }, + { + "epoch": 0.1805892466027649, + "grad_norm": 1.0661643743515015, + "learning_rate": 1.974728315234741e-05, + "loss": 0.4484, + "step": 1917 + }, + { + "epoch": 0.18068345069593275, + "grad_norm": 0.9173630475997925, + "learning_rate": 1.9746945712806037e-05, + "loss": 0.3215, + "step": 1918 + }, + { + "epoch": 0.1807776547891006, + "grad_norm": 1.13764226436615, + "learning_rate": 1.9746608051019227e-05, + "loss": 0.3646, + "step": 1919 + }, + { + "epoch": 0.18087185888226842, + "grad_norm": 0.9189788103103638, + "learning_rate": 1.9746270166994682e-05, + "loss": 0.4053, + "step": 1920 + }, + { + "epoch": 0.18096606297543627, + "grad_norm": 0.7904914617538452, + "learning_rate": 1.9745932060740103e-05, + "loss": 0.3388, + "step": 1921 + }, + { + "epoch": 0.18106026706860412, + "grad_norm": 0.9093592166900635, + "learning_rate": 1.97455937322632e-05, + "loss": 0.3893, + "step": 1922 + }, + { + "epoch": 0.18115447116177197, + "grad_norm": 0.9486411809921265, + "learning_rate": 1.9745255181571686e-05, + "loss": 0.3878, + "step": 1923 + }, + { + "epoch": 0.18124867525493982, + "grad_norm": 1.0105358362197876, + "learning_rate": 1.9744916408673287e-05, + "loss": 0.4407, + "step": 1924 + }, + { + "epoch": 0.18134287934810767, + "grad_norm": 0.8826894164085388, + "learning_rate": 1.9744577413575723e-05, + "loss": 0.3491, + "step": 1925 + }, + { + "epoch": 0.18143708344127552, + "grad_norm": 0.9980819821357727, + "learning_rate": 1.974423819628672e-05, + "loss": 0.3908, + "step": 1926 + }, + { + "epoch": 0.18153128753444336, + "grad_norm": 1.1008509397506714, + "learning_rate": 1.9743898756814018e-05, + "loss": 0.41, + "step": 1927 + }, + { + "epoch": 0.1816254916276112, + "grad_norm": 0.8422188758850098, + "learning_rate": 1.9743559095165357e-05, + "loss": 0.3257, + "step": 1928 + }, + { + "epoch": 0.18171969572077906, + "grad_norm": 0.898560643196106, + "learning_rate": 1.974321921134848e-05, + "loss": 0.3473, + "step": 1929 + }, + { + "epoch": 0.1818138998139469, + "grad_norm": 0.9736857414245605, + "learning_rate": 1.9742879105371132e-05, + "loss": 0.4067, + "step": 1930 + }, + { + "epoch": 0.18190810390711476, + "grad_norm": 0.9870139956474304, + "learning_rate": 1.9742538777241078e-05, + "loss": 0.4176, + "step": 1931 + }, + { + "epoch": 0.1820023080002826, + "grad_norm": 1.0926724672317505, + "learning_rate": 1.9742198226966073e-05, + "loss": 0.415, + "step": 1932 + }, + { + "epoch": 0.18209651209345046, + "grad_norm": 0.9447426795959473, + "learning_rate": 1.9741857454553876e-05, + "loss": 0.3997, + "step": 1933 + }, + { + "epoch": 0.1821907161866183, + "grad_norm": 1.0695010423660278, + "learning_rate": 1.9741516460012268e-05, + "loss": 0.3581, + "step": 1934 + }, + { + "epoch": 0.18228492027978616, + "grad_norm": 0.9521197080612183, + "learning_rate": 1.9741175243349017e-05, + "loss": 0.3883, + "step": 1935 + }, + { + "epoch": 0.182379124372954, + "grad_norm": 0.780623197555542, + "learning_rate": 1.9740833804571907e-05, + "loss": 0.3433, + "step": 1936 + }, + { + "epoch": 0.18247332846612185, + "grad_norm": 0.8583131432533264, + "learning_rate": 1.974049214368872e-05, + "loss": 0.3645, + "step": 1937 + }, + { + "epoch": 0.1825675325592897, + "grad_norm": 0.8302966356277466, + "learning_rate": 1.974015026070725e-05, + "loss": 0.367, + "step": 1938 + }, + { + "epoch": 0.18266173665245755, + "grad_norm": 0.8479434251785278, + "learning_rate": 1.973980815563529e-05, + "loss": 0.327, + "step": 1939 + }, + { + "epoch": 0.1827559407456254, + "grad_norm": 0.9212160110473633, + "learning_rate": 1.973946582848064e-05, + "loss": 0.3893, + "step": 1940 + }, + { + "epoch": 0.18285014483879325, + "grad_norm": 0.8256272077560425, + "learning_rate": 1.9739123279251108e-05, + "loss": 0.3961, + "step": 1941 + }, + { + "epoch": 0.1829443489319611, + "grad_norm": 0.8914819955825806, + "learning_rate": 1.97387805079545e-05, + "loss": 0.3936, + "step": 1942 + }, + { + "epoch": 0.18303855302512895, + "grad_norm": 0.9249416589736938, + "learning_rate": 1.973843751459864e-05, + "loss": 0.3438, + "step": 1943 + }, + { + "epoch": 0.1831327571182968, + "grad_norm": 1.0605783462524414, + "learning_rate": 1.973809429919134e-05, + "loss": 0.4249, + "step": 1944 + }, + { + "epoch": 0.18322696121146465, + "grad_norm": 0.9442580342292786, + "learning_rate": 1.9737750861740434e-05, + "loss": 0.4023, + "step": 1945 + }, + { + "epoch": 0.1833211653046325, + "grad_norm": 0.825469970703125, + "learning_rate": 1.9737407202253745e-05, + "loss": 0.3649, + "step": 1946 + }, + { + "epoch": 0.18341536939780034, + "grad_norm": 0.9349367618560791, + "learning_rate": 1.9737063320739115e-05, + "loss": 0.3942, + "step": 1947 + }, + { + "epoch": 0.1835095734909682, + "grad_norm": 0.941352128982544, + "learning_rate": 1.973671921720438e-05, + "loss": 0.4213, + "step": 1948 + }, + { + "epoch": 0.18360377758413604, + "grad_norm": 0.8577041029930115, + "learning_rate": 1.973637489165739e-05, + "loss": 0.3857, + "step": 1949 + }, + { + "epoch": 0.1836979816773039, + "grad_norm": 0.933680534362793, + "learning_rate": 1.9736030344105997e-05, + "loss": 0.4139, + "step": 1950 + }, + { + "epoch": 0.18379218577047174, + "grad_norm": 0.924106240272522, + "learning_rate": 1.973568557455805e-05, + "loss": 0.4217, + "step": 1951 + }, + { + "epoch": 0.1838863898636396, + "grad_norm": 0.9497937560081482, + "learning_rate": 1.9735340583021417e-05, + "loss": 0.3893, + "step": 1952 + }, + { + "epoch": 0.18398059395680744, + "grad_norm": 0.8727536201477051, + "learning_rate": 1.9734995369503967e-05, + "loss": 0.3175, + "step": 1953 + }, + { + "epoch": 0.18407479804997526, + "grad_norm": 0.9945586323738098, + "learning_rate": 1.9734649934013564e-05, + "loss": 0.4017, + "step": 1954 + }, + { + "epoch": 0.1841690021431431, + "grad_norm": 0.8452147245407104, + "learning_rate": 1.9734304276558086e-05, + "loss": 0.3552, + "step": 1955 + }, + { + "epoch": 0.18426320623631096, + "grad_norm": 0.8479359745979309, + "learning_rate": 1.973395839714542e-05, + "loss": 0.3854, + "step": 1956 + }, + { + "epoch": 0.1843574103294788, + "grad_norm": 0.779893696308136, + "learning_rate": 1.9733612295783448e-05, + "loss": 0.3676, + "step": 1957 + }, + { + "epoch": 0.18445161442264665, + "grad_norm": 0.8586198687553406, + "learning_rate": 1.973326597248006e-05, + "loss": 0.4006, + "step": 1958 + }, + { + "epoch": 0.1845458185158145, + "grad_norm": 0.8984857201576233, + "learning_rate": 1.9732919427243155e-05, + "loss": 0.3877, + "step": 1959 + }, + { + "epoch": 0.18464002260898235, + "grad_norm": 1.2245627641677856, + "learning_rate": 1.9732572660080634e-05, + "loss": 0.4117, + "step": 1960 + }, + { + "epoch": 0.1847342267021502, + "grad_norm": 0.7990955114364624, + "learning_rate": 1.9732225671000408e-05, + "loss": 0.3674, + "step": 1961 + }, + { + "epoch": 0.18482843079531805, + "grad_norm": 0.9033900499343872, + "learning_rate": 1.9731878460010386e-05, + "loss": 0.4033, + "step": 1962 + }, + { + "epoch": 0.1849226348884859, + "grad_norm": 1.060862421989441, + "learning_rate": 1.9731531027118482e-05, + "loss": 0.4059, + "step": 1963 + }, + { + "epoch": 0.18501683898165375, + "grad_norm": 0.9788298010826111, + "learning_rate": 1.973118337233262e-05, + "loss": 0.4468, + "step": 1964 + }, + { + "epoch": 0.1851110430748216, + "grad_norm": 0.8543902635574341, + "learning_rate": 1.973083549566073e-05, + "loss": 0.3672, + "step": 1965 + }, + { + "epoch": 0.18520524716798945, + "grad_norm": 0.8796471953392029, + "learning_rate": 1.973048739711074e-05, + "loss": 0.3988, + "step": 1966 + }, + { + "epoch": 0.1852994512611573, + "grad_norm": 0.8813859820365906, + "learning_rate": 1.973013907669059e-05, + "loss": 0.3839, + "step": 1967 + }, + { + "epoch": 0.18539365535432514, + "grad_norm": 0.8523325324058533, + "learning_rate": 1.9729790534408216e-05, + "loss": 0.3964, + "step": 1968 + }, + { + "epoch": 0.185487859447493, + "grad_norm": 0.7992582321166992, + "learning_rate": 1.972944177027158e-05, + "loss": 0.3735, + "step": 1969 + }, + { + "epoch": 0.18558206354066084, + "grad_norm": 0.7629032731056213, + "learning_rate": 1.9729092784288618e-05, + "loss": 0.339, + "step": 1970 + }, + { + "epoch": 0.1856762676338287, + "grad_norm": 0.9305042624473572, + "learning_rate": 1.9728743576467294e-05, + "loss": 0.3744, + "step": 1971 + }, + { + "epoch": 0.18577047172699654, + "grad_norm": 0.8268446922302246, + "learning_rate": 1.9728394146815573e-05, + "loss": 0.3602, + "step": 1972 + }, + { + "epoch": 0.1858646758201644, + "grad_norm": 0.8215991258621216, + "learning_rate": 1.972804449534142e-05, + "loss": 0.3533, + "step": 1973 + }, + { + "epoch": 0.18595887991333224, + "grad_norm": 0.9372973442077637, + "learning_rate": 1.9727694622052805e-05, + "loss": 0.4443, + "step": 1974 + }, + { + "epoch": 0.18605308400650009, + "grad_norm": 0.8777764439582825, + "learning_rate": 1.9727344526957713e-05, + "loss": 0.3648, + "step": 1975 + }, + { + "epoch": 0.18614728809966793, + "grad_norm": 0.9062953591346741, + "learning_rate": 1.972699421006412e-05, + "loss": 0.3801, + "step": 1976 + }, + { + "epoch": 0.18624149219283578, + "grad_norm": 0.8242834210395813, + "learning_rate": 1.9726643671380014e-05, + "loss": 0.3477, + "step": 1977 + }, + { + "epoch": 0.18633569628600363, + "grad_norm": 0.9338749051094055, + "learning_rate": 1.9726292910913393e-05, + "loss": 0.3845, + "step": 1978 + }, + { + "epoch": 0.18642990037917148, + "grad_norm": 0.8638278841972351, + "learning_rate": 1.972594192867225e-05, + "loss": 0.3864, + "step": 1979 + }, + { + "epoch": 0.18652410447233933, + "grad_norm": 0.9354991316795349, + "learning_rate": 1.9725590724664587e-05, + "loss": 0.3864, + "step": 1980 + }, + { + "epoch": 0.18661830856550718, + "grad_norm": 0.7907112836837769, + "learning_rate": 1.972523929889842e-05, + "loss": 0.3381, + "step": 1981 + }, + { + "epoch": 0.18671251265867503, + "grad_norm": 0.8652738928794861, + "learning_rate": 1.9724887651381756e-05, + "loss": 0.3674, + "step": 1982 + }, + { + "epoch": 0.18680671675184288, + "grad_norm": 0.9682350754737854, + "learning_rate": 1.972453578212261e-05, + "loss": 0.3665, + "step": 1983 + }, + { + "epoch": 0.18690092084501073, + "grad_norm": 0.9161863327026367, + "learning_rate": 1.972418369112901e-05, + "loss": 0.397, + "step": 1984 + }, + { + "epoch": 0.18699512493817858, + "grad_norm": 0.8780052661895752, + "learning_rate": 1.972383137840898e-05, + "loss": 0.4238, + "step": 1985 + }, + { + "epoch": 0.18708932903134642, + "grad_norm": 0.752722978591919, + "learning_rate": 1.9723478843970562e-05, + "loss": 0.3206, + "step": 1986 + }, + { + "epoch": 0.18718353312451427, + "grad_norm": 0.8473148345947266, + "learning_rate": 1.9723126087821788e-05, + "loss": 0.3449, + "step": 1987 + }, + { + "epoch": 0.18727773721768212, + "grad_norm": 0.9261614084243774, + "learning_rate": 1.97227731099707e-05, + "loss": 0.4142, + "step": 1988 + }, + { + "epoch": 0.18737194131084994, + "grad_norm": 1.017713189125061, + "learning_rate": 1.972241991042535e-05, + "loss": 0.4077, + "step": 1989 + }, + { + "epoch": 0.1874661454040178, + "grad_norm": 0.8296705484390259, + "learning_rate": 1.9722066489193788e-05, + "loss": 0.4005, + "step": 1990 + }, + { + "epoch": 0.18756034949718564, + "grad_norm": 0.8227131366729736, + "learning_rate": 1.9721712846284076e-05, + "loss": 0.4067, + "step": 1991 + }, + { + "epoch": 0.1876545535903535, + "grad_norm": 0.7832923531532288, + "learning_rate": 1.9721358981704276e-05, + "loss": 0.3092, + "step": 1992 + }, + { + "epoch": 0.18774875768352134, + "grad_norm": 0.8305931091308594, + "learning_rate": 1.9721004895462457e-05, + "loss": 0.3452, + "step": 1993 + }, + { + "epoch": 0.1878429617766892, + "grad_norm": 0.8937137722969055, + "learning_rate": 1.9720650587566693e-05, + "loss": 0.3766, + "step": 1994 + }, + { + "epoch": 0.18793716586985704, + "grad_norm": 0.855707049369812, + "learning_rate": 1.972029605802506e-05, + "loss": 0.3421, + "step": 1995 + }, + { + "epoch": 0.18803136996302489, + "grad_norm": 0.8995879292488098, + "learning_rate": 1.9719941306845647e-05, + "loss": 0.3777, + "step": 1996 + }, + { + "epoch": 0.18812557405619273, + "grad_norm": 0.7978242039680481, + "learning_rate": 1.971958633403654e-05, + "loss": 0.3401, + "step": 1997 + }, + { + "epoch": 0.18821977814936058, + "grad_norm": 0.8712006211280823, + "learning_rate": 1.9719231139605833e-05, + "loss": 0.4002, + "step": 1998 + }, + { + "epoch": 0.18831398224252843, + "grad_norm": 0.7887129187583923, + "learning_rate": 1.9718875723561622e-05, + "loss": 0.3447, + "step": 1999 + }, + { + "epoch": 0.18840818633569628, + "grad_norm": 0.7492456436157227, + "learning_rate": 1.9718520085912017e-05, + "loss": 0.3674, + "step": 2000 + }, + { + "epoch": 0.18850239042886413, + "grad_norm": 0.8848216533660889, + "learning_rate": 1.971816422666512e-05, + "loss": 0.4458, + "step": 2001 + }, + { + "epoch": 0.18859659452203198, + "grad_norm": 1.1236287355422974, + "learning_rate": 1.9717808145829056e-05, + "loss": 0.3481, + "step": 2002 + }, + { + "epoch": 0.18869079861519983, + "grad_norm": 0.8653777241706848, + "learning_rate": 1.9717451843411934e-05, + "loss": 0.3864, + "step": 2003 + }, + { + "epoch": 0.18878500270836768, + "grad_norm": 0.887043833732605, + "learning_rate": 1.971709531942188e-05, + "loss": 0.3613, + "step": 2004 + }, + { + "epoch": 0.18887920680153553, + "grad_norm": 0.816224217414856, + "learning_rate": 1.9716738573867025e-05, + "loss": 0.3732, + "step": 2005 + }, + { + "epoch": 0.18897341089470338, + "grad_norm": 0.8825797438621521, + "learning_rate": 1.9716381606755502e-05, + "loss": 0.3687, + "step": 2006 + }, + { + "epoch": 0.18906761498787122, + "grad_norm": 0.8316991925239563, + "learning_rate": 1.9716024418095457e-05, + "loss": 0.4058, + "step": 2007 + }, + { + "epoch": 0.18916181908103907, + "grad_norm": 0.9464976787567139, + "learning_rate": 1.9715667007895026e-05, + "loss": 0.4656, + "step": 2008 + }, + { + "epoch": 0.18925602317420692, + "grad_norm": 0.8076721429824829, + "learning_rate": 1.971530937616236e-05, + "loss": 0.3652, + "step": 2009 + }, + { + "epoch": 0.18935022726737477, + "grad_norm": 0.9603601098060608, + "learning_rate": 1.9714951522905618e-05, + "loss": 0.4232, + "step": 2010 + }, + { + "epoch": 0.18944443136054262, + "grad_norm": 0.8963274359703064, + "learning_rate": 1.9714593448132955e-05, + "loss": 0.3481, + "step": 2011 + }, + { + "epoch": 0.18953863545371047, + "grad_norm": 0.7713266611099243, + "learning_rate": 1.9714235151852537e-05, + "loss": 0.3707, + "step": 2012 + }, + { + "epoch": 0.18963283954687832, + "grad_norm": 0.9094200134277344, + "learning_rate": 1.971387663407254e-05, + "loss": 0.4499, + "step": 2013 + }, + { + "epoch": 0.18972704364004617, + "grad_norm": 0.9872961044311523, + "learning_rate": 1.971351789480113e-05, + "loss": 0.4114, + "step": 2014 + }, + { + "epoch": 0.18982124773321402, + "grad_norm": 0.9223184585571289, + "learning_rate": 1.9713158934046485e-05, + "loss": 0.4124, + "step": 2015 + }, + { + "epoch": 0.18991545182638186, + "grad_norm": 0.8363121151924133, + "learning_rate": 1.9712799751816797e-05, + "loss": 0.3427, + "step": 2016 + }, + { + "epoch": 0.1900096559195497, + "grad_norm": 0.9477868676185608, + "learning_rate": 1.9712440348120256e-05, + "loss": 0.444, + "step": 2017 + }, + { + "epoch": 0.19010386001271756, + "grad_norm": 0.892019510269165, + "learning_rate": 1.9712080722965052e-05, + "loss": 0.359, + "step": 2018 + }, + { + "epoch": 0.1901980641058854, + "grad_norm": 0.857984721660614, + "learning_rate": 1.9711720876359387e-05, + "loss": 0.3233, + "step": 2019 + }, + { + "epoch": 0.19029226819905326, + "grad_norm": 0.9077023863792419, + "learning_rate": 1.971136080831147e-05, + "loss": 0.4238, + "step": 2020 + }, + { + "epoch": 0.1903864722922211, + "grad_norm": 0.9694737792015076, + "learning_rate": 1.9711000518829505e-05, + "loss": 0.4655, + "step": 2021 + }, + { + "epoch": 0.19048067638538896, + "grad_norm": 1.1711987257003784, + "learning_rate": 1.971064000792171e-05, + "loss": 0.3534, + "step": 2022 + }, + { + "epoch": 0.19057488047855678, + "grad_norm": 0.8994882106781006, + "learning_rate": 1.9710279275596307e-05, + "loss": 0.4086, + "step": 2023 + }, + { + "epoch": 0.19066908457172463, + "grad_norm": 0.9420601725578308, + "learning_rate": 1.9709918321861517e-05, + "loss": 0.349, + "step": 2024 + }, + { + "epoch": 0.19076328866489248, + "grad_norm": 0.8988878726959229, + "learning_rate": 1.9709557146725572e-05, + "loss": 0.3777, + "step": 2025 + }, + { + "epoch": 0.19085749275806033, + "grad_norm": 1.0279639959335327, + "learning_rate": 1.970919575019671e-05, + "loss": 0.44, + "step": 2026 + }, + { + "epoch": 0.19095169685122818, + "grad_norm": 0.9482018947601318, + "learning_rate": 1.970883413228317e-05, + "loss": 0.3635, + "step": 2027 + }, + { + "epoch": 0.19104590094439602, + "grad_norm": 0.8100656867027283, + "learning_rate": 1.9708472292993195e-05, + "loss": 0.3353, + "step": 2028 + }, + { + "epoch": 0.19114010503756387, + "grad_norm": 0.8380219340324402, + "learning_rate": 1.9708110232335035e-05, + "loss": 0.3713, + "step": 2029 + }, + { + "epoch": 0.19123430913073172, + "grad_norm": 0.8911309838294983, + "learning_rate": 1.9707747950316953e-05, + "loss": 0.4015, + "step": 2030 + }, + { + "epoch": 0.19132851322389957, + "grad_norm": 0.8748728036880493, + "learning_rate": 1.97073854469472e-05, + "loss": 0.4052, + "step": 2031 + }, + { + "epoch": 0.19142271731706742, + "grad_norm": 0.8939458727836609, + "learning_rate": 1.970702272223405e-05, + "loss": 0.3748, + "step": 2032 + }, + { + "epoch": 0.19151692141023527, + "grad_norm": 0.9449240565299988, + "learning_rate": 1.9706659776185767e-05, + "loss": 0.3506, + "step": 2033 + }, + { + "epoch": 0.19161112550340312, + "grad_norm": 0.7787311673164368, + "learning_rate": 1.970629660881063e-05, + "loss": 0.337, + "step": 2034 + }, + { + "epoch": 0.19170532959657097, + "grad_norm": 0.9287323951721191, + "learning_rate": 1.9705933220116918e-05, + "loss": 0.4118, + "step": 2035 + }, + { + "epoch": 0.19179953368973882, + "grad_norm": 1.080183744430542, + "learning_rate": 1.970556961011292e-05, + "loss": 0.451, + "step": 2036 + }, + { + "epoch": 0.19189373778290666, + "grad_norm": 0.8874815106391907, + "learning_rate": 1.9705205778806926e-05, + "loss": 0.3886, + "step": 2037 + }, + { + "epoch": 0.1919879418760745, + "grad_norm": 0.9046630263328552, + "learning_rate": 1.9704841726207228e-05, + "loss": 0.3824, + "step": 2038 + }, + { + "epoch": 0.19208214596924236, + "grad_norm": 0.8540805578231812, + "learning_rate": 1.970447745232213e-05, + "loss": 0.4134, + "step": 2039 + }, + { + "epoch": 0.1921763500624102, + "grad_norm": 0.7967076897621155, + "learning_rate": 1.970411295715994e-05, + "loss": 0.3744, + "step": 2040 + }, + { + "epoch": 0.19227055415557806, + "grad_norm": 1.020021677017212, + "learning_rate": 1.970374824072897e-05, + "loss": 0.4387, + "step": 2041 + }, + { + "epoch": 0.1923647582487459, + "grad_norm": 0.954599142074585, + "learning_rate": 1.9703383303037525e-05, + "loss": 0.3685, + "step": 2042 + }, + { + "epoch": 0.19245896234191376, + "grad_norm": 0.913081169128418, + "learning_rate": 1.970301814409394e-05, + "loss": 0.3236, + "step": 2043 + }, + { + "epoch": 0.1925531664350816, + "grad_norm": 1.0290967226028442, + "learning_rate": 1.9702652763906532e-05, + "loss": 0.3609, + "step": 2044 + }, + { + "epoch": 0.19264737052824946, + "grad_norm": 0.9929534196853638, + "learning_rate": 1.9702287162483634e-05, + "loss": 0.4037, + "step": 2045 + }, + { + "epoch": 0.1927415746214173, + "grad_norm": 0.9307869076728821, + "learning_rate": 1.970192133983359e-05, + "loss": 0.39, + "step": 2046 + }, + { + "epoch": 0.19283577871458515, + "grad_norm": 0.7979583740234375, + "learning_rate": 1.970155529596473e-05, + "loss": 0.33, + "step": 2047 + }, + { + "epoch": 0.192929982807753, + "grad_norm": 0.9032715559005737, + "learning_rate": 1.9701189030885407e-05, + "loss": 0.4202, + "step": 2048 + }, + { + "epoch": 0.19302418690092085, + "grad_norm": 0.927097737789154, + "learning_rate": 1.970082254460397e-05, + "loss": 0.3686, + "step": 2049 + }, + { + "epoch": 0.1931183909940887, + "grad_norm": 0.9695465564727783, + "learning_rate": 1.970045583712878e-05, + "loss": 0.3907, + "step": 2050 + }, + { + "epoch": 0.19321259508725655, + "grad_norm": 0.9572463035583496, + "learning_rate": 1.970008890846819e-05, + "loss": 0.384, + "step": 2051 + }, + { + "epoch": 0.1933067991804244, + "grad_norm": 0.8238348364830017, + "learning_rate": 1.9699721758630573e-05, + "loss": 0.3952, + "step": 2052 + }, + { + "epoch": 0.19340100327359225, + "grad_norm": 0.9012622237205505, + "learning_rate": 1.96993543876243e-05, + "loss": 0.3955, + "step": 2053 + }, + { + "epoch": 0.1934952073667601, + "grad_norm": 0.9931719899177551, + "learning_rate": 1.969898679545775e-05, + "loss": 0.3954, + "step": 2054 + }, + { + "epoch": 0.19358941145992795, + "grad_norm": 0.8941454887390137, + "learning_rate": 1.9698618982139294e-05, + "loss": 0.3701, + "step": 2055 + }, + { + "epoch": 0.1936836155530958, + "grad_norm": 0.9120371341705322, + "learning_rate": 1.969825094767733e-05, + "loss": 0.4075, + "step": 2056 + }, + { + "epoch": 0.19377781964626364, + "grad_norm": 0.8839127421379089, + "learning_rate": 1.9697882692080247e-05, + "loss": 0.3868, + "step": 2057 + }, + { + "epoch": 0.19387202373943146, + "grad_norm": 0.9981152415275574, + "learning_rate": 1.969751421535644e-05, + "loss": 0.3779, + "step": 2058 + }, + { + "epoch": 0.19396622783259931, + "grad_norm": 0.931559681892395, + "learning_rate": 1.969714551751431e-05, + "loss": 0.3703, + "step": 2059 + }, + { + "epoch": 0.19406043192576716, + "grad_norm": 0.9200083017349243, + "learning_rate": 1.9696776598562265e-05, + "loss": 0.3865, + "step": 2060 + }, + { + "epoch": 0.194154636018935, + "grad_norm": 0.8785022497177124, + "learning_rate": 1.969640745850872e-05, + "loss": 0.3803, + "step": 2061 + }, + { + "epoch": 0.19424884011210286, + "grad_norm": 0.874745786190033, + "learning_rate": 1.9696038097362083e-05, + "loss": 0.3634, + "step": 2062 + }, + { + "epoch": 0.1943430442052707, + "grad_norm": 0.8995983600616455, + "learning_rate": 1.969566851513079e-05, + "loss": 0.3825, + "step": 2063 + }, + { + "epoch": 0.19443724829843856, + "grad_norm": 0.8460052013397217, + "learning_rate": 1.9695298711823255e-05, + "loss": 0.3687, + "step": 2064 + }, + { + "epoch": 0.1945314523916064, + "grad_norm": 0.862362265586853, + "learning_rate": 1.9694928687447915e-05, + "loss": 0.4034, + "step": 2065 + }, + { + "epoch": 0.19462565648477426, + "grad_norm": 0.8720821142196655, + "learning_rate": 1.969455844201321e-05, + "loss": 0.3996, + "step": 2066 + }, + { + "epoch": 0.1947198605779421, + "grad_norm": 0.7716377973556519, + "learning_rate": 1.9694187975527577e-05, + "loss": 0.396, + "step": 2067 + }, + { + "epoch": 0.19481406467110995, + "grad_norm": 0.8582832217216492, + "learning_rate": 1.969381728799947e-05, + "loss": 0.3707, + "step": 2068 + }, + { + "epoch": 0.1949082687642778, + "grad_norm": 0.8442918062210083, + "learning_rate": 1.9693446379437336e-05, + "loss": 0.3937, + "step": 2069 + }, + { + "epoch": 0.19500247285744565, + "grad_norm": 0.8860918879508972, + "learning_rate": 1.9693075249849628e-05, + "loss": 0.3463, + "step": 2070 + }, + { + "epoch": 0.1950966769506135, + "grad_norm": 0.9189453721046448, + "learning_rate": 1.969270389924482e-05, + "loss": 0.3964, + "step": 2071 + }, + { + "epoch": 0.19519088104378135, + "grad_norm": 0.8853801488876343, + "learning_rate": 1.969233232763137e-05, + "loss": 0.3894, + "step": 2072 + }, + { + "epoch": 0.1952850851369492, + "grad_norm": 0.8096766471862793, + "learning_rate": 1.9691960535017754e-05, + "loss": 0.329, + "step": 2073 + }, + { + "epoch": 0.19537928923011705, + "grad_norm": 0.9416137337684631, + "learning_rate": 1.969158852141245e-05, + "loss": 0.4208, + "step": 2074 + }, + { + "epoch": 0.1954734933232849, + "grad_norm": 0.9120296239852905, + "learning_rate": 1.969121628682394e-05, + "loss": 0.3593, + "step": 2075 + }, + { + "epoch": 0.19556769741645275, + "grad_norm": 0.9446659684181213, + "learning_rate": 1.9690843831260705e-05, + "loss": 0.4148, + "step": 2076 + }, + { + "epoch": 0.1956619015096206, + "grad_norm": 0.8542370796203613, + "learning_rate": 1.969047115473125e-05, + "loss": 0.3482, + "step": 2077 + }, + { + "epoch": 0.19575610560278844, + "grad_norm": 0.9640230536460876, + "learning_rate": 1.9690098257244063e-05, + "loss": 0.3716, + "step": 2078 + }, + { + "epoch": 0.1958503096959563, + "grad_norm": 0.9649361371994019, + "learning_rate": 1.968972513880765e-05, + "loss": 0.4225, + "step": 2079 + }, + { + "epoch": 0.19594451378912414, + "grad_norm": 0.8834923505783081, + "learning_rate": 1.968935179943052e-05, + "loss": 0.3627, + "step": 2080 + }, + { + "epoch": 0.196038717882292, + "grad_norm": 0.9990054368972778, + "learning_rate": 1.9688978239121183e-05, + "loss": 0.4168, + "step": 2081 + }, + { + "epoch": 0.19613292197545984, + "grad_norm": 0.8540651202201843, + "learning_rate": 1.9688604457888157e-05, + "loss": 0.3607, + "step": 2082 + }, + { + "epoch": 0.1962271260686277, + "grad_norm": 0.8560302257537842, + "learning_rate": 1.9688230455739966e-05, + "loss": 0.391, + "step": 2083 + }, + { + "epoch": 0.19632133016179554, + "grad_norm": 0.8272385597229004, + "learning_rate": 1.968785623268514e-05, + "loss": 0.3915, + "step": 2084 + }, + { + "epoch": 0.1964155342549634, + "grad_norm": 0.9685487151145935, + "learning_rate": 1.9687481788732207e-05, + "loss": 0.3943, + "step": 2085 + }, + { + "epoch": 0.19650973834813124, + "grad_norm": 0.8381835222244263, + "learning_rate": 1.9687107123889708e-05, + "loss": 0.3361, + "step": 2086 + }, + { + "epoch": 0.19660394244129908, + "grad_norm": 0.8928012847900391, + "learning_rate": 1.9686732238166183e-05, + "loss": 0.3813, + "step": 2087 + }, + { + "epoch": 0.19669814653446693, + "grad_norm": 0.9864948987960815, + "learning_rate": 1.9686357131570184e-05, + "loss": 0.3955, + "step": 2088 + }, + { + "epoch": 0.19679235062763478, + "grad_norm": 0.81718909740448, + "learning_rate": 1.9685981804110263e-05, + "loss": 0.3256, + "step": 2089 + }, + { + "epoch": 0.19688655472080263, + "grad_norm": 0.8038667440414429, + "learning_rate": 1.9685606255794978e-05, + "loss": 0.3451, + "step": 2090 + }, + { + "epoch": 0.19698075881397048, + "grad_norm": 0.8458711504936218, + "learning_rate": 1.9685230486632888e-05, + "loss": 0.3539, + "step": 2091 + }, + { + "epoch": 0.1970749629071383, + "grad_norm": 0.733620822429657, + "learning_rate": 1.9684854496632567e-05, + "loss": 0.3123, + "step": 2092 + }, + { + "epoch": 0.19716916700030615, + "grad_norm": 0.8875676989555359, + "learning_rate": 1.9684478285802585e-05, + "loss": 0.3824, + "step": 2093 + }, + { + "epoch": 0.197263371093474, + "grad_norm": 0.895675778388977, + "learning_rate": 1.9684101854151517e-05, + "loss": 0.3813, + "step": 2094 + }, + { + "epoch": 0.19735757518664185, + "grad_norm": 0.7959586381912231, + "learning_rate": 1.9683725201687955e-05, + "loss": 0.38, + "step": 2095 + }, + { + "epoch": 0.1974517792798097, + "grad_norm": 0.9296727180480957, + "learning_rate": 1.968334832842048e-05, + "loss": 0.3732, + "step": 2096 + }, + { + "epoch": 0.19754598337297755, + "grad_norm": 1.003315806388855, + "learning_rate": 1.9682971234357688e-05, + "loss": 0.4405, + "step": 2097 + }, + { + "epoch": 0.1976401874661454, + "grad_norm": 0.919394314289093, + "learning_rate": 1.9682593919508177e-05, + "loss": 0.3746, + "step": 2098 + }, + { + "epoch": 0.19773439155931324, + "grad_norm": 0.9472411870956421, + "learning_rate": 1.968221638388055e-05, + "loss": 0.3974, + "step": 2099 + }, + { + "epoch": 0.1978285956524811, + "grad_norm": 0.9083194136619568, + "learning_rate": 1.9681838627483416e-05, + "loss": 0.3683, + "step": 2100 + }, + { + "epoch": 0.19792279974564894, + "grad_norm": 0.9355015158653259, + "learning_rate": 1.9681460650325387e-05, + "loss": 0.4127, + "step": 2101 + }, + { + "epoch": 0.1980170038388168, + "grad_norm": 0.9019935131072998, + "learning_rate": 1.9681082452415084e-05, + "loss": 0.4031, + "step": 2102 + }, + { + "epoch": 0.19811120793198464, + "grad_norm": 0.8776118755340576, + "learning_rate": 1.9680704033761128e-05, + "loss": 0.4174, + "step": 2103 + }, + { + "epoch": 0.1982054120251525, + "grad_norm": 0.881554901599884, + "learning_rate": 1.968032539437215e-05, + "loss": 0.3819, + "step": 2104 + }, + { + "epoch": 0.19829961611832034, + "grad_norm": 0.9217504262924194, + "learning_rate": 1.967994653425678e-05, + "loss": 0.4157, + "step": 2105 + }, + { + "epoch": 0.1983938202114882, + "grad_norm": 0.9133352041244507, + "learning_rate": 1.967956745342366e-05, + "loss": 0.3631, + "step": 2106 + }, + { + "epoch": 0.19848802430465604, + "grad_norm": 0.8852596879005432, + "learning_rate": 1.967918815188143e-05, + "loss": 0.4074, + "step": 2107 + }, + { + "epoch": 0.19858222839782388, + "grad_norm": 0.8754292130470276, + "learning_rate": 1.9678808629638744e-05, + "loss": 0.3917, + "step": 2108 + }, + { + "epoch": 0.19867643249099173, + "grad_norm": 1.0540508031845093, + "learning_rate": 1.967842888670425e-05, + "loss": 0.4664, + "step": 2109 + }, + { + "epoch": 0.19877063658415958, + "grad_norm": 0.8133193254470825, + "learning_rate": 1.9678048923086614e-05, + "loss": 0.3705, + "step": 2110 + }, + { + "epoch": 0.19886484067732743, + "grad_norm": 0.9278148412704468, + "learning_rate": 1.9677668738794492e-05, + "loss": 0.3982, + "step": 2111 + }, + { + "epoch": 0.19895904477049528, + "grad_norm": 0.9273229241371155, + "learning_rate": 1.9677288333836555e-05, + "loss": 0.3641, + "step": 2112 + }, + { + "epoch": 0.19905324886366313, + "grad_norm": 0.8346200585365295, + "learning_rate": 1.9676907708221476e-05, + "loss": 0.329, + "step": 2113 + }, + { + "epoch": 0.19914745295683098, + "grad_norm": 0.8903515934944153, + "learning_rate": 1.9676526861957944e-05, + "loss": 0.3383, + "step": 2114 + }, + { + "epoch": 0.19924165704999883, + "grad_norm": 0.9228113293647766, + "learning_rate": 1.9676145795054627e-05, + "loss": 0.3719, + "step": 2115 + }, + { + "epoch": 0.19933586114316668, + "grad_norm": 0.9408248066902161, + "learning_rate": 1.9675764507520225e-05, + "loss": 0.3956, + "step": 2116 + }, + { + "epoch": 0.19943006523633452, + "grad_norm": 0.8687821626663208, + "learning_rate": 1.9675382999363422e-05, + "loss": 0.3872, + "step": 2117 + }, + { + "epoch": 0.19952426932950237, + "grad_norm": 1.0548046827316284, + "learning_rate": 1.9675001270592932e-05, + "loss": 0.3836, + "step": 2118 + }, + { + "epoch": 0.19961847342267022, + "grad_norm": 0.743798017501831, + "learning_rate": 1.9674619321217447e-05, + "loss": 0.3141, + "step": 2119 + }, + { + "epoch": 0.19971267751583807, + "grad_norm": 0.8419427275657654, + "learning_rate": 1.9674237151245678e-05, + "loss": 0.3377, + "step": 2120 + }, + { + "epoch": 0.19980688160900592, + "grad_norm": 0.9172351360321045, + "learning_rate": 1.9673854760686343e-05, + "loss": 0.4094, + "step": 2121 + }, + { + "epoch": 0.19990108570217377, + "grad_norm": 0.8844809532165527, + "learning_rate": 1.9673472149548153e-05, + "loss": 0.3762, + "step": 2122 + }, + { + "epoch": 0.19999528979534162, + "grad_norm": 0.9465714693069458, + "learning_rate": 1.9673089317839843e-05, + "loss": 0.3829, + "step": 2123 + }, + { + "epoch": 0.20008949388850947, + "grad_norm": 0.7922621369361877, + "learning_rate": 1.9672706265570137e-05, + "loss": 0.3652, + "step": 2124 + }, + { + "epoch": 0.20018369798167732, + "grad_norm": 0.8763195872306824, + "learning_rate": 1.9672322992747766e-05, + "loss": 0.4096, + "step": 2125 + }, + { + "epoch": 0.20027790207484517, + "grad_norm": 0.8922156095504761, + "learning_rate": 1.9671939499381475e-05, + "loss": 0.4103, + "step": 2126 + }, + { + "epoch": 0.200372106168013, + "grad_norm": 0.8136999607086182, + "learning_rate": 1.967155578548e-05, + "loss": 0.3619, + "step": 2127 + }, + { + "epoch": 0.20046631026118084, + "grad_norm": 0.8069179654121399, + "learning_rate": 1.96711718510521e-05, + "loss": 0.3899, + "step": 2128 + }, + { + "epoch": 0.20056051435434868, + "grad_norm": 0.8518467545509338, + "learning_rate": 1.9670787696106525e-05, + "loss": 0.3772, + "step": 2129 + }, + { + "epoch": 0.20065471844751653, + "grad_norm": 0.8809990286827087, + "learning_rate": 1.967040332065204e-05, + "loss": 0.4185, + "step": 2130 + }, + { + "epoch": 0.20074892254068438, + "grad_norm": 0.6934998631477356, + "learning_rate": 1.9670018724697394e-05, + "loss": 0.3029, + "step": 2131 + }, + { + "epoch": 0.20084312663385223, + "grad_norm": 0.7762834429740906, + "learning_rate": 1.9669633908251372e-05, + "loss": 0.3634, + "step": 2132 + }, + { + "epoch": 0.20093733072702008, + "grad_norm": 0.9136695265769958, + "learning_rate": 1.9669248871322738e-05, + "loss": 0.395, + "step": 2133 + }, + { + "epoch": 0.20103153482018793, + "grad_norm": 1.4183223247528076, + "learning_rate": 1.966886361392028e-05, + "loss": 0.3731, + "step": 2134 + }, + { + "epoch": 0.20112573891335578, + "grad_norm": 1.0961859226226807, + "learning_rate": 1.9668478136052776e-05, + "loss": 0.3798, + "step": 2135 + }, + { + "epoch": 0.20121994300652363, + "grad_norm": 0.8356830477714539, + "learning_rate": 1.966809243772902e-05, + "loss": 0.3971, + "step": 2136 + }, + { + "epoch": 0.20131414709969148, + "grad_norm": 0.8490124940872192, + "learning_rate": 1.9667706518957803e-05, + "loss": 0.3835, + "step": 2137 + }, + { + "epoch": 0.20140835119285933, + "grad_norm": 0.8329681158065796, + "learning_rate": 1.966732037974793e-05, + "loss": 0.3249, + "step": 2138 + }, + { + "epoch": 0.20150255528602717, + "grad_norm": 0.8552191257476807, + "learning_rate": 1.9666934020108192e-05, + "loss": 0.3649, + "step": 2139 + }, + { + "epoch": 0.20159675937919502, + "grad_norm": 0.9588862061500549, + "learning_rate": 1.9666547440047417e-05, + "loss": 0.33, + "step": 2140 + }, + { + "epoch": 0.20169096347236287, + "grad_norm": 0.8517509698867798, + "learning_rate": 1.9666160639574404e-05, + "loss": 0.3459, + "step": 2141 + }, + { + "epoch": 0.20178516756553072, + "grad_norm": 0.8158048987388611, + "learning_rate": 1.9665773618697988e-05, + "loss": 0.338, + "step": 2142 + }, + { + "epoch": 0.20187937165869857, + "grad_norm": 0.8685369491577148, + "learning_rate": 1.966538637742698e-05, + "loss": 0.3572, + "step": 2143 + }, + { + "epoch": 0.20197357575186642, + "grad_norm": 1.0335828065872192, + "learning_rate": 1.9664998915770213e-05, + "loss": 0.3803, + "step": 2144 + }, + { + "epoch": 0.20206777984503427, + "grad_norm": 0.8579703569412231, + "learning_rate": 1.9664611233736527e-05, + "loss": 0.3764, + "step": 2145 + }, + { + "epoch": 0.20216198393820212, + "grad_norm": 0.8927084803581238, + "learning_rate": 1.966422333133476e-05, + "loss": 0.3703, + "step": 2146 + }, + { + "epoch": 0.20225618803136997, + "grad_norm": 1.055751919746399, + "learning_rate": 1.9663835208573747e-05, + "loss": 0.3258, + "step": 2147 + }, + { + "epoch": 0.20235039212453781, + "grad_norm": 1.0749523639678955, + "learning_rate": 1.966344686546235e-05, + "loss": 0.4551, + "step": 2148 + }, + { + "epoch": 0.20244459621770566, + "grad_norm": 0.8448135256767273, + "learning_rate": 1.966305830200942e-05, + "loss": 0.356, + "step": 2149 + }, + { + "epoch": 0.2025388003108735, + "grad_norm": 1.0148167610168457, + "learning_rate": 1.966266951822382e-05, + "loss": 0.3934, + "step": 2150 + }, + { + "epoch": 0.20263300440404136, + "grad_norm": 0.9345589280128479, + "learning_rate": 1.9662280514114408e-05, + "loss": 0.4071, + "step": 2151 + }, + { + "epoch": 0.2027272084972092, + "grad_norm": 0.7750939130783081, + "learning_rate": 1.9661891289690056e-05, + "loss": 0.3413, + "step": 2152 + }, + { + "epoch": 0.20282141259037706, + "grad_norm": 0.9436439275741577, + "learning_rate": 1.9661501844959642e-05, + "loss": 0.3759, + "step": 2153 + }, + { + "epoch": 0.2029156166835449, + "grad_norm": 0.8349786400794983, + "learning_rate": 1.966111217993204e-05, + "loss": 0.38, + "step": 2154 + }, + { + "epoch": 0.20300982077671276, + "grad_norm": 0.8802725672721863, + "learning_rate": 1.9660722294616148e-05, + "loss": 0.3607, + "step": 2155 + }, + { + "epoch": 0.2031040248698806, + "grad_norm": 0.8655793070793152, + "learning_rate": 1.966033218902084e-05, + "loss": 0.3965, + "step": 2156 + }, + { + "epoch": 0.20319822896304846, + "grad_norm": 0.9193881750106812, + "learning_rate": 1.965994186315502e-05, + "loss": 0.3983, + "step": 2157 + }, + { + "epoch": 0.2032924330562163, + "grad_norm": 0.9747195839881897, + "learning_rate": 1.9659551317027586e-05, + "loss": 0.4385, + "step": 2158 + }, + { + "epoch": 0.20338663714938415, + "grad_norm": 0.8492395281791687, + "learning_rate": 1.9659160550647446e-05, + "loss": 0.3729, + "step": 2159 + }, + { + "epoch": 0.203480841242552, + "grad_norm": 0.8659266829490662, + "learning_rate": 1.9658769564023502e-05, + "loss": 0.4089, + "step": 2160 + }, + { + "epoch": 0.20357504533571985, + "grad_norm": 0.8363007307052612, + "learning_rate": 1.9658378357164677e-05, + "loss": 0.4087, + "step": 2161 + }, + { + "epoch": 0.20366924942888767, + "grad_norm": 0.9424063563346863, + "learning_rate": 1.9657986930079888e-05, + "loss": 0.3999, + "step": 2162 + }, + { + "epoch": 0.20376345352205552, + "grad_norm": 0.8132457137107849, + "learning_rate": 1.9657595282778063e-05, + "loss": 0.3573, + "step": 2163 + }, + { + "epoch": 0.20385765761522337, + "grad_norm": 0.9497710466384888, + "learning_rate": 1.9657203415268128e-05, + "loss": 0.4127, + "step": 2164 + }, + { + "epoch": 0.20395186170839122, + "grad_norm": 0.8072819709777832, + "learning_rate": 1.965681132755902e-05, + "loss": 0.3615, + "step": 2165 + }, + { + "epoch": 0.20404606580155907, + "grad_norm": 0.9138504862785339, + "learning_rate": 1.965641901965968e-05, + "loss": 0.3613, + "step": 2166 + }, + { + "epoch": 0.20414026989472692, + "grad_norm": 1.0799670219421387, + "learning_rate": 1.965602649157905e-05, + "loss": 0.3837, + "step": 2167 + }, + { + "epoch": 0.20423447398789477, + "grad_norm": 0.8778116106987, + "learning_rate": 1.9655633743326084e-05, + "loss": 0.3871, + "step": 2168 + }, + { + "epoch": 0.20432867808106261, + "grad_norm": 0.9556635022163391, + "learning_rate": 1.9655240774909736e-05, + "loss": 0.3954, + "step": 2169 + }, + { + "epoch": 0.20442288217423046, + "grad_norm": 0.8863227963447571, + "learning_rate": 1.9654847586338966e-05, + "loss": 0.4123, + "step": 2170 + }, + { + "epoch": 0.2045170862673983, + "grad_norm": 0.805164098739624, + "learning_rate": 1.9654454177622742e-05, + "loss": 0.3697, + "step": 2171 + }, + { + "epoch": 0.20461129036056616, + "grad_norm": 0.9202094674110413, + "learning_rate": 1.965406054877003e-05, + "loss": 0.4277, + "step": 2172 + }, + { + "epoch": 0.204705494453734, + "grad_norm": 0.8749397397041321, + "learning_rate": 1.9653666699789807e-05, + "loss": 0.3708, + "step": 2173 + }, + { + "epoch": 0.20479969854690186, + "grad_norm": 0.8218052387237549, + "learning_rate": 1.9653272630691053e-05, + "loss": 0.3917, + "step": 2174 + }, + { + "epoch": 0.2048939026400697, + "grad_norm": 0.9710571765899658, + "learning_rate": 1.9652878341482755e-05, + "loss": 0.4102, + "step": 2175 + }, + { + "epoch": 0.20498810673323756, + "grad_norm": 0.915249228477478, + "learning_rate": 1.96524838321739e-05, + "loss": 0.395, + "step": 2176 + }, + { + "epoch": 0.2050823108264054, + "grad_norm": 1.1843119859695435, + "learning_rate": 1.9652089102773487e-05, + "loss": 0.3645, + "step": 2177 + }, + { + "epoch": 0.20517651491957326, + "grad_norm": 1.005880355834961, + "learning_rate": 1.9651694153290518e-05, + "loss": 0.4164, + "step": 2178 + }, + { + "epoch": 0.2052707190127411, + "grad_norm": 1.2592356204986572, + "learning_rate": 1.9651298983733993e-05, + "loss": 0.4019, + "step": 2179 + }, + { + "epoch": 0.20536492310590895, + "grad_norm": 0.9137241840362549, + "learning_rate": 1.9650903594112924e-05, + "loss": 0.3834, + "step": 2180 + }, + { + "epoch": 0.2054591271990768, + "grad_norm": 0.989930272102356, + "learning_rate": 1.9650507984436328e-05, + "loss": 0.3791, + "step": 2181 + }, + { + "epoch": 0.20555333129224465, + "grad_norm": 0.9751969575881958, + "learning_rate": 1.9650112154713227e-05, + "loss": 0.3326, + "step": 2182 + }, + { + "epoch": 0.2056475353854125, + "grad_norm": 0.9052295088768005, + "learning_rate": 1.9649716104952644e-05, + "loss": 0.4243, + "step": 2183 + }, + { + "epoch": 0.20574173947858035, + "grad_norm": 0.8224236965179443, + "learning_rate": 1.9649319835163614e-05, + "loss": 0.3847, + "step": 2184 + }, + { + "epoch": 0.2058359435717482, + "grad_norm": 0.9257561564445496, + "learning_rate": 1.964892334535516e-05, + "loss": 0.391, + "step": 2185 + }, + { + "epoch": 0.20593014766491605, + "grad_norm": 0.9604368209838867, + "learning_rate": 1.964852663553634e-05, + "loss": 0.4409, + "step": 2186 + }, + { + "epoch": 0.2060243517580839, + "grad_norm": 1.0182547569274902, + "learning_rate": 1.9648129705716188e-05, + "loss": 0.4224, + "step": 2187 + }, + { + "epoch": 0.20611855585125174, + "grad_norm": 0.7713980078697205, + "learning_rate": 1.9647732555903758e-05, + "loss": 0.3666, + "step": 2188 + }, + { + "epoch": 0.2062127599444196, + "grad_norm": 0.8546023368835449, + "learning_rate": 1.9647335186108104e-05, + "loss": 0.3935, + "step": 2189 + }, + { + "epoch": 0.20630696403758744, + "grad_norm": 1.0893975496292114, + "learning_rate": 1.9646937596338287e-05, + "loss": 0.4027, + "step": 2190 + }, + { + "epoch": 0.2064011681307553, + "grad_norm": 0.9164141416549683, + "learning_rate": 1.9646539786603376e-05, + "loss": 0.3996, + "step": 2191 + }, + { + "epoch": 0.20649537222392314, + "grad_norm": 1.1015797853469849, + "learning_rate": 1.9646141756912437e-05, + "loss": 0.3765, + "step": 2192 + }, + { + "epoch": 0.206589576317091, + "grad_norm": 1.004239797592163, + "learning_rate": 1.964574350727455e-05, + "loss": 0.3567, + "step": 2193 + }, + { + "epoch": 0.20668378041025884, + "grad_norm": 0.940284788608551, + "learning_rate": 1.964534503769879e-05, + "loss": 0.4211, + "step": 2194 + }, + { + "epoch": 0.2067779845034267, + "grad_norm": 1.0183979272842407, + "learning_rate": 1.964494634819425e-05, + "loss": 0.4444, + "step": 2195 + }, + { + "epoch": 0.2068721885965945, + "grad_norm": 0.8018262982368469, + "learning_rate": 1.9644547438770016e-05, + "loss": 0.3636, + "step": 2196 + }, + { + "epoch": 0.20696639268976236, + "grad_norm": 0.8908106088638306, + "learning_rate": 1.964414830943518e-05, + "loss": 0.3659, + "step": 2197 + }, + { + "epoch": 0.2070605967829302, + "grad_norm": 0.9282899498939514, + "learning_rate": 1.9643748960198857e-05, + "loss": 0.4189, + "step": 2198 + }, + { + "epoch": 0.20715480087609806, + "grad_norm": 1.0559660196304321, + "learning_rate": 1.9643349391070137e-05, + "loss": 0.3771, + "step": 2199 + }, + { + "epoch": 0.2072490049692659, + "grad_norm": 0.8157504200935364, + "learning_rate": 1.964294960205814e-05, + "loss": 0.369, + "step": 2200 + }, + { + "epoch": 0.20734320906243375, + "grad_norm": 0.779656171798706, + "learning_rate": 1.9642549593171977e-05, + "loss": 0.3366, + "step": 2201 + }, + { + "epoch": 0.2074374131556016, + "grad_norm": 0.8910467028617859, + "learning_rate": 1.964214936442077e-05, + "loss": 0.393, + "step": 2202 + }, + { + "epoch": 0.20753161724876945, + "grad_norm": 0.8166049718856812, + "learning_rate": 1.964174891581365e-05, + "loss": 0.3564, + "step": 2203 + }, + { + "epoch": 0.2076258213419373, + "grad_norm": 0.9754587411880493, + "learning_rate": 1.964134824735974e-05, + "loss": 0.4071, + "step": 2204 + }, + { + "epoch": 0.20772002543510515, + "grad_norm": 0.8878598809242249, + "learning_rate": 1.964094735906818e-05, + "loss": 0.3691, + "step": 2205 + }, + { + "epoch": 0.207814229528273, + "grad_norm": 0.8549135327339172, + "learning_rate": 1.964054625094811e-05, + "loss": 0.3834, + "step": 2206 + }, + { + "epoch": 0.20790843362144085, + "grad_norm": 0.9318049550056458, + "learning_rate": 1.9640144923008674e-05, + "loss": 0.3769, + "step": 2207 + }, + { + "epoch": 0.2080026377146087, + "grad_norm": 0.8580342531204224, + "learning_rate": 1.963974337525903e-05, + "loss": 0.3583, + "step": 2208 + }, + { + "epoch": 0.20809684180777654, + "grad_norm": 0.7783285975456238, + "learning_rate": 1.9639341607708324e-05, + "loss": 0.3304, + "step": 2209 + }, + { + "epoch": 0.2081910459009444, + "grad_norm": 0.8820192813873291, + "learning_rate": 1.9638939620365724e-05, + "loss": 0.3695, + "step": 2210 + }, + { + "epoch": 0.20828524999411224, + "grad_norm": 0.960730791091919, + "learning_rate": 1.9638537413240395e-05, + "loss": 0.4213, + "step": 2211 + }, + { + "epoch": 0.2083794540872801, + "grad_norm": 1.086413860321045, + "learning_rate": 1.9638134986341502e-05, + "loss": 0.3326, + "step": 2212 + }, + { + "epoch": 0.20847365818044794, + "grad_norm": 0.9690501689910889, + "learning_rate": 1.9637732339678234e-05, + "loss": 0.36, + "step": 2213 + }, + { + "epoch": 0.2085678622736158, + "grad_norm": 0.7819095253944397, + "learning_rate": 1.9637329473259752e-05, + "loss": 0.3757, + "step": 2214 + }, + { + "epoch": 0.20866206636678364, + "grad_norm": 0.8166189193725586, + "learning_rate": 1.9636926387095262e-05, + "loss": 0.3661, + "step": 2215 + }, + { + "epoch": 0.2087562704599515, + "grad_norm": 0.9223487377166748, + "learning_rate": 1.9636523081193946e-05, + "loss": 0.3384, + "step": 2216 + }, + { + "epoch": 0.20885047455311934, + "grad_norm": 0.8755592107772827, + "learning_rate": 1.9636119555565002e-05, + "loss": 0.3726, + "step": 2217 + }, + { + "epoch": 0.20894467864628719, + "grad_norm": 1.0222097635269165, + "learning_rate": 1.9635715810217623e-05, + "loss": 0.3686, + "step": 2218 + }, + { + "epoch": 0.20903888273945503, + "grad_norm": 0.8661210536956787, + "learning_rate": 1.9635311845161026e-05, + "loss": 0.3841, + "step": 2219 + }, + { + "epoch": 0.20913308683262288, + "grad_norm": 1.0370057821273804, + "learning_rate": 1.963490766040442e-05, + "loss": 0.4071, + "step": 2220 + }, + { + "epoch": 0.20922729092579073, + "grad_norm": 0.8432251214981079, + "learning_rate": 1.9634503255957014e-05, + "loss": 0.3458, + "step": 2221 + }, + { + "epoch": 0.20932149501895858, + "grad_norm": 1.3770086765289307, + "learning_rate": 1.9634098631828035e-05, + "loss": 0.3868, + "step": 2222 + }, + { + "epoch": 0.20941569911212643, + "grad_norm": 0.7160378694534302, + "learning_rate": 1.963369378802671e-05, + "loss": 0.302, + "step": 2223 + }, + { + "epoch": 0.20950990320529428, + "grad_norm": 1.0085020065307617, + "learning_rate": 1.9633288724562264e-05, + "loss": 0.354, + "step": 2224 + }, + { + "epoch": 0.20960410729846213, + "grad_norm": 0.8698961138725281, + "learning_rate": 1.963288344144394e-05, + "loss": 0.3967, + "step": 2225 + }, + { + "epoch": 0.20969831139162998, + "grad_norm": 0.8173272609710693, + "learning_rate": 1.9632477938680976e-05, + "loss": 0.3707, + "step": 2226 + }, + { + "epoch": 0.20979251548479783, + "grad_norm": 0.9659057855606079, + "learning_rate": 1.9632072216282617e-05, + "loss": 0.3718, + "step": 2227 + }, + { + "epoch": 0.20988671957796567, + "grad_norm": 0.9052210450172424, + "learning_rate": 1.9631666274258115e-05, + "loss": 0.3932, + "step": 2228 + }, + { + "epoch": 0.20998092367113352, + "grad_norm": 0.7987163066864014, + "learning_rate": 1.9631260112616727e-05, + "loss": 0.3623, + "step": 2229 + }, + { + "epoch": 0.21007512776430137, + "grad_norm": 0.8844078183174133, + "learning_rate": 1.9630853731367715e-05, + "loss": 0.4013, + "step": 2230 + }, + { + "epoch": 0.2101693318574692, + "grad_norm": 1.1359078884124756, + "learning_rate": 1.963044713052034e-05, + "loss": 0.3896, + "step": 2231 + }, + { + "epoch": 0.21026353595063704, + "grad_norm": 0.7967735528945923, + "learning_rate": 1.9630040310083877e-05, + "loss": 0.37, + "step": 2232 + }, + { + "epoch": 0.2103577400438049, + "grad_norm": 1.0041298866271973, + "learning_rate": 1.96296332700676e-05, + "loss": 0.4169, + "step": 2233 + }, + { + "epoch": 0.21045194413697274, + "grad_norm": 0.8157963752746582, + "learning_rate": 1.96292260104808e-05, + "loss": 0.384, + "step": 2234 + }, + { + "epoch": 0.2105461482301406, + "grad_norm": 0.8149320483207703, + "learning_rate": 1.9628818531332746e-05, + "loss": 0.3826, + "step": 2235 + }, + { + "epoch": 0.21064035232330844, + "grad_norm": 0.8771508932113647, + "learning_rate": 1.9628410832632744e-05, + "loss": 0.3834, + "step": 2236 + }, + { + "epoch": 0.2107345564164763, + "grad_norm": 0.849044919013977, + "learning_rate": 1.9628002914390083e-05, + "loss": 0.4438, + "step": 2237 + }, + { + "epoch": 0.21082876050964414, + "grad_norm": 0.8716703057289124, + "learning_rate": 1.9627594776614065e-05, + "loss": 0.4003, + "step": 2238 + }, + { + "epoch": 0.21092296460281199, + "grad_norm": 0.8728709816932678, + "learning_rate": 1.9627186419313997e-05, + "loss": 0.3574, + "step": 2239 + }, + { + "epoch": 0.21101716869597983, + "grad_norm": 0.781938374042511, + "learning_rate": 1.962677784249919e-05, + "loss": 0.337, + "step": 2240 + }, + { + "epoch": 0.21111137278914768, + "grad_norm": 0.8320380449295044, + "learning_rate": 1.962636904617896e-05, + "loss": 0.3323, + "step": 2241 + }, + { + "epoch": 0.21120557688231553, + "grad_norm": 0.8517211675643921, + "learning_rate": 1.962596003036263e-05, + "loss": 0.3537, + "step": 2242 + }, + { + "epoch": 0.21129978097548338, + "grad_norm": 0.9021350741386414, + "learning_rate": 1.962555079505952e-05, + "loss": 0.3711, + "step": 2243 + }, + { + "epoch": 0.21139398506865123, + "grad_norm": 0.8758695721626282, + "learning_rate": 1.962514134027897e-05, + "loss": 0.3948, + "step": 2244 + }, + { + "epoch": 0.21148818916181908, + "grad_norm": 0.9356995820999146, + "learning_rate": 1.9624731666030307e-05, + "loss": 0.4062, + "step": 2245 + }, + { + "epoch": 0.21158239325498693, + "grad_norm": 0.8839170336723328, + "learning_rate": 1.962432177232288e-05, + "loss": 0.3863, + "step": 2246 + }, + { + "epoch": 0.21167659734815478, + "grad_norm": 0.8100844621658325, + "learning_rate": 1.9623911659166034e-05, + "loss": 0.3789, + "step": 2247 + }, + { + "epoch": 0.21177080144132263, + "grad_norm": 0.8421288728713989, + "learning_rate": 1.9623501326569117e-05, + "loss": 0.3697, + "step": 2248 + }, + { + "epoch": 0.21186500553449047, + "grad_norm": 0.898219883441925, + "learning_rate": 1.962309077454149e-05, + "loss": 0.3782, + "step": 2249 + }, + { + "epoch": 0.21195920962765832, + "grad_norm": 0.8538417816162109, + "learning_rate": 1.9622680003092503e-05, + "loss": 0.3519, + "step": 2250 + }, + { + "epoch": 0.21205341372082617, + "grad_norm": 0.905387282371521, + "learning_rate": 1.9622269012231537e-05, + "loss": 0.394, + "step": 2251 + }, + { + "epoch": 0.21214761781399402, + "grad_norm": 0.9117529392242432, + "learning_rate": 1.9621857801967957e-05, + "loss": 0.3622, + "step": 2252 + }, + { + "epoch": 0.21224182190716187, + "grad_norm": 0.9556719064712524, + "learning_rate": 1.9621446372311134e-05, + "loss": 0.4214, + "step": 2253 + }, + { + "epoch": 0.21233602600032972, + "grad_norm": 0.8761550784111023, + "learning_rate": 1.9621034723270456e-05, + "loss": 0.3746, + "step": 2254 + }, + { + "epoch": 0.21243023009349757, + "grad_norm": 0.9756516814231873, + "learning_rate": 1.9620622854855307e-05, + "loss": 0.3567, + "step": 2255 + }, + { + "epoch": 0.21252443418666542, + "grad_norm": 0.8791412115097046, + "learning_rate": 1.9620210767075076e-05, + "loss": 0.3617, + "step": 2256 + }, + { + "epoch": 0.21261863827983327, + "grad_norm": 0.8773496747016907, + "learning_rate": 1.9619798459939164e-05, + "loss": 0.4002, + "step": 2257 + }, + { + "epoch": 0.21271284237300112, + "grad_norm": 0.8872103095054626, + "learning_rate": 1.9619385933456972e-05, + "loss": 0.4032, + "step": 2258 + }, + { + "epoch": 0.21280704646616896, + "grad_norm": 0.9003176093101501, + "learning_rate": 1.9618973187637902e-05, + "loss": 0.386, + "step": 2259 + }, + { + "epoch": 0.2129012505593368, + "grad_norm": 0.8247572183609009, + "learning_rate": 1.9618560222491367e-05, + "loss": 0.3983, + "step": 2260 + }, + { + "epoch": 0.21299545465250466, + "grad_norm": 0.8347664475440979, + "learning_rate": 1.961814703802678e-05, + "loss": 0.379, + "step": 2261 + }, + { + "epoch": 0.2130896587456725, + "grad_norm": 1.040482997894287, + "learning_rate": 1.9617733634253572e-05, + "loss": 0.3926, + "step": 2262 + }, + { + "epoch": 0.21318386283884036, + "grad_norm": 0.7695727348327637, + "learning_rate": 1.961732001118116e-05, + "loss": 0.3633, + "step": 2263 + }, + { + "epoch": 0.2132780669320082, + "grad_norm": 0.9341619610786438, + "learning_rate": 1.9616906168818977e-05, + "loss": 0.3716, + "step": 2264 + }, + { + "epoch": 0.21337227102517603, + "grad_norm": 0.8412464261054993, + "learning_rate": 1.9616492107176464e-05, + "loss": 0.3665, + "step": 2265 + }, + { + "epoch": 0.21346647511834388, + "grad_norm": 0.8501750826835632, + "learning_rate": 1.9616077826263056e-05, + "loss": 0.3644, + "step": 2266 + }, + { + "epoch": 0.21356067921151173, + "grad_norm": 0.8177802562713623, + "learning_rate": 1.9615663326088204e-05, + "loss": 0.358, + "step": 2267 + }, + { + "epoch": 0.21365488330467958, + "grad_norm": 1.0078129768371582, + "learning_rate": 1.9615248606661358e-05, + "loss": 0.4614, + "step": 2268 + }, + { + "epoch": 0.21374908739784743, + "grad_norm": 0.8450458645820618, + "learning_rate": 1.961483366799197e-05, + "loss": 0.3841, + "step": 2269 + }, + { + "epoch": 0.21384329149101527, + "grad_norm": 1.0928857326507568, + "learning_rate": 1.9614418510089504e-05, + "loss": 0.3636, + "step": 2270 + }, + { + "epoch": 0.21393749558418312, + "grad_norm": 0.7948547005653381, + "learning_rate": 1.961400313296343e-05, + "loss": 0.3818, + "step": 2271 + }, + { + "epoch": 0.21403169967735097, + "grad_norm": 0.9856407046318054, + "learning_rate": 1.9613587536623218e-05, + "loss": 0.414, + "step": 2272 + }, + { + "epoch": 0.21412590377051882, + "grad_norm": 0.9461509585380554, + "learning_rate": 1.9613171721078336e-05, + "loss": 0.3476, + "step": 2273 + }, + { + "epoch": 0.21422010786368667, + "grad_norm": 0.8441481590270996, + "learning_rate": 1.9612755686338278e-05, + "loss": 0.4129, + "step": 2274 + }, + { + "epoch": 0.21431431195685452, + "grad_norm": 0.9468504190444946, + "learning_rate": 1.961233943241252e-05, + "loss": 0.3796, + "step": 2275 + }, + { + "epoch": 0.21440851605002237, + "grad_norm": 0.8112934231758118, + "learning_rate": 1.961192295931056e-05, + "loss": 0.3238, + "step": 2276 + }, + { + "epoch": 0.21450272014319022, + "grad_norm": 0.8712162971496582, + "learning_rate": 1.9611506267041892e-05, + "loss": 0.3982, + "step": 2277 + }, + { + "epoch": 0.21459692423635807, + "grad_norm": 0.8355295062065125, + "learning_rate": 1.9611089355616015e-05, + "loss": 0.3834, + "step": 2278 + }, + { + "epoch": 0.21469112832952592, + "grad_norm": 0.8796846270561218, + "learning_rate": 1.9610672225042436e-05, + "loss": 0.3615, + "step": 2279 + }, + { + "epoch": 0.21478533242269376, + "grad_norm": 0.8133159875869751, + "learning_rate": 1.9610254875330666e-05, + "loss": 0.3449, + "step": 2280 + }, + { + "epoch": 0.2148795365158616, + "grad_norm": 0.9933817386627197, + "learning_rate": 1.9609837306490223e-05, + "loss": 0.3976, + "step": 2281 + }, + { + "epoch": 0.21497374060902946, + "grad_norm": 0.8980244398117065, + "learning_rate": 1.9609419518530634e-05, + "loss": 0.3596, + "step": 2282 + }, + { + "epoch": 0.2150679447021973, + "grad_norm": 0.8660174608230591, + "learning_rate": 1.9609001511461412e-05, + "loss": 0.4017, + "step": 2283 + }, + { + "epoch": 0.21516214879536516, + "grad_norm": 0.8546890616416931, + "learning_rate": 1.9608583285292092e-05, + "loss": 0.3473, + "step": 2284 + }, + { + "epoch": 0.215256352888533, + "grad_norm": 0.8344088196754456, + "learning_rate": 1.960816484003222e-05, + "loss": 0.4092, + "step": 2285 + }, + { + "epoch": 0.21535055698170086, + "grad_norm": 0.7839269638061523, + "learning_rate": 1.9607746175691328e-05, + "loss": 0.3648, + "step": 2286 + }, + { + "epoch": 0.2154447610748687, + "grad_norm": 0.8078948855400085, + "learning_rate": 1.9607327292278966e-05, + "loss": 0.3477, + "step": 2287 + }, + { + "epoch": 0.21553896516803656, + "grad_norm": 0.9352668523788452, + "learning_rate": 1.960690818980468e-05, + "loss": 0.399, + "step": 2288 + }, + { + "epoch": 0.2156331692612044, + "grad_norm": 1.1535351276397705, + "learning_rate": 1.9606488868278034e-05, + "loss": 0.3995, + "step": 2289 + }, + { + "epoch": 0.21572737335437225, + "grad_norm": 0.8281784057617188, + "learning_rate": 1.960606932770858e-05, + "loss": 0.4028, + "step": 2290 + }, + { + "epoch": 0.2158215774475401, + "grad_norm": 1.015548825263977, + "learning_rate": 1.9605649568105894e-05, + "loss": 0.4305, + "step": 2291 + }, + { + "epoch": 0.21591578154070795, + "grad_norm": 1.4392837285995483, + "learning_rate": 1.960522958947954e-05, + "loss": 0.3371, + "step": 2292 + }, + { + "epoch": 0.2160099856338758, + "grad_norm": 0.7565015554428101, + "learning_rate": 1.96048093918391e-05, + "loss": 0.3277, + "step": 2293 + }, + { + "epoch": 0.21610418972704365, + "grad_norm": 0.9951907992362976, + "learning_rate": 1.9604388975194147e-05, + "loss": 0.4148, + "step": 2294 + }, + { + "epoch": 0.2161983938202115, + "grad_norm": 0.9554421305656433, + "learning_rate": 1.9603968339554275e-05, + "loss": 0.4156, + "step": 2295 + }, + { + "epoch": 0.21629259791337935, + "grad_norm": 0.9876624345779419, + "learning_rate": 1.960354748492907e-05, + "loss": 0.4328, + "step": 2296 + }, + { + "epoch": 0.2163868020065472, + "grad_norm": 0.9269981980323792, + "learning_rate": 1.960312641132813e-05, + "loss": 0.4169, + "step": 2297 + }, + { + "epoch": 0.21648100609971505, + "grad_norm": 1.119171142578125, + "learning_rate": 1.9602705118761055e-05, + "loss": 0.3629, + "step": 2298 + }, + { + "epoch": 0.2165752101928829, + "grad_norm": 0.882019579410553, + "learning_rate": 1.9602283607237456e-05, + "loss": 0.3466, + "step": 2299 + }, + { + "epoch": 0.21666941428605072, + "grad_norm": 0.9692645072937012, + "learning_rate": 1.960186187676694e-05, + "loss": 0.4182, + "step": 2300 + }, + { + "epoch": 0.21676361837921856, + "grad_norm": 0.9262769818305969, + "learning_rate": 1.9601439927359123e-05, + "loss": 0.3786, + "step": 2301 + }, + { + "epoch": 0.2168578224723864, + "grad_norm": 0.8466152548789978, + "learning_rate": 1.960101775902363e-05, + "loss": 0.3428, + "step": 2302 + }, + { + "epoch": 0.21695202656555426, + "grad_norm": 0.8468576073646545, + "learning_rate": 1.960059537177008e-05, + "loss": 0.3199, + "step": 2303 + }, + { + "epoch": 0.2170462306587221, + "grad_norm": 0.8485195636749268, + "learning_rate": 1.9600172765608106e-05, + "loss": 0.3474, + "step": 2304 + }, + { + "epoch": 0.21714043475188996, + "grad_norm": 0.9259114861488342, + "learning_rate": 1.959974994054735e-05, + "loss": 0.3489, + "step": 2305 + }, + { + "epoch": 0.2172346388450578, + "grad_norm": 0.882205605506897, + "learning_rate": 1.9599326896597448e-05, + "loss": 0.3951, + "step": 2306 + }, + { + "epoch": 0.21732884293822566, + "grad_norm": 0.8148806095123291, + "learning_rate": 1.9598903633768043e-05, + "loss": 0.3793, + "step": 2307 + }, + { + "epoch": 0.2174230470313935, + "grad_norm": 0.8608700037002563, + "learning_rate": 1.9598480152068795e-05, + "loss": 0.3707, + "step": 2308 + }, + { + "epoch": 0.21751725112456136, + "grad_norm": 0.9020053148269653, + "learning_rate": 1.9598056451509355e-05, + "loss": 0.3805, + "step": 2309 + }, + { + "epoch": 0.2176114552177292, + "grad_norm": 0.9641295075416565, + "learning_rate": 1.9597632532099383e-05, + "loss": 0.4584, + "step": 2310 + }, + { + "epoch": 0.21770565931089705, + "grad_norm": 0.7809056639671326, + "learning_rate": 1.9597208393848546e-05, + "loss": 0.3412, + "step": 2311 + }, + { + "epoch": 0.2177998634040649, + "grad_norm": 0.8260183930397034, + "learning_rate": 1.9596784036766517e-05, + "loss": 0.3251, + "step": 2312 + }, + { + "epoch": 0.21789406749723275, + "grad_norm": 1.05546236038208, + "learning_rate": 1.959635946086297e-05, + "loss": 0.3328, + "step": 2313 + }, + { + "epoch": 0.2179882715904006, + "grad_norm": 1.0210241079330444, + "learning_rate": 1.9595934666147584e-05, + "loss": 0.4393, + "step": 2314 + }, + { + "epoch": 0.21808247568356845, + "grad_norm": 0.8821229338645935, + "learning_rate": 1.959550965263005e-05, + "loss": 0.3898, + "step": 2315 + }, + { + "epoch": 0.2181766797767363, + "grad_norm": 1.152212381362915, + "learning_rate": 1.9595084420320054e-05, + "loss": 0.3787, + "step": 2316 + }, + { + "epoch": 0.21827088386990415, + "grad_norm": 0.8634756207466125, + "learning_rate": 1.9594658969227295e-05, + "loss": 0.4109, + "step": 2317 + }, + { + "epoch": 0.218365087963072, + "grad_norm": 1.010636568069458, + "learning_rate": 1.959423329936147e-05, + "loss": 0.4071, + "step": 2318 + }, + { + "epoch": 0.21845929205623985, + "grad_norm": 0.9700372815132141, + "learning_rate": 1.959380741073229e-05, + "loss": 0.3638, + "step": 2319 + }, + { + "epoch": 0.2185534961494077, + "grad_norm": 0.8460371494293213, + "learning_rate": 1.9593381303349467e-05, + "loss": 0.324, + "step": 2320 + }, + { + "epoch": 0.21864770024257554, + "grad_norm": 0.9131547212600708, + "learning_rate": 1.9592954977222712e-05, + "loss": 0.3414, + "step": 2321 + }, + { + "epoch": 0.2187419043357434, + "grad_norm": 1.0337926149368286, + "learning_rate": 1.959252843236175e-05, + "loss": 0.4075, + "step": 2322 + }, + { + "epoch": 0.21883610842891124, + "grad_norm": 0.7619706988334656, + "learning_rate": 1.95921016687763e-05, + "loss": 0.3393, + "step": 2323 + }, + { + "epoch": 0.2189303125220791, + "grad_norm": 0.8029950857162476, + "learning_rate": 1.95916746864761e-05, + "loss": 0.419, + "step": 2324 + }, + { + "epoch": 0.21902451661524694, + "grad_norm": 1.0024975538253784, + "learning_rate": 1.959124748547088e-05, + "loss": 0.427, + "step": 2325 + }, + { + "epoch": 0.2191187207084148, + "grad_norm": 1.0406956672668457, + "learning_rate": 1.9590820065770387e-05, + "loss": 0.3005, + "step": 2326 + }, + { + "epoch": 0.21921292480158264, + "grad_norm": 0.8180007934570312, + "learning_rate": 1.9590392427384364e-05, + "loss": 0.3335, + "step": 2327 + }, + { + "epoch": 0.21930712889475049, + "grad_norm": 0.9022271037101746, + "learning_rate": 1.9589964570322562e-05, + "loss": 0.402, + "step": 2328 + }, + { + "epoch": 0.21940133298791833, + "grad_norm": 0.8517004251480103, + "learning_rate": 1.958953649459474e-05, + "loss": 0.4099, + "step": 2329 + }, + { + "epoch": 0.21949553708108618, + "grad_norm": 0.8223914504051208, + "learning_rate": 1.958910820021065e-05, + "loss": 0.3771, + "step": 2330 + }, + { + "epoch": 0.21958974117425403, + "grad_norm": 0.9121590256690979, + "learning_rate": 1.9588679687180066e-05, + "loss": 0.3847, + "step": 2331 + }, + { + "epoch": 0.21968394526742188, + "grad_norm": 0.9172884225845337, + "learning_rate": 1.9588250955512755e-05, + "loss": 0.3553, + "step": 2332 + }, + { + "epoch": 0.21977814936058973, + "grad_norm": 1.0046167373657227, + "learning_rate": 1.9587822005218495e-05, + "loss": 0.4066, + "step": 2333 + }, + { + "epoch": 0.21987235345375755, + "grad_norm": 0.9230954051017761, + "learning_rate": 1.9587392836307064e-05, + "loss": 0.3668, + "step": 2334 + }, + { + "epoch": 0.2199665575469254, + "grad_norm": 1.1170899868011475, + "learning_rate": 1.9586963448788247e-05, + "loss": 0.3554, + "step": 2335 + }, + { + "epoch": 0.22006076164009325, + "grad_norm": 0.8206403255462646, + "learning_rate": 1.958653384267184e-05, + "loss": 0.3599, + "step": 2336 + }, + { + "epoch": 0.2201549657332611, + "grad_norm": 0.7312343716621399, + "learning_rate": 1.9586104017967632e-05, + "loss": 0.3199, + "step": 2337 + }, + { + "epoch": 0.22024916982642895, + "grad_norm": 0.8107249736785889, + "learning_rate": 1.958567397468543e-05, + "loss": 0.3333, + "step": 2338 + }, + { + "epoch": 0.2203433739195968, + "grad_norm": 0.8425604701042175, + "learning_rate": 1.958524371283504e-05, + "loss": 0.411, + "step": 2339 + }, + { + "epoch": 0.22043757801276465, + "grad_norm": 0.8197388052940369, + "learning_rate": 1.9584813232426266e-05, + "loss": 0.3963, + "step": 2340 + }, + { + "epoch": 0.2205317821059325, + "grad_norm": 0.9427674412727356, + "learning_rate": 1.9584382533468925e-05, + "loss": 0.3883, + "step": 2341 + }, + { + "epoch": 0.22062598619910034, + "grad_norm": 1.010420560836792, + "learning_rate": 1.9583951615972842e-05, + "loss": 0.4528, + "step": 2342 + }, + { + "epoch": 0.2207201902922682, + "grad_norm": 0.8485791683197021, + "learning_rate": 1.958352047994784e-05, + "loss": 0.3837, + "step": 2343 + }, + { + "epoch": 0.22081439438543604, + "grad_norm": 0.8851433992385864, + "learning_rate": 1.9583089125403752e-05, + "loss": 0.4084, + "step": 2344 + }, + { + "epoch": 0.2209085984786039, + "grad_norm": 0.7958040833473206, + "learning_rate": 1.958265755235041e-05, + "loss": 0.3346, + "step": 2345 + }, + { + "epoch": 0.22100280257177174, + "grad_norm": 0.8419966101646423, + "learning_rate": 1.9582225760797657e-05, + "loss": 0.3399, + "step": 2346 + }, + { + "epoch": 0.2210970066649396, + "grad_norm": 0.8404862284660339, + "learning_rate": 1.9581793750755334e-05, + "loss": 0.356, + "step": 2347 + }, + { + "epoch": 0.22119121075810744, + "grad_norm": 0.9682923555374146, + "learning_rate": 1.9581361522233297e-05, + "loss": 0.3622, + "step": 2348 + }, + { + "epoch": 0.22128541485127529, + "grad_norm": 0.8320052027702332, + "learning_rate": 1.9580929075241398e-05, + "loss": 0.3926, + "step": 2349 + }, + { + "epoch": 0.22137961894444314, + "grad_norm": 0.9958815574645996, + "learning_rate": 1.95804964097895e-05, + "loss": 0.4735, + "step": 2350 + }, + { + "epoch": 0.22147382303761098, + "grad_norm": 0.8752058744430542, + "learning_rate": 1.958006352588747e-05, + "loss": 0.3749, + "step": 2351 + }, + { + "epoch": 0.22156802713077883, + "grad_norm": 1.0377271175384521, + "learning_rate": 1.957963042354517e-05, + "loss": 0.3795, + "step": 2352 + }, + { + "epoch": 0.22166223122394668, + "grad_norm": 0.8700814843177795, + "learning_rate": 1.957919710277249e-05, + "loss": 0.3786, + "step": 2353 + }, + { + "epoch": 0.22175643531711453, + "grad_norm": 0.9008076190948486, + "learning_rate": 1.9578763563579292e-05, + "loss": 0.4087, + "step": 2354 + }, + { + "epoch": 0.22185063941028238, + "grad_norm": 0.8733604550361633, + "learning_rate": 1.9578329805975476e-05, + "loss": 0.3679, + "step": 2355 + }, + { + "epoch": 0.22194484350345023, + "grad_norm": 0.82168048620224, + "learning_rate": 1.9577895829970926e-05, + "loss": 0.3823, + "step": 2356 + }, + { + "epoch": 0.22203904759661808, + "grad_norm": 1.019991159439087, + "learning_rate": 1.9577461635575536e-05, + "loss": 0.3931, + "step": 2357 + }, + { + "epoch": 0.22213325168978593, + "grad_norm": 0.8327137231826782, + "learning_rate": 1.9577027222799213e-05, + "loss": 0.3533, + "step": 2358 + }, + { + "epoch": 0.22222745578295378, + "grad_norm": 0.8154199123382568, + "learning_rate": 1.9576592591651856e-05, + "loss": 0.3744, + "step": 2359 + }, + { + "epoch": 0.22232165987612162, + "grad_norm": 0.7728456258773804, + "learning_rate": 1.9576157742143377e-05, + "loss": 0.3196, + "step": 2360 + }, + { + "epoch": 0.22241586396928947, + "grad_norm": 0.903174102306366, + "learning_rate": 1.9575722674283695e-05, + "loss": 0.4026, + "step": 2361 + }, + { + "epoch": 0.22251006806245732, + "grad_norm": 0.8509536981582642, + "learning_rate": 1.9575287388082724e-05, + "loss": 0.3459, + "step": 2362 + }, + { + "epoch": 0.22260427215562517, + "grad_norm": 1.0349715948104858, + "learning_rate": 1.9574851883550395e-05, + "loss": 0.418, + "step": 2363 + }, + { + "epoch": 0.22269847624879302, + "grad_norm": 0.9048078656196594, + "learning_rate": 1.957441616069663e-05, + "loss": 0.3945, + "step": 2364 + }, + { + "epoch": 0.22279268034196087, + "grad_norm": 0.8549168705940247, + "learning_rate": 1.9573980219531375e-05, + "loss": 0.362, + "step": 2365 + }, + { + "epoch": 0.22288688443512872, + "grad_norm": 0.7860109806060791, + "learning_rate": 1.9573544060064562e-05, + "loss": 0.3395, + "step": 2366 + }, + { + "epoch": 0.22298108852829657, + "grad_norm": 1.1166032552719116, + "learning_rate": 1.9573107682306137e-05, + "loss": 0.3713, + "step": 2367 + }, + { + "epoch": 0.22307529262146442, + "grad_norm": 0.803830623626709, + "learning_rate": 1.9572671086266054e-05, + "loss": 0.3673, + "step": 2368 + }, + { + "epoch": 0.22316949671463224, + "grad_norm": 0.8809511661529541, + "learning_rate": 1.9572234271954268e-05, + "loss": 0.3921, + "step": 2369 + }, + { + "epoch": 0.2232637008078001, + "grad_norm": 0.8397429585456848, + "learning_rate": 1.9571797239380733e-05, + "loss": 0.359, + "step": 2370 + }, + { + "epoch": 0.22335790490096794, + "grad_norm": 1.1121681928634644, + "learning_rate": 1.9571359988555417e-05, + "loss": 0.4321, + "step": 2371 + }, + { + "epoch": 0.22345210899413578, + "grad_norm": 0.8415611386299133, + "learning_rate": 1.9570922519488294e-05, + "loss": 0.3582, + "step": 2372 + }, + { + "epoch": 0.22354631308730363, + "grad_norm": 0.8948222398757935, + "learning_rate": 1.9570484832189333e-05, + "loss": 0.3981, + "step": 2373 + }, + { + "epoch": 0.22364051718047148, + "grad_norm": 0.9170864224433899, + "learning_rate": 1.957004692666852e-05, + "loss": 0.4195, + "step": 2374 + }, + { + "epoch": 0.22373472127363933, + "grad_norm": 0.8808520436286926, + "learning_rate": 1.9569608802935834e-05, + "loss": 0.3551, + "step": 2375 + }, + { + "epoch": 0.22382892536680718, + "grad_norm": 0.8277885317802429, + "learning_rate": 1.956917046100127e-05, + "loss": 0.3425, + "step": 2376 + }, + { + "epoch": 0.22392312945997503, + "grad_norm": 0.787257730960846, + "learning_rate": 1.956873190087482e-05, + "loss": 0.3633, + "step": 2377 + }, + { + "epoch": 0.22401733355314288, + "grad_norm": 0.9543466567993164, + "learning_rate": 1.9568293122566485e-05, + "loss": 0.3769, + "step": 2378 + }, + { + "epoch": 0.22411153764631073, + "grad_norm": 0.8029462695121765, + "learning_rate": 1.9567854126086265e-05, + "loss": 0.3813, + "step": 2379 + }, + { + "epoch": 0.22420574173947858, + "grad_norm": 0.8587418794631958, + "learning_rate": 1.956741491144418e-05, + "loss": 0.413, + "step": 2380 + }, + { + "epoch": 0.22429994583264642, + "grad_norm": 0.8977055549621582, + "learning_rate": 1.9566975478650234e-05, + "loss": 0.3904, + "step": 2381 + }, + { + "epoch": 0.22439414992581427, + "grad_norm": 0.8071858882904053, + "learning_rate": 1.9566535827714452e-05, + "loss": 0.3345, + "step": 2382 + }, + { + "epoch": 0.22448835401898212, + "grad_norm": 0.8893081545829773, + "learning_rate": 1.956609595864686e-05, + "loss": 0.3728, + "step": 2383 + }, + { + "epoch": 0.22458255811214997, + "grad_norm": 1.0501766204833984, + "learning_rate": 1.9565655871457486e-05, + "loss": 0.327, + "step": 2384 + }, + { + "epoch": 0.22467676220531782, + "grad_norm": 0.9032719731330872, + "learning_rate": 1.9565215566156363e-05, + "loss": 0.4021, + "step": 2385 + }, + { + "epoch": 0.22477096629848567, + "grad_norm": 0.9261976480484009, + "learning_rate": 1.956477504275353e-05, + "loss": 0.4339, + "step": 2386 + }, + { + "epoch": 0.22486517039165352, + "grad_norm": 0.8292469382286072, + "learning_rate": 1.956433430125904e-05, + "loss": 0.3286, + "step": 2387 + }, + { + "epoch": 0.22495937448482137, + "grad_norm": 0.8598193526268005, + "learning_rate": 1.9563893341682935e-05, + "loss": 0.4096, + "step": 2388 + }, + { + "epoch": 0.22505357857798922, + "grad_norm": 0.8294149041175842, + "learning_rate": 1.9563452164035268e-05, + "loss": 0.3673, + "step": 2389 + }, + { + "epoch": 0.22514778267115707, + "grad_norm": 0.8138467669487, + "learning_rate": 1.95630107683261e-05, + "loss": 0.3452, + "step": 2390 + }, + { + "epoch": 0.22524198676432491, + "grad_norm": 0.9328336715698242, + "learning_rate": 1.95625691545655e-05, + "loss": 0.4335, + "step": 2391 + }, + { + "epoch": 0.22533619085749276, + "grad_norm": 0.8273537755012512, + "learning_rate": 1.956212732276353e-05, + "loss": 0.4029, + "step": 2392 + }, + { + "epoch": 0.2254303949506606, + "grad_norm": 0.9863900542259216, + "learning_rate": 1.9561685272930274e-05, + "loss": 0.4171, + "step": 2393 + }, + { + "epoch": 0.22552459904382846, + "grad_norm": 0.8728018403053284, + "learning_rate": 1.9561243005075807e-05, + "loss": 0.3981, + "step": 2394 + }, + { + "epoch": 0.2256188031369963, + "grad_norm": 0.9171895384788513, + "learning_rate": 1.956080051921021e-05, + "loss": 0.3948, + "step": 2395 + }, + { + "epoch": 0.22571300723016416, + "grad_norm": 0.7338156700134277, + "learning_rate": 1.9560357815343577e-05, + "loss": 0.3817, + "step": 2396 + }, + { + "epoch": 0.225807211323332, + "grad_norm": 0.8601246476173401, + "learning_rate": 1.9559914893486e-05, + "loss": 0.368, + "step": 2397 + }, + { + "epoch": 0.22590141541649986, + "grad_norm": 0.9178417921066284, + "learning_rate": 1.9559471753647577e-05, + "loss": 0.396, + "step": 2398 + }, + { + "epoch": 0.2259956195096677, + "grad_norm": 0.9245316386222839, + "learning_rate": 1.955902839583842e-05, + "loss": 0.4357, + "step": 2399 + }, + { + "epoch": 0.22608982360283555, + "grad_norm": 0.8712064027786255, + "learning_rate": 1.9558584820068625e-05, + "loss": 0.3908, + "step": 2400 + }, + { + "epoch": 0.2261840276960034, + "grad_norm": 0.8233629465103149, + "learning_rate": 1.9558141026348316e-05, + "loss": 0.3785, + "step": 2401 + }, + { + "epoch": 0.22627823178917125, + "grad_norm": 1.1006391048431396, + "learning_rate": 1.9557697014687608e-05, + "loss": 0.4707, + "step": 2402 + }, + { + "epoch": 0.2263724358823391, + "grad_norm": 0.7773765921592712, + "learning_rate": 1.955725278509663e-05, + "loss": 0.3321, + "step": 2403 + }, + { + "epoch": 0.22646663997550692, + "grad_norm": 0.8021467328071594, + "learning_rate": 1.9556808337585504e-05, + "loss": 0.3743, + "step": 2404 + }, + { + "epoch": 0.22656084406867477, + "grad_norm": 0.8988116979598999, + "learning_rate": 1.955636367216437e-05, + "loss": 0.3615, + "step": 2405 + }, + { + "epoch": 0.22665504816184262, + "grad_norm": 0.9378819465637207, + "learning_rate": 1.9555918788843366e-05, + "loss": 0.3699, + "step": 2406 + }, + { + "epoch": 0.22674925225501047, + "grad_norm": 0.9987820386886597, + "learning_rate": 1.9555473687632638e-05, + "loss": 0.4506, + "step": 2407 + }, + { + "epoch": 0.22684345634817832, + "grad_norm": 0.7900689244270325, + "learning_rate": 1.955502836854233e-05, + "loss": 0.3586, + "step": 2408 + }, + { + "epoch": 0.22693766044134617, + "grad_norm": 0.8035356998443604, + "learning_rate": 1.9554582831582594e-05, + "loss": 0.3441, + "step": 2409 + }, + { + "epoch": 0.22703186453451402, + "grad_norm": 0.9192503690719604, + "learning_rate": 1.9554137076763595e-05, + "loss": 0.4067, + "step": 2410 + }, + { + "epoch": 0.22712606862768187, + "grad_norm": 0.8710134029388428, + "learning_rate": 1.9553691104095495e-05, + "loss": 0.4135, + "step": 2411 + }, + { + "epoch": 0.22722027272084971, + "grad_norm": 0.8533443808555603, + "learning_rate": 1.9553244913588464e-05, + "loss": 0.3806, + "step": 2412 + }, + { + "epoch": 0.22731447681401756, + "grad_norm": 0.9453597664833069, + "learning_rate": 1.9552798505252674e-05, + "loss": 0.4078, + "step": 2413 + }, + { + "epoch": 0.2274086809071854, + "grad_norm": 0.8405705094337463, + "learning_rate": 1.9552351879098306e-05, + "loss": 0.4063, + "step": 2414 + }, + { + "epoch": 0.22750288500035326, + "grad_norm": 0.9767290353775024, + "learning_rate": 1.9551905035135542e-05, + "loss": 0.4363, + "step": 2415 + }, + { + "epoch": 0.2275970890935211, + "grad_norm": 0.9663323163986206, + "learning_rate": 1.9551457973374565e-05, + "loss": 0.4438, + "step": 2416 + }, + { + "epoch": 0.22769129318668896, + "grad_norm": 0.7720850110054016, + "learning_rate": 1.9551010693825582e-05, + "loss": 0.3287, + "step": 2417 + }, + { + "epoch": 0.2277854972798568, + "grad_norm": 0.8272558450698853, + "learning_rate": 1.955056319649878e-05, + "loss": 0.3713, + "step": 2418 + }, + { + "epoch": 0.22787970137302466, + "grad_norm": 0.8968222737312317, + "learning_rate": 1.9550115481404368e-05, + "loss": 0.4241, + "step": 2419 + }, + { + "epoch": 0.2279739054661925, + "grad_norm": 0.9261634349822998, + "learning_rate": 1.9549667548552557e-05, + "loss": 0.389, + "step": 2420 + }, + { + "epoch": 0.22806810955936035, + "grad_norm": 0.9221539497375488, + "learning_rate": 1.9549219397953552e-05, + "loss": 0.3776, + "step": 2421 + }, + { + "epoch": 0.2281623136525282, + "grad_norm": 0.8815743327140808, + "learning_rate": 1.9548771029617577e-05, + "loss": 0.3661, + "step": 2422 + }, + { + "epoch": 0.22825651774569605, + "grad_norm": 0.842648983001709, + "learning_rate": 1.9548322443554855e-05, + "loss": 0.3549, + "step": 2423 + }, + { + "epoch": 0.2283507218388639, + "grad_norm": 0.8210880160331726, + "learning_rate": 1.9547873639775616e-05, + "loss": 0.3738, + "step": 2424 + }, + { + "epoch": 0.22844492593203175, + "grad_norm": 0.8543195724487305, + "learning_rate": 1.9547424618290095e-05, + "loss": 0.3679, + "step": 2425 + }, + { + "epoch": 0.2285391300251996, + "grad_norm": 1.115783452987671, + "learning_rate": 1.954697537910852e-05, + "loss": 0.3838, + "step": 2426 + }, + { + "epoch": 0.22863333411836745, + "grad_norm": 0.866740345954895, + "learning_rate": 1.9546525922241147e-05, + "loss": 0.3688, + "step": 2427 + }, + { + "epoch": 0.2287275382115353, + "grad_norm": 0.7821134328842163, + "learning_rate": 1.9546076247698218e-05, + "loss": 0.3072, + "step": 2428 + }, + { + "epoch": 0.22882174230470315, + "grad_norm": 0.9816327095031738, + "learning_rate": 1.9545626355489986e-05, + "loss": 0.386, + "step": 2429 + }, + { + "epoch": 0.228915946397871, + "grad_norm": 0.8954646587371826, + "learning_rate": 1.9545176245626715e-05, + "loss": 0.3639, + "step": 2430 + }, + { + "epoch": 0.22901015049103884, + "grad_norm": 0.8879236578941345, + "learning_rate": 1.954472591811866e-05, + "loss": 0.385, + "step": 2431 + }, + { + "epoch": 0.2291043545842067, + "grad_norm": 0.8517841696739197, + "learning_rate": 1.954427537297609e-05, + "loss": 0.3821, + "step": 2432 + }, + { + "epoch": 0.22919855867737454, + "grad_norm": 0.8577247262001038, + "learning_rate": 1.9543824610209284e-05, + "loss": 0.4077, + "step": 2433 + }, + { + "epoch": 0.2292927627705424, + "grad_norm": 1.0095113515853882, + "learning_rate": 1.954337362982852e-05, + "loss": 0.4262, + "step": 2434 + }, + { + "epoch": 0.22938696686371024, + "grad_norm": 0.8891860246658325, + "learning_rate": 1.9542922431844074e-05, + "loss": 0.3888, + "step": 2435 + }, + { + "epoch": 0.2294811709568781, + "grad_norm": 0.7986401319503784, + "learning_rate": 1.9542471016266236e-05, + "loss": 0.3206, + "step": 2436 + }, + { + "epoch": 0.22957537505004594, + "grad_norm": 0.8828191161155701, + "learning_rate": 1.9542019383105306e-05, + "loss": 0.3792, + "step": 2437 + }, + { + "epoch": 0.22966957914321376, + "grad_norm": 0.873891294002533, + "learning_rate": 1.9541567532371577e-05, + "loss": 0.393, + "step": 2438 + }, + { + "epoch": 0.2297637832363816, + "grad_norm": 0.7492256760597229, + "learning_rate": 1.954111546407535e-05, + "loss": 0.3247, + "step": 2439 + }, + { + "epoch": 0.22985798732954946, + "grad_norm": 0.8440161943435669, + "learning_rate": 1.9540663178226936e-05, + "loss": 0.3894, + "step": 2440 + }, + { + "epoch": 0.2299521914227173, + "grad_norm": 0.828023374080658, + "learning_rate": 1.9540210674836645e-05, + "loss": 0.3587, + "step": 2441 + }, + { + "epoch": 0.23004639551588515, + "grad_norm": 0.8087053298950195, + "learning_rate": 1.9539757953914798e-05, + "loss": 0.3629, + "step": 2442 + }, + { + "epoch": 0.230140599609053, + "grad_norm": 0.8228766918182373, + "learning_rate": 1.9539305015471716e-05, + "loss": 0.3271, + "step": 2443 + }, + { + "epoch": 0.23023480370222085, + "grad_norm": 0.9928690791130066, + "learning_rate": 1.9538851859517728e-05, + "loss": 0.4028, + "step": 2444 + }, + { + "epoch": 0.2303290077953887, + "grad_norm": 0.8397461175918579, + "learning_rate": 1.9538398486063162e-05, + "loss": 0.3758, + "step": 2445 + }, + { + "epoch": 0.23042321188855655, + "grad_norm": 0.9226630926132202, + "learning_rate": 1.9537944895118362e-05, + "loss": 0.3873, + "step": 2446 + }, + { + "epoch": 0.2305174159817244, + "grad_norm": 0.864921510219574, + "learning_rate": 1.953749108669367e-05, + "loss": 0.3904, + "step": 2447 + }, + { + "epoch": 0.23061162007489225, + "grad_norm": 0.9132973551750183, + "learning_rate": 1.9537037060799423e-05, + "loss": 0.4133, + "step": 2448 + }, + { + "epoch": 0.2307058241680601, + "grad_norm": 0.8780419230461121, + "learning_rate": 1.9536582817445988e-05, + "loss": 0.4172, + "step": 2449 + }, + { + "epoch": 0.23080002826122795, + "grad_norm": 0.7681748270988464, + "learning_rate": 1.9536128356643715e-05, + "loss": 0.3133, + "step": 2450 + }, + { + "epoch": 0.2308942323543958, + "grad_norm": 0.7827266454696655, + "learning_rate": 1.9535673678402968e-05, + "loss": 0.3512, + "step": 2451 + }, + { + "epoch": 0.23098843644756364, + "grad_norm": 0.8806381225585938, + "learning_rate": 1.9535218782734112e-05, + "loss": 0.3926, + "step": 2452 + }, + { + "epoch": 0.2310826405407315, + "grad_norm": 0.8704474568367004, + "learning_rate": 1.9534763669647524e-05, + "loss": 0.3953, + "step": 2453 + }, + { + "epoch": 0.23117684463389934, + "grad_norm": 0.9033093452453613, + "learning_rate": 1.9534308339153573e-05, + "loss": 0.3794, + "step": 2454 + }, + { + "epoch": 0.2312710487270672, + "grad_norm": 0.8191359639167786, + "learning_rate": 1.9533852791262653e-05, + "loss": 0.4014, + "step": 2455 + }, + { + "epoch": 0.23136525282023504, + "grad_norm": 0.8146197199821472, + "learning_rate": 1.953339702598514e-05, + "loss": 0.3552, + "step": 2456 + }, + { + "epoch": 0.2314594569134029, + "grad_norm": 1.0001959800720215, + "learning_rate": 1.9532941043331437e-05, + "loss": 0.3923, + "step": 2457 + }, + { + "epoch": 0.23155366100657074, + "grad_norm": 0.9717074632644653, + "learning_rate": 1.953248484331193e-05, + "loss": 0.4005, + "step": 2458 + }, + { + "epoch": 0.2316478650997386, + "grad_norm": 0.9192183017730713, + "learning_rate": 1.9532028425937028e-05, + "loss": 0.4112, + "step": 2459 + }, + { + "epoch": 0.23174206919290644, + "grad_norm": 0.9282395243644714, + "learning_rate": 1.9531571791217137e-05, + "loss": 0.2936, + "step": 2460 + }, + { + "epoch": 0.23183627328607428, + "grad_norm": 0.8893698453903198, + "learning_rate": 1.953111493916267e-05, + "loss": 0.3518, + "step": 2461 + }, + { + "epoch": 0.23193047737924213, + "grad_norm": 0.9180042147636414, + "learning_rate": 1.953065786978404e-05, + "loss": 0.398, + "step": 2462 + }, + { + "epoch": 0.23202468147240998, + "grad_norm": 0.9521582722663879, + "learning_rate": 1.953020058309167e-05, + "loss": 0.407, + "step": 2463 + }, + { + "epoch": 0.23211888556557783, + "grad_norm": 0.835546612739563, + "learning_rate": 1.9529743079095992e-05, + "loss": 0.3468, + "step": 2464 + }, + { + "epoch": 0.23221308965874568, + "grad_norm": 0.8486538529396057, + "learning_rate": 1.9529285357807427e-05, + "loss": 0.3822, + "step": 2465 + }, + { + "epoch": 0.23230729375191353, + "grad_norm": 0.9088033437728882, + "learning_rate": 1.9528827419236425e-05, + "loss": 0.3657, + "step": 2466 + }, + { + "epoch": 0.23240149784508138, + "grad_norm": 0.9178118109703064, + "learning_rate": 1.952836926339342e-05, + "loss": 0.3661, + "step": 2467 + }, + { + "epoch": 0.23249570193824923, + "grad_norm": 0.7571852803230286, + "learning_rate": 1.9527910890288857e-05, + "loss": 0.3395, + "step": 2468 + }, + { + "epoch": 0.23258990603141708, + "grad_norm": 0.8834770917892456, + "learning_rate": 1.9527452299933192e-05, + "loss": 0.3964, + "step": 2469 + }, + { + "epoch": 0.23268411012458493, + "grad_norm": 0.9025447964668274, + "learning_rate": 1.952699349233688e-05, + "loss": 0.3745, + "step": 2470 + }, + { + "epoch": 0.23277831421775277, + "grad_norm": 0.8389468789100647, + "learning_rate": 1.9526534467510382e-05, + "loss": 0.3361, + "step": 2471 + }, + { + "epoch": 0.23287251831092062, + "grad_norm": 0.9149181246757507, + "learning_rate": 1.9526075225464166e-05, + "loss": 0.3805, + "step": 2472 + }, + { + "epoch": 0.23296672240408844, + "grad_norm": 0.9167463779449463, + "learning_rate": 1.9525615766208704e-05, + "loss": 0.3896, + "step": 2473 + }, + { + "epoch": 0.2330609264972563, + "grad_norm": 0.9016802906990051, + "learning_rate": 1.9525156089754468e-05, + "loss": 0.3993, + "step": 2474 + }, + { + "epoch": 0.23315513059042414, + "grad_norm": 0.8447569608688354, + "learning_rate": 1.9524696196111944e-05, + "loss": 0.364, + "step": 2475 + }, + { + "epoch": 0.233249334683592, + "grad_norm": 0.8921357989311218, + "learning_rate": 1.9524236085291614e-05, + "loss": 0.4126, + "step": 2476 + }, + { + "epoch": 0.23334353877675984, + "grad_norm": 0.9665382504463196, + "learning_rate": 1.9523775757303975e-05, + "loss": 0.4028, + "step": 2477 + }, + { + "epoch": 0.2334377428699277, + "grad_norm": 0.8849314451217651, + "learning_rate": 1.9523315212159518e-05, + "loss": 0.3623, + "step": 2478 + }, + { + "epoch": 0.23353194696309554, + "grad_norm": 0.9236788749694824, + "learning_rate": 1.952285444986875e-05, + "loss": 0.4164, + "step": 2479 + }, + { + "epoch": 0.2336261510562634, + "grad_norm": 0.9704916477203369, + "learning_rate": 1.9522393470442165e-05, + "loss": 0.3789, + "step": 2480 + }, + { + "epoch": 0.23372035514943124, + "grad_norm": 0.9800971746444702, + "learning_rate": 1.952193227389029e-05, + "loss": 0.4098, + "step": 2481 + }, + { + "epoch": 0.23381455924259908, + "grad_norm": 0.8776868581771851, + "learning_rate": 1.9521470860223633e-05, + "loss": 0.3835, + "step": 2482 + }, + { + "epoch": 0.23390876333576693, + "grad_norm": 0.7817308902740479, + "learning_rate": 1.952100922945271e-05, + "loss": 0.3545, + "step": 2483 + }, + { + "epoch": 0.23400296742893478, + "grad_norm": 0.8747748732566833, + "learning_rate": 1.9520547381588054e-05, + "loss": 0.3784, + "step": 2484 + }, + { + "epoch": 0.23409717152210263, + "grad_norm": 0.8949740529060364, + "learning_rate": 1.9520085316640196e-05, + "loss": 0.3799, + "step": 2485 + }, + { + "epoch": 0.23419137561527048, + "grad_norm": 0.8536345362663269, + "learning_rate": 1.9519623034619668e-05, + "loss": 0.4348, + "step": 2486 + }, + { + "epoch": 0.23428557970843833, + "grad_norm": 0.8350498080253601, + "learning_rate": 1.9519160535537013e-05, + "loss": 0.4019, + "step": 2487 + }, + { + "epoch": 0.23437978380160618, + "grad_norm": 0.7977070212364197, + "learning_rate": 1.9518697819402775e-05, + "loss": 0.3593, + "step": 2488 + }, + { + "epoch": 0.23447398789477403, + "grad_norm": 0.8070759177207947, + "learning_rate": 1.9518234886227505e-05, + "loss": 0.3255, + "step": 2489 + }, + { + "epoch": 0.23456819198794188, + "grad_norm": 0.9064916968345642, + "learning_rate": 1.9517771736021763e-05, + "loss": 0.357, + "step": 2490 + }, + { + "epoch": 0.23466239608110973, + "grad_norm": 0.9692730903625488, + "learning_rate": 1.95173083687961e-05, + "loss": 0.4252, + "step": 2491 + }, + { + "epoch": 0.23475660017427757, + "grad_norm": 0.8616840839385986, + "learning_rate": 1.9516844784561092e-05, + "loss": 0.3795, + "step": 2492 + }, + { + "epoch": 0.23485080426744542, + "grad_norm": 0.8935884833335876, + "learning_rate": 1.9516380983327305e-05, + "loss": 0.3543, + "step": 2493 + }, + { + "epoch": 0.23494500836061327, + "grad_norm": 0.8609200119972229, + "learning_rate": 1.9515916965105313e-05, + "loss": 0.3687, + "step": 2494 + }, + { + "epoch": 0.23503921245378112, + "grad_norm": 0.8925521969795227, + "learning_rate": 1.9515452729905697e-05, + "loss": 0.3509, + "step": 2495 + }, + { + "epoch": 0.23513341654694897, + "grad_norm": 1.0318409204483032, + "learning_rate": 1.9514988277739046e-05, + "loss": 0.429, + "step": 2496 + }, + { + "epoch": 0.23522762064011682, + "grad_norm": 0.8946929574012756, + "learning_rate": 1.951452360861594e-05, + "loss": 0.3891, + "step": 2497 + }, + { + "epoch": 0.23532182473328467, + "grad_norm": 0.9271129965782166, + "learning_rate": 1.951405872254699e-05, + "loss": 0.4159, + "step": 2498 + }, + { + "epoch": 0.23541602882645252, + "grad_norm": 0.8822172284126282, + "learning_rate": 1.9513593619542784e-05, + "loss": 0.3604, + "step": 2499 + }, + { + "epoch": 0.23551023291962037, + "grad_norm": 0.7919142246246338, + "learning_rate": 1.951312829961393e-05, + "loss": 0.3904, + "step": 2500 + }, + { + "epoch": 0.23560443701278821, + "grad_norm": 0.8769456744194031, + "learning_rate": 1.9512662762771034e-05, + "loss": 0.3692, + "step": 2501 + }, + { + "epoch": 0.23569864110595606, + "grad_norm": 0.8468378186225891, + "learning_rate": 1.9512197009024724e-05, + "loss": 0.3793, + "step": 2502 + }, + { + "epoch": 0.2357928451991239, + "grad_norm": 0.7992005944252014, + "learning_rate": 1.9511731038385605e-05, + "loss": 0.3163, + "step": 2503 + }, + { + "epoch": 0.23588704929229176, + "grad_norm": 1.0220085382461548, + "learning_rate": 1.951126485086431e-05, + "loss": 0.4069, + "step": 2504 + }, + { + "epoch": 0.2359812533854596, + "grad_norm": 0.8660708069801331, + "learning_rate": 1.9510798446471467e-05, + "loss": 0.3913, + "step": 2505 + }, + { + "epoch": 0.23607545747862746, + "grad_norm": 1.070881962776184, + "learning_rate": 1.951033182521771e-05, + "loss": 0.4357, + "step": 2506 + }, + { + "epoch": 0.23616966157179528, + "grad_norm": 0.8952769637107849, + "learning_rate": 1.950986498711368e-05, + "loss": 0.399, + "step": 2507 + }, + { + "epoch": 0.23626386566496313, + "grad_norm": 1.0205018520355225, + "learning_rate": 1.950939793217002e-05, + "loss": 0.375, + "step": 2508 + }, + { + "epoch": 0.23635806975813098, + "grad_norm": 1.0109773874282837, + "learning_rate": 1.950893066039738e-05, + "loss": 0.3929, + "step": 2509 + }, + { + "epoch": 0.23645227385129883, + "grad_norm": 1.0607317686080933, + "learning_rate": 1.9508463171806417e-05, + "loss": 0.3655, + "step": 2510 + }, + { + "epoch": 0.23654647794446668, + "grad_norm": 0.8195115923881531, + "learning_rate": 1.950799546640779e-05, + "loss": 0.3105, + "step": 2511 + }, + { + "epoch": 0.23664068203763453, + "grad_norm": 0.960180401802063, + "learning_rate": 1.950752754421216e-05, + "loss": 0.3719, + "step": 2512 + }, + { + "epoch": 0.23673488613080237, + "grad_norm": 0.8067786693572998, + "learning_rate": 1.9507059405230197e-05, + "loss": 0.3586, + "step": 2513 + }, + { + "epoch": 0.23682909022397022, + "grad_norm": 0.8305049538612366, + "learning_rate": 1.950659104947258e-05, + "loss": 0.3484, + "step": 2514 + }, + { + "epoch": 0.23692329431713807, + "grad_norm": 0.9226266741752625, + "learning_rate": 1.950612247694998e-05, + "loss": 0.3696, + "step": 2515 + }, + { + "epoch": 0.23701749841030592, + "grad_norm": 0.9060239195823669, + "learning_rate": 1.950565368767309e-05, + "loss": 0.3201, + "step": 2516 + }, + { + "epoch": 0.23711170250347377, + "grad_norm": 0.9163451790809631, + "learning_rate": 1.9505184681652594e-05, + "loss": 0.3689, + "step": 2517 + }, + { + "epoch": 0.23720590659664162, + "grad_norm": 0.9099129438400269, + "learning_rate": 1.950471545889919e-05, + "loss": 0.3589, + "step": 2518 + }, + { + "epoch": 0.23730011068980947, + "grad_norm": 0.9115674495697021, + "learning_rate": 1.9504246019423568e-05, + "loss": 0.3723, + "step": 2519 + }, + { + "epoch": 0.23739431478297732, + "grad_norm": 0.848437488079071, + "learning_rate": 1.950377636323644e-05, + "loss": 0.351, + "step": 2520 + }, + { + "epoch": 0.23748851887614517, + "grad_norm": 0.8350842595100403, + "learning_rate": 1.9503306490348518e-05, + "loss": 0.3698, + "step": 2521 + }, + { + "epoch": 0.23758272296931301, + "grad_norm": 0.8197678923606873, + "learning_rate": 1.950283640077051e-05, + "loss": 0.3891, + "step": 2522 + }, + { + "epoch": 0.23767692706248086, + "grad_norm": 0.8316652774810791, + "learning_rate": 1.950236609451313e-05, + "loss": 0.3894, + "step": 2523 + }, + { + "epoch": 0.2377711311556487, + "grad_norm": 0.8374845385551453, + "learning_rate": 1.9501895571587113e-05, + "loss": 0.3821, + "step": 2524 + }, + { + "epoch": 0.23786533524881656, + "grad_norm": 0.9662516713142395, + "learning_rate": 1.9501424832003176e-05, + "loss": 0.4459, + "step": 2525 + }, + { + "epoch": 0.2379595393419844, + "grad_norm": 0.7826052308082581, + "learning_rate": 1.9500953875772064e-05, + "loss": 0.3517, + "step": 2526 + }, + { + "epoch": 0.23805374343515226, + "grad_norm": 0.8361783027648926, + "learning_rate": 1.950048270290451e-05, + "loss": 0.4001, + "step": 2527 + }, + { + "epoch": 0.2381479475283201, + "grad_norm": 1.0288957357406616, + "learning_rate": 1.9500011313411253e-05, + "loss": 0.4515, + "step": 2528 + }, + { + "epoch": 0.23824215162148796, + "grad_norm": 0.7369556427001953, + "learning_rate": 1.949953970730305e-05, + "loss": 0.3296, + "step": 2529 + }, + { + "epoch": 0.2383363557146558, + "grad_norm": 0.8121489882469177, + "learning_rate": 1.9499067884590646e-05, + "loss": 0.4298, + "step": 2530 + }, + { + "epoch": 0.23843055980782366, + "grad_norm": 0.8475870490074158, + "learning_rate": 1.949859584528481e-05, + "loss": 0.3204, + "step": 2531 + }, + { + "epoch": 0.2385247639009915, + "grad_norm": 0.890520453453064, + "learning_rate": 1.9498123589396294e-05, + "loss": 0.3712, + "step": 2532 + }, + { + "epoch": 0.23861896799415935, + "grad_norm": 1.6862674951553345, + "learning_rate": 1.9497651116935874e-05, + "loss": 0.3774, + "step": 2533 + }, + { + "epoch": 0.2387131720873272, + "grad_norm": 0.9394170641899109, + "learning_rate": 1.949717842791432e-05, + "loss": 0.392, + "step": 2534 + }, + { + "epoch": 0.23880737618049505, + "grad_norm": 0.9652701020240784, + "learning_rate": 1.949670552234241e-05, + "loss": 0.3633, + "step": 2535 + }, + { + "epoch": 0.2389015802736629, + "grad_norm": 1.1054222583770752, + "learning_rate": 1.9496232400230927e-05, + "loss": 0.3531, + "step": 2536 + }, + { + "epoch": 0.23899578436683075, + "grad_norm": 1.029251217842102, + "learning_rate": 1.9495759061590654e-05, + "loss": 0.3779, + "step": 2537 + }, + { + "epoch": 0.2390899884599986, + "grad_norm": 0.9493234157562256, + "learning_rate": 1.9495285506432395e-05, + "loss": 0.4521, + "step": 2538 + }, + { + "epoch": 0.23918419255316645, + "grad_norm": 0.8804742097854614, + "learning_rate": 1.949481173476694e-05, + "loss": 0.3991, + "step": 2539 + }, + { + "epoch": 0.2392783966463343, + "grad_norm": 0.8648819327354431, + "learning_rate": 1.949433774660509e-05, + "loss": 0.3673, + "step": 2540 + }, + { + "epoch": 0.23937260073950214, + "grad_norm": 0.9381065964698792, + "learning_rate": 1.9493863541957662e-05, + "loss": 0.3908, + "step": 2541 + }, + { + "epoch": 0.23946680483266997, + "grad_norm": 0.8563313484191895, + "learning_rate": 1.9493389120835462e-05, + "loss": 0.3734, + "step": 2542 + }, + { + "epoch": 0.23956100892583782, + "grad_norm": 0.8014841675758362, + "learning_rate": 1.9492914483249304e-05, + "loss": 0.3001, + "step": 2543 + }, + { + "epoch": 0.23965521301900566, + "grad_norm": 0.9085146188735962, + "learning_rate": 1.949243962921002e-05, + "loss": 0.391, + "step": 2544 + }, + { + "epoch": 0.2397494171121735, + "grad_norm": 0.9402478933334351, + "learning_rate": 1.9491964558728428e-05, + "loss": 0.3498, + "step": 2545 + }, + { + "epoch": 0.23984362120534136, + "grad_norm": 0.8987466096878052, + "learning_rate": 1.9491489271815364e-05, + "loss": 0.4015, + "step": 2546 + }, + { + "epoch": 0.2399378252985092, + "grad_norm": 0.8225127458572388, + "learning_rate": 1.949101376848167e-05, + "loss": 0.4109, + "step": 2547 + }, + { + "epoch": 0.24003202939167706, + "grad_norm": 1.0403307676315308, + "learning_rate": 1.949053804873818e-05, + "loss": 0.3515, + "step": 2548 + }, + { + "epoch": 0.2401262334848449, + "grad_norm": 0.8384838104248047, + "learning_rate": 1.9490062112595745e-05, + "loss": 0.3844, + "step": 2549 + }, + { + "epoch": 0.24022043757801276, + "grad_norm": 1.0988948345184326, + "learning_rate": 1.9489585960065218e-05, + "loss": 0.3582, + "step": 2550 + }, + { + "epoch": 0.2403146416711806, + "grad_norm": 0.8224526643753052, + "learning_rate": 1.9489109591157458e-05, + "loss": 0.3549, + "step": 2551 + }, + { + "epoch": 0.24040884576434846, + "grad_norm": 0.8611648082733154, + "learning_rate": 1.948863300588332e-05, + "loss": 0.3626, + "step": 2552 + }, + { + "epoch": 0.2405030498575163, + "grad_norm": 0.8884101510047913, + "learning_rate": 1.9488156204253678e-05, + "loss": 0.3723, + "step": 2553 + }, + { + "epoch": 0.24059725395068415, + "grad_norm": 0.7818182110786438, + "learning_rate": 1.94876791862794e-05, + "loss": 0.3329, + "step": 2554 + }, + { + "epoch": 0.240691458043852, + "grad_norm": 0.8338054418563843, + "learning_rate": 1.9487201951971363e-05, + "loss": 0.3674, + "step": 2555 + }, + { + "epoch": 0.24078566213701985, + "grad_norm": 0.9098851680755615, + "learning_rate": 1.948672450134045e-05, + "loss": 0.3787, + "step": 2556 + }, + { + "epoch": 0.2408798662301877, + "grad_norm": 0.8057156205177307, + "learning_rate": 1.9486246834397546e-05, + "loss": 0.3679, + "step": 2557 + }, + { + "epoch": 0.24097407032335555, + "grad_norm": 0.9018388390541077, + "learning_rate": 1.9485768951153545e-05, + "loss": 0.3701, + "step": 2558 + }, + { + "epoch": 0.2410682744165234, + "grad_norm": 0.7368927001953125, + "learning_rate": 1.948529085161934e-05, + "loss": 0.3477, + "step": 2559 + }, + { + "epoch": 0.24116247850969125, + "grad_norm": 0.9526506662368774, + "learning_rate": 1.9484812535805835e-05, + "loss": 0.3872, + "step": 2560 + }, + { + "epoch": 0.2412566826028591, + "grad_norm": 0.8121480941772461, + "learning_rate": 1.948433400372394e-05, + "loss": 0.3629, + "step": 2561 + }, + { + "epoch": 0.24135088669602694, + "grad_norm": 0.9268913269042969, + "learning_rate": 1.9483855255384555e-05, + "loss": 0.4345, + "step": 2562 + }, + { + "epoch": 0.2414450907891948, + "grad_norm": 0.8510921001434326, + "learning_rate": 1.9483376290798603e-05, + "loss": 0.3706, + "step": 2563 + }, + { + "epoch": 0.24153929488236264, + "grad_norm": 0.9793437123298645, + "learning_rate": 1.9482897109977007e-05, + "loss": 0.3579, + "step": 2564 + }, + { + "epoch": 0.2416334989755305, + "grad_norm": 0.798780620098114, + "learning_rate": 1.9482417712930696e-05, + "loss": 0.3521, + "step": 2565 + }, + { + "epoch": 0.24172770306869834, + "grad_norm": 0.8711868524551392, + "learning_rate": 1.9481938099670592e-05, + "loss": 0.3817, + "step": 2566 + }, + { + "epoch": 0.2418219071618662, + "grad_norm": 0.7080528736114502, + "learning_rate": 1.9481458270207635e-05, + "loss": 0.328, + "step": 2567 + }, + { + "epoch": 0.24191611125503404, + "grad_norm": 1.154837965965271, + "learning_rate": 1.9480978224552766e-05, + "loss": 0.4186, + "step": 2568 + }, + { + "epoch": 0.2420103153482019, + "grad_norm": 0.9485461115837097, + "learning_rate": 1.9480497962716932e-05, + "loss": 0.3723, + "step": 2569 + }, + { + "epoch": 0.24210451944136974, + "grad_norm": 0.9751293063163757, + "learning_rate": 1.9480017484711083e-05, + "loss": 0.4036, + "step": 2570 + }, + { + "epoch": 0.24219872353453759, + "grad_norm": 0.9526638984680176, + "learning_rate": 1.947953679054617e-05, + "loss": 0.3897, + "step": 2571 + }, + { + "epoch": 0.24229292762770543, + "grad_norm": 0.9223150610923767, + "learning_rate": 1.9479055880233164e-05, + "loss": 0.4004, + "step": 2572 + }, + { + "epoch": 0.24238713172087328, + "grad_norm": 1.0746058225631714, + "learning_rate": 1.947857475378302e-05, + "loss": 0.4136, + "step": 2573 + }, + { + "epoch": 0.24248133581404113, + "grad_norm": 1.041205644607544, + "learning_rate": 1.9478093411206717e-05, + "loss": 0.41, + "step": 2574 + }, + { + "epoch": 0.24257553990720898, + "grad_norm": 0.8298726677894592, + "learning_rate": 1.947761185251522e-05, + "loss": 0.3915, + "step": 2575 + }, + { + "epoch": 0.24266974400037683, + "grad_norm": 0.8218982219696045, + "learning_rate": 1.947713007771952e-05, + "loss": 0.3271, + "step": 2576 + }, + { + "epoch": 0.24276394809354465, + "grad_norm": 0.8517094850540161, + "learning_rate": 1.9476648086830598e-05, + "loss": 0.3578, + "step": 2577 + }, + { + "epoch": 0.2428581521867125, + "grad_norm": 1.017295241355896, + "learning_rate": 1.947616587985944e-05, + "loss": 0.3737, + "step": 2578 + }, + { + "epoch": 0.24295235627988035, + "grad_norm": 0.9262570142745972, + "learning_rate": 1.947568345681705e-05, + "loss": 0.3378, + "step": 2579 + }, + { + "epoch": 0.2430465603730482, + "grad_norm": 0.9176540970802307, + "learning_rate": 1.9475200817714416e-05, + "loss": 0.4121, + "step": 2580 + }, + { + "epoch": 0.24314076446621605, + "grad_norm": 0.9431273341178894, + "learning_rate": 1.9474717962562554e-05, + "loss": 0.3315, + "step": 2581 + }, + { + "epoch": 0.2432349685593839, + "grad_norm": 0.9327338933944702, + "learning_rate": 1.947423489137247e-05, + "loss": 0.3689, + "step": 2582 + }, + { + "epoch": 0.24332917265255175, + "grad_norm": 0.7591254115104675, + "learning_rate": 1.9473751604155176e-05, + "loss": 0.326, + "step": 2583 + }, + { + "epoch": 0.2434233767457196, + "grad_norm": 0.8648123741149902, + "learning_rate": 1.9473268100921697e-05, + "loss": 0.3635, + "step": 2584 + }, + { + "epoch": 0.24351758083888744, + "grad_norm": 0.8905833959579468, + "learning_rate": 1.9472784381683052e-05, + "loss": 0.3974, + "step": 2585 + }, + { + "epoch": 0.2436117849320553, + "grad_norm": 1.354017734527588, + "learning_rate": 1.9472300446450273e-05, + "loss": 0.4085, + "step": 2586 + }, + { + "epoch": 0.24370598902522314, + "grad_norm": 0.8489964008331299, + "learning_rate": 1.9471816295234397e-05, + "loss": 0.3394, + "step": 2587 + }, + { + "epoch": 0.243800193118391, + "grad_norm": 0.832228422164917, + "learning_rate": 1.9471331928046457e-05, + "loss": 0.3554, + "step": 2588 + }, + { + "epoch": 0.24389439721155884, + "grad_norm": 0.9334375262260437, + "learning_rate": 1.9470847344897507e-05, + "loss": 0.3665, + "step": 2589 + }, + { + "epoch": 0.2439886013047267, + "grad_norm": 0.8970177173614502, + "learning_rate": 1.9470362545798583e-05, + "loss": 0.3646, + "step": 2590 + }, + { + "epoch": 0.24408280539789454, + "grad_norm": 0.8576048016548157, + "learning_rate": 1.9469877530760753e-05, + "loss": 0.4117, + "step": 2591 + }, + { + "epoch": 0.24417700949106239, + "grad_norm": 0.789254367351532, + "learning_rate": 1.946939229979507e-05, + "loss": 0.3471, + "step": 2592 + }, + { + "epoch": 0.24427121358423023, + "grad_norm": 0.8189065456390381, + "learning_rate": 1.9468906852912595e-05, + "loss": 0.3545, + "step": 2593 + }, + { + "epoch": 0.24436541767739808, + "grad_norm": 1.0103284120559692, + "learning_rate": 1.94684211901244e-05, + "loss": 0.3514, + "step": 2594 + }, + { + "epoch": 0.24445962177056593, + "grad_norm": 0.7853721380233765, + "learning_rate": 1.946793531144156e-05, + "loss": 0.3254, + "step": 2595 + }, + { + "epoch": 0.24455382586373378, + "grad_norm": 1.181939959526062, + "learning_rate": 1.9467449216875153e-05, + "loss": 0.4127, + "step": 2596 + }, + { + "epoch": 0.24464802995690163, + "grad_norm": 0.938714325428009, + "learning_rate": 1.946696290643626e-05, + "loss": 0.3301, + "step": 2597 + }, + { + "epoch": 0.24474223405006948, + "grad_norm": 0.8742538690567017, + "learning_rate": 1.946647638013597e-05, + "loss": 0.3783, + "step": 2598 + }, + { + "epoch": 0.24483643814323733, + "grad_norm": 0.8467581272125244, + "learning_rate": 1.946598963798538e-05, + "loss": 0.3545, + "step": 2599 + }, + { + "epoch": 0.24493064223640518, + "grad_norm": 0.8565823435783386, + "learning_rate": 1.946550267999559e-05, + "loss": 0.3615, + "step": 2600 + }, + { + "epoch": 0.24502484632957303, + "grad_norm": 0.9403883218765259, + "learning_rate": 1.94650155061777e-05, + "loss": 0.3955, + "step": 2601 + }, + { + "epoch": 0.24511905042274088, + "grad_norm": 0.907534122467041, + "learning_rate": 1.9464528116542816e-05, + "loss": 0.4037, + "step": 2602 + }, + { + "epoch": 0.24521325451590872, + "grad_norm": 0.9900031685829163, + "learning_rate": 1.9464040511102054e-05, + "loss": 0.39, + "step": 2603 + }, + { + "epoch": 0.24530745860907657, + "grad_norm": 0.8356846570968628, + "learning_rate": 1.9463552689866533e-05, + "loss": 0.3728, + "step": 2604 + }, + { + "epoch": 0.24540166270224442, + "grad_norm": 0.8656098246574402, + "learning_rate": 1.9463064652847373e-05, + "loss": 0.3525, + "step": 2605 + }, + { + "epoch": 0.24549586679541227, + "grad_norm": 0.7707147002220154, + "learning_rate": 1.9462576400055707e-05, + "loss": 0.3464, + "step": 2606 + }, + { + "epoch": 0.24559007088858012, + "grad_norm": 0.8406088352203369, + "learning_rate": 1.9462087931502663e-05, + "loss": 0.3718, + "step": 2607 + }, + { + "epoch": 0.24568427498174797, + "grad_norm": 0.9418051838874817, + "learning_rate": 1.946159924719938e-05, + "loss": 0.3888, + "step": 2608 + }, + { + "epoch": 0.24577847907491582, + "grad_norm": 0.9811563491821289, + "learning_rate": 1.9461110347157005e-05, + "loss": 0.4128, + "step": 2609 + }, + { + "epoch": 0.24587268316808367, + "grad_norm": 0.9344661235809326, + "learning_rate": 1.946062123138668e-05, + "loss": 0.3683, + "step": 2610 + }, + { + "epoch": 0.2459668872612515, + "grad_norm": 0.8481505513191223, + "learning_rate": 1.946013189989956e-05, + "loss": 0.332, + "step": 2611 + }, + { + "epoch": 0.24606109135441934, + "grad_norm": 0.8536441922187805, + "learning_rate": 1.94596423527068e-05, + "loss": 0.3681, + "step": 2612 + }, + { + "epoch": 0.24615529544758719, + "grad_norm": 0.8502405285835266, + "learning_rate": 1.945915258981957e-05, + "loss": 0.3773, + "step": 2613 + }, + { + "epoch": 0.24624949954075503, + "grad_norm": 0.8009442687034607, + "learning_rate": 1.9458662611249024e-05, + "loss": 0.3458, + "step": 2614 + }, + { + "epoch": 0.24634370363392288, + "grad_norm": 0.9556559920310974, + "learning_rate": 1.9458172417006347e-05, + "loss": 0.3471, + "step": 2615 + }, + { + "epoch": 0.24643790772709073, + "grad_norm": 0.8855275511741638, + "learning_rate": 1.9457682007102713e-05, + "loss": 0.3865, + "step": 2616 + }, + { + "epoch": 0.24653211182025858, + "grad_norm": 0.8879374265670776, + "learning_rate": 1.9457191381549304e-05, + "loss": 0.3541, + "step": 2617 + }, + { + "epoch": 0.24662631591342643, + "grad_norm": 0.7845891118049622, + "learning_rate": 1.94567005403573e-05, + "loss": 0.3418, + "step": 2618 + }, + { + "epoch": 0.24672052000659428, + "grad_norm": 0.8479444980621338, + "learning_rate": 1.9456209483537902e-05, + "loss": 0.3749, + "step": 2619 + }, + { + "epoch": 0.24681472409976213, + "grad_norm": 0.8177931904792786, + "learning_rate": 1.9455718211102305e-05, + "loss": 0.3965, + "step": 2620 + }, + { + "epoch": 0.24690892819292998, + "grad_norm": 0.9803379774093628, + "learning_rate": 1.9455226723061704e-05, + "loss": 0.4162, + "step": 2621 + }, + { + "epoch": 0.24700313228609783, + "grad_norm": 0.7587687969207764, + "learning_rate": 1.9454735019427315e-05, + "loss": 0.3643, + "step": 2622 + }, + { + "epoch": 0.24709733637926568, + "grad_norm": 0.8936195373535156, + "learning_rate": 1.9454243100210345e-05, + "loss": 0.3969, + "step": 2623 + }, + { + "epoch": 0.24719154047243352, + "grad_norm": 0.9062477350234985, + "learning_rate": 1.945375096542201e-05, + "loss": 0.4177, + "step": 2624 + }, + { + "epoch": 0.24728574456560137, + "grad_norm": 0.8809071183204651, + "learning_rate": 1.9453258615073534e-05, + "loss": 0.3703, + "step": 2625 + }, + { + "epoch": 0.24737994865876922, + "grad_norm": 1.2047926187515259, + "learning_rate": 1.9452766049176143e-05, + "loss": 0.4173, + "step": 2626 + }, + { + "epoch": 0.24747415275193707, + "grad_norm": 1.0378501415252686, + "learning_rate": 1.9452273267741063e-05, + "loss": 0.4247, + "step": 2627 + }, + { + "epoch": 0.24756835684510492, + "grad_norm": 0.9909701943397522, + "learning_rate": 1.9451780270779538e-05, + "loss": 0.4418, + "step": 2628 + }, + { + "epoch": 0.24766256093827277, + "grad_norm": 0.8759151697158813, + "learning_rate": 1.94512870583028e-05, + "loss": 0.368, + "step": 2629 + }, + { + "epoch": 0.24775676503144062, + "grad_norm": 0.7846768498420715, + "learning_rate": 1.9450793630322106e-05, + "loss": 0.3262, + "step": 2630 + }, + { + "epoch": 0.24785096912460847, + "grad_norm": 0.9084791541099548, + "learning_rate": 1.94502999868487e-05, + "loss": 0.3998, + "step": 2631 + }, + { + "epoch": 0.24794517321777632, + "grad_norm": 0.8984910249710083, + "learning_rate": 1.944980612789384e-05, + "loss": 0.3733, + "step": 2632 + }, + { + "epoch": 0.24803937731094416, + "grad_norm": 0.8418054580688477, + "learning_rate": 1.9449312053468782e-05, + "loss": 0.3929, + "step": 2633 + }, + { + "epoch": 0.248133581404112, + "grad_norm": 0.7799889445304871, + "learning_rate": 1.94488177635848e-05, + "loss": 0.3721, + "step": 2634 + }, + { + "epoch": 0.24822778549727986, + "grad_norm": 1.1770952939987183, + "learning_rate": 1.9448323258253157e-05, + "loss": 0.3851, + "step": 2635 + }, + { + "epoch": 0.2483219895904477, + "grad_norm": 0.7383617162704468, + "learning_rate": 1.9447828537485133e-05, + "loss": 0.3525, + "step": 2636 + }, + { + "epoch": 0.24841619368361556, + "grad_norm": 0.9092996120452881, + "learning_rate": 1.9447333601292006e-05, + "loss": 0.3579, + "step": 2637 + }, + { + "epoch": 0.2485103977767834, + "grad_norm": 0.8483371138572693, + "learning_rate": 1.9446838449685064e-05, + "loss": 0.3626, + "step": 2638 + }, + { + "epoch": 0.24860460186995126, + "grad_norm": 0.8367277383804321, + "learning_rate": 1.9446343082675594e-05, + "loss": 0.3537, + "step": 2639 + }, + { + "epoch": 0.2486988059631191, + "grad_norm": 0.7318528890609741, + "learning_rate": 1.9445847500274895e-05, + "loss": 0.3382, + "step": 2640 + }, + { + "epoch": 0.24879301005628696, + "grad_norm": 0.8454012274742126, + "learning_rate": 1.944535170249426e-05, + "loss": 0.3134, + "step": 2641 + }, + { + "epoch": 0.2488872141494548, + "grad_norm": 0.8470341563224792, + "learning_rate": 1.9444855689345002e-05, + "loss": 0.3589, + "step": 2642 + }, + { + "epoch": 0.24898141824262265, + "grad_norm": 0.8552345633506775, + "learning_rate": 1.944435946083843e-05, + "loss": 0.36, + "step": 2643 + }, + { + "epoch": 0.2490756223357905, + "grad_norm": 0.7981129288673401, + "learning_rate": 1.944386301698585e-05, + "loss": 0.3207, + "step": 2644 + }, + { + "epoch": 0.24916982642895835, + "grad_norm": 0.8820785284042358, + "learning_rate": 1.9443366357798594e-05, + "loss": 0.3725, + "step": 2645 + }, + { + "epoch": 0.24926403052212617, + "grad_norm": 0.8401349782943726, + "learning_rate": 1.944286948328798e-05, + "loss": 0.3567, + "step": 2646 + }, + { + "epoch": 0.24935823461529402, + "grad_norm": 0.8499570488929749, + "learning_rate": 1.9442372393465332e-05, + "loss": 0.3798, + "step": 2647 + }, + { + "epoch": 0.24945243870846187, + "grad_norm": 0.8055731058120728, + "learning_rate": 1.9441875088342e-05, + "loss": 0.3517, + "step": 2648 + }, + { + "epoch": 0.24954664280162972, + "grad_norm": 0.8906497359275818, + "learning_rate": 1.9441377567929306e-05, + "loss": 0.3686, + "step": 2649 + }, + { + "epoch": 0.24964084689479757, + "grad_norm": 0.8505920767784119, + "learning_rate": 1.9440879832238603e-05, + "loss": 0.4328, + "step": 2650 + }, + { + "epoch": 0.24973505098796542, + "grad_norm": 0.9310007691383362, + "learning_rate": 1.944038188128124e-05, + "loss": 0.4144, + "step": 2651 + }, + { + "epoch": 0.24982925508113327, + "grad_norm": 0.8184910416603088, + "learning_rate": 1.9439883715068572e-05, + "loss": 0.3753, + "step": 2652 + }, + { + "epoch": 0.24992345917430112, + "grad_norm": 0.7765772938728333, + "learning_rate": 1.9439385333611954e-05, + "loss": 0.3676, + "step": 2653 + }, + { + "epoch": 0.250017663267469, + "grad_norm": 0.9684597253799438, + "learning_rate": 1.9438886736922757e-05, + "loss": 0.4205, + "step": 2654 + }, + { + "epoch": 0.25011186736063684, + "grad_norm": 0.7522620558738708, + "learning_rate": 1.943838792501234e-05, + "loss": 0.3473, + "step": 2655 + }, + { + "epoch": 0.2502060714538047, + "grad_norm": 1.0273616313934326, + "learning_rate": 1.9437888897892085e-05, + "loss": 0.3991, + "step": 2656 + }, + { + "epoch": 0.25030027554697254, + "grad_norm": 0.8650879859924316, + "learning_rate": 1.943738965557336e-05, + "loss": 0.3243, + "step": 2657 + }, + { + "epoch": 0.2503944796401404, + "grad_norm": 0.7960026860237122, + "learning_rate": 1.9436890198067565e-05, + "loss": 0.3458, + "step": 2658 + }, + { + "epoch": 0.25048868373330824, + "grad_norm": 0.8975011110305786, + "learning_rate": 1.9436390525386072e-05, + "loss": 0.3925, + "step": 2659 + }, + { + "epoch": 0.2505828878264761, + "grad_norm": 0.8295131325721741, + "learning_rate": 1.9435890637540284e-05, + "loss": 0.3433, + "step": 2660 + }, + { + "epoch": 0.25067709191964394, + "grad_norm": 0.9529341459274292, + "learning_rate": 1.9435390534541598e-05, + "loss": 0.3631, + "step": 2661 + }, + { + "epoch": 0.2507712960128118, + "grad_norm": 0.7790176868438721, + "learning_rate": 1.9434890216401416e-05, + "loss": 0.367, + "step": 2662 + }, + { + "epoch": 0.2508655001059796, + "grad_norm": 0.9206838011741638, + "learning_rate": 1.943438968313114e-05, + "loss": 0.3433, + "step": 2663 + }, + { + "epoch": 0.2509597041991474, + "grad_norm": 0.8748577237129211, + "learning_rate": 1.9433888934742193e-05, + "loss": 0.3746, + "step": 2664 + }, + { + "epoch": 0.2510539082923153, + "grad_norm": 0.8435645699501038, + "learning_rate": 1.943338797124599e-05, + "loss": 0.3615, + "step": 2665 + }, + { + "epoch": 0.2511481123854831, + "grad_norm": 1.0106738805770874, + "learning_rate": 1.943288679265395e-05, + "loss": 0.4068, + "step": 2666 + }, + { + "epoch": 0.251242316478651, + "grad_norm": 0.911796510219574, + "learning_rate": 1.94323853989775e-05, + "loss": 0.4102, + "step": 2667 + }, + { + "epoch": 0.2513365205718188, + "grad_norm": 0.9706193804740906, + "learning_rate": 1.943188379022808e-05, + "loss": 0.366, + "step": 2668 + }, + { + "epoch": 0.25143072466498667, + "grad_norm": 0.8246974945068359, + "learning_rate": 1.9431381966417125e-05, + "loss": 0.3813, + "step": 2669 + }, + { + "epoch": 0.2515249287581545, + "grad_norm": 0.7846889495849609, + "learning_rate": 1.943087992755607e-05, + "loss": 0.3331, + "step": 2670 + }, + { + "epoch": 0.25161913285132237, + "grad_norm": 0.7623711824417114, + "learning_rate": 1.9430377673656372e-05, + "loss": 0.3241, + "step": 2671 + }, + { + "epoch": 0.2517133369444902, + "grad_norm": 0.7921915650367737, + "learning_rate": 1.9429875204729476e-05, + "loss": 0.3245, + "step": 2672 + }, + { + "epoch": 0.25180754103765807, + "grad_norm": 0.7707971334457397, + "learning_rate": 1.942937252078684e-05, + "loss": 0.3439, + "step": 2673 + }, + { + "epoch": 0.2519017451308259, + "grad_norm": 0.8475367426872253, + "learning_rate": 1.942886962183993e-05, + "loss": 0.3564, + "step": 2674 + }, + { + "epoch": 0.25199594922399376, + "grad_norm": 0.9145054817199707, + "learning_rate": 1.942836650790021e-05, + "loss": 0.4058, + "step": 2675 + }, + { + "epoch": 0.2520901533171616, + "grad_norm": 0.9165285229682922, + "learning_rate": 1.9427863178979152e-05, + "loss": 0.3632, + "step": 2676 + }, + { + "epoch": 0.25218435741032946, + "grad_norm": 0.9520749449729919, + "learning_rate": 1.9427359635088235e-05, + "loss": 0.3624, + "step": 2677 + }, + { + "epoch": 0.2522785615034973, + "grad_norm": 0.9126151204109192, + "learning_rate": 1.9426855876238937e-05, + "loss": 0.3721, + "step": 2678 + }, + { + "epoch": 0.25237276559666516, + "grad_norm": 0.8739978671073914, + "learning_rate": 1.9426351902442746e-05, + "loss": 0.3466, + "step": 2679 + }, + { + "epoch": 0.252466969689833, + "grad_norm": 0.8436156511306763, + "learning_rate": 1.9425847713711155e-05, + "loss": 0.3804, + "step": 2680 + }, + { + "epoch": 0.25256117378300086, + "grad_norm": 0.9655857682228088, + "learning_rate": 1.9425343310055654e-05, + "loss": 0.3739, + "step": 2681 + }, + { + "epoch": 0.2526553778761687, + "grad_norm": 0.8512799143791199, + "learning_rate": 1.9424838691487755e-05, + "loss": 0.3698, + "step": 2682 + }, + { + "epoch": 0.25274958196933656, + "grad_norm": 0.8704470992088318, + "learning_rate": 1.9424333858018954e-05, + "loss": 0.3619, + "step": 2683 + }, + { + "epoch": 0.2528437860625044, + "grad_norm": 0.8650441765785217, + "learning_rate": 1.942382880966077e-05, + "loss": 0.3811, + "step": 2684 + }, + { + "epoch": 0.25293799015567225, + "grad_norm": 0.8505967259407043, + "learning_rate": 1.9423323546424712e-05, + "loss": 0.3659, + "step": 2685 + }, + { + "epoch": 0.2530321942488401, + "grad_norm": 0.8026577234268188, + "learning_rate": 1.9422818068322303e-05, + "loss": 0.3506, + "step": 2686 + }, + { + "epoch": 0.25312639834200795, + "grad_norm": 0.98709636926651, + "learning_rate": 1.942231237536507e-05, + "loss": 0.333, + "step": 2687 + }, + { + "epoch": 0.2532206024351758, + "grad_norm": 0.8574108481407166, + "learning_rate": 1.9421806467564546e-05, + "loss": 0.378, + "step": 2688 + }, + { + "epoch": 0.25331480652834365, + "grad_norm": 0.8704533576965332, + "learning_rate": 1.942130034493226e-05, + "loss": 0.3253, + "step": 2689 + }, + { + "epoch": 0.2534090106215115, + "grad_norm": 0.8911964893341064, + "learning_rate": 1.9420794007479757e-05, + "loss": 0.3907, + "step": 2690 + }, + { + "epoch": 0.25350321471467935, + "grad_norm": 0.811689019203186, + "learning_rate": 1.942028745521858e-05, + "loss": 0.3441, + "step": 2691 + }, + { + "epoch": 0.2535974188078472, + "grad_norm": 0.9497051239013672, + "learning_rate": 1.9419780688160285e-05, + "loss": 0.4407, + "step": 2692 + }, + { + "epoch": 0.25369162290101505, + "grad_norm": 0.8960627317428589, + "learning_rate": 1.9419273706316416e-05, + "loss": 0.3707, + "step": 2693 + }, + { + "epoch": 0.2537858269941829, + "grad_norm": 0.8265642523765564, + "learning_rate": 1.9418766509698544e-05, + "loss": 0.3648, + "step": 2694 + }, + { + "epoch": 0.25388003108735074, + "grad_norm": 0.805389404296875, + "learning_rate": 1.9418259098318226e-05, + "loss": 0.3685, + "step": 2695 + }, + { + "epoch": 0.2539742351805186, + "grad_norm": 1.0355446338653564, + "learning_rate": 1.9417751472187032e-05, + "loss": 0.3896, + "step": 2696 + }, + { + "epoch": 0.25406843927368644, + "grad_norm": 0.9594706892967224, + "learning_rate": 1.9417243631316548e-05, + "loss": 0.3926, + "step": 2697 + }, + { + "epoch": 0.2541626433668543, + "grad_norm": 0.8476678729057312, + "learning_rate": 1.941673557571834e-05, + "loss": 0.3579, + "step": 2698 + }, + { + "epoch": 0.25425684746002214, + "grad_norm": 1.066336989402771, + "learning_rate": 1.9416227305404e-05, + "loss": 0.3944, + "step": 2699 + }, + { + "epoch": 0.25435105155319, + "grad_norm": 0.8706140518188477, + "learning_rate": 1.941571882038511e-05, + "loss": 0.3879, + "step": 2700 + }, + { + "epoch": 0.25444525564635784, + "grad_norm": 0.8646376132965088, + "learning_rate": 1.9415210120673275e-05, + "loss": 0.3663, + "step": 2701 + }, + { + "epoch": 0.2545394597395257, + "grad_norm": 0.9006446003913879, + "learning_rate": 1.9414701206280083e-05, + "loss": 0.3779, + "step": 2702 + }, + { + "epoch": 0.25463366383269354, + "grad_norm": 0.831724226474762, + "learning_rate": 1.941419207721715e-05, + "loss": 0.3611, + "step": 2703 + }, + { + "epoch": 0.2547278679258614, + "grad_norm": 0.8399516940116882, + "learning_rate": 1.9413682733496073e-05, + "loss": 0.3824, + "step": 2704 + }, + { + "epoch": 0.25482207201902923, + "grad_norm": 0.8994430303573608, + "learning_rate": 1.9413173175128472e-05, + "loss": 0.3605, + "step": 2705 + }, + { + "epoch": 0.2549162761121971, + "grad_norm": 0.7757525444030762, + "learning_rate": 1.9412663402125968e-05, + "loss": 0.349, + "step": 2706 + }, + { + "epoch": 0.25501048020536493, + "grad_norm": 0.8783275485038757, + "learning_rate": 1.9412153414500176e-05, + "loss": 0.3803, + "step": 2707 + }, + { + "epoch": 0.2551046842985328, + "grad_norm": 0.9689286947250366, + "learning_rate": 1.9411643212262733e-05, + "loss": 0.3418, + "step": 2708 + }, + { + "epoch": 0.25519888839170063, + "grad_norm": 0.70932537317276, + "learning_rate": 1.941113279542527e-05, + "loss": 0.2944, + "step": 2709 + }, + { + "epoch": 0.2552930924848685, + "grad_norm": 0.9320895671844482, + "learning_rate": 1.9410622163999423e-05, + "loss": 0.4067, + "step": 2710 + }, + { + "epoch": 0.2553872965780363, + "grad_norm": 0.8390423059463501, + "learning_rate": 1.941011131799684e-05, + "loss": 0.3477, + "step": 2711 + }, + { + "epoch": 0.2554815006712042, + "grad_norm": 0.876505434513092, + "learning_rate": 1.9409600257429164e-05, + "loss": 0.388, + "step": 2712 + }, + { + "epoch": 0.255575704764372, + "grad_norm": 0.9468458890914917, + "learning_rate": 1.940908898230805e-05, + "loss": 0.3464, + "step": 2713 + }, + { + "epoch": 0.2556699088575399, + "grad_norm": 0.9577358961105347, + "learning_rate": 1.9408577492645153e-05, + "loss": 0.4196, + "step": 2714 + }, + { + "epoch": 0.2557641129507077, + "grad_norm": 0.7596282362937927, + "learning_rate": 1.940806578845214e-05, + "loss": 0.3717, + "step": 2715 + }, + { + "epoch": 0.25585831704387557, + "grad_norm": 0.8836774826049805, + "learning_rate": 1.940755386974068e-05, + "loss": 0.362, + "step": 2716 + }, + { + "epoch": 0.2559525211370434, + "grad_norm": 0.8619797825813293, + "learning_rate": 1.940704173652244e-05, + "loss": 0.3593, + "step": 2717 + }, + { + "epoch": 0.25604672523021127, + "grad_norm": 0.7850464582443237, + "learning_rate": 1.9406529388809103e-05, + "loss": 0.3108, + "step": 2718 + }, + { + "epoch": 0.2561409293233791, + "grad_norm": 0.8899826407432556, + "learning_rate": 1.9406016826612346e-05, + "loss": 0.4021, + "step": 2719 + }, + { + "epoch": 0.25623513341654697, + "grad_norm": 0.9437614679336548, + "learning_rate": 1.9405504049943858e-05, + "loss": 0.4167, + "step": 2720 + }, + { + "epoch": 0.2563293375097148, + "grad_norm": 0.860201358795166, + "learning_rate": 1.9404991058815335e-05, + "loss": 0.3854, + "step": 2721 + }, + { + "epoch": 0.25642354160288267, + "grad_norm": 0.8130446672439575, + "learning_rate": 1.9404477853238467e-05, + "loss": 0.3256, + "step": 2722 + }, + { + "epoch": 0.2565177456960505, + "grad_norm": 0.834983766078949, + "learning_rate": 1.9403964433224963e-05, + "loss": 0.3715, + "step": 2723 + }, + { + "epoch": 0.25661194978921836, + "grad_norm": 0.9256806969642639, + "learning_rate": 1.9403450798786525e-05, + "loss": 0.3505, + "step": 2724 + }, + { + "epoch": 0.2567061538823862, + "grad_norm": 1.429145097732544, + "learning_rate": 1.9402936949934865e-05, + "loss": 0.3304, + "step": 2725 + }, + { + "epoch": 0.25680035797555406, + "grad_norm": 0.8746187090873718, + "learning_rate": 1.94024228866817e-05, + "loss": 0.3588, + "step": 2726 + }, + { + "epoch": 0.2568945620687219, + "grad_norm": 0.9026733636856079, + "learning_rate": 1.9401908609038752e-05, + "loss": 0.3895, + "step": 2727 + }, + { + "epoch": 0.25698876616188976, + "grad_norm": 0.8675488829612732, + "learning_rate": 1.940139411701775e-05, + "loss": 0.3645, + "step": 2728 + }, + { + "epoch": 0.2570829702550576, + "grad_norm": 0.9900367259979248, + "learning_rate": 1.940087941063042e-05, + "loss": 0.3669, + "step": 2729 + }, + { + "epoch": 0.25717717434822546, + "grad_norm": 0.8831049799919128, + "learning_rate": 1.9400364489888505e-05, + "loss": 0.3588, + "step": 2730 + }, + { + "epoch": 0.2572713784413933, + "grad_norm": 0.7980867624282837, + "learning_rate": 1.9399849354803735e-05, + "loss": 0.3983, + "step": 2731 + }, + { + "epoch": 0.2573655825345611, + "grad_norm": 1.0000221729278564, + "learning_rate": 1.939933400538787e-05, + "loss": 0.4212, + "step": 2732 + }, + { + "epoch": 0.25745978662772895, + "grad_norm": 0.8112549185752869, + "learning_rate": 1.939881844165265e-05, + "loss": 0.3665, + "step": 2733 + }, + { + "epoch": 0.2575539907208968, + "grad_norm": 0.7917676568031311, + "learning_rate": 1.9398302663609835e-05, + "loss": 0.3281, + "step": 2734 + }, + { + "epoch": 0.25764819481406465, + "grad_norm": 0.9711036682128906, + "learning_rate": 1.9397786671271182e-05, + "loss": 0.375, + "step": 2735 + }, + { + "epoch": 0.2577423989072325, + "grad_norm": 0.938240110874176, + "learning_rate": 1.9397270464648457e-05, + "loss": 0.4084, + "step": 2736 + }, + { + "epoch": 0.25783660300040034, + "grad_norm": 0.7782493829727173, + "learning_rate": 1.9396754043753437e-05, + "loss": 0.3454, + "step": 2737 + }, + { + "epoch": 0.2579308070935682, + "grad_norm": 0.7555148005485535, + "learning_rate": 1.9396237408597893e-05, + "loss": 0.3702, + "step": 2738 + }, + { + "epoch": 0.25802501118673604, + "grad_norm": 0.8061802387237549, + "learning_rate": 1.93957205591936e-05, + "loss": 0.371, + "step": 2739 + }, + { + "epoch": 0.2581192152799039, + "grad_norm": 0.8084131479263306, + "learning_rate": 1.9395203495552352e-05, + "loss": 0.3319, + "step": 2740 + }, + { + "epoch": 0.25821341937307174, + "grad_norm": 0.8582677841186523, + "learning_rate": 1.9394686217685933e-05, + "loss": 0.3741, + "step": 2741 + }, + { + "epoch": 0.2583076234662396, + "grad_norm": 0.8153006434440613, + "learning_rate": 1.9394168725606136e-05, + "loss": 0.3701, + "step": 2742 + }, + { + "epoch": 0.25840182755940744, + "grad_norm": 0.8366333246231079, + "learning_rate": 1.939365101932477e-05, + "loss": 0.4172, + "step": 2743 + }, + { + "epoch": 0.2584960316525753, + "grad_norm": 0.7946856021881104, + "learning_rate": 1.9393133098853627e-05, + "loss": 0.3562, + "step": 2744 + }, + { + "epoch": 0.25859023574574314, + "grad_norm": 0.7716690301895142, + "learning_rate": 1.9392614964204526e-05, + "loss": 0.3434, + "step": 2745 + }, + { + "epoch": 0.258684439838911, + "grad_norm": 0.8663679957389832, + "learning_rate": 1.9392096615389273e-05, + "loss": 0.3262, + "step": 2746 + }, + { + "epoch": 0.25877864393207883, + "grad_norm": 0.8803080916404724, + "learning_rate": 1.9391578052419697e-05, + "loss": 0.3629, + "step": 2747 + }, + { + "epoch": 0.2588728480252467, + "grad_norm": 0.9695155024528503, + "learning_rate": 1.9391059275307615e-05, + "loss": 0.4092, + "step": 2748 + }, + { + "epoch": 0.25896705211841453, + "grad_norm": 0.8796296119689941, + "learning_rate": 1.9390540284064858e-05, + "loss": 0.3786, + "step": 2749 + }, + { + "epoch": 0.2590612562115824, + "grad_norm": 0.7485645413398743, + "learning_rate": 1.9390021078703262e-05, + "loss": 0.3524, + "step": 2750 + }, + { + "epoch": 0.25915546030475023, + "grad_norm": 1.0117424726486206, + "learning_rate": 1.938950165923466e-05, + "loss": 0.3258, + "step": 2751 + }, + { + "epoch": 0.2592496643979181, + "grad_norm": 0.7757304906845093, + "learning_rate": 1.9388982025670903e-05, + "loss": 0.3172, + "step": 2752 + }, + { + "epoch": 0.2593438684910859, + "grad_norm": 0.8561500310897827, + "learning_rate": 1.9388462178023834e-05, + "loss": 0.3562, + "step": 2753 + }, + { + "epoch": 0.2594380725842538, + "grad_norm": 0.7588501572608948, + "learning_rate": 1.9387942116305307e-05, + "loss": 0.3297, + "step": 2754 + }, + { + "epoch": 0.2595322766774216, + "grad_norm": 0.8772037625312805, + "learning_rate": 1.938742184052718e-05, + "loss": 0.4068, + "step": 2755 + }, + { + "epoch": 0.2596264807705895, + "grad_norm": 0.8602582812309265, + "learning_rate": 1.938690135070132e-05, + "loss": 0.3845, + "step": 2756 + }, + { + "epoch": 0.2597206848637573, + "grad_norm": 0.88422691822052, + "learning_rate": 1.938638064683959e-05, + "loss": 0.3799, + "step": 2757 + }, + { + "epoch": 0.25981488895692517, + "grad_norm": 0.8500288724899292, + "learning_rate": 1.9385859728953866e-05, + "loss": 0.3806, + "step": 2758 + }, + { + "epoch": 0.259909093050093, + "grad_norm": 0.837928831577301, + "learning_rate": 1.938533859705602e-05, + "loss": 0.3755, + "step": 2759 + }, + { + "epoch": 0.26000329714326087, + "grad_norm": 0.880570113658905, + "learning_rate": 1.9384817251157945e-05, + "loss": 0.4118, + "step": 2760 + }, + { + "epoch": 0.2600975012364287, + "grad_norm": 0.9217453598976135, + "learning_rate": 1.9384295691271523e-05, + "loss": 0.4329, + "step": 2761 + }, + { + "epoch": 0.26019170532959657, + "grad_norm": 0.8721567392349243, + "learning_rate": 1.9383773917408644e-05, + "loss": 0.3772, + "step": 2762 + }, + { + "epoch": 0.2602859094227644, + "grad_norm": 0.9112967848777771, + "learning_rate": 1.9383251929581208e-05, + "loss": 0.3699, + "step": 2763 + }, + { + "epoch": 0.26038011351593227, + "grad_norm": 0.8288027048110962, + "learning_rate": 1.9382729727801116e-05, + "loss": 0.3522, + "step": 2764 + }, + { + "epoch": 0.2604743176091001, + "grad_norm": 0.7674428224563599, + "learning_rate": 1.9382207312080275e-05, + "loss": 0.3699, + "step": 2765 + }, + { + "epoch": 0.26056852170226796, + "grad_norm": 0.8218756318092346, + "learning_rate": 1.9381684682430597e-05, + "loss": 0.4002, + "step": 2766 + }, + { + "epoch": 0.2606627257954358, + "grad_norm": 0.8563632965087891, + "learning_rate": 1.9381161838864e-05, + "loss": 0.3556, + "step": 2767 + }, + { + "epoch": 0.26075692988860366, + "grad_norm": 0.7240821123123169, + "learning_rate": 1.9380638781392406e-05, + "loss": 0.3441, + "step": 2768 + }, + { + "epoch": 0.2608511339817715, + "grad_norm": 0.9017511010169983, + "learning_rate": 1.9380115510027742e-05, + "loss": 0.3579, + "step": 2769 + }, + { + "epoch": 0.26094533807493936, + "grad_norm": 0.8759740591049194, + "learning_rate": 1.9379592024781932e-05, + "loss": 0.3554, + "step": 2770 + }, + { + "epoch": 0.2610395421681072, + "grad_norm": 0.848834753036499, + "learning_rate": 1.937906832566692e-05, + "loss": 0.3683, + "step": 2771 + }, + { + "epoch": 0.26113374626127506, + "grad_norm": 0.8701202273368835, + "learning_rate": 1.937854441269465e-05, + "loss": 0.4041, + "step": 2772 + }, + { + "epoch": 0.2612279503544429, + "grad_norm": 0.8468180298805237, + "learning_rate": 1.9378020285877056e-05, + "loss": 0.3281, + "step": 2773 + }, + { + "epoch": 0.26132215444761075, + "grad_norm": 0.7769880890846252, + "learning_rate": 1.93774959452261e-05, + "loss": 0.3327, + "step": 2774 + }, + { + "epoch": 0.2614163585407786, + "grad_norm": 0.8081656694412231, + "learning_rate": 1.9376971390753736e-05, + "loss": 0.3383, + "step": 2775 + }, + { + "epoch": 0.26151056263394645, + "grad_norm": 0.7840548157691956, + "learning_rate": 1.937644662247192e-05, + "loss": 0.364, + "step": 2776 + }, + { + "epoch": 0.2616047667271143, + "grad_norm": 0.7989134192466736, + "learning_rate": 1.937592164039262e-05, + "loss": 0.342, + "step": 2777 + }, + { + "epoch": 0.26169897082028215, + "grad_norm": 0.8289729356765747, + "learning_rate": 1.9375396444527807e-05, + "loss": 0.4254, + "step": 2778 + }, + { + "epoch": 0.26179317491345, + "grad_norm": 0.7800250053405762, + "learning_rate": 1.9374871034889457e-05, + "loss": 0.337, + "step": 2779 + }, + { + "epoch": 0.26188737900661785, + "grad_norm": 0.866980254650116, + "learning_rate": 1.9374345411489546e-05, + "loss": 0.3631, + "step": 2780 + }, + { + "epoch": 0.2619815830997857, + "grad_norm": 0.8509105443954468, + "learning_rate": 1.9373819574340064e-05, + "loss": 0.3452, + "step": 2781 + }, + { + "epoch": 0.26207578719295355, + "grad_norm": 0.771759033203125, + "learning_rate": 1.9373293523452996e-05, + "loss": 0.3488, + "step": 2782 + }, + { + "epoch": 0.2621699912861214, + "grad_norm": 0.9045215249061584, + "learning_rate": 1.937276725884034e-05, + "loss": 0.4015, + "step": 2783 + }, + { + "epoch": 0.26226419537928924, + "grad_norm": 0.9882833957672119, + "learning_rate": 1.93722407805141e-05, + "loss": 0.4366, + "step": 2784 + }, + { + "epoch": 0.2623583994724571, + "grad_norm": 0.7805891036987305, + "learning_rate": 1.937171408848627e-05, + "loss": 0.3357, + "step": 2785 + }, + { + "epoch": 0.26245260356562494, + "grad_norm": 0.8759221434593201, + "learning_rate": 1.937118718276887e-05, + "loss": 0.3699, + "step": 2786 + }, + { + "epoch": 0.2625468076587928, + "grad_norm": 0.8115664124488831, + "learning_rate": 1.9370660063373905e-05, + "loss": 0.3987, + "step": 2787 + }, + { + "epoch": 0.26264101175196064, + "grad_norm": 0.8264844417572021, + "learning_rate": 1.9370132730313403e-05, + "loss": 0.3385, + "step": 2788 + }, + { + "epoch": 0.2627352158451285, + "grad_norm": 0.8643694519996643, + "learning_rate": 1.9369605183599377e-05, + "loss": 0.38, + "step": 2789 + }, + { + "epoch": 0.26282941993829634, + "grad_norm": 0.7324055433273315, + "learning_rate": 1.936907742324387e-05, + "loss": 0.3248, + "step": 2790 + }, + { + "epoch": 0.2629236240314642, + "grad_norm": 0.928814172744751, + "learning_rate": 1.9368549449258903e-05, + "loss": 0.3749, + "step": 2791 + }, + { + "epoch": 0.26301782812463204, + "grad_norm": 0.8646256327629089, + "learning_rate": 1.9368021261656523e-05, + "loss": 0.3552, + "step": 2792 + }, + { + "epoch": 0.2631120322177999, + "grad_norm": 0.9403034448623657, + "learning_rate": 1.936749286044877e-05, + "loss": 0.3957, + "step": 2793 + }, + { + "epoch": 0.26320623631096773, + "grad_norm": 0.8048885464668274, + "learning_rate": 1.936696424564769e-05, + "loss": 0.3386, + "step": 2794 + }, + { + "epoch": 0.2633004404041356, + "grad_norm": 0.8479284644126892, + "learning_rate": 1.936643541726534e-05, + "loss": 0.3765, + "step": 2795 + }, + { + "epoch": 0.26339464449730343, + "grad_norm": 0.895535409450531, + "learning_rate": 1.936590637531378e-05, + "loss": 0.3849, + "step": 2796 + }, + { + "epoch": 0.2634888485904713, + "grad_norm": 0.7173005938529968, + "learning_rate": 1.9365377119805068e-05, + "loss": 0.3217, + "step": 2797 + }, + { + "epoch": 0.26358305268363913, + "grad_norm": 0.8052489161491394, + "learning_rate": 1.936484765075127e-05, + "loss": 0.3446, + "step": 2798 + }, + { + "epoch": 0.263677256776807, + "grad_norm": 0.8405939340591431, + "learning_rate": 1.9364317968164466e-05, + "loss": 0.3687, + "step": 2799 + }, + { + "epoch": 0.2637714608699748, + "grad_norm": 1.122497797012329, + "learning_rate": 1.936378807205673e-05, + "loss": 0.4123, + "step": 2800 + }, + { + "epoch": 0.2638656649631426, + "grad_norm": 0.8345810174942017, + "learning_rate": 1.9363257962440147e-05, + "loss": 0.4561, + "step": 2801 + }, + { + "epoch": 0.26395986905631047, + "grad_norm": 0.7875660061836243, + "learning_rate": 1.9362727639326798e-05, + "loss": 0.3285, + "step": 2802 + }, + { + "epoch": 0.2640540731494783, + "grad_norm": 0.7944983243942261, + "learning_rate": 1.936219710272878e-05, + "loss": 0.3591, + "step": 2803 + }, + { + "epoch": 0.26414827724264617, + "grad_norm": 0.899957537651062, + "learning_rate": 1.936166635265819e-05, + "loss": 0.4162, + "step": 2804 + }, + { + "epoch": 0.264242481335814, + "grad_norm": 0.8116116523742676, + "learning_rate": 1.936113538912713e-05, + "loss": 0.3344, + "step": 2805 + }, + { + "epoch": 0.26433668542898187, + "grad_norm": 0.8326045870780945, + "learning_rate": 1.9360604212147706e-05, + "loss": 0.3389, + "step": 2806 + }, + { + "epoch": 0.2644308895221497, + "grad_norm": 0.845786452293396, + "learning_rate": 1.9360072821732027e-05, + "loss": 0.422, + "step": 2807 + }, + { + "epoch": 0.26452509361531756, + "grad_norm": 0.7908915877342224, + "learning_rate": 1.935954121789221e-05, + "loss": 0.3638, + "step": 2808 + }, + { + "epoch": 0.2646192977084854, + "grad_norm": 0.8267839550971985, + "learning_rate": 1.9359009400640384e-05, + "loss": 0.3936, + "step": 2809 + }, + { + "epoch": 0.26471350180165326, + "grad_norm": 0.8255811333656311, + "learning_rate": 1.935847736998867e-05, + "loss": 0.3582, + "step": 2810 + }, + { + "epoch": 0.2648077058948211, + "grad_norm": 0.8436806797981262, + "learning_rate": 1.9357945125949194e-05, + "loss": 0.4031, + "step": 2811 + }, + { + "epoch": 0.26490190998798896, + "grad_norm": 0.8261404037475586, + "learning_rate": 1.9357412668534098e-05, + "loss": 0.3373, + "step": 2812 + }, + { + "epoch": 0.2649961140811568, + "grad_norm": 0.8110752105712891, + "learning_rate": 1.9356879997755525e-05, + "loss": 0.3454, + "step": 2813 + }, + { + "epoch": 0.26509031817432466, + "grad_norm": 0.7850409746170044, + "learning_rate": 1.9356347113625612e-05, + "loss": 0.3491, + "step": 2814 + }, + { + "epoch": 0.2651845222674925, + "grad_norm": 0.8864262700080872, + "learning_rate": 1.935581401615652e-05, + "loss": 0.398, + "step": 2815 + }, + { + "epoch": 0.26527872636066036, + "grad_norm": 0.94769686460495, + "learning_rate": 1.9355280705360395e-05, + "loss": 0.3932, + "step": 2816 + }, + { + "epoch": 0.2653729304538282, + "grad_norm": 0.9494755864143372, + "learning_rate": 1.9354747181249406e-05, + "loss": 0.3352, + "step": 2817 + }, + { + "epoch": 0.26546713454699605, + "grad_norm": 0.7967216968536377, + "learning_rate": 1.935421344383571e-05, + "loss": 0.356, + "step": 2818 + }, + { + "epoch": 0.2655613386401639, + "grad_norm": 0.8378462791442871, + "learning_rate": 1.9353679493131486e-05, + "loss": 0.3712, + "step": 2819 + }, + { + "epoch": 0.26565554273333175, + "grad_norm": 0.8824589252471924, + "learning_rate": 1.9353145329148898e-05, + "loss": 0.425, + "step": 2820 + }, + { + "epoch": 0.2657497468264996, + "grad_norm": 0.76907879114151, + "learning_rate": 1.9352610951900135e-05, + "loss": 0.3427, + "step": 2821 + }, + { + "epoch": 0.26584395091966745, + "grad_norm": 0.9133584499359131, + "learning_rate": 1.9352076361397376e-05, + "loss": 0.3723, + "step": 2822 + }, + { + "epoch": 0.2659381550128353, + "grad_norm": 0.8533929586410522, + "learning_rate": 1.9351541557652816e-05, + "loss": 0.3978, + "step": 2823 + }, + { + "epoch": 0.26603235910600315, + "grad_norm": 0.8981824517250061, + "learning_rate": 1.935100654067864e-05, + "loss": 0.3694, + "step": 2824 + }, + { + "epoch": 0.266126563199171, + "grad_norm": 0.9650942087173462, + "learning_rate": 1.9350471310487063e-05, + "loss": 0.409, + "step": 2825 + }, + { + "epoch": 0.26622076729233884, + "grad_norm": 0.8894109129905701, + "learning_rate": 1.9349935867090272e-05, + "loss": 0.3412, + "step": 2826 + }, + { + "epoch": 0.2663149713855067, + "grad_norm": 0.8993169665336609, + "learning_rate": 1.9349400210500482e-05, + "loss": 0.3701, + "step": 2827 + }, + { + "epoch": 0.26640917547867454, + "grad_norm": 0.9565675258636475, + "learning_rate": 1.9348864340729915e-05, + "loss": 0.4271, + "step": 2828 + }, + { + "epoch": 0.2665033795718424, + "grad_norm": 0.8573203682899475, + "learning_rate": 1.9348328257790777e-05, + "loss": 0.3429, + "step": 2829 + }, + { + "epoch": 0.26659758366501024, + "grad_norm": 0.8001130223274231, + "learning_rate": 1.93477919616953e-05, + "loss": 0.3578, + "step": 2830 + }, + { + "epoch": 0.2666917877581781, + "grad_norm": 1.0578571557998657, + "learning_rate": 1.9347255452455707e-05, + "loss": 0.4134, + "step": 2831 + }, + { + "epoch": 0.26678599185134594, + "grad_norm": 1.0042641162872314, + "learning_rate": 1.9346718730084238e-05, + "loss": 0.436, + "step": 2832 + }, + { + "epoch": 0.2668801959445138, + "grad_norm": 0.8931630253791809, + "learning_rate": 1.9346181794593123e-05, + "loss": 0.3758, + "step": 2833 + }, + { + "epoch": 0.26697440003768164, + "grad_norm": 0.885483980178833, + "learning_rate": 1.934564464599461e-05, + "loss": 0.3731, + "step": 2834 + }, + { + "epoch": 0.2670686041308495, + "grad_norm": 0.8488326668739319, + "learning_rate": 1.9345107284300945e-05, + "loss": 0.3707, + "step": 2835 + }, + { + "epoch": 0.26716280822401733, + "grad_norm": 0.7997558116912842, + "learning_rate": 1.9344569709524385e-05, + "loss": 0.3458, + "step": 2836 + }, + { + "epoch": 0.2672570123171852, + "grad_norm": 0.8410530090332031, + "learning_rate": 1.934403192167718e-05, + "loss": 0.3859, + "step": 2837 + }, + { + "epoch": 0.26735121641035303, + "grad_norm": 0.8481351137161255, + "learning_rate": 1.93434939207716e-05, + "loss": 0.3582, + "step": 2838 + }, + { + "epoch": 0.2674454205035209, + "grad_norm": 0.8808439373970032, + "learning_rate": 1.9342955706819905e-05, + "loss": 0.3709, + "step": 2839 + }, + { + "epoch": 0.26753962459668873, + "grad_norm": 0.8731141686439514, + "learning_rate": 1.9342417279834373e-05, + "loss": 0.3665, + "step": 2840 + }, + { + "epoch": 0.2676338286898566, + "grad_norm": 1.0484288930892944, + "learning_rate": 1.9341878639827277e-05, + "loss": 0.4142, + "step": 2841 + }, + { + "epoch": 0.2677280327830244, + "grad_norm": 0.8605663180351257, + "learning_rate": 1.93413397868109e-05, + "loss": 0.3377, + "step": 2842 + }, + { + "epoch": 0.2678222368761923, + "grad_norm": 0.8253269195556641, + "learning_rate": 1.9340800720797533e-05, + "loss": 0.381, + "step": 2843 + }, + { + "epoch": 0.2679164409693601, + "grad_norm": 0.8432108163833618, + "learning_rate": 1.934026144179946e-05, + "loss": 0.3714, + "step": 2844 + }, + { + "epoch": 0.268010645062528, + "grad_norm": 1.0049387216567993, + "learning_rate": 1.9339721949828982e-05, + "loss": 0.4052, + "step": 2845 + }, + { + "epoch": 0.2681048491556958, + "grad_norm": 0.911248505115509, + "learning_rate": 1.93391822448984e-05, + "loss": 0.4099, + "step": 2846 + }, + { + "epoch": 0.2681990532488637, + "grad_norm": 0.7077834010124207, + "learning_rate": 1.933864232702002e-05, + "loss": 0.3201, + "step": 2847 + }, + { + "epoch": 0.2682932573420315, + "grad_norm": 0.9433247447013855, + "learning_rate": 1.9338102196206155e-05, + "loss": 0.3602, + "step": 2848 + }, + { + "epoch": 0.26838746143519937, + "grad_norm": 0.7861822247505188, + "learning_rate": 1.9337561852469113e-05, + "loss": 0.3539, + "step": 2849 + }, + { + "epoch": 0.2684816655283672, + "grad_norm": 0.8620867729187012, + "learning_rate": 1.9337021295821224e-05, + "loss": 0.3702, + "step": 2850 + }, + { + "epoch": 0.26857586962153507, + "grad_norm": 0.7886176705360413, + "learning_rate": 1.9336480526274806e-05, + "loss": 0.3428, + "step": 2851 + }, + { + "epoch": 0.2686700737147029, + "grad_norm": 0.8866832852363586, + "learning_rate": 1.9335939543842195e-05, + "loss": 0.3175, + "step": 2852 + }, + { + "epoch": 0.26876427780787077, + "grad_norm": 0.9585645794868469, + "learning_rate": 1.9335398348535724e-05, + "loss": 0.4021, + "step": 2853 + }, + { + "epoch": 0.2688584819010386, + "grad_norm": 0.8415154218673706, + "learning_rate": 1.933485694036773e-05, + "loss": 0.3992, + "step": 2854 + }, + { + "epoch": 0.26895268599420646, + "grad_norm": 0.8004138469696045, + "learning_rate": 1.9334315319350567e-05, + "loss": 0.3325, + "step": 2855 + }, + { + "epoch": 0.2690468900873743, + "grad_norm": 0.8173907399177551, + "learning_rate": 1.9333773485496575e-05, + "loss": 0.3323, + "step": 2856 + }, + { + "epoch": 0.26914109418054216, + "grad_norm": 0.8906183242797852, + "learning_rate": 1.933323143881811e-05, + "loss": 0.3828, + "step": 2857 + }, + { + "epoch": 0.26923529827371, + "grad_norm": 0.8708525896072388, + "learning_rate": 1.933268917932754e-05, + "loss": 0.3806, + "step": 2858 + }, + { + "epoch": 0.26932950236687786, + "grad_norm": 0.8405883312225342, + "learning_rate": 1.9332146707037222e-05, + "loss": 0.3543, + "step": 2859 + }, + { + "epoch": 0.2694237064600457, + "grad_norm": 0.7336653470993042, + "learning_rate": 1.9331604021959526e-05, + "loss": 0.3285, + "step": 2860 + }, + { + "epoch": 0.26951791055321356, + "grad_norm": 0.8797584176063538, + "learning_rate": 1.9331061124106826e-05, + "loss": 0.3395, + "step": 2861 + }, + { + "epoch": 0.2696121146463814, + "grad_norm": 0.9453029632568359, + "learning_rate": 1.93305180134915e-05, + "loss": 0.4, + "step": 2862 + }, + { + "epoch": 0.26970631873954926, + "grad_norm": 1.0486997365951538, + "learning_rate": 1.9329974690125937e-05, + "loss": 0.3158, + "step": 2863 + }, + { + "epoch": 0.2698005228327171, + "grad_norm": 0.8365334272384644, + "learning_rate": 1.932943115402252e-05, + "loss": 0.3438, + "step": 2864 + }, + { + "epoch": 0.26989472692588495, + "grad_norm": 0.9265006184577942, + "learning_rate": 1.9328887405193645e-05, + "loss": 0.3936, + "step": 2865 + }, + { + "epoch": 0.2699889310190528, + "grad_norm": 0.8348186016082764, + "learning_rate": 1.932834344365171e-05, + "loss": 0.3479, + "step": 2866 + }, + { + "epoch": 0.27008313511222065, + "grad_norm": 0.8081981539726257, + "learning_rate": 1.9327799269409114e-05, + "loss": 0.3326, + "step": 2867 + }, + { + "epoch": 0.2701773392053885, + "grad_norm": 0.8048989176750183, + "learning_rate": 1.9327254882478272e-05, + "loss": 0.3186, + "step": 2868 + }, + { + "epoch": 0.27027154329855635, + "grad_norm": 0.8223410248756409, + "learning_rate": 1.9326710282871596e-05, + "loss": 0.3445, + "step": 2869 + }, + { + "epoch": 0.27036574739172414, + "grad_norm": 0.9238550066947937, + "learning_rate": 1.93261654706015e-05, + "loss": 0.3568, + "step": 2870 + }, + { + "epoch": 0.270459951484892, + "grad_norm": 1.181865930557251, + "learning_rate": 1.932562044568041e-05, + "loss": 0.3, + "step": 2871 + }, + { + "epoch": 0.27055415557805984, + "grad_norm": 0.8466827273368835, + "learning_rate": 1.9325075208120746e-05, + "loss": 0.367, + "step": 2872 + }, + { + "epoch": 0.2706483596712277, + "grad_norm": 0.9028671383857727, + "learning_rate": 1.9324529757934948e-05, + "loss": 0.3833, + "step": 2873 + }, + { + "epoch": 0.27074256376439554, + "grad_norm": 0.8522211313247681, + "learning_rate": 1.9323984095135454e-05, + "loss": 0.343, + "step": 2874 + }, + { + "epoch": 0.2708367678575634, + "grad_norm": 0.9431736469268799, + "learning_rate": 1.9323438219734698e-05, + "loss": 0.3687, + "step": 2875 + }, + { + "epoch": 0.27093097195073124, + "grad_norm": 0.8010151982307434, + "learning_rate": 1.9322892131745135e-05, + "loss": 0.3344, + "step": 2876 + }, + { + "epoch": 0.2710251760438991, + "grad_norm": 0.8623033761978149, + "learning_rate": 1.9322345831179214e-05, + "loss": 0.37, + "step": 2877 + }, + { + "epoch": 0.27111938013706693, + "grad_norm": 0.7385076880455017, + "learning_rate": 1.932179931804939e-05, + "loss": 0.3575, + "step": 2878 + }, + { + "epoch": 0.2712135842302348, + "grad_norm": 0.8643718957901001, + "learning_rate": 1.9321252592368124e-05, + "loss": 0.3725, + "step": 2879 + }, + { + "epoch": 0.27130778832340263, + "grad_norm": 0.8411838412284851, + "learning_rate": 1.9320705654147882e-05, + "loss": 0.3526, + "step": 2880 + }, + { + "epoch": 0.2714019924165705, + "grad_norm": 0.9364979863166809, + "learning_rate": 1.9320158503401137e-05, + "loss": 0.3791, + "step": 2881 + }, + { + "epoch": 0.27149619650973833, + "grad_norm": 0.8383809924125671, + "learning_rate": 1.9319611140140365e-05, + "loss": 0.3696, + "step": 2882 + }, + { + "epoch": 0.2715904006029062, + "grad_norm": 0.8176441788673401, + "learning_rate": 1.9319063564378048e-05, + "loss": 0.3678, + "step": 2883 + }, + { + "epoch": 0.27168460469607403, + "grad_norm": 0.858260452747345, + "learning_rate": 1.9318515776126666e-05, + "loss": 0.35, + "step": 2884 + }, + { + "epoch": 0.2717788087892419, + "grad_norm": 0.8059695363044739, + "learning_rate": 1.9317967775398717e-05, + "loss": 0.4062, + "step": 2885 + }, + { + "epoch": 0.2718730128824097, + "grad_norm": 0.884614884853363, + "learning_rate": 1.9317419562206688e-05, + "loss": 0.4595, + "step": 2886 + }, + { + "epoch": 0.2719672169755776, + "grad_norm": 0.9380375146865845, + "learning_rate": 1.9316871136563085e-05, + "loss": 0.3154, + "step": 2887 + }, + { + "epoch": 0.2720614210687454, + "grad_norm": 0.7669954299926758, + "learning_rate": 1.931632249848041e-05, + "loss": 0.3645, + "step": 2888 + }, + { + "epoch": 0.2721556251619133, + "grad_norm": 0.7636342644691467, + "learning_rate": 1.9315773647971177e-05, + "loss": 0.3302, + "step": 2889 + }, + { + "epoch": 0.2722498292550811, + "grad_norm": 0.966870903968811, + "learning_rate": 1.9315224585047896e-05, + "loss": 0.3283, + "step": 2890 + }, + { + "epoch": 0.27234403334824897, + "grad_norm": 0.9514498710632324, + "learning_rate": 1.931467530972309e-05, + "loss": 0.4385, + "step": 2891 + }, + { + "epoch": 0.2724382374414168, + "grad_norm": 0.8300036787986755, + "learning_rate": 1.931412582200928e-05, + "loss": 0.3483, + "step": 2892 + }, + { + "epoch": 0.27253244153458467, + "grad_norm": 0.9490203857421875, + "learning_rate": 1.9313576121918994e-05, + "loss": 0.2998, + "step": 2893 + }, + { + "epoch": 0.2726266456277525, + "grad_norm": 0.8159230947494507, + "learning_rate": 1.931302620946477e-05, + "loss": 0.3244, + "step": 2894 + }, + { + "epoch": 0.27272084972092037, + "grad_norm": 0.8888617157936096, + "learning_rate": 1.931247608465915e-05, + "loss": 0.3554, + "step": 2895 + }, + { + "epoch": 0.2728150538140882, + "grad_norm": 0.8638678789138794, + "learning_rate": 1.931192574751467e-05, + "loss": 0.3664, + "step": 2896 + }, + { + "epoch": 0.27290925790725606, + "grad_norm": 0.9164793491363525, + "learning_rate": 1.931137519804388e-05, + "loss": 0.3798, + "step": 2897 + }, + { + "epoch": 0.2730034620004239, + "grad_norm": 0.9217027425765991, + "learning_rate": 1.931082443625934e-05, + "loss": 0.3687, + "step": 2898 + }, + { + "epoch": 0.27309766609359176, + "grad_norm": 0.9147371053695679, + "learning_rate": 1.9310273462173597e-05, + "loss": 0.3423, + "step": 2899 + }, + { + "epoch": 0.2731918701867596, + "grad_norm": 0.9040961265563965, + "learning_rate": 1.9309722275799226e-05, + "loss": 0.3966, + "step": 2900 + }, + { + "epoch": 0.27328607427992746, + "grad_norm": 0.969359815120697, + "learning_rate": 1.9309170877148786e-05, + "loss": 0.3762, + "step": 2901 + }, + { + "epoch": 0.2733802783730953, + "grad_norm": 0.9515261650085449, + "learning_rate": 1.9308619266234855e-05, + "loss": 0.3824, + "step": 2902 + }, + { + "epoch": 0.27347448246626316, + "grad_norm": 0.7625579237937927, + "learning_rate": 1.9308067443070008e-05, + "loss": 0.315, + "step": 2903 + }, + { + "epoch": 0.273568686559431, + "grad_norm": 0.8695298433303833, + "learning_rate": 1.9307515407666826e-05, + "loss": 0.3496, + "step": 2904 + }, + { + "epoch": 0.27366289065259886, + "grad_norm": 0.7960308790206909, + "learning_rate": 1.9306963160037902e-05, + "loss": 0.3458, + "step": 2905 + }, + { + "epoch": 0.2737570947457667, + "grad_norm": 0.9033235907554626, + "learning_rate": 1.9306410700195824e-05, + "loss": 0.4401, + "step": 2906 + }, + { + "epoch": 0.27385129883893455, + "grad_norm": 2.428537368774414, + "learning_rate": 1.9305858028153186e-05, + "loss": 0.4223, + "step": 2907 + }, + { + "epoch": 0.2739455029321024, + "grad_norm": 0.7999145984649658, + "learning_rate": 1.9305305143922597e-05, + "loss": 0.3605, + "step": 2908 + }, + { + "epoch": 0.27403970702527025, + "grad_norm": 0.9468194842338562, + "learning_rate": 1.930475204751666e-05, + "loss": 0.3739, + "step": 2909 + }, + { + "epoch": 0.2741339111184381, + "grad_norm": 0.8207023739814758, + "learning_rate": 1.930419873894798e-05, + "loss": 0.3578, + "step": 2910 + }, + { + "epoch": 0.27422811521160595, + "grad_norm": 0.9839059710502625, + "learning_rate": 1.9303645218229185e-05, + "loss": 0.398, + "step": 2911 + }, + { + "epoch": 0.2743223193047738, + "grad_norm": 0.786546528339386, + "learning_rate": 1.930309148537289e-05, + "loss": 0.3149, + "step": 2912 + }, + { + "epoch": 0.27441652339794165, + "grad_norm": 0.9546533823013306, + "learning_rate": 1.930253754039172e-05, + "loss": 0.3486, + "step": 2913 + }, + { + "epoch": 0.2745107274911095, + "grad_norm": 0.8896610736846924, + "learning_rate": 1.9301983383298312e-05, + "loss": 0.3759, + "step": 2914 + }, + { + "epoch": 0.27460493158427735, + "grad_norm": 0.7661377191543579, + "learning_rate": 1.930142901410529e-05, + "loss": 0.3208, + "step": 2915 + }, + { + "epoch": 0.2746991356774452, + "grad_norm": 0.7883954644203186, + "learning_rate": 1.9300874432825307e-05, + "loss": 0.3395, + "step": 2916 + }, + { + "epoch": 0.27479333977061304, + "grad_norm": 0.8070877194404602, + "learning_rate": 1.9300319639471e-05, + "loss": 0.3499, + "step": 2917 + }, + { + "epoch": 0.2748875438637809, + "grad_norm": 0.7798469066619873, + "learning_rate": 1.9299764634055026e-05, + "loss": 0.3389, + "step": 2918 + }, + { + "epoch": 0.27498174795694874, + "grad_norm": 1.0146276950836182, + "learning_rate": 1.929920941659003e-05, + "loss": 0.3387, + "step": 2919 + }, + { + "epoch": 0.2750759520501166, + "grad_norm": 0.813177764415741, + "learning_rate": 1.9298653987088683e-05, + "loss": 0.3374, + "step": 2920 + }, + { + "epoch": 0.27517015614328444, + "grad_norm": 0.921242356300354, + "learning_rate": 1.9298098345563644e-05, + "loss": 0.3477, + "step": 2921 + }, + { + "epoch": 0.2752643602364523, + "grad_norm": 0.9590490460395813, + "learning_rate": 1.929754249202758e-05, + "loss": 0.3914, + "step": 2922 + }, + { + "epoch": 0.27535856432962014, + "grad_norm": 0.881549060344696, + "learning_rate": 1.9296986426493168e-05, + "loss": 0.3492, + "step": 2923 + }, + { + "epoch": 0.275452768422788, + "grad_norm": 0.8650107979774475, + "learning_rate": 1.9296430148973088e-05, + "loss": 0.3648, + "step": 2924 + }, + { + "epoch": 0.27554697251595583, + "grad_norm": 0.8315461874008179, + "learning_rate": 1.9295873659480024e-05, + "loss": 0.367, + "step": 2925 + }, + { + "epoch": 0.2756411766091237, + "grad_norm": 0.8754969835281372, + "learning_rate": 1.9295316958026666e-05, + "loss": 0.3452, + "step": 2926 + }, + { + "epoch": 0.27573538070229153, + "grad_norm": 0.8329753875732422, + "learning_rate": 1.9294760044625705e-05, + "loss": 0.3355, + "step": 2927 + }, + { + "epoch": 0.2758295847954594, + "grad_norm": 0.815252423286438, + "learning_rate": 1.929420291928984e-05, + "loss": 0.3803, + "step": 2928 + }, + { + "epoch": 0.27592378888862723, + "grad_norm": 0.7977908849716187, + "learning_rate": 1.9293645582031775e-05, + "loss": 0.3465, + "step": 2929 + }, + { + "epoch": 0.2760179929817951, + "grad_norm": 0.9232553839683533, + "learning_rate": 1.9293088032864218e-05, + "loss": 0.3313, + "step": 2930 + }, + { + "epoch": 0.27611219707496293, + "grad_norm": 0.9138956069946289, + "learning_rate": 1.929253027179988e-05, + "loss": 0.3818, + "step": 2931 + }, + { + "epoch": 0.2762064011681308, + "grad_norm": 0.8025838732719421, + "learning_rate": 1.9291972298851483e-05, + "loss": 0.3597, + "step": 2932 + }, + { + "epoch": 0.2763006052612986, + "grad_norm": 0.8352190852165222, + "learning_rate": 1.9291414114031744e-05, + "loss": 0.3556, + "step": 2933 + }, + { + "epoch": 0.2763948093544665, + "grad_norm": 0.9157130122184753, + "learning_rate": 1.9290855717353394e-05, + "loss": 0.3827, + "step": 2934 + }, + { + "epoch": 0.2764890134476343, + "grad_norm": 0.8056769371032715, + "learning_rate": 1.929029710882917e-05, + "loss": 0.3371, + "step": 2935 + }, + { + "epoch": 0.2765832175408022, + "grad_norm": 1.0625853538513184, + "learning_rate": 1.92897382884718e-05, + "loss": 0.3627, + "step": 2936 + }, + { + "epoch": 0.27667742163397, + "grad_norm": 0.9039706587791443, + "learning_rate": 1.9289179256294034e-05, + "loss": 0.3833, + "step": 2937 + }, + { + "epoch": 0.27677162572713787, + "grad_norm": 0.972366988658905, + "learning_rate": 1.928862001230861e-05, + "loss": 0.403, + "step": 2938 + }, + { + "epoch": 0.2768658298203057, + "grad_norm": 0.8596249222755432, + "learning_rate": 1.9288060556528287e-05, + "loss": 0.3219, + "step": 2939 + }, + { + "epoch": 0.2769600339134735, + "grad_norm": 0.8336518406867981, + "learning_rate": 1.928750088896582e-05, + "loss": 0.3962, + "step": 2940 + }, + { + "epoch": 0.27705423800664136, + "grad_norm": 0.8468518853187561, + "learning_rate": 1.9286941009633965e-05, + "loss": 0.3824, + "step": 2941 + }, + { + "epoch": 0.2771484420998092, + "grad_norm": 0.8352329730987549, + "learning_rate": 1.9286380918545497e-05, + "loss": 0.3195, + "step": 2942 + }, + { + "epoch": 0.27724264619297706, + "grad_norm": 0.8433838486671448, + "learning_rate": 1.928582061571318e-05, + "loss": 0.3407, + "step": 2943 + }, + { + "epoch": 0.2773368502861449, + "grad_norm": 0.8705379366874695, + "learning_rate": 1.9285260101149795e-05, + "loss": 0.38, + "step": 2944 + }, + { + "epoch": 0.27743105437931276, + "grad_norm": 0.9752731323242188, + "learning_rate": 1.9284699374868118e-05, + "loss": 0.3647, + "step": 2945 + }, + { + "epoch": 0.2775252584724806, + "grad_norm": 0.9700624346733093, + "learning_rate": 1.9284138436880934e-05, + "loss": 0.3757, + "step": 2946 + }, + { + "epoch": 0.27761946256564846, + "grad_norm": 0.9997284412384033, + "learning_rate": 1.928357728720104e-05, + "loss": 0.3503, + "step": 2947 + }, + { + "epoch": 0.2777136666588163, + "grad_norm": 0.8637526035308838, + "learning_rate": 1.928301592584122e-05, + "loss": 0.3199, + "step": 2948 + }, + { + "epoch": 0.27780787075198415, + "grad_norm": 0.8112731575965881, + "learning_rate": 1.928245435281429e-05, + "loss": 0.3754, + "step": 2949 + }, + { + "epoch": 0.277902074845152, + "grad_norm": 0.7939879298210144, + "learning_rate": 1.928189256813304e-05, + "loss": 0.3626, + "step": 2950 + }, + { + "epoch": 0.27799627893831985, + "grad_norm": 0.8063762187957764, + "learning_rate": 1.9281330571810282e-05, + "loss": 0.4303, + "step": 2951 + }, + { + "epoch": 0.2780904830314877, + "grad_norm": 0.795852541923523, + "learning_rate": 1.9280768363858834e-05, + "loss": 0.3721, + "step": 2952 + }, + { + "epoch": 0.27818468712465555, + "grad_norm": 0.9110891819000244, + "learning_rate": 1.9280205944291516e-05, + "loss": 0.3393, + "step": 2953 + }, + { + "epoch": 0.2782788912178234, + "grad_norm": 0.8853954076766968, + "learning_rate": 1.927964331312115e-05, + "loss": 0.3613, + "step": 2954 + }, + { + "epoch": 0.27837309531099125, + "grad_norm": 4.366802215576172, + "learning_rate": 1.9279080470360565e-05, + "loss": 0.3407, + "step": 2955 + }, + { + "epoch": 0.2784672994041591, + "grad_norm": 0.79060959815979, + "learning_rate": 1.92785174160226e-05, + "loss": 0.3472, + "step": 2956 + }, + { + "epoch": 0.27856150349732695, + "grad_norm": 0.9146413803100586, + "learning_rate": 1.927795415012008e-05, + "loss": 0.3651, + "step": 2957 + }, + { + "epoch": 0.2786557075904948, + "grad_norm": 0.9158467054367065, + "learning_rate": 1.927739067266586e-05, + "loss": 0.4311, + "step": 2958 + }, + { + "epoch": 0.27874991168366264, + "grad_norm": 0.8566685318946838, + "learning_rate": 1.9276826983672788e-05, + "loss": 0.3735, + "step": 2959 + }, + { + "epoch": 0.2788441157768305, + "grad_norm": 0.9517194032669067, + "learning_rate": 1.9276263083153708e-05, + "loss": 0.371, + "step": 2960 + }, + { + "epoch": 0.27893831986999834, + "grad_norm": 1.017311453819275, + "learning_rate": 1.927569897112149e-05, + "loss": 0.386, + "step": 2961 + }, + { + "epoch": 0.2790325239631662, + "grad_norm": 0.8881323337554932, + "learning_rate": 1.9275134647588985e-05, + "loss": 0.3573, + "step": 2962 + }, + { + "epoch": 0.27912672805633404, + "grad_norm": 0.768549919128418, + "learning_rate": 1.9274570112569067e-05, + "loss": 0.3186, + "step": 2963 + }, + { + "epoch": 0.2792209321495019, + "grad_norm": 0.883156955242157, + "learning_rate": 1.9274005366074608e-05, + "loss": 0.3517, + "step": 2964 + }, + { + "epoch": 0.27931513624266974, + "grad_norm": 0.833206832408905, + "learning_rate": 1.9273440408118486e-05, + "loss": 0.36, + "step": 2965 + }, + { + "epoch": 0.2794093403358376, + "grad_norm": 0.8506464958190918, + "learning_rate": 1.9272875238713578e-05, + "loss": 0.3828, + "step": 2966 + }, + { + "epoch": 0.27950354442900543, + "grad_norm": 0.8978786468505859, + "learning_rate": 1.9272309857872777e-05, + "loss": 0.3208, + "step": 2967 + }, + { + "epoch": 0.2795977485221733, + "grad_norm": 0.9665808081626892, + "learning_rate": 1.927174426560897e-05, + "loss": 0.3457, + "step": 2968 + }, + { + "epoch": 0.27969195261534113, + "grad_norm": 0.8978529572486877, + "learning_rate": 1.927117846193505e-05, + "loss": 0.359, + "step": 2969 + }, + { + "epoch": 0.279786156708509, + "grad_norm": 0.8304259777069092, + "learning_rate": 1.927061244686393e-05, + "loss": 0.3223, + "step": 2970 + }, + { + "epoch": 0.27988036080167683, + "grad_norm": 0.879393994808197, + "learning_rate": 1.9270046220408506e-05, + "loss": 0.314, + "step": 2971 + }, + { + "epoch": 0.2799745648948447, + "grad_norm": 0.7944085597991943, + "learning_rate": 1.926947978258169e-05, + "loss": 0.3264, + "step": 2972 + }, + { + "epoch": 0.28006876898801253, + "grad_norm": 0.7613434791564941, + "learning_rate": 1.92689131333964e-05, + "loss": 0.3351, + "step": 2973 + }, + { + "epoch": 0.2801629730811804, + "grad_norm": 1.0007861852645874, + "learning_rate": 1.9268346272865558e-05, + "loss": 0.364, + "step": 2974 + }, + { + "epoch": 0.2802571771743482, + "grad_norm": 1.1431987285614014, + "learning_rate": 1.9267779201002085e-05, + "loss": 0.4041, + "step": 2975 + }, + { + "epoch": 0.2803513812675161, + "grad_norm": 1.0769010782241821, + "learning_rate": 1.9267211917818916e-05, + "loss": 0.4118, + "step": 2976 + }, + { + "epoch": 0.2804455853606839, + "grad_norm": 0.7843855023384094, + "learning_rate": 1.926664442332898e-05, + "loss": 0.3406, + "step": 2977 + }, + { + "epoch": 0.2805397894538518, + "grad_norm": 2.05409574508667, + "learning_rate": 1.9266076717545224e-05, + "loss": 0.3654, + "step": 2978 + }, + { + "epoch": 0.2806339935470196, + "grad_norm": 1.0190244913101196, + "learning_rate": 1.9265508800480588e-05, + "loss": 0.3892, + "step": 2979 + }, + { + "epoch": 0.28072819764018747, + "grad_norm": 0.9311212301254272, + "learning_rate": 1.9264940672148018e-05, + "loss": 0.3977, + "step": 2980 + }, + { + "epoch": 0.2808224017333553, + "grad_norm": 1.1254518032073975, + "learning_rate": 1.9264372332560475e-05, + "loss": 0.3527, + "step": 2981 + }, + { + "epoch": 0.28091660582652317, + "grad_norm": 0.9334866404533386, + "learning_rate": 1.9263803781730917e-05, + "loss": 0.3648, + "step": 2982 + }, + { + "epoch": 0.281010809919691, + "grad_norm": 0.928142786026001, + "learning_rate": 1.92632350196723e-05, + "loss": 0.4029, + "step": 2983 + }, + { + "epoch": 0.28110501401285887, + "grad_norm": 0.8772962689399719, + "learning_rate": 1.9262666046397603e-05, + "loss": 0.3563, + "step": 2984 + }, + { + "epoch": 0.2811992181060267, + "grad_norm": 0.959918737411499, + "learning_rate": 1.9262096861919797e-05, + "loss": 0.3952, + "step": 2985 + }, + { + "epoch": 0.28129342219919456, + "grad_norm": 0.8838319778442383, + "learning_rate": 1.9261527466251856e-05, + "loss": 0.3789, + "step": 2986 + }, + { + "epoch": 0.2813876262923624, + "grad_norm": 0.8237535357475281, + "learning_rate": 1.9260957859406763e-05, + "loss": 0.3433, + "step": 2987 + }, + { + "epoch": 0.28148183038553026, + "grad_norm": 0.7726234793663025, + "learning_rate": 1.9260388041397512e-05, + "loss": 0.3504, + "step": 2988 + }, + { + "epoch": 0.2815760344786981, + "grad_norm": 1.0417954921722412, + "learning_rate": 1.9259818012237092e-05, + "loss": 0.388, + "step": 2989 + }, + { + "epoch": 0.28167023857186596, + "grad_norm": 0.7924198508262634, + "learning_rate": 1.92592477719385e-05, + "loss": 0.3674, + "step": 2990 + }, + { + "epoch": 0.2817644426650338, + "grad_norm": 0.8858199119567871, + "learning_rate": 1.925867732051474e-05, + "loss": 0.3828, + "step": 2991 + }, + { + "epoch": 0.28185864675820166, + "grad_norm": 0.9502096176147461, + "learning_rate": 1.9258106657978816e-05, + "loss": 0.3831, + "step": 2992 + }, + { + "epoch": 0.2819528508513695, + "grad_norm": 0.7754489779472351, + "learning_rate": 1.9257535784343743e-05, + "loss": 0.3135, + "step": 2993 + }, + { + "epoch": 0.28204705494453736, + "grad_norm": 1.1764442920684814, + "learning_rate": 1.925696469962254e-05, + "loss": 0.3802, + "step": 2994 + }, + { + "epoch": 0.2821412590377052, + "grad_norm": 1.030421495437622, + "learning_rate": 1.925639340382822e-05, + "loss": 0.4167, + "step": 2995 + }, + { + "epoch": 0.28223546313087305, + "grad_norm": 1.0301690101623535, + "learning_rate": 1.925582189697382e-05, + "loss": 0.3447, + "step": 2996 + }, + { + "epoch": 0.2823296672240409, + "grad_norm": 0.8555263876914978, + "learning_rate": 1.9255250179072365e-05, + "loss": 0.3645, + "step": 2997 + }, + { + "epoch": 0.28242387131720875, + "grad_norm": 1.2720202207565308, + "learning_rate": 1.9254678250136893e-05, + "loss": 0.4207, + "step": 2998 + }, + { + "epoch": 0.2825180754103766, + "grad_norm": 0.7826864123344421, + "learning_rate": 1.9254106110180442e-05, + "loss": 0.3546, + "step": 2999 + }, + { + "epoch": 0.28261227950354445, + "grad_norm": 0.9047046899795532, + "learning_rate": 1.9253533759216063e-05, + "loss": 0.3881, + "step": 3000 + }, + { + "epoch": 0.2827064835967123, + "grad_norm": 0.9287173748016357, + "learning_rate": 1.9252961197256802e-05, + "loss": 0.4003, + "step": 3001 + }, + { + "epoch": 0.28280068768988015, + "grad_norm": 0.8424138426780701, + "learning_rate": 1.9252388424315716e-05, + "loss": 0.3601, + "step": 3002 + }, + { + "epoch": 0.282894891783048, + "grad_norm": 0.8370364308357239, + "learning_rate": 1.9251815440405862e-05, + "loss": 0.354, + "step": 3003 + }, + { + "epoch": 0.28298909587621585, + "grad_norm": 1.424588680267334, + "learning_rate": 1.925124224554031e-05, + "loss": 0.3957, + "step": 3004 + }, + { + "epoch": 0.2830832999693837, + "grad_norm": 0.8218227028846741, + "learning_rate": 1.9250668839732127e-05, + "loss": 0.3777, + "step": 3005 + }, + { + "epoch": 0.28317750406255154, + "grad_norm": 0.9550889730453491, + "learning_rate": 1.925009522299439e-05, + "loss": 0.372, + "step": 3006 + }, + { + "epoch": 0.2832717081557194, + "grad_norm": 1.0440438985824585, + "learning_rate": 1.9249521395340177e-05, + "loss": 0.3646, + "step": 3007 + }, + { + "epoch": 0.28336591224888724, + "grad_norm": 0.9570475220680237, + "learning_rate": 1.924894735678257e-05, + "loss": 0.368, + "step": 3008 + }, + { + "epoch": 0.28346011634205504, + "grad_norm": 0.8169949650764465, + "learning_rate": 1.9248373107334656e-05, + "loss": 0.3582, + "step": 3009 + }, + { + "epoch": 0.2835543204352229, + "grad_norm": 1.0188626050949097, + "learning_rate": 1.9247798647009536e-05, + "loss": 0.3905, + "step": 3010 + }, + { + "epoch": 0.28364852452839073, + "grad_norm": 0.7640846371650696, + "learning_rate": 1.9247223975820303e-05, + "loss": 0.3535, + "step": 3011 + }, + { + "epoch": 0.2837427286215586, + "grad_norm": 0.8944517970085144, + "learning_rate": 1.9246649093780063e-05, + "loss": 0.341, + "step": 3012 + }, + { + "epoch": 0.28383693271472643, + "grad_norm": 0.8176144361495972, + "learning_rate": 1.9246074000901925e-05, + "loss": 0.3574, + "step": 3013 + }, + { + "epoch": 0.2839311368078943, + "grad_norm": 1.0871111154556274, + "learning_rate": 1.9245498697198997e-05, + "loss": 0.4363, + "step": 3014 + }, + { + "epoch": 0.28402534090106213, + "grad_norm": 0.8974987864494324, + "learning_rate": 1.9244923182684406e-05, + "loss": 0.3993, + "step": 3015 + }, + { + "epoch": 0.28411954499423, + "grad_norm": 0.8624328374862671, + "learning_rate": 1.9244347457371266e-05, + "loss": 0.3661, + "step": 3016 + }, + { + "epoch": 0.2842137490873978, + "grad_norm": 0.8629999160766602, + "learning_rate": 1.9243771521272706e-05, + "loss": 0.4342, + "step": 3017 + }, + { + "epoch": 0.2843079531805657, + "grad_norm": 0.9005641341209412, + "learning_rate": 1.924319537440186e-05, + "loss": 0.4474, + "step": 3018 + }, + { + "epoch": 0.2844021572737335, + "grad_norm": 0.9175708293914795, + "learning_rate": 1.9242619016771865e-05, + "loss": 0.3955, + "step": 3019 + }, + { + "epoch": 0.2844963613669014, + "grad_norm": 0.8099044561386108, + "learning_rate": 1.9242042448395862e-05, + "loss": 0.3283, + "step": 3020 + }, + { + "epoch": 0.2845905654600692, + "grad_norm": 0.947959303855896, + "learning_rate": 1.9241465669286996e-05, + "loss": 0.4325, + "step": 3021 + }, + { + "epoch": 0.28468476955323707, + "grad_norm": 0.8129771947860718, + "learning_rate": 1.9240888679458424e-05, + "loss": 0.3597, + "step": 3022 + }, + { + "epoch": 0.2847789736464049, + "grad_norm": 0.9085898399353027, + "learning_rate": 1.92403114789233e-05, + "loss": 0.3587, + "step": 3023 + }, + { + "epoch": 0.28487317773957277, + "grad_norm": 0.9046205282211304, + "learning_rate": 1.923973406769478e-05, + "loss": 0.4014, + "step": 3024 + }, + { + "epoch": 0.2849673818327406, + "grad_norm": 0.8786502480506897, + "learning_rate": 1.9239156445786037e-05, + "loss": 0.3593, + "step": 3025 + }, + { + "epoch": 0.28506158592590847, + "grad_norm": 0.8941583037376404, + "learning_rate": 1.9238578613210236e-05, + "loss": 0.371, + "step": 3026 + }, + { + "epoch": 0.2851557900190763, + "grad_norm": 0.9228273034095764, + "learning_rate": 1.9238000569980553e-05, + "loss": 0.4221, + "step": 3027 + }, + { + "epoch": 0.28524999411224417, + "grad_norm": 0.8081850409507751, + "learning_rate": 1.9237422316110175e-05, + "loss": 0.3951, + "step": 3028 + }, + { + "epoch": 0.285344198205412, + "grad_norm": 0.8693435192108154, + "learning_rate": 1.9236843851612278e-05, + "loss": 0.3937, + "step": 3029 + }, + { + "epoch": 0.28543840229857986, + "grad_norm": 1.069024920463562, + "learning_rate": 1.923626517650006e-05, + "loss": 0.3762, + "step": 3030 + }, + { + "epoch": 0.2855326063917477, + "grad_norm": 0.9611799120903015, + "learning_rate": 1.923568629078671e-05, + "loss": 0.3533, + "step": 3031 + }, + { + "epoch": 0.28562681048491556, + "grad_norm": 0.8414198756217957, + "learning_rate": 1.923510719448543e-05, + "loss": 0.3685, + "step": 3032 + }, + { + "epoch": 0.2857210145780834, + "grad_norm": 0.8800758719444275, + "learning_rate": 1.923452788760942e-05, + "loss": 0.3708, + "step": 3033 + }, + { + "epoch": 0.28581521867125126, + "grad_norm": 0.760816752910614, + "learning_rate": 1.9233948370171894e-05, + "loss": 0.3609, + "step": 3034 + }, + { + "epoch": 0.2859094227644191, + "grad_norm": 0.9222679734230042, + "learning_rate": 1.923336864218607e-05, + "loss": 0.3904, + "step": 3035 + }, + { + "epoch": 0.28600362685758696, + "grad_norm": 0.8487080931663513, + "learning_rate": 1.9232788703665157e-05, + "loss": 0.3236, + "step": 3036 + }, + { + "epoch": 0.2860978309507548, + "grad_norm": 0.9020145535469055, + "learning_rate": 1.9232208554622382e-05, + "loss": 0.4112, + "step": 3037 + }, + { + "epoch": 0.28619203504392265, + "grad_norm": 0.8178302049636841, + "learning_rate": 1.9231628195070973e-05, + "loss": 0.3394, + "step": 3038 + }, + { + "epoch": 0.2862862391370905, + "grad_norm": 0.7846415042877197, + "learning_rate": 1.923104762502417e-05, + "loss": 0.3647, + "step": 3039 + }, + { + "epoch": 0.28638044323025835, + "grad_norm": 0.8704529404640198, + "learning_rate": 1.92304668444952e-05, + "loss": 0.3605, + "step": 3040 + }, + { + "epoch": 0.2864746473234262, + "grad_norm": 0.9248875379562378, + "learning_rate": 1.9229885853497312e-05, + "loss": 0.3989, + "step": 3041 + }, + { + "epoch": 0.28656885141659405, + "grad_norm": 0.8046138882637024, + "learning_rate": 1.9229304652043754e-05, + "loss": 0.402, + "step": 3042 + }, + { + "epoch": 0.2866630555097619, + "grad_norm": 0.869045615196228, + "learning_rate": 1.9228723240147773e-05, + "loss": 0.3706, + "step": 3043 + }, + { + "epoch": 0.28675725960292975, + "grad_norm": 0.7658939957618713, + "learning_rate": 1.9228141617822632e-05, + "loss": 0.338, + "step": 3044 + }, + { + "epoch": 0.2868514636960976, + "grad_norm": 0.7998064160346985, + "learning_rate": 1.922755978508159e-05, + "loss": 0.3526, + "step": 3045 + }, + { + "epoch": 0.28694566778926545, + "grad_norm": 0.8407435417175293, + "learning_rate": 1.9226977741937915e-05, + "loss": 0.3784, + "step": 3046 + }, + { + "epoch": 0.2870398718824333, + "grad_norm": 2.0962464809417725, + "learning_rate": 1.9226395488404875e-05, + "loss": 0.3773, + "step": 3047 + }, + { + "epoch": 0.28713407597560114, + "grad_norm": 0.9090295433998108, + "learning_rate": 1.9225813024495753e-05, + "loss": 0.3856, + "step": 3048 + }, + { + "epoch": 0.287228280068769, + "grad_norm": 0.8656986355781555, + "learning_rate": 1.9225230350223826e-05, + "loss": 0.3295, + "step": 3049 + }, + { + "epoch": 0.28732248416193684, + "grad_norm": 0.7657508850097656, + "learning_rate": 1.9224647465602374e-05, + "loss": 0.3147, + "step": 3050 + }, + { + "epoch": 0.2874166882551047, + "grad_norm": 0.8553867340087891, + "learning_rate": 1.92240643706447e-05, + "loss": 0.3496, + "step": 3051 + }, + { + "epoch": 0.28751089234827254, + "grad_norm": 0.8059185743331909, + "learning_rate": 1.922348106536409e-05, + "loss": 0.315, + "step": 3052 + }, + { + "epoch": 0.2876050964414404, + "grad_norm": 0.7943899035453796, + "learning_rate": 1.922289754977385e-05, + "loss": 0.3599, + "step": 3053 + }, + { + "epoch": 0.28769930053460824, + "grad_norm": 0.9065893888473511, + "learning_rate": 1.922231382388728e-05, + "loss": 0.3816, + "step": 3054 + }, + { + "epoch": 0.2877935046277761, + "grad_norm": 0.7682626843452454, + "learning_rate": 1.9221729887717693e-05, + "loss": 0.3591, + "step": 3055 + }, + { + "epoch": 0.28788770872094394, + "grad_norm": 0.8986775875091553, + "learning_rate": 1.92211457412784e-05, + "loss": 0.3394, + "step": 3056 + }, + { + "epoch": 0.2879819128141118, + "grad_norm": 0.8122639656066895, + "learning_rate": 1.9220561384582726e-05, + "loss": 0.3704, + "step": 3057 + }, + { + "epoch": 0.28807611690727963, + "grad_norm": 0.9597777724266052, + "learning_rate": 1.9219976817643994e-05, + "loss": 0.3579, + "step": 3058 + }, + { + "epoch": 0.2881703210004475, + "grad_norm": 1.2639124393463135, + "learning_rate": 1.921939204047553e-05, + "loss": 0.3733, + "step": 3059 + }, + { + "epoch": 0.28826452509361533, + "grad_norm": 0.875393807888031, + "learning_rate": 1.921880705309067e-05, + "loss": 0.349, + "step": 3060 + }, + { + "epoch": 0.2883587291867832, + "grad_norm": 0.817523181438446, + "learning_rate": 1.921822185550275e-05, + "loss": 0.336, + "step": 3061 + }, + { + "epoch": 0.28845293327995103, + "grad_norm": 0.9193385243415833, + "learning_rate": 1.9217636447725118e-05, + "loss": 0.3918, + "step": 3062 + }, + { + "epoch": 0.2885471373731189, + "grad_norm": 0.8694098591804504, + "learning_rate": 1.9217050829771116e-05, + "loss": 0.282, + "step": 3063 + }, + { + "epoch": 0.2886413414662867, + "grad_norm": 0.7830197811126709, + "learning_rate": 1.9216465001654105e-05, + "loss": 0.3407, + "step": 3064 + }, + { + "epoch": 0.2887355455594546, + "grad_norm": 0.8466004133224487, + "learning_rate": 1.9215878963387434e-05, + "loss": 0.3772, + "step": 3065 + }, + { + "epoch": 0.2888297496526224, + "grad_norm": 0.9068024754524231, + "learning_rate": 1.9215292714984475e-05, + "loss": 0.3431, + "step": 3066 + }, + { + "epoch": 0.2889239537457903, + "grad_norm": 0.9880037307739258, + "learning_rate": 1.9214706256458585e-05, + "loss": 0.3957, + "step": 3067 + }, + { + "epoch": 0.2890181578389581, + "grad_norm": 0.9142576456069946, + "learning_rate": 1.9214119587823147e-05, + "loss": 0.3879, + "step": 3068 + }, + { + "epoch": 0.28911236193212597, + "grad_norm": 0.8890142440795898, + "learning_rate": 1.921353270909153e-05, + "loss": 0.3919, + "step": 3069 + }, + { + "epoch": 0.2892065660252938, + "grad_norm": 0.7431747913360596, + "learning_rate": 1.9212945620277118e-05, + "loss": 0.3461, + "step": 3070 + }, + { + "epoch": 0.28930077011846167, + "grad_norm": 0.8724109530448914, + "learning_rate": 1.9212358321393297e-05, + "loss": 0.3134, + "step": 3071 + }, + { + "epoch": 0.2893949742116295, + "grad_norm": 0.8182178139686584, + "learning_rate": 1.921177081245346e-05, + "loss": 0.3252, + "step": 3072 + }, + { + "epoch": 0.28948917830479737, + "grad_norm": 0.7662839293479919, + "learning_rate": 1.9211183093471004e-05, + "loss": 0.3684, + "step": 3073 + }, + { + "epoch": 0.2895833823979652, + "grad_norm": 0.8630828857421875, + "learning_rate": 1.9210595164459326e-05, + "loss": 0.3703, + "step": 3074 + }, + { + "epoch": 0.28967758649113307, + "grad_norm": 0.8949058651924133, + "learning_rate": 1.9210007025431835e-05, + "loss": 0.3974, + "step": 3075 + }, + { + "epoch": 0.2897717905843009, + "grad_norm": 1.089571475982666, + "learning_rate": 1.9209418676401936e-05, + "loss": 0.3937, + "step": 3076 + }, + { + "epoch": 0.28986599467746876, + "grad_norm": 0.8037537336349487, + "learning_rate": 1.9208830117383056e-05, + "loss": 0.3499, + "step": 3077 + }, + { + "epoch": 0.28996019877063656, + "grad_norm": 0.7318845391273499, + "learning_rate": 1.92082413483886e-05, + "loss": 0.3134, + "step": 3078 + }, + { + "epoch": 0.2900544028638044, + "grad_norm": 0.8375080823898315, + "learning_rate": 1.9207652369432005e-05, + "loss": 0.3803, + "step": 3079 + }, + { + "epoch": 0.29014860695697225, + "grad_norm": 0.812122642993927, + "learning_rate": 1.9207063180526695e-05, + "loss": 0.4008, + "step": 3080 + }, + { + "epoch": 0.2902428110501401, + "grad_norm": 0.7605108618736267, + "learning_rate": 1.9206473781686106e-05, + "loss": 0.3537, + "step": 3081 + }, + { + "epoch": 0.29033701514330795, + "grad_norm": 0.7535540461540222, + "learning_rate": 1.9205884172923675e-05, + "loss": 0.3237, + "step": 3082 + }, + { + "epoch": 0.2904312192364758, + "grad_norm": 0.8617050647735596, + "learning_rate": 1.920529435425285e-05, + "loss": 0.3524, + "step": 3083 + }, + { + "epoch": 0.29052542332964365, + "grad_norm": 0.7852376699447632, + "learning_rate": 1.9204704325687075e-05, + "loss": 0.3551, + "step": 3084 + }, + { + "epoch": 0.2906196274228115, + "grad_norm": 0.8138406276702881, + "learning_rate": 1.9204114087239806e-05, + "loss": 0.347, + "step": 3085 + }, + { + "epoch": 0.29071383151597935, + "grad_norm": 0.8943343162536621, + "learning_rate": 1.9203523638924504e-05, + "loss": 0.3637, + "step": 3086 + }, + { + "epoch": 0.2908080356091472, + "grad_norm": 0.792445182800293, + "learning_rate": 1.9202932980754628e-05, + "loss": 0.3698, + "step": 3087 + }, + { + "epoch": 0.29090223970231505, + "grad_norm": 0.8244633078575134, + "learning_rate": 1.920234211274365e-05, + "loss": 0.3698, + "step": 3088 + }, + { + "epoch": 0.2909964437954829, + "grad_norm": 0.8255532383918762, + "learning_rate": 1.9201751034905037e-05, + "loss": 0.3679, + "step": 3089 + }, + { + "epoch": 0.29109064788865074, + "grad_norm": 1.2693150043487549, + "learning_rate": 1.920115974725227e-05, + "loss": 0.3796, + "step": 3090 + }, + { + "epoch": 0.2911848519818186, + "grad_norm": 0.8762882947921753, + "learning_rate": 1.920056824979883e-05, + "loss": 0.3515, + "step": 3091 + }, + { + "epoch": 0.29127905607498644, + "grad_norm": 0.9370526075363159, + "learning_rate": 1.9199976542558206e-05, + "loss": 0.3211, + "step": 3092 + }, + { + "epoch": 0.2913732601681543, + "grad_norm": 0.8304166793823242, + "learning_rate": 1.919938462554389e-05, + "loss": 0.347, + "step": 3093 + }, + { + "epoch": 0.29146746426132214, + "grad_norm": 0.8803110718727112, + "learning_rate": 1.919879249876938e-05, + "loss": 0.3431, + "step": 3094 + }, + { + "epoch": 0.29156166835449, + "grad_norm": 0.8673834800720215, + "learning_rate": 1.9198200162248167e-05, + "loss": 0.3735, + "step": 3095 + }, + { + "epoch": 0.29165587244765784, + "grad_norm": 0.7874166369438171, + "learning_rate": 1.919760761599377e-05, + "loss": 0.3228, + "step": 3096 + }, + { + "epoch": 0.2917500765408257, + "grad_norm": 0.8854159712791443, + "learning_rate": 1.9197014860019695e-05, + "loss": 0.3854, + "step": 3097 + }, + { + "epoch": 0.29184428063399354, + "grad_norm": 0.8951593637466431, + "learning_rate": 1.9196421894339455e-05, + "loss": 0.3805, + "step": 3098 + }, + { + "epoch": 0.2919384847271614, + "grad_norm": 0.8391319513320923, + "learning_rate": 1.9195828718966577e-05, + "loss": 0.3672, + "step": 3099 + }, + { + "epoch": 0.29203268882032923, + "grad_norm": 0.7788271903991699, + "learning_rate": 1.9195235333914582e-05, + "loss": 0.3448, + "step": 3100 + }, + { + "epoch": 0.2921268929134971, + "grad_norm": 0.7561749219894409, + "learning_rate": 1.9194641739196996e-05, + "loss": 0.3348, + "step": 3101 + }, + { + "epoch": 0.29222109700666493, + "grad_norm": 0.7474419474601746, + "learning_rate": 1.9194047934827365e-05, + "loss": 0.3525, + "step": 3102 + }, + { + "epoch": 0.2923153010998328, + "grad_norm": 0.8443570733070374, + "learning_rate": 1.919345392081922e-05, + "loss": 0.3959, + "step": 3103 + }, + { + "epoch": 0.29240950519300063, + "grad_norm": 0.9475430846214294, + "learning_rate": 1.9192859697186105e-05, + "loss": 0.3922, + "step": 3104 + }, + { + "epoch": 0.2925037092861685, + "grad_norm": 0.8380106091499329, + "learning_rate": 1.9192265263941575e-05, + "loss": 0.3514, + "step": 3105 + }, + { + "epoch": 0.2925979133793363, + "grad_norm": 0.8615056872367859, + "learning_rate": 1.919167062109918e-05, + "loss": 0.3334, + "step": 3106 + }, + { + "epoch": 0.2926921174725042, + "grad_norm": 0.7354727983474731, + "learning_rate": 1.9191075768672477e-05, + "loss": 0.312, + "step": 3107 + }, + { + "epoch": 0.292786321565672, + "grad_norm": 0.9131101369857788, + "learning_rate": 1.9190480706675035e-05, + "loss": 0.4067, + "step": 3108 + }, + { + "epoch": 0.2928805256588399, + "grad_norm": 0.961203932762146, + "learning_rate": 1.918988543512042e-05, + "loss": 0.4389, + "step": 3109 + }, + { + "epoch": 0.2929747297520077, + "grad_norm": 0.7584219574928284, + "learning_rate": 1.9189289954022207e-05, + "loss": 0.3338, + "step": 3110 + }, + { + "epoch": 0.29306893384517557, + "grad_norm": 0.8488348722457886, + "learning_rate": 1.9188694263393967e-05, + "loss": 0.3306, + "step": 3111 + }, + { + "epoch": 0.2931631379383434, + "grad_norm": 0.8354858756065369, + "learning_rate": 1.9188098363249288e-05, + "loss": 0.3299, + "step": 3112 + }, + { + "epoch": 0.29325734203151127, + "grad_norm": 0.9114026427268982, + "learning_rate": 1.9187502253601757e-05, + "loss": 0.3528, + "step": 3113 + }, + { + "epoch": 0.2933515461246791, + "grad_norm": 0.9232490658760071, + "learning_rate": 1.9186905934464967e-05, + "loss": 0.3506, + "step": 3114 + }, + { + "epoch": 0.29344575021784697, + "grad_norm": 0.9491410255432129, + "learning_rate": 1.918630940585251e-05, + "loss": 0.3936, + "step": 3115 + }, + { + "epoch": 0.2935399543110148, + "grad_norm": 0.8008426427841187, + "learning_rate": 1.9185712667777995e-05, + "loss": 0.3512, + "step": 3116 + }, + { + "epoch": 0.29363415840418267, + "grad_norm": 0.8293586373329163, + "learning_rate": 1.9185115720255027e-05, + "loss": 0.4341, + "step": 3117 + }, + { + "epoch": 0.2937283624973505, + "grad_norm": 0.8840357065200806, + "learning_rate": 1.9184518563297213e-05, + "loss": 0.3662, + "step": 3118 + }, + { + "epoch": 0.29382256659051836, + "grad_norm": 0.8755851984024048, + "learning_rate": 1.918392119691817e-05, + "loss": 0.3425, + "step": 3119 + }, + { + "epoch": 0.2939167706836862, + "grad_norm": 0.7893072366714478, + "learning_rate": 1.9183323621131523e-05, + "loss": 0.3207, + "step": 3120 + }, + { + "epoch": 0.29401097477685406, + "grad_norm": 0.905914843082428, + "learning_rate": 1.9182725835950894e-05, + "loss": 0.3187, + "step": 3121 + }, + { + "epoch": 0.2941051788700219, + "grad_norm": 0.8867089748382568, + "learning_rate": 1.9182127841389917e-05, + "loss": 0.3474, + "step": 3122 + }, + { + "epoch": 0.29419938296318976, + "grad_norm": 0.9891049265861511, + "learning_rate": 1.9181529637462222e-05, + "loss": 0.3434, + "step": 3123 + }, + { + "epoch": 0.2942935870563576, + "grad_norm": 0.8790674209594727, + "learning_rate": 1.9180931224181455e-05, + "loss": 0.3652, + "step": 3124 + }, + { + "epoch": 0.29438779114952546, + "grad_norm": 0.7837187051773071, + "learning_rate": 1.9180332601561255e-05, + "loss": 0.3344, + "step": 3125 + }, + { + "epoch": 0.2944819952426933, + "grad_norm": 0.8639039993286133, + "learning_rate": 1.9179733769615273e-05, + "loss": 0.3487, + "step": 3126 + }, + { + "epoch": 0.29457619933586116, + "grad_norm": 0.7621277570724487, + "learning_rate": 1.9179134728357164e-05, + "loss": 0.3546, + "step": 3127 + }, + { + "epoch": 0.294670403429029, + "grad_norm": 0.9621046781539917, + "learning_rate": 1.9178535477800588e-05, + "loss": 0.4256, + "step": 3128 + }, + { + "epoch": 0.29476460752219685, + "grad_norm": 0.9978613257408142, + "learning_rate": 1.9177936017959213e-05, + "loss": 0.3559, + "step": 3129 + }, + { + "epoch": 0.2948588116153647, + "grad_norm": 0.7238567471504211, + "learning_rate": 1.9177336348846696e-05, + "loss": 0.3594, + "step": 3130 + }, + { + "epoch": 0.29495301570853255, + "grad_norm": 0.8040878176689148, + "learning_rate": 1.9176736470476723e-05, + "loss": 0.3409, + "step": 3131 + }, + { + "epoch": 0.2950472198017004, + "grad_norm": 0.7095673084259033, + "learning_rate": 1.917613638286296e-05, + "loss": 0.3179, + "step": 3132 + }, + { + "epoch": 0.29514142389486825, + "grad_norm": 0.7767865657806396, + "learning_rate": 1.91755360860191e-05, + "loss": 0.3149, + "step": 3133 + }, + { + "epoch": 0.2952356279880361, + "grad_norm": 0.8106109499931335, + "learning_rate": 1.9174935579958828e-05, + "loss": 0.353, + "step": 3134 + }, + { + "epoch": 0.29532983208120395, + "grad_norm": 0.8412537574768066, + "learning_rate": 1.9174334864695834e-05, + "loss": 0.3444, + "step": 3135 + }, + { + "epoch": 0.2954240361743718, + "grad_norm": 1.0176568031311035, + "learning_rate": 1.9173733940243817e-05, + "loss": 0.4045, + "step": 3136 + }, + { + "epoch": 0.29551824026753964, + "grad_norm": 0.8890383243560791, + "learning_rate": 1.9173132806616477e-05, + "loss": 0.3678, + "step": 3137 + }, + { + "epoch": 0.2956124443607075, + "grad_norm": 0.7867995500564575, + "learning_rate": 1.917253146382753e-05, + "loss": 0.3349, + "step": 3138 + }, + { + "epoch": 0.29570664845387534, + "grad_norm": 1.007822036743164, + "learning_rate": 1.917192991189067e-05, + "loss": 0.3936, + "step": 3139 + }, + { + "epoch": 0.2958008525470432, + "grad_norm": 0.8795698881149292, + "learning_rate": 1.917132815081963e-05, + "loss": 0.4265, + "step": 3140 + }, + { + "epoch": 0.29589505664021104, + "grad_norm": 0.8458032011985779, + "learning_rate": 1.917072618062812e-05, + "loss": 0.376, + "step": 3141 + }, + { + "epoch": 0.2959892607333789, + "grad_norm": 0.9046775698661804, + "learning_rate": 1.9170124001329873e-05, + "loss": 0.3758, + "step": 3142 + }, + { + "epoch": 0.29608346482654674, + "grad_norm": 0.7450736165046692, + "learning_rate": 1.916952161293862e-05, + "loss": 0.3558, + "step": 3143 + }, + { + "epoch": 0.2961776689197146, + "grad_norm": 0.8185693621635437, + "learning_rate": 1.9168919015468086e-05, + "loss": 0.35, + "step": 3144 + }, + { + "epoch": 0.29627187301288244, + "grad_norm": 0.8537670373916626, + "learning_rate": 1.916831620893202e-05, + "loss": 0.3881, + "step": 3145 + }, + { + "epoch": 0.2963660771060503, + "grad_norm": 0.8352190256118774, + "learning_rate": 1.916771319334417e-05, + "loss": 0.3928, + "step": 3146 + }, + { + "epoch": 0.2964602811992181, + "grad_norm": 0.8838286399841309, + "learning_rate": 1.916710996871828e-05, + "loss": 0.3924, + "step": 3147 + }, + { + "epoch": 0.2965544852923859, + "grad_norm": 1.0096112489700317, + "learning_rate": 1.91665065350681e-05, + "loss": 0.292, + "step": 3148 + }, + { + "epoch": 0.2966486893855538, + "grad_norm": 0.8836523294448853, + "learning_rate": 1.9165902892407402e-05, + "loss": 0.3494, + "step": 3149 + }, + { + "epoch": 0.2967428934787216, + "grad_norm": 0.8463241457939148, + "learning_rate": 1.9165299040749937e-05, + "loss": 0.3971, + "step": 3150 + }, + { + "epoch": 0.2968370975718895, + "grad_norm": 0.8585659861564636, + "learning_rate": 1.9164694980109484e-05, + "loss": 0.3701, + "step": 3151 + }, + { + "epoch": 0.2969313016650573, + "grad_norm": 0.822482705116272, + "learning_rate": 1.916409071049981e-05, + "loss": 0.3571, + "step": 3152 + }, + { + "epoch": 0.2970255057582252, + "grad_norm": 0.9102155566215515, + "learning_rate": 1.9163486231934693e-05, + "loss": 0.3845, + "step": 3153 + }, + { + "epoch": 0.297119709851393, + "grad_norm": 0.9315751194953918, + "learning_rate": 1.916288154442792e-05, + "loss": 0.3398, + "step": 3154 + }, + { + "epoch": 0.29721391394456087, + "grad_norm": 0.8502662777900696, + "learning_rate": 1.9162276647993278e-05, + "loss": 0.3402, + "step": 3155 + }, + { + "epoch": 0.2973081180377287, + "grad_norm": 0.7931502461433411, + "learning_rate": 1.9161671542644557e-05, + "loss": 0.3376, + "step": 3156 + }, + { + "epoch": 0.29740232213089657, + "grad_norm": 0.9208962917327881, + "learning_rate": 1.916106622839556e-05, + "loss": 0.3598, + "step": 3157 + }, + { + "epoch": 0.2974965262240644, + "grad_norm": 0.8193071484565735, + "learning_rate": 1.916046070526008e-05, + "loss": 0.3931, + "step": 3158 + }, + { + "epoch": 0.29759073031723227, + "grad_norm": 0.8380117416381836, + "learning_rate": 1.9159854973251932e-05, + "loss": 0.3781, + "step": 3159 + }, + { + "epoch": 0.2976849344104001, + "grad_norm": 0.9172781109809875, + "learning_rate": 1.9159249032384924e-05, + "loss": 0.3828, + "step": 3160 + }, + { + "epoch": 0.29777913850356796, + "grad_norm": 0.8817245364189148, + "learning_rate": 1.9158642882672873e-05, + "loss": 0.39, + "step": 3161 + }, + { + "epoch": 0.2978733425967358, + "grad_norm": 1.0869908332824707, + "learning_rate": 1.91580365241296e-05, + "loss": 0.4045, + "step": 3162 + }, + { + "epoch": 0.29796754668990366, + "grad_norm": 1.013283610343933, + "learning_rate": 1.9157429956768932e-05, + "loss": 0.3322, + "step": 3163 + }, + { + "epoch": 0.2980617507830715, + "grad_norm": 0.9602604508399963, + "learning_rate": 1.91568231806047e-05, + "loss": 0.3396, + "step": 3164 + }, + { + "epoch": 0.29815595487623936, + "grad_norm": 0.8908482193946838, + "learning_rate": 1.9156216195650735e-05, + "loss": 0.36, + "step": 3165 + }, + { + "epoch": 0.2982501589694072, + "grad_norm": 0.893203854560852, + "learning_rate": 1.915560900192088e-05, + "loss": 0.3369, + "step": 3166 + }, + { + "epoch": 0.29834436306257506, + "grad_norm": 0.824182391166687, + "learning_rate": 1.9155001599428982e-05, + "loss": 0.3034, + "step": 3167 + }, + { + "epoch": 0.2984385671557429, + "grad_norm": 0.8576279282569885, + "learning_rate": 1.915439398818889e-05, + "loss": 0.3643, + "step": 3168 + }, + { + "epoch": 0.29853277124891076, + "grad_norm": 0.8324764966964722, + "learning_rate": 1.9153786168214456e-05, + "loss": 0.377, + "step": 3169 + }, + { + "epoch": 0.2986269753420786, + "grad_norm": 0.9648815989494324, + "learning_rate": 1.9153178139519538e-05, + "loss": 0.3594, + "step": 3170 + }, + { + "epoch": 0.29872117943524645, + "grad_norm": 0.8513934016227722, + "learning_rate": 1.915256990211801e-05, + "loss": 0.3396, + "step": 3171 + }, + { + "epoch": 0.2988153835284143, + "grad_norm": 0.8295388221740723, + "learning_rate": 1.915196145602373e-05, + "loss": 0.3466, + "step": 3172 + }, + { + "epoch": 0.29890958762158215, + "grad_norm": 0.8665640950202942, + "learning_rate": 1.9151352801250572e-05, + "loss": 0.4002, + "step": 3173 + }, + { + "epoch": 0.29900379171475, + "grad_norm": 0.7799671292304993, + "learning_rate": 1.9150743937812423e-05, + "loss": 0.3576, + "step": 3174 + }, + { + "epoch": 0.29909799580791785, + "grad_norm": 0.8510997295379639, + "learning_rate": 1.9150134865723154e-05, + "loss": 0.3358, + "step": 3175 + }, + { + "epoch": 0.2991921999010857, + "grad_norm": 0.9179261326789856, + "learning_rate": 1.9149525584996663e-05, + "loss": 0.3845, + "step": 3176 + }, + { + "epoch": 0.29928640399425355, + "grad_norm": 0.8507550358772278, + "learning_rate": 1.9148916095646838e-05, + "loss": 0.3519, + "step": 3177 + }, + { + "epoch": 0.2993806080874214, + "grad_norm": 0.8420320749282837, + "learning_rate": 1.9148306397687573e-05, + "loss": 0.3551, + "step": 3178 + }, + { + "epoch": 0.29947481218058924, + "grad_norm": 0.809543788433075, + "learning_rate": 1.914769649113278e-05, + "loss": 0.3684, + "step": 3179 + }, + { + "epoch": 0.2995690162737571, + "grad_norm": 2.2630064487457275, + "learning_rate": 1.914708637599636e-05, + "loss": 0.3646, + "step": 3180 + }, + { + "epoch": 0.29966322036692494, + "grad_norm": 0.9354215860366821, + "learning_rate": 1.914647605229222e-05, + "loss": 0.4167, + "step": 3181 + }, + { + "epoch": 0.2997574244600928, + "grad_norm": 0.8339396715164185, + "learning_rate": 1.9145865520034282e-05, + "loss": 0.3651, + "step": 3182 + }, + { + "epoch": 0.29985162855326064, + "grad_norm": 0.7765888571739197, + "learning_rate": 1.914525477923647e-05, + "loss": 0.3686, + "step": 3183 + }, + { + "epoch": 0.2999458326464285, + "grad_norm": 0.8726272583007812, + "learning_rate": 1.9144643829912698e-05, + "loss": 0.3873, + "step": 3184 + }, + { + "epoch": 0.30004003673959634, + "grad_norm": 0.8301263451576233, + "learning_rate": 1.914403267207691e-05, + "loss": 0.3717, + "step": 3185 + }, + { + "epoch": 0.3001342408327642, + "grad_norm": 0.8069661855697632, + "learning_rate": 1.9143421305743035e-05, + "loss": 0.3722, + "step": 3186 + }, + { + "epoch": 0.30022844492593204, + "grad_norm": 0.7408289313316345, + "learning_rate": 1.9142809730925012e-05, + "loss": 0.3821, + "step": 3187 + }, + { + "epoch": 0.3003226490190999, + "grad_norm": 0.8679337501525879, + "learning_rate": 1.9142197947636788e-05, + "loss": 0.3881, + "step": 3188 + }, + { + "epoch": 0.30041685311226773, + "grad_norm": 0.8776544332504272, + "learning_rate": 1.914158595589231e-05, + "loss": 0.4097, + "step": 3189 + }, + { + "epoch": 0.3005110572054356, + "grad_norm": 0.8305463790893555, + "learning_rate": 1.9140973755705538e-05, + "loss": 0.3972, + "step": 3190 + }, + { + "epoch": 0.30060526129860343, + "grad_norm": 0.9195913672447205, + "learning_rate": 1.9140361347090426e-05, + "loss": 0.3575, + "step": 3191 + }, + { + "epoch": 0.3006994653917713, + "grad_norm": 0.7249822020530701, + "learning_rate": 1.913974873006094e-05, + "loss": 0.3394, + "step": 3192 + }, + { + "epoch": 0.30079366948493913, + "grad_norm": 0.8495468497276306, + "learning_rate": 1.913913590463105e-05, + "loss": 0.3949, + "step": 3193 + }, + { + "epoch": 0.300887873578107, + "grad_norm": 0.7819181084632874, + "learning_rate": 1.9138522870814725e-05, + "loss": 0.3607, + "step": 3194 + }, + { + "epoch": 0.30098207767127483, + "grad_norm": 0.8274915814399719, + "learning_rate": 1.9137909628625942e-05, + "loss": 0.3376, + "step": 3195 + }, + { + "epoch": 0.3010762817644427, + "grad_norm": 0.7591769099235535, + "learning_rate": 1.9137296178078692e-05, + "loss": 0.3113, + "step": 3196 + }, + { + "epoch": 0.3011704858576105, + "grad_norm": 0.8821949362754822, + "learning_rate": 1.9136682519186958e-05, + "loss": 0.4159, + "step": 3197 + }, + { + "epoch": 0.3012646899507784, + "grad_norm": 0.8360042572021484, + "learning_rate": 1.913606865196473e-05, + "loss": 0.3614, + "step": 3198 + }, + { + "epoch": 0.3013588940439462, + "grad_norm": 0.8298535346984863, + "learning_rate": 1.913545457642601e-05, + "loss": 0.3452, + "step": 3199 + }, + { + "epoch": 0.3014530981371141, + "grad_norm": 0.8901185989379883, + "learning_rate": 1.91348402925848e-05, + "loss": 0.3411, + "step": 3200 + }, + { + "epoch": 0.3015473022302819, + "grad_norm": 0.8033450245857239, + "learning_rate": 1.91342258004551e-05, + "loss": 0.2998, + "step": 3201 + }, + { + "epoch": 0.30164150632344977, + "grad_norm": 0.9016701579093933, + "learning_rate": 1.9133611100050925e-05, + "loss": 0.3856, + "step": 3202 + }, + { + "epoch": 0.3017357104166176, + "grad_norm": 0.7769457101821899, + "learning_rate": 1.9132996191386292e-05, + "loss": 0.356, + "step": 3203 + }, + { + "epoch": 0.30182991450978547, + "grad_norm": 0.9773686528205872, + "learning_rate": 1.913238107447522e-05, + "loss": 0.3866, + "step": 3204 + }, + { + "epoch": 0.3019241186029533, + "grad_norm": 0.7966118454933167, + "learning_rate": 1.913176574933174e-05, + "loss": 0.3699, + "step": 3205 + }, + { + "epoch": 0.30201832269612117, + "grad_norm": 0.7131487131118774, + "learning_rate": 1.9131150215969875e-05, + "loss": 0.3102, + "step": 3206 + }, + { + "epoch": 0.302112526789289, + "grad_norm": 0.8672724366188049, + "learning_rate": 1.9130534474403664e-05, + "loss": 0.3329, + "step": 3207 + }, + { + "epoch": 0.30220673088245686, + "grad_norm": 0.8312980532646179, + "learning_rate": 1.912991852464715e-05, + "loss": 0.3629, + "step": 3208 + }, + { + "epoch": 0.3023009349756247, + "grad_norm": 0.8550496697425842, + "learning_rate": 1.912930236671437e-05, + "loss": 0.3939, + "step": 3209 + }, + { + "epoch": 0.30239513906879256, + "grad_norm": 0.7986893653869629, + "learning_rate": 1.9128686000619378e-05, + "loss": 0.3419, + "step": 3210 + }, + { + "epoch": 0.3024893431619604, + "grad_norm": 0.9180712103843689, + "learning_rate": 1.9128069426376225e-05, + "loss": 0.4041, + "step": 3211 + }, + { + "epoch": 0.30258354725512826, + "grad_norm": 0.8762322068214417, + "learning_rate": 1.9127452643998975e-05, + "loss": 0.312, + "step": 3212 + }, + { + "epoch": 0.3026777513482961, + "grad_norm": 0.9074586629867554, + "learning_rate": 1.912683565350169e-05, + "loss": 0.3554, + "step": 3213 + }, + { + "epoch": 0.30277195544146396, + "grad_norm": 0.8098133206367493, + "learning_rate": 1.9126218454898437e-05, + "loss": 0.3743, + "step": 3214 + }, + { + "epoch": 0.3028661595346318, + "grad_norm": 0.7473288774490356, + "learning_rate": 1.9125601048203288e-05, + "loss": 0.3636, + "step": 3215 + }, + { + "epoch": 0.3029603636277996, + "grad_norm": 0.754591703414917, + "learning_rate": 1.912498343343032e-05, + "loss": 0.3602, + "step": 3216 + }, + { + "epoch": 0.30305456772096745, + "grad_norm": 0.7766704559326172, + "learning_rate": 1.9124365610593624e-05, + "loss": 0.3099, + "step": 3217 + }, + { + "epoch": 0.3031487718141353, + "grad_norm": 0.7497815489768982, + "learning_rate": 1.9123747579707275e-05, + "loss": 0.3477, + "step": 3218 + }, + { + "epoch": 0.30324297590730315, + "grad_norm": 0.9382441639900208, + "learning_rate": 1.9123129340785372e-05, + "loss": 0.3614, + "step": 3219 + }, + { + "epoch": 0.303337180000471, + "grad_norm": 0.7490923404693604, + "learning_rate": 1.9122510893842013e-05, + "loss": 0.3511, + "step": 3220 + }, + { + "epoch": 0.30343138409363885, + "grad_norm": 0.756651759147644, + "learning_rate": 1.9121892238891296e-05, + "loss": 0.3333, + "step": 3221 + }, + { + "epoch": 0.3035255881868067, + "grad_norm": 0.8930402398109436, + "learning_rate": 1.9121273375947326e-05, + "loss": 0.384, + "step": 3222 + }, + { + "epoch": 0.30361979227997454, + "grad_norm": 0.907922625541687, + "learning_rate": 1.9120654305024224e-05, + "loss": 0.4312, + "step": 3223 + }, + { + "epoch": 0.3037139963731424, + "grad_norm": 0.7934293746948242, + "learning_rate": 1.912003502613609e-05, + "loss": 0.3264, + "step": 3224 + }, + { + "epoch": 0.30380820046631024, + "grad_norm": 0.8397114872932434, + "learning_rate": 1.9119415539297058e-05, + "loss": 0.3524, + "step": 3225 + }, + { + "epoch": 0.3039024045594781, + "grad_norm": 0.7385463714599609, + "learning_rate": 1.911879584452125e-05, + "loss": 0.371, + "step": 3226 + }, + { + "epoch": 0.30399660865264594, + "grad_norm": 0.8333579301834106, + "learning_rate": 1.911817594182279e-05, + "loss": 0.3722, + "step": 3227 + }, + { + "epoch": 0.3040908127458138, + "grad_norm": 0.818589985370636, + "learning_rate": 1.9117555831215818e-05, + "loss": 0.4054, + "step": 3228 + }, + { + "epoch": 0.30418501683898164, + "grad_norm": 0.8527333736419678, + "learning_rate": 1.9116935512714473e-05, + "loss": 0.3594, + "step": 3229 + }, + { + "epoch": 0.3042792209321495, + "grad_norm": 0.840151846408844, + "learning_rate": 1.91163149863329e-05, + "loss": 0.3739, + "step": 3230 + }, + { + "epoch": 0.30437342502531733, + "grad_norm": 0.733751118183136, + "learning_rate": 1.9115694252085246e-05, + "loss": 0.3377, + "step": 3231 + }, + { + "epoch": 0.3044676291184852, + "grad_norm": 0.9031135439872742, + "learning_rate": 1.9115073309985665e-05, + "loss": 0.3651, + "step": 3232 + }, + { + "epoch": 0.30456183321165303, + "grad_norm": 0.7640702724456787, + "learning_rate": 1.9114452160048315e-05, + "loss": 0.2965, + "step": 3233 + }, + { + "epoch": 0.3046560373048209, + "grad_norm": 0.959571361541748, + "learning_rate": 1.911383080228736e-05, + "loss": 0.3829, + "step": 3234 + }, + { + "epoch": 0.30475024139798873, + "grad_norm": 0.799164891242981, + "learning_rate": 1.911320923671697e-05, + "loss": 0.3541, + "step": 3235 + }, + { + "epoch": 0.3048444454911566, + "grad_norm": 0.8224729299545288, + "learning_rate": 1.9112587463351313e-05, + "loss": 0.3418, + "step": 3236 + }, + { + "epoch": 0.30493864958432443, + "grad_norm": 0.7661332488059998, + "learning_rate": 1.9111965482204568e-05, + "loss": 0.3226, + "step": 3237 + }, + { + "epoch": 0.3050328536774923, + "grad_norm": 0.8191931843757629, + "learning_rate": 1.9111343293290923e-05, + "loss": 0.3706, + "step": 3238 + }, + { + "epoch": 0.3051270577706601, + "grad_norm": 0.8266605734825134, + "learning_rate": 1.9110720896624556e-05, + "loss": 0.3394, + "step": 3239 + }, + { + "epoch": 0.305221261863828, + "grad_norm": 1.0821118354797363, + "learning_rate": 1.9110098292219665e-05, + "loss": 0.3564, + "step": 3240 + }, + { + "epoch": 0.3053154659569958, + "grad_norm": 0.7629269361495972, + "learning_rate": 1.910947548009044e-05, + "loss": 0.3565, + "step": 3241 + }, + { + "epoch": 0.3054096700501637, + "grad_norm": 0.8535257577896118, + "learning_rate": 1.9108852460251088e-05, + "loss": 0.2997, + "step": 3242 + }, + { + "epoch": 0.3055038741433315, + "grad_norm": 0.8011213541030884, + "learning_rate": 1.9108229232715814e-05, + "loss": 0.3979, + "step": 3243 + }, + { + "epoch": 0.30559807823649937, + "grad_norm": 0.7791256308555603, + "learning_rate": 1.9107605797498827e-05, + "loss": 0.3114, + "step": 3244 + }, + { + "epoch": 0.3056922823296672, + "grad_norm": 0.8558793663978577, + "learning_rate": 1.910698215461434e-05, + "loss": 0.3712, + "step": 3245 + }, + { + "epoch": 0.30578648642283507, + "grad_norm": 0.8836783170700073, + "learning_rate": 1.910635830407658e-05, + "loss": 0.3945, + "step": 3246 + }, + { + "epoch": 0.3058806905160029, + "grad_norm": 0.7929200530052185, + "learning_rate": 1.9105734245899765e-05, + "loss": 0.3458, + "step": 3247 + }, + { + "epoch": 0.30597489460917077, + "grad_norm": 0.8550586104393005, + "learning_rate": 1.9105109980098126e-05, + "loss": 0.3745, + "step": 3248 + }, + { + "epoch": 0.3060690987023386, + "grad_norm": 0.9599501490592957, + "learning_rate": 1.9104485506685902e-05, + "loss": 0.3805, + "step": 3249 + }, + { + "epoch": 0.30616330279550646, + "grad_norm": 0.8369544148445129, + "learning_rate": 1.910386082567732e-05, + "loss": 0.3691, + "step": 3250 + }, + { + "epoch": 0.3062575068886743, + "grad_norm": 0.8377211689949036, + "learning_rate": 1.910323593708664e-05, + "loss": 0.3911, + "step": 3251 + }, + { + "epoch": 0.30635171098184216, + "grad_norm": 0.866549551486969, + "learning_rate": 1.91026108409281e-05, + "loss": 0.3498, + "step": 3252 + }, + { + "epoch": 0.30644591507501, + "grad_norm": 0.933917760848999, + "learning_rate": 1.9101985537215956e-05, + "loss": 0.3928, + "step": 3253 + }, + { + "epoch": 0.30654011916817786, + "grad_norm": 0.9976698756217957, + "learning_rate": 1.9101360025964464e-05, + "loss": 0.3369, + "step": 3254 + }, + { + "epoch": 0.3066343232613457, + "grad_norm": 0.9265713691711426, + "learning_rate": 1.9100734307187887e-05, + "loss": 0.3807, + "step": 3255 + }, + { + "epoch": 0.30672852735451356, + "grad_norm": 0.8622773885726929, + "learning_rate": 1.9100108380900493e-05, + "loss": 0.3674, + "step": 3256 + }, + { + "epoch": 0.3068227314476814, + "grad_norm": 0.7766557931900024, + "learning_rate": 1.9099482247116556e-05, + "loss": 0.3355, + "step": 3257 + }, + { + "epoch": 0.30691693554084926, + "grad_norm": 0.9115668535232544, + "learning_rate": 1.909885590585035e-05, + "loss": 0.3418, + "step": 3258 + }, + { + "epoch": 0.3070111396340171, + "grad_norm": 0.7861838936805725, + "learning_rate": 1.909822935711616e-05, + "loss": 0.3652, + "step": 3259 + }, + { + "epoch": 0.30710534372718495, + "grad_norm": 0.8084872961044312, + "learning_rate": 1.909760260092827e-05, + "loss": 0.3643, + "step": 3260 + }, + { + "epoch": 0.3071995478203528, + "grad_norm": 0.7769855260848999, + "learning_rate": 1.9096975637300968e-05, + "loss": 0.3532, + "step": 3261 + }, + { + "epoch": 0.30729375191352065, + "grad_norm": 0.847029983997345, + "learning_rate": 1.9096348466248554e-05, + "loss": 0.3546, + "step": 3262 + }, + { + "epoch": 0.3073879560066885, + "grad_norm": 0.8707786798477173, + "learning_rate": 1.9095721087785327e-05, + "loss": 0.372, + "step": 3263 + }, + { + "epoch": 0.30748216009985635, + "grad_norm": 0.8614506125450134, + "learning_rate": 1.909509350192559e-05, + "loss": 0.35, + "step": 3264 + }, + { + "epoch": 0.3075763641930242, + "grad_norm": 0.9336690902709961, + "learning_rate": 1.909446570868366e-05, + "loss": 0.422, + "step": 3265 + }, + { + "epoch": 0.30767056828619205, + "grad_norm": 0.9768769145011902, + "learning_rate": 1.9093837708073843e-05, + "loss": 0.3865, + "step": 3266 + }, + { + "epoch": 0.3077647723793599, + "grad_norm": 0.8792558312416077, + "learning_rate": 1.9093209500110465e-05, + "loss": 0.3646, + "step": 3267 + }, + { + "epoch": 0.30785897647252775, + "grad_norm": 0.8601863980293274, + "learning_rate": 1.9092581084807848e-05, + "loss": 0.3196, + "step": 3268 + }, + { + "epoch": 0.3079531805656956, + "grad_norm": 0.8642717003822327, + "learning_rate": 1.9091952462180317e-05, + "loss": 0.3951, + "step": 3269 + }, + { + "epoch": 0.30804738465886344, + "grad_norm": 0.8343533873558044, + "learning_rate": 1.909132363224221e-05, + "loss": 0.3492, + "step": 3270 + }, + { + "epoch": 0.3081415887520313, + "grad_norm": 0.8218293190002441, + "learning_rate": 1.9090694595007866e-05, + "loss": 0.3518, + "step": 3271 + }, + { + "epoch": 0.30823579284519914, + "grad_norm": 0.7444469332695007, + "learning_rate": 1.909006535049163e-05, + "loss": 0.3121, + "step": 3272 + }, + { + "epoch": 0.308329996938367, + "grad_norm": 0.832141101360321, + "learning_rate": 1.908943589870784e-05, + "loss": 0.3668, + "step": 3273 + }, + { + "epoch": 0.30842420103153484, + "grad_norm": 0.8032460808753967, + "learning_rate": 1.9088806239670855e-05, + "loss": 0.3408, + "step": 3274 + }, + { + "epoch": 0.3085184051247027, + "grad_norm": 0.8065330982208252, + "learning_rate": 1.908817637339503e-05, + "loss": 0.3422, + "step": 3275 + }, + { + "epoch": 0.30861260921787054, + "grad_norm": 0.7814967036247253, + "learning_rate": 1.908754629989473e-05, + "loss": 0.3494, + "step": 3276 + }, + { + "epoch": 0.3087068133110384, + "grad_norm": 0.7664377093315125, + "learning_rate": 1.9086916019184323e-05, + "loss": 0.3393, + "step": 3277 + }, + { + "epoch": 0.30880101740420624, + "grad_norm": 0.926455020904541, + "learning_rate": 1.9086285531278176e-05, + "loss": 0.3679, + "step": 3278 + }, + { + "epoch": 0.3088952214973741, + "grad_norm": 0.833058774471283, + "learning_rate": 1.9085654836190665e-05, + "loss": 0.3592, + "step": 3279 + }, + { + "epoch": 0.30898942559054193, + "grad_norm": 0.8743307590484619, + "learning_rate": 1.9085023933936174e-05, + "loss": 0.3721, + "step": 3280 + }, + { + "epoch": 0.3090836296837098, + "grad_norm": 1.2178056240081787, + "learning_rate": 1.908439282452909e-05, + "loss": 0.3882, + "step": 3281 + }, + { + "epoch": 0.30917783377687763, + "grad_norm": 0.7862185835838318, + "learning_rate": 1.9083761507983794e-05, + "loss": 0.3243, + "step": 3282 + }, + { + "epoch": 0.3092720378700455, + "grad_norm": 1.1203469038009644, + "learning_rate": 1.9083129984314694e-05, + "loss": 0.4072, + "step": 3283 + }, + { + "epoch": 0.30936624196321333, + "grad_norm": 0.7928134799003601, + "learning_rate": 1.9082498253536175e-05, + "loss": 0.3249, + "step": 3284 + }, + { + "epoch": 0.3094604460563811, + "grad_norm": 0.8092864155769348, + "learning_rate": 1.9081866315662655e-05, + "loss": 0.3542, + "step": 3285 + }, + { + "epoch": 0.30955465014954897, + "grad_norm": 0.8150007724761963, + "learning_rate": 1.908123417070854e-05, + "loss": 0.3374, + "step": 3286 + }, + { + "epoch": 0.3096488542427168, + "grad_norm": 0.8889190554618835, + "learning_rate": 1.9080601818688234e-05, + "loss": 0.3871, + "step": 3287 + }, + { + "epoch": 0.30974305833588467, + "grad_norm": 0.7795815467834473, + "learning_rate": 1.907996925961617e-05, + "loss": 0.3329, + "step": 3288 + }, + { + "epoch": 0.3098372624290525, + "grad_norm": 0.892247200012207, + "learning_rate": 1.907933649350676e-05, + "loss": 0.4365, + "step": 3289 + }, + { + "epoch": 0.30993146652222037, + "grad_norm": 0.8202358484268188, + "learning_rate": 1.9078703520374436e-05, + "loss": 0.3698, + "step": 3290 + }, + { + "epoch": 0.3100256706153882, + "grad_norm": 0.7944460511207581, + "learning_rate": 1.9078070340233637e-05, + "loss": 0.3733, + "step": 3291 + }, + { + "epoch": 0.31011987470855606, + "grad_norm": 0.8122773766517639, + "learning_rate": 1.9077436953098792e-05, + "loss": 0.3989, + "step": 3292 + }, + { + "epoch": 0.3102140788017239, + "grad_norm": 0.8976459503173828, + "learning_rate": 1.9076803358984345e-05, + "loss": 0.364, + "step": 3293 + }, + { + "epoch": 0.31030828289489176, + "grad_norm": 0.7487342953681946, + "learning_rate": 1.9076169557904743e-05, + "loss": 0.3332, + "step": 3294 + }, + { + "epoch": 0.3104024869880596, + "grad_norm": 0.9525068402290344, + "learning_rate": 1.9075535549874436e-05, + "loss": 0.3647, + "step": 3295 + }, + { + "epoch": 0.31049669108122746, + "grad_norm": 0.9670138359069824, + "learning_rate": 1.9074901334907888e-05, + "loss": 0.3442, + "step": 3296 + }, + { + "epoch": 0.3105908951743953, + "grad_norm": 0.8507341742515564, + "learning_rate": 1.9074266913019553e-05, + "loss": 0.357, + "step": 3297 + }, + { + "epoch": 0.31068509926756316, + "grad_norm": 0.7282482981681824, + "learning_rate": 1.9073632284223902e-05, + "loss": 0.339, + "step": 3298 + }, + { + "epoch": 0.310779303360731, + "grad_norm": 0.8985245227813721, + "learning_rate": 1.9072997448535398e-05, + "loss": 0.3847, + "step": 3299 + }, + { + "epoch": 0.31087350745389886, + "grad_norm": 0.833671510219574, + "learning_rate": 1.9072362405968522e-05, + "loss": 0.3849, + "step": 3300 + }, + { + "epoch": 0.3109677115470667, + "grad_norm": 0.762282133102417, + "learning_rate": 1.9071727156537753e-05, + "loss": 0.3498, + "step": 3301 + }, + { + "epoch": 0.31106191564023455, + "grad_norm": 0.7250514030456543, + "learning_rate": 1.9071091700257574e-05, + "loss": 0.3302, + "step": 3302 + }, + { + "epoch": 0.3111561197334024, + "grad_norm": 0.7673451900482178, + "learning_rate": 1.9070456037142475e-05, + "loss": 0.3275, + "step": 3303 + }, + { + "epoch": 0.31125032382657025, + "grad_norm": 0.8764957189559937, + "learning_rate": 1.9069820167206953e-05, + "loss": 0.4017, + "step": 3304 + }, + { + "epoch": 0.3113445279197381, + "grad_norm": 0.8258739113807678, + "learning_rate": 1.9069184090465504e-05, + "loss": 0.3298, + "step": 3305 + }, + { + "epoch": 0.31143873201290595, + "grad_norm": 0.7824748754501343, + "learning_rate": 1.906854780693263e-05, + "loss": 0.3577, + "step": 3306 + }, + { + "epoch": 0.3115329361060738, + "grad_norm": 0.8174601793289185, + "learning_rate": 1.906791131662284e-05, + "loss": 0.3411, + "step": 3307 + }, + { + "epoch": 0.31162714019924165, + "grad_norm": 0.7796163558959961, + "learning_rate": 1.906727461955065e-05, + "loss": 0.3098, + "step": 3308 + }, + { + "epoch": 0.3117213442924095, + "grad_norm": 0.7932508587837219, + "learning_rate": 1.9066637715730578e-05, + "loss": 0.3796, + "step": 3309 + }, + { + "epoch": 0.31181554838557735, + "grad_norm": 0.8183960318565369, + "learning_rate": 1.9066000605177143e-05, + "loss": 0.3799, + "step": 3310 + }, + { + "epoch": 0.3119097524787452, + "grad_norm": 0.7983317971229553, + "learning_rate": 1.9065363287904873e-05, + "loss": 0.3644, + "step": 3311 + }, + { + "epoch": 0.31200395657191304, + "grad_norm": 0.8192973136901855, + "learning_rate": 1.90647257639283e-05, + "loss": 0.3484, + "step": 3312 + }, + { + "epoch": 0.3120981606650809, + "grad_norm": 0.8482248783111572, + "learning_rate": 1.906408803326196e-05, + "loss": 0.3503, + "step": 3313 + }, + { + "epoch": 0.31219236475824874, + "grad_norm": 0.8947495818138123, + "learning_rate": 1.9063450095920397e-05, + "loss": 0.3688, + "step": 3314 + }, + { + "epoch": 0.3122865688514166, + "grad_norm": 0.8172609806060791, + "learning_rate": 1.906281195191815e-05, + "loss": 0.3556, + "step": 3315 + }, + { + "epoch": 0.31238077294458444, + "grad_norm": 0.8087257742881775, + "learning_rate": 1.906217360126978e-05, + "loss": 0.3451, + "step": 3316 + }, + { + "epoch": 0.3124749770377523, + "grad_norm": 0.9458954930305481, + "learning_rate": 1.906153504398984e-05, + "loss": 0.3794, + "step": 3317 + }, + { + "epoch": 0.31256918113092014, + "grad_norm": 0.7924895286560059, + "learning_rate": 1.906089628009288e-05, + "loss": 0.3259, + "step": 3318 + }, + { + "epoch": 0.312663385224088, + "grad_norm": 0.8065479397773743, + "learning_rate": 1.9060257309593473e-05, + "loss": 0.368, + "step": 3319 + }, + { + "epoch": 0.31275758931725584, + "grad_norm": 0.9134812355041504, + "learning_rate": 1.905961813250619e-05, + "loss": 0.4007, + "step": 3320 + }, + { + "epoch": 0.3128517934104237, + "grad_norm": 1.23716402053833, + "learning_rate": 1.90589787488456e-05, + "loss": 0.3557, + "step": 3321 + }, + { + "epoch": 0.31294599750359153, + "grad_norm": 0.9040558934211731, + "learning_rate": 1.9058339158626286e-05, + "loss": 0.3511, + "step": 3322 + }, + { + "epoch": 0.3130402015967594, + "grad_norm": 0.8373039960861206, + "learning_rate": 1.9057699361862832e-05, + "loss": 0.3723, + "step": 3323 + }, + { + "epoch": 0.31313440568992723, + "grad_norm": 0.825303852558136, + "learning_rate": 1.905705935856982e-05, + "loss": 0.397, + "step": 3324 + }, + { + "epoch": 0.3132286097830951, + "grad_norm": 0.8393895626068115, + "learning_rate": 1.905641914876185e-05, + "loss": 0.3596, + "step": 3325 + }, + { + "epoch": 0.31332281387626293, + "grad_norm": 0.8771679401397705, + "learning_rate": 1.9055778732453518e-05, + "loss": 0.3707, + "step": 3326 + }, + { + "epoch": 0.3134170179694308, + "grad_norm": 0.833461582660675, + "learning_rate": 1.9055138109659427e-05, + "loss": 0.3577, + "step": 3327 + }, + { + "epoch": 0.3135112220625986, + "grad_norm": 0.8579936623573303, + "learning_rate": 1.905449728039418e-05, + "loss": 0.4035, + "step": 3328 + }, + { + "epoch": 0.3136054261557665, + "grad_norm": 0.8387287259101868, + "learning_rate": 1.9053856244672392e-05, + "loss": 0.3426, + "step": 3329 + }, + { + "epoch": 0.3136996302489343, + "grad_norm": 0.784853994846344, + "learning_rate": 1.905321500250868e-05, + "loss": 0.3615, + "step": 3330 + }, + { + "epoch": 0.3137938343421022, + "grad_norm": 1.0262951850891113, + "learning_rate": 1.9052573553917667e-05, + "loss": 0.3515, + "step": 3331 + }, + { + "epoch": 0.31388803843527, + "grad_norm": 0.7893538475036621, + "learning_rate": 1.9051931898913977e-05, + "loss": 0.3542, + "step": 3332 + }, + { + "epoch": 0.31398224252843787, + "grad_norm": 0.8260703682899475, + "learning_rate": 1.905129003751224e-05, + "loss": 0.3426, + "step": 3333 + }, + { + "epoch": 0.3140764466216057, + "grad_norm": 0.9318860173225403, + "learning_rate": 1.9050647969727088e-05, + "loss": 0.3481, + "step": 3334 + }, + { + "epoch": 0.31417065071477357, + "grad_norm": 0.8694115281105042, + "learning_rate": 1.905000569557317e-05, + "loss": 0.3807, + "step": 3335 + }, + { + "epoch": 0.3142648548079414, + "grad_norm": 0.7466548681259155, + "learning_rate": 1.9049363215065124e-05, + "loss": 0.3221, + "step": 3336 + }, + { + "epoch": 0.31435905890110927, + "grad_norm": 0.8590157628059387, + "learning_rate": 1.9048720528217604e-05, + "loss": 0.3359, + "step": 3337 + }, + { + "epoch": 0.3144532629942771, + "grad_norm": 0.855535089969635, + "learning_rate": 1.9048077635045258e-05, + "loss": 0.4009, + "step": 3338 + }, + { + "epoch": 0.31454746708744497, + "grad_norm": 0.7641542553901672, + "learning_rate": 1.904743453556275e-05, + "loss": 0.3455, + "step": 3339 + }, + { + "epoch": 0.3146416711806128, + "grad_norm": 0.764505922794342, + "learning_rate": 1.9046791229784745e-05, + "loss": 0.343, + "step": 3340 + }, + { + "epoch": 0.31473587527378066, + "grad_norm": 0.8607409596443176, + "learning_rate": 1.904614771772591e-05, + "loss": 0.3363, + "step": 3341 + }, + { + "epoch": 0.3148300793669485, + "grad_norm": 0.7796515822410583, + "learning_rate": 1.904550399940091e-05, + "loss": 0.3583, + "step": 3342 + }, + { + "epoch": 0.31492428346011636, + "grad_norm": 0.7588558793067932, + "learning_rate": 1.9044860074824436e-05, + "loss": 0.3031, + "step": 3343 + }, + { + "epoch": 0.3150184875532842, + "grad_norm": 0.8214284181594849, + "learning_rate": 1.9044215944011163e-05, + "loss": 0.3443, + "step": 3344 + }, + { + "epoch": 0.31511269164645206, + "grad_norm": 0.8406785130500793, + "learning_rate": 1.9043571606975776e-05, + "loss": 0.3502, + "step": 3345 + }, + { + "epoch": 0.3152068957396199, + "grad_norm": 0.8515416979789734, + "learning_rate": 1.9042927063732972e-05, + "loss": 0.384, + "step": 3346 + }, + { + "epoch": 0.31530109983278776, + "grad_norm": 0.8178034424781799, + "learning_rate": 1.9042282314297446e-05, + "loss": 0.3452, + "step": 3347 + }, + { + "epoch": 0.3153953039259556, + "grad_norm": 0.7956291437149048, + "learning_rate": 1.90416373586839e-05, + "loss": 0.3206, + "step": 3348 + }, + { + "epoch": 0.31548950801912345, + "grad_norm": 0.8119598031044006, + "learning_rate": 1.904099219690704e-05, + "loss": 0.3607, + "step": 3349 + }, + { + "epoch": 0.3155837121122913, + "grad_norm": 0.7631762623786926, + "learning_rate": 1.9040346828981574e-05, + "loss": 0.3011, + "step": 3350 + }, + { + "epoch": 0.31567791620545915, + "grad_norm": 0.7502119541168213, + "learning_rate": 1.903970125492222e-05, + "loss": 0.3215, + "step": 3351 + }, + { + "epoch": 0.315772120298627, + "grad_norm": 0.7395917177200317, + "learning_rate": 1.9039055474743694e-05, + "loss": 0.2909, + "step": 3352 + }, + { + "epoch": 0.31586632439179485, + "grad_norm": 0.8743318915367126, + "learning_rate": 1.9038409488460728e-05, + "loss": 0.3779, + "step": 3353 + }, + { + "epoch": 0.3159605284849627, + "grad_norm": 0.7067281007766724, + "learning_rate": 1.9037763296088046e-05, + "loss": 0.3173, + "step": 3354 + }, + { + "epoch": 0.3160547325781305, + "grad_norm": 0.8748904466629028, + "learning_rate": 1.9037116897640386e-05, + "loss": 0.4102, + "step": 3355 + }, + { + "epoch": 0.31614893667129834, + "grad_norm": 0.8747310638427734, + "learning_rate": 1.903647029313248e-05, + "loss": 0.3563, + "step": 3356 + }, + { + "epoch": 0.3162431407644662, + "grad_norm": 0.7853719592094421, + "learning_rate": 1.9035823482579076e-05, + "loss": 0.3073, + "step": 3357 + }, + { + "epoch": 0.31633734485763404, + "grad_norm": 0.8870383501052856, + "learning_rate": 1.9035176465994927e-05, + "loss": 0.3611, + "step": 3358 + }, + { + "epoch": 0.3164315489508019, + "grad_norm": 0.8669750094413757, + "learning_rate": 1.9034529243394777e-05, + "loss": 0.3426, + "step": 3359 + }, + { + "epoch": 0.31652575304396974, + "grad_norm": 0.7884622812271118, + "learning_rate": 1.903388181479339e-05, + "loss": 0.3437, + "step": 3360 + }, + { + "epoch": 0.3166199571371376, + "grad_norm": 0.7719337344169617, + "learning_rate": 1.903323418020553e-05, + "loss": 0.3602, + "step": 3361 + }, + { + "epoch": 0.31671416123030544, + "grad_norm": 1.0963482856750488, + "learning_rate": 1.9032586339645954e-05, + "loss": 0.3569, + "step": 3362 + }, + { + "epoch": 0.3168083653234733, + "grad_norm": 0.9027066826820374, + "learning_rate": 1.9031938293129443e-05, + "loss": 0.377, + "step": 3363 + }, + { + "epoch": 0.31690256941664113, + "grad_norm": 0.8001086711883545, + "learning_rate": 1.9031290040670773e-05, + "loss": 0.3745, + "step": 3364 + }, + { + "epoch": 0.316996773509809, + "grad_norm": 0.8479501605033875, + "learning_rate": 1.903064158228472e-05, + "loss": 0.3432, + "step": 3365 + }, + { + "epoch": 0.31709097760297683, + "grad_norm": 0.7776916027069092, + "learning_rate": 1.9029992917986075e-05, + "loss": 0.3247, + "step": 3366 + }, + { + "epoch": 0.3171851816961447, + "grad_norm": 0.8749487996101379, + "learning_rate": 1.9029344047789627e-05, + "loss": 0.3525, + "step": 3367 + }, + { + "epoch": 0.31727938578931253, + "grad_norm": 0.7998982071876526, + "learning_rate": 1.9028694971710166e-05, + "loss": 0.3642, + "step": 3368 + }, + { + "epoch": 0.3173735898824804, + "grad_norm": 0.8136427402496338, + "learning_rate": 1.90280456897625e-05, + "loss": 0.3358, + "step": 3369 + }, + { + "epoch": 0.3174677939756482, + "grad_norm": 0.9419760704040527, + "learning_rate": 1.902739620196143e-05, + "loss": 0.3391, + "step": 3370 + }, + { + "epoch": 0.3175619980688161, + "grad_norm": 0.8789461255073547, + "learning_rate": 1.9026746508321766e-05, + "loss": 0.3801, + "step": 3371 + }, + { + "epoch": 0.3176562021619839, + "grad_norm": 0.7949413061141968, + "learning_rate": 1.902609660885832e-05, + "loss": 0.3409, + "step": 3372 + }, + { + "epoch": 0.3177504062551518, + "grad_norm": 0.8431394696235657, + "learning_rate": 1.902544650358591e-05, + "loss": 0.3271, + "step": 3373 + }, + { + "epoch": 0.3178446103483196, + "grad_norm": 0.905490517616272, + "learning_rate": 1.9024796192519364e-05, + "loss": 0.3346, + "step": 3374 + }, + { + "epoch": 0.31793881444148747, + "grad_norm": 0.9226703643798828, + "learning_rate": 1.902414567567351e-05, + "loss": 0.3449, + "step": 3375 + }, + { + "epoch": 0.3180330185346553, + "grad_norm": 0.845367431640625, + "learning_rate": 1.9023494953063175e-05, + "loss": 0.3573, + "step": 3376 + }, + { + "epoch": 0.31812722262782317, + "grad_norm": 0.7797483801841736, + "learning_rate": 1.90228440247032e-05, + "loss": 0.3392, + "step": 3377 + }, + { + "epoch": 0.318221426720991, + "grad_norm": 0.8446933031082153, + "learning_rate": 1.9022192890608432e-05, + "loss": 0.3661, + "step": 3378 + }, + { + "epoch": 0.31831563081415887, + "grad_norm": 0.828841507434845, + "learning_rate": 1.9021541550793707e-05, + "loss": 0.3936, + "step": 3379 + }, + { + "epoch": 0.3184098349073267, + "grad_norm": 0.8184050917625427, + "learning_rate": 1.9020890005273887e-05, + "loss": 0.3671, + "step": 3380 + }, + { + "epoch": 0.31850403900049457, + "grad_norm": 0.8316391706466675, + "learning_rate": 1.902023825406382e-05, + "loss": 0.3382, + "step": 3381 + }, + { + "epoch": 0.3185982430936624, + "grad_norm": 0.894083559513092, + "learning_rate": 1.9019586297178374e-05, + "loss": 0.3867, + "step": 3382 + }, + { + "epoch": 0.31869244718683026, + "grad_norm": 0.9373259544372559, + "learning_rate": 1.901893413463241e-05, + "loss": 0.4018, + "step": 3383 + }, + { + "epoch": 0.3187866512799981, + "grad_norm": 0.7572992444038391, + "learning_rate": 1.90182817664408e-05, + "loss": 0.385, + "step": 3384 + }, + { + "epoch": 0.31888085537316596, + "grad_norm": 0.7976850271224976, + "learning_rate": 1.9017629192618418e-05, + "loss": 0.3523, + "step": 3385 + }, + { + "epoch": 0.3189750594663338, + "grad_norm": 0.9599197506904602, + "learning_rate": 1.9016976413180143e-05, + "loss": 0.3978, + "step": 3386 + }, + { + "epoch": 0.31906926355950166, + "grad_norm": 0.724345326423645, + "learning_rate": 1.9016323428140863e-05, + "loss": 0.3274, + "step": 3387 + }, + { + "epoch": 0.3191634676526695, + "grad_norm": 0.7421485781669617, + "learning_rate": 1.9015670237515465e-05, + "loss": 0.3711, + "step": 3388 + }, + { + "epoch": 0.31925767174583736, + "grad_norm": 0.7720860838890076, + "learning_rate": 1.9015016841318843e-05, + "loss": 0.3822, + "step": 3389 + }, + { + "epoch": 0.3193518758390052, + "grad_norm": 0.7846570014953613, + "learning_rate": 1.9014363239565892e-05, + "loss": 0.3715, + "step": 3390 + }, + { + "epoch": 0.31944607993217305, + "grad_norm": 0.7919772863388062, + "learning_rate": 1.901370943227152e-05, + "loss": 0.3364, + "step": 3391 + }, + { + "epoch": 0.3195402840253409, + "grad_norm": 0.8712970614433289, + "learning_rate": 1.9013055419450634e-05, + "loss": 0.3768, + "step": 3392 + }, + { + "epoch": 0.31963448811850875, + "grad_norm": 0.9658766984939575, + "learning_rate": 1.9012401201118145e-05, + "loss": 0.337, + "step": 3393 + }, + { + "epoch": 0.3197286922116766, + "grad_norm": 0.8209637403488159, + "learning_rate": 1.9011746777288968e-05, + "loss": 0.3875, + "step": 3394 + }, + { + "epoch": 0.31982289630484445, + "grad_norm": 0.9107644557952881, + "learning_rate": 1.901109214797803e-05, + "loss": 0.3724, + "step": 3395 + }, + { + "epoch": 0.3199171003980123, + "grad_norm": 0.8732518553733826, + "learning_rate": 1.9010437313200256e-05, + "loss": 0.3586, + "step": 3396 + }, + { + "epoch": 0.32001130449118015, + "grad_norm": 0.9876217842102051, + "learning_rate": 1.9009782272970577e-05, + "loss": 0.3495, + "step": 3397 + }, + { + "epoch": 0.320105508584348, + "grad_norm": 0.7326115369796753, + "learning_rate": 1.9009127027303925e-05, + "loss": 0.3234, + "step": 3398 + }, + { + "epoch": 0.32019971267751585, + "grad_norm": 0.8744220733642578, + "learning_rate": 1.9008471576215247e-05, + "loss": 0.3624, + "step": 3399 + }, + { + "epoch": 0.3202939167706837, + "grad_norm": 0.8153958320617676, + "learning_rate": 1.9007815919719483e-05, + "loss": 0.3374, + "step": 3400 + }, + { + "epoch": 0.32038812086385154, + "grad_norm": 0.7342997789382935, + "learning_rate": 1.9007160057831585e-05, + "loss": 0.3099, + "step": 3401 + }, + { + "epoch": 0.3204823249570194, + "grad_norm": 0.9112691879272461, + "learning_rate": 1.900650399056651e-05, + "loss": 0.3833, + "step": 3402 + }, + { + "epoch": 0.32057652905018724, + "grad_norm": 0.8247086405754089, + "learning_rate": 1.9005847717939213e-05, + "loss": 0.3497, + "step": 3403 + }, + { + "epoch": 0.3206707331433551, + "grad_norm": 0.9501208662986755, + "learning_rate": 1.900519123996466e-05, + "loss": 0.4221, + "step": 3404 + }, + { + "epoch": 0.32076493723652294, + "grad_norm": 0.9292295575141907, + "learning_rate": 1.900453455665782e-05, + "loss": 0.3616, + "step": 3405 + }, + { + "epoch": 0.3208591413296908, + "grad_norm": 0.7539984583854675, + "learning_rate": 1.900387766803367e-05, + "loss": 0.3055, + "step": 3406 + }, + { + "epoch": 0.32095334542285864, + "grad_norm": 0.7906505465507507, + "learning_rate": 1.9003220574107178e-05, + "loss": 0.3281, + "step": 3407 + }, + { + "epoch": 0.3210475495160265, + "grad_norm": 0.8261638283729553, + "learning_rate": 1.9002563274893337e-05, + "loss": 0.357, + "step": 3408 + }, + { + "epoch": 0.32114175360919434, + "grad_norm": 0.7509334683418274, + "learning_rate": 1.900190577040713e-05, + "loss": 0.3546, + "step": 3409 + }, + { + "epoch": 0.3212359577023622, + "grad_norm": 0.9319678544998169, + "learning_rate": 1.900124806066355e-05, + "loss": 0.3577, + "step": 3410 + }, + { + "epoch": 0.32133016179553003, + "grad_norm": 0.7860541343688965, + "learning_rate": 1.9000590145677593e-05, + "loss": 0.3785, + "step": 3411 + }, + { + "epoch": 0.3214243658886979, + "grad_norm": 0.8138558268547058, + "learning_rate": 1.8999932025464262e-05, + "loss": 0.3311, + "step": 3412 + }, + { + "epoch": 0.32151856998186573, + "grad_norm": 0.779015064239502, + "learning_rate": 1.8999273700038564e-05, + "loss": 0.3458, + "step": 3413 + }, + { + "epoch": 0.3216127740750336, + "grad_norm": 0.7839806079864502, + "learning_rate": 1.8998615169415506e-05, + "loss": 0.3003, + "step": 3414 + }, + { + "epoch": 0.32170697816820143, + "grad_norm": 0.788892924785614, + "learning_rate": 1.8997956433610104e-05, + "loss": 0.3594, + "step": 3415 + }, + { + "epoch": 0.3218011822613693, + "grad_norm": 0.7324049472808838, + "learning_rate": 1.899729749263738e-05, + "loss": 0.2969, + "step": 3416 + }, + { + "epoch": 0.3218953863545371, + "grad_norm": 0.8111429810523987, + "learning_rate": 1.899663834651236e-05, + "loss": 0.3821, + "step": 3417 + }, + { + "epoch": 0.321989590447705, + "grad_norm": 0.7825393676757812, + "learning_rate": 1.899597899525007e-05, + "loss": 0.3719, + "step": 3418 + }, + { + "epoch": 0.3220837945408728, + "grad_norm": 0.7664816975593567, + "learning_rate": 1.899531943886555e-05, + "loss": 0.3688, + "step": 3419 + }, + { + "epoch": 0.3221779986340407, + "grad_norm": 0.8949109315872192, + "learning_rate": 1.8994659677373834e-05, + "loss": 0.3789, + "step": 3420 + }, + { + "epoch": 0.3222722027272085, + "grad_norm": 0.9079577922821045, + "learning_rate": 1.8993999710789966e-05, + "loss": 0.3246, + "step": 3421 + }, + { + "epoch": 0.32236640682037637, + "grad_norm": 0.7905574440956116, + "learning_rate": 1.8993339539128993e-05, + "loss": 0.3632, + "step": 3422 + }, + { + "epoch": 0.3224606109135442, + "grad_norm": 0.753884494304657, + "learning_rate": 1.8992679162405975e-05, + "loss": 0.3497, + "step": 3423 + }, + { + "epoch": 0.322554815006712, + "grad_norm": 0.7039985656738281, + "learning_rate": 1.899201858063596e-05, + "loss": 0.3318, + "step": 3424 + }, + { + "epoch": 0.32264901909987986, + "grad_norm": 0.8347176313400269, + "learning_rate": 1.8991357793834018e-05, + "loss": 0.3366, + "step": 3425 + }, + { + "epoch": 0.3227432231930477, + "grad_norm": 0.8024595379829407, + "learning_rate": 1.8990696802015213e-05, + "loss": 0.3639, + "step": 3426 + }, + { + "epoch": 0.32283742728621556, + "grad_norm": 0.9033552408218384, + "learning_rate": 1.8990035605194614e-05, + "loss": 0.3906, + "step": 3427 + }, + { + "epoch": 0.3229316313793834, + "grad_norm": 0.8519493341445923, + "learning_rate": 1.8989374203387302e-05, + "loss": 0.3633, + "step": 3428 + }, + { + "epoch": 0.32302583547255126, + "grad_norm": 0.7858714461326599, + "learning_rate": 1.8988712596608354e-05, + "loss": 0.3604, + "step": 3429 + }, + { + "epoch": 0.3231200395657191, + "grad_norm": 0.7719192504882812, + "learning_rate": 1.898805078487286e-05, + "loss": 0.3646, + "step": 3430 + }, + { + "epoch": 0.32321424365888696, + "grad_norm": 0.8226203322410583, + "learning_rate": 1.8987388768195906e-05, + "loss": 0.4058, + "step": 3431 + }, + { + "epoch": 0.3233084477520548, + "grad_norm": 1.0411338806152344, + "learning_rate": 1.898672654659259e-05, + "loss": 0.374, + "step": 3432 + }, + { + "epoch": 0.32340265184522266, + "grad_norm": 0.8630709052085876, + "learning_rate": 1.898606412007801e-05, + "loss": 0.3801, + "step": 3433 + }, + { + "epoch": 0.3234968559383905, + "grad_norm": 0.7369588613510132, + "learning_rate": 1.898540148866727e-05, + "loss": 0.3255, + "step": 3434 + }, + { + "epoch": 0.32359106003155835, + "grad_norm": 0.7781053781509399, + "learning_rate": 1.898473865237548e-05, + "loss": 0.3523, + "step": 3435 + }, + { + "epoch": 0.3236852641247262, + "grad_norm": 0.8066985607147217, + "learning_rate": 1.898407561121775e-05, + "loss": 0.381, + "step": 3436 + }, + { + "epoch": 0.32377946821789405, + "grad_norm": 0.8260643482208252, + "learning_rate": 1.8983412365209207e-05, + "loss": 0.3586, + "step": 3437 + }, + { + "epoch": 0.3238736723110619, + "grad_norm": 0.8808850049972534, + "learning_rate": 1.898274891436497e-05, + "loss": 0.3441, + "step": 3438 + }, + { + "epoch": 0.32396787640422975, + "grad_norm": 0.8420130014419556, + "learning_rate": 1.8982085258700164e-05, + "loss": 0.4047, + "step": 3439 + }, + { + "epoch": 0.3240620804973976, + "grad_norm": 0.7730211019515991, + "learning_rate": 1.898142139822992e-05, + "loss": 0.3499, + "step": 3440 + }, + { + "epoch": 0.32415628459056545, + "grad_norm": 0.8381409645080566, + "learning_rate": 1.8980757332969382e-05, + "loss": 0.3538, + "step": 3441 + }, + { + "epoch": 0.3242504886837333, + "grad_norm": 0.8234378099441528, + "learning_rate": 1.8980093062933686e-05, + "loss": 0.3609, + "step": 3442 + }, + { + "epoch": 0.32434469277690114, + "grad_norm": 0.807602047920227, + "learning_rate": 1.897942858813798e-05, + "loss": 0.3586, + "step": 3443 + }, + { + "epoch": 0.324438896870069, + "grad_norm": 0.8544538021087646, + "learning_rate": 1.8978763908597417e-05, + "loss": 0.3772, + "step": 3444 + }, + { + "epoch": 0.32453310096323684, + "grad_norm": 0.7575989365577698, + "learning_rate": 1.897809902432715e-05, + "loss": 0.3696, + "step": 3445 + }, + { + "epoch": 0.3246273050564047, + "grad_norm": 0.8299974203109741, + "learning_rate": 1.8977433935342338e-05, + "loss": 0.3541, + "step": 3446 + }, + { + "epoch": 0.32472150914957254, + "grad_norm": 0.7398593425750732, + "learning_rate": 1.897676864165815e-05, + "loss": 0.3588, + "step": 3447 + }, + { + "epoch": 0.3248157132427404, + "grad_norm": 0.8261170983314514, + "learning_rate": 1.8976103143289756e-05, + "loss": 0.3756, + "step": 3448 + }, + { + "epoch": 0.32490991733590824, + "grad_norm": 0.7719841599464417, + "learning_rate": 1.897543744025233e-05, + "loss": 0.3359, + "step": 3449 + }, + { + "epoch": 0.3250041214290761, + "grad_norm": 0.853029727935791, + "learning_rate": 1.8974771532561046e-05, + "loss": 0.3741, + "step": 3450 + }, + { + "epoch": 0.32509832552224394, + "grad_norm": 0.777460515499115, + "learning_rate": 1.897410542023109e-05, + "loss": 0.3836, + "step": 3451 + }, + { + "epoch": 0.3251925296154118, + "grad_norm": 0.8113864660263062, + "learning_rate": 1.8973439103277657e-05, + "loss": 0.3281, + "step": 3452 + }, + { + "epoch": 0.32528673370857963, + "grad_norm": 0.7618050575256348, + "learning_rate": 1.8972772581715932e-05, + "loss": 0.3352, + "step": 3453 + }, + { + "epoch": 0.3253809378017475, + "grad_norm": 0.7826343774795532, + "learning_rate": 1.897210585556111e-05, + "loss": 0.3173, + "step": 3454 + }, + { + "epoch": 0.32547514189491533, + "grad_norm": 0.8260231614112854, + "learning_rate": 1.8971438924828404e-05, + "loss": 0.3396, + "step": 3455 + }, + { + "epoch": 0.3255693459880832, + "grad_norm": 0.8214619159698486, + "learning_rate": 1.897077178953302e-05, + "loss": 0.347, + "step": 3456 + }, + { + "epoch": 0.32566355008125103, + "grad_norm": 0.8712473511695862, + "learning_rate": 1.897010444969016e-05, + "loss": 0.4287, + "step": 3457 + }, + { + "epoch": 0.3257577541744189, + "grad_norm": 0.8250601291656494, + "learning_rate": 1.8969436905315045e-05, + "loss": 0.3639, + "step": 3458 + }, + { + "epoch": 0.3258519582675867, + "grad_norm": 0.8448606133460999, + "learning_rate": 1.89687691564229e-05, + "loss": 0.3821, + "step": 3459 + }, + { + "epoch": 0.3259461623607546, + "grad_norm": 0.7553685307502747, + "learning_rate": 1.8968101203028947e-05, + "loss": 0.3507, + "step": 3460 + }, + { + "epoch": 0.3260403664539224, + "grad_norm": 0.6598140597343445, + "learning_rate": 1.896743304514842e-05, + "loss": 0.2978, + "step": 3461 + }, + { + "epoch": 0.3261345705470903, + "grad_norm": 0.8599854111671448, + "learning_rate": 1.8966764682796546e-05, + "loss": 0.3804, + "step": 3462 + }, + { + "epoch": 0.3262287746402581, + "grad_norm": 0.8068474531173706, + "learning_rate": 1.8966096115988572e-05, + "loss": 0.3663, + "step": 3463 + }, + { + "epoch": 0.326322978733426, + "grad_norm": 0.7263758182525635, + "learning_rate": 1.8965427344739743e-05, + "loss": 0.278, + "step": 3464 + }, + { + "epoch": 0.3264171828265938, + "grad_norm": 0.7682604789733887, + "learning_rate": 1.8964758369065303e-05, + "loss": 0.3688, + "step": 3465 + }, + { + "epoch": 0.32651138691976167, + "grad_norm": 0.7840115427970886, + "learning_rate": 1.8964089188980508e-05, + "loss": 0.3633, + "step": 3466 + }, + { + "epoch": 0.3266055910129295, + "grad_norm": 0.9188367128372192, + "learning_rate": 1.8963419804500615e-05, + "loss": 0.4079, + "step": 3467 + }, + { + "epoch": 0.32669979510609737, + "grad_norm": 0.8452111482620239, + "learning_rate": 1.896275021564089e-05, + "loss": 0.3865, + "step": 3468 + }, + { + "epoch": 0.3267939991992652, + "grad_norm": 0.9212802648544312, + "learning_rate": 1.89620804224166e-05, + "loss": 0.3861, + "step": 3469 + }, + { + "epoch": 0.32688820329243307, + "grad_norm": 0.8427731394767761, + "learning_rate": 1.896141042484301e-05, + "loss": 0.3486, + "step": 3470 + }, + { + "epoch": 0.3269824073856009, + "grad_norm": 0.6711952686309814, + "learning_rate": 1.896074022293541e-05, + "loss": 0.3076, + "step": 3471 + }, + { + "epoch": 0.32707661147876876, + "grad_norm": 0.7252352237701416, + "learning_rate": 1.8960069816709073e-05, + "loss": 0.3372, + "step": 3472 + }, + { + "epoch": 0.3271708155719366, + "grad_norm": 0.9933614730834961, + "learning_rate": 1.8959399206179286e-05, + "loss": 0.3728, + "step": 3473 + }, + { + "epoch": 0.32726501966510446, + "grad_norm": 0.7768873572349548, + "learning_rate": 1.8958728391361343e-05, + "loss": 0.3163, + "step": 3474 + }, + { + "epoch": 0.3273592237582723, + "grad_norm": 0.8002369999885559, + "learning_rate": 1.8958057372270536e-05, + "loss": 0.3197, + "step": 3475 + }, + { + "epoch": 0.32745342785144016, + "grad_norm": 0.8852412700653076, + "learning_rate": 1.8957386148922166e-05, + "loss": 0.329, + "step": 3476 + }, + { + "epoch": 0.327547631944608, + "grad_norm": 0.7686527371406555, + "learning_rate": 1.8956714721331543e-05, + "loss": 0.2952, + "step": 3477 + }, + { + "epoch": 0.32764183603777586, + "grad_norm": 0.7055858373641968, + "learning_rate": 1.8956043089513968e-05, + "loss": 0.3069, + "step": 3478 + }, + { + "epoch": 0.3277360401309437, + "grad_norm": 0.8490488529205322, + "learning_rate": 1.8955371253484762e-05, + "loss": 0.3633, + "step": 3479 + }, + { + "epoch": 0.32783024422411156, + "grad_norm": 0.858548641204834, + "learning_rate": 1.895469921325924e-05, + "loss": 0.3721, + "step": 3480 + }, + { + "epoch": 0.3279244483172794, + "grad_norm": 0.7843692898750305, + "learning_rate": 1.895402696885273e-05, + "loss": 0.3758, + "step": 3481 + }, + { + "epoch": 0.32801865241044725, + "grad_norm": 0.8354014754295349, + "learning_rate": 1.8953354520280557e-05, + "loss": 0.3795, + "step": 3482 + }, + { + "epoch": 0.3281128565036151, + "grad_norm": 0.9440718293190002, + "learning_rate": 1.8952681867558053e-05, + "loss": 0.4087, + "step": 3483 + }, + { + "epoch": 0.32820706059678295, + "grad_norm": 0.8207374811172485, + "learning_rate": 1.8952009010700556e-05, + "loss": 0.3432, + "step": 3484 + }, + { + "epoch": 0.3283012646899508, + "grad_norm": 0.9222376942634583, + "learning_rate": 1.8951335949723412e-05, + "loss": 0.4125, + "step": 3485 + }, + { + "epoch": 0.32839546878311865, + "grad_norm": 0.8984072208404541, + "learning_rate": 1.8950662684641963e-05, + "loss": 0.3678, + "step": 3486 + }, + { + "epoch": 0.3284896728762865, + "grad_norm": 0.7753778696060181, + "learning_rate": 1.894998921547156e-05, + "loss": 0.3421, + "step": 3487 + }, + { + "epoch": 0.32858387696945435, + "grad_norm": 0.8408216834068298, + "learning_rate": 1.8949315542227568e-05, + "loss": 0.3931, + "step": 3488 + }, + { + "epoch": 0.3286780810626222, + "grad_norm": 0.8501113653182983, + "learning_rate": 1.8948641664925335e-05, + "loss": 0.4026, + "step": 3489 + }, + { + "epoch": 0.32877228515579004, + "grad_norm": 0.9063554406166077, + "learning_rate": 1.8947967583580234e-05, + "loss": 0.3686, + "step": 3490 + }, + { + "epoch": 0.3288664892489579, + "grad_norm": 1.043784260749817, + "learning_rate": 1.8947293298207637e-05, + "loss": 0.4105, + "step": 3491 + }, + { + "epoch": 0.32896069334212574, + "grad_norm": 0.9878965616226196, + "learning_rate": 1.8946618808822913e-05, + "loss": 0.3663, + "step": 3492 + }, + { + "epoch": 0.32905489743529354, + "grad_norm": 0.8769758343696594, + "learning_rate": 1.8945944115441444e-05, + "loss": 0.3666, + "step": 3493 + }, + { + "epoch": 0.3291491015284614, + "grad_norm": 0.8564929962158203, + "learning_rate": 1.8945269218078615e-05, + "loss": 0.3991, + "step": 3494 + }, + { + "epoch": 0.32924330562162923, + "grad_norm": 0.91790771484375, + "learning_rate": 1.8944594116749812e-05, + "loss": 0.3838, + "step": 3495 + }, + { + "epoch": 0.3293375097147971, + "grad_norm": 0.7588348388671875, + "learning_rate": 1.894391881147043e-05, + "loss": 0.3331, + "step": 3496 + }, + { + "epoch": 0.32943171380796493, + "grad_norm": 0.7754861116409302, + "learning_rate": 1.894324330225587e-05, + "loss": 0.316, + "step": 3497 + }, + { + "epoch": 0.3295259179011328, + "grad_norm": 0.8175535798072815, + "learning_rate": 1.894256758912153e-05, + "loss": 0.3432, + "step": 3498 + }, + { + "epoch": 0.32962012199430063, + "grad_norm": 0.8538705706596375, + "learning_rate": 1.8941891672082816e-05, + "loss": 0.351, + "step": 3499 + }, + { + "epoch": 0.3297143260874685, + "grad_norm": 0.7761415839195251, + "learning_rate": 1.894121555115515e-05, + "loss": 0.3113, + "step": 3500 + }, + { + "epoch": 0.32980853018063633, + "grad_norm": 0.9469418525695801, + "learning_rate": 1.8940539226353935e-05, + "loss": 0.4011, + "step": 3501 + }, + { + "epoch": 0.3299027342738042, + "grad_norm": 0.8777265548706055, + "learning_rate": 1.8939862697694602e-05, + "loss": 0.3625, + "step": 3502 + }, + { + "epoch": 0.329996938366972, + "grad_norm": 0.8174872398376465, + "learning_rate": 1.8939185965192572e-05, + "loss": 0.3662, + "step": 3503 + }, + { + "epoch": 0.3300911424601399, + "grad_norm": 0.9018071293830872, + "learning_rate": 1.8938509028863278e-05, + "loss": 0.3792, + "step": 3504 + }, + { + "epoch": 0.3301853465533077, + "grad_norm": 0.9734531044960022, + "learning_rate": 1.8937831888722152e-05, + "loss": 0.3997, + "step": 3505 + }, + { + "epoch": 0.3302795506464756, + "grad_norm": 0.9966946840286255, + "learning_rate": 1.8937154544784642e-05, + "loss": 0.3543, + "step": 3506 + }, + { + "epoch": 0.3303737547396434, + "grad_norm": 0.8334128260612488, + "learning_rate": 1.893647699706618e-05, + "loss": 0.3797, + "step": 3507 + }, + { + "epoch": 0.33046795883281127, + "grad_norm": 0.8594630360603333, + "learning_rate": 1.893579924558223e-05, + "loss": 0.3735, + "step": 3508 + }, + { + "epoch": 0.3305621629259791, + "grad_norm": 0.7860228419303894, + "learning_rate": 1.8935121290348232e-05, + "loss": 0.3597, + "step": 3509 + }, + { + "epoch": 0.33065636701914697, + "grad_norm": 0.9093959927558899, + "learning_rate": 1.8934443131379652e-05, + "loss": 0.4012, + "step": 3510 + }, + { + "epoch": 0.3307505711123148, + "grad_norm": 0.7881484031677246, + "learning_rate": 1.893376476869195e-05, + "loss": 0.3599, + "step": 3511 + }, + { + "epoch": 0.33084477520548267, + "grad_norm": 0.851662278175354, + "learning_rate": 1.8933086202300597e-05, + "loss": 0.3637, + "step": 3512 + }, + { + "epoch": 0.3309389792986505, + "grad_norm": 1.1916617155075073, + "learning_rate": 1.893240743222106e-05, + "loss": 0.3572, + "step": 3513 + }, + { + "epoch": 0.33103318339181836, + "grad_norm": 0.8736041188240051, + "learning_rate": 1.8931728458468824e-05, + "loss": 0.3805, + "step": 3514 + }, + { + "epoch": 0.3311273874849862, + "grad_norm": 0.9154252409934998, + "learning_rate": 1.8931049281059362e-05, + "loss": 0.4029, + "step": 3515 + }, + { + "epoch": 0.33122159157815406, + "grad_norm": 0.7905598282814026, + "learning_rate": 1.8930369900008166e-05, + "loss": 0.3388, + "step": 3516 + }, + { + "epoch": 0.3313157956713219, + "grad_norm": 0.782569169998169, + "learning_rate": 1.8929690315330724e-05, + "loss": 0.315, + "step": 3517 + }, + { + "epoch": 0.33140999976448976, + "grad_norm": 0.8409260511398315, + "learning_rate": 1.8929010527042533e-05, + "loss": 0.3778, + "step": 3518 + }, + { + "epoch": 0.3315042038576576, + "grad_norm": 0.8331377506256104, + "learning_rate": 1.8928330535159094e-05, + "loss": 0.3402, + "step": 3519 + }, + { + "epoch": 0.33159840795082546, + "grad_norm": 0.7925302982330322, + "learning_rate": 1.892765033969591e-05, + "loss": 0.3557, + "step": 3520 + }, + { + "epoch": 0.3316926120439933, + "grad_norm": 0.772601842880249, + "learning_rate": 1.8926969940668495e-05, + "loss": 0.3582, + "step": 3521 + }, + { + "epoch": 0.33178681613716116, + "grad_norm": 0.9124496579170227, + "learning_rate": 1.8926289338092352e-05, + "loss": 0.4083, + "step": 3522 + }, + { + "epoch": 0.331881020230329, + "grad_norm": 1.2293285131454468, + "learning_rate": 1.8925608531983012e-05, + "loss": 0.37, + "step": 3523 + }, + { + "epoch": 0.33197522432349685, + "grad_norm": 0.8313436508178711, + "learning_rate": 1.8924927522355996e-05, + "loss": 0.3469, + "step": 3524 + }, + { + "epoch": 0.3320694284166647, + "grad_norm": 0.8663201928138733, + "learning_rate": 1.8924246309226826e-05, + "loss": 0.3485, + "step": 3525 + }, + { + "epoch": 0.33216363250983255, + "grad_norm": 0.8084797859191895, + "learning_rate": 1.892356489261104e-05, + "loss": 0.318, + "step": 3526 + }, + { + "epoch": 0.3322578366030004, + "grad_norm": 0.8676424026489258, + "learning_rate": 1.8922883272524172e-05, + "loss": 0.3519, + "step": 3527 + }, + { + "epoch": 0.33235204069616825, + "grad_norm": 0.8464802503585815, + "learning_rate": 1.8922201448981766e-05, + "loss": 0.3346, + "step": 3528 + }, + { + "epoch": 0.3324462447893361, + "grad_norm": 0.7341082692146301, + "learning_rate": 1.8921519421999374e-05, + "loss": 0.3206, + "step": 3529 + }, + { + "epoch": 0.33254044888250395, + "grad_norm": 0.797134518623352, + "learning_rate": 1.8920837191592535e-05, + "loss": 0.3441, + "step": 3530 + }, + { + "epoch": 0.3326346529756718, + "grad_norm": 0.847991406917572, + "learning_rate": 1.8920154757776816e-05, + "loss": 0.3483, + "step": 3531 + }, + { + "epoch": 0.33272885706883965, + "grad_norm": 0.8134728670120239, + "learning_rate": 1.891947212056777e-05, + "loss": 0.3152, + "step": 3532 + }, + { + "epoch": 0.3328230611620075, + "grad_norm": 0.9314228296279907, + "learning_rate": 1.891878927998097e-05, + "loss": 0.36, + "step": 3533 + }, + { + "epoch": 0.33291726525517534, + "grad_norm": 0.7768193483352661, + "learning_rate": 1.8918106236031976e-05, + "loss": 0.3594, + "step": 3534 + }, + { + "epoch": 0.3330114693483432, + "grad_norm": 0.8359523415565491, + "learning_rate": 1.8917422988736372e-05, + "loss": 0.3816, + "step": 3535 + }, + { + "epoch": 0.33310567344151104, + "grad_norm": 0.7809811234474182, + "learning_rate": 1.891673953810973e-05, + "loss": 0.3489, + "step": 3536 + }, + { + "epoch": 0.3331998775346789, + "grad_norm": 0.8549154996871948, + "learning_rate": 1.8916055884167638e-05, + "loss": 0.3765, + "step": 3537 + }, + { + "epoch": 0.33329408162784674, + "grad_norm": 0.8402571082115173, + "learning_rate": 1.8915372026925682e-05, + "loss": 0.3258, + "step": 3538 + }, + { + "epoch": 0.3333882857210146, + "grad_norm": 0.7873833775520325, + "learning_rate": 1.8914687966399457e-05, + "loss": 0.3103, + "step": 3539 + }, + { + "epoch": 0.33348248981418244, + "grad_norm": 0.7410790324211121, + "learning_rate": 1.8914003702604557e-05, + "loss": 0.3348, + "step": 3540 + }, + { + "epoch": 0.3335766939073503, + "grad_norm": 0.7940850853919983, + "learning_rate": 1.891331923555659e-05, + "loss": 0.3524, + "step": 3541 + }, + { + "epoch": 0.33367089800051813, + "grad_norm": 0.733606219291687, + "learning_rate": 1.8912634565271156e-05, + "loss": 0.3557, + "step": 3542 + }, + { + "epoch": 0.333765102093686, + "grad_norm": 0.914272129535675, + "learning_rate": 1.8911949691763875e-05, + "loss": 0.3898, + "step": 3543 + }, + { + "epoch": 0.33385930618685383, + "grad_norm": 0.7942153811454773, + "learning_rate": 1.8911264615050357e-05, + "loss": 0.3441, + "step": 3544 + }, + { + "epoch": 0.3339535102800217, + "grad_norm": 0.7900113463401794, + "learning_rate": 1.891057933514622e-05, + "loss": 0.3518, + "step": 3545 + }, + { + "epoch": 0.33404771437318953, + "grad_norm": 0.7690131068229675, + "learning_rate": 1.89098938520671e-05, + "loss": 0.3316, + "step": 3546 + }, + { + "epoch": 0.3341419184663574, + "grad_norm": 0.7978188991546631, + "learning_rate": 1.8909208165828618e-05, + "loss": 0.3002, + "step": 3547 + }, + { + "epoch": 0.33423612255952523, + "grad_norm": 0.857190728187561, + "learning_rate": 1.8908522276446408e-05, + "loss": 0.4009, + "step": 3548 + }, + { + "epoch": 0.3343303266526931, + "grad_norm": 0.877989649772644, + "learning_rate": 1.890783618393612e-05, + "loss": 0.326, + "step": 3549 + }, + { + "epoch": 0.3344245307458609, + "grad_norm": 0.8402450084686279, + "learning_rate": 1.8907149888313384e-05, + "loss": 0.3893, + "step": 3550 + }, + { + "epoch": 0.3345187348390288, + "grad_norm": 0.9059008955955505, + "learning_rate": 1.890646338959386e-05, + "loss": 0.3549, + "step": 3551 + }, + { + "epoch": 0.3346129389321966, + "grad_norm": 0.8231593370437622, + "learning_rate": 1.890577668779319e-05, + "loss": 0.3332, + "step": 3552 + }, + { + "epoch": 0.3347071430253645, + "grad_norm": 0.8435704708099365, + "learning_rate": 1.8905089782927047e-05, + "loss": 0.3588, + "step": 3553 + }, + { + "epoch": 0.3348013471185323, + "grad_norm": 0.8788042068481445, + "learning_rate": 1.890440267501108e-05, + "loss": 0.3823, + "step": 3554 + }, + { + "epoch": 0.33489555121170017, + "grad_norm": 0.8267970085144043, + "learning_rate": 1.8903715364060962e-05, + "loss": 0.3574, + "step": 3555 + }, + { + "epoch": 0.334989755304868, + "grad_norm": 0.8716968894004822, + "learning_rate": 1.8903027850092365e-05, + "loss": 0.3856, + "step": 3556 + }, + { + "epoch": 0.33508395939803587, + "grad_norm": 0.8291651010513306, + "learning_rate": 1.8902340133120965e-05, + "loss": 0.351, + "step": 3557 + }, + { + "epoch": 0.3351781634912037, + "grad_norm": 0.8509311079978943, + "learning_rate": 1.890165221316244e-05, + "loss": 0.3379, + "step": 3558 + }, + { + "epoch": 0.33527236758437157, + "grad_norm": 0.7940100431442261, + "learning_rate": 1.8900964090232477e-05, + "loss": 0.3704, + "step": 3559 + }, + { + "epoch": 0.3353665716775394, + "grad_norm": 0.966296911239624, + "learning_rate": 1.890027576434677e-05, + "loss": 0.368, + "step": 3560 + }, + { + "epoch": 0.33546077577070726, + "grad_norm": 0.7433187365531921, + "learning_rate": 1.8899587235521006e-05, + "loss": 0.3277, + "step": 3561 + }, + { + "epoch": 0.33555497986387506, + "grad_norm": 0.8418706059455872, + "learning_rate": 1.8898898503770895e-05, + "loss": 0.3634, + "step": 3562 + }, + { + "epoch": 0.3356491839570429, + "grad_norm": 0.8187812566757202, + "learning_rate": 1.8898209569112133e-05, + "loss": 0.3589, + "step": 3563 + }, + { + "epoch": 0.33574338805021076, + "grad_norm": 0.8903088569641113, + "learning_rate": 1.8897520431560435e-05, + "loss": 0.3559, + "step": 3564 + }, + { + "epoch": 0.3358375921433786, + "grad_norm": 0.8367480039596558, + "learning_rate": 1.8896831091131506e-05, + "loss": 0.3474, + "step": 3565 + }, + { + "epoch": 0.33593179623654645, + "grad_norm": 0.7861426472663879, + "learning_rate": 1.8896141547841072e-05, + "loss": 0.3794, + "step": 3566 + }, + { + "epoch": 0.3360260003297143, + "grad_norm": 0.9281355738639832, + "learning_rate": 1.8895451801704848e-05, + "loss": 0.3604, + "step": 3567 + }, + { + "epoch": 0.33612020442288215, + "grad_norm": 0.9803590774536133, + "learning_rate": 1.8894761852738572e-05, + "loss": 0.3384, + "step": 3568 + }, + { + "epoch": 0.33621440851605, + "grad_norm": 0.8485681414604187, + "learning_rate": 1.889407170095797e-05, + "loss": 0.3576, + "step": 3569 + }, + { + "epoch": 0.33630861260921785, + "grad_norm": 0.8599330186843872, + "learning_rate": 1.8893381346378775e-05, + "loss": 0.3728, + "step": 3570 + }, + { + "epoch": 0.3364028167023857, + "grad_norm": 0.9006670713424683, + "learning_rate": 1.889269078901673e-05, + "loss": 0.3922, + "step": 3571 + }, + { + "epoch": 0.33649702079555355, + "grad_norm": 0.8296506404876709, + "learning_rate": 1.8892000028887586e-05, + "loss": 0.3499, + "step": 3572 + }, + { + "epoch": 0.3365912248887214, + "grad_norm": 0.9027323126792908, + "learning_rate": 1.8891309066007087e-05, + "loss": 0.4125, + "step": 3573 + }, + { + "epoch": 0.33668542898188925, + "grad_norm": 0.8506341576576233, + "learning_rate": 1.889061790039099e-05, + "loss": 0.3719, + "step": 3574 + }, + { + "epoch": 0.3367796330750571, + "grad_norm": 0.8128954768180847, + "learning_rate": 1.8889926532055057e-05, + "loss": 0.3563, + "step": 3575 + }, + { + "epoch": 0.33687383716822494, + "grad_norm": 0.8806983828544617, + "learning_rate": 1.888923496101505e-05, + "loss": 0.3788, + "step": 3576 + }, + { + "epoch": 0.3369680412613928, + "grad_norm": 0.8672088980674744, + "learning_rate": 1.8888543187286738e-05, + "loss": 0.3162, + "step": 3577 + }, + { + "epoch": 0.33706224535456064, + "grad_norm": 0.8352708220481873, + "learning_rate": 1.8887851210885893e-05, + "loss": 0.3146, + "step": 3578 + }, + { + "epoch": 0.3371564494477285, + "grad_norm": 1.2389875650405884, + "learning_rate": 1.88871590318283e-05, + "loss": 0.3541, + "step": 3579 + }, + { + "epoch": 0.33725065354089634, + "grad_norm": 0.7673290371894836, + "learning_rate": 1.8886466650129734e-05, + "loss": 0.3417, + "step": 3580 + }, + { + "epoch": 0.3373448576340642, + "grad_norm": 0.8438095450401306, + "learning_rate": 1.8885774065805987e-05, + "loss": 0.413, + "step": 3581 + }, + { + "epoch": 0.33743906172723204, + "grad_norm": 1.0049430131912231, + "learning_rate": 1.8885081278872846e-05, + "loss": 0.4122, + "step": 3582 + }, + { + "epoch": 0.3375332658203999, + "grad_norm": 0.8492429256439209, + "learning_rate": 1.8884388289346115e-05, + "loss": 0.3387, + "step": 3583 + }, + { + "epoch": 0.33762746991356773, + "grad_norm": 1.0927600860595703, + "learning_rate": 1.8883695097241584e-05, + "loss": 0.4039, + "step": 3584 + }, + { + "epoch": 0.3377216740067356, + "grad_norm": 0.7235574126243591, + "learning_rate": 1.8883001702575074e-05, + "loss": 0.3715, + "step": 3585 + }, + { + "epoch": 0.33781587809990343, + "grad_norm": 1.0561778545379639, + "learning_rate": 1.8882308105362383e-05, + "loss": 0.3545, + "step": 3586 + }, + { + "epoch": 0.3379100821930713, + "grad_norm": 0.7849492430686951, + "learning_rate": 1.888161430561933e-05, + "loss": 0.3578, + "step": 3587 + }, + { + "epoch": 0.33800428628623913, + "grad_norm": 0.868797242641449, + "learning_rate": 1.888092030336174e-05, + "loss": 0.4318, + "step": 3588 + }, + { + "epoch": 0.338098490379407, + "grad_norm": 0.7665191292762756, + "learning_rate": 1.8880226098605427e-05, + "loss": 0.3273, + "step": 3589 + }, + { + "epoch": 0.33819269447257483, + "grad_norm": 0.8948681354522705, + "learning_rate": 1.8879531691366228e-05, + "loss": 0.4198, + "step": 3590 + }, + { + "epoch": 0.3382868985657427, + "grad_norm": 0.7987225651741028, + "learning_rate": 1.887883708165997e-05, + "loss": 0.3509, + "step": 3591 + }, + { + "epoch": 0.3383811026589105, + "grad_norm": 0.8362273573875427, + "learning_rate": 1.88781422695025e-05, + "loss": 0.3478, + "step": 3592 + }, + { + "epoch": 0.3384753067520784, + "grad_norm": 0.815270185470581, + "learning_rate": 1.8877447254909654e-05, + "loss": 0.3879, + "step": 3593 + }, + { + "epoch": 0.3385695108452462, + "grad_norm": 0.8665373921394348, + "learning_rate": 1.8876752037897285e-05, + "loss": 0.3813, + "step": 3594 + }, + { + "epoch": 0.3386637149384141, + "grad_norm": 0.766981303691864, + "learning_rate": 1.887605661848124e-05, + "loss": 0.3206, + "step": 3595 + }, + { + "epoch": 0.3387579190315819, + "grad_norm": 0.7618220448493958, + "learning_rate": 1.8875360996677373e-05, + "loss": 0.3532, + "step": 3596 + }, + { + "epoch": 0.33885212312474977, + "grad_norm": 0.9736241102218628, + "learning_rate": 1.8874665172501548e-05, + "loss": 0.3866, + "step": 3597 + }, + { + "epoch": 0.3389463272179176, + "grad_norm": 0.7357736825942993, + "learning_rate": 1.8873969145969636e-05, + "loss": 0.3321, + "step": 3598 + }, + { + "epoch": 0.33904053131108547, + "grad_norm": 1.4544962644577026, + "learning_rate": 1.8873272917097505e-05, + "loss": 0.3433, + "step": 3599 + }, + { + "epoch": 0.3391347354042533, + "grad_norm": 0.8322678804397583, + "learning_rate": 1.8872576485901027e-05, + "loss": 0.364, + "step": 3600 + }, + { + "epoch": 0.33922893949742117, + "grad_norm": 0.8751059770584106, + "learning_rate": 1.8871879852396083e-05, + "loss": 0.3336, + "step": 3601 + }, + { + "epoch": 0.339323143590589, + "grad_norm": 0.8897792100906372, + "learning_rate": 1.887118301659856e-05, + "loss": 0.377, + "step": 3602 + }, + { + "epoch": 0.33941734768375686, + "grad_norm": 0.8926268219947815, + "learning_rate": 1.887048597852434e-05, + "loss": 0.3551, + "step": 3603 + }, + { + "epoch": 0.3395115517769247, + "grad_norm": 0.9420211911201477, + "learning_rate": 1.8869788738189326e-05, + "loss": 0.3914, + "step": 3604 + }, + { + "epoch": 0.33960575587009256, + "grad_norm": 0.8444533348083496, + "learning_rate": 1.8869091295609407e-05, + "loss": 0.3364, + "step": 3605 + }, + { + "epoch": 0.3396999599632604, + "grad_norm": 0.7704145312309265, + "learning_rate": 1.886839365080049e-05, + "loss": 0.3386, + "step": 3606 + }, + { + "epoch": 0.33979416405642826, + "grad_norm": 0.803094208240509, + "learning_rate": 1.8867695803778487e-05, + "loss": 0.3435, + "step": 3607 + }, + { + "epoch": 0.3398883681495961, + "grad_norm": 0.761256754398346, + "learning_rate": 1.8866997754559304e-05, + "loss": 0.3099, + "step": 3608 + }, + { + "epoch": 0.33998257224276396, + "grad_norm": 0.8315357565879822, + "learning_rate": 1.8866299503158858e-05, + "loss": 0.3901, + "step": 3609 + }, + { + "epoch": 0.3400767763359318, + "grad_norm": 0.7468592524528503, + "learning_rate": 1.8865601049593075e-05, + "loss": 0.3118, + "step": 3610 + }, + { + "epoch": 0.34017098042909966, + "grad_norm": 0.807157039642334, + "learning_rate": 1.8864902393877874e-05, + "loss": 0.3802, + "step": 3611 + }, + { + "epoch": 0.3402651845222675, + "grad_norm": 0.9533775448799133, + "learning_rate": 1.8864203536029188e-05, + "loss": 0.4266, + "step": 3612 + }, + { + "epoch": 0.34035938861543535, + "grad_norm": 0.8372734785079956, + "learning_rate": 1.886350447606295e-05, + "loss": 0.3423, + "step": 3613 + }, + { + "epoch": 0.3404535927086032, + "grad_norm": 0.796859085559845, + "learning_rate": 1.8862805213995107e-05, + "loss": 0.3512, + "step": 3614 + }, + { + "epoch": 0.34054779680177105, + "grad_norm": 0.7418161630630493, + "learning_rate": 1.8862105749841596e-05, + "loss": 0.3033, + "step": 3615 + }, + { + "epoch": 0.3406420008949389, + "grad_norm": 0.7803031802177429, + "learning_rate": 1.8861406083618373e-05, + "loss": 0.3306, + "step": 3616 + }, + { + "epoch": 0.34073620498810675, + "grad_norm": 0.8984958529472351, + "learning_rate": 1.8860706215341383e-05, + "loss": 0.4, + "step": 3617 + }, + { + "epoch": 0.3408304090812746, + "grad_norm": 0.8020842671394348, + "learning_rate": 1.8860006145026585e-05, + "loss": 0.3364, + "step": 3618 + }, + { + "epoch": 0.34092461317444245, + "grad_norm": 0.8880037069320679, + "learning_rate": 1.885930587268995e-05, + "loss": 0.3755, + "step": 3619 + }, + { + "epoch": 0.3410188172676103, + "grad_norm": 0.8976454734802246, + "learning_rate": 1.8858605398347438e-05, + "loss": 0.4055, + "step": 3620 + }, + { + "epoch": 0.34111302136077815, + "grad_norm": 0.7939981818199158, + "learning_rate": 1.885790472201502e-05, + "loss": 0.3137, + "step": 3621 + }, + { + "epoch": 0.341207225453946, + "grad_norm": 0.8750141263008118, + "learning_rate": 1.885720384370868e-05, + "loss": 0.3721, + "step": 3622 + }, + { + "epoch": 0.34130142954711384, + "grad_norm": 0.8971021771430969, + "learning_rate": 1.8856502763444392e-05, + "loss": 0.3571, + "step": 3623 + }, + { + "epoch": 0.3413956336402817, + "grad_norm": 0.7954822182655334, + "learning_rate": 1.8855801481238146e-05, + "loss": 0.3529, + "step": 3624 + }, + { + "epoch": 0.34148983773344954, + "grad_norm": 0.813042402267456, + "learning_rate": 1.8855099997105927e-05, + "loss": 0.3477, + "step": 3625 + }, + { + "epoch": 0.3415840418266174, + "grad_norm": 0.8554645776748657, + "learning_rate": 1.8854398311063734e-05, + "loss": 0.3717, + "step": 3626 + }, + { + "epoch": 0.34167824591978524, + "grad_norm": 0.9292104244232178, + "learning_rate": 1.885369642312757e-05, + "loss": 0.4091, + "step": 3627 + }, + { + "epoch": 0.3417724500129531, + "grad_norm": 1.1380723714828491, + "learning_rate": 1.8852994333313432e-05, + "loss": 0.3633, + "step": 3628 + }, + { + "epoch": 0.34186665410612094, + "grad_norm": 0.9120110273361206, + "learning_rate": 1.885229204163733e-05, + "loss": 0.3871, + "step": 3629 + }, + { + "epoch": 0.3419608581992888, + "grad_norm": 0.8591219186782837, + "learning_rate": 1.8851589548115282e-05, + "loss": 0.4018, + "step": 3630 + }, + { + "epoch": 0.3420550622924566, + "grad_norm": 0.8268954157829285, + "learning_rate": 1.8850886852763302e-05, + "loss": 0.3343, + "step": 3631 + }, + { + "epoch": 0.34214926638562443, + "grad_norm": 0.9206968545913696, + "learning_rate": 1.8850183955597416e-05, + "loss": 0.3305, + "step": 3632 + }, + { + "epoch": 0.3422434704787923, + "grad_norm": 0.7509943246841431, + "learning_rate": 1.8849480856633647e-05, + "loss": 0.3255, + "step": 3633 + }, + { + "epoch": 0.3423376745719601, + "grad_norm": 0.7434682250022888, + "learning_rate": 1.8848777555888027e-05, + "loss": 0.311, + "step": 3634 + }, + { + "epoch": 0.342431878665128, + "grad_norm": 0.8783538341522217, + "learning_rate": 1.8848074053376597e-05, + "loss": 0.4488, + "step": 3635 + }, + { + "epoch": 0.3425260827582958, + "grad_norm": 1.094637393951416, + "learning_rate": 1.884737034911539e-05, + "loss": 0.375, + "step": 3636 + }, + { + "epoch": 0.3426202868514637, + "grad_norm": 0.7740301489830017, + "learning_rate": 1.884666644312046e-05, + "loss": 0.364, + "step": 3637 + }, + { + "epoch": 0.3427144909446315, + "grad_norm": 0.841786801815033, + "learning_rate": 1.8845962335407854e-05, + "loss": 0.3757, + "step": 3638 + }, + { + "epoch": 0.34280869503779937, + "grad_norm": 1.4861465692520142, + "learning_rate": 1.884525802599362e-05, + "loss": 0.364, + "step": 3639 + }, + { + "epoch": 0.3429028991309672, + "grad_norm": 0.7735800743103027, + "learning_rate": 1.884455351489383e-05, + "loss": 0.3298, + "step": 3640 + }, + { + "epoch": 0.34299710322413507, + "grad_norm": 0.87845379114151, + "learning_rate": 1.884384880212454e-05, + "loss": 0.3609, + "step": 3641 + }, + { + "epoch": 0.3430913073173029, + "grad_norm": 0.9437776207923889, + "learning_rate": 1.884314388770182e-05, + "loss": 0.3503, + "step": 3642 + }, + { + "epoch": 0.34318551141047077, + "grad_norm": 0.8418173789978027, + "learning_rate": 1.8842438771641745e-05, + "loss": 0.3316, + "step": 3643 + }, + { + "epoch": 0.3432797155036386, + "grad_norm": 0.8820387721061707, + "learning_rate": 1.884173345396039e-05, + "loss": 0.3608, + "step": 3644 + }, + { + "epoch": 0.34337391959680647, + "grad_norm": 0.8833423852920532, + "learning_rate": 1.8841027934673835e-05, + "loss": 0.3796, + "step": 3645 + }, + { + "epoch": 0.3434681236899743, + "grad_norm": 0.8288242816925049, + "learning_rate": 1.8840322213798173e-05, + "loss": 0.3669, + "step": 3646 + }, + { + "epoch": 0.34356232778314216, + "grad_norm": 0.7158920168876648, + "learning_rate": 1.8839616291349493e-05, + "loss": 0.2771, + "step": 3647 + }, + { + "epoch": 0.34365653187631, + "grad_norm": 0.9035677909851074, + "learning_rate": 1.883891016734389e-05, + "loss": 0.3581, + "step": 3648 + }, + { + "epoch": 0.34375073596947786, + "grad_norm": 0.7641481161117554, + "learning_rate": 1.8838203841797464e-05, + "loss": 0.3837, + "step": 3649 + }, + { + "epoch": 0.3438449400626457, + "grad_norm": 0.8371859192848206, + "learning_rate": 1.8837497314726322e-05, + "loss": 0.3365, + "step": 3650 + }, + { + "epoch": 0.34393914415581356, + "grad_norm": 2.0889203548431396, + "learning_rate": 1.8836790586146578e-05, + "loss": 0.3551, + "step": 3651 + }, + { + "epoch": 0.3440333482489814, + "grad_norm": 0.684039294719696, + "learning_rate": 1.883608365607434e-05, + "loss": 0.3247, + "step": 3652 + }, + { + "epoch": 0.34412755234214926, + "grad_norm": 0.8166343569755554, + "learning_rate": 1.8835376524525725e-05, + "loss": 0.3166, + "step": 3653 + }, + { + "epoch": 0.3442217564353171, + "grad_norm": 0.8467241525650024, + "learning_rate": 1.8834669191516866e-05, + "loss": 0.3941, + "step": 3654 + }, + { + "epoch": 0.34431596052848495, + "grad_norm": 0.8928960561752319, + "learning_rate": 1.8833961657063887e-05, + "loss": 0.3683, + "step": 3655 + }, + { + "epoch": 0.3444101646216528, + "grad_norm": 0.9479843974113464, + "learning_rate": 1.8833253921182916e-05, + "loss": 0.3575, + "step": 3656 + }, + { + "epoch": 0.34450436871482065, + "grad_norm": 0.9287818670272827, + "learning_rate": 1.88325459838901e-05, + "loss": 0.3589, + "step": 3657 + }, + { + "epoch": 0.3445985728079885, + "grad_norm": 0.7292333841323853, + "learning_rate": 1.8831837845201573e-05, + "loss": 0.3028, + "step": 3658 + }, + { + "epoch": 0.34469277690115635, + "grad_norm": 0.7780672907829285, + "learning_rate": 1.8831129505133485e-05, + "loss": 0.3375, + "step": 3659 + }, + { + "epoch": 0.3447869809943242, + "grad_norm": 0.9152745604515076, + "learning_rate": 1.8830420963701985e-05, + "loss": 0.3537, + "step": 3660 + }, + { + "epoch": 0.34488118508749205, + "grad_norm": 0.7500927448272705, + "learning_rate": 1.8829712220923228e-05, + "loss": 0.3127, + "step": 3661 + }, + { + "epoch": 0.3449753891806599, + "grad_norm": 0.9070961475372314, + "learning_rate": 1.8829003276813383e-05, + "loss": 0.3357, + "step": 3662 + }, + { + "epoch": 0.34506959327382775, + "grad_norm": 0.9231308698654175, + "learning_rate": 1.8828294131388607e-05, + "loss": 0.3461, + "step": 3663 + }, + { + "epoch": 0.3451637973669956, + "grad_norm": 0.7690940499305725, + "learning_rate": 1.882758478466507e-05, + "loss": 0.3342, + "step": 3664 + }, + { + "epoch": 0.34525800146016344, + "grad_norm": 0.902086079120636, + "learning_rate": 1.882687523665895e-05, + "loss": 0.3846, + "step": 3665 + }, + { + "epoch": 0.3453522055533313, + "grad_norm": 0.8080546259880066, + "learning_rate": 1.882616548738642e-05, + "loss": 0.3162, + "step": 3666 + }, + { + "epoch": 0.34544640964649914, + "grad_norm": 0.7301914095878601, + "learning_rate": 1.882545553686367e-05, + "loss": 0.3039, + "step": 3667 + }, + { + "epoch": 0.345540613739667, + "grad_norm": 0.9094197750091553, + "learning_rate": 1.8824745385106883e-05, + "loss": 0.3576, + "step": 3668 + }, + { + "epoch": 0.34563481783283484, + "grad_norm": 0.9327391982078552, + "learning_rate": 1.8824035032132256e-05, + "loss": 0.3676, + "step": 3669 + }, + { + "epoch": 0.3457290219260027, + "grad_norm": 0.9570345282554626, + "learning_rate": 1.8823324477955985e-05, + "loss": 0.4053, + "step": 3670 + }, + { + "epoch": 0.34582322601917054, + "grad_norm": 0.8123584985733032, + "learning_rate": 1.8822613722594264e-05, + "loss": 0.3242, + "step": 3671 + }, + { + "epoch": 0.3459174301123384, + "grad_norm": 0.8803601264953613, + "learning_rate": 1.8821902766063312e-05, + "loss": 0.3964, + "step": 3672 + }, + { + "epoch": 0.34601163420550624, + "grad_norm": 0.8959583044052124, + "learning_rate": 1.882119160837933e-05, + "loss": 0.386, + "step": 3673 + }, + { + "epoch": 0.3461058382986741, + "grad_norm": 0.827250063419342, + "learning_rate": 1.8820480249558538e-05, + "loss": 0.3226, + "step": 3674 + }, + { + "epoch": 0.34620004239184193, + "grad_norm": 0.7794238924980164, + "learning_rate": 1.8819768689617154e-05, + "loss": 0.3718, + "step": 3675 + }, + { + "epoch": 0.3462942464850098, + "grad_norm": 1.088750958442688, + "learning_rate": 1.8819056928571405e-05, + "loss": 0.4164, + "step": 3676 + }, + { + "epoch": 0.34638845057817763, + "grad_norm": 0.7334954738616943, + "learning_rate": 1.881834496643752e-05, + "loss": 0.315, + "step": 3677 + }, + { + "epoch": 0.3464826546713455, + "grad_norm": 0.9737700819969177, + "learning_rate": 1.881763280323173e-05, + "loss": 0.3383, + "step": 3678 + }, + { + "epoch": 0.34657685876451333, + "grad_norm": 0.845251202583313, + "learning_rate": 1.881692043897028e-05, + "loss": 0.3463, + "step": 3679 + }, + { + "epoch": 0.3466710628576812, + "grad_norm": 0.7490604519844055, + "learning_rate": 1.8816207873669406e-05, + "loss": 0.3522, + "step": 3680 + }, + { + "epoch": 0.346765266950849, + "grad_norm": 0.9043464064598083, + "learning_rate": 1.8815495107345355e-05, + "loss": 0.3892, + "step": 3681 + }, + { + "epoch": 0.3468594710440169, + "grad_norm": 0.8788156509399414, + "learning_rate": 1.8814782140014386e-05, + "loss": 0.3462, + "step": 3682 + }, + { + "epoch": 0.3469536751371847, + "grad_norm": 1.0707277059555054, + "learning_rate": 1.8814068971692747e-05, + "loss": 0.3827, + "step": 3683 + }, + { + "epoch": 0.3470478792303526, + "grad_norm": 0.8980178236961365, + "learning_rate": 1.881335560239671e-05, + "loss": 0.3636, + "step": 3684 + }, + { + "epoch": 0.3471420833235204, + "grad_norm": 0.8110436201095581, + "learning_rate": 1.881264203214253e-05, + "loss": 0.3325, + "step": 3685 + }, + { + "epoch": 0.34723628741668827, + "grad_norm": 0.871622622013092, + "learning_rate": 1.8811928260946482e-05, + "loss": 0.3942, + "step": 3686 + }, + { + "epoch": 0.3473304915098561, + "grad_norm": 0.8642867803573608, + "learning_rate": 1.8811214288824847e-05, + "loss": 0.3338, + "step": 3687 + }, + { + "epoch": 0.34742469560302397, + "grad_norm": 0.8314307928085327, + "learning_rate": 1.8810500115793895e-05, + "loss": 0.2986, + "step": 3688 + }, + { + "epoch": 0.3475188996961918, + "grad_norm": 0.7452064156532288, + "learning_rate": 1.880978574186992e-05, + "loss": 0.3237, + "step": 3689 + }, + { + "epoch": 0.34761310378935967, + "grad_norm": 0.8303627371788025, + "learning_rate": 1.8809071167069202e-05, + "loss": 0.3095, + "step": 3690 + }, + { + "epoch": 0.3477073078825275, + "grad_norm": 0.8369124531745911, + "learning_rate": 1.8808356391408037e-05, + "loss": 0.345, + "step": 3691 + }, + { + "epoch": 0.34780151197569537, + "grad_norm": 0.7832201719284058, + "learning_rate": 1.8807641414902726e-05, + "loss": 0.3605, + "step": 3692 + }, + { + "epoch": 0.3478957160688632, + "grad_norm": 0.7621707320213318, + "learning_rate": 1.8806926237569566e-05, + "loss": 0.3313, + "step": 3693 + }, + { + "epoch": 0.34798992016203106, + "grad_norm": 0.8039724230766296, + "learning_rate": 1.880621085942487e-05, + "loss": 0.3815, + "step": 3694 + }, + { + "epoch": 0.3480841242551989, + "grad_norm": 0.9030138850212097, + "learning_rate": 1.880549528048495e-05, + "loss": 0.3179, + "step": 3695 + }, + { + "epoch": 0.34817832834836676, + "grad_norm": 0.7929380536079407, + "learning_rate": 1.8804779500766118e-05, + "loss": 0.3375, + "step": 3696 + }, + { + "epoch": 0.3482725324415346, + "grad_norm": 0.8301117420196533, + "learning_rate": 1.8804063520284693e-05, + "loss": 0.3539, + "step": 3697 + }, + { + "epoch": 0.34836673653470246, + "grad_norm": 0.8487005233764648, + "learning_rate": 1.8803347339057008e-05, + "loss": 0.3629, + "step": 3698 + }, + { + "epoch": 0.3484609406278703, + "grad_norm": 0.8481366634368896, + "learning_rate": 1.8802630957099387e-05, + "loss": 0.3991, + "step": 3699 + }, + { + "epoch": 0.3485551447210381, + "grad_norm": 0.8308368921279907, + "learning_rate": 1.880191437442817e-05, + "loss": 0.3254, + "step": 3700 + }, + { + "epoch": 0.34864934881420595, + "grad_norm": 1.0492184162139893, + "learning_rate": 1.8801197591059686e-05, + "loss": 0.3924, + "step": 3701 + }, + { + "epoch": 0.3487435529073738, + "grad_norm": 0.9140027761459351, + "learning_rate": 1.880048060701029e-05, + "loss": 0.3779, + "step": 3702 + }, + { + "epoch": 0.34883775700054165, + "grad_norm": 0.9277212023735046, + "learning_rate": 1.8799763422296327e-05, + "loss": 0.4054, + "step": 3703 + }, + { + "epoch": 0.3489319610937095, + "grad_norm": 0.7970870137214661, + "learning_rate": 1.8799046036934147e-05, + "loss": 0.3884, + "step": 3704 + }, + { + "epoch": 0.34902616518687735, + "grad_norm": 0.8798114657402039, + "learning_rate": 1.879832845094011e-05, + "loss": 0.3645, + "step": 3705 + }, + { + "epoch": 0.3491203692800452, + "grad_norm": 0.7972387075424194, + "learning_rate": 1.8797610664330578e-05, + "loss": 0.3876, + "step": 3706 + }, + { + "epoch": 0.34921457337321304, + "grad_norm": 0.8588185906410217, + "learning_rate": 1.8796892677121917e-05, + "loss": 0.3539, + "step": 3707 + }, + { + "epoch": 0.3493087774663809, + "grad_norm": 0.8626374006271362, + "learning_rate": 1.8796174489330497e-05, + "loss": 0.4001, + "step": 3708 + }, + { + "epoch": 0.34940298155954874, + "grad_norm": 0.8604520559310913, + "learning_rate": 1.8795456100972697e-05, + "loss": 0.3593, + "step": 3709 + }, + { + "epoch": 0.3494971856527166, + "grad_norm": 0.8322687745094299, + "learning_rate": 1.879473751206489e-05, + "loss": 0.3562, + "step": 3710 + }, + { + "epoch": 0.34959138974588444, + "grad_norm": 0.817677915096283, + "learning_rate": 1.879401872262347e-05, + "loss": 0.3697, + "step": 3711 + }, + { + "epoch": 0.3496855938390523, + "grad_norm": 0.7093011736869812, + "learning_rate": 1.8793299732664826e-05, + "loss": 0.3279, + "step": 3712 + }, + { + "epoch": 0.34977979793222014, + "grad_norm": 0.9430680274963379, + "learning_rate": 1.8792580542205348e-05, + "loss": 0.3841, + "step": 3713 + }, + { + "epoch": 0.349874002025388, + "grad_norm": 0.8343164920806885, + "learning_rate": 1.879186115126143e-05, + "loss": 0.3902, + "step": 3714 + }, + { + "epoch": 0.34996820611855584, + "grad_norm": 0.821296751499176, + "learning_rate": 1.879114155984949e-05, + "loss": 0.4154, + "step": 3715 + }, + { + "epoch": 0.3500624102117237, + "grad_norm": 0.7688080668449402, + "learning_rate": 1.879042176798592e-05, + "loss": 0.3671, + "step": 3716 + }, + { + "epoch": 0.35015661430489153, + "grad_norm": 0.6696940064430237, + "learning_rate": 1.8789701775687142e-05, + "loss": 0.3031, + "step": 3717 + }, + { + "epoch": 0.3502508183980594, + "grad_norm": 0.8230246305465698, + "learning_rate": 1.878898158296957e-05, + "loss": 0.3715, + "step": 3718 + }, + { + "epoch": 0.35034502249122723, + "grad_norm": 0.8132197260856628, + "learning_rate": 1.8788261189849622e-05, + "loss": 0.3505, + "step": 3719 + }, + { + "epoch": 0.3504392265843951, + "grad_norm": 0.8106905221939087, + "learning_rate": 1.878754059634373e-05, + "loss": 0.347, + "step": 3720 + }, + { + "epoch": 0.35053343067756293, + "grad_norm": 0.9174231886863708, + "learning_rate": 1.878681980246832e-05, + "loss": 0.371, + "step": 3721 + }, + { + "epoch": 0.3506276347707308, + "grad_norm": 0.9030629396438599, + "learning_rate": 1.8786098808239832e-05, + "loss": 0.3696, + "step": 3722 + }, + { + "epoch": 0.3507218388638986, + "grad_norm": 0.9923394918441772, + "learning_rate": 1.8785377613674705e-05, + "loss": 0.4036, + "step": 3723 + }, + { + "epoch": 0.3508160429570665, + "grad_norm": 1.898908257484436, + "learning_rate": 1.878465621878938e-05, + "loss": 0.3972, + "step": 3724 + }, + { + "epoch": 0.3509102470502343, + "grad_norm": 1.493820309638977, + "learning_rate": 1.8783934623600307e-05, + "loss": 0.3234, + "step": 3725 + }, + { + "epoch": 0.3510044511434022, + "grad_norm": 0.7513868808746338, + "learning_rate": 1.878321282812394e-05, + "loss": 0.347, + "step": 3726 + }, + { + "epoch": 0.35109865523657, + "grad_norm": 0.8554733395576477, + "learning_rate": 1.8782490832376737e-05, + "loss": 0.3507, + "step": 3727 + }, + { + "epoch": 0.35119285932973787, + "grad_norm": 0.8039769530296326, + "learning_rate": 1.8781768636375158e-05, + "loss": 0.3337, + "step": 3728 + }, + { + "epoch": 0.3512870634229057, + "grad_norm": 0.7684638500213623, + "learning_rate": 1.8781046240135678e-05, + "loss": 0.3307, + "step": 3729 + }, + { + "epoch": 0.35138126751607357, + "grad_norm": 0.8677926659584045, + "learning_rate": 1.878032364367476e-05, + "loss": 0.3922, + "step": 3730 + }, + { + "epoch": 0.3514754716092414, + "grad_norm": 0.8814738392829895, + "learning_rate": 1.8779600847008884e-05, + "loss": 0.3616, + "step": 3731 + }, + { + "epoch": 0.35156967570240927, + "grad_norm": 0.7777526378631592, + "learning_rate": 1.8778877850154532e-05, + "loss": 0.3802, + "step": 3732 + }, + { + "epoch": 0.3516638797955771, + "grad_norm": 0.8300305008888245, + "learning_rate": 1.8778154653128186e-05, + "loss": 0.3725, + "step": 3733 + }, + { + "epoch": 0.35175808388874497, + "grad_norm": 0.8085060715675354, + "learning_rate": 1.8777431255946342e-05, + "loss": 0.3436, + "step": 3734 + }, + { + "epoch": 0.3518522879819128, + "grad_norm": 0.8895578980445862, + "learning_rate": 1.877670765862549e-05, + "loss": 0.354, + "step": 3735 + }, + { + "epoch": 0.35194649207508066, + "grad_norm": 0.7110785245895386, + "learning_rate": 1.8775983861182126e-05, + "loss": 0.318, + "step": 3736 + }, + { + "epoch": 0.3520406961682485, + "grad_norm": 0.8436791896820068, + "learning_rate": 1.877525986363276e-05, + "loss": 0.3857, + "step": 3737 + }, + { + "epoch": 0.35213490026141636, + "grad_norm": 0.7469220757484436, + "learning_rate": 1.87745356659939e-05, + "loss": 0.374, + "step": 3738 + }, + { + "epoch": 0.3522291043545842, + "grad_norm": 0.8427312970161438, + "learning_rate": 1.8773811268282053e-05, + "loss": 0.3268, + "step": 3739 + }, + { + "epoch": 0.35232330844775206, + "grad_norm": 0.7238284945487976, + "learning_rate": 1.877308667051374e-05, + "loss": 0.296, + "step": 3740 + }, + { + "epoch": 0.3524175125409199, + "grad_norm": 0.8895593285560608, + "learning_rate": 1.8772361872705486e-05, + "loss": 0.3594, + "step": 3741 + }, + { + "epoch": 0.35251171663408776, + "grad_norm": 0.8994559645652771, + "learning_rate": 1.8771636874873812e-05, + "loss": 0.3459, + "step": 3742 + }, + { + "epoch": 0.3526059207272556, + "grad_norm": 0.9097210764884949, + "learning_rate": 1.8770911677035254e-05, + "loss": 0.4051, + "step": 3743 + }, + { + "epoch": 0.35270012482042346, + "grad_norm": 0.7662896513938904, + "learning_rate": 1.877018627920635e-05, + "loss": 0.3662, + "step": 3744 + }, + { + "epoch": 0.3527943289135913, + "grad_norm": 0.916433572769165, + "learning_rate": 1.8769460681403628e-05, + "loss": 0.3969, + "step": 3745 + }, + { + "epoch": 0.35288853300675915, + "grad_norm": 0.8712926506996155, + "learning_rate": 1.876873488364364e-05, + "loss": 0.3913, + "step": 3746 + }, + { + "epoch": 0.352982737099927, + "grad_norm": 0.9781820774078369, + "learning_rate": 1.876800888594294e-05, + "loss": 0.3743, + "step": 3747 + }, + { + "epoch": 0.35307694119309485, + "grad_norm": 0.7476435303688049, + "learning_rate": 1.8767282688318075e-05, + "loss": 0.3283, + "step": 3748 + }, + { + "epoch": 0.3531711452862627, + "grad_norm": 0.7212189435958862, + "learning_rate": 1.8766556290785605e-05, + "loss": 0.32, + "step": 3749 + }, + { + "epoch": 0.35326534937943055, + "grad_norm": 0.9467104077339172, + "learning_rate": 1.8765829693362097e-05, + "loss": 0.3758, + "step": 3750 + }, + { + "epoch": 0.3533595534725984, + "grad_norm": 0.8207167983055115, + "learning_rate": 1.876510289606411e-05, + "loss": 0.3419, + "step": 3751 + }, + { + "epoch": 0.35345375756576625, + "grad_norm": 0.8008447289466858, + "learning_rate": 1.8764375898908228e-05, + "loss": 0.3645, + "step": 3752 + }, + { + "epoch": 0.3535479616589341, + "grad_norm": 0.7681457996368408, + "learning_rate": 1.876364870191102e-05, + "loss": 0.3415, + "step": 3753 + }, + { + "epoch": 0.35364216575210194, + "grad_norm": 0.8857048153877258, + "learning_rate": 1.8762921305089064e-05, + "loss": 0.3442, + "step": 3754 + }, + { + "epoch": 0.3537363698452698, + "grad_norm": 0.8540251851081848, + "learning_rate": 1.8762193708458952e-05, + "loss": 0.3576, + "step": 3755 + }, + { + "epoch": 0.35383057393843764, + "grad_norm": 0.9321168065071106, + "learning_rate": 1.8761465912037272e-05, + "loss": 0.4115, + "step": 3756 + }, + { + "epoch": 0.3539247780316055, + "grad_norm": 0.9076574444770813, + "learning_rate": 1.876073791584062e-05, + "loss": 0.3748, + "step": 3757 + }, + { + "epoch": 0.35401898212477334, + "grad_norm": 0.8139601945877075, + "learning_rate": 1.8760009719885595e-05, + "loss": 0.3246, + "step": 3758 + }, + { + "epoch": 0.3541131862179412, + "grad_norm": 0.9335690140724182, + "learning_rate": 1.87592813241888e-05, + "loss": 0.3616, + "step": 3759 + }, + { + "epoch": 0.35420739031110904, + "grad_norm": 0.7616863250732422, + "learning_rate": 1.8758552728766845e-05, + "loss": 0.3594, + "step": 3760 + }, + { + "epoch": 0.3543015944042769, + "grad_norm": 0.7629071474075317, + "learning_rate": 1.875782393363634e-05, + "loss": 0.3384, + "step": 3761 + }, + { + "epoch": 0.35439579849744474, + "grad_norm": 0.912653386592865, + "learning_rate": 1.875709493881391e-05, + "loss": 0.3394, + "step": 3762 + }, + { + "epoch": 0.3544900025906126, + "grad_norm": 0.9390417337417603, + "learning_rate": 1.8756365744316172e-05, + "loss": 0.3263, + "step": 3763 + }, + { + "epoch": 0.35458420668378043, + "grad_norm": 0.8299874663352966, + "learning_rate": 1.875563635015975e-05, + "loss": 0.338, + "step": 3764 + }, + { + "epoch": 0.3546784107769483, + "grad_norm": 0.865662157535553, + "learning_rate": 1.875490675636128e-05, + "loss": 0.395, + "step": 3765 + }, + { + "epoch": 0.35477261487011613, + "grad_norm": 0.7513821125030518, + "learning_rate": 1.8754176962937397e-05, + "loss": 0.3485, + "step": 3766 + }, + { + "epoch": 0.354866818963284, + "grad_norm": 0.8353309035301208, + "learning_rate": 1.8753446969904737e-05, + "loss": 0.3605, + "step": 3767 + }, + { + "epoch": 0.35496102305645183, + "grad_norm": 0.8011833429336548, + "learning_rate": 1.8752716777279955e-05, + "loss": 0.357, + "step": 3768 + }, + { + "epoch": 0.3550552271496196, + "grad_norm": 0.7678676247596741, + "learning_rate": 1.8751986385079685e-05, + "loss": 0.3138, + "step": 3769 + }, + { + "epoch": 0.35514943124278747, + "grad_norm": 0.75164395570755, + "learning_rate": 1.8751255793320597e-05, + "loss": 0.3262, + "step": 3770 + }, + { + "epoch": 0.3552436353359553, + "grad_norm": 0.7251051068305969, + "learning_rate": 1.875052500201934e-05, + "loss": 0.3536, + "step": 3771 + }, + { + "epoch": 0.35533783942912317, + "grad_norm": 1.1679542064666748, + "learning_rate": 1.874979401119258e-05, + "loss": 0.3832, + "step": 3772 + }, + { + "epoch": 0.355432043522291, + "grad_norm": 0.910009503364563, + "learning_rate": 1.8749062820856986e-05, + "loss": 0.3567, + "step": 3773 + }, + { + "epoch": 0.35552624761545887, + "grad_norm": 0.8140091300010681, + "learning_rate": 1.8748331431029232e-05, + "loss": 0.3485, + "step": 3774 + }, + { + "epoch": 0.3556204517086267, + "grad_norm": 0.8223723769187927, + "learning_rate": 1.8747599841725988e-05, + "loss": 0.3367, + "step": 3775 + }, + { + "epoch": 0.35571465580179457, + "grad_norm": 0.8308326005935669, + "learning_rate": 1.8746868052963938e-05, + "loss": 0.3205, + "step": 3776 + }, + { + "epoch": 0.3558088598949624, + "grad_norm": 0.8465903401374817, + "learning_rate": 1.8746136064759774e-05, + "loss": 0.3395, + "step": 3777 + }, + { + "epoch": 0.35590306398813026, + "grad_norm": 0.7565946578979492, + "learning_rate": 1.8745403877130176e-05, + "loss": 0.2984, + "step": 3778 + }, + { + "epoch": 0.3559972680812981, + "grad_norm": 0.9386110901832581, + "learning_rate": 1.8744671490091848e-05, + "loss": 0.4277, + "step": 3779 + }, + { + "epoch": 0.35609147217446596, + "grad_norm": 0.7682352662086487, + "learning_rate": 1.8743938903661487e-05, + "loss": 0.3544, + "step": 3780 + }, + { + "epoch": 0.3561856762676338, + "grad_norm": 0.8476884961128235, + "learning_rate": 1.8743206117855792e-05, + "loss": 0.3586, + "step": 3781 + }, + { + "epoch": 0.35627988036080166, + "grad_norm": 0.8152371048927307, + "learning_rate": 1.874247313269148e-05, + "loss": 0.3158, + "step": 3782 + }, + { + "epoch": 0.3563740844539695, + "grad_norm": 0.6988675594329834, + "learning_rate": 1.8741739948185256e-05, + "loss": 0.3067, + "step": 3783 + }, + { + "epoch": 0.35646828854713736, + "grad_norm": 0.844517171382904, + "learning_rate": 1.8741006564353847e-05, + "loss": 0.3534, + "step": 3784 + }, + { + "epoch": 0.3565624926403052, + "grad_norm": 0.768746554851532, + "learning_rate": 1.8740272981213965e-05, + "loss": 0.333, + "step": 3785 + }, + { + "epoch": 0.35665669673347306, + "grad_norm": 0.9396389722824097, + "learning_rate": 1.8739539198782345e-05, + "loss": 0.362, + "step": 3786 + }, + { + "epoch": 0.3567509008266409, + "grad_norm": 0.8951597809791565, + "learning_rate": 1.8738805217075714e-05, + "loss": 0.3854, + "step": 3787 + }, + { + "epoch": 0.35684510491980875, + "grad_norm": 0.8619693517684937, + "learning_rate": 1.873807103611081e-05, + "loss": 0.397, + "step": 3788 + }, + { + "epoch": 0.3569393090129766, + "grad_norm": 0.8142619132995605, + "learning_rate": 1.8737336655904374e-05, + "loss": 0.3517, + "step": 3789 + }, + { + "epoch": 0.35703351310614445, + "grad_norm": 0.7423619627952576, + "learning_rate": 1.8736602076473148e-05, + "loss": 0.334, + "step": 3790 + }, + { + "epoch": 0.3571277171993123, + "grad_norm": 0.8979628682136536, + "learning_rate": 1.8735867297833882e-05, + "loss": 0.354, + "step": 3791 + }, + { + "epoch": 0.35722192129248015, + "grad_norm": 0.9283548593521118, + "learning_rate": 1.8735132320003332e-05, + "loss": 0.4025, + "step": 3792 + }, + { + "epoch": 0.357316125385648, + "grad_norm": 0.7603616714477539, + "learning_rate": 1.8734397142998257e-05, + "loss": 0.3707, + "step": 3793 + }, + { + "epoch": 0.35741032947881585, + "grad_norm": 0.8556933403015137, + "learning_rate": 1.8733661766835417e-05, + "loss": 0.3974, + "step": 3794 + }, + { + "epoch": 0.3575045335719837, + "grad_norm": 0.872689962387085, + "learning_rate": 1.8732926191531584e-05, + "loss": 0.3476, + "step": 3795 + }, + { + "epoch": 0.35759873766515154, + "grad_norm": 0.827884316444397, + "learning_rate": 1.873219041710353e-05, + "loss": 0.3828, + "step": 3796 + }, + { + "epoch": 0.3576929417583194, + "grad_norm": 0.771144449710846, + "learning_rate": 1.8731454443568026e-05, + "loss": 0.3358, + "step": 3797 + }, + { + "epoch": 0.35778714585148724, + "grad_norm": 0.8942975401878357, + "learning_rate": 1.873071827094186e-05, + "loss": 0.357, + "step": 3798 + }, + { + "epoch": 0.3578813499446551, + "grad_norm": 0.8680719137191772, + "learning_rate": 1.8729981899241815e-05, + "loss": 0.3558, + "step": 3799 + }, + { + "epoch": 0.35797555403782294, + "grad_norm": 0.820220947265625, + "learning_rate": 1.872924532848468e-05, + "loss": 0.3441, + "step": 3800 + }, + { + "epoch": 0.3580697581309908, + "grad_norm": 0.8636904358863831, + "learning_rate": 1.872850855868725e-05, + "loss": 0.3445, + "step": 3801 + }, + { + "epoch": 0.35816396222415864, + "grad_norm": 0.816992998123169, + "learning_rate": 1.872777158986633e-05, + "loss": 0.3766, + "step": 3802 + }, + { + "epoch": 0.3582581663173265, + "grad_norm": 0.9569509029388428, + "learning_rate": 1.8727034422038718e-05, + "loss": 0.3857, + "step": 3803 + }, + { + "epoch": 0.35835237041049434, + "grad_norm": 0.9457529187202454, + "learning_rate": 1.8726297055221224e-05, + "loss": 0.37, + "step": 3804 + }, + { + "epoch": 0.3584465745036622, + "grad_norm": 0.7957568168640137, + "learning_rate": 1.8725559489430664e-05, + "loss": 0.3436, + "step": 3805 + }, + { + "epoch": 0.35854077859683003, + "grad_norm": 0.8334015011787415, + "learning_rate": 1.8724821724683847e-05, + "loss": 0.3364, + "step": 3806 + }, + { + "epoch": 0.3586349826899979, + "grad_norm": 0.8348851799964905, + "learning_rate": 1.8724083760997608e-05, + "loss": 0.319, + "step": 3807 + }, + { + "epoch": 0.35872918678316573, + "grad_norm": 0.840887725353241, + "learning_rate": 1.8723345598388764e-05, + "loss": 0.372, + "step": 3808 + }, + { + "epoch": 0.3588233908763336, + "grad_norm": 0.9339392185211182, + "learning_rate": 1.8722607236874155e-05, + "loss": 0.3722, + "step": 3809 + }, + { + "epoch": 0.35891759496950143, + "grad_norm": 0.7736247181892395, + "learning_rate": 1.8721868676470602e-05, + "loss": 0.3575, + "step": 3810 + }, + { + "epoch": 0.3590117990626693, + "grad_norm": 0.8696429133415222, + "learning_rate": 1.8721129917194962e-05, + "loss": 0.3061, + "step": 3811 + }, + { + "epoch": 0.35910600315583713, + "grad_norm": 0.7258344888687134, + "learning_rate": 1.872039095906407e-05, + "loss": 0.3227, + "step": 3812 + }, + { + "epoch": 0.359200207249005, + "grad_norm": 0.7984716296195984, + "learning_rate": 1.8719651802094775e-05, + "loss": 0.3121, + "step": 3813 + }, + { + "epoch": 0.3592944113421728, + "grad_norm": 0.8016639351844788, + "learning_rate": 1.8718912446303938e-05, + "loss": 0.3661, + "step": 3814 + }, + { + "epoch": 0.3593886154353407, + "grad_norm": 0.858106255531311, + "learning_rate": 1.8718172891708412e-05, + "loss": 0.3631, + "step": 3815 + }, + { + "epoch": 0.3594828195285085, + "grad_norm": 0.7816081047058105, + "learning_rate": 1.871743313832506e-05, + "loss": 0.354, + "step": 3816 + }, + { + "epoch": 0.3595770236216764, + "grad_norm": 0.8900384306907654, + "learning_rate": 1.8716693186170748e-05, + "loss": 0.377, + "step": 3817 + }, + { + "epoch": 0.3596712277148442, + "grad_norm": 0.8501194715499878, + "learning_rate": 1.8715953035262354e-05, + "loss": 0.3507, + "step": 3818 + }, + { + "epoch": 0.35976543180801207, + "grad_norm": 0.9237880706787109, + "learning_rate": 1.871521268561675e-05, + "loss": 0.3818, + "step": 3819 + }, + { + "epoch": 0.3598596359011799, + "grad_norm": 0.8632811307907104, + "learning_rate": 1.8714472137250822e-05, + "loss": 0.3362, + "step": 3820 + }, + { + "epoch": 0.35995383999434777, + "grad_norm": 0.8760498762130737, + "learning_rate": 1.871373139018145e-05, + "loss": 0.3772, + "step": 3821 + }, + { + "epoch": 0.3600480440875156, + "grad_norm": 0.7853875160217285, + "learning_rate": 1.8712990444425527e-05, + "loss": 0.3438, + "step": 3822 + }, + { + "epoch": 0.36014224818068347, + "grad_norm": 0.9706776142120361, + "learning_rate": 1.8712249299999948e-05, + "loss": 0.3318, + "step": 3823 + }, + { + "epoch": 0.3602364522738513, + "grad_norm": 1.0395607948303223, + "learning_rate": 1.871150795692161e-05, + "loss": 0.3257, + "step": 3824 + }, + { + "epoch": 0.36033065636701916, + "grad_norm": 0.817263662815094, + "learning_rate": 1.8710766415207417e-05, + "loss": 0.3363, + "step": 3825 + }, + { + "epoch": 0.360424860460187, + "grad_norm": 0.7719822525978088, + "learning_rate": 1.8710024674874278e-05, + "loss": 0.3349, + "step": 3826 + }, + { + "epoch": 0.36051906455335486, + "grad_norm": 0.7931634187698364, + "learning_rate": 1.8709282735939106e-05, + "loss": 0.3832, + "step": 3827 + }, + { + "epoch": 0.3606132686465227, + "grad_norm": 0.7968805432319641, + "learning_rate": 1.8708540598418818e-05, + "loss": 0.3211, + "step": 3828 + }, + { + "epoch": 0.36070747273969056, + "grad_norm": 0.9289378523826599, + "learning_rate": 1.8707798262330337e-05, + "loss": 0.4267, + "step": 3829 + }, + { + "epoch": 0.3608016768328584, + "grad_norm": 0.7175437808036804, + "learning_rate": 1.8707055727690592e-05, + "loss": 0.2979, + "step": 3830 + }, + { + "epoch": 0.36089588092602626, + "grad_norm": 0.7979193329811096, + "learning_rate": 1.8706312994516508e-05, + "loss": 0.3269, + "step": 3831 + }, + { + "epoch": 0.3609900850191941, + "grad_norm": 0.7682902812957764, + "learning_rate": 1.870557006282502e-05, + "loss": 0.3194, + "step": 3832 + }, + { + "epoch": 0.36108428911236196, + "grad_norm": 0.7294326424598694, + "learning_rate": 1.8704826932633072e-05, + "loss": 0.3405, + "step": 3833 + }, + { + "epoch": 0.3611784932055298, + "grad_norm": 0.8687989711761475, + "learning_rate": 1.870408360395761e-05, + "loss": 0.3971, + "step": 3834 + }, + { + "epoch": 0.36127269729869765, + "grad_norm": 0.8161478042602539, + "learning_rate": 1.8703340076815585e-05, + "loss": 0.3438, + "step": 3835 + }, + { + "epoch": 0.3613669013918655, + "grad_norm": 0.8905363082885742, + "learning_rate": 1.8702596351223938e-05, + "loss": 0.388, + "step": 3836 + }, + { + "epoch": 0.36146110548503335, + "grad_norm": 0.796614408493042, + "learning_rate": 1.870185242719964e-05, + "loss": 0.2826, + "step": 3837 + }, + { + "epoch": 0.3615553095782012, + "grad_norm": 0.8235340714454651, + "learning_rate": 1.870110830475965e-05, + "loss": 0.3215, + "step": 3838 + }, + { + "epoch": 0.361649513671369, + "grad_norm": 0.8155031204223633, + "learning_rate": 1.870036398392093e-05, + "loss": 0.3472, + "step": 3839 + }, + { + "epoch": 0.36174371776453684, + "grad_norm": 0.8262405395507812, + "learning_rate": 1.869961946470046e-05, + "loss": 0.3223, + "step": 3840 + }, + { + "epoch": 0.3618379218577047, + "grad_norm": 0.762555718421936, + "learning_rate": 1.8698874747115206e-05, + "loss": 0.3327, + "step": 3841 + }, + { + "epoch": 0.36193212595087254, + "grad_norm": 0.8004673719406128, + "learning_rate": 1.8698129831182165e-05, + "loss": 0.3517, + "step": 3842 + }, + { + "epoch": 0.3620263300440404, + "grad_norm": 0.9113042950630188, + "learning_rate": 1.86973847169183e-05, + "loss": 0.3716, + "step": 3843 + }, + { + "epoch": 0.36212053413720824, + "grad_norm": 0.9958996772766113, + "learning_rate": 1.869663940434062e-05, + "loss": 0.3498, + "step": 3844 + }, + { + "epoch": 0.3622147382303761, + "grad_norm": 0.8684587478637695, + "learning_rate": 1.869589389346611e-05, + "loss": 0.3915, + "step": 3845 + }, + { + "epoch": 0.36230894232354394, + "grad_norm": 0.7987110614776611, + "learning_rate": 1.8695148184311772e-05, + "loss": 0.3386, + "step": 3846 + }, + { + "epoch": 0.3624031464167118, + "grad_norm": 0.87180495262146, + "learning_rate": 1.8694402276894607e-05, + "loss": 0.3813, + "step": 3847 + }, + { + "epoch": 0.36249735050987963, + "grad_norm": 1.040747046470642, + "learning_rate": 1.8693656171231623e-05, + "loss": 0.3869, + "step": 3848 + }, + { + "epoch": 0.3625915546030475, + "grad_norm": 0.8927814960479736, + "learning_rate": 1.8692909867339834e-05, + "loss": 0.3794, + "step": 3849 + }, + { + "epoch": 0.36268575869621533, + "grad_norm": 0.8000523447990417, + "learning_rate": 1.869216336523626e-05, + "loss": 0.3241, + "step": 3850 + }, + { + "epoch": 0.3627799627893832, + "grad_norm": 1.3600900173187256, + "learning_rate": 1.8691416664937915e-05, + "loss": 0.3616, + "step": 3851 + }, + { + "epoch": 0.36287416688255103, + "grad_norm": 0.79920494556427, + "learning_rate": 1.8690669766461827e-05, + "loss": 0.3241, + "step": 3852 + }, + { + "epoch": 0.3629683709757189, + "grad_norm": 0.8300445079803467, + "learning_rate": 1.868992266982503e-05, + "loss": 0.3844, + "step": 3853 + }, + { + "epoch": 0.36306257506888673, + "grad_norm": 0.8212257027626038, + "learning_rate": 1.8689175375044558e-05, + "loss": 0.3618, + "step": 3854 + }, + { + "epoch": 0.3631567791620546, + "grad_norm": 0.7338040471076965, + "learning_rate": 1.8688427882137448e-05, + "loss": 0.3252, + "step": 3855 + }, + { + "epoch": 0.3632509832552224, + "grad_norm": 0.8911314010620117, + "learning_rate": 1.8687680191120746e-05, + "loss": 0.373, + "step": 3856 + }, + { + "epoch": 0.3633451873483903, + "grad_norm": 0.8007513880729675, + "learning_rate": 1.8686932302011498e-05, + "loss": 0.324, + "step": 3857 + }, + { + "epoch": 0.3634393914415581, + "grad_norm": 0.8686206936836243, + "learning_rate": 1.868618421482676e-05, + "loss": 0.346, + "step": 3858 + }, + { + "epoch": 0.363533595534726, + "grad_norm": 0.9042792320251465, + "learning_rate": 1.8685435929583587e-05, + "loss": 0.3793, + "step": 3859 + }, + { + "epoch": 0.3636277996278938, + "grad_norm": 0.8040450811386108, + "learning_rate": 1.8684687446299046e-05, + "loss": 0.3419, + "step": 3860 + }, + { + "epoch": 0.36372200372106167, + "grad_norm": 0.9083951115608215, + "learning_rate": 1.8683938764990196e-05, + "loss": 0.3588, + "step": 3861 + }, + { + "epoch": 0.3638162078142295, + "grad_norm": 0.8532748222351074, + "learning_rate": 1.8683189885674117e-05, + "loss": 0.3637, + "step": 3862 + }, + { + "epoch": 0.36391041190739737, + "grad_norm": 1.0092157125473022, + "learning_rate": 1.8682440808367874e-05, + "loss": 0.4086, + "step": 3863 + }, + { + "epoch": 0.3640046160005652, + "grad_norm": 0.7411501407623291, + "learning_rate": 1.868169153308856e-05, + "loss": 0.3316, + "step": 3864 + }, + { + "epoch": 0.36409882009373307, + "grad_norm": 0.8762557506561279, + "learning_rate": 1.8680942059853246e-05, + "loss": 0.3776, + "step": 3865 + }, + { + "epoch": 0.3641930241869009, + "grad_norm": 0.755689799785614, + "learning_rate": 1.868019238867903e-05, + "loss": 0.3312, + "step": 3866 + }, + { + "epoch": 0.36428722828006876, + "grad_norm": 0.786252498626709, + "learning_rate": 1.8679442519583004e-05, + "loss": 0.3425, + "step": 3867 + }, + { + "epoch": 0.3643814323732366, + "grad_norm": 0.833122193813324, + "learning_rate": 1.8678692452582266e-05, + "loss": 0.3558, + "step": 3868 + }, + { + "epoch": 0.36447563646640446, + "grad_norm": 0.8415796756744385, + "learning_rate": 1.8677942187693917e-05, + "loss": 0.3323, + "step": 3869 + }, + { + "epoch": 0.3645698405595723, + "grad_norm": 0.8360006213188171, + "learning_rate": 1.8677191724935066e-05, + "loss": 0.3747, + "step": 3870 + }, + { + "epoch": 0.36466404465274016, + "grad_norm": 0.8955270051956177, + "learning_rate": 1.8676441064322827e-05, + "loss": 0.3843, + "step": 3871 + }, + { + "epoch": 0.364758248745908, + "grad_norm": 0.8831353187561035, + "learning_rate": 1.8675690205874306e-05, + "loss": 0.3441, + "step": 3872 + }, + { + "epoch": 0.36485245283907586, + "grad_norm": 0.7926917672157288, + "learning_rate": 1.867493914960664e-05, + "loss": 0.3264, + "step": 3873 + }, + { + "epoch": 0.3649466569322437, + "grad_norm": 0.8308305740356445, + "learning_rate": 1.867418789553694e-05, + "loss": 0.3619, + "step": 3874 + }, + { + "epoch": 0.36504086102541156, + "grad_norm": 0.8902815580368042, + "learning_rate": 1.867343644368234e-05, + "loss": 0.377, + "step": 3875 + }, + { + "epoch": 0.3651350651185794, + "grad_norm": 0.8571057319641113, + "learning_rate": 1.867268479405998e-05, + "loss": 0.348, + "step": 3876 + }, + { + "epoch": 0.36522926921174725, + "grad_norm": 1.425744652748108, + "learning_rate": 1.8671932946686988e-05, + "loss": 0.3631, + "step": 3877 + }, + { + "epoch": 0.3653234733049151, + "grad_norm": 0.7357031106948853, + "learning_rate": 1.8671180901580518e-05, + "loss": 0.3295, + "step": 3878 + }, + { + "epoch": 0.36541767739808295, + "grad_norm": 0.892597496509552, + "learning_rate": 1.867042865875771e-05, + "loss": 0.3744, + "step": 3879 + }, + { + "epoch": 0.3655118814912508, + "grad_norm": 0.8237472176551819, + "learning_rate": 1.866967621823572e-05, + "loss": 0.3421, + "step": 3880 + }, + { + "epoch": 0.36560608558441865, + "grad_norm": 0.8594658374786377, + "learning_rate": 1.866892358003171e-05, + "loss": 0.3364, + "step": 3881 + }, + { + "epoch": 0.3657002896775865, + "grad_norm": 0.9222422242164612, + "learning_rate": 1.866817074416283e-05, + "loss": 0.3701, + "step": 3882 + }, + { + "epoch": 0.36579449377075435, + "grad_norm": 0.8643504977226257, + "learning_rate": 1.8667417710646253e-05, + "loss": 0.3472, + "step": 3883 + }, + { + "epoch": 0.3658886978639222, + "grad_norm": 0.9093027114868164, + "learning_rate": 1.8666664479499148e-05, + "loss": 0.3786, + "step": 3884 + }, + { + "epoch": 0.36598290195709005, + "grad_norm": 0.8560269474983215, + "learning_rate": 1.866591105073869e-05, + "loss": 0.3382, + "step": 3885 + }, + { + "epoch": 0.3660771060502579, + "grad_norm": 0.8399043679237366, + "learning_rate": 1.8665157424382058e-05, + "loss": 0.3606, + "step": 3886 + }, + { + "epoch": 0.36617131014342574, + "grad_norm": 0.9316064119338989, + "learning_rate": 1.8664403600446435e-05, + "loss": 0.4022, + "step": 3887 + }, + { + "epoch": 0.3662655142365936, + "grad_norm": 0.8026871681213379, + "learning_rate": 1.8663649578949008e-05, + "loss": 0.345, + "step": 3888 + }, + { + "epoch": 0.36635971832976144, + "grad_norm": 0.763960599899292, + "learning_rate": 1.8662895359906973e-05, + "loss": 0.346, + "step": 3889 + }, + { + "epoch": 0.3664539224229293, + "grad_norm": 0.9255404472351074, + "learning_rate": 1.866214094333753e-05, + "loss": 0.3639, + "step": 3890 + }, + { + "epoch": 0.36654812651609714, + "grad_norm": 0.7488731145858765, + "learning_rate": 1.8661386329257874e-05, + "loss": 0.3219, + "step": 3891 + }, + { + "epoch": 0.366642330609265, + "grad_norm": 0.949048638343811, + "learning_rate": 1.8660631517685216e-05, + "loss": 0.4642, + "step": 3892 + }, + { + "epoch": 0.36673653470243284, + "grad_norm": 0.9563046097755432, + "learning_rate": 1.8659876508636764e-05, + "loss": 0.3703, + "step": 3893 + }, + { + "epoch": 0.3668307387956007, + "grad_norm": 0.8219785690307617, + "learning_rate": 1.8659121302129737e-05, + "loss": 0.3185, + "step": 3894 + }, + { + "epoch": 0.36692494288876853, + "grad_norm": 0.8395075798034668, + "learning_rate": 1.8658365898181352e-05, + "loss": 0.3503, + "step": 3895 + }, + { + "epoch": 0.3670191469819364, + "grad_norm": 0.8385910987854004, + "learning_rate": 1.8657610296808832e-05, + "loss": 0.3574, + "step": 3896 + }, + { + "epoch": 0.36711335107510423, + "grad_norm": 0.7934730052947998, + "learning_rate": 1.8656854498029408e-05, + "loss": 0.3486, + "step": 3897 + }, + { + "epoch": 0.3672075551682721, + "grad_norm": 0.7972601652145386, + "learning_rate": 1.8656098501860316e-05, + "loss": 0.3216, + "step": 3898 + }, + { + "epoch": 0.36730175926143993, + "grad_norm": 0.8734428882598877, + "learning_rate": 1.8655342308318792e-05, + "loss": 0.3677, + "step": 3899 + }, + { + "epoch": 0.3673959633546078, + "grad_norm": 0.8547226190567017, + "learning_rate": 1.8654585917422075e-05, + "loss": 0.363, + "step": 3900 + }, + { + "epoch": 0.36749016744777563, + "grad_norm": 0.8113787770271301, + "learning_rate": 1.8653829329187415e-05, + "loss": 0.3459, + "step": 3901 + }, + { + "epoch": 0.3675843715409435, + "grad_norm": 0.775188148021698, + "learning_rate": 1.8653072543632064e-05, + "loss": 0.3819, + "step": 3902 + }, + { + "epoch": 0.3676785756341113, + "grad_norm": 0.9267304539680481, + "learning_rate": 1.8652315560773276e-05, + "loss": 0.3697, + "step": 3903 + }, + { + "epoch": 0.3677727797272792, + "grad_norm": 0.808070182800293, + "learning_rate": 1.865155838062831e-05, + "loss": 0.2989, + "step": 3904 + }, + { + "epoch": 0.367866983820447, + "grad_norm": 0.7583027482032776, + "learning_rate": 1.8650801003214436e-05, + "loss": 0.3611, + "step": 3905 + }, + { + "epoch": 0.3679611879136149, + "grad_norm": 0.7289133667945862, + "learning_rate": 1.865004342854892e-05, + "loss": 0.3533, + "step": 3906 + }, + { + "epoch": 0.3680553920067827, + "grad_norm": 0.7687907814979553, + "learning_rate": 1.8649285656649035e-05, + "loss": 0.3498, + "step": 3907 + }, + { + "epoch": 0.3681495960999505, + "grad_norm": 0.8461649417877197, + "learning_rate": 1.8648527687532062e-05, + "loss": 0.348, + "step": 3908 + }, + { + "epoch": 0.36824380019311836, + "grad_norm": 1.2314167022705078, + "learning_rate": 1.8647769521215283e-05, + "loss": 0.3559, + "step": 3909 + }, + { + "epoch": 0.3683380042862862, + "grad_norm": 0.8200388550758362, + "learning_rate": 1.8647011157715983e-05, + "loss": 0.3453, + "step": 3910 + }, + { + "epoch": 0.36843220837945406, + "grad_norm": 0.9525389075279236, + "learning_rate": 1.8646252597051458e-05, + "loss": 0.3999, + "step": 3911 + }, + { + "epoch": 0.3685264124726219, + "grad_norm": 0.9427107572555542, + "learning_rate": 1.8645493839238998e-05, + "loss": 0.4111, + "step": 3912 + }, + { + "epoch": 0.36862061656578976, + "grad_norm": 0.8151512742042542, + "learning_rate": 1.8644734884295913e-05, + "loss": 0.4052, + "step": 3913 + }, + { + "epoch": 0.3687148206589576, + "grad_norm": 1.0325891971588135, + "learning_rate": 1.86439757322395e-05, + "loss": 0.3589, + "step": 3914 + }, + { + "epoch": 0.36880902475212546, + "grad_norm": 0.8066545724868774, + "learning_rate": 1.864321638308707e-05, + "loss": 0.3739, + "step": 3915 + }, + { + "epoch": 0.3689032288452933, + "grad_norm": 1.0493665933609009, + "learning_rate": 1.8642456836855944e-05, + "loss": 0.3612, + "step": 3916 + }, + { + "epoch": 0.36899743293846116, + "grad_norm": 0.8509879112243652, + "learning_rate": 1.8641697093563435e-05, + "loss": 0.3753, + "step": 3917 + }, + { + "epoch": 0.369091637031629, + "grad_norm": 0.8060911297798157, + "learning_rate": 1.864093715322687e-05, + "loss": 0.3234, + "step": 3918 + }, + { + "epoch": 0.36918584112479685, + "grad_norm": 0.8596218824386597, + "learning_rate": 1.864017701586357e-05, + "loss": 0.3767, + "step": 3919 + }, + { + "epoch": 0.3692800452179647, + "grad_norm": 0.8724544644355774, + "learning_rate": 1.8639416681490875e-05, + "loss": 0.3412, + "step": 3920 + }, + { + "epoch": 0.36937424931113255, + "grad_norm": 0.7228909730911255, + "learning_rate": 1.863865615012612e-05, + "loss": 0.3406, + "step": 3921 + }, + { + "epoch": 0.3694684534043004, + "grad_norm": 0.8851790428161621, + "learning_rate": 1.863789542178664e-05, + "loss": 0.3716, + "step": 3922 + }, + { + "epoch": 0.36956265749746825, + "grad_norm": 0.7576696872711182, + "learning_rate": 1.8637134496489786e-05, + "loss": 0.3367, + "step": 3923 + }, + { + "epoch": 0.3696568615906361, + "grad_norm": 0.7606495022773743, + "learning_rate": 1.8636373374252908e-05, + "loss": 0.3288, + "step": 3924 + }, + { + "epoch": 0.36975106568380395, + "grad_norm": 0.766771137714386, + "learning_rate": 1.8635612055093365e-05, + "loss": 0.3599, + "step": 3925 + }, + { + "epoch": 0.3698452697769718, + "grad_norm": 0.8141925930976868, + "learning_rate": 1.8634850539028508e-05, + "loss": 0.3571, + "step": 3926 + }, + { + "epoch": 0.36993947387013965, + "grad_norm": 0.7698119282722473, + "learning_rate": 1.8634088826075704e-05, + "loss": 0.2954, + "step": 3927 + }, + { + "epoch": 0.3700336779633075, + "grad_norm": 0.9796344041824341, + "learning_rate": 1.8633326916252325e-05, + "loss": 0.3692, + "step": 3928 + }, + { + "epoch": 0.37012788205647534, + "grad_norm": 1.10334050655365, + "learning_rate": 1.863256480957574e-05, + "loss": 0.4254, + "step": 3929 + }, + { + "epoch": 0.3702220861496432, + "grad_norm": 0.7263920903205872, + "learning_rate": 1.8631802506063326e-05, + "loss": 0.307, + "step": 3930 + }, + { + "epoch": 0.37031629024281104, + "grad_norm": 0.943158745765686, + "learning_rate": 1.8631040005732466e-05, + "loss": 0.3579, + "step": 3931 + }, + { + "epoch": 0.3704104943359789, + "grad_norm": 0.9819375276565552, + "learning_rate": 1.8630277308600548e-05, + "loss": 0.436, + "step": 3932 + }, + { + "epoch": 0.37050469842914674, + "grad_norm": 0.8973391056060791, + "learning_rate": 1.8629514414684956e-05, + "loss": 0.3431, + "step": 3933 + }, + { + "epoch": 0.3705989025223146, + "grad_norm": 0.7966498136520386, + "learning_rate": 1.862875132400309e-05, + "loss": 0.3537, + "step": 3934 + }, + { + "epoch": 0.37069310661548244, + "grad_norm": 0.8658461570739746, + "learning_rate": 1.862798803657235e-05, + "loss": 0.3343, + "step": 3935 + }, + { + "epoch": 0.3707873107086503, + "grad_norm": 0.9335054755210876, + "learning_rate": 1.862722455241014e-05, + "loss": 0.3932, + "step": 3936 + }, + { + "epoch": 0.37088151480181814, + "grad_norm": 0.7528518438339233, + "learning_rate": 1.8626460871533866e-05, + "loss": 0.3043, + "step": 3937 + }, + { + "epoch": 0.370975718894986, + "grad_norm": 0.8395419716835022, + "learning_rate": 1.8625696993960945e-05, + "loss": 0.3763, + "step": 3938 + }, + { + "epoch": 0.37106992298815383, + "grad_norm": 0.9247756004333496, + "learning_rate": 1.8624932919708794e-05, + "loss": 0.4132, + "step": 3939 + }, + { + "epoch": 0.3711641270813217, + "grad_norm": 1.244017243385315, + "learning_rate": 1.8624168648794833e-05, + "loss": 0.3528, + "step": 3940 + }, + { + "epoch": 0.37125833117448953, + "grad_norm": 1.0481994152069092, + "learning_rate": 1.8623404181236488e-05, + "loss": 0.3886, + "step": 3941 + }, + { + "epoch": 0.3713525352676574, + "grad_norm": 0.8091402649879456, + "learning_rate": 1.862263951705119e-05, + "loss": 0.3523, + "step": 3942 + }, + { + "epoch": 0.37144673936082523, + "grad_norm": 0.7829480767250061, + "learning_rate": 1.8621874656256378e-05, + "loss": 0.3771, + "step": 3943 + }, + { + "epoch": 0.3715409434539931, + "grad_norm": 0.7555829882621765, + "learning_rate": 1.862110959886949e-05, + "loss": 0.3509, + "step": 3944 + }, + { + "epoch": 0.3716351475471609, + "grad_norm": 0.7845138311386108, + "learning_rate": 1.8620344344907973e-05, + "loss": 0.3753, + "step": 3945 + }, + { + "epoch": 0.3717293516403288, + "grad_norm": 0.8772324919700623, + "learning_rate": 1.861957889438927e-05, + "loss": 0.381, + "step": 3946 + }, + { + "epoch": 0.3718235557334966, + "grad_norm": 0.8137223720550537, + "learning_rate": 1.8618813247330836e-05, + "loss": 0.3497, + "step": 3947 + }, + { + "epoch": 0.3719177598266645, + "grad_norm": 0.7370177507400513, + "learning_rate": 1.8618047403750132e-05, + "loss": 0.2666, + "step": 3948 + }, + { + "epoch": 0.3720119639198323, + "grad_norm": 0.7705456614494324, + "learning_rate": 1.861728136366462e-05, + "loss": 0.3215, + "step": 3949 + }, + { + "epoch": 0.37210616801300017, + "grad_norm": 0.9182578921318054, + "learning_rate": 1.861651512709177e-05, + "loss": 0.3461, + "step": 3950 + }, + { + "epoch": 0.372200372106168, + "grad_norm": 0.8463134765625, + "learning_rate": 1.861574869404904e-05, + "loss": 0.4012, + "step": 3951 + }, + { + "epoch": 0.37229457619933587, + "grad_norm": 0.7903316020965576, + "learning_rate": 1.861498206455392e-05, + "loss": 0.3163, + "step": 3952 + }, + { + "epoch": 0.3723887802925037, + "grad_norm": 0.8026413917541504, + "learning_rate": 1.861421523862389e-05, + "loss": 0.4018, + "step": 3953 + }, + { + "epoch": 0.37248298438567157, + "grad_norm": 0.7673872113227844, + "learning_rate": 1.8613448216276424e-05, + "loss": 0.3272, + "step": 3954 + }, + { + "epoch": 0.3725771884788394, + "grad_norm": 0.9731943011283875, + "learning_rate": 1.861268099752902e-05, + "loss": 0.3321, + "step": 3955 + }, + { + "epoch": 0.37267139257200727, + "grad_norm": 0.8283352255821228, + "learning_rate": 1.861191358239917e-05, + "loss": 0.2921, + "step": 3956 + }, + { + "epoch": 0.3727655966651751, + "grad_norm": 0.734096348285675, + "learning_rate": 1.8611145970904372e-05, + "loss": 0.3303, + "step": 3957 + }, + { + "epoch": 0.37285980075834296, + "grad_norm": 0.8195230960845947, + "learning_rate": 1.8610378163062127e-05, + "loss": 0.3395, + "step": 3958 + }, + { + "epoch": 0.3729540048515108, + "grad_norm": 0.6890751719474792, + "learning_rate": 1.8609610158889943e-05, + "loss": 0.2908, + "step": 3959 + }, + { + "epoch": 0.37304820894467866, + "grad_norm": 0.8741453886032104, + "learning_rate": 1.8608841958405338e-05, + "loss": 0.3464, + "step": 3960 + }, + { + "epoch": 0.3731424130378465, + "grad_norm": 0.83934485912323, + "learning_rate": 1.8608073561625817e-05, + "loss": 0.3746, + "step": 3961 + }, + { + "epoch": 0.37323661713101436, + "grad_norm": 0.9127764105796814, + "learning_rate": 1.8607304968568905e-05, + "loss": 0.3526, + "step": 3962 + }, + { + "epoch": 0.3733308212241822, + "grad_norm": 0.7462518215179443, + "learning_rate": 1.8606536179252132e-05, + "loss": 0.3223, + "step": 3963 + }, + { + "epoch": 0.37342502531735006, + "grad_norm": 0.75594562292099, + "learning_rate": 1.8605767193693023e-05, + "loss": 0.3547, + "step": 3964 + }, + { + "epoch": 0.3735192294105179, + "grad_norm": 0.7274197936058044, + "learning_rate": 1.8604998011909114e-05, + "loss": 0.2963, + "step": 3965 + }, + { + "epoch": 0.37361343350368575, + "grad_norm": 0.779574990272522, + "learning_rate": 1.860422863391794e-05, + "loss": 0.3382, + "step": 3966 + }, + { + "epoch": 0.3737076375968536, + "grad_norm": 0.9977636933326721, + "learning_rate": 1.8603459059737046e-05, + "loss": 0.4237, + "step": 3967 + }, + { + "epoch": 0.37380184169002145, + "grad_norm": 0.8320953845977783, + "learning_rate": 1.8602689289383982e-05, + "loss": 0.3311, + "step": 3968 + }, + { + "epoch": 0.3738960457831893, + "grad_norm": 0.8946393132209778, + "learning_rate": 1.86019193228763e-05, + "loss": 0.4028, + "step": 3969 + }, + { + "epoch": 0.37399024987635715, + "grad_norm": 0.8022248148918152, + "learning_rate": 1.860114916023155e-05, + "loss": 0.3935, + "step": 3970 + }, + { + "epoch": 0.374084453969525, + "grad_norm": 0.8089770078659058, + "learning_rate": 1.8600378801467297e-05, + "loss": 0.357, + "step": 3971 + }, + { + "epoch": 0.37417865806269285, + "grad_norm": 0.95083087682724, + "learning_rate": 1.8599608246601112e-05, + "loss": 0.3627, + "step": 3972 + }, + { + "epoch": 0.3742728621558607, + "grad_norm": 0.8489338159561157, + "learning_rate": 1.8598837495650555e-05, + "loss": 0.3766, + "step": 3973 + }, + { + "epoch": 0.37436706624902855, + "grad_norm": 0.7435100078582764, + "learning_rate": 1.8598066548633205e-05, + "loss": 0.3194, + "step": 3974 + }, + { + "epoch": 0.3744612703421964, + "grad_norm": 0.776519238948822, + "learning_rate": 1.859729540556664e-05, + "loss": 0.3767, + "step": 3975 + }, + { + "epoch": 0.37455547443536424, + "grad_norm": 1.5168535709381104, + "learning_rate": 1.8596524066468444e-05, + "loss": 0.3531, + "step": 3976 + }, + { + "epoch": 0.37464967852853204, + "grad_norm": 0.7984569072723389, + "learning_rate": 1.8595752531356207e-05, + "loss": 0.3643, + "step": 3977 + }, + { + "epoch": 0.3747438826216999, + "grad_norm": 0.8715662956237793, + "learning_rate": 1.859498080024752e-05, + "loss": 0.3749, + "step": 3978 + }, + { + "epoch": 0.37483808671486774, + "grad_norm": 0.8876890540122986, + "learning_rate": 1.8594208873159974e-05, + "loss": 0.3909, + "step": 3979 + }, + { + "epoch": 0.3749322908080356, + "grad_norm": 0.7428289651870728, + "learning_rate": 1.8593436750111174e-05, + "loss": 0.3519, + "step": 3980 + }, + { + "epoch": 0.37502649490120343, + "grad_norm": 0.8159217238426208, + "learning_rate": 1.859266443111873e-05, + "loss": 0.3498, + "step": 3981 + }, + { + "epoch": 0.3751206989943713, + "grad_norm": 0.9959158897399902, + "learning_rate": 1.8591891916200248e-05, + "loss": 0.3643, + "step": 3982 + }, + { + "epoch": 0.37521490308753913, + "grad_norm": 0.6949526071548462, + "learning_rate": 1.8591119205373337e-05, + "loss": 0.3142, + "step": 3983 + }, + { + "epoch": 0.375309107180707, + "grad_norm": 0.9546235799789429, + "learning_rate": 1.8590346298655624e-05, + "loss": 0.3963, + "step": 3984 + }, + { + "epoch": 0.37540331127387483, + "grad_norm": 0.7107335329055786, + "learning_rate": 1.8589573196064734e-05, + "loss": 0.2952, + "step": 3985 + }, + { + "epoch": 0.3754975153670427, + "grad_norm": 1.7656023502349854, + "learning_rate": 1.858879989761829e-05, + "loss": 0.3878, + "step": 3986 + }, + { + "epoch": 0.3755917194602105, + "grad_norm": 0.8152875304222107, + "learning_rate": 1.858802640333392e-05, + "loss": 0.3453, + "step": 3987 + }, + { + "epoch": 0.3756859235533784, + "grad_norm": 0.8879491686820984, + "learning_rate": 1.8587252713229276e-05, + "loss": 0.3229, + "step": 3988 + }, + { + "epoch": 0.3757801276465462, + "grad_norm": 0.857681155204773, + "learning_rate": 1.8586478827321985e-05, + "loss": 0.3701, + "step": 3989 + }, + { + "epoch": 0.3758743317397141, + "grad_norm": 0.970120370388031, + "learning_rate": 1.8585704745629695e-05, + "loss": 0.3036, + "step": 3990 + }, + { + "epoch": 0.3759685358328819, + "grad_norm": 0.8371943831443787, + "learning_rate": 1.858493046817006e-05, + "loss": 0.3575, + "step": 3991 + }, + { + "epoch": 0.37606273992604977, + "grad_norm": 0.8610896468162537, + "learning_rate": 1.8584155994960734e-05, + "loss": 0.3305, + "step": 3992 + }, + { + "epoch": 0.3761569440192176, + "grad_norm": 0.7553221583366394, + "learning_rate": 1.8583381326019377e-05, + "loss": 0.361, + "step": 3993 + }, + { + "epoch": 0.37625114811238547, + "grad_norm": 0.7915468811988831, + "learning_rate": 1.858260646136365e-05, + "loss": 0.3958, + "step": 3994 + }, + { + "epoch": 0.3763453522055533, + "grad_norm": 0.8172153830528259, + "learning_rate": 1.8581831401011222e-05, + "loss": 0.3379, + "step": 3995 + }, + { + "epoch": 0.37643955629872117, + "grad_norm": 0.7263503074645996, + "learning_rate": 1.858105614497977e-05, + "loss": 0.338, + "step": 3996 + }, + { + "epoch": 0.376533760391889, + "grad_norm": 0.9005851149559021, + "learning_rate": 1.8580280693286958e-05, + "loss": 0.366, + "step": 3997 + }, + { + "epoch": 0.37662796448505687, + "grad_norm": 0.8273485898971558, + "learning_rate": 1.8579505045950485e-05, + "loss": 0.3189, + "step": 3998 + }, + { + "epoch": 0.3767221685782247, + "grad_norm": 0.7768470048904419, + "learning_rate": 1.8578729202988025e-05, + "loss": 0.3504, + "step": 3999 + }, + { + "epoch": 0.37681637267139256, + "grad_norm": 0.7757167220115662, + "learning_rate": 1.857795316441727e-05, + "loss": 0.3889, + "step": 4000 + }, + { + "epoch": 0.3769105767645604, + "grad_norm": 0.7570431232452393, + "learning_rate": 1.8577176930255923e-05, + "loss": 0.377, + "step": 4001 + }, + { + "epoch": 0.37700478085772826, + "grad_norm": 0.8670721054077148, + "learning_rate": 1.8576400500521673e-05, + "loss": 0.3578, + "step": 4002 + }, + { + "epoch": 0.3770989849508961, + "grad_norm": 0.7168530225753784, + "learning_rate": 1.8575623875232228e-05, + "loss": 0.3552, + "step": 4003 + }, + { + "epoch": 0.37719318904406396, + "grad_norm": 0.8752408623695374, + "learning_rate": 1.8574847054405294e-05, + "loss": 0.3424, + "step": 4004 + }, + { + "epoch": 0.3772873931372318, + "grad_norm": 0.9238356947898865, + "learning_rate": 1.857407003805859e-05, + "loss": 0.387, + "step": 4005 + }, + { + "epoch": 0.37738159723039966, + "grad_norm": 0.9070408344268799, + "learning_rate": 1.8573292826209827e-05, + "loss": 0.3439, + "step": 4006 + }, + { + "epoch": 0.3774758013235675, + "grad_norm": 0.794664740562439, + "learning_rate": 1.857251541887673e-05, + "loss": 0.3377, + "step": 4007 + }, + { + "epoch": 0.37757000541673535, + "grad_norm": 0.7839815616607666, + "learning_rate": 1.8571737816077022e-05, + "loss": 0.3117, + "step": 4008 + }, + { + "epoch": 0.3776642095099032, + "grad_norm": 0.843227207660675, + "learning_rate": 1.8570960017828437e-05, + "loss": 0.3365, + "step": 4009 + }, + { + "epoch": 0.37775841360307105, + "grad_norm": 0.7713897824287415, + "learning_rate": 1.8570182024148707e-05, + "loss": 0.3071, + "step": 4010 + }, + { + "epoch": 0.3778526176962389, + "grad_norm": 0.8083482384681702, + "learning_rate": 1.8569403835055576e-05, + "loss": 0.3039, + "step": 4011 + }, + { + "epoch": 0.37794682178940675, + "grad_norm": 0.8170844316482544, + "learning_rate": 1.856862545056678e-05, + "loss": 0.3383, + "step": 4012 + }, + { + "epoch": 0.3780410258825746, + "grad_norm": 0.9065813422203064, + "learning_rate": 1.8567846870700076e-05, + "loss": 0.3459, + "step": 4013 + }, + { + "epoch": 0.37813522997574245, + "grad_norm": 0.8003597259521484, + "learning_rate": 1.856706809547321e-05, + "loss": 0.3747, + "step": 4014 + }, + { + "epoch": 0.3782294340689103, + "grad_norm": 0.7988014221191406, + "learning_rate": 1.8566289124903945e-05, + "loss": 0.3275, + "step": 4015 + }, + { + "epoch": 0.37832363816207815, + "grad_norm": 0.8614307641983032, + "learning_rate": 1.8565509959010037e-05, + "loss": 0.3558, + "step": 4016 + }, + { + "epoch": 0.378417842255246, + "grad_norm": 0.8755281567573547, + "learning_rate": 1.8564730597809258e-05, + "loss": 0.3993, + "step": 4017 + }, + { + "epoch": 0.37851204634841384, + "grad_norm": 0.9249305725097656, + "learning_rate": 1.8563951041319375e-05, + "loss": 0.3839, + "step": 4018 + }, + { + "epoch": 0.3786062504415817, + "grad_norm": 0.9633373618125916, + "learning_rate": 1.856317128955816e-05, + "loss": 0.377, + "step": 4019 + }, + { + "epoch": 0.37870045453474954, + "grad_norm": 0.8036518096923828, + "learning_rate": 1.8562391342543403e-05, + "loss": 0.3539, + "step": 4020 + }, + { + "epoch": 0.3787946586279174, + "grad_norm": 0.6962668895721436, + "learning_rate": 1.856161120029288e-05, + "loss": 0.3327, + "step": 4021 + }, + { + "epoch": 0.37888886272108524, + "grad_norm": 0.8387727737426758, + "learning_rate": 1.8560830862824375e-05, + "loss": 0.3585, + "step": 4022 + }, + { + "epoch": 0.3789830668142531, + "grad_norm": 0.8466220498085022, + "learning_rate": 1.856005033015569e-05, + "loss": 0.3429, + "step": 4023 + }, + { + "epoch": 0.37907727090742094, + "grad_norm": 0.9862387180328369, + "learning_rate": 1.8559269602304623e-05, + "loss": 0.3319, + "step": 4024 + }, + { + "epoch": 0.3791714750005888, + "grad_norm": 0.8368757963180542, + "learning_rate": 1.8558488679288967e-05, + "loss": 0.3342, + "step": 4025 + }, + { + "epoch": 0.37926567909375664, + "grad_norm": 0.8175406455993652, + "learning_rate": 1.8557707561126533e-05, + "loss": 0.3807, + "step": 4026 + }, + { + "epoch": 0.3793598831869245, + "grad_norm": 0.8657997846603394, + "learning_rate": 1.8556926247835135e-05, + "loss": 0.3695, + "step": 4027 + }, + { + "epoch": 0.37945408728009233, + "grad_norm": 0.9448007345199585, + "learning_rate": 1.8556144739432584e-05, + "loss": 0.3488, + "step": 4028 + }, + { + "epoch": 0.3795482913732602, + "grad_norm": 0.7154188752174377, + "learning_rate": 1.8555363035936698e-05, + "loss": 0.3244, + "step": 4029 + }, + { + "epoch": 0.37964249546642803, + "grad_norm": 0.7990254163742065, + "learning_rate": 1.8554581137365307e-05, + "loss": 0.3418, + "step": 4030 + }, + { + "epoch": 0.3797366995595959, + "grad_norm": 0.781462550163269, + "learning_rate": 1.8553799043736234e-05, + "loss": 0.3869, + "step": 4031 + }, + { + "epoch": 0.37983090365276373, + "grad_norm": 0.8279241323471069, + "learning_rate": 1.8553016755067315e-05, + "loss": 0.3443, + "step": 4032 + }, + { + "epoch": 0.3799251077459316, + "grad_norm": 0.8141920566558838, + "learning_rate": 1.8552234271376388e-05, + "loss": 0.3859, + "step": 4033 + }, + { + "epoch": 0.3800193118390994, + "grad_norm": 0.865777313709259, + "learning_rate": 1.8551451592681292e-05, + "loss": 0.3951, + "step": 4034 + }, + { + "epoch": 0.3801135159322673, + "grad_norm": 0.8812288045883179, + "learning_rate": 1.8550668718999873e-05, + "loss": 0.353, + "step": 4035 + }, + { + "epoch": 0.3802077200254351, + "grad_norm": 1.1636347770690918, + "learning_rate": 1.8549885650349985e-05, + "loss": 0.3255, + "step": 4036 + }, + { + "epoch": 0.380301924118603, + "grad_norm": 0.7793205380439758, + "learning_rate": 1.854910238674948e-05, + "loss": 0.3367, + "step": 4037 + }, + { + "epoch": 0.3803961282117708, + "grad_norm": 0.9390274882316589, + "learning_rate": 1.8548318928216223e-05, + "loss": 0.3541, + "step": 4038 + }, + { + "epoch": 0.38049033230493867, + "grad_norm": 0.8669273853302002, + "learning_rate": 1.8547535274768073e-05, + "loss": 0.3415, + "step": 4039 + }, + { + "epoch": 0.3805845363981065, + "grad_norm": 0.8564456701278687, + "learning_rate": 1.8546751426422894e-05, + "loss": 0.3406, + "step": 4040 + }, + { + "epoch": 0.38067874049127437, + "grad_norm": 0.8560154438018799, + "learning_rate": 1.854596738319857e-05, + "loss": 0.3543, + "step": 4041 + }, + { + "epoch": 0.3807729445844422, + "grad_norm": 2.012359619140625, + "learning_rate": 1.8545183145112973e-05, + "loss": 0.3288, + "step": 4042 + }, + { + "epoch": 0.38086714867761007, + "grad_norm": 0.8635702133178711, + "learning_rate": 1.8544398712183987e-05, + "loss": 0.377, + "step": 4043 + }, + { + "epoch": 0.3809613527707779, + "grad_norm": 0.7307999730110168, + "learning_rate": 1.854361408442949e-05, + "loss": 0.3146, + "step": 4044 + }, + { + "epoch": 0.38105555686394577, + "grad_norm": 0.8456552624702454, + "learning_rate": 1.8542829261867382e-05, + "loss": 0.3656, + "step": 4045 + }, + { + "epoch": 0.38114976095711356, + "grad_norm": 0.8214126229286194, + "learning_rate": 1.8542044244515556e-05, + "loss": 0.3461, + "step": 4046 + }, + { + "epoch": 0.3812439650502814, + "grad_norm": 0.8272832632064819, + "learning_rate": 1.8541259032391908e-05, + "loss": 0.3478, + "step": 4047 + }, + { + "epoch": 0.38133816914344926, + "grad_norm": 0.8121527433395386, + "learning_rate": 1.854047362551435e-05, + "loss": 0.332, + "step": 4048 + }, + { + "epoch": 0.3814323732366171, + "grad_norm": 0.7890356183052063, + "learning_rate": 1.853968802390078e-05, + "loss": 0.338, + "step": 4049 + }, + { + "epoch": 0.38152657732978495, + "grad_norm": 0.7535851001739502, + "learning_rate": 1.8538902227569118e-05, + "loss": 0.3473, + "step": 4050 + }, + { + "epoch": 0.3816207814229528, + "grad_norm": 0.8237473368644714, + "learning_rate": 1.8538116236537283e-05, + "loss": 0.3648, + "step": 4051 + }, + { + "epoch": 0.38171498551612065, + "grad_norm": 0.8340440392494202, + "learning_rate": 1.8537330050823188e-05, + "loss": 0.3689, + "step": 4052 + }, + { + "epoch": 0.3818091896092885, + "grad_norm": 0.8527438044548035, + "learning_rate": 1.8536543670444767e-05, + "loss": 0.3525, + "step": 4053 + }, + { + "epoch": 0.38190339370245635, + "grad_norm": 0.7588123679161072, + "learning_rate": 1.8535757095419947e-05, + "loss": 0.3513, + "step": 4054 + }, + { + "epoch": 0.3819975977956242, + "grad_norm": 0.7642732858657837, + "learning_rate": 1.853497032576667e-05, + "loss": 0.3674, + "step": 4055 + }, + { + "epoch": 0.38209180188879205, + "grad_norm": 0.804860532283783, + "learning_rate": 1.8534183361502864e-05, + "loss": 0.3909, + "step": 4056 + }, + { + "epoch": 0.3821860059819599, + "grad_norm": 0.7832939028739929, + "learning_rate": 1.853339620264648e-05, + "loss": 0.3187, + "step": 4057 + }, + { + "epoch": 0.38228021007512775, + "grad_norm": 0.7937828898429871, + "learning_rate": 1.8532608849215464e-05, + "loss": 0.3717, + "step": 4058 + }, + { + "epoch": 0.3823744141682956, + "grad_norm": 0.6621168851852417, + "learning_rate": 1.8531821301227774e-05, + "loss": 0.3237, + "step": 4059 + }, + { + "epoch": 0.38246861826146344, + "grad_norm": 0.8530427813529968, + "learning_rate": 1.8531033558701363e-05, + "loss": 0.3541, + "step": 4060 + }, + { + "epoch": 0.3825628223546313, + "grad_norm": 0.8988747000694275, + "learning_rate": 1.8530245621654188e-05, + "loss": 0.3537, + "step": 4061 + }, + { + "epoch": 0.38265702644779914, + "grad_norm": 0.9490792155265808, + "learning_rate": 1.8529457490104226e-05, + "loss": 0.318, + "step": 4062 + }, + { + "epoch": 0.382751230540967, + "grad_norm": 0.881201446056366, + "learning_rate": 1.8528669164069438e-05, + "loss": 0.3128, + "step": 4063 + }, + { + "epoch": 0.38284543463413484, + "grad_norm": 0.7725705504417419, + "learning_rate": 1.8527880643567808e-05, + "loss": 0.3612, + "step": 4064 + }, + { + "epoch": 0.3829396387273027, + "grad_norm": 0.6691721677780151, + "learning_rate": 1.852709192861731e-05, + "loss": 0.2916, + "step": 4065 + }, + { + "epoch": 0.38303384282047054, + "grad_norm": 0.8485406637191772, + "learning_rate": 1.8526303019235926e-05, + "loss": 0.3415, + "step": 4066 + }, + { + "epoch": 0.3831280469136384, + "grad_norm": 0.9098604321479797, + "learning_rate": 1.852551391544165e-05, + "loss": 0.363, + "step": 4067 + }, + { + "epoch": 0.38322225100680624, + "grad_norm": 0.8491668701171875, + "learning_rate": 1.852472461725247e-05, + "loss": 0.3404, + "step": 4068 + }, + { + "epoch": 0.3833164550999741, + "grad_norm": 0.7346593141555786, + "learning_rate": 1.8523935124686384e-05, + "loss": 0.3099, + "step": 4069 + }, + { + "epoch": 0.38341065919314193, + "grad_norm": 0.9048673510551453, + "learning_rate": 1.8523145437761395e-05, + "loss": 0.4132, + "step": 4070 + }, + { + "epoch": 0.3835048632863098, + "grad_norm": 0.7384666204452515, + "learning_rate": 1.852235555649551e-05, + "loss": 0.302, + "step": 4071 + }, + { + "epoch": 0.38359906737947763, + "grad_norm": 0.6942598223686218, + "learning_rate": 1.8521565480906737e-05, + "loss": 0.3003, + "step": 4072 + }, + { + "epoch": 0.3836932714726455, + "grad_norm": 0.7419342398643494, + "learning_rate": 1.8520775211013094e-05, + "loss": 0.3233, + "step": 4073 + }, + { + "epoch": 0.38378747556581333, + "grad_norm": 0.7706865072250366, + "learning_rate": 1.8519984746832597e-05, + "loss": 0.3459, + "step": 4074 + }, + { + "epoch": 0.3838816796589812, + "grad_norm": 0.8181882500648499, + "learning_rate": 1.851919408838327e-05, + "loss": 0.3547, + "step": 4075 + }, + { + "epoch": 0.383975883752149, + "grad_norm": 0.7399076819419861, + "learning_rate": 1.8518403235683147e-05, + "loss": 0.3457, + "step": 4076 + }, + { + "epoch": 0.3840700878453169, + "grad_norm": 0.8902973532676697, + "learning_rate": 1.8517612188750254e-05, + "loss": 0.3458, + "step": 4077 + }, + { + "epoch": 0.3841642919384847, + "grad_norm": 0.8323981761932373, + "learning_rate": 1.851682094760263e-05, + "loss": 0.3395, + "step": 4078 + }, + { + "epoch": 0.3842584960316526, + "grad_norm": 0.8129898309707642, + "learning_rate": 1.851602951225832e-05, + "loss": 0.3506, + "step": 4079 + }, + { + "epoch": 0.3843527001248204, + "grad_norm": 0.7929735779762268, + "learning_rate": 1.851523788273536e-05, + "loss": 0.3539, + "step": 4080 + }, + { + "epoch": 0.3844469042179883, + "grad_norm": 0.733232855796814, + "learning_rate": 1.8514446059051813e-05, + "loss": 0.3007, + "step": 4081 + }, + { + "epoch": 0.3845411083111561, + "grad_norm": 0.7781715989112854, + "learning_rate": 1.8513654041225723e-05, + "loss": 0.3691, + "step": 4082 + }, + { + "epoch": 0.38463531240432397, + "grad_norm": 0.7716662883758545, + "learning_rate": 1.851286182927516e-05, + "loss": 0.395, + "step": 4083 + }, + { + "epoch": 0.3847295164974918, + "grad_norm": 0.8850762844085693, + "learning_rate": 1.8512069423218176e-05, + "loss": 0.3031, + "step": 4084 + }, + { + "epoch": 0.38482372059065967, + "grad_norm": 0.7346330285072327, + "learning_rate": 1.851127682307285e-05, + "loss": 0.3451, + "step": 4085 + }, + { + "epoch": 0.3849179246838275, + "grad_norm": 0.8006929159164429, + "learning_rate": 1.8510484028857245e-05, + "loss": 0.3467, + "step": 4086 + }, + { + "epoch": 0.38501212877699537, + "grad_norm": 0.7510959506034851, + "learning_rate": 1.8509691040589448e-05, + "loss": 0.3029, + "step": 4087 + }, + { + "epoch": 0.3851063328701632, + "grad_norm": 0.7754647731781006, + "learning_rate": 1.8508897858287528e-05, + "loss": 0.3684, + "step": 4088 + }, + { + "epoch": 0.38520053696333106, + "grad_norm": 1.0007089376449585, + "learning_rate": 1.8508104481969585e-05, + "loss": 0.3311, + "step": 4089 + }, + { + "epoch": 0.3852947410564989, + "grad_norm": 0.7495034337043762, + "learning_rate": 1.8507310911653696e-05, + "loss": 0.3827, + "step": 4090 + }, + { + "epoch": 0.38538894514966676, + "grad_norm": 0.7500461339950562, + "learning_rate": 1.8506517147357966e-05, + "loss": 0.3252, + "step": 4091 + }, + { + "epoch": 0.3854831492428346, + "grad_norm": 0.840074360370636, + "learning_rate": 1.8505723189100483e-05, + "loss": 0.3599, + "step": 4092 + }, + { + "epoch": 0.38557735333600246, + "grad_norm": 0.7502328753471375, + "learning_rate": 1.8504929036899363e-05, + "loss": 0.3037, + "step": 4093 + }, + { + "epoch": 0.3856715574291703, + "grad_norm": 0.8640468120574951, + "learning_rate": 1.8504134690772706e-05, + "loss": 0.4063, + "step": 4094 + }, + { + "epoch": 0.38576576152233816, + "grad_norm": 0.9477185606956482, + "learning_rate": 1.8503340150738626e-05, + "loss": 0.3895, + "step": 4095 + }, + { + "epoch": 0.385859965615506, + "grad_norm": 0.810876190662384, + "learning_rate": 1.8502545416815238e-05, + "loss": 0.3492, + "step": 4096 + }, + { + "epoch": 0.38595416970867386, + "grad_norm": 0.8612748980522156, + "learning_rate": 1.850175048902067e-05, + "loss": 0.3368, + "step": 4097 + }, + { + "epoch": 0.3860483738018417, + "grad_norm": 0.7046990990638733, + "learning_rate": 1.8500955367373038e-05, + "loss": 0.3161, + "step": 4098 + }, + { + "epoch": 0.38614257789500955, + "grad_norm": 0.7437293529510498, + "learning_rate": 1.8500160051890475e-05, + "loss": 0.3364, + "step": 4099 + }, + { + "epoch": 0.3862367819881774, + "grad_norm": 0.9109039306640625, + "learning_rate": 1.849936454259112e-05, + "loss": 0.32, + "step": 4100 + }, + { + "epoch": 0.38633098608134525, + "grad_norm": 0.8287127017974854, + "learning_rate": 1.8498568839493112e-05, + "loss": 0.4096, + "step": 4101 + }, + { + "epoch": 0.3864251901745131, + "grad_norm": 0.8941023945808411, + "learning_rate": 1.849777294261459e-05, + "loss": 0.3508, + "step": 4102 + }, + { + "epoch": 0.38651939426768095, + "grad_norm": 1.0079180002212524, + "learning_rate": 1.84969768519737e-05, + "loss": 0.3376, + "step": 4103 + }, + { + "epoch": 0.3866135983608488, + "grad_norm": 0.9072720408439636, + "learning_rate": 1.84961805675886e-05, + "loss": 0.3379, + "step": 4104 + }, + { + "epoch": 0.38670780245401665, + "grad_norm": 0.8411976099014282, + "learning_rate": 1.849538408947744e-05, + "loss": 0.3411, + "step": 4105 + }, + { + "epoch": 0.3868020065471845, + "grad_norm": 0.9072670340538025, + "learning_rate": 1.8494587417658385e-05, + "loss": 0.3534, + "step": 4106 + }, + { + "epoch": 0.38689621064035234, + "grad_norm": 0.8730388879776001, + "learning_rate": 1.8493790552149602e-05, + "loss": 0.3367, + "step": 4107 + }, + { + "epoch": 0.3869904147335202, + "grad_norm": 0.8113542199134827, + "learning_rate": 1.8492993492969257e-05, + "loss": 0.3655, + "step": 4108 + }, + { + "epoch": 0.38708461882668804, + "grad_norm": 0.6983941793441772, + "learning_rate": 1.8492196240135526e-05, + "loss": 0.3288, + "step": 4109 + }, + { + "epoch": 0.3871788229198559, + "grad_norm": 0.7515357732772827, + "learning_rate": 1.8491398793666587e-05, + "loss": 0.3499, + "step": 4110 + }, + { + "epoch": 0.38727302701302374, + "grad_norm": 0.7293741106987, + "learning_rate": 1.849060115358062e-05, + "loss": 0.3394, + "step": 4111 + }, + { + "epoch": 0.3873672311061916, + "grad_norm": 0.7499194145202637, + "learning_rate": 1.848980331989582e-05, + "loss": 0.328, + "step": 4112 + }, + { + "epoch": 0.38746143519935944, + "grad_norm": 0.8803209662437439, + "learning_rate": 1.8489005292630377e-05, + "loss": 0.3931, + "step": 4113 + }, + { + "epoch": 0.3875556392925273, + "grad_norm": 0.8406884670257568, + "learning_rate": 1.848820707180248e-05, + "loss": 0.3564, + "step": 4114 + }, + { + "epoch": 0.3876498433856951, + "grad_norm": 0.8496602177619934, + "learning_rate": 1.8487408657430338e-05, + "loss": 0.369, + "step": 4115 + }, + { + "epoch": 0.38774404747886293, + "grad_norm": 0.7715287208557129, + "learning_rate": 1.8486610049532146e-05, + "loss": 0.3525, + "step": 4116 + }, + { + "epoch": 0.3878382515720308, + "grad_norm": 0.8887027502059937, + "learning_rate": 1.8485811248126127e-05, + "loss": 0.4193, + "step": 4117 + }, + { + "epoch": 0.38793245566519863, + "grad_norm": 0.7482026815414429, + "learning_rate": 1.8485012253230484e-05, + "loss": 0.3492, + "step": 4118 + }, + { + "epoch": 0.3880266597583665, + "grad_norm": 0.7965260148048401, + "learning_rate": 1.8484213064863442e-05, + "loss": 0.3143, + "step": 4119 + }, + { + "epoch": 0.3881208638515343, + "grad_norm": 0.8715170621871948, + "learning_rate": 1.8483413683043224e-05, + "loss": 0.3614, + "step": 4120 + }, + { + "epoch": 0.3882150679447022, + "grad_norm": 0.8241899013519287, + "learning_rate": 1.848261410778805e-05, + "loss": 0.3076, + "step": 4121 + }, + { + "epoch": 0.38830927203787, + "grad_norm": 0.8795972466468811, + "learning_rate": 1.8481814339116154e-05, + "loss": 0.3467, + "step": 4122 + }, + { + "epoch": 0.3884034761310379, + "grad_norm": 0.7732073664665222, + "learning_rate": 1.8481014377045773e-05, + "loss": 0.3241, + "step": 4123 + }, + { + "epoch": 0.3884976802242057, + "grad_norm": 0.8015792965888977, + "learning_rate": 1.848021422159515e-05, + "loss": 0.3074, + "step": 4124 + }, + { + "epoch": 0.38859188431737357, + "grad_norm": 0.9260653257369995, + "learning_rate": 1.8479413872782532e-05, + "loss": 0.3879, + "step": 4125 + }, + { + "epoch": 0.3886860884105414, + "grad_norm": 0.7994526624679565, + "learning_rate": 1.8478613330626157e-05, + "loss": 0.3578, + "step": 4126 + }, + { + "epoch": 0.38878029250370927, + "grad_norm": 1.007300615310669, + "learning_rate": 1.8477812595144292e-05, + "loss": 0.3817, + "step": 4127 + }, + { + "epoch": 0.3888744965968771, + "grad_norm": 0.9153285622596741, + "learning_rate": 1.8477011666355188e-05, + "loss": 0.3673, + "step": 4128 + }, + { + "epoch": 0.38896870069004497, + "grad_norm": 0.7606241703033447, + "learning_rate": 1.8476210544277105e-05, + "loss": 0.3249, + "step": 4129 + }, + { + "epoch": 0.3890629047832128, + "grad_norm": 0.7886267304420471, + "learning_rate": 1.8475409228928314e-05, + "loss": 0.3611, + "step": 4130 + }, + { + "epoch": 0.38915710887638066, + "grad_norm": 1.02424156665802, + "learning_rate": 1.8474607720327084e-05, + "loss": 0.4415, + "step": 4131 + }, + { + "epoch": 0.3892513129695485, + "grad_norm": 0.9800648093223572, + "learning_rate": 1.847380601849169e-05, + "loss": 0.3471, + "step": 4132 + }, + { + "epoch": 0.38934551706271636, + "grad_norm": 0.8154638409614563, + "learning_rate": 1.847300412344042e-05, + "loss": 0.3675, + "step": 4133 + }, + { + "epoch": 0.3894397211558842, + "grad_norm": 0.7895480394363403, + "learning_rate": 1.8472202035191547e-05, + "loss": 0.3231, + "step": 4134 + }, + { + "epoch": 0.38953392524905206, + "grad_norm": 0.8950668573379517, + "learning_rate": 1.8471399753763366e-05, + "loss": 0.3685, + "step": 4135 + }, + { + "epoch": 0.3896281293422199, + "grad_norm": 0.8978734612464905, + "learning_rate": 1.8470597279174173e-05, + "loss": 0.3351, + "step": 4136 + }, + { + "epoch": 0.38972233343538776, + "grad_norm": 0.8425732851028442, + "learning_rate": 1.8469794611442257e-05, + "loss": 0.3242, + "step": 4137 + }, + { + "epoch": 0.3898165375285556, + "grad_norm": 0.8049556612968445, + "learning_rate": 1.8468991750585928e-05, + "loss": 0.3835, + "step": 4138 + }, + { + "epoch": 0.38991074162172346, + "grad_norm": 0.7788106799125671, + "learning_rate": 1.846818869662349e-05, + "loss": 0.3088, + "step": 4139 + }, + { + "epoch": 0.3900049457148913, + "grad_norm": 0.897233784198761, + "learning_rate": 1.846738544957325e-05, + "loss": 0.3917, + "step": 4140 + }, + { + "epoch": 0.39009914980805915, + "grad_norm": 0.747938871383667, + "learning_rate": 1.846658200945353e-05, + "loss": 0.3766, + "step": 4141 + }, + { + "epoch": 0.390193353901227, + "grad_norm": 0.8425939679145813, + "learning_rate": 1.8465778376282645e-05, + "loss": 0.3325, + "step": 4142 + }, + { + "epoch": 0.39028755799439485, + "grad_norm": 0.7967554926872253, + "learning_rate": 1.846497455007892e-05, + "loss": 0.3271, + "step": 4143 + }, + { + "epoch": 0.3903817620875627, + "grad_norm": 0.7986201643943787, + "learning_rate": 1.8464170530860684e-05, + "loss": 0.3492, + "step": 4144 + }, + { + "epoch": 0.39047596618073055, + "grad_norm": 0.7835004925727844, + "learning_rate": 1.8463366318646274e-05, + "loss": 0.3253, + "step": 4145 + }, + { + "epoch": 0.3905701702738984, + "grad_norm": 0.8994381427764893, + "learning_rate": 1.8462561913454017e-05, + "loss": 0.3736, + "step": 4146 + }, + { + "epoch": 0.39066437436706625, + "grad_norm": 0.7640780806541443, + "learning_rate": 1.8461757315302264e-05, + "loss": 0.3578, + "step": 4147 + }, + { + "epoch": 0.3907585784602341, + "grad_norm": 0.836300253868103, + "learning_rate": 1.8460952524209355e-05, + "loss": 0.3283, + "step": 4148 + }, + { + "epoch": 0.39085278255340195, + "grad_norm": 0.7115257382392883, + "learning_rate": 1.8460147540193648e-05, + "loss": 0.3347, + "step": 4149 + }, + { + "epoch": 0.3909469866465698, + "grad_norm": 0.7805563807487488, + "learning_rate": 1.8459342363273488e-05, + "loss": 0.3487, + "step": 4150 + }, + { + "epoch": 0.39104119073973764, + "grad_norm": 0.8107002377510071, + "learning_rate": 1.8458536993467242e-05, + "loss": 0.3396, + "step": 4151 + }, + { + "epoch": 0.3911353948329055, + "grad_norm": 0.8486902117729187, + "learning_rate": 1.8457731430793272e-05, + "loss": 0.3908, + "step": 4152 + }, + { + "epoch": 0.39122959892607334, + "grad_norm": 0.8985098600387573, + "learning_rate": 1.8456925675269944e-05, + "loss": 0.3819, + "step": 4153 + }, + { + "epoch": 0.3913238030192412, + "grad_norm": 0.7981569170951843, + "learning_rate": 1.8456119726915635e-05, + "loss": 0.4176, + "step": 4154 + }, + { + "epoch": 0.39141800711240904, + "grad_norm": 0.8377177715301514, + "learning_rate": 1.8455313585748716e-05, + "loss": 0.3641, + "step": 4155 + }, + { + "epoch": 0.3915122112055769, + "grad_norm": 1.0901809930801392, + "learning_rate": 1.8454507251787567e-05, + "loss": 0.3688, + "step": 4156 + }, + { + "epoch": 0.39160641529874474, + "grad_norm": 0.7740063667297363, + "learning_rate": 1.8453700725050583e-05, + "loss": 0.3487, + "step": 4157 + }, + { + "epoch": 0.3917006193919126, + "grad_norm": 0.816830575466156, + "learning_rate": 1.8452894005556148e-05, + "loss": 0.3351, + "step": 4158 + }, + { + "epoch": 0.39179482348508043, + "grad_norm": 1.1118569374084473, + "learning_rate": 1.8452087093322655e-05, + "loss": 0.3811, + "step": 4159 + }, + { + "epoch": 0.3918890275782483, + "grad_norm": 0.7970894575119019, + "learning_rate": 1.8451279988368506e-05, + "loss": 0.3356, + "step": 4160 + }, + { + "epoch": 0.39198323167141613, + "grad_norm": 0.8318902254104614, + "learning_rate": 1.84504726907121e-05, + "loss": 0.3474, + "step": 4161 + }, + { + "epoch": 0.392077435764584, + "grad_norm": 0.7451596856117249, + "learning_rate": 1.8449665200371852e-05, + "loss": 0.3627, + "step": 4162 + }, + { + "epoch": 0.39217163985775183, + "grad_norm": 0.8282124996185303, + "learning_rate": 1.8448857517366167e-05, + "loss": 0.3563, + "step": 4163 + }, + { + "epoch": 0.3922658439509197, + "grad_norm": 0.8604304790496826, + "learning_rate": 1.844804964171347e-05, + "loss": 0.3767, + "step": 4164 + }, + { + "epoch": 0.39236004804408753, + "grad_norm": 0.8274044394493103, + "learning_rate": 1.8447241573432168e-05, + "loss": 0.3427, + "step": 4165 + }, + { + "epoch": 0.3924542521372554, + "grad_norm": 0.9021215438842773, + "learning_rate": 1.8446433312540697e-05, + "loss": 0.4002, + "step": 4166 + }, + { + "epoch": 0.3925484562304232, + "grad_norm": 0.829105019569397, + "learning_rate": 1.8445624859057485e-05, + "loss": 0.3666, + "step": 4167 + }, + { + "epoch": 0.3926426603235911, + "grad_norm": 0.7571730017662048, + "learning_rate": 1.8444816213000963e-05, + "loss": 0.3433, + "step": 4168 + }, + { + "epoch": 0.3927368644167589, + "grad_norm": 0.8027662038803101, + "learning_rate": 1.8444007374389573e-05, + "loss": 0.3743, + "step": 4169 + }, + { + "epoch": 0.3928310685099268, + "grad_norm": 0.8503544926643372, + "learning_rate": 1.8443198343241756e-05, + "loss": 0.3864, + "step": 4170 + }, + { + "epoch": 0.3929252726030946, + "grad_norm": 0.8933030962944031, + "learning_rate": 1.844238911957596e-05, + "loss": 0.3639, + "step": 4171 + }, + { + "epoch": 0.39301947669626247, + "grad_norm": 0.7828903198242188, + "learning_rate": 1.8441579703410633e-05, + "loss": 0.378, + "step": 4172 + }, + { + "epoch": 0.3931136807894303, + "grad_norm": 0.8003404140472412, + "learning_rate": 1.8440770094764236e-05, + "loss": 0.3207, + "step": 4173 + }, + { + "epoch": 0.39320788488259817, + "grad_norm": 0.842401921749115, + "learning_rate": 1.8439960293655227e-05, + "loss": 0.3411, + "step": 4174 + }, + { + "epoch": 0.393302088975766, + "grad_norm": 0.8401558995246887, + "learning_rate": 1.843915030010207e-05, + "loss": 0.349, + "step": 4175 + }, + { + "epoch": 0.39339629306893387, + "grad_norm": 0.8366354703903198, + "learning_rate": 1.843834011412323e-05, + "loss": 0.3827, + "step": 4176 + }, + { + "epoch": 0.3934904971621017, + "grad_norm": 0.7191557288169861, + "learning_rate": 1.8437529735737192e-05, + "loss": 0.3263, + "step": 4177 + }, + { + "epoch": 0.39358470125526956, + "grad_norm": 0.9180736541748047, + "learning_rate": 1.8436719164962426e-05, + "loss": 0.356, + "step": 4178 + }, + { + "epoch": 0.3936789053484374, + "grad_norm": 1.4437901973724365, + "learning_rate": 1.8435908401817413e-05, + "loss": 0.2938, + "step": 4179 + }, + { + "epoch": 0.39377310944160526, + "grad_norm": 0.8307088613510132, + "learning_rate": 1.843509744632064e-05, + "loss": 0.3617, + "step": 4180 + }, + { + "epoch": 0.3938673135347731, + "grad_norm": 0.9016256928443909, + "learning_rate": 1.8434286298490605e-05, + "loss": 0.3712, + "step": 4181 + }, + { + "epoch": 0.39396151762794096, + "grad_norm": 0.8308224081993103, + "learning_rate": 1.8433474958345797e-05, + "loss": 0.3318, + "step": 4182 + }, + { + "epoch": 0.3940557217211088, + "grad_norm": 0.7834711670875549, + "learning_rate": 1.8432663425904716e-05, + "loss": 0.3122, + "step": 4183 + }, + { + "epoch": 0.3941499258142766, + "grad_norm": 0.7277787327766418, + "learning_rate": 1.843185170118587e-05, + "loss": 0.3105, + "step": 4184 + }, + { + "epoch": 0.39424412990744445, + "grad_norm": 0.8183156251907349, + "learning_rate": 1.843103978420776e-05, + "loss": 0.3576, + "step": 4185 + }, + { + "epoch": 0.3943383340006123, + "grad_norm": 0.8041008114814758, + "learning_rate": 1.8430227674988913e-05, + "loss": 0.399, + "step": 4186 + }, + { + "epoch": 0.39443253809378015, + "grad_norm": 0.7788959741592407, + "learning_rate": 1.842941537354783e-05, + "loss": 0.3623, + "step": 4187 + }, + { + "epoch": 0.394526742186948, + "grad_norm": 0.8078553676605225, + "learning_rate": 1.842860287990304e-05, + "loss": 0.3368, + "step": 4188 + }, + { + "epoch": 0.39462094628011585, + "grad_norm": 0.8752886652946472, + "learning_rate": 1.8427790194073072e-05, + "loss": 0.3592, + "step": 4189 + }, + { + "epoch": 0.3947151503732837, + "grad_norm": 0.9009623527526855, + "learning_rate": 1.8426977316076455e-05, + "loss": 0.4217, + "step": 4190 + }, + { + "epoch": 0.39480935446645155, + "grad_norm": 0.8680552840232849, + "learning_rate": 1.842616424593172e-05, + "loss": 0.3975, + "step": 4191 + }, + { + "epoch": 0.3949035585596194, + "grad_norm": 0.881288468837738, + "learning_rate": 1.842535098365741e-05, + "loss": 0.4005, + "step": 4192 + }, + { + "epoch": 0.39499776265278724, + "grad_norm": 0.7390192747116089, + "learning_rate": 1.8424537529272068e-05, + "loss": 0.3649, + "step": 4193 + }, + { + "epoch": 0.3950919667459551, + "grad_norm": 0.7918478846549988, + "learning_rate": 1.842372388279424e-05, + "loss": 0.3361, + "step": 4194 + }, + { + "epoch": 0.39518617083912294, + "grad_norm": 0.7925350069999695, + "learning_rate": 1.8422910044242483e-05, + "loss": 0.3365, + "step": 4195 + }, + { + "epoch": 0.3952803749322908, + "grad_norm": 0.8270667195320129, + "learning_rate": 1.8422096013635347e-05, + "loss": 0.3418, + "step": 4196 + }, + { + "epoch": 0.39537457902545864, + "grad_norm": 0.8141732811927795, + "learning_rate": 1.8421281790991398e-05, + "loss": 0.3368, + "step": 4197 + }, + { + "epoch": 0.3954687831186265, + "grad_norm": 0.7719465494155884, + "learning_rate": 1.84204673763292e-05, + "loss": 0.393, + "step": 4198 + }, + { + "epoch": 0.39556298721179434, + "grad_norm": 0.7901314496994019, + "learning_rate": 1.8419652769667324e-05, + "loss": 0.3711, + "step": 4199 + }, + { + "epoch": 0.3956571913049622, + "grad_norm": 0.7895154356956482, + "learning_rate": 1.8418837971024346e-05, + "loss": 0.3691, + "step": 4200 + }, + { + "epoch": 0.39575139539813003, + "grad_norm": 0.7976158857345581, + "learning_rate": 1.8418022980418836e-05, + "loss": 0.3522, + "step": 4201 + }, + { + "epoch": 0.3958455994912979, + "grad_norm": 0.863312304019928, + "learning_rate": 1.841720779786939e-05, + "loss": 0.4068, + "step": 4202 + }, + { + "epoch": 0.39593980358446573, + "grad_norm": 0.7937854528427124, + "learning_rate": 1.8416392423394586e-05, + "loss": 0.3119, + "step": 4203 + }, + { + "epoch": 0.3960340076776336, + "grad_norm": 0.8557348251342773, + "learning_rate": 1.841557685701302e-05, + "loss": 0.3506, + "step": 4204 + }, + { + "epoch": 0.39612821177080143, + "grad_norm": 0.8589996099472046, + "learning_rate": 1.8414761098743285e-05, + "loss": 0.3221, + "step": 4205 + }, + { + "epoch": 0.3962224158639693, + "grad_norm": 0.8741294741630554, + "learning_rate": 1.8413945148603982e-05, + "loss": 0.3906, + "step": 4206 + }, + { + "epoch": 0.39631661995713713, + "grad_norm": 0.8319645524024963, + "learning_rate": 1.841312900661372e-05, + "loss": 0.3474, + "step": 4207 + }, + { + "epoch": 0.396410824050305, + "grad_norm": 0.8147428631782532, + "learning_rate": 1.84123126727911e-05, + "loss": 0.3565, + "step": 4208 + }, + { + "epoch": 0.3965050281434728, + "grad_norm": 0.7853788137435913, + "learning_rate": 1.841149614715475e-05, + "loss": 0.3682, + "step": 4209 + }, + { + "epoch": 0.3965992322366407, + "grad_norm": 0.7525576949119568, + "learning_rate": 1.8410679429723273e-05, + "loss": 0.3215, + "step": 4210 + }, + { + "epoch": 0.3966934363298085, + "grad_norm": 1.671125888824463, + "learning_rate": 1.84098625205153e-05, + "loss": 0.3343, + "step": 4211 + }, + { + "epoch": 0.3967876404229764, + "grad_norm": 0.7521957755088806, + "learning_rate": 1.8409045419549455e-05, + "loss": 0.3478, + "step": 4212 + }, + { + "epoch": 0.3968818445161442, + "grad_norm": 0.8269384503364563, + "learning_rate": 1.840822812684437e-05, + "loss": 0.3967, + "step": 4213 + }, + { + "epoch": 0.39697604860931207, + "grad_norm": 0.7471799850463867, + "learning_rate": 1.8407410642418678e-05, + "loss": 0.328, + "step": 4214 + }, + { + "epoch": 0.3970702527024799, + "grad_norm": 0.7251665592193604, + "learning_rate": 1.840659296629102e-05, + "loss": 0.3575, + "step": 4215 + }, + { + "epoch": 0.39716445679564777, + "grad_norm": 0.7672768831253052, + "learning_rate": 1.8405775098480046e-05, + "loss": 0.3423, + "step": 4216 + }, + { + "epoch": 0.3972586608888156, + "grad_norm": 0.7551266551017761, + "learning_rate": 1.8404957039004397e-05, + "loss": 0.3342, + "step": 4217 + }, + { + "epoch": 0.39735286498198347, + "grad_norm": 0.6800012588500977, + "learning_rate": 1.840413878788273e-05, + "loss": 0.2809, + "step": 4218 + }, + { + "epoch": 0.3974470690751513, + "grad_norm": 0.679454505443573, + "learning_rate": 1.8403320345133703e-05, + "loss": 0.2885, + "step": 4219 + }, + { + "epoch": 0.39754127316831916, + "grad_norm": 0.8225396275520325, + "learning_rate": 1.8402501710775973e-05, + "loss": 0.302, + "step": 4220 + }, + { + "epoch": 0.397635477261487, + "grad_norm": 0.8708465695381165, + "learning_rate": 1.8401682884828212e-05, + "loss": 0.3729, + "step": 4221 + }, + { + "epoch": 0.39772968135465486, + "grad_norm": 0.7675060629844666, + "learning_rate": 1.8400863867309084e-05, + "loss": 0.3545, + "step": 4222 + }, + { + "epoch": 0.3978238854478227, + "grad_norm": 0.7647223472595215, + "learning_rate": 1.840004465823727e-05, + "loss": 0.3975, + "step": 4223 + }, + { + "epoch": 0.39791808954099056, + "grad_norm": 0.8100164532661438, + "learning_rate": 1.8399225257631447e-05, + "loss": 0.3389, + "step": 4224 + }, + { + "epoch": 0.3980122936341584, + "grad_norm": 0.7076160311698914, + "learning_rate": 1.83984056655103e-05, + "loss": 0.3224, + "step": 4225 + }, + { + "epoch": 0.39810649772732626, + "grad_norm": 0.7941327095031738, + "learning_rate": 1.839758588189251e-05, + "loss": 0.3261, + "step": 4226 + }, + { + "epoch": 0.3982007018204941, + "grad_norm": 0.8033230304718018, + "learning_rate": 1.839676590679678e-05, + "loss": 0.3312, + "step": 4227 + }, + { + "epoch": 0.39829490591366196, + "grad_norm": 0.8265009522438049, + "learning_rate": 1.8395945740241802e-05, + "loss": 0.3568, + "step": 4228 + }, + { + "epoch": 0.3983891100068298, + "grad_norm": 0.8011536598205566, + "learning_rate": 1.839512538224627e-05, + "loss": 0.3686, + "step": 4229 + }, + { + "epoch": 0.39848331409999765, + "grad_norm": 0.8207288980484009, + "learning_rate": 1.8394304832828905e-05, + "loss": 0.3327, + "step": 4230 + }, + { + "epoch": 0.3985775181931655, + "grad_norm": 0.771256685256958, + "learning_rate": 1.8393484092008404e-05, + "loss": 0.3302, + "step": 4231 + }, + { + "epoch": 0.39867172228633335, + "grad_norm": 0.6601364016532898, + "learning_rate": 1.8392663159803482e-05, + "loss": 0.3005, + "step": 4232 + }, + { + "epoch": 0.3987659263795012, + "grad_norm": 0.8057689666748047, + "learning_rate": 1.839184203623286e-05, + "loss": 0.3253, + "step": 4233 + }, + { + "epoch": 0.39886013047266905, + "grad_norm": 0.7493592500686646, + "learning_rate": 1.8391020721315266e-05, + "loss": 0.3405, + "step": 4234 + }, + { + "epoch": 0.3989543345658369, + "grad_norm": 0.8171237707138062, + "learning_rate": 1.839019921506942e-05, + "loss": 0.3457, + "step": 4235 + }, + { + "epoch": 0.39904853865900475, + "grad_norm": 0.7632202506065369, + "learning_rate": 1.838937751751406e-05, + "loss": 0.35, + "step": 4236 + }, + { + "epoch": 0.3991427427521726, + "grad_norm": 0.8660060167312622, + "learning_rate": 1.8388555628667913e-05, + "loss": 0.364, + "step": 4237 + }, + { + "epoch": 0.39923694684534045, + "grad_norm": 0.7073898911476135, + "learning_rate": 1.8387733548549728e-05, + "loss": 0.3051, + "step": 4238 + }, + { + "epoch": 0.3993311509385083, + "grad_norm": 0.7002679705619812, + "learning_rate": 1.8386911277178242e-05, + "loss": 0.3133, + "step": 4239 + }, + { + "epoch": 0.39942535503167614, + "grad_norm": 0.777336061000824, + "learning_rate": 1.8386088814572213e-05, + "loss": 0.3331, + "step": 4240 + }, + { + "epoch": 0.399519559124844, + "grad_norm": 0.8492711782455444, + "learning_rate": 1.8385266160750386e-05, + "loss": 0.3352, + "step": 4241 + }, + { + "epoch": 0.39961376321801184, + "grad_norm": 0.7613757252693176, + "learning_rate": 1.8384443315731525e-05, + "loss": 0.3109, + "step": 4242 + }, + { + "epoch": 0.3997079673111797, + "grad_norm": 0.9914324879646301, + "learning_rate": 1.8383620279534387e-05, + "loss": 0.3336, + "step": 4243 + }, + { + "epoch": 0.39980217140434754, + "grad_norm": 0.8649870157241821, + "learning_rate": 1.8382797052177746e-05, + "loss": 0.3809, + "step": 4244 + }, + { + "epoch": 0.3998963754975154, + "grad_norm": 0.7795773148536682, + "learning_rate": 1.8381973633680365e-05, + "loss": 0.3073, + "step": 4245 + }, + { + "epoch": 0.39999057959068324, + "grad_norm": 0.7418776154518127, + "learning_rate": 1.838115002406102e-05, + "loss": 0.3172, + "step": 4246 + }, + { + "epoch": 0.4000847836838511, + "grad_norm": 0.7328126430511475, + "learning_rate": 1.8380326223338497e-05, + "loss": 0.3487, + "step": 4247 + }, + { + "epoch": 0.40017898777701894, + "grad_norm": 1.4264343976974487, + "learning_rate": 1.8379502231531572e-05, + "loss": 0.3381, + "step": 4248 + }, + { + "epoch": 0.4002731918701868, + "grad_norm": 0.7884756326675415, + "learning_rate": 1.8378678048659038e-05, + "loss": 0.3705, + "step": 4249 + }, + { + "epoch": 0.40036739596335463, + "grad_norm": 0.7564305663108826, + "learning_rate": 1.8377853674739687e-05, + "loss": 0.3195, + "step": 4250 + }, + { + "epoch": 0.4004616000565225, + "grad_norm": 0.8775674700737, + "learning_rate": 1.8377029109792315e-05, + "loss": 0.3315, + "step": 4251 + }, + { + "epoch": 0.40055580414969033, + "grad_norm": 0.8128756880760193, + "learning_rate": 1.8376204353835725e-05, + "loss": 0.3092, + "step": 4252 + }, + { + "epoch": 0.4006500082428582, + "grad_norm": 0.7862222790718079, + "learning_rate": 1.8375379406888722e-05, + "loss": 0.3446, + "step": 4253 + }, + { + "epoch": 0.400744212336026, + "grad_norm": 0.8245273232460022, + "learning_rate": 1.8374554268970114e-05, + "loss": 0.386, + "step": 4254 + }, + { + "epoch": 0.4008384164291938, + "grad_norm": 0.9503718018531799, + "learning_rate": 1.8373728940098717e-05, + "loss": 0.3433, + "step": 4255 + }, + { + "epoch": 0.40093262052236167, + "grad_norm": 0.9467640519142151, + "learning_rate": 1.8372903420293354e-05, + "loss": 0.4113, + "step": 4256 + }, + { + "epoch": 0.4010268246155295, + "grad_norm": 0.8559558391571045, + "learning_rate": 1.8372077709572843e-05, + "loss": 0.3089, + "step": 4257 + }, + { + "epoch": 0.40112102870869737, + "grad_norm": 0.7818161845207214, + "learning_rate": 1.8371251807956008e-05, + "loss": 0.3298, + "step": 4258 + }, + { + "epoch": 0.4012152328018652, + "grad_norm": 0.8104586005210876, + "learning_rate": 1.8370425715461688e-05, + "loss": 0.3446, + "step": 4259 + }, + { + "epoch": 0.40130943689503307, + "grad_norm": 0.8099079728126526, + "learning_rate": 1.8369599432108717e-05, + "loss": 0.3472, + "step": 4260 + }, + { + "epoch": 0.4014036409882009, + "grad_norm": 0.7487495541572571, + "learning_rate": 1.8368772957915936e-05, + "loss": 0.3429, + "step": 4261 + }, + { + "epoch": 0.40149784508136876, + "grad_norm": 1.418947458267212, + "learning_rate": 1.8367946292902187e-05, + "loss": 0.3338, + "step": 4262 + }, + { + "epoch": 0.4015920491745366, + "grad_norm": 0.8419050574302673, + "learning_rate": 1.8367119437086322e-05, + "loss": 0.3404, + "step": 4263 + }, + { + "epoch": 0.40168625326770446, + "grad_norm": 0.7401953339576721, + "learning_rate": 1.8366292390487195e-05, + "loss": 0.2862, + "step": 4264 + }, + { + "epoch": 0.4017804573608723, + "grad_norm": 0.8642869591712952, + "learning_rate": 1.836546515312366e-05, + "loss": 0.3216, + "step": 4265 + }, + { + "epoch": 0.40187466145404016, + "grad_norm": 0.7470360994338989, + "learning_rate": 1.8364637725014583e-05, + "loss": 0.3298, + "step": 4266 + }, + { + "epoch": 0.401968865547208, + "grad_norm": 0.8600407838821411, + "learning_rate": 1.8363810106178832e-05, + "loss": 0.3149, + "step": 4267 + }, + { + "epoch": 0.40206306964037586, + "grad_norm": 0.900352418422699, + "learning_rate": 1.8362982296635276e-05, + "loss": 0.3499, + "step": 4268 + }, + { + "epoch": 0.4021572737335437, + "grad_norm": 0.7902324199676514, + "learning_rate": 1.8362154296402787e-05, + "loss": 0.323, + "step": 4269 + }, + { + "epoch": 0.40225147782671156, + "grad_norm": 0.8110232353210449, + "learning_rate": 1.8361326105500252e-05, + "loss": 0.3486, + "step": 4270 + }, + { + "epoch": 0.4023456819198794, + "grad_norm": 0.7503052949905396, + "learning_rate": 1.8360497723946544e-05, + "loss": 0.3286, + "step": 4271 + }, + { + "epoch": 0.40243988601304725, + "grad_norm": 0.9421465396881104, + "learning_rate": 1.835966915176056e-05, + "loss": 0.3038, + "step": 4272 + }, + { + "epoch": 0.4025340901062151, + "grad_norm": 0.816394567489624, + "learning_rate": 1.8358840388961196e-05, + "loss": 0.3973, + "step": 4273 + }, + { + "epoch": 0.40262829419938295, + "grad_norm": 0.7936789989471436, + "learning_rate": 1.835801143556734e-05, + "loss": 0.3043, + "step": 4274 + }, + { + "epoch": 0.4027224982925508, + "grad_norm": 0.9342827796936035, + "learning_rate": 1.8357182291597897e-05, + "loss": 0.386, + "step": 4275 + }, + { + "epoch": 0.40281670238571865, + "grad_norm": 1.0587079524993896, + "learning_rate": 1.8356352957071775e-05, + "loss": 0.3793, + "step": 4276 + }, + { + "epoch": 0.4029109064788865, + "grad_norm": 1.2501111030578613, + "learning_rate": 1.835552343200788e-05, + "loss": 0.3369, + "step": 4277 + }, + { + "epoch": 0.40300511057205435, + "grad_norm": 0.8193202018737793, + "learning_rate": 1.8354693716425132e-05, + "loss": 0.3885, + "step": 4278 + }, + { + "epoch": 0.4030993146652222, + "grad_norm": 0.7024616003036499, + "learning_rate": 1.8353863810342444e-05, + "loss": 0.3362, + "step": 4279 + }, + { + "epoch": 0.40319351875839005, + "grad_norm": 0.9073119759559631, + "learning_rate": 1.835303371377874e-05, + "loss": 0.3905, + "step": 4280 + }, + { + "epoch": 0.4032877228515579, + "grad_norm": 0.7984116077423096, + "learning_rate": 1.835220342675295e-05, + "loss": 0.3215, + "step": 4281 + }, + { + "epoch": 0.40338192694472574, + "grad_norm": 0.8524281978607178, + "learning_rate": 1.835137294928401e-05, + "loss": 0.359, + "step": 4282 + }, + { + "epoch": 0.4034761310378936, + "grad_norm": 0.8253399729728699, + "learning_rate": 1.8350542281390845e-05, + "loss": 0.397, + "step": 4283 + }, + { + "epoch": 0.40357033513106144, + "grad_norm": 0.769839346408844, + "learning_rate": 1.8349711423092403e-05, + "loss": 0.3201, + "step": 4284 + }, + { + "epoch": 0.4036645392242293, + "grad_norm": 0.7591856718063354, + "learning_rate": 1.834888037440763e-05, + "loss": 0.3798, + "step": 4285 + }, + { + "epoch": 0.40375874331739714, + "grad_norm": 0.8179476857185364, + "learning_rate": 1.834804913535547e-05, + "loss": 0.3247, + "step": 4286 + }, + { + "epoch": 0.403852947410565, + "grad_norm": 0.7844939827919006, + "learning_rate": 1.834721770595488e-05, + "loss": 0.3277, + "step": 4287 + }, + { + "epoch": 0.40394715150373284, + "grad_norm": 0.7527651786804199, + "learning_rate": 1.834638608622482e-05, + "loss": 0.3579, + "step": 4288 + }, + { + "epoch": 0.4040413555969007, + "grad_norm": 0.8115012645721436, + "learning_rate": 1.8345554276184247e-05, + "loss": 0.3622, + "step": 4289 + }, + { + "epoch": 0.40413555969006854, + "grad_norm": 0.7577059864997864, + "learning_rate": 1.834472227585213e-05, + "loss": 0.3217, + "step": 4290 + }, + { + "epoch": 0.4042297637832364, + "grad_norm": 0.7256118655204773, + "learning_rate": 1.8343890085247437e-05, + "loss": 0.3539, + "step": 4291 + }, + { + "epoch": 0.40432396787640423, + "grad_norm": 0.8135298490524292, + "learning_rate": 1.834305770438915e-05, + "loss": 0.334, + "step": 4292 + }, + { + "epoch": 0.4044181719695721, + "grad_norm": 0.6899747252464294, + "learning_rate": 1.8342225133296244e-05, + "loss": 0.318, + "step": 4293 + }, + { + "epoch": 0.40451237606273993, + "grad_norm": 0.8328760266304016, + "learning_rate": 1.83413923719877e-05, + "loss": 0.3621, + "step": 4294 + }, + { + "epoch": 0.4046065801559078, + "grad_norm": 0.7262910604476929, + "learning_rate": 1.8340559420482513e-05, + "loss": 0.3362, + "step": 4295 + }, + { + "epoch": 0.40470078424907563, + "grad_norm": 0.8466453552246094, + "learning_rate": 1.833972627879967e-05, + "loss": 0.3469, + "step": 4296 + }, + { + "epoch": 0.4047949883422435, + "grad_norm": 0.9842650890350342, + "learning_rate": 1.8338892946958172e-05, + "loss": 0.3442, + "step": 4297 + }, + { + "epoch": 0.4048891924354113, + "grad_norm": 0.7387780547142029, + "learning_rate": 1.8338059424977017e-05, + "loss": 0.3354, + "step": 4298 + }, + { + "epoch": 0.4049833965285792, + "grad_norm": 0.7268993854522705, + "learning_rate": 1.8337225712875213e-05, + "loss": 0.3373, + "step": 4299 + }, + { + "epoch": 0.405077600621747, + "grad_norm": 0.7663114666938782, + "learning_rate": 1.8336391810671773e-05, + "loss": 0.3274, + "step": 4300 + }, + { + "epoch": 0.4051718047149149, + "grad_norm": 0.809139609336853, + "learning_rate": 1.8335557718385702e-05, + "loss": 0.3712, + "step": 4301 + }, + { + "epoch": 0.4052660088080827, + "grad_norm": 0.8831266760826111, + "learning_rate": 1.8334723436036022e-05, + "loss": 0.3433, + "step": 4302 + }, + { + "epoch": 0.40536021290125057, + "grad_norm": 0.7973636984825134, + "learning_rate": 1.8333888963641762e-05, + "loss": 0.3761, + "step": 4303 + }, + { + "epoch": 0.4054544169944184, + "grad_norm": 0.8654884099960327, + "learning_rate": 1.8333054301221945e-05, + "loss": 0.3722, + "step": 4304 + }, + { + "epoch": 0.40554862108758627, + "grad_norm": 0.6960375308990479, + "learning_rate": 1.8332219448795602e-05, + "loss": 0.3297, + "step": 4305 + }, + { + "epoch": 0.4056428251807541, + "grad_norm": 0.9065430164337158, + "learning_rate": 1.8331384406381768e-05, + "loss": 0.355, + "step": 4306 + }, + { + "epoch": 0.40573702927392197, + "grad_norm": 0.8746337890625, + "learning_rate": 1.8330549173999484e-05, + "loss": 0.3384, + "step": 4307 + }, + { + "epoch": 0.4058312333670898, + "grad_norm": 0.8848806023597717, + "learning_rate": 1.83297137516678e-05, + "loss": 0.3688, + "step": 4308 + }, + { + "epoch": 0.40592543746025767, + "grad_norm": 0.7663493752479553, + "learning_rate": 1.8328878139405756e-05, + "loss": 0.2883, + "step": 4309 + }, + { + "epoch": 0.4060196415534255, + "grad_norm": 0.8225603699684143, + "learning_rate": 1.8328042337232412e-05, + "loss": 0.3586, + "step": 4310 + }, + { + "epoch": 0.40611384564659336, + "grad_norm": 0.7770514488220215, + "learning_rate": 1.832720634516682e-05, + "loss": 0.3499, + "step": 4311 + }, + { + "epoch": 0.4062080497397612, + "grad_norm": 0.7962725162506104, + "learning_rate": 1.832637016322805e-05, + "loss": 0.3759, + "step": 4312 + }, + { + "epoch": 0.40630225383292906, + "grad_norm": 0.8327206373214722, + "learning_rate": 1.832553379143516e-05, + "loss": 0.3704, + "step": 4313 + }, + { + "epoch": 0.4063964579260969, + "grad_norm": 0.7537351250648499, + "learning_rate": 1.832469722980722e-05, + "loss": 0.3408, + "step": 4314 + }, + { + "epoch": 0.40649066201926476, + "grad_norm": 0.6225210428237915, + "learning_rate": 1.8323860478363317e-05, + "loss": 0.2987, + "step": 4315 + }, + { + "epoch": 0.4065848661124326, + "grad_norm": 0.8770931363105774, + "learning_rate": 1.8323023537122518e-05, + "loss": 0.3177, + "step": 4316 + }, + { + "epoch": 0.40667907020560046, + "grad_norm": 0.8675561547279358, + "learning_rate": 1.8322186406103913e-05, + "loss": 0.3088, + "step": 4317 + }, + { + "epoch": 0.4067732742987683, + "grad_norm": 0.7490965723991394, + "learning_rate": 1.8321349085326583e-05, + "loss": 0.3347, + "step": 4318 + }, + { + "epoch": 0.40686747839193615, + "grad_norm": 0.8933178782463074, + "learning_rate": 1.8320511574809625e-05, + "loss": 0.3216, + "step": 4319 + }, + { + "epoch": 0.406961682485104, + "grad_norm": 0.9704069495201111, + "learning_rate": 1.831967387457214e-05, + "loss": 0.407, + "step": 4320 + }, + { + "epoch": 0.40705588657827185, + "grad_norm": 0.7570332884788513, + "learning_rate": 1.831883598463322e-05, + "loss": 0.3584, + "step": 4321 + }, + { + "epoch": 0.4071500906714397, + "grad_norm": 0.7434055805206299, + "learning_rate": 1.8317997905011975e-05, + "loss": 0.3329, + "step": 4322 + }, + { + "epoch": 0.4072442947646075, + "grad_norm": 0.8250865340232849, + "learning_rate": 1.8317159635727517e-05, + "loss": 0.3339, + "step": 4323 + }, + { + "epoch": 0.40733849885777534, + "grad_norm": 0.9020285606384277, + "learning_rate": 1.831632117679895e-05, + "loss": 0.3378, + "step": 4324 + }, + { + "epoch": 0.4074327029509432, + "grad_norm": 0.8903120160102844, + "learning_rate": 1.8315482528245404e-05, + "loss": 0.4144, + "step": 4325 + }, + { + "epoch": 0.40752690704411104, + "grad_norm": 0.8243562579154968, + "learning_rate": 1.8314643690085992e-05, + "loss": 0.3239, + "step": 4326 + }, + { + "epoch": 0.4076211111372789, + "grad_norm": 0.7726811170578003, + "learning_rate": 1.8313804662339847e-05, + "loss": 0.3523, + "step": 4327 + }, + { + "epoch": 0.40771531523044674, + "grad_norm": 0.7769786715507507, + "learning_rate": 1.83129654450261e-05, + "loss": 0.3454, + "step": 4328 + }, + { + "epoch": 0.4078095193236146, + "grad_norm": 0.8535809516906738, + "learning_rate": 1.8312126038163884e-05, + "loss": 0.4068, + "step": 4329 + }, + { + "epoch": 0.40790372341678244, + "grad_norm": 0.6568573713302612, + "learning_rate": 1.831128644177234e-05, + "loss": 0.2698, + "step": 4330 + }, + { + "epoch": 0.4079979275099503, + "grad_norm": 0.7760324478149414, + "learning_rate": 1.8310446655870607e-05, + "loss": 0.3258, + "step": 4331 + }, + { + "epoch": 0.40809213160311814, + "grad_norm": 0.8847746253013611, + "learning_rate": 1.830960668047784e-05, + "loss": 0.384, + "step": 4332 + }, + { + "epoch": 0.408186335696286, + "grad_norm": 0.7380432486534119, + "learning_rate": 1.830876651561319e-05, + "loss": 0.352, + "step": 4333 + }, + { + "epoch": 0.40828053978945383, + "grad_norm": 0.8275204300880432, + "learning_rate": 1.8307926161295818e-05, + "loss": 0.3694, + "step": 4334 + }, + { + "epoch": 0.4083747438826217, + "grad_norm": 0.8669615387916565, + "learning_rate": 1.8307085617544876e-05, + "loss": 0.3764, + "step": 4335 + }, + { + "epoch": 0.40846894797578953, + "grad_norm": 1.0183978080749512, + "learning_rate": 1.8306244884379536e-05, + "loss": 0.3497, + "step": 4336 + }, + { + "epoch": 0.4085631520689574, + "grad_norm": 0.7218812704086304, + "learning_rate": 1.8305403961818967e-05, + "loss": 0.3439, + "step": 4337 + }, + { + "epoch": 0.40865735616212523, + "grad_norm": 0.6889415383338928, + "learning_rate": 1.8304562849882343e-05, + "loss": 0.3266, + "step": 4338 + }, + { + "epoch": 0.4087515602552931, + "grad_norm": 0.7743616104125977, + "learning_rate": 1.8303721548588843e-05, + "loss": 0.327, + "step": 4339 + }, + { + "epoch": 0.4088457643484609, + "grad_norm": 0.7835550904273987, + "learning_rate": 1.8302880057957653e-05, + "loss": 0.3708, + "step": 4340 + }, + { + "epoch": 0.4089399684416288, + "grad_norm": 0.8127569556236267, + "learning_rate": 1.830203837800795e-05, + "loss": 0.3436, + "step": 4341 + }, + { + "epoch": 0.4090341725347966, + "grad_norm": 0.7509648203849792, + "learning_rate": 1.8301196508758935e-05, + "loss": 0.3353, + "step": 4342 + }, + { + "epoch": 0.4091283766279645, + "grad_norm": 0.7613946199417114, + "learning_rate": 1.8300354450229805e-05, + "loss": 0.3009, + "step": 4343 + }, + { + "epoch": 0.4092225807211323, + "grad_norm": 0.9445093870162964, + "learning_rate": 1.8299512202439756e-05, + "loss": 0.4089, + "step": 4344 + }, + { + "epoch": 0.40931678481430017, + "grad_norm": 0.7827582359313965, + "learning_rate": 1.829866976540799e-05, + "loss": 0.3774, + "step": 4345 + }, + { + "epoch": 0.409410988907468, + "grad_norm": 0.7520980834960938, + "learning_rate": 1.829782713915372e-05, + "loss": 0.3072, + "step": 4346 + }, + { + "epoch": 0.40950519300063587, + "grad_norm": 0.8516082763671875, + "learning_rate": 1.8296984323696162e-05, + "loss": 0.3864, + "step": 4347 + }, + { + "epoch": 0.4095993970938037, + "grad_norm": 0.7789568901062012, + "learning_rate": 1.8296141319054527e-05, + "loss": 0.3514, + "step": 4348 + }, + { + "epoch": 0.40969360118697157, + "grad_norm": 0.838067352771759, + "learning_rate": 1.829529812524804e-05, + "loss": 0.3672, + "step": 4349 + }, + { + "epoch": 0.4097878052801394, + "grad_norm": 0.9141841530799866, + "learning_rate": 1.8294454742295927e-05, + "loss": 0.3696, + "step": 4350 + }, + { + "epoch": 0.40988200937330727, + "grad_norm": 0.7358622550964355, + "learning_rate": 1.8293611170217417e-05, + "loss": 0.3099, + "step": 4351 + }, + { + "epoch": 0.4099762134664751, + "grad_norm": 0.734024703502655, + "learning_rate": 1.8292767409031748e-05, + "loss": 0.3305, + "step": 4352 + }, + { + "epoch": 0.41007041755964296, + "grad_norm": 0.7921773791313171, + "learning_rate": 1.8291923458758157e-05, + "loss": 0.2937, + "step": 4353 + }, + { + "epoch": 0.4101646216528108, + "grad_norm": 0.7627155780792236, + "learning_rate": 1.8291079319415888e-05, + "loss": 0.3908, + "step": 4354 + }, + { + "epoch": 0.41025882574597866, + "grad_norm": 0.6983011960983276, + "learning_rate": 1.8290234991024184e-05, + "loss": 0.3196, + "step": 4355 + }, + { + "epoch": 0.4103530298391465, + "grad_norm": 0.8267301321029663, + "learning_rate": 1.8289390473602305e-05, + "loss": 0.3707, + "step": 4356 + }, + { + "epoch": 0.41044723393231436, + "grad_norm": 0.7846118807792664, + "learning_rate": 1.8288545767169503e-05, + "loss": 0.3199, + "step": 4357 + }, + { + "epoch": 0.4105414380254822, + "grad_norm": 0.749038815498352, + "learning_rate": 1.8287700871745036e-05, + "loss": 0.3106, + "step": 4358 + }, + { + "epoch": 0.41063564211865006, + "grad_norm": 0.7653847336769104, + "learning_rate": 1.8286855787348176e-05, + "loss": 0.357, + "step": 4359 + }, + { + "epoch": 0.4107298462118179, + "grad_norm": 0.860435962677002, + "learning_rate": 1.8286010513998188e-05, + "loss": 0.3663, + "step": 4360 + }, + { + "epoch": 0.41082405030498576, + "grad_norm": 0.728864848613739, + "learning_rate": 1.8285165051714343e-05, + "loss": 0.3327, + "step": 4361 + }, + { + "epoch": 0.4109182543981536, + "grad_norm": 0.9315623641014099, + "learning_rate": 1.8284319400515923e-05, + "loss": 0.3747, + "step": 4362 + }, + { + "epoch": 0.41101245849132145, + "grad_norm": 0.8237623572349548, + "learning_rate": 1.828347356042221e-05, + "loss": 0.3875, + "step": 4363 + }, + { + "epoch": 0.4111066625844893, + "grad_norm": 0.8893877267837524, + "learning_rate": 1.8282627531452487e-05, + "loss": 0.3513, + "step": 4364 + }, + { + "epoch": 0.41120086667765715, + "grad_norm": 0.7763910889625549, + "learning_rate": 1.8281781313626047e-05, + "loss": 0.3341, + "step": 4365 + }, + { + "epoch": 0.411295070770825, + "grad_norm": 0.7412164807319641, + "learning_rate": 1.8280934906962184e-05, + "loss": 0.3232, + "step": 4366 + }, + { + "epoch": 0.41138927486399285, + "grad_norm": 0.7896450161933899, + "learning_rate": 1.8280088311480203e-05, + "loss": 0.368, + "step": 4367 + }, + { + "epoch": 0.4114834789571607, + "grad_norm": 0.7353463768959045, + "learning_rate": 1.82792415271994e-05, + "loss": 0.3218, + "step": 4368 + }, + { + "epoch": 0.41157768305032855, + "grad_norm": 0.8171700239181519, + "learning_rate": 1.8278394554139086e-05, + "loss": 0.3807, + "step": 4369 + }, + { + "epoch": 0.4116718871434964, + "grad_norm": 0.7039395570755005, + "learning_rate": 1.8277547392318574e-05, + "loss": 0.2872, + "step": 4370 + }, + { + "epoch": 0.41176609123666424, + "grad_norm": 0.8705812096595764, + "learning_rate": 1.8276700041757177e-05, + "loss": 0.3588, + "step": 4371 + }, + { + "epoch": 0.4118602953298321, + "grad_norm": 0.8330321907997131, + "learning_rate": 1.8275852502474223e-05, + "loss": 0.3837, + "step": 4372 + }, + { + "epoch": 0.41195449942299994, + "grad_norm": 0.7887098789215088, + "learning_rate": 1.8275004774489032e-05, + "loss": 0.37, + "step": 4373 + }, + { + "epoch": 0.4120487035161678, + "grad_norm": 0.8045909404754639, + "learning_rate": 1.8274156857820933e-05, + "loss": 0.3297, + "step": 4374 + }, + { + "epoch": 0.41214290760933564, + "grad_norm": 0.724745512008667, + "learning_rate": 1.8273308752489263e-05, + "loss": 0.3494, + "step": 4375 + }, + { + "epoch": 0.4122371117025035, + "grad_norm": 0.7332324385643005, + "learning_rate": 1.827246045851336e-05, + "loss": 0.3508, + "step": 4376 + }, + { + "epoch": 0.41233131579567134, + "grad_norm": 0.7815578579902649, + "learning_rate": 1.827161197591256e-05, + "loss": 0.338, + "step": 4377 + }, + { + "epoch": 0.4124255198888392, + "grad_norm": 0.8719455599784851, + "learning_rate": 1.827076330470622e-05, + "loss": 0.3601, + "step": 4378 + }, + { + "epoch": 0.41251972398200704, + "grad_norm": 0.8573183417320251, + "learning_rate": 1.8269914444913682e-05, + "loss": 0.3847, + "step": 4379 + }, + { + "epoch": 0.4126139280751749, + "grad_norm": 0.7446406483650208, + "learning_rate": 1.826906539655431e-05, + "loss": 0.344, + "step": 4380 + }, + { + "epoch": 0.41270813216834273, + "grad_norm": 0.9083143472671509, + "learning_rate": 1.8268216159647455e-05, + "loss": 0.3434, + "step": 4381 + }, + { + "epoch": 0.4128023362615106, + "grad_norm": 0.7628920078277588, + "learning_rate": 1.8267366734212483e-05, + "loss": 0.354, + "step": 4382 + }, + { + "epoch": 0.41289654035467843, + "grad_norm": 0.748205304145813, + "learning_rate": 1.8266517120268763e-05, + "loss": 0.3186, + "step": 4383 + }, + { + "epoch": 0.4129907444478463, + "grad_norm": 0.7351827621459961, + "learning_rate": 1.8265667317835673e-05, + "loss": 0.3536, + "step": 4384 + }, + { + "epoch": 0.41308494854101413, + "grad_norm": 0.8449647426605225, + "learning_rate": 1.826481732693258e-05, + "loss": 0.2795, + "step": 4385 + }, + { + "epoch": 0.413179152634182, + "grad_norm": 0.8797194361686707, + "learning_rate": 1.8263967147578875e-05, + "loss": 0.3641, + "step": 4386 + }, + { + "epoch": 0.4132733567273498, + "grad_norm": 0.7823154926300049, + "learning_rate": 1.8263116779793936e-05, + "loss": 0.3688, + "step": 4387 + }, + { + "epoch": 0.4133675608205177, + "grad_norm": 0.7594738602638245, + "learning_rate": 1.8262266223597155e-05, + "loss": 0.3202, + "step": 4388 + }, + { + "epoch": 0.4134617649136855, + "grad_norm": 0.7818530201911926, + "learning_rate": 1.826141547900793e-05, + "loss": 0.3268, + "step": 4389 + }, + { + "epoch": 0.4135559690068534, + "grad_norm": 0.7931559085845947, + "learning_rate": 1.826056454604565e-05, + "loss": 0.3095, + "step": 4390 + }, + { + "epoch": 0.4136501731000212, + "grad_norm": 0.7975097298622131, + "learning_rate": 1.8259713424729723e-05, + "loss": 0.3448, + "step": 4391 + }, + { + "epoch": 0.413744377193189, + "grad_norm": 0.7636379599571228, + "learning_rate": 1.825886211507956e-05, + "loss": 0.3324, + "step": 4392 + }, + { + "epoch": 0.41383858128635687, + "grad_norm": 0.7058627605438232, + "learning_rate": 1.825801061711457e-05, + "loss": 0.3167, + "step": 4393 + }, + { + "epoch": 0.4139327853795247, + "grad_norm": 0.8211748003959656, + "learning_rate": 1.8257158930854163e-05, + "loss": 0.39, + "step": 4394 + }, + { + "epoch": 0.41402698947269256, + "grad_norm": 0.7415569424629211, + "learning_rate": 1.8256307056317763e-05, + "loss": 0.3314, + "step": 4395 + }, + { + "epoch": 0.4141211935658604, + "grad_norm": 1.0090950727462769, + "learning_rate": 1.825545499352479e-05, + "loss": 0.3832, + "step": 4396 + }, + { + "epoch": 0.41421539765902826, + "grad_norm": 0.7703089118003845, + "learning_rate": 1.8254602742494677e-05, + "loss": 0.3447, + "step": 4397 + }, + { + "epoch": 0.4143096017521961, + "grad_norm": 0.7498367428779602, + "learning_rate": 1.825375030324686e-05, + "loss": 0.3216, + "step": 4398 + }, + { + "epoch": 0.41440380584536396, + "grad_norm": 0.7507278919219971, + "learning_rate": 1.825289767580077e-05, + "loss": 0.3267, + "step": 4399 + }, + { + "epoch": 0.4144980099385318, + "grad_norm": 0.7983956336975098, + "learning_rate": 1.8252044860175847e-05, + "loss": 0.3676, + "step": 4400 + }, + { + "epoch": 0.41459221403169966, + "grad_norm": 0.9007934927940369, + "learning_rate": 1.825119185639154e-05, + "loss": 0.3902, + "step": 4401 + }, + { + "epoch": 0.4146864181248675, + "grad_norm": 0.8932568430900574, + "learning_rate": 1.82503386644673e-05, + "loss": 0.3512, + "step": 4402 + }, + { + "epoch": 0.41478062221803536, + "grad_norm": 0.8058125376701355, + "learning_rate": 1.824948528442258e-05, + "loss": 0.3646, + "step": 4403 + }, + { + "epoch": 0.4148748263112032, + "grad_norm": 0.8728594779968262, + "learning_rate": 1.8248631716276835e-05, + "loss": 0.3521, + "step": 4404 + }, + { + "epoch": 0.41496903040437105, + "grad_norm": 0.7817156910896301, + "learning_rate": 1.824777796004953e-05, + "loss": 0.3019, + "step": 4405 + }, + { + "epoch": 0.4150632344975389, + "grad_norm": 0.6912437081336975, + "learning_rate": 1.824692401576013e-05, + "loss": 0.3391, + "step": 4406 + }, + { + "epoch": 0.41515743859070675, + "grad_norm": 0.7800760865211487, + "learning_rate": 1.8246069883428113e-05, + "loss": 0.2977, + "step": 4407 + }, + { + "epoch": 0.4152516426838746, + "grad_norm": 0.8556133508682251, + "learning_rate": 1.8245215563072948e-05, + "loss": 0.3919, + "step": 4408 + }, + { + "epoch": 0.41534584677704245, + "grad_norm": 0.7732865810394287, + "learning_rate": 1.8244361054714118e-05, + "loss": 0.3286, + "step": 4409 + }, + { + "epoch": 0.4154400508702103, + "grad_norm": 0.7321618795394897, + "learning_rate": 1.82435063583711e-05, + "loss": 0.2987, + "step": 4410 + }, + { + "epoch": 0.41553425496337815, + "grad_norm": 0.7288196086883545, + "learning_rate": 1.8242651474063392e-05, + "loss": 0.3123, + "step": 4411 + }, + { + "epoch": 0.415628459056546, + "grad_norm": 0.7751367092132568, + "learning_rate": 1.8241796401810486e-05, + "loss": 0.3809, + "step": 4412 + }, + { + "epoch": 0.41572266314971384, + "grad_norm": 0.7491422891616821, + "learning_rate": 1.8240941141631873e-05, + "loss": 0.3318, + "step": 4413 + }, + { + "epoch": 0.4158168672428817, + "grad_norm": 1.0179657936096191, + "learning_rate": 1.8240085693547058e-05, + "loss": 0.3986, + "step": 4414 + }, + { + "epoch": 0.41591107133604954, + "grad_norm": 0.7423537969589233, + "learning_rate": 1.8239230057575542e-05, + "loss": 0.332, + "step": 4415 + }, + { + "epoch": 0.4160052754292174, + "grad_norm": 0.7375214695930481, + "learning_rate": 1.8238374233736845e-05, + "loss": 0.3516, + "step": 4416 + }, + { + "epoch": 0.41609947952238524, + "grad_norm": 0.7761600017547607, + "learning_rate": 1.823751822205047e-05, + "loss": 0.3683, + "step": 4417 + }, + { + "epoch": 0.4161936836155531, + "grad_norm": 0.8303682208061218, + "learning_rate": 1.823666202253594e-05, + "loss": 0.4009, + "step": 4418 + }, + { + "epoch": 0.41628788770872094, + "grad_norm": 0.8851097226142883, + "learning_rate": 1.8235805635212778e-05, + "loss": 0.3395, + "step": 4419 + }, + { + "epoch": 0.4163820918018888, + "grad_norm": 0.833597719669342, + "learning_rate": 1.8234949060100513e-05, + "loss": 0.3487, + "step": 4420 + }, + { + "epoch": 0.41647629589505664, + "grad_norm": 0.9360535740852356, + "learning_rate": 1.823409229721867e-05, + "loss": 0.335, + "step": 4421 + }, + { + "epoch": 0.4165704999882245, + "grad_norm": 0.8861821293830872, + "learning_rate": 1.823323534658679e-05, + "loss": 0.3483, + "step": 4422 + }, + { + "epoch": 0.41666470408139233, + "grad_norm": 0.9428109526634216, + "learning_rate": 1.8232378208224414e-05, + "loss": 0.3561, + "step": 4423 + }, + { + "epoch": 0.4167589081745602, + "grad_norm": 0.8216004967689514, + "learning_rate": 1.823152088215108e-05, + "loss": 0.3437, + "step": 4424 + }, + { + "epoch": 0.41685311226772803, + "grad_norm": 0.7535174489021301, + "learning_rate": 1.823066336838634e-05, + "loss": 0.334, + "step": 4425 + }, + { + "epoch": 0.4169473163608959, + "grad_norm": 0.7837510108947754, + "learning_rate": 1.8229805666949745e-05, + "loss": 0.3033, + "step": 4426 + }, + { + "epoch": 0.41704152045406373, + "grad_norm": 0.7305752038955688, + "learning_rate": 1.8228947777860858e-05, + "loss": 0.3192, + "step": 4427 + }, + { + "epoch": 0.4171357245472316, + "grad_norm": 0.7885417342185974, + "learning_rate": 1.822808970113923e-05, + "loss": 0.3216, + "step": 4428 + }, + { + "epoch": 0.41722992864039943, + "grad_norm": 0.7974655628204346, + "learning_rate": 1.8227231436804434e-05, + "loss": 0.3781, + "step": 4429 + }, + { + "epoch": 0.4173241327335673, + "grad_norm": 0.8113394975662231, + "learning_rate": 1.822637298487604e-05, + "loss": 0.3168, + "step": 4430 + }, + { + "epoch": 0.4174183368267351, + "grad_norm": 0.8304500579833984, + "learning_rate": 1.8225514345373617e-05, + "loss": 0.3733, + "step": 4431 + }, + { + "epoch": 0.417512540919903, + "grad_norm": 0.7840065956115723, + "learning_rate": 1.8224655518316745e-05, + "loss": 0.3255, + "step": 4432 + }, + { + "epoch": 0.4176067450130708, + "grad_norm": 0.6819175481796265, + "learning_rate": 1.8223796503725007e-05, + "loss": 0.3034, + "step": 4433 + }, + { + "epoch": 0.4177009491062387, + "grad_norm": 0.7774067521095276, + "learning_rate": 1.8222937301617993e-05, + "loss": 0.2968, + "step": 4434 + }, + { + "epoch": 0.4177951531994065, + "grad_norm": 1.0673409700393677, + "learning_rate": 1.822207791201529e-05, + "loss": 0.3672, + "step": 4435 + }, + { + "epoch": 0.41788935729257437, + "grad_norm": 0.7810318470001221, + "learning_rate": 1.8221218334936496e-05, + "loss": 0.3414, + "step": 4436 + }, + { + "epoch": 0.4179835613857422, + "grad_norm": 0.8518936038017273, + "learning_rate": 1.822035857040121e-05, + "loss": 0.3521, + "step": 4437 + }, + { + "epoch": 0.41807776547891007, + "grad_norm": 0.8019551038742065, + "learning_rate": 1.8219498618429033e-05, + "loss": 0.3763, + "step": 4438 + }, + { + "epoch": 0.4181719695720779, + "grad_norm": 0.7329900860786438, + "learning_rate": 1.8218638479039577e-05, + "loss": 0.3561, + "step": 4439 + }, + { + "epoch": 0.41826617366524577, + "grad_norm": 0.7428778409957886, + "learning_rate": 1.821777815225245e-05, + "loss": 0.3027, + "step": 4440 + }, + { + "epoch": 0.4183603777584136, + "grad_norm": 0.8585033416748047, + "learning_rate": 1.8216917638087278e-05, + "loss": 0.3644, + "step": 4441 + }, + { + "epoch": 0.41845458185158146, + "grad_norm": 0.8240845799446106, + "learning_rate": 1.8216056936563675e-05, + "loss": 0.3943, + "step": 4442 + }, + { + "epoch": 0.4185487859447493, + "grad_norm": 0.7246481776237488, + "learning_rate": 1.8215196047701264e-05, + "loss": 0.3578, + "step": 4443 + }, + { + "epoch": 0.41864299003791716, + "grad_norm": 0.6832184195518494, + "learning_rate": 1.821433497151968e-05, + "loss": 0.3556, + "step": 4444 + }, + { + "epoch": 0.418737194131085, + "grad_norm": 0.8402509093284607, + "learning_rate": 1.8213473708038558e-05, + "loss": 0.3425, + "step": 4445 + }, + { + "epoch": 0.41883139822425286, + "grad_norm": 0.8606646656990051, + "learning_rate": 1.821261225727753e-05, + "loss": 0.4296, + "step": 4446 + }, + { + "epoch": 0.4189256023174207, + "grad_norm": 0.78255695104599, + "learning_rate": 1.821175061925624e-05, + "loss": 0.3629, + "step": 4447 + }, + { + "epoch": 0.41901980641058856, + "grad_norm": 0.8021367192268372, + "learning_rate": 1.821088879399434e-05, + "loss": 0.3207, + "step": 4448 + }, + { + "epoch": 0.4191140105037564, + "grad_norm": 0.8378644585609436, + "learning_rate": 1.8210026781511474e-05, + "loss": 0.3379, + "step": 4449 + }, + { + "epoch": 0.41920821459692426, + "grad_norm": 0.7594597935676575, + "learning_rate": 1.8209164581827304e-05, + "loss": 0.3409, + "step": 4450 + }, + { + "epoch": 0.4193024186900921, + "grad_norm": 0.7872465252876282, + "learning_rate": 1.8208302194961484e-05, + "loss": 0.3833, + "step": 4451 + }, + { + "epoch": 0.41939662278325995, + "grad_norm": 0.6902915239334106, + "learning_rate": 1.8207439620933675e-05, + "loss": 0.2875, + "step": 4452 + }, + { + "epoch": 0.4194908268764278, + "grad_norm": 0.7065645456314087, + "learning_rate": 1.8206576859763555e-05, + "loss": 0.3433, + "step": 4453 + }, + { + "epoch": 0.41958503096959565, + "grad_norm": 0.7474706768989563, + "learning_rate": 1.820571391147079e-05, + "loss": 0.3279, + "step": 4454 + }, + { + "epoch": 0.4196792350627635, + "grad_norm": 0.7695397734642029, + "learning_rate": 1.8204850776075055e-05, + "loss": 0.4055, + "step": 4455 + }, + { + "epoch": 0.41977343915593135, + "grad_norm": 0.782455563545227, + "learning_rate": 1.8203987453596035e-05, + "loss": 0.3039, + "step": 4456 + }, + { + "epoch": 0.4198676432490992, + "grad_norm": 0.8355002403259277, + "learning_rate": 1.8203123944053414e-05, + "loss": 0.364, + "step": 4457 + }, + { + "epoch": 0.41996184734226705, + "grad_norm": 0.670702338218689, + "learning_rate": 1.8202260247466883e-05, + "loss": 0.312, + "step": 4458 + }, + { + "epoch": 0.4200560514354349, + "grad_norm": 0.7533585429191589, + "learning_rate": 1.820139636385613e-05, + "loss": 0.3209, + "step": 4459 + }, + { + "epoch": 0.42015025552860275, + "grad_norm": 0.9109524488449097, + "learning_rate": 1.8200532293240855e-05, + "loss": 0.3519, + "step": 4460 + }, + { + "epoch": 0.42024445962177054, + "grad_norm": 1.1653105020523071, + "learning_rate": 1.819966803564076e-05, + "loss": 0.3252, + "step": 4461 + }, + { + "epoch": 0.4203386637149384, + "grad_norm": 0.7316860556602478, + "learning_rate": 1.8198803591075556e-05, + "loss": 0.3401, + "step": 4462 + }, + { + "epoch": 0.42043286780810624, + "grad_norm": 0.8423137664794922, + "learning_rate": 1.8197938959564952e-05, + "loss": 0.3575, + "step": 4463 + }, + { + "epoch": 0.4205270719012741, + "grad_norm": 0.7673403024673462, + "learning_rate": 1.819707414112866e-05, + "loss": 0.3411, + "step": 4464 + }, + { + "epoch": 0.42062127599444193, + "grad_norm": 0.8541059494018555, + "learning_rate": 1.81962091357864e-05, + "loss": 0.3374, + "step": 4465 + }, + { + "epoch": 0.4207154800876098, + "grad_norm": 0.8002111315727234, + "learning_rate": 1.8195343943557894e-05, + "loss": 0.3106, + "step": 4466 + }, + { + "epoch": 0.42080968418077763, + "grad_norm": 0.7413492202758789, + "learning_rate": 1.8194478564462878e-05, + "loss": 0.3338, + "step": 4467 + }, + { + "epoch": 0.4209038882739455, + "grad_norm": 0.7816640138626099, + "learning_rate": 1.819361299852107e-05, + "loss": 0.3172, + "step": 4468 + }, + { + "epoch": 0.42099809236711333, + "grad_norm": 0.8512669205665588, + "learning_rate": 1.8192747245752218e-05, + "loss": 0.3707, + "step": 4469 + }, + { + "epoch": 0.4210922964602812, + "grad_norm": 0.7964914441108704, + "learning_rate": 1.819188130617606e-05, + "loss": 0.328, + "step": 4470 + }, + { + "epoch": 0.42118650055344903, + "grad_norm": 0.816870927810669, + "learning_rate": 1.819101517981234e-05, + "loss": 0.3275, + "step": 4471 + }, + { + "epoch": 0.4212807046466169, + "grad_norm": 0.7393790483474731, + "learning_rate": 1.81901488666808e-05, + "loss": 0.3253, + "step": 4472 + }, + { + "epoch": 0.4213749087397847, + "grad_norm": 0.6618467569351196, + "learning_rate": 1.8189282366801204e-05, + "loss": 0.2977, + "step": 4473 + }, + { + "epoch": 0.4214691128329526, + "grad_norm": 0.9202789068222046, + "learning_rate": 1.8188415680193303e-05, + "loss": 0.3779, + "step": 4474 + }, + { + "epoch": 0.4215633169261204, + "grad_norm": 0.8780861496925354, + "learning_rate": 1.8187548806876863e-05, + "loss": 0.3342, + "step": 4475 + }, + { + "epoch": 0.4216575210192883, + "grad_norm": 0.7859781384468079, + "learning_rate": 1.8186681746871645e-05, + "loss": 0.3419, + "step": 4476 + }, + { + "epoch": 0.4217517251124561, + "grad_norm": 0.9070796370506287, + "learning_rate": 1.8185814500197424e-05, + "loss": 0.3564, + "step": 4477 + }, + { + "epoch": 0.42184592920562397, + "grad_norm": 1.0571869611740112, + "learning_rate": 1.8184947066873974e-05, + "loss": 0.3621, + "step": 4478 + }, + { + "epoch": 0.4219401332987918, + "grad_norm": 0.7637447118759155, + "learning_rate": 1.818407944692107e-05, + "loss": 0.367, + "step": 4479 + }, + { + "epoch": 0.42203433739195967, + "grad_norm": 0.6667627096176147, + "learning_rate": 1.81832116403585e-05, + "loss": 0.3101, + "step": 4480 + }, + { + "epoch": 0.4221285414851275, + "grad_norm": 0.6804084777832031, + "learning_rate": 1.818234364720605e-05, + "loss": 0.2992, + "step": 4481 + }, + { + "epoch": 0.42222274557829537, + "grad_norm": 0.7190942764282227, + "learning_rate": 1.8181475467483508e-05, + "loss": 0.3627, + "step": 4482 + }, + { + "epoch": 0.4223169496714632, + "grad_norm": 0.7158463001251221, + "learning_rate": 1.8180607101210675e-05, + "loss": 0.3421, + "step": 4483 + }, + { + "epoch": 0.42241115376463106, + "grad_norm": 0.7563744783401489, + "learning_rate": 1.8179738548407347e-05, + "loss": 0.3024, + "step": 4484 + }, + { + "epoch": 0.4225053578577989, + "grad_norm": 0.8165837526321411, + "learning_rate": 1.8178869809093327e-05, + "loss": 0.3404, + "step": 4485 + }, + { + "epoch": 0.42259956195096676, + "grad_norm": 0.6919609904289246, + "learning_rate": 1.8178000883288432e-05, + "loss": 0.3356, + "step": 4486 + }, + { + "epoch": 0.4226937660441346, + "grad_norm": 0.778649091720581, + "learning_rate": 1.8177131771012463e-05, + "loss": 0.3981, + "step": 4487 + }, + { + "epoch": 0.42278797013730246, + "grad_norm": 1.1520659923553467, + "learning_rate": 1.817626247228525e-05, + "loss": 0.3886, + "step": 4488 + }, + { + "epoch": 0.4228821742304703, + "grad_norm": 0.8374298214912415, + "learning_rate": 1.8175392987126603e-05, + "loss": 0.3575, + "step": 4489 + }, + { + "epoch": 0.42297637832363816, + "grad_norm": 0.9034845232963562, + "learning_rate": 1.8174523315556354e-05, + "loss": 0.3631, + "step": 4490 + }, + { + "epoch": 0.423070582416806, + "grad_norm": 0.7980120778083801, + "learning_rate": 1.817365345759433e-05, + "loss": 0.3782, + "step": 4491 + }, + { + "epoch": 0.42316478650997386, + "grad_norm": 0.7888476252555847, + "learning_rate": 1.817278341326037e-05, + "loss": 0.3385, + "step": 4492 + }, + { + "epoch": 0.4232589906031417, + "grad_norm": 0.9399864673614502, + "learning_rate": 1.8171913182574306e-05, + "loss": 0.3892, + "step": 4493 + }, + { + "epoch": 0.42335319469630955, + "grad_norm": 0.8143534064292908, + "learning_rate": 1.8171042765555985e-05, + "loss": 0.4158, + "step": 4494 + }, + { + "epoch": 0.4234473987894774, + "grad_norm": 0.8164240717887878, + "learning_rate": 1.817017216222525e-05, + "loss": 0.3458, + "step": 4495 + }, + { + "epoch": 0.42354160288264525, + "grad_norm": 0.8037379384040833, + "learning_rate": 1.8169301372601955e-05, + "loss": 0.3435, + "step": 4496 + }, + { + "epoch": 0.4236358069758131, + "grad_norm": 0.801935076713562, + "learning_rate": 1.8168430396705956e-05, + "loss": 0.3151, + "step": 4497 + }, + { + "epoch": 0.42373001106898095, + "grad_norm": 0.899776816368103, + "learning_rate": 1.8167559234557108e-05, + "loss": 0.3883, + "step": 4498 + }, + { + "epoch": 0.4238242151621488, + "grad_norm": 0.8704493641853333, + "learning_rate": 1.8166687886175283e-05, + "loss": 0.4019, + "step": 4499 + }, + { + "epoch": 0.42391841925531665, + "grad_norm": 0.7331259250640869, + "learning_rate": 1.8165816351580342e-05, + "loss": 0.3274, + "step": 4500 + }, + { + "epoch": 0.4240126233484845, + "grad_norm": 0.7820464372634888, + "learning_rate": 1.8164944630792158e-05, + "loss": 0.3836, + "step": 4501 + }, + { + "epoch": 0.42410682744165235, + "grad_norm": 0.8430935740470886, + "learning_rate": 1.816407272383061e-05, + "loss": 0.3191, + "step": 4502 + }, + { + "epoch": 0.4242010315348202, + "grad_norm": 1.1456849575042725, + "learning_rate": 1.8163200630715575e-05, + "loss": 0.3613, + "step": 4503 + }, + { + "epoch": 0.42429523562798804, + "grad_norm": 0.8349385857582092, + "learning_rate": 1.8162328351466947e-05, + "loss": 0.336, + "step": 4504 + }, + { + "epoch": 0.4243894397211559, + "grad_norm": 0.7342354655265808, + "learning_rate": 1.8161455886104608e-05, + "loss": 0.3007, + "step": 4505 + }, + { + "epoch": 0.42448364381432374, + "grad_norm": 0.7335234880447388, + "learning_rate": 1.816058323464845e-05, + "loss": 0.3106, + "step": 4506 + }, + { + "epoch": 0.4245778479074916, + "grad_norm": 0.8044220209121704, + "learning_rate": 1.8159710397118372e-05, + "loss": 0.3767, + "step": 4507 + }, + { + "epoch": 0.42467205200065944, + "grad_norm": 0.7103661298751831, + "learning_rate": 1.815883737353428e-05, + "loss": 0.3192, + "step": 4508 + }, + { + "epoch": 0.4247662560938273, + "grad_norm": 0.7537291646003723, + "learning_rate": 1.815796416391608e-05, + "loss": 0.377, + "step": 4509 + }, + { + "epoch": 0.42486046018699514, + "grad_norm": 0.7830828428268433, + "learning_rate": 1.815709076828368e-05, + "loss": 0.3996, + "step": 4510 + }, + { + "epoch": 0.424954664280163, + "grad_norm": 0.832629919052124, + "learning_rate": 1.8156217186656992e-05, + "loss": 0.3847, + "step": 4511 + }, + { + "epoch": 0.42504886837333083, + "grad_norm": 0.7425336241722107, + "learning_rate": 1.815534341905594e-05, + "loss": 0.3632, + "step": 4512 + }, + { + "epoch": 0.4251430724664987, + "grad_norm": 0.8146251440048218, + "learning_rate": 1.8154469465500447e-05, + "loss": 0.3255, + "step": 4513 + }, + { + "epoch": 0.42523727655966653, + "grad_norm": 0.6974042654037476, + "learning_rate": 1.815359532601044e-05, + "loss": 0.3256, + "step": 4514 + }, + { + "epoch": 0.4253314806528344, + "grad_norm": 0.6622070670127869, + "learning_rate": 1.815272100060585e-05, + "loss": 0.3304, + "step": 4515 + }, + { + "epoch": 0.42542568474600223, + "grad_norm": 0.7694042921066284, + "learning_rate": 1.8151846489306607e-05, + "loss": 0.3269, + "step": 4516 + }, + { + "epoch": 0.4255198888391701, + "grad_norm": 0.7804112434387207, + "learning_rate": 1.8150971792132663e-05, + "loss": 0.3764, + "step": 4517 + }, + { + "epoch": 0.42561409293233793, + "grad_norm": 0.7945122718811035, + "learning_rate": 1.8150096909103955e-05, + "loss": 0.3396, + "step": 4518 + }, + { + "epoch": 0.4257082970255058, + "grad_norm": 0.6925581097602844, + "learning_rate": 1.814922184024043e-05, + "loss": 0.3447, + "step": 4519 + }, + { + "epoch": 0.4258025011186736, + "grad_norm": 0.8137907385826111, + "learning_rate": 1.8148346585562048e-05, + "loss": 0.3574, + "step": 4520 + }, + { + "epoch": 0.4258967052118415, + "grad_norm": 1.1398564577102661, + "learning_rate": 1.814747114508876e-05, + "loss": 0.319, + "step": 4521 + }, + { + "epoch": 0.4259909093050093, + "grad_norm": 0.8205663561820984, + "learning_rate": 1.814659551884053e-05, + "loss": 0.3725, + "step": 4522 + }, + { + "epoch": 0.4260851133981772, + "grad_norm": 1.0114045143127441, + "learning_rate": 1.8145719706837322e-05, + "loss": 0.3379, + "step": 4523 + }, + { + "epoch": 0.426179317491345, + "grad_norm": 0.7457922697067261, + "learning_rate": 1.814484370909911e-05, + "loss": 0.3423, + "step": 4524 + }, + { + "epoch": 0.42627352158451287, + "grad_norm": 0.8835598230361938, + "learning_rate": 1.8143967525645863e-05, + "loss": 0.3538, + "step": 4525 + }, + { + "epoch": 0.4263677256776807, + "grad_norm": 0.748531699180603, + "learning_rate": 1.8143091156497565e-05, + "loss": 0.3278, + "step": 4526 + }, + { + "epoch": 0.42646192977084857, + "grad_norm": 0.7603402137756348, + "learning_rate": 1.814221460167419e-05, + "loss": 0.3628, + "step": 4527 + }, + { + "epoch": 0.4265561338640164, + "grad_norm": 0.8315427303314209, + "learning_rate": 1.814133786119573e-05, + "loss": 0.376, + "step": 4528 + }, + { + "epoch": 0.42665033795718427, + "grad_norm": 0.8794881105422974, + "learning_rate": 1.814046093508218e-05, + "loss": 0.3615, + "step": 4529 + }, + { + "epoch": 0.42674454205035206, + "grad_norm": 0.8055482506752014, + "learning_rate": 1.813958382335353e-05, + "loss": 0.3395, + "step": 4530 + }, + { + "epoch": 0.4268387461435199, + "grad_norm": 0.7016241550445557, + "learning_rate": 1.813870652602978e-05, + "loss": 0.3322, + "step": 4531 + }, + { + "epoch": 0.42693295023668776, + "grad_norm": 0.8527817726135254, + "learning_rate": 1.813782904313093e-05, + "loss": 0.3761, + "step": 4532 + }, + { + "epoch": 0.4270271543298556, + "grad_norm": 0.8396844863891602, + "learning_rate": 1.8136951374677e-05, + "loss": 0.3072, + "step": 4533 + }, + { + "epoch": 0.42712135842302346, + "grad_norm": 0.8562626838684082, + "learning_rate": 1.8136073520687992e-05, + "loss": 0.3916, + "step": 4534 + }, + { + "epoch": 0.4272155625161913, + "grad_norm": 0.7464911341667175, + "learning_rate": 1.8135195481183925e-05, + "loss": 0.3768, + "step": 4535 + }, + { + "epoch": 0.42730976660935915, + "grad_norm": 1.3565607070922852, + "learning_rate": 1.813431725618482e-05, + "loss": 0.3447, + "step": 4536 + }, + { + "epoch": 0.427403970702527, + "grad_norm": 0.7935023903846741, + "learning_rate": 1.81334388457107e-05, + "loss": 0.3382, + "step": 4537 + }, + { + "epoch": 0.42749817479569485, + "grad_norm": 0.7430272102355957, + "learning_rate": 1.8132560249781597e-05, + "loss": 0.3167, + "step": 4538 + }, + { + "epoch": 0.4275923788888627, + "grad_norm": 0.7452899813652039, + "learning_rate": 1.813168146841754e-05, + "loss": 0.3518, + "step": 4539 + }, + { + "epoch": 0.42768658298203055, + "grad_norm": 0.7788861393928528, + "learning_rate": 1.8130802501638575e-05, + "loss": 0.3556, + "step": 4540 + }, + { + "epoch": 0.4277807870751984, + "grad_norm": 0.7843717336654663, + "learning_rate": 1.8129923349464734e-05, + "loss": 0.3903, + "step": 4541 + }, + { + "epoch": 0.42787499116836625, + "grad_norm": 0.8972139954566956, + "learning_rate": 1.812904401191607e-05, + "loss": 0.3327, + "step": 4542 + }, + { + "epoch": 0.4279691952615341, + "grad_norm": 0.7734087705612183, + "learning_rate": 1.812816448901263e-05, + "loss": 0.3129, + "step": 4543 + }, + { + "epoch": 0.42806339935470195, + "grad_norm": 0.7029120922088623, + "learning_rate": 1.812728478077447e-05, + "loss": 0.3349, + "step": 4544 + }, + { + "epoch": 0.4281576034478698, + "grad_norm": 0.7353888750076294, + "learning_rate": 1.8126404887221646e-05, + "loss": 0.3223, + "step": 4545 + }, + { + "epoch": 0.42825180754103764, + "grad_norm": 0.7380049824714661, + "learning_rate": 1.812552480837422e-05, + "loss": 0.3266, + "step": 4546 + }, + { + "epoch": 0.4283460116342055, + "grad_norm": 0.7154632210731506, + "learning_rate": 1.812464454425227e-05, + "loss": 0.3189, + "step": 4547 + }, + { + "epoch": 0.42844021572737334, + "grad_norm": 0.6878069043159485, + "learning_rate": 1.8123764094875855e-05, + "loss": 0.3043, + "step": 4548 + }, + { + "epoch": 0.4285344198205412, + "grad_norm": 0.8377850651741028, + "learning_rate": 1.8122883460265055e-05, + "loss": 0.355, + "step": 4549 + }, + { + "epoch": 0.42862862391370904, + "grad_norm": 0.7448782920837402, + "learning_rate": 1.812200264043995e-05, + "loss": 0.3119, + "step": 4550 + }, + { + "epoch": 0.4287228280068769, + "grad_norm": 0.8376103043556213, + "learning_rate": 1.8121121635420623e-05, + "loss": 0.3454, + "step": 4551 + }, + { + "epoch": 0.42881703210004474, + "grad_norm": 0.7931928038597107, + "learning_rate": 1.8120240445227164e-05, + "loss": 0.2978, + "step": 4552 + }, + { + "epoch": 0.4289112361932126, + "grad_norm": 0.8019053339958191, + "learning_rate": 1.8119359069879665e-05, + "loss": 0.3702, + "step": 4553 + }, + { + "epoch": 0.42900544028638044, + "grad_norm": 0.9101332426071167, + "learning_rate": 1.811847750939822e-05, + "loss": 0.3298, + "step": 4554 + }, + { + "epoch": 0.4290996443795483, + "grad_norm": 0.7293367385864258, + "learning_rate": 1.8117595763802938e-05, + "loss": 0.3044, + "step": 4555 + }, + { + "epoch": 0.42919384847271613, + "grad_norm": 0.8479684591293335, + "learning_rate": 1.8116713833113913e-05, + "loss": 0.3822, + "step": 4556 + }, + { + "epoch": 0.429288052565884, + "grad_norm": 0.7806784510612488, + "learning_rate": 1.8115831717351263e-05, + "loss": 0.3257, + "step": 4557 + }, + { + "epoch": 0.42938225665905183, + "grad_norm": 0.8044211864471436, + "learning_rate": 1.81149494165351e-05, + "loss": 0.3169, + "step": 4558 + }, + { + "epoch": 0.4294764607522197, + "grad_norm": 0.825407087802887, + "learning_rate": 1.8114066930685535e-05, + "loss": 0.3354, + "step": 4559 + }, + { + "epoch": 0.42957066484538753, + "grad_norm": 0.8185461163520813, + "learning_rate": 1.8113184259822695e-05, + "loss": 0.2993, + "step": 4560 + }, + { + "epoch": 0.4296648689385554, + "grad_norm": 0.8297215104103088, + "learning_rate": 1.811230140396671e-05, + "loss": 0.3611, + "step": 4561 + }, + { + "epoch": 0.4297590730317232, + "grad_norm": 0.8848738074302673, + "learning_rate": 1.811141836313771e-05, + "loss": 0.3444, + "step": 4562 + }, + { + "epoch": 0.4298532771248911, + "grad_norm": 0.8999459743499756, + "learning_rate": 1.811053513735582e-05, + "loss": 0.3424, + "step": 4563 + }, + { + "epoch": 0.4299474812180589, + "grad_norm": 0.726396918296814, + "learning_rate": 1.810965172664119e-05, + "loss": 0.3749, + "step": 4564 + }, + { + "epoch": 0.4300416853112268, + "grad_norm": 0.8606882691383362, + "learning_rate": 1.8108768131013958e-05, + "loss": 0.3863, + "step": 4565 + }, + { + "epoch": 0.4301358894043946, + "grad_norm": 0.8336482644081116, + "learning_rate": 1.8107884350494274e-05, + "loss": 0.3232, + "step": 4566 + }, + { + "epoch": 0.43023009349756247, + "grad_norm": 0.8066686987876892, + "learning_rate": 1.8107000385102284e-05, + "loss": 0.34, + "step": 4567 + }, + { + "epoch": 0.4303242975907303, + "grad_norm": 0.7661321759223938, + "learning_rate": 1.810611623485815e-05, + "loss": 0.3749, + "step": 4568 + }, + { + "epoch": 0.43041850168389817, + "grad_norm": 0.8105167150497437, + "learning_rate": 1.810523189978203e-05, + "loss": 0.3508, + "step": 4569 + }, + { + "epoch": 0.430512705777066, + "grad_norm": 0.8608072400093079, + "learning_rate": 1.8104347379894084e-05, + "loss": 0.3066, + "step": 4570 + }, + { + "epoch": 0.43060690987023387, + "grad_norm": 0.7740591764450073, + "learning_rate": 1.8103462675214485e-05, + "loss": 0.3455, + "step": 4571 + }, + { + "epoch": 0.4307011139634017, + "grad_norm": 0.8389507532119751, + "learning_rate": 1.8102577785763407e-05, + "loss": 0.3804, + "step": 4572 + }, + { + "epoch": 0.43079531805656957, + "grad_norm": 0.9266282320022583, + "learning_rate": 1.8101692711561027e-05, + "loss": 0.3919, + "step": 4573 + }, + { + "epoch": 0.4308895221497374, + "grad_norm": 0.7471932768821716, + "learning_rate": 1.810080745262752e-05, + "loss": 0.3466, + "step": 4574 + }, + { + "epoch": 0.43098372624290526, + "grad_norm": 0.7252283692359924, + "learning_rate": 1.8099922008983075e-05, + "loss": 0.3256, + "step": 4575 + }, + { + "epoch": 0.4310779303360731, + "grad_norm": 0.8164095878601074, + "learning_rate": 1.809903638064788e-05, + "loss": 0.3707, + "step": 4576 + }, + { + "epoch": 0.43117213442924096, + "grad_norm": 0.8254788517951965, + "learning_rate": 1.8098150567642134e-05, + "loss": 0.3351, + "step": 4577 + }, + { + "epoch": 0.4312663385224088, + "grad_norm": 0.8885326385498047, + "learning_rate": 1.809726456998603e-05, + "loss": 0.322, + "step": 4578 + }, + { + "epoch": 0.43136054261557666, + "grad_norm": 0.7449198961257935, + "learning_rate": 1.809637838769977e-05, + "loss": 0.3487, + "step": 4579 + }, + { + "epoch": 0.4314547467087445, + "grad_norm": 0.812122106552124, + "learning_rate": 1.809549202080356e-05, + "loss": 0.3639, + "step": 4580 + }, + { + "epoch": 0.43154895080191236, + "grad_norm": 0.8443834781646729, + "learning_rate": 1.8094605469317613e-05, + "loss": 0.3817, + "step": 4581 + }, + { + "epoch": 0.4316431548950802, + "grad_norm": 0.8098493814468384, + "learning_rate": 1.8093718733262143e-05, + "loss": 0.3586, + "step": 4582 + }, + { + "epoch": 0.43173735898824805, + "grad_norm": 0.8115017414093018, + "learning_rate": 1.809283181265737e-05, + "loss": 0.3549, + "step": 4583 + }, + { + "epoch": 0.4318315630814159, + "grad_norm": 0.873196542263031, + "learning_rate": 1.8091944707523516e-05, + "loss": 0.3747, + "step": 4584 + }, + { + "epoch": 0.43192576717458375, + "grad_norm": 0.7385686635971069, + "learning_rate": 1.8091057417880807e-05, + "loss": 0.3292, + "step": 4585 + }, + { + "epoch": 0.4320199712677516, + "grad_norm": 0.806738555431366, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.3408, + "step": 4586 + }, + { + "epoch": 0.43211417536091945, + "grad_norm": 0.8190243244171143, + "learning_rate": 1.808928228514976e-05, + "loss": 0.3532, + "step": 4587 + }, + { + "epoch": 0.4322083794540873, + "grad_norm": 0.8552412390708923, + "learning_rate": 1.8088394442101896e-05, + "loss": 0.364, + "step": 4588 + }, + { + "epoch": 0.43230258354725515, + "grad_norm": 0.7642033100128174, + "learning_rate": 1.8087506414626127e-05, + "loss": 0.3511, + "step": 4589 + }, + { + "epoch": 0.432396787640423, + "grad_norm": 0.7361543774604797, + "learning_rate": 1.808661820274271e-05, + "loss": 0.3543, + "step": 4590 + }, + { + "epoch": 0.43249099173359085, + "grad_norm": 0.726957380771637, + "learning_rate": 1.8085729806471888e-05, + "loss": 0.3334, + "step": 4591 + }, + { + "epoch": 0.4325851958267587, + "grad_norm": 0.7136059403419495, + "learning_rate": 1.808484122583392e-05, + "loss": 0.3368, + "step": 4592 + }, + { + "epoch": 0.43267939991992654, + "grad_norm": 1.0841796398162842, + "learning_rate": 1.808395246084907e-05, + "loss": 0.3448, + "step": 4593 + }, + { + "epoch": 0.4327736040130944, + "grad_norm": 0.899951696395874, + "learning_rate": 1.8083063511537605e-05, + "loss": 0.3709, + "step": 4594 + }, + { + "epoch": 0.43286780810626224, + "grad_norm": 0.7780426740646362, + "learning_rate": 1.8082174377919792e-05, + "loss": 0.3625, + "step": 4595 + }, + { + "epoch": 0.4329620121994301, + "grad_norm": 0.8089221715927124, + "learning_rate": 1.80812850600159e-05, + "loss": 0.3615, + "step": 4596 + }, + { + "epoch": 0.43305621629259794, + "grad_norm": 0.7787632942199707, + "learning_rate": 1.8080395557846213e-05, + "loss": 0.3034, + "step": 4597 + }, + { + "epoch": 0.4331504203857658, + "grad_norm": 0.9258681535720825, + "learning_rate": 1.807950587143101e-05, + "loss": 0.3316, + "step": 4598 + }, + { + "epoch": 0.4332446244789336, + "grad_norm": 0.7238567471504211, + "learning_rate": 1.8078616000790577e-05, + "loss": 0.3164, + "step": 4599 + }, + { + "epoch": 0.43333882857210143, + "grad_norm": 0.8651083707809448, + "learning_rate": 1.8077725945945203e-05, + "loss": 0.3395, + "step": 4600 + }, + { + "epoch": 0.4334330326652693, + "grad_norm": 0.850307047367096, + "learning_rate": 1.807683570691519e-05, + "loss": 0.3185, + "step": 4601 + }, + { + "epoch": 0.43352723675843713, + "grad_norm": 0.7419808506965637, + "learning_rate": 1.8075945283720832e-05, + "loss": 0.3684, + "step": 4602 + }, + { + "epoch": 0.433621440851605, + "grad_norm": 0.8960341811180115, + "learning_rate": 1.8075054676382426e-05, + "loss": 0.3858, + "step": 4603 + }, + { + "epoch": 0.4337156449447728, + "grad_norm": 0.7304967641830444, + "learning_rate": 1.807416388492029e-05, + "loss": 0.3001, + "step": 4604 + }, + { + "epoch": 0.4338098490379407, + "grad_norm": 0.7991971969604492, + "learning_rate": 1.8073272909354727e-05, + "loss": 0.3751, + "step": 4605 + }, + { + "epoch": 0.4339040531311085, + "grad_norm": 0.785723865032196, + "learning_rate": 1.8072381749706058e-05, + "loss": 0.3603, + "step": 4606 + }, + { + "epoch": 0.4339982572242764, + "grad_norm": 0.7629894614219666, + "learning_rate": 1.80714904059946e-05, + "loss": 0.3187, + "step": 4607 + }, + { + "epoch": 0.4340924613174442, + "grad_norm": 0.8983370065689087, + "learning_rate": 1.807059887824068e-05, + "loss": 0.3929, + "step": 4608 + }, + { + "epoch": 0.43418666541061207, + "grad_norm": 0.8052110075950623, + "learning_rate": 1.8069707166464624e-05, + "loss": 0.3542, + "step": 4609 + }, + { + "epoch": 0.4342808695037799, + "grad_norm": 0.7605381608009338, + "learning_rate": 1.8068815270686763e-05, + "loss": 0.3391, + "step": 4610 + }, + { + "epoch": 0.43437507359694777, + "grad_norm": 0.7355569005012512, + "learning_rate": 1.8067923190927437e-05, + "loss": 0.3396, + "step": 4611 + }, + { + "epoch": 0.4344692776901156, + "grad_norm": 0.7521963715553284, + "learning_rate": 1.8067030927206984e-05, + "loss": 0.3223, + "step": 4612 + }, + { + "epoch": 0.43456348178328347, + "grad_norm": 0.9807536602020264, + "learning_rate": 1.806613847954575e-05, + "loss": 0.4031, + "step": 4613 + }, + { + "epoch": 0.4346576858764513, + "grad_norm": 0.7357286810874939, + "learning_rate": 1.8065245847964085e-05, + "loss": 0.3321, + "step": 4614 + }, + { + "epoch": 0.43475188996961917, + "grad_norm": 0.8147128820419312, + "learning_rate": 1.806435303248234e-05, + "loss": 0.3851, + "step": 4615 + }, + { + "epoch": 0.434846094062787, + "grad_norm": 0.8538789749145508, + "learning_rate": 1.8063460033120873e-05, + "loss": 0.3206, + "step": 4616 + }, + { + "epoch": 0.43494029815595486, + "grad_norm": 0.9050279855728149, + "learning_rate": 1.8062566849900048e-05, + "loss": 0.3643, + "step": 4617 + }, + { + "epoch": 0.4350345022491227, + "grad_norm": 0.7920427918434143, + "learning_rate": 1.8061673482840228e-05, + "loss": 0.3764, + "step": 4618 + }, + { + "epoch": 0.43512870634229056, + "grad_norm": 0.7693020105361938, + "learning_rate": 1.806077993196179e-05, + "loss": 0.4008, + "step": 4619 + }, + { + "epoch": 0.4352229104354584, + "grad_norm": 0.7530403733253479, + "learning_rate": 1.80598861972851e-05, + "loss": 0.3302, + "step": 4620 + }, + { + "epoch": 0.43531711452862626, + "grad_norm": 0.8060566186904907, + "learning_rate": 1.805899227883054e-05, + "loss": 0.3652, + "step": 4621 + }, + { + "epoch": 0.4354113186217941, + "grad_norm": 0.8145161867141724, + "learning_rate": 1.8058098176618496e-05, + "loss": 0.3356, + "step": 4622 + }, + { + "epoch": 0.43550552271496196, + "grad_norm": 0.8926414847373962, + "learning_rate": 1.8057203890669346e-05, + "loss": 0.3025, + "step": 4623 + }, + { + "epoch": 0.4355997268081298, + "grad_norm": 0.9973188638687134, + "learning_rate": 1.805630942100349e-05, + "loss": 0.3763, + "step": 4624 + }, + { + "epoch": 0.43569393090129765, + "grad_norm": 1.064535140991211, + "learning_rate": 1.8055414767641316e-05, + "loss": 0.2956, + "step": 4625 + }, + { + "epoch": 0.4357881349944655, + "grad_norm": 0.7880619168281555, + "learning_rate": 1.805451993060323e-05, + "loss": 0.3397, + "step": 4626 + }, + { + "epoch": 0.43588233908763335, + "grad_norm": 0.7307702898979187, + "learning_rate": 1.8053624909909634e-05, + "loss": 0.3222, + "step": 4627 + }, + { + "epoch": 0.4359765431808012, + "grad_norm": 0.6933474540710449, + "learning_rate": 1.8052729705580935e-05, + "loss": 0.311, + "step": 4628 + }, + { + "epoch": 0.43607074727396905, + "grad_norm": 0.8088884949684143, + "learning_rate": 1.8051834317637547e-05, + "loss": 0.3392, + "step": 4629 + }, + { + "epoch": 0.4361649513671369, + "grad_norm": 0.793786883354187, + "learning_rate": 1.805093874609988e-05, + "loss": 0.3733, + "step": 4630 + }, + { + "epoch": 0.43625915546030475, + "grad_norm": 0.8003841042518616, + "learning_rate": 1.8050042990988358e-05, + "loss": 0.3027, + "step": 4631 + }, + { + "epoch": 0.4363533595534726, + "grad_norm": 0.8275167346000671, + "learning_rate": 1.804914705232341e-05, + "loss": 0.3455, + "step": 4632 + }, + { + "epoch": 0.43644756364664045, + "grad_norm": 0.7653775215148926, + "learning_rate": 1.804825093012546e-05, + "loss": 0.3003, + "step": 4633 + }, + { + "epoch": 0.4365417677398083, + "grad_norm": 0.8436541557312012, + "learning_rate": 1.804735462441494e-05, + "loss": 0.3288, + "step": 4634 + }, + { + "epoch": 0.43663597183297614, + "grad_norm": 0.7628139853477478, + "learning_rate": 1.804645813521229e-05, + "loss": 0.2999, + "step": 4635 + }, + { + "epoch": 0.436730175926144, + "grad_norm": 0.8498921394348145, + "learning_rate": 1.804556146253795e-05, + "loss": 0.3578, + "step": 4636 + }, + { + "epoch": 0.43682438001931184, + "grad_norm": 0.9100565314292908, + "learning_rate": 1.8044664606412366e-05, + "loss": 0.3661, + "step": 4637 + }, + { + "epoch": 0.4369185841124797, + "grad_norm": 0.7419862747192383, + "learning_rate": 1.8043767566855988e-05, + "loss": 0.3399, + "step": 4638 + }, + { + "epoch": 0.43701278820564754, + "grad_norm": 0.8369824290275574, + "learning_rate": 1.804287034388927e-05, + "loss": 0.3425, + "step": 4639 + }, + { + "epoch": 0.4371069922988154, + "grad_norm": 0.7686251401901245, + "learning_rate": 1.804197293753267e-05, + "loss": 0.3035, + "step": 4640 + }, + { + "epoch": 0.43720119639198324, + "grad_norm": 0.7537437081336975, + "learning_rate": 1.8041075347806647e-05, + "loss": 0.3458, + "step": 4641 + }, + { + "epoch": 0.4372954004851511, + "grad_norm": 0.8594371676445007, + "learning_rate": 1.8040177574731673e-05, + "loss": 0.3297, + "step": 4642 + }, + { + "epoch": 0.43738960457831894, + "grad_norm": 0.6765527129173279, + "learning_rate": 1.8039279618328215e-05, + "loss": 0.2741, + "step": 4643 + }, + { + "epoch": 0.4374838086714868, + "grad_norm": 0.8351267576217651, + "learning_rate": 1.8038381478616747e-05, + "loss": 0.3206, + "step": 4644 + }, + { + "epoch": 0.43757801276465463, + "grad_norm": 0.8332872986793518, + "learning_rate": 1.8037483155617755e-05, + "loss": 0.36, + "step": 4645 + }, + { + "epoch": 0.4376722168578225, + "grad_norm": 0.9106321334838867, + "learning_rate": 1.8036584649351713e-05, + "loss": 0.3788, + "step": 4646 + }, + { + "epoch": 0.43776642095099033, + "grad_norm": 0.6762860417366028, + "learning_rate": 1.803568595983911e-05, + "loss": 0.3089, + "step": 4647 + }, + { + "epoch": 0.4378606250441582, + "grad_norm": 0.7984324097633362, + "learning_rate": 1.8034787087100442e-05, + "loss": 0.3536, + "step": 4648 + }, + { + "epoch": 0.43795482913732603, + "grad_norm": 0.7609680891036987, + "learning_rate": 1.8033888031156204e-05, + "loss": 0.3261, + "step": 4649 + }, + { + "epoch": 0.4380490332304939, + "grad_norm": 0.8033218383789062, + "learning_rate": 1.8032988792026894e-05, + "loss": 0.2957, + "step": 4650 + }, + { + "epoch": 0.4381432373236617, + "grad_norm": 0.8091560006141663, + "learning_rate": 1.8032089369733015e-05, + "loss": 0.3087, + "step": 4651 + }, + { + "epoch": 0.4382374414168296, + "grad_norm": 0.7749262452125549, + "learning_rate": 1.8031189764295075e-05, + "loss": 0.3672, + "step": 4652 + }, + { + "epoch": 0.4383316455099974, + "grad_norm": 0.6797177791595459, + "learning_rate": 1.8030289975733592e-05, + "loss": 0.3116, + "step": 4653 + }, + { + "epoch": 0.4384258496031653, + "grad_norm": 0.7387074828147888, + "learning_rate": 1.8029390004069075e-05, + "loss": 0.352, + "step": 4654 + }, + { + "epoch": 0.4385200536963331, + "grad_norm": 0.747401237487793, + "learning_rate": 1.8028489849322052e-05, + "loss": 0.3357, + "step": 4655 + }, + { + "epoch": 0.43861425778950097, + "grad_norm": 0.7775043249130249, + "learning_rate": 1.802758951151304e-05, + "loss": 0.3648, + "step": 4656 + }, + { + "epoch": 0.4387084618826688, + "grad_norm": 0.7466880679130554, + "learning_rate": 1.8026688990662574e-05, + "loss": 0.3228, + "step": 4657 + }, + { + "epoch": 0.43880266597583667, + "grad_norm": 0.7472065687179565, + "learning_rate": 1.8025788286791183e-05, + "loss": 0.3074, + "step": 4658 + }, + { + "epoch": 0.4388968700690045, + "grad_norm": 0.8351142406463623, + "learning_rate": 1.802488739991941e-05, + "loss": 0.2955, + "step": 4659 + }, + { + "epoch": 0.43899107416217237, + "grad_norm": 0.9496912956237793, + "learning_rate": 1.8023986330067795e-05, + "loss": 0.3861, + "step": 4660 + }, + { + "epoch": 0.4390852782553402, + "grad_norm": 0.9483456015586853, + "learning_rate": 1.8023085077256878e-05, + "loss": 0.3534, + "step": 4661 + }, + { + "epoch": 0.43917948234850807, + "grad_norm": 0.6802927851676941, + "learning_rate": 1.8022183641507215e-05, + "loss": 0.3231, + "step": 4662 + }, + { + "epoch": 0.4392736864416759, + "grad_norm": 0.952570378780365, + "learning_rate": 1.8021282022839363e-05, + "loss": 0.4003, + "step": 4663 + }, + { + "epoch": 0.43936789053484376, + "grad_norm": 0.8370252847671509, + "learning_rate": 1.802038022127387e-05, + "loss": 0.3964, + "step": 4664 + }, + { + "epoch": 0.4394620946280116, + "grad_norm": 1.0444804430007935, + "learning_rate": 1.8019478236831306e-05, + "loss": 0.3511, + "step": 4665 + }, + { + "epoch": 0.43955629872117946, + "grad_norm": 0.7669371962547302, + "learning_rate": 1.8018576069532235e-05, + "loss": 0.334, + "step": 4666 + }, + { + "epoch": 0.4396505028143473, + "grad_norm": 0.8720434904098511, + "learning_rate": 1.801767371939723e-05, + "loss": 0.313, + "step": 4667 + }, + { + "epoch": 0.4397447069075151, + "grad_norm": 0.7665590047836304, + "learning_rate": 1.8016771186446864e-05, + "loss": 0.3384, + "step": 4668 + }, + { + "epoch": 0.43983891100068295, + "grad_norm": 0.7445639967918396, + "learning_rate": 1.8015868470701715e-05, + "loss": 0.3087, + "step": 4669 + }, + { + "epoch": 0.4399331150938508, + "grad_norm": 1.0419539213180542, + "learning_rate": 1.801496557218237e-05, + "loss": 0.3779, + "step": 4670 + }, + { + "epoch": 0.44002731918701865, + "grad_norm": 0.6913175582885742, + "learning_rate": 1.8014062490909414e-05, + "loss": 0.3212, + "step": 4671 + }, + { + "epoch": 0.4401215232801865, + "grad_norm": 0.7021580934524536, + "learning_rate": 1.801315922690344e-05, + "loss": 0.3479, + "step": 4672 + }, + { + "epoch": 0.44021572737335435, + "grad_norm": 0.8642510771751404, + "learning_rate": 1.8012255780185043e-05, + "loss": 0.3567, + "step": 4673 + }, + { + "epoch": 0.4403099314665222, + "grad_norm": 1.0100830793380737, + "learning_rate": 1.8011352150774823e-05, + "loss": 0.3944, + "step": 4674 + }, + { + "epoch": 0.44040413555969005, + "grad_norm": 0.7227535247802734, + "learning_rate": 1.8010448338693382e-05, + "loss": 0.3056, + "step": 4675 + }, + { + "epoch": 0.4404983396528579, + "grad_norm": 0.7552227973937988, + "learning_rate": 1.8009544343961335e-05, + "loss": 0.2957, + "step": 4676 + }, + { + "epoch": 0.44059254374602574, + "grad_norm": 0.8633525371551514, + "learning_rate": 1.8008640166599283e-05, + "loss": 0.3687, + "step": 4677 + }, + { + "epoch": 0.4406867478391936, + "grad_norm": 0.9835649728775024, + "learning_rate": 1.8007735806627856e-05, + "loss": 0.3463, + "step": 4678 + }, + { + "epoch": 0.44078095193236144, + "grad_norm": 0.7401981949806213, + "learning_rate": 1.8006831264067668e-05, + "loss": 0.3413, + "step": 4679 + }, + { + "epoch": 0.4408751560255293, + "grad_norm": 0.8326581120491028, + "learning_rate": 1.8005926538939344e-05, + "loss": 0.4091, + "step": 4680 + }, + { + "epoch": 0.44096936011869714, + "grad_norm": 0.9332050085067749, + "learning_rate": 1.800502163126351e-05, + "loss": 0.3743, + "step": 4681 + }, + { + "epoch": 0.441063564211865, + "grad_norm": 0.8416963815689087, + "learning_rate": 1.8004116541060804e-05, + "loss": 0.3267, + "step": 4682 + }, + { + "epoch": 0.44115776830503284, + "grad_norm": 1.0526691675186157, + "learning_rate": 1.8003211268351863e-05, + "loss": 0.3646, + "step": 4683 + }, + { + "epoch": 0.4412519723982007, + "grad_norm": 0.7330875992774963, + "learning_rate": 1.8002305813157327e-05, + "loss": 0.3313, + "step": 4684 + }, + { + "epoch": 0.44134617649136854, + "grad_norm": 0.677757740020752, + "learning_rate": 1.8001400175497844e-05, + "loss": 0.3143, + "step": 4685 + }, + { + "epoch": 0.4414403805845364, + "grad_norm": 0.7628189921379089, + "learning_rate": 1.8000494355394064e-05, + "loss": 0.3374, + "step": 4686 + }, + { + "epoch": 0.44153458467770423, + "grad_norm": 0.8393889665603638, + "learning_rate": 1.7999588352866638e-05, + "loss": 0.3582, + "step": 4687 + }, + { + "epoch": 0.4416287887708721, + "grad_norm": 0.7568100690841675, + "learning_rate": 1.7998682167936227e-05, + "loss": 0.3194, + "step": 4688 + }, + { + "epoch": 0.44172299286403993, + "grad_norm": 1.173956036567688, + "learning_rate": 1.799777580062349e-05, + "loss": 0.3112, + "step": 4689 + }, + { + "epoch": 0.4418171969572078, + "grad_norm": 0.738646388053894, + "learning_rate": 1.7996869250949095e-05, + "loss": 0.3075, + "step": 4690 + }, + { + "epoch": 0.44191140105037563, + "grad_norm": 0.6442466378211975, + "learning_rate": 1.799596251893372e-05, + "loss": 0.2764, + "step": 4691 + }, + { + "epoch": 0.4420056051435435, + "grad_norm": 0.7446337342262268, + "learning_rate": 1.7995055604598027e-05, + "loss": 0.3342, + "step": 4692 + }, + { + "epoch": 0.4420998092367113, + "grad_norm": 0.8343163728713989, + "learning_rate": 1.7994148507962706e-05, + "loss": 0.3139, + "step": 4693 + }, + { + "epoch": 0.4421940133298792, + "grad_norm": 0.8424333930015564, + "learning_rate": 1.7993241229048434e-05, + "loss": 0.3566, + "step": 4694 + }, + { + "epoch": 0.442288217423047, + "grad_norm": 0.8876257538795471, + "learning_rate": 1.79923337678759e-05, + "loss": 0.3852, + "step": 4695 + }, + { + "epoch": 0.4423824215162149, + "grad_norm": 0.7722113728523254, + "learning_rate": 1.7991426124465792e-05, + "loss": 0.3317, + "step": 4696 + }, + { + "epoch": 0.4424766256093827, + "grad_norm": 0.8378174901008606, + "learning_rate": 1.7990518298838817e-05, + "loss": 0.3857, + "step": 4697 + }, + { + "epoch": 0.44257082970255057, + "grad_norm": 0.9277419447898865, + "learning_rate": 1.798961029101566e-05, + "loss": 0.3287, + "step": 4698 + }, + { + "epoch": 0.4426650337957184, + "grad_norm": 0.7418336272239685, + "learning_rate": 1.7988702101017036e-05, + "loss": 0.3008, + "step": 4699 + }, + { + "epoch": 0.44275923788888627, + "grad_norm": 0.9991570711135864, + "learning_rate": 1.798779372886365e-05, + "loss": 0.3649, + "step": 4700 + }, + { + "epoch": 0.4428534419820541, + "grad_norm": 0.858697772026062, + "learning_rate": 1.7986885174576212e-05, + "loss": 0.3509, + "step": 4701 + }, + { + "epoch": 0.44294764607522197, + "grad_norm": 0.7897425889968872, + "learning_rate": 1.7985976438175444e-05, + "loss": 0.3493, + "step": 4702 + }, + { + "epoch": 0.4430418501683898, + "grad_norm": 0.6957587599754333, + "learning_rate": 1.7985067519682057e-05, + "loss": 0.2817, + "step": 4703 + }, + { + "epoch": 0.44313605426155767, + "grad_norm": 0.8322944045066833, + "learning_rate": 1.7984158419116783e-05, + "loss": 0.3411, + "step": 4704 + }, + { + "epoch": 0.4432302583547255, + "grad_norm": 0.924765408039093, + "learning_rate": 1.798324913650035e-05, + "loss": 0.41, + "step": 4705 + }, + { + "epoch": 0.44332446244789336, + "grad_norm": 0.743596613407135, + "learning_rate": 1.7982339671853492e-05, + "loss": 0.3418, + "step": 4706 + }, + { + "epoch": 0.4434186665410612, + "grad_norm": 0.7587670683860779, + "learning_rate": 1.7981430025196946e-05, + "loss": 0.3186, + "step": 4707 + }, + { + "epoch": 0.44351287063422906, + "grad_norm": 0.7345204949378967, + "learning_rate": 1.7980520196551444e-05, + "loss": 0.3578, + "step": 4708 + }, + { + "epoch": 0.4436070747273969, + "grad_norm": 0.8454651236534119, + "learning_rate": 1.7979610185937746e-05, + "loss": 0.3555, + "step": 4709 + }, + { + "epoch": 0.44370127882056476, + "grad_norm": 0.7994370460510254, + "learning_rate": 1.7978699993376593e-05, + "loss": 0.357, + "step": 4710 + }, + { + "epoch": 0.4437954829137326, + "grad_norm": 0.7655448913574219, + "learning_rate": 1.7977789618888742e-05, + "loss": 0.3652, + "step": 4711 + }, + { + "epoch": 0.44388968700690046, + "grad_norm": 0.8610358238220215, + "learning_rate": 1.797687906249495e-05, + "loss": 0.3611, + "step": 4712 + }, + { + "epoch": 0.4439838911000683, + "grad_norm": 0.7728049159049988, + "learning_rate": 1.7975968324215977e-05, + "loss": 0.2849, + "step": 4713 + }, + { + "epoch": 0.44407809519323616, + "grad_norm": 0.7278761267662048, + "learning_rate": 1.797505740407259e-05, + "loss": 0.3296, + "step": 4714 + }, + { + "epoch": 0.444172299286404, + "grad_norm": 0.8426651358604431, + "learning_rate": 1.7974146302085563e-05, + "loss": 0.3408, + "step": 4715 + }, + { + "epoch": 0.44426650337957185, + "grad_norm": 0.8193193674087524, + "learning_rate": 1.797323501827567e-05, + "loss": 0.3588, + "step": 4716 + }, + { + "epoch": 0.4443607074727397, + "grad_norm": 0.8741720914840698, + "learning_rate": 1.7972323552663686e-05, + "loss": 0.4083, + "step": 4717 + }, + { + "epoch": 0.44445491156590755, + "grad_norm": 0.806844174861908, + "learning_rate": 1.797141190527039e-05, + "loss": 0.3395, + "step": 4718 + }, + { + "epoch": 0.4445491156590754, + "grad_norm": 0.8051599264144897, + "learning_rate": 1.7970500076116583e-05, + "loss": 0.3676, + "step": 4719 + }, + { + "epoch": 0.44464331975224325, + "grad_norm": 0.8780580163002014, + "learning_rate": 1.7969588065223043e-05, + "loss": 0.3785, + "step": 4720 + }, + { + "epoch": 0.4447375238454111, + "grad_norm": 0.7754378914833069, + "learning_rate": 1.7968675872610572e-05, + "loss": 0.3094, + "step": 4721 + }, + { + "epoch": 0.44483172793857895, + "grad_norm": 0.730026364326477, + "learning_rate": 1.7967763498299965e-05, + "loss": 0.3207, + "step": 4722 + }, + { + "epoch": 0.4449259320317468, + "grad_norm": 0.6212615966796875, + "learning_rate": 1.796685094231203e-05, + "loss": 0.2724, + "step": 4723 + }, + { + "epoch": 0.44502013612491464, + "grad_norm": 0.8475649356842041, + "learning_rate": 1.796593820466757e-05, + "loss": 0.3158, + "step": 4724 + }, + { + "epoch": 0.4451143402180825, + "grad_norm": 0.8592053055763245, + "learning_rate": 1.7965025285387402e-05, + "loss": 0.3722, + "step": 4725 + }, + { + "epoch": 0.44520854431125034, + "grad_norm": 0.7791186571121216, + "learning_rate": 1.796411218449234e-05, + "loss": 0.3344, + "step": 4726 + }, + { + "epoch": 0.4453027484044182, + "grad_norm": 0.8951060175895691, + "learning_rate": 1.7963198902003202e-05, + "loss": 0.3531, + "step": 4727 + }, + { + "epoch": 0.44539695249758604, + "grad_norm": 0.8267094492912292, + "learning_rate": 1.7962285437940813e-05, + "loss": 0.3449, + "step": 4728 + }, + { + "epoch": 0.4454911565907539, + "grad_norm": 0.7618321776390076, + "learning_rate": 1.7961371792326004e-05, + "loss": 0.3041, + "step": 4729 + }, + { + "epoch": 0.44558536068392174, + "grad_norm": 0.7991135716438293, + "learning_rate": 1.7960457965179604e-05, + "loss": 0.3587, + "step": 4730 + }, + { + "epoch": 0.4456795647770896, + "grad_norm": 0.8610298037528992, + "learning_rate": 1.795954395652245e-05, + "loss": 0.3344, + "step": 4731 + }, + { + "epoch": 0.44577376887025744, + "grad_norm": 0.7768810987472534, + "learning_rate": 1.7958629766375387e-05, + "loss": 0.3357, + "step": 4732 + }, + { + "epoch": 0.4458679729634253, + "grad_norm": 0.8742057681083679, + "learning_rate": 1.7957715394759258e-05, + "loss": 0.3445, + "step": 4733 + }, + { + "epoch": 0.44596217705659313, + "grad_norm": 0.8278061747550964, + "learning_rate": 1.7956800841694906e-05, + "loss": 0.3595, + "step": 4734 + }, + { + "epoch": 0.446056381149761, + "grad_norm": 0.7438100576400757, + "learning_rate": 1.7955886107203194e-05, + "loss": 0.3369, + "step": 4735 + }, + { + "epoch": 0.44615058524292883, + "grad_norm": 1.0333058834075928, + "learning_rate": 1.795497119130497e-05, + "loss": 0.3849, + "step": 4736 + }, + { + "epoch": 0.4462447893360967, + "grad_norm": 0.8227161169052124, + "learning_rate": 1.7954056094021105e-05, + "loss": 0.3973, + "step": 4737 + }, + { + "epoch": 0.4463389934292645, + "grad_norm": 0.7209968566894531, + "learning_rate": 1.7953140815372457e-05, + "loss": 0.3257, + "step": 4738 + }, + { + "epoch": 0.4464331975224323, + "grad_norm": 1.1025093793869019, + "learning_rate": 1.79522253553799e-05, + "loss": 0.3451, + "step": 4739 + }, + { + "epoch": 0.4465274016156002, + "grad_norm": 0.969059944152832, + "learning_rate": 1.795130971406431e-05, + "loss": 0.4156, + "step": 4740 + }, + { + "epoch": 0.446621605708768, + "grad_norm": 0.8435564041137695, + "learning_rate": 1.7950393891446555e-05, + "loss": 0.353, + "step": 4741 + }, + { + "epoch": 0.44671580980193587, + "grad_norm": 0.8512372970581055, + "learning_rate": 1.7949477887547528e-05, + "loss": 0.3407, + "step": 4742 + }, + { + "epoch": 0.4468100138951037, + "grad_norm": 0.7670828700065613, + "learning_rate": 1.794856170238811e-05, + "loss": 0.3653, + "step": 4743 + }, + { + "epoch": 0.44690421798827157, + "grad_norm": 0.6930330395698547, + "learning_rate": 1.7947645335989192e-05, + "loss": 0.3251, + "step": 4744 + }, + { + "epoch": 0.4469984220814394, + "grad_norm": 0.7446362972259521, + "learning_rate": 1.794672878837167e-05, + "loss": 0.3119, + "step": 4745 + }, + { + "epoch": 0.44709262617460727, + "grad_norm": 0.7399316430091858, + "learning_rate": 1.794581205955644e-05, + "loss": 0.3362, + "step": 4746 + }, + { + "epoch": 0.4471868302677751, + "grad_norm": 0.8266837000846863, + "learning_rate": 1.7944895149564407e-05, + "loss": 0.338, + "step": 4747 + }, + { + "epoch": 0.44728103436094296, + "grad_norm": 0.855751097202301, + "learning_rate": 1.7943978058416477e-05, + "loss": 0.3183, + "step": 4748 + }, + { + "epoch": 0.4473752384541108, + "grad_norm": 0.8574679493904114, + "learning_rate": 1.7943060786133567e-05, + "loss": 0.3526, + "step": 4749 + }, + { + "epoch": 0.44746944254727866, + "grad_norm": 0.7882106304168701, + "learning_rate": 1.794214333273658e-05, + "loss": 0.3346, + "step": 4750 + }, + { + "epoch": 0.4475636466404465, + "grad_norm": 0.7670796513557434, + "learning_rate": 1.7941225698246445e-05, + "loss": 0.3449, + "step": 4751 + }, + { + "epoch": 0.44765785073361436, + "grad_norm": 0.7950530052185059, + "learning_rate": 1.7940307882684084e-05, + "loss": 0.2871, + "step": 4752 + }, + { + "epoch": 0.4477520548267822, + "grad_norm": 0.7690070867538452, + "learning_rate": 1.7939389886070422e-05, + "loss": 0.3564, + "step": 4753 + }, + { + "epoch": 0.44784625891995006, + "grad_norm": 0.8102042078971863, + "learning_rate": 1.7938471708426392e-05, + "loss": 0.3656, + "step": 4754 + }, + { + "epoch": 0.4479404630131179, + "grad_norm": 0.803938627243042, + "learning_rate": 1.7937553349772932e-05, + "loss": 0.3311, + "step": 4755 + }, + { + "epoch": 0.44803466710628576, + "grad_norm": 0.730629026889801, + "learning_rate": 1.7936634810130974e-05, + "loss": 0.2843, + "step": 4756 + }, + { + "epoch": 0.4481288711994536, + "grad_norm": 0.8434792757034302, + "learning_rate": 1.7935716089521474e-05, + "loss": 0.3195, + "step": 4757 + }, + { + "epoch": 0.44822307529262145, + "grad_norm": 0.946655809879303, + "learning_rate": 1.7934797187965374e-05, + "loss": 0.4033, + "step": 4758 + }, + { + "epoch": 0.4483172793857893, + "grad_norm": 0.7012725472450256, + "learning_rate": 1.7933878105483623e-05, + "loss": 0.2827, + "step": 4759 + }, + { + "epoch": 0.44841148347895715, + "grad_norm": 0.7796767354011536, + "learning_rate": 1.7932958842097186e-05, + "loss": 0.367, + "step": 4760 + }, + { + "epoch": 0.448505687572125, + "grad_norm": 0.7940994501113892, + "learning_rate": 1.793203939782702e-05, + "loss": 0.335, + "step": 4761 + }, + { + "epoch": 0.44859989166529285, + "grad_norm": 0.7789350748062134, + "learning_rate": 1.793111977269408e-05, + "loss": 0.3073, + "step": 4762 + }, + { + "epoch": 0.4486940957584607, + "grad_norm": 0.7863308191299438, + "learning_rate": 1.793019996671935e-05, + "loss": 0.3153, + "step": 4763 + }, + { + "epoch": 0.44878829985162855, + "grad_norm": 0.7704945206642151, + "learning_rate": 1.7929279979923794e-05, + "loss": 0.3236, + "step": 4764 + }, + { + "epoch": 0.4488825039447964, + "grad_norm": 0.9169216752052307, + "learning_rate": 1.7928359812328392e-05, + "loss": 0.3433, + "step": 4765 + }, + { + "epoch": 0.44897670803796424, + "grad_norm": 0.8632755875587463, + "learning_rate": 1.7927439463954125e-05, + "loss": 0.316, + "step": 4766 + }, + { + "epoch": 0.4490709121311321, + "grad_norm": 0.7209241390228271, + "learning_rate": 1.792651893482198e-05, + "loss": 0.3052, + "step": 4767 + }, + { + "epoch": 0.44916511622429994, + "grad_norm": 0.7108308672904968, + "learning_rate": 1.7925598224952945e-05, + "loss": 0.2939, + "step": 4768 + }, + { + "epoch": 0.4492593203174678, + "grad_norm": 0.9988152980804443, + "learning_rate": 1.792467733436801e-05, + "loss": 0.3072, + "step": 4769 + }, + { + "epoch": 0.44935352441063564, + "grad_norm": 0.7612809538841248, + "learning_rate": 1.792375626308818e-05, + "loss": 0.368, + "step": 4770 + }, + { + "epoch": 0.4494477285038035, + "grad_norm": 0.6917704939842224, + "learning_rate": 1.792283501113445e-05, + "loss": 0.3016, + "step": 4771 + }, + { + "epoch": 0.44954193259697134, + "grad_norm": 0.8588003516197205, + "learning_rate": 1.7921913578527827e-05, + "loss": 0.3324, + "step": 4772 + }, + { + "epoch": 0.4496361366901392, + "grad_norm": 0.9099318385124207, + "learning_rate": 1.7920991965289327e-05, + "loss": 0.3872, + "step": 4773 + }, + { + "epoch": 0.44973034078330704, + "grad_norm": 0.9846810698509216, + "learning_rate": 1.7920070171439956e-05, + "loss": 0.3522, + "step": 4774 + }, + { + "epoch": 0.4498245448764749, + "grad_norm": 0.843734860420227, + "learning_rate": 1.7919148197000738e-05, + "loss": 0.37, + "step": 4775 + }, + { + "epoch": 0.44991874896964273, + "grad_norm": 0.7469123005867004, + "learning_rate": 1.7918226041992697e-05, + "loss": 0.2989, + "step": 4776 + }, + { + "epoch": 0.4500129530628106, + "grad_norm": 0.870191752910614, + "learning_rate": 1.7917303706436856e-05, + "loss": 0.3206, + "step": 4777 + }, + { + "epoch": 0.45010715715597843, + "grad_norm": 0.8193684816360474, + "learning_rate": 1.791638119035424e-05, + "loss": 0.3457, + "step": 4778 + }, + { + "epoch": 0.4502013612491463, + "grad_norm": 0.8482380509376526, + "learning_rate": 1.79154584937659e-05, + "loss": 0.38, + "step": 4779 + }, + { + "epoch": 0.45029556534231413, + "grad_norm": 0.8874141573905945, + "learning_rate": 1.7914535616692856e-05, + "loss": 0.4028, + "step": 4780 + }, + { + "epoch": 0.450389769435482, + "grad_norm": 0.8776803016662598, + "learning_rate": 1.7913612559156163e-05, + "loss": 0.4122, + "step": 4781 + }, + { + "epoch": 0.45048397352864983, + "grad_norm": 0.8267475962638855, + "learning_rate": 1.7912689321176867e-05, + "loss": 0.368, + "step": 4782 + }, + { + "epoch": 0.4505781776218177, + "grad_norm": 1.0508227348327637, + "learning_rate": 1.7911765902776014e-05, + "loss": 0.3739, + "step": 4783 + }, + { + "epoch": 0.4506723817149855, + "grad_norm": 0.7327133417129517, + "learning_rate": 1.7910842303974666e-05, + "loss": 0.3236, + "step": 4784 + }, + { + "epoch": 0.4507665858081534, + "grad_norm": 0.6878085732460022, + "learning_rate": 1.7909918524793878e-05, + "loss": 0.2869, + "step": 4785 + }, + { + "epoch": 0.4508607899013212, + "grad_norm": 2.2699522972106934, + "learning_rate": 1.7908994565254713e-05, + "loss": 0.3113, + "step": 4786 + }, + { + "epoch": 0.4509549939944891, + "grad_norm": 0.8151803016662598, + "learning_rate": 1.7908070425378242e-05, + "loss": 0.323, + "step": 4787 + }, + { + "epoch": 0.4510491980876569, + "grad_norm": 0.8966958522796631, + "learning_rate": 1.790714610518553e-05, + "loss": 0.4524, + "step": 4788 + }, + { + "epoch": 0.45114340218082477, + "grad_norm": 0.8446221947669983, + "learning_rate": 1.7906221604697664e-05, + "loss": 0.3257, + "step": 4789 + }, + { + "epoch": 0.4512376062739926, + "grad_norm": 0.9763929843902588, + "learning_rate": 1.7905296923935718e-05, + "loss": 0.3832, + "step": 4790 + }, + { + "epoch": 0.45133181036716047, + "grad_norm": 0.7935441136360168, + "learning_rate": 1.7904372062920776e-05, + "loss": 0.3746, + "step": 4791 + }, + { + "epoch": 0.4514260144603283, + "grad_norm": 0.7477930188179016, + "learning_rate": 1.7903447021673924e-05, + "loss": 0.3361, + "step": 4792 + }, + { + "epoch": 0.45152021855349617, + "grad_norm": 0.7294282913208008, + "learning_rate": 1.7902521800216258e-05, + "loss": 0.3036, + "step": 4793 + }, + { + "epoch": 0.451614422646664, + "grad_norm": 0.7948665618896484, + "learning_rate": 1.7901596398568877e-05, + "loss": 0.3452, + "step": 4794 + }, + { + "epoch": 0.45170862673983186, + "grad_norm": 0.7534676790237427, + "learning_rate": 1.7900670816752875e-05, + "loss": 0.3087, + "step": 4795 + }, + { + "epoch": 0.4518028308329997, + "grad_norm": 0.776970386505127, + "learning_rate": 1.7899745054789358e-05, + "loss": 0.3258, + "step": 4796 + }, + { + "epoch": 0.45189703492616756, + "grad_norm": 0.7340169548988342, + "learning_rate": 1.789881911269944e-05, + "loss": 0.3417, + "step": 4797 + }, + { + "epoch": 0.4519912390193354, + "grad_norm": 0.7311638593673706, + "learning_rate": 1.789789299050423e-05, + "loss": 0.336, + "step": 4798 + }, + { + "epoch": 0.45208544311250326, + "grad_norm": 0.8584436178207397, + "learning_rate": 1.7896966688224843e-05, + "loss": 0.3535, + "step": 4799 + }, + { + "epoch": 0.4521796472056711, + "grad_norm": 0.7118861675262451, + "learning_rate": 1.7896040205882405e-05, + "loss": 0.3351, + "step": 4800 + }, + { + "epoch": 0.45227385129883896, + "grad_norm": 0.7925348281860352, + "learning_rate": 1.789511354349804e-05, + "loss": 0.3641, + "step": 4801 + }, + { + "epoch": 0.4523680553920068, + "grad_norm": 0.7541933059692383, + "learning_rate": 1.7894186701092874e-05, + "loss": 0.3218, + "step": 4802 + }, + { + "epoch": 0.45246225948517466, + "grad_norm": 0.762876570224762, + "learning_rate": 1.789325967868804e-05, + "loss": 0.3001, + "step": 4803 + }, + { + "epoch": 0.4525564635783425, + "grad_norm": 0.7414087057113647, + "learning_rate": 1.7892332476304684e-05, + "loss": 0.3026, + "step": 4804 + }, + { + "epoch": 0.45265066767151035, + "grad_norm": 1.387542963027954, + "learning_rate": 1.789140509396394e-05, + "loss": 0.3305, + "step": 4805 + }, + { + "epoch": 0.4527448717646782, + "grad_norm": 0.8495650291442871, + "learning_rate": 1.7890477531686953e-05, + "loss": 0.3466, + "step": 4806 + }, + { + "epoch": 0.452839075857846, + "grad_norm": 0.8461723327636719, + "learning_rate": 1.7889549789494876e-05, + "loss": 0.354, + "step": 4807 + }, + { + "epoch": 0.45293327995101385, + "grad_norm": 0.7747946381568909, + "learning_rate": 1.7888621867408864e-05, + "loss": 0.3115, + "step": 4808 + }, + { + "epoch": 0.4530274840441817, + "grad_norm": 0.7994644045829773, + "learning_rate": 1.788769376545007e-05, + "loss": 0.3099, + "step": 4809 + }, + { + "epoch": 0.45312168813734954, + "grad_norm": 0.7755493521690369, + "learning_rate": 1.7886765483639666e-05, + "loss": 0.3465, + "step": 4810 + }, + { + "epoch": 0.4532158922305174, + "grad_norm": 0.784080445766449, + "learning_rate": 1.7885837021998806e-05, + "loss": 0.3391, + "step": 4811 + }, + { + "epoch": 0.45331009632368524, + "grad_norm": 0.9579580426216125, + "learning_rate": 1.7884908380548668e-05, + "loss": 0.3874, + "step": 4812 + }, + { + "epoch": 0.4534043004168531, + "grad_norm": 0.8352931141853333, + "learning_rate": 1.7883979559310427e-05, + "loss": 0.3628, + "step": 4813 + }, + { + "epoch": 0.45349850451002094, + "grad_norm": 0.8074339032173157, + "learning_rate": 1.7883050558305255e-05, + "loss": 0.3758, + "step": 4814 + }, + { + "epoch": 0.4535927086031888, + "grad_norm": 0.7530412077903748, + "learning_rate": 1.7882121377554342e-05, + "loss": 0.3245, + "step": 4815 + }, + { + "epoch": 0.45368691269635664, + "grad_norm": 0.7738369703292847, + "learning_rate": 1.788119201707887e-05, + "loss": 0.321, + "step": 4816 + }, + { + "epoch": 0.4537811167895245, + "grad_norm": 0.9809846878051758, + "learning_rate": 1.7880262476900032e-05, + "loss": 0.3473, + "step": 4817 + }, + { + "epoch": 0.45387532088269233, + "grad_norm": 0.7883719205856323, + "learning_rate": 1.787933275703902e-05, + "loss": 0.3199, + "step": 4818 + }, + { + "epoch": 0.4539695249758602, + "grad_norm": 0.8401819467544556, + "learning_rate": 1.7878402857517044e-05, + "loss": 0.384, + "step": 4819 + }, + { + "epoch": 0.45406372906902803, + "grad_norm": 0.7190160751342773, + "learning_rate": 1.7877472778355292e-05, + "loss": 0.3125, + "step": 4820 + }, + { + "epoch": 0.4541579331621959, + "grad_norm": 0.8060850501060486, + "learning_rate": 1.787654251957498e-05, + "loss": 0.3674, + "step": 4821 + }, + { + "epoch": 0.45425213725536373, + "grad_norm": 0.7536591291427612, + "learning_rate": 1.7875612081197315e-05, + "loss": 0.3106, + "step": 4822 + }, + { + "epoch": 0.4543463413485316, + "grad_norm": 0.8911111950874329, + "learning_rate": 1.7874681463243516e-05, + "loss": 0.3664, + "step": 4823 + }, + { + "epoch": 0.45444054544169943, + "grad_norm": 0.8450865745544434, + "learning_rate": 1.7873750665734803e-05, + "loss": 0.33, + "step": 4824 + }, + { + "epoch": 0.4545347495348673, + "grad_norm": 0.7691038846969604, + "learning_rate": 1.7872819688692394e-05, + "loss": 0.3602, + "step": 4825 + }, + { + "epoch": 0.4546289536280351, + "grad_norm": 0.9228634238243103, + "learning_rate": 1.7871888532137524e-05, + "loss": 0.3903, + "step": 4826 + }, + { + "epoch": 0.454723157721203, + "grad_norm": 0.7325657606124878, + "learning_rate": 1.7870957196091415e-05, + "loss": 0.2993, + "step": 4827 + }, + { + "epoch": 0.4548173618143708, + "grad_norm": 0.7746484875679016, + "learning_rate": 1.7870025680575315e-05, + "loss": 0.3491, + "step": 4828 + }, + { + "epoch": 0.4549115659075387, + "grad_norm": 0.8059161901473999, + "learning_rate": 1.7869093985610457e-05, + "loss": 0.3414, + "step": 4829 + }, + { + "epoch": 0.4550057700007065, + "grad_norm": 0.7639622688293457, + "learning_rate": 1.786816211121809e-05, + "loss": 0.3296, + "step": 4830 + }, + { + "epoch": 0.45509997409387437, + "grad_norm": 0.7499001622200012, + "learning_rate": 1.7867230057419455e-05, + "loss": 0.3494, + "step": 4831 + }, + { + "epoch": 0.4551941781870422, + "grad_norm": 0.8941529393196106, + "learning_rate": 1.7866297824235808e-05, + "loss": 0.3568, + "step": 4832 + }, + { + "epoch": 0.45528838228021007, + "grad_norm": 0.8757671117782593, + "learning_rate": 1.7865365411688405e-05, + "loss": 0.3385, + "step": 4833 + }, + { + "epoch": 0.4553825863733779, + "grad_norm": 0.9858109354972839, + "learning_rate": 1.7864432819798506e-05, + "loss": 0.33, + "step": 4834 + }, + { + "epoch": 0.45547679046654577, + "grad_norm": 0.778272271156311, + "learning_rate": 1.7863500048587378e-05, + "loss": 0.3484, + "step": 4835 + }, + { + "epoch": 0.4555709945597136, + "grad_norm": 0.7088613510131836, + "learning_rate": 1.7862567098076285e-05, + "loss": 0.3089, + "step": 4836 + }, + { + "epoch": 0.45566519865288146, + "grad_norm": 0.7945699095726013, + "learning_rate": 1.7861633968286502e-05, + "loss": 0.3196, + "step": 4837 + }, + { + "epoch": 0.4557594027460493, + "grad_norm": 0.9303908944129944, + "learning_rate": 1.786070065923931e-05, + "loss": 0.3346, + "step": 4838 + }, + { + "epoch": 0.45585360683921716, + "grad_norm": 0.6546573042869568, + "learning_rate": 1.7859767170955983e-05, + "loss": 0.3137, + "step": 4839 + }, + { + "epoch": 0.455947810932385, + "grad_norm": 0.7632937431335449, + "learning_rate": 1.785883350345781e-05, + "loss": 0.351, + "step": 4840 + }, + { + "epoch": 0.45604201502555286, + "grad_norm": 0.7434275150299072, + "learning_rate": 1.785789965676608e-05, + "loss": 0.3104, + "step": 4841 + }, + { + "epoch": 0.4561362191187207, + "grad_norm": 0.8117392063140869, + "learning_rate": 1.7856965630902085e-05, + "loss": 0.3675, + "step": 4842 + }, + { + "epoch": 0.45623042321188856, + "grad_norm": 0.8150480389595032, + "learning_rate": 1.7856031425887127e-05, + "loss": 0.3365, + "step": 4843 + }, + { + "epoch": 0.4563246273050564, + "grad_norm": 0.7850632667541504, + "learning_rate": 1.7855097041742495e-05, + "loss": 0.3739, + "step": 4844 + }, + { + "epoch": 0.45641883139822426, + "grad_norm": 0.7791478037834167, + "learning_rate": 1.7854162478489507e-05, + "loss": 0.3229, + "step": 4845 + }, + { + "epoch": 0.4565130354913921, + "grad_norm": 0.8242077827453613, + "learning_rate": 1.7853227736149468e-05, + "loss": 0.3356, + "step": 4846 + }, + { + "epoch": 0.45660723958455995, + "grad_norm": 0.777420163154602, + "learning_rate": 1.785229281474369e-05, + "loss": 0.3696, + "step": 4847 + }, + { + "epoch": 0.4567014436777278, + "grad_norm": 0.8795902729034424, + "learning_rate": 1.7851357714293488e-05, + "loss": 0.3829, + "step": 4848 + }, + { + "epoch": 0.45679564777089565, + "grad_norm": 0.9282544255256653, + "learning_rate": 1.7850422434820194e-05, + "loss": 0.4562, + "step": 4849 + }, + { + "epoch": 0.4568898518640635, + "grad_norm": 0.775903046131134, + "learning_rate": 1.7849486976345125e-05, + "loss": 0.3489, + "step": 4850 + }, + { + "epoch": 0.45698405595723135, + "grad_norm": 1.5967013835906982, + "learning_rate": 1.7848551338889615e-05, + "loss": 0.3815, + "step": 4851 + }, + { + "epoch": 0.4570782600503992, + "grad_norm": 0.7737221121788025, + "learning_rate": 1.7847615522474993e-05, + "loss": 0.2724, + "step": 4852 + }, + { + "epoch": 0.45717246414356705, + "grad_norm": 0.7455960512161255, + "learning_rate": 1.7846679527122605e-05, + "loss": 0.3158, + "step": 4853 + }, + { + "epoch": 0.4572666682367349, + "grad_norm": 0.8008338809013367, + "learning_rate": 1.784574335285379e-05, + "loss": 0.3704, + "step": 4854 + }, + { + "epoch": 0.45736087232990275, + "grad_norm": 0.8408627510070801, + "learning_rate": 1.7844806999689886e-05, + "loss": 0.331, + "step": 4855 + }, + { + "epoch": 0.4574550764230706, + "grad_norm": 0.8857258558273315, + "learning_rate": 1.7843870467652252e-05, + "loss": 0.3796, + "step": 4856 + }, + { + "epoch": 0.45754928051623844, + "grad_norm": 0.6914681196212769, + "learning_rate": 1.7842933756762246e-05, + "loss": 0.3093, + "step": 4857 + }, + { + "epoch": 0.4576434846094063, + "grad_norm": 0.6860592365264893, + "learning_rate": 1.7841996867041213e-05, + "loss": 0.2916, + "step": 4858 + }, + { + "epoch": 0.45773768870257414, + "grad_norm": 0.7824400663375854, + "learning_rate": 1.784105979851053e-05, + "loss": 0.3269, + "step": 4859 + }, + { + "epoch": 0.457831892795742, + "grad_norm": 0.7608857154846191, + "learning_rate": 1.7840122551191555e-05, + "loss": 0.3494, + "step": 4860 + }, + { + "epoch": 0.45792609688890984, + "grad_norm": 0.8246377110481262, + "learning_rate": 1.783918512510566e-05, + "loss": 0.3698, + "step": 4861 + }, + { + "epoch": 0.4580203009820777, + "grad_norm": 0.8004178404808044, + "learning_rate": 1.7838247520274223e-05, + "loss": 0.3597, + "step": 4862 + }, + { + "epoch": 0.45811450507524554, + "grad_norm": 1.0507524013519287, + "learning_rate": 1.7837309736718616e-05, + "loss": 0.3652, + "step": 4863 + }, + { + "epoch": 0.4582087091684134, + "grad_norm": 0.8246618509292603, + "learning_rate": 1.783637177446023e-05, + "loss": 0.3411, + "step": 4864 + }, + { + "epoch": 0.45830291326158124, + "grad_norm": 0.843111515045166, + "learning_rate": 1.783543363352045e-05, + "loss": 0.3652, + "step": 4865 + }, + { + "epoch": 0.4583971173547491, + "grad_norm": 0.9570274949073792, + "learning_rate": 1.7834495313920662e-05, + "loss": 0.3723, + "step": 4866 + }, + { + "epoch": 0.45849132144791693, + "grad_norm": 0.9009582996368408, + "learning_rate": 1.7833556815682263e-05, + "loss": 0.3898, + "step": 4867 + }, + { + "epoch": 0.4585855255410848, + "grad_norm": 0.7815099954605103, + "learning_rate": 1.783261813882666e-05, + "loss": 0.3088, + "step": 4868 + }, + { + "epoch": 0.45867972963425263, + "grad_norm": 0.7526468634605408, + "learning_rate": 1.7831679283375245e-05, + "loss": 0.2947, + "step": 4869 + }, + { + "epoch": 0.4587739337274205, + "grad_norm": 0.7108429074287415, + "learning_rate": 1.783074024934943e-05, + "loss": 0.3357, + "step": 4870 + }, + { + "epoch": 0.45886813782058833, + "grad_norm": 0.8212223052978516, + "learning_rate": 1.7829801036770628e-05, + "loss": 0.3615, + "step": 4871 + }, + { + "epoch": 0.4589623419137562, + "grad_norm": 0.9447070360183716, + "learning_rate": 1.7828861645660257e-05, + "loss": 0.3824, + "step": 4872 + }, + { + "epoch": 0.459056546006924, + "grad_norm": 0.7781452536582947, + "learning_rate": 1.782792207603973e-05, + "loss": 0.3244, + "step": 4873 + }, + { + "epoch": 0.4591507501000919, + "grad_norm": 0.8141428232192993, + "learning_rate": 1.7826982327930474e-05, + "loss": 0.3668, + "step": 4874 + }, + { + "epoch": 0.4592449541932597, + "grad_norm": 0.748768150806427, + "learning_rate": 1.7826042401353914e-05, + "loss": 0.3, + "step": 4875 + }, + { + "epoch": 0.4593391582864275, + "grad_norm": 0.7869177460670471, + "learning_rate": 1.7825102296331483e-05, + "loss": 0.3523, + "step": 4876 + }, + { + "epoch": 0.45943336237959537, + "grad_norm": 0.8053706884384155, + "learning_rate": 1.7824162012884625e-05, + "loss": 0.3656, + "step": 4877 + }, + { + "epoch": 0.4595275664727632, + "grad_norm": 0.7964096665382385, + "learning_rate": 1.7823221551034766e-05, + "loss": 0.3686, + "step": 4878 + }, + { + "epoch": 0.45962177056593106, + "grad_norm": 0.89954674243927, + "learning_rate": 1.782228091080336e-05, + "loss": 0.4377, + "step": 4879 + }, + { + "epoch": 0.4597159746590989, + "grad_norm": 0.7223961353302002, + "learning_rate": 1.7821340092211853e-05, + "loss": 0.3245, + "step": 4880 + }, + { + "epoch": 0.45981017875226676, + "grad_norm": 0.965713620185852, + "learning_rate": 1.782039909528169e-05, + "loss": 0.3642, + "step": 4881 + }, + { + "epoch": 0.4599043828454346, + "grad_norm": 1.8892385959625244, + "learning_rate": 1.781945792003434e-05, + "loss": 0.3652, + "step": 4882 + }, + { + "epoch": 0.45999858693860246, + "grad_norm": 0.7769013047218323, + "learning_rate": 1.7818516566491254e-05, + "loss": 0.347, + "step": 4883 + }, + { + "epoch": 0.4600927910317703, + "grad_norm": 0.9221687912940979, + "learning_rate": 1.78175750346739e-05, + "loss": 0.3585, + "step": 4884 + }, + { + "epoch": 0.46018699512493816, + "grad_norm": 0.772746205329895, + "learning_rate": 1.781663332460374e-05, + "loss": 0.3258, + "step": 4885 + }, + { + "epoch": 0.460281199218106, + "grad_norm": 0.8882587552070618, + "learning_rate": 1.7815691436302255e-05, + "loss": 0.341, + "step": 4886 + }, + { + "epoch": 0.46037540331127386, + "grad_norm": 1.0558087825775146, + "learning_rate": 1.781474936979092e-05, + "loss": 0.3485, + "step": 4887 + }, + { + "epoch": 0.4604696074044417, + "grad_norm": 0.8857042193412781, + "learning_rate": 1.7813807125091213e-05, + "loss": 0.3225, + "step": 4888 + }, + { + "epoch": 0.46056381149760955, + "grad_norm": 0.8105474710464478, + "learning_rate": 1.7812864702224616e-05, + "loss": 0.3585, + "step": 4889 + }, + { + "epoch": 0.4606580155907774, + "grad_norm": 0.9532637000083923, + "learning_rate": 1.7811922101212622e-05, + "loss": 0.2935, + "step": 4890 + }, + { + "epoch": 0.46075221968394525, + "grad_norm": 0.8896369934082031, + "learning_rate": 1.7810979322076725e-05, + "loss": 0.3823, + "step": 4891 + }, + { + "epoch": 0.4608464237771131, + "grad_norm": 0.86832195520401, + "learning_rate": 1.7810036364838418e-05, + "loss": 0.4171, + "step": 4892 + }, + { + "epoch": 0.46094062787028095, + "grad_norm": 1.2206555604934692, + "learning_rate": 1.7809093229519203e-05, + "loss": 0.3373, + "step": 4893 + }, + { + "epoch": 0.4610348319634488, + "grad_norm": 0.8856639266014099, + "learning_rate": 1.7808149916140584e-05, + "loss": 0.3417, + "step": 4894 + }, + { + "epoch": 0.46112903605661665, + "grad_norm": 0.7485201954841614, + "learning_rate": 1.7807206424724076e-05, + "loss": 0.333, + "step": 4895 + }, + { + "epoch": 0.4612232401497845, + "grad_norm": 2.2062063217163086, + "learning_rate": 1.780626275529118e-05, + "loss": 0.3084, + "step": 4896 + }, + { + "epoch": 0.46131744424295235, + "grad_norm": 0.7655016779899597, + "learning_rate": 1.7805318907863428e-05, + "loss": 0.3406, + "step": 4897 + }, + { + "epoch": 0.4614116483361202, + "grad_norm": 0.9309150576591492, + "learning_rate": 1.780437488246233e-05, + "loss": 0.4256, + "step": 4898 + }, + { + "epoch": 0.46150585242928804, + "grad_norm": 0.7266963124275208, + "learning_rate": 1.7803430679109416e-05, + "loss": 0.3245, + "step": 4899 + }, + { + "epoch": 0.4616000565224559, + "grad_norm": 0.7196773290634155, + "learning_rate": 1.7802486297826216e-05, + "loss": 0.3361, + "step": 4900 + }, + { + "epoch": 0.46169426061562374, + "grad_norm": 0.7565684914588928, + "learning_rate": 1.780154173863426e-05, + "loss": 0.3536, + "step": 4901 + }, + { + "epoch": 0.4617884647087916, + "grad_norm": 0.8094671368598938, + "learning_rate": 1.7800597001555087e-05, + "loss": 0.3419, + "step": 4902 + }, + { + "epoch": 0.46188266880195944, + "grad_norm": 0.7917925119400024, + "learning_rate": 1.779965208661024e-05, + "loss": 0.3173, + "step": 4903 + }, + { + "epoch": 0.4619768728951273, + "grad_norm": 0.7965604662895203, + "learning_rate": 1.779870699382126e-05, + "loss": 0.3294, + "step": 4904 + }, + { + "epoch": 0.46207107698829514, + "grad_norm": 0.86468106508255, + "learning_rate": 1.77977617232097e-05, + "loss": 0.2982, + "step": 4905 + }, + { + "epoch": 0.462165281081463, + "grad_norm": 0.8347722291946411, + "learning_rate": 1.7796816274797115e-05, + "loss": 0.3868, + "step": 4906 + }, + { + "epoch": 0.46225948517463084, + "grad_norm": 0.8633783459663391, + "learning_rate": 1.779587064860506e-05, + "loss": 0.3621, + "step": 4907 + }, + { + "epoch": 0.4623536892677987, + "grad_norm": 0.7564566135406494, + "learning_rate": 1.77949248446551e-05, + "loss": 0.3439, + "step": 4908 + }, + { + "epoch": 0.46244789336096653, + "grad_norm": 1.1352871656417847, + "learning_rate": 1.7793978862968794e-05, + "loss": 0.3524, + "step": 4909 + }, + { + "epoch": 0.4625420974541344, + "grad_norm": 0.7950761914253235, + "learning_rate": 1.779303270356772e-05, + "loss": 0.3256, + "step": 4910 + }, + { + "epoch": 0.46263630154730223, + "grad_norm": 0.7687955498695374, + "learning_rate": 1.7792086366473447e-05, + "loss": 0.3191, + "step": 4911 + }, + { + "epoch": 0.4627305056404701, + "grad_norm": 0.7171542644500732, + "learning_rate": 1.7791139851707552e-05, + "loss": 0.3123, + "step": 4912 + }, + { + "epoch": 0.46282470973363793, + "grad_norm": 0.8845537900924683, + "learning_rate": 1.7790193159291622e-05, + "loss": 0.3339, + "step": 4913 + }, + { + "epoch": 0.4629189138268058, + "grad_norm": 0.7290259599685669, + "learning_rate": 1.7789246289247238e-05, + "loss": 0.2956, + "step": 4914 + }, + { + "epoch": 0.4630131179199736, + "grad_norm": 1.083981990814209, + "learning_rate": 1.7788299241595997e-05, + "loss": 0.3068, + "step": 4915 + }, + { + "epoch": 0.4631073220131415, + "grad_norm": 0.6418946385383606, + "learning_rate": 1.7787352016359483e-05, + "loss": 0.2986, + "step": 4916 + }, + { + "epoch": 0.4632015261063093, + "grad_norm": 0.7161775231361389, + "learning_rate": 1.7786404613559304e-05, + "loss": 0.3123, + "step": 4917 + }, + { + "epoch": 0.4632957301994772, + "grad_norm": 0.9228521585464478, + "learning_rate": 1.7785457033217054e-05, + "loss": 0.3632, + "step": 4918 + }, + { + "epoch": 0.463389934292645, + "grad_norm": 0.8408086895942688, + "learning_rate": 1.7784509275354348e-05, + "loss": 0.3463, + "step": 4919 + }, + { + "epoch": 0.46348413838581287, + "grad_norm": 0.7921558022499084, + "learning_rate": 1.7783561339992784e-05, + "loss": 0.398, + "step": 4920 + }, + { + "epoch": 0.4635783424789807, + "grad_norm": 0.8069859743118286, + "learning_rate": 1.778261322715399e-05, + "loss": 0.3098, + "step": 4921 + }, + { + "epoch": 0.46367254657214857, + "grad_norm": 1.0308336019515991, + "learning_rate": 1.7781664936859576e-05, + "loss": 0.3692, + "step": 4922 + }, + { + "epoch": 0.4637667506653164, + "grad_norm": 0.7749543190002441, + "learning_rate": 1.7780716469131167e-05, + "loss": 0.3501, + "step": 4923 + }, + { + "epoch": 0.46386095475848427, + "grad_norm": 0.8365309834480286, + "learning_rate": 1.7779767823990394e-05, + "loss": 0.3125, + "step": 4924 + }, + { + "epoch": 0.4639551588516521, + "grad_norm": 0.8372287750244141, + "learning_rate": 1.7778819001458877e-05, + "loss": 0.3244, + "step": 4925 + }, + { + "epoch": 0.46404936294481997, + "grad_norm": 0.8181421160697937, + "learning_rate": 1.777787000155826e-05, + "loss": 0.3302, + "step": 4926 + }, + { + "epoch": 0.4641435670379878, + "grad_norm": 0.8703104853630066, + "learning_rate": 1.7776920824310174e-05, + "loss": 0.3419, + "step": 4927 + }, + { + "epoch": 0.46423777113115566, + "grad_norm": 0.8228803277015686, + "learning_rate": 1.777597146973627e-05, + "loss": 0.3443, + "step": 4928 + }, + { + "epoch": 0.4643319752243235, + "grad_norm": 0.7369617819786072, + "learning_rate": 1.777502193785819e-05, + "loss": 0.3136, + "step": 4929 + }, + { + "epoch": 0.46442617931749136, + "grad_norm": 0.8577859997749329, + "learning_rate": 1.7774072228697584e-05, + "loss": 0.3719, + "step": 4930 + }, + { + "epoch": 0.4645203834106592, + "grad_norm": 0.8309158086776733, + "learning_rate": 1.777312234227611e-05, + "loss": 0.3265, + "step": 4931 + }, + { + "epoch": 0.46461458750382706, + "grad_norm": 0.8043569326400757, + "learning_rate": 1.7772172278615424e-05, + "loss": 0.3451, + "step": 4932 + }, + { + "epoch": 0.4647087915969949, + "grad_norm": 0.7470012307167053, + "learning_rate": 1.7771222037737192e-05, + "loss": 0.3274, + "step": 4933 + }, + { + "epoch": 0.46480299569016276, + "grad_norm": 0.725937008857727, + "learning_rate": 1.777027161966308e-05, + "loss": 0.3113, + "step": 4934 + }, + { + "epoch": 0.4648971997833306, + "grad_norm": 0.8681197762489319, + "learning_rate": 1.7769321024414753e-05, + "loss": 0.3397, + "step": 4935 + }, + { + "epoch": 0.46499140387649845, + "grad_norm": 0.8625389933586121, + "learning_rate": 1.7768370252013897e-05, + "loss": 0.3492, + "step": 4936 + }, + { + "epoch": 0.4650856079696663, + "grad_norm": 0.7602217197418213, + "learning_rate": 1.776741930248218e-05, + "loss": 0.3682, + "step": 4937 + }, + { + "epoch": 0.46517981206283415, + "grad_norm": 0.9192726016044617, + "learning_rate": 1.7766468175841295e-05, + "loss": 0.3581, + "step": 4938 + }, + { + "epoch": 0.465274016156002, + "grad_norm": 0.8578839898109436, + "learning_rate": 1.776551687211292e-05, + "loss": 0.3345, + "step": 4939 + }, + { + "epoch": 0.46536822024916985, + "grad_norm": 1.0106184482574463, + "learning_rate": 1.7764565391318753e-05, + "loss": 0.3459, + "step": 4940 + }, + { + "epoch": 0.4654624243423377, + "grad_norm": 0.7562296390533447, + "learning_rate": 1.7763613733480486e-05, + "loss": 0.3194, + "step": 4941 + }, + { + "epoch": 0.46555662843550555, + "grad_norm": 0.8383955359458923, + "learning_rate": 1.776266189861982e-05, + "loss": 0.3424, + "step": 4942 + }, + { + "epoch": 0.4656508325286734, + "grad_norm": 0.800328254699707, + "learning_rate": 1.7761709886758458e-05, + "loss": 0.3477, + "step": 4943 + }, + { + "epoch": 0.46574503662184125, + "grad_norm": 0.8032830357551575, + "learning_rate": 1.776075769791811e-05, + "loss": 0.336, + "step": 4944 + }, + { + "epoch": 0.46583924071500904, + "grad_norm": 0.7230134010314941, + "learning_rate": 1.775980533212048e-05, + "loss": 0.339, + "step": 4945 + }, + { + "epoch": 0.4659334448081769, + "grad_norm": 0.7199127674102783, + "learning_rate": 1.775885278938729e-05, + "loss": 0.2564, + "step": 4946 + }, + { + "epoch": 0.46602764890134474, + "grad_norm": 0.8952180743217468, + "learning_rate": 1.7757900069740253e-05, + "loss": 0.3629, + "step": 4947 + }, + { + "epoch": 0.4661218529945126, + "grad_norm": 0.8440223336219788, + "learning_rate": 1.77569471732011e-05, + "loss": 0.3758, + "step": 4948 + }, + { + "epoch": 0.46621605708768044, + "grad_norm": 0.8343638777732849, + "learning_rate": 1.7755994099791555e-05, + "loss": 0.341, + "step": 4949 + }, + { + "epoch": 0.4663102611808483, + "grad_norm": 0.8626476526260376, + "learning_rate": 1.775504084953335e-05, + "loss": 0.3534, + "step": 4950 + }, + { + "epoch": 0.46640446527401613, + "grad_norm": 0.7678931951522827, + "learning_rate": 1.7754087422448217e-05, + "loss": 0.3387, + "step": 4951 + }, + { + "epoch": 0.466498669367184, + "grad_norm": 0.6777465343475342, + "learning_rate": 1.77531338185579e-05, + "loss": 0.2817, + "step": 4952 + }, + { + "epoch": 0.46659287346035183, + "grad_norm": 0.8011043071746826, + "learning_rate": 1.7752180037884143e-05, + "loss": 0.3121, + "step": 4953 + }, + { + "epoch": 0.4666870775535197, + "grad_norm": 0.8017278909683228, + "learning_rate": 1.7751226080448694e-05, + "loss": 0.3342, + "step": 4954 + }, + { + "epoch": 0.46678128164668753, + "grad_norm": 0.805519700050354, + "learning_rate": 1.77502719462733e-05, + "loss": 0.3789, + "step": 4955 + }, + { + "epoch": 0.4668754857398554, + "grad_norm": 0.8002102375030518, + "learning_rate": 1.7749317635379718e-05, + "loss": 0.3464, + "step": 4956 + }, + { + "epoch": 0.4669696898330232, + "grad_norm": 0.8926291465759277, + "learning_rate": 1.7748363147789712e-05, + "loss": 0.3619, + "step": 4957 + }, + { + "epoch": 0.4670638939261911, + "grad_norm": 0.7907378673553467, + "learning_rate": 1.7747408483525045e-05, + "loss": 0.3027, + "step": 4958 + }, + { + "epoch": 0.4671580980193589, + "grad_norm": 0.8785030245780945, + "learning_rate": 1.774645364260748e-05, + "loss": 0.3662, + "step": 4959 + }, + { + "epoch": 0.4672523021125268, + "grad_norm": 1.0228649377822876, + "learning_rate": 1.774549862505879e-05, + "loss": 0.367, + "step": 4960 + }, + { + "epoch": 0.4673465062056946, + "grad_norm": 0.7833585739135742, + "learning_rate": 1.7744543430900755e-05, + "loss": 0.35, + "step": 4961 + }, + { + "epoch": 0.46744071029886247, + "grad_norm": 0.7718712091445923, + "learning_rate": 1.7743588060155153e-05, + "loss": 0.3274, + "step": 4962 + }, + { + "epoch": 0.4675349143920303, + "grad_norm": 0.7573497891426086, + "learning_rate": 1.7742632512843768e-05, + "loss": 0.3189, + "step": 4963 + }, + { + "epoch": 0.46762911848519817, + "grad_norm": 0.857114851474762, + "learning_rate": 1.7741676788988386e-05, + "loss": 0.3441, + "step": 4964 + }, + { + "epoch": 0.467723322578366, + "grad_norm": 0.7921775579452515, + "learning_rate": 1.7740720888610802e-05, + "loss": 0.361, + "step": 4965 + }, + { + "epoch": 0.46781752667153387, + "grad_norm": 0.6085286140441895, + "learning_rate": 1.773976481173281e-05, + "loss": 0.2574, + "step": 4966 + }, + { + "epoch": 0.4679117307647017, + "grad_norm": 0.7605984807014465, + "learning_rate": 1.773880855837621e-05, + "loss": 0.3201, + "step": 4967 + }, + { + "epoch": 0.46800593485786957, + "grad_norm": 0.8221992254257202, + "learning_rate": 1.7737852128562807e-05, + "loss": 0.3248, + "step": 4968 + }, + { + "epoch": 0.4681001389510374, + "grad_norm": 0.7544741034507751, + "learning_rate": 1.773689552231441e-05, + "loss": 0.3248, + "step": 4969 + }, + { + "epoch": 0.46819434304420526, + "grad_norm": 1.3404844999313354, + "learning_rate": 1.7735938739652827e-05, + "loss": 0.3196, + "step": 4970 + }, + { + "epoch": 0.4682885471373731, + "grad_norm": 0.8288660049438477, + "learning_rate": 1.773498178059988e-05, + "loss": 0.3616, + "step": 4971 + }, + { + "epoch": 0.46838275123054096, + "grad_norm": 1.1371781826019287, + "learning_rate": 1.7734024645177382e-05, + "loss": 0.372, + "step": 4972 + }, + { + "epoch": 0.4684769553237088, + "grad_norm": 0.8881229162216187, + "learning_rate": 1.773306733340716e-05, + "loss": 0.372, + "step": 4973 + }, + { + "epoch": 0.46857115941687666, + "grad_norm": 0.775094747543335, + "learning_rate": 1.773210984531105e-05, + "loss": 0.3763, + "step": 4974 + }, + { + "epoch": 0.4686653635100445, + "grad_norm": 0.8069360256195068, + "learning_rate": 1.773115218091087e-05, + "loss": 0.318, + "step": 4975 + }, + { + "epoch": 0.46875956760321236, + "grad_norm": 0.8223565816879272, + "learning_rate": 1.773019434022847e-05, + "loss": 0.329, + "step": 4976 + }, + { + "epoch": 0.4688537716963802, + "grad_norm": 0.7707805037498474, + "learning_rate": 1.7729236323285684e-05, + "loss": 0.2907, + "step": 4977 + }, + { + "epoch": 0.46894797578954805, + "grad_norm": 0.8500064611434937, + "learning_rate": 1.7728278130104356e-05, + "loss": 0.3522, + "step": 4978 + }, + { + "epoch": 0.4690421798827159, + "grad_norm": 0.8865066766738892, + "learning_rate": 1.772731976070633e-05, + "loss": 0.3546, + "step": 4979 + }, + { + "epoch": 0.46913638397588375, + "grad_norm": 0.7410879135131836, + "learning_rate": 1.7726361215113467e-05, + "loss": 0.3377, + "step": 4980 + }, + { + "epoch": 0.4692305880690516, + "grad_norm": 0.8143529891967773, + "learning_rate": 1.7725402493347618e-05, + "loss": 0.3295, + "step": 4981 + }, + { + "epoch": 0.46932479216221945, + "grad_norm": 0.8637810945510864, + "learning_rate": 1.7724443595430643e-05, + "loss": 0.3686, + "step": 4982 + }, + { + "epoch": 0.4694189962553873, + "grad_norm": 0.772445797920227, + "learning_rate": 1.7723484521384412e-05, + "loss": 0.3033, + "step": 4983 + }, + { + "epoch": 0.46951320034855515, + "grad_norm": 0.9185500741004944, + "learning_rate": 1.772252527123079e-05, + "loss": 0.3474, + "step": 4984 + }, + { + "epoch": 0.469607404441723, + "grad_norm": 0.8710028529167175, + "learning_rate": 1.7721565844991643e-05, + "loss": 0.3813, + "step": 4985 + }, + { + "epoch": 0.46970160853489085, + "grad_norm": 0.7688071131706238, + "learning_rate": 1.772060624268886e-05, + "loss": 0.3218, + "step": 4986 + }, + { + "epoch": 0.4697958126280587, + "grad_norm": 0.7208905816078186, + "learning_rate": 1.7719646464344307e-05, + "loss": 0.3245, + "step": 4987 + }, + { + "epoch": 0.46989001672122654, + "grad_norm": 0.788725733757019, + "learning_rate": 1.771868650997988e-05, + "loss": 0.3381, + "step": 4988 + }, + { + "epoch": 0.4699842208143944, + "grad_norm": 0.8555259704589844, + "learning_rate": 1.7717726379617462e-05, + "loss": 0.3384, + "step": 4989 + }, + { + "epoch": 0.47007842490756224, + "grad_norm": 0.8116970658302307, + "learning_rate": 1.771676607327895e-05, + "loss": 0.3377, + "step": 4990 + }, + { + "epoch": 0.4701726290007301, + "grad_norm": 0.817533552646637, + "learning_rate": 1.7715805590986232e-05, + "loss": 0.3524, + "step": 4991 + }, + { + "epoch": 0.47026683309389794, + "grad_norm": 1.2434886693954468, + "learning_rate": 1.771484493276122e-05, + "loss": 0.3988, + "step": 4992 + }, + { + "epoch": 0.4703610371870658, + "grad_norm": 0.8891094923019409, + "learning_rate": 1.771388409862581e-05, + "loss": 0.3913, + "step": 4993 + }, + { + "epoch": 0.47045524128023364, + "grad_norm": 0.7836058139801025, + "learning_rate": 1.771292308860191e-05, + "loss": 0.3209, + "step": 4994 + }, + { + "epoch": 0.4705494453734015, + "grad_norm": 0.7556778192520142, + "learning_rate": 1.7711961902711437e-05, + "loss": 0.323, + "step": 4995 + }, + { + "epoch": 0.47064364946656934, + "grad_norm": 1.004414439201355, + "learning_rate": 1.7711000540976305e-05, + "loss": 0.3219, + "step": 4996 + }, + { + "epoch": 0.4707378535597372, + "grad_norm": 0.7572587132453918, + "learning_rate": 1.7710039003418437e-05, + "loss": 0.3538, + "step": 4997 + }, + { + "epoch": 0.47083205765290503, + "grad_norm": 0.7744436264038086, + "learning_rate": 1.7709077290059755e-05, + "loss": 0.2975, + "step": 4998 + }, + { + "epoch": 0.4709262617460729, + "grad_norm": 0.7824291586875916, + "learning_rate": 1.7708115400922187e-05, + "loss": 0.3389, + "step": 4999 + }, + { + "epoch": 0.47102046583924073, + "grad_norm": 0.7729814648628235, + "learning_rate": 1.770715333602767e-05, + "loss": 0.3669, + "step": 5000 + }, + { + "epoch": 0.4711146699324086, + "grad_norm": 0.8225262761116028, + "learning_rate": 1.7706191095398138e-05, + "loss": 0.3689, + "step": 5001 + }, + { + "epoch": 0.47120887402557643, + "grad_norm": 0.7559041380882263, + "learning_rate": 1.770522867905553e-05, + "loss": 0.3428, + "step": 5002 + }, + { + "epoch": 0.4713030781187443, + "grad_norm": 0.7640758752822876, + "learning_rate": 1.770426608702179e-05, + "loss": 0.3299, + "step": 5003 + }, + { + "epoch": 0.4713972822119121, + "grad_norm": 0.747041642665863, + "learning_rate": 1.770330331931887e-05, + "loss": 0.3372, + "step": 5004 + }, + { + "epoch": 0.47149148630508, + "grad_norm": 1.1163392066955566, + "learning_rate": 1.7702340375968724e-05, + "loss": 0.3193, + "step": 5005 + }, + { + "epoch": 0.4715856903982478, + "grad_norm": 0.6798670887947083, + "learning_rate": 1.7701377256993296e-05, + "loss": 0.3568, + "step": 5006 + }, + { + "epoch": 0.4716798944914157, + "grad_norm": 0.778786838054657, + "learning_rate": 1.7700413962414565e-05, + "loss": 0.3651, + "step": 5007 + }, + { + "epoch": 0.4717740985845835, + "grad_norm": 0.8317043781280518, + "learning_rate": 1.7699450492254484e-05, + "loss": 0.3241, + "step": 5008 + }, + { + "epoch": 0.4718683026777514, + "grad_norm": 0.7771879434585571, + "learning_rate": 1.7698486846535026e-05, + "loss": 0.3363, + "step": 5009 + }, + { + "epoch": 0.4719625067709192, + "grad_norm": 1.0484275817871094, + "learning_rate": 1.7697523025278158e-05, + "loss": 0.4066, + "step": 5010 + }, + { + "epoch": 0.47205671086408707, + "grad_norm": 0.7146066427230835, + "learning_rate": 1.7696559028505864e-05, + "loss": 0.3468, + "step": 5011 + }, + { + "epoch": 0.4721509149572549, + "grad_norm": 0.9016075134277344, + "learning_rate": 1.7695594856240118e-05, + "loss": 0.3458, + "step": 5012 + }, + { + "epoch": 0.47224511905042277, + "grad_norm": 0.8408075571060181, + "learning_rate": 1.769463050850291e-05, + "loss": 0.3434, + "step": 5013 + }, + { + "epoch": 0.47233932314359056, + "grad_norm": 0.8501318693161011, + "learning_rate": 1.769366598531623e-05, + "loss": 0.372, + "step": 5014 + }, + { + "epoch": 0.4724335272367584, + "grad_norm": 0.7973361611366272, + "learning_rate": 1.7692701286702062e-05, + "loss": 0.3209, + "step": 5015 + }, + { + "epoch": 0.47252773132992626, + "grad_norm": 0.8058852553367615, + "learning_rate": 1.7691736412682408e-05, + "loss": 0.3414, + "step": 5016 + }, + { + "epoch": 0.4726219354230941, + "grad_norm": 0.9156191349029541, + "learning_rate": 1.7690771363279272e-05, + "loss": 0.3019, + "step": 5017 + }, + { + "epoch": 0.47271613951626196, + "grad_norm": 0.8219974040985107, + "learning_rate": 1.768980613851465e-05, + "loss": 0.3451, + "step": 5018 + }, + { + "epoch": 0.4728103436094298, + "grad_norm": 0.805870532989502, + "learning_rate": 1.768884073841056e-05, + "loss": 0.2751, + "step": 5019 + }, + { + "epoch": 0.47290454770259766, + "grad_norm": 0.8654406666755676, + "learning_rate": 1.768787516298901e-05, + "loss": 0.3512, + "step": 5020 + }, + { + "epoch": 0.4729987517957655, + "grad_norm": 0.7221662998199463, + "learning_rate": 1.768690941227201e-05, + "loss": 0.3044, + "step": 5021 + }, + { + "epoch": 0.47309295588893335, + "grad_norm": 0.8214747309684753, + "learning_rate": 1.7685943486281595e-05, + "loss": 0.323, + "step": 5022 + }, + { + "epoch": 0.4731871599821012, + "grad_norm": 0.8377925753593445, + "learning_rate": 1.7684977385039776e-05, + "loss": 0.3466, + "step": 5023 + }, + { + "epoch": 0.47328136407526905, + "grad_norm": 0.9490770101547241, + "learning_rate": 1.7684011108568593e-05, + "loss": 0.378, + "step": 5024 + }, + { + "epoch": 0.4733755681684369, + "grad_norm": 0.7713491320610046, + "learning_rate": 1.768304465689007e-05, + "loss": 0.3266, + "step": 5025 + }, + { + "epoch": 0.47346977226160475, + "grad_norm": 0.8269317150115967, + "learning_rate": 1.7682078030026245e-05, + "loss": 0.374, + "step": 5026 + }, + { + "epoch": 0.4735639763547726, + "grad_norm": 0.7009604573249817, + "learning_rate": 1.7681111227999164e-05, + "loss": 0.2888, + "step": 5027 + }, + { + "epoch": 0.47365818044794045, + "grad_norm": 0.8689761161804199, + "learning_rate": 1.7680144250830868e-05, + "loss": 0.3569, + "step": 5028 + }, + { + "epoch": 0.4737523845411083, + "grad_norm": 0.7176832556724548, + "learning_rate": 1.7679177098543405e-05, + "loss": 0.3106, + "step": 5029 + }, + { + "epoch": 0.47384658863427614, + "grad_norm": 0.9018487334251404, + "learning_rate": 1.7678209771158827e-05, + "loss": 0.3323, + "step": 5030 + }, + { + "epoch": 0.473940792727444, + "grad_norm": 0.7709859609603882, + "learning_rate": 1.7677242268699192e-05, + "loss": 0.3564, + "step": 5031 + }, + { + "epoch": 0.47403499682061184, + "grad_norm": 0.7048654556274414, + "learning_rate": 1.7676274591186562e-05, + "loss": 0.3244, + "step": 5032 + }, + { + "epoch": 0.4741292009137797, + "grad_norm": 0.7146427631378174, + "learning_rate": 1.7675306738642996e-05, + "loss": 0.3219, + "step": 5033 + }, + { + "epoch": 0.47422340500694754, + "grad_norm": 0.7720121145248413, + "learning_rate": 1.7674338711090568e-05, + "loss": 0.3171, + "step": 5034 + }, + { + "epoch": 0.4743176091001154, + "grad_norm": 0.862352192401886, + "learning_rate": 1.767337050855135e-05, + "loss": 0.3547, + "step": 5035 + }, + { + "epoch": 0.47441181319328324, + "grad_norm": 0.8553547263145447, + "learning_rate": 1.7672402131047414e-05, + "loss": 0.3598, + "step": 5036 + }, + { + "epoch": 0.4745060172864511, + "grad_norm": 0.7929760217666626, + "learning_rate": 1.7671433578600846e-05, + "loss": 0.3262, + "step": 5037 + }, + { + "epoch": 0.47460022137961894, + "grad_norm": 0.699970543384552, + "learning_rate": 1.767046485123373e-05, + "loss": 0.3004, + "step": 5038 + }, + { + "epoch": 0.4746944254727868, + "grad_norm": 0.7730126976966858, + "learning_rate": 1.7669495948968152e-05, + "loss": 0.3368, + "step": 5039 + }, + { + "epoch": 0.47478862956595463, + "grad_norm": 0.8011327385902405, + "learning_rate": 1.7668526871826204e-05, + "loss": 0.3613, + "step": 5040 + }, + { + "epoch": 0.4748828336591225, + "grad_norm": 0.7555213570594788, + "learning_rate": 1.7667557619829985e-05, + "loss": 0.328, + "step": 5041 + }, + { + "epoch": 0.47497703775229033, + "grad_norm": 0.7480307221412659, + "learning_rate": 1.7666588193001595e-05, + "loss": 0.337, + "step": 5042 + }, + { + "epoch": 0.4750712418454582, + "grad_norm": 0.750718891620636, + "learning_rate": 1.7665618591363135e-05, + "loss": 0.3007, + "step": 5043 + }, + { + "epoch": 0.47516544593862603, + "grad_norm": 0.8020790815353394, + "learning_rate": 1.7664648814936716e-05, + "loss": 0.3826, + "step": 5044 + }, + { + "epoch": 0.4752596500317939, + "grad_norm": 0.8922072052955627, + "learning_rate": 1.7663678863744455e-05, + "loss": 0.442, + "step": 5045 + }, + { + "epoch": 0.4753538541249617, + "grad_norm": 0.856925904750824, + "learning_rate": 1.7662708737808457e-05, + "loss": 0.3588, + "step": 5046 + }, + { + "epoch": 0.4754480582181296, + "grad_norm": 0.6952528953552246, + "learning_rate": 1.7661738437150853e-05, + "loss": 0.2948, + "step": 5047 + }, + { + "epoch": 0.4755422623112974, + "grad_norm": 0.8187353610992432, + "learning_rate": 1.7660767961793764e-05, + "loss": 0.3601, + "step": 5048 + }, + { + "epoch": 0.4756364664044653, + "grad_norm": 0.8927239179611206, + "learning_rate": 1.7659797311759314e-05, + "loss": 0.3659, + "step": 5049 + }, + { + "epoch": 0.4757306704976331, + "grad_norm": 0.7681635618209839, + "learning_rate": 1.7658826487069642e-05, + "loss": 0.3202, + "step": 5050 + }, + { + "epoch": 0.475824874590801, + "grad_norm": 0.7328925132751465, + "learning_rate": 1.7657855487746883e-05, + "loss": 0.3278, + "step": 5051 + }, + { + "epoch": 0.4759190786839688, + "grad_norm": 0.8765425682067871, + "learning_rate": 1.7656884313813174e-05, + "loss": 0.3556, + "step": 5052 + }, + { + "epoch": 0.47601328277713667, + "grad_norm": 0.7214697003364563, + "learning_rate": 1.765591296529066e-05, + "loss": 0.3369, + "step": 5053 + }, + { + "epoch": 0.4761074868703045, + "grad_norm": 0.7442885041236877, + "learning_rate": 1.7654941442201493e-05, + "loss": 0.3216, + "step": 5054 + }, + { + "epoch": 0.47620169096347237, + "grad_norm": 0.8426844477653503, + "learning_rate": 1.765396974456782e-05, + "loss": 0.3394, + "step": 5055 + }, + { + "epoch": 0.4762958950566402, + "grad_norm": 0.7568658590316772, + "learning_rate": 1.76529978724118e-05, + "loss": 0.3103, + "step": 5056 + }, + { + "epoch": 0.47639009914980807, + "grad_norm": 0.7060257792472839, + "learning_rate": 1.7652025825755593e-05, + "loss": 0.299, + "step": 5057 + }, + { + "epoch": 0.4764843032429759, + "grad_norm": 0.8324049711227417, + "learning_rate": 1.7651053604621367e-05, + "loss": 0.3264, + "step": 5058 + }, + { + "epoch": 0.47657850733614376, + "grad_norm": 0.6237125992774963, + "learning_rate": 1.7650081209031282e-05, + "loss": 0.2504, + "step": 5059 + }, + { + "epoch": 0.4766727114293116, + "grad_norm": 0.8777263760566711, + "learning_rate": 1.7649108639007516e-05, + "loss": 0.3544, + "step": 5060 + }, + { + "epoch": 0.47676691552247946, + "grad_norm": 0.7348818778991699, + "learning_rate": 1.764813589457224e-05, + "loss": 0.2901, + "step": 5061 + }, + { + "epoch": 0.4768611196156473, + "grad_norm": 0.7848557829856873, + "learning_rate": 1.7647162975747646e-05, + "loss": 0.3155, + "step": 5062 + }, + { + "epoch": 0.47695532370881516, + "grad_norm": 0.7364405393600464, + "learning_rate": 1.76461898825559e-05, + "loss": 0.3081, + "step": 5063 + }, + { + "epoch": 0.477049527801983, + "grad_norm": 0.8653562068939209, + "learning_rate": 1.7645216615019204e-05, + "loss": 0.3288, + "step": 5064 + }, + { + "epoch": 0.47714373189515086, + "grad_norm": 0.9205647110939026, + "learning_rate": 1.7644243173159746e-05, + "loss": 0.3057, + "step": 5065 + }, + { + "epoch": 0.4772379359883187, + "grad_norm": 0.8097381591796875, + "learning_rate": 1.764326955699972e-05, + "loss": 0.3551, + "step": 5066 + }, + { + "epoch": 0.47733214008148656, + "grad_norm": 0.8962020874023438, + "learning_rate": 1.764229576656133e-05, + "loss": 0.39, + "step": 5067 + }, + { + "epoch": 0.4774263441746544, + "grad_norm": 0.901494562625885, + "learning_rate": 1.7641321801866776e-05, + "loss": 0.2975, + "step": 5068 + }, + { + "epoch": 0.47752054826782225, + "grad_norm": 0.64666748046875, + "learning_rate": 1.7640347662938268e-05, + "loss": 0.3007, + "step": 5069 + }, + { + "epoch": 0.4776147523609901, + "grad_norm": 0.6876158714294434, + "learning_rate": 1.7639373349798016e-05, + "loss": 0.28, + "step": 5070 + }, + { + "epoch": 0.47770895645415795, + "grad_norm": 0.992737889289856, + "learning_rate": 1.7638398862468237e-05, + "loss": 0.2968, + "step": 5071 + }, + { + "epoch": 0.4778031605473258, + "grad_norm": 0.7393843531608582, + "learning_rate": 1.763742420097115e-05, + "loss": 0.347, + "step": 5072 + }, + { + "epoch": 0.47789736464049365, + "grad_norm": 0.7149779796600342, + "learning_rate": 1.7636449365328983e-05, + "loss": 0.3087, + "step": 5073 + }, + { + "epoch": 0.4779915687336615, + "grad_norm": 0.7476661205291748, + "learning_rate": 1.763547435556396e-05, + "loss": 0.2981, + "step": 5074 + }, + { + "epoch": 0.47808577282682935, + "grad_norm": 0.8229698538780212, + "learning_rate": 1.763449917169831e-05, + "loss": 0.3592, + "step": 5075 + }, + { + "epoch": 0.4781799769199972, + "grad_norm": 0.7402316927909851, + "learning_rate": 1.7633523813754276e-05, + "loss": 0.3137, + "step": 5076 + }, + { + "epoch": 0.47827418101316505, + "grad_norm": 0.876528799533844, + "learning_rate": 1.763254828175409e-05, + "loss": 0.4053, + "step": 5077 + }, + { + "epoch": 0.4783683851063329, + "grad_norm": 0.8130778670310974, + "learning_rate": 1.763157257572e-05, + "loss": 0.362, + "step": 5078 + }, + { + "epoch": 0.47846258919950074, + "grad_norm": 0.6440185308456421, + "learning_rate": 1.7630596695674254e-05, + "loss": 0.2755, + "step": 5079 + }, + { + "epoch": 0.4785567932926686, + "grad_norm": 0.7436035871505737, + "learning_rate": 1.7629620641639102e-05, + "loss": 0.2935, + "step": 5080 + }, + { + "epoch": 0.47865099738583644, + "grad_norm": 0.7836533188819885, + "learning_rate": 1.76286444136368e-05, + "loss": 0.3261, + "step": 5081 + }, + { + "epoch": 0.4787452014790043, + "grad_norm": 0.7561307549476624, + "learning_rate": 1.7627668011689607e-05, + "loss": 0.3232, + "step": 5082 + }, + { + "epoch": 0.4788394055721721, + "grad_norm": 0.8446324467658997, + "learning_rate": 1.7626691435819787e-05, + "loss": 0.3189, + "step": 5083 + }, + { + "epoch": 0.47893360966533993, + "grad_norm": 0.9219985008239746, + "learning_rate": 1.7625714686049607e-05, + "loss": 0.3258, + "step": 5084 + }, + { + "epoch": 0.4790278137585078, + "grad_norm": 0.7031102776527405, + "learning_rate": 1.762473776240134e-05, + "loss": 0.3253, + "step": 5085 + }, + { + "epoch": 0.47912201785167563, + "grad_norm": 0.894633412361145, + "learning_rate": 1.762376066489726e-05, + "loss": 0.3357, + "step": 5086 + }, + { + "epoch": 0.4792162219448435, + "grad_norm": 0.8764644265174866, + "learning_rate": 1.7622783393559646e-05, + "loss": 0.3381, + "step": 5087 + }, + { + "epoch": 0.47931042603801133, + "grad_norm": 0.8285947442054749, + "learning_rate": 1.7621805948410782e-05, + "loss": 0.3491, + "step": 5088 + }, + { + "epoch": 0.4794046301311792, + "grad_norm": 0.8000497817993164, + "learning_rate": 1.7620828329472952e-05, + "loss": 0.3458, + "step": 5089 + }, + { + "epoch": 0.479498834224347, + "grad_norm": 0.8902774453163147, + "learning_rate": 1.7619850536768455e-05, + "loss": 0.3236, + "step": 5090 + }, + { + "epoch": 0.4795930383175149, + "grad_norm": 0.6526225209236145, + "learning_rate": 1.7618872570319577e-05, + "loss": 0.2875, + "step": 5091 + }, + { + "epoch": 0.4796872424106827, + "grad_norm": 0.7107720375061035, + "learning_rate": 1.7617894430148626e-05, + "loss": 0.2934, + "step": 5092 + }, + { + "epoch": 0.4797814465038506, + "grad_norm": 0.812839150428772, + "learning_rate": 1.76169161162779e-05, + "loss": 0.3133, + "step": 5093 + }, + { + "epoch": 0.4798756505970184, + "grad_norm": 0.7085913419723511, + "learning_rate": 1.7615937628729704e-05, + "loss": 0.3124, + "step": 5094 + }, + { + "epoch": 0.47996985469018627, + "grad_norm": 0.8842501640319824, + "learning_rate": 1.7614958967526354e-05, + "loss": 0.3006, + "step": 5095 + }, + { + "epoch": 0.4800640587833541, + "grad_norm": 0.8363269567489624, + "learning_rate": 1.761398013269016e-05, + "loss": 0.3572, + "step": 5096 + }, + { + "epoch": 0.48015826287652197, + "grad_norm": 0.9766021370887756, + "learning_rate": 1.7613001124243448e-05, + "loss": 0.3211, + "step": 5097 + }, + { + "epoch": 0.4802524669696898, + "grad_norm": 0.8415078520774841, + "learning_rate": 1.761202194220853e-05, + "loss": 0.3477, + "step": 5098 + }, + { + "epoch": 0.48034667106285767, + "grad_norm": 0.7183197140693665, + "learning_rate": 1.7611042586607748e-05, + "loss": 0.3358, + "step": 5099 + }, + { + "epoch": 0.4804408751560255, + "grad_norm": 0.7389125823974609, + "learning_rate": 1.761006305746342e-05, + "loss": 0.3196, + "step": 5100 + }, + { + "epoch": 0.48053507924919336, + "grad_norm": 0.7939767837524414, + "learning_rate": 1.7609083354797884e-05, + "loss": 0.3451, + "step": 5101 + }, + { + "epoch": 0.4806292833423612, + "grad_norm": 0.8271040916442871, + "learning_rate": 1.7608103478633483e-05, + "loss": 0.2931, + "step": 5102 + }, + { + "epoch": 0.48072348743552906, + "grad_norm": 0.8910988569259644, + "learning_rate": 1.7607123428992554e-05, + "loss": 0.3108, + "step": 5103 + }, + { + "epoch": 0.4808176915286969, + "grad_norm": 0.777472198009491, + "learning_rate": 1.7606143205897445e-05, + "loss": 0.349, + "step": 5104 + }, + { + "epoch": 0.48091189562186476, + "grad_norm": 0.8044363856315613, + "learning_rate": 1.760516280937051e-05, + "loss": 0.3259, + "step": 5105 + }, + { + "epoch": 0.4810060997150326, + "grad_norm": 0.8904417157173157, + "learning_rate": 1.76041822394341e-05, + "loss": 0.3641, + "step": 5106 + }, + { + "epoch": 0.48110030380820046, + "grad_norm": 0.7185295820236206, + "learning_rate": 1.760320149611058e-05, + "loss": 0.2872, + "step": 5107 + }, + { + "epoch": 0.4811945079013683, + "grad_norm": 0.8121291399002075, + "learning_rate": 1.76022205794223e-05, + "loss": 0.3497, + "step": 5108 + }, + { + "epoch": 0.48128871199453616, + "grad_norm": 0.8890112638473511, + "learning_rate": 1.760123948939164e-05, + "loss": 0.3693, + "step": 5109 + }, + { + "epoch": 0.481382916087704, + "grad_norm": 0.7963833212852478, + "learning_rate": 1.7600258226040957e-05, + "loss": 0.349, + "step": 5110 + }, + { + "epoch": 0.48147712018087185, + "grad_norm": 0.8878795504570007, + "learning_rate": 1.759927678939264e-05, + "loss": 0.3739, + "step": 5111 + }, + { + "epoch": 0.4815713242740397, + "grad_norm": 0.8152940273284912, + "learning_rate": 1.7598295179469053e-05, + "loss": 0.2985, + "step": 5112 + }, + { + "epoch": 0.48166552836720755, + "grad_norm": 0.7406737208366394, + "learning_rate": 1.759731339629259e-05, + "loss": 0.3328, + "step": 5113 + }, + { + "epoch": 0.4817597324603754, + "grad_norm": 0.77753084897995, + "learning_rate": 1.759633143988563e-05, + "loss": 0.3112, + "step": 5114 + }, + { + "epoch": 0.48185393655354325, + "grad_norm": 0.7563633918762207, + "learning_rate": 1.7595349310270565e-05, + "loss": 0.3617, + "step": 5115 + }, + { + "epoch": 0.4819481406467111, + "grad_norm": 0.8029349446296692, + "learning_rate": 1.759436700746979e-05, + "loss": 0.3483, + "step": 5116 + }, + { + "epoch": 0.48204234473987895, + "grad_norm": 0.6659033894538879, + "learning_rate": 1.7593384531505703e-05, + "loss": 0.3155, + "step": 5117 + }, + { + "epoch": 0.4821365488330468, + "grad_norm": 0.9070823788642883, + "learning_rate": 1.7592401882400704e-05, + "loss": 0.3096, + "step": 5118 + }, + { + "epoch": 0.48223075292621465, + "grad_norm": 0.8133385181427002, + "learning_rate": 1.75914190601772e-05, + "loss": 0.3642, + "step": 5119 + }, + { + "epoch": 0.4823249570193825, + "grad_norm": 0.9977114200592041, + "learning_rate": 1.7590436064857603e-05, + "loss": 0.3252, + "step": 5120 + }, + { + "epoch": 0.48241916111255034, + "grad_norm": 0.6693086624145508, + "learning_rate": 1.758945289646432e-05, + "loss": 0.2655, + "step": 5121 + }, + { + "epoch": 0.4825133652057182, + "grad_norm": 0.8078154921531677, + "learning_rate": 1.7588469555019777e-05, + "loss": 0.3635, + "step": 5122 + }, + { + "epoch": 0.48260756929888604, + "grad_norm": 0.7313911318778992, + "learning_rate": 1.7587486040546392e-05, + "loss": 0.273, + "step": 5123 + }, + { + "epoch": 0.4827017733920539, + "grad_norm": 0.8575907349586487, + "learning_rate": 1.7586502353066593e-05, + "loss": 0.3633, + "step": 5124 + }, + { + "epoch": 0.48279597748522174, + "grad_norm": 0.7386175394058228, + "learning_rate": 1.7585518492602807e-05, + "loss": 0.3086, + "step": 5125 + }, + { + "epoch": 0.4828901815783896, + "grad_norm": 0.7343104481697083, + "learning_rate": 1.7584534459177466e-05, + "loss": 0.3236, + "step": 5126 + }, + { + "epoch": 0.48298438567155744, + "grad_norm": 0.9238309860229492, + "learning_rate": 1.758355025281301e-05, + "loss": 0.3199, + "step": 5127 + }, + { + "epoch": 0.4830785897647253, + "grad_norm": 0.7808854579925537, + "learning_rate": 1.758256587353188e-05, + "loss": 0.3399, + "step": 5128 + }, + { + "epoch": 0.48317279385789313, + "grad_norm": 0.7596392035484314, + "learning_rate": 1.7581581321356524e-05, + "loss": 0.3142, + "step": 5129 + }, + { + "epoch": 0.483266997951061, + "grad_norm": 0.8428837060928345, + "learning_rate": 1.7580596596309387e-05, + "loss": 0.3767, + "step": 5130 + }, + { + "epoch": 0.48336120204422883, + "grad_norm": 1.090317964553833, + "learning_rate": 1.7579611698412923e-05, + "loss": 0.3112, + "step": 5131 + }, + { + "epoch": 0.4834554061373967, + "grad_norm": 0.8067708611488342, + "learning_rate": 1.7578626627689594e-05, + "loss": 0.363, + "step": 5132 + }, + { + "epoch": 0.48354961023056453, + "grad_norm": 0.7280093431472778, + "learning_rate": 1.757764138416185e-05, + "loss": 0.3154, + "step": 5133 + }, + { + "epoch": 0.4836438143237324, + "grad_norm": 0.9062605500221252, + "learning_rate": 1.757665596785217e-05, + "loss": 0.3212, + "step": 5134 + }, + { + "epoch": 0.48373801841690023, + "grad_norm": 0.7609298825263977, + "learning_rate": 1.7575670378783013e-05, + "loss": 0.3096, + "step": 5135 + }, + { + "epoch": 0.4838322225100681, + "grad_norm": 0.7838581204414368, + "learning_rate": 1.7574684616976857e-05, + "loss": 0.3518, + "step": 5136 + }, + { + "epoch": 0.4839264266032359, + "grad_norm": 0.7546284198760986, + "learning_rate": 1.7573698682456176e-05, + "loss": 0.3158, + "step": 5137 + }, + { + "epoch": 0.4840206306964038, + "grad_norm": 0.7096731066703796, + "learning_rate": 1.7572712575243454e-05, + "loss": 0.3411, + "step": 5138 + }, + { + "epoch": 0.4841148347895716, + "grad_norm": 0.8472162485122681, + "learning_rate": 1.7571726295361172e-05, + "loss": 0.2935, + "step": 5139 + }, + { + "epoch": 0.4842090388827395, + "grad_norm": 0.8366554379463196, + "learning_rate": 1.7570739842831824e-05, + "loss": 0.3189, + "step": 5140 + }, + { + "epoch": 0.4843032429759073, + "grad_norm": 0.8048580884933472, + "learning_rate": 1.7569753217677893e-05, + "loss": 0.2775, + "step": 5141 + }, + { + "epoch": 0.48439744706907517, + "grad_norm": 0.7406998872756958, + "learning_rate": 1.7568766419921887e-05, + "loss": 0.3351, + "step": 5142 + }, + { + "epoch": 0.484491651162243, + "grad_norm": 0.7719348073005676, + "learning_rate": 1.75677794495863e-05, + "loss": 0.3258, + "step": 5143 + }, + { + "epoch": 0.48458585525541087, + "grad_norm": 0.8818472027778625, + "learning_rate": 1.7566792306693636e-05, + "loss": 0.3767, + "step": 5144 + }, + { + "epoch": 0.4846800593485787, + "grad_norm": 0.9801118969917297, + "learning_rate": 1.7565804991266407e-05, + "loss": 0.3647, + "step": 5145 + }, + { + "epoch": 0.48477426344174657, + "grad_norm": 0.7627350091934204, + "learning_rate": 1.7564817503327125e-05, + "loss": 0.3265, + "step": 5146 + }, + { + "epoch": 0.4848684675349144, + "grad_norm": 0.7406090497970581, + "learning_rate": 1.75638298428983e-05, + "loss": 0.3264, + "step": 5147 + }, + { + "epoch": 0.48496267162808226, + "grad_norm": 0.8357557654380798, + "learning_rate": 1.756284201000246e-05, + "loss": 0.3622, + "step": 5148 + }, + { + "epoch": 0.4850568757212501, + "grad_norm": 0.6987957954406738, + "learning_rate": 1.7561854004662126e-05, + "loss": 0.2845, + "step": 5149 + }, + { + "epoch": 0.48515107981441796, + "grad_norm": 0.8057776689529419, + "learning_rate": 1.7560865826899825e-05, + "loss": 0.383, + "step": 5150 + }, + { + "epoch": 0.4852452839075858, + "grad_norm": 0.7629268169403076, + "learning_rate": 1.755987747673809e-05, + "loss": 0.3477, + "step": 5151 + }, + { + "epoch": 0.48533948800075366, + "grad_norm": 0.7710081934928894, + "learning_rate": 1.755888895419946e-05, + "loss": 0.3138, + "step": 5152 + }, + { + "epoch": 0.48543369209392145, + "grad_norm": 0.7008806467056274, + "learning_rate": 1.755790025930647e-05, + "loss": 0.3631, + "step": 5153 + }, + { + "epoch": 0.4855278961870893, + "grad_norm": 0.8604103326797485, + "learning_rate": 1.7556911392081663e-05, + "loss": 0.3868, + "step": 5154 + }, + { + "epoch": 0.48562210028025715, + "grad_norm": 0.7947174906730652, + "learning_rate": 1.755592235254759e-05, + "loss": 0.3356, + "step": 5155 + }, + { + "epoch": 0.485716304373425, + "grad_norm": 0.840647280216217, + "learning_rate": 1.7554933140726803e-05, + "loss": 0.3162, + "step": 5156 + }, + { + "epoch": 0.48581050846659285, + "grad_norm": 0.8356987237930298, + "learning_rate": 1.755394375664186e-05, + "loss": 0.3557, + "step": 5157 + }, + { + "epoch": 0.4859047125597607, + "grad_norm": 0.893537163734436, + "learning_rate": 1.7552954200315313e-05, + "loss": 0.3298, + "step": 5158 + }, + { + "epoch": 0.48599891665292855, + "grad_norm": 0.8929669857025146, + "learning_rate": 1.755196447176973e-05, + "loss": 0.3518, + "step": 5159 + }, + { + "epoch": 0.4860931207460964, + "grad_norm": 1.0336229801177979, + "learning_rate": 1.7550974571027675e-05, + "loss": 0.3418, + "step": 5160 + }, + { + "epoch": 0.48618732483926425, + "grad_norm": 0.7523311376571655, + "learning_rate": 1.7549984498111724e-05, + "loss": 0.3341, + "step": 5161 + }, + { + "epoch": 0.4862815289324321, + "grad_norm": 0.6885283589363098, + "learning_rate": 1.7548994253044453e-05, + "loss": 0.3446, + "step": 5162 + }, + { + "epoch": 0.48637573302559994, + "grad_norm": 0.8484752178192139, + "learning_rate": 1.7548003835848436e-05, + "loss": 0.3315, + "step": 5163 + }, + { + "epoch": 0.4864699371187678, + "grad_norm": 0.8555371761322021, + "learning_rate": 1.7547013246546258e-05, + "loss": 0.3676, + "step": 5164 + }, + { + "epoch": 0.48656414121193564, + "grad_norm": 0.6904305815696716, + "learning_rate": 1.7546022485160507e-05, + "loss": 0.2942, + "step": 5165 + }, + { + "epoch": 0.4866583453051035, + "grad_norm": 0.8539214730262756, + "learning_rate": 1.7545031551713772e-05, + "loss": 0.3705, + "step": 5166 + }, + { + "epoch": 0.48675254939827134, + "grad_norm": 0.7262186408042908, + "learning_rate": 1.7544040446228652e-05, + "loss": 0.3406, + "step": 5167 + }, + { + "epoch": 0.4868467534914392, + "grad_norm": 0.7756384015083313, + "learning_rate": 1.7543049168727742e-05, + "loss": 0.3184, + "step": 5168 + }, + { + "epoch": 0.48694095758460704, + "grad_norm": 0.817236065864563, + "learning_rate": 1.754205771923364e-05, + "loss": 0.34, + "step": 5169 + }, + { + "epoch": 0.4870351616777749, + "grad_norm": 0.7116366624832153, + "learning_rate": 1.7541066097768965e-05, + "loss": 0.2996, + "step": 5170 + }, + { + "epoch": 0.48712936577094273, + "grad_norm": 0.7133092880249023, + "learning_rate": 1.7540074304356316e-05, + "loss": 0.3175, + "step": 5171 + }, + { + "epoch": 0.4872235698641106, + "grad_norm": 0.7212796807289124, + "learning_rate": 1.7539082339018314e-05, + "loss": 0.3484, + "step": 5172 + }, + { + "epoch": 0.48731777395727843, + "grad_norm": 0.7220133543014526, + "learning_rate": 1.7538090201777573e-05, + "loss": 0.3472, + "step": 5173 + }, + { + "epoch": 0.4874119780504463, + "grad_norm": 0.9291368126869202, + "learning_rate": 1.7537097892656724e-05, + "loss": 0.3562, + "step": 5174 + }, + { + "epoch": 0.48750618214361413, + "grad_norm": 0.7621473670005798, + "learning_rate": 1.753610541167838e-05, + "loss": 0.2822, + "step": 5175 + }, + { + "epoch": 0.487600386236782, + "grad_norm": 0.8579990267753601, + "learning_rate": 1.753511275886518e-05, + "loss": 0.3662, + "step": 5176 + }, + { + "epoch": 0.48769459032994983, + "grad_norm": 0.8141975998878479, + "learning_rate": 1.7534119934239754e-05, + "loss": 0.3588, + "step": 5177 + }, + { + "epoch": 0.4877887944231177, + "grad_norm": 0.7674275636672974, + "learning_rate": 1.7533126937824746e-05, + "loss": 0.3731, + "step": 5178 + }, + { + "epoch": 0.4878829985162855, + "grad_norm": 0.8437183499336243, + "learning_rate": 1.753213376964279e-05, + "loss": 0.333, + "step": 5179 + }, + { + "epoch": 0.4879772026094534, + "grad_norm": 0.7552438378334045, + "learning_rate": 1.7531140429716536e-05, + "loss": 0.3402, + "step": 5180 + }, + { + "epoch": 0.4880714067026212, + "grad_norm": 0.7128962278366089, + "learning_rate": 1.7530146918068633e-05, + "loss": 0.315, + "step": 5181 + }, + { + "epoch": 0.4881656107957891, + "grad_norm": 0.7599475979804993, + "learning_rate": 1.7529153234721733e-05, + "loss": 0.3402, + "step": 5182 + }, + { + "epoch": 0.4882598148889569, + "grad_norm": 0.916378378868103, + "learning_rate": 1.75281593796985e-05, + "loss": 0.383, + "step": 5183 + }, + { + "epoch": 0.48835401898212477, + "grad_norm": 0.6470703482627869, + "learning_rate": 1.7527165353021585e-05, + "loss": 0.2833, + "step": 5184 + }, + { + "epoch": 0.4884482230752926, + "grad_norm": 0.7961269021034241, + "learning_rate": 1.7526171154713664e-05, + "loss": 0.3685, + "step": 5185 + }, + { + "epoch": 0.48854242716846047, + "grad_norm": 0.9246634244918823, + "learning_rate": 1.75251767847974e-05, + "loss": 0.4044, + "step": 5186 + }, + { + "epoch": 0.4886366312616283, + "grad_norm": 0.8082994222640991, + "learning_rate": 1.7524182243295464e-05, + "loss": 0.382, + "step": 5187 + }, + { + "epoch": 0.48873083535479617, + "grad_norm": 0.7272515296936035, + "learning_rate": 1.7523187530230543e-05, + "loss": 0.3306, + "step": 5188 + }, + { + "epoch": 0.488825039447964, + "grad_norm": 0.7833527326583862, + "learning_rate": 1.7522192645625305e-05, + "loss": 0.3351, + "step": 5189 + }, + { + "epoch": 0.48891924354113186, + "grad_norm": 0.8743553161621094, + "learning_rate": 1.7521197589502442e-05, + "loss": 0.3526, + "step": 5190 + }, + { + "epoch": 0.4890134476342997, + "grad_norm": 0.720193088054657, + "learning_rate": 1.7520202361884643e-05, + "loss": 0.3225, + "step": 5191 + }, + { + "epoch": 0.48910765172746756, + "grad_norm": 0.6840072870254517, + "learning_rate": 1.7519206962794605e-05, + "loss": 0.2961, + "step": 5192 + }, + { + "epoch": 0.4892018558206354, + "grad_norm": 0.7914387583732605, + "learning_rate": 1.7518211392255013e-05, + "loss": 0.3699, + "step": 5193 + }, + { + "epoch": 0.48929605991380326, + "grad_norm": 0.6713058352470398, + "learning_rate": 1.7517215650288576e-05, + "loss": 0.2956, + "step": 5194 + }, + { + "epoch": 0.4893902640069711, + "grad_norm": 0.8198142647743225, + "learning_rate": 1.7516219736918e-05, + "loss": 0.374, + "step": 5195 + }, + { + "epoch": 0.48948446810013896, + "grad_norm": 0.869257926940918, + "learning_rate": 1.7515223652165985e-05, + "loss": 0.3474, + "step": 5196 + }, + { + "epoch": 0.4895786721933068, + "grad_norm": 0.791228711605072, + "learning_rate": 1.7514227396055252e-05, + "loss": 0.3528, + "step": 5197 + }, + { + "epoch": 0.48967287628647466, + "grad_norm": 0.7481163144111633, + "learning_rate": 1.7513230968608514e-05, + "loss": 0.3095, + "step": 5198 + }, + { + "epoch": 0.4897670803796425, + "grad_norm": 0.9064458608627319, + "learning_rate": 1.7512234369848484e-05, + "loss": 0.3704, + "step": 5199 + }, + { + "epoch": 0.48986128447281035, + "grad_norm": 0.8498217463493347, + "learning_rate": 1.7511237599797898e-05, + "loss": 0.3492, + "step": 5200 + }, + { + "epoch": 0.4899554885659782, + "grad_norm": 0.8195668458938599, + "learning_rate": 1.751024065847948e-05, + "loss": 0.3572, + "step": 5201 + }, + { + "epoch": 0.49004969265914605, + "grad_norm": 0.7476743459701538, + "learning_rate": 1.7509243545915953e-05, + "loss": 0.332, + "step": 5202 + }, + { + "epoch": 0.4901438967523139, + "grad_norm": 0.8016562461853027, + "learning_rate": 1.7508246262130065e-05, + "loss": 0.3863, + "step": 5203 + }, + { + "epoch": 0.49023810084548175, + "grad_norm": 0.6951786875724792, + "learning_rate": 1.750724880714455e-05, + "loss": 0.327, + "step": 5204 + }, + { + "epoch": 0.4903323049386496, + "grad_norm": 0.7730926275253296, + "learning_rate": 1.7506251180982153e-05, + "loss": 0.3423, + "step": 5205 + }, + { + "epoch": 0.49042650903181745, + "grad_norm": 0.6924357414245605, + "learning_rate": 1.7505253383665618e-05, + "loss": 0.3064, + "step": 5206 + }, + { + "epoch": 0.4905207131249853, + "grad_norm": 0.861555278301239, + "learning_rate": 1.75042554152177e-05, + "loss": 0.3739, + "step": 5207 + }, + { + "epoch": 0.49061491721815315, + "grad_norm": 0.8362349271774292, + "learning_rate": 1.750325727566115e-05, + "loss": 0.3512, + "step": 5208 + }, + { + "epoch": 0.490709121311321, + "grad_norm": 0.7900026440620422, + "learning_rate": 1.7502258965018737e-05, + "loss": 0.3095, + "step": 5209 + }, + { + "epoch": 0.49080332540448884, + "grad_norm": 0.8224048614501953, + "learning_rate": 1.750126048331321e-05, + "loss": 0.3615, + "step": 5210 + }, + { + "epoch": 0.4908975294976567, + "grad_norm": 0.9498628377914429, + "learning_rate": 1.7500261830567348e-05, + "loss": 0.3035, + "step": 5211 + }, + { + "epoch": 0.49099173359082454, + "grad_norm": 0.9057954549789429, + "learning_rate": 1.7499263006803913e-05, + "loss": 0.3277, + "step": 5212 + }, + { + "epoch": 0.4910859376839924, + "grad_norm": 0.8104428648948669, + "learning_rate": 1.7498264012045686e-05, + "loss": 0.3926, + "step": 5213 + }, + { + "epoch": 0.49118014177716024, + "grad_norm": 0.8333841562271118, + "learning_rate": 1.7497264846315443e-05, + "loss": 0.3278, + "step": 5214 + }, + { + "epoch": 0.4912743458703281, + "grad_norm": 0.8042159080505371, + "learning_rate": 1.7496265509635968e-05, + "loss": 0.3766, + "step": 5215 + }, + { + "epoch": 0.49136854996349594, + "grad_norm": 0.725310742855072, + "learning_rate": 1.7495266002030045e-05, + "loss": 0.3361, + "step": 5216 + }, + { + "epoch": 0.4914627540566638, + "grad_norm": 0.7242028117179871, + "learning_rate": 1.7494266323520466e-05, + "loss": 0.3016, + "step": 5217 + }, + { + "epoch": 0.49155695814983164, + "grad_norm": 0.8324516415596008, + "learning_rate": 1.7493266474130024e-05, + "loss": 0.3514, + "step": 5218 + }, + { + "epoch": 0.4916511622429995, + "grad_norm": 0.7419780492782593, + "learning_rate": 1.7492266453881516e-05, + "loss": 0.364, + "step": 5219 + }, + { + "epoch": 0.49174536633616733, + "grad_norm": 0.8550251126289368, + "learning_rate": 1.749126626279775e-05, + "loss": 0.3315, + "step": 5220 + }, + { + "epoch": 0.4918395704293352, + "grad_norm": 0.767841637134552, + "learning_rate": 1.749026590090152e-05, + "loss": 0.351, + "step": 5221 + }, + { + "epoch": 0.491933774522503, + "grad_norm": 0.7968747019767761, + "learning_rate": 1.748926536821565e-05, + "loss": 0.3368, + "step": 5222 + }, + { + "epoch": 0.4920279786156708, + "grad_norm": 0.9009972810745239, + "learning_rate": 1.7488264664762948e-05, + "loss": 0.3702, + "step": 5223 + }, + { + "epoch": 0.4921221827088387, + "grad_norm": 0.7290366291999817, + "learning_rate": 1.748726379056623e-05, + "loss": 0.3331, + "step": 5224 + }, + { + "epoch": 0.4922163868020065, + "grad_norm": 0.9687650203704834, + "learning_rate": 1.7486262745648316e-05, + "loss": 0.3235, + "step": 5225 + }, + { + "epoch": 0.49231059089517437, + "grad_norm": 0.7802406549453735, + "learning_rate": 1.7485261530032036e-05, + "loss": 0.3025, + "step": 5226 + }, + { + "epoch": 0.4924047949883422, + "grad_norm": 0.8119762539863586, + "learning_rate": 1.7484260143740216e-05, + "loss": 0.3105, + "step": 5227 + }, + { + "epoch": 0.49249899908151007, + "grad_norm": 0.8451618552207947, + "learning_rate": 1.7483258586795692e-05, + "loss": 0.3509, + "step": 5228 + }, + { + "epoch": 0.4925932031746779, + "grad_norm": 0.7818936109542847, + "learning_rate": 1.7482256859221297e-05, + "loss": 0.3317, + "step": 5229 + }, + { + "epoch": 0.49268740726784577, + "grad_norm": 0.8083449602127075, + "learning_rate": 1.7481254961039874e-05, + "loss": 0.3321, + "step": 5230 + }, + { + "epoch": 0.4927816113610136, + "grad_norm": 0.7966959476470947, + "learning_rate": 1.748025289227427e-05, + "loss": 0.3557, + "step": 5231 + }, + { + "epoch": 0.49287581545418147, + "grad_norm": 0.6925898790359497, + "learning_rate": 1.7479250652947325e-05, + "loss": 0.2877, + "step": 5232 + }, + { + "epoch": 0.4929700195473493, + "grad_norm": 0.8530373573303223, + "learning_rate": 1.7478248243081904e-05, + "loss": 0.367, + "step": 5233 + }, + { + "epoch": 0.49306422364051716, + "grad_norm": 0.7606818675994873, + "learning_rate": 1.7477245662700857e-05, + "loss": 0.3346, + "step": 5234 + }, + { + "epoch": 0.493158427733685, + "grad_norm": 0.7215161919593811, + "learning_rate": 1.7476242911827043e-05, + "loss": 0.3004, + "step": 5235 + }, + { + "epoch": 0.49325263182685286, + "grad_norm": 0.8876181840896606, + "learning_rate": 1.7475239990483332e-05, + "loss": 0.3717, + "step": 5236 + }, + { + "epoch": 0.4933468359200207, + "grad_norm": 0.8085825443267822, + "learning_rate": 1.7474236898692587e-05, + "loss": 0.3312, + "step": 5237 + }, + { + "epoch": 0.49344104001318856, + "grad_norm": 0.879197359085083, + "learning_rate": 1.747323363647768e-05, + "loss": 0.3265, + "step": 5238 + }, + { + "epoch": 0.4935352441063564, + "grad_norm": 0.7793401479721069, + "learning_rate": 1.747223020386149e-05, + "loss": 0.3641, + "step": 5239 + }, + { + "epoch": 0.49362944819952426, + "grad_norm": 0.7720771431922913, + "learning_rate": 1.7471226600866895e-05, + "loss": 0.3185, + "step": 5240 + }, + { + "epoch": 0.4937236522926921, + "grad_norm": 0.6867986917495728, + "learning_rate": 1.747022282751678e-05, + "loss": 0.3019, + "step": 5241 + }, + { + "epoch": 0.49381785638585995, + "grad_norm": 0.7740907073020935, + "learning_rate": 1.7469218883834033e-05, + "loss": 0.3386, + "step": 5242 + }, + { + "epoch": 0.4939120604790278, + "grad_norm": 0.9793553352355957, + "learning_rate": 1.7468214769841542e-05, + "loss": 0.3707, + "step": 5243 + }, + { + "epoch": 0.49400626457219565, + "grad_norm": 0.9979997277259827, + "learning_rate": 1.74672104855622e-05, + "loss": 0.3702, + "step": 5244 + }, + { + "epoch": 0.4941004686653635, + "grad_norm": 0.8075796365737915, + "learning_rate": 1.7466206031018918e-05, + "loss": 0.3085, + "step": 5245 + }, + { + "epoch": 0.49419467275853135, + "grad_norm": 0.865083634853363, + "learning_rate": 1.7465201406234585e-05, + "loss": 0.3501, + "step": 5246 + }, + { + "epoch": 0.4942888768516992, + "grad_norm": 0.8638558387756348, + "learning_rate": 1.746419661123212e-05, + "loss": 0.3572, + "step": 5247 + }, + { + "epoch": 0.49438308094486705, + "grad_norm": 0.684909999370575, + "learning_rate": 1.746319164603443e-05, + "loss": 0.3082, + "step": 5248 + }, + { + "epoch": 0.4944772850380349, + "grad_norm": 0.6876487135887146, + "learning_rate": 1.7462186510664426e-05, + "loss": 0.3275, + "step": 5249 + }, + { + "epoch": 0.49457148913120275, + "grad_norm": 0.6835941672325134, + "learning_rate": 1.7461181205145027e-05, + "loss": 0.2478, + "step": 5250 + }, + { + "epoch": 0.4946656932243706, + "grad_norm": 0.8077230453491211, + "learning_rate": 1.7460175729499158e-05, + "loss": 0.3566, + "step": 5251 + }, + { + "epoch": 0.49475989731753844, + "grad_norm": 0.7662020921707153, + "learning_rate": 1.7459170083749747e-05, + "loss": 0.368, + "step": 5252 + }, + { + "epoch": 0.4948541014107063, + "grad_norm": 0.7811310887336731, + "learning_rate": 1.7458164267919723e-05, + "loss": 0.3218, + "step": 5253 + }, + { + "epoch": 0.49494830550387414, + "grad_norm": 0.8241111636161804, + "learning_rate": 1.7457158282032018e-05, + "loss": 0.3658, + "step": 5254 + }, + { + "epoch": 0.495042509597042, + "grad_norm": 0.695144772529602, + "learning_rate": 1.745615212610957e-05, + "loss": 0.3463, + "step": 5255 + }, + { + "epoch": 0.49513671369020984, + "grad_norm": 0.8194564580917358, + "learning_rate": 1.7455145800175323e-05, + "loss": 0.3482, + "step": 5256 + }, + { + "epoch": 0.4952309177833777, + "grad_norm": 0.6636349558830261, + "learning_rate": 1.7454139304252225e-05, + "loss": 0.3177, + "step": 5257 + }, + { + "epoch": 0.49532512187654554, + "grad_norm": 0.8018602728843689, + "learning_rate": 1.745313263836322e-05, + "loss": 0.3435, + "step": 5258 + }, + { + "epoch": 0.4954193259697134, + "grad_norm": 0.648948609828949, + "learning_rate": 1.7452125802531266e-05, + "loss": 0.2837, + "step": 5259 + }, + { + "epoch": 0.49551353006288124, + "grad_norm": 0.7257091999053955, + "learning_rate": 1.7451118796779315e-05, + "loss": 0.3613, + "step": 5260 + }, + { + "epoch": 0.4956077341560491, + "grad_norm": 0.7232349514961243, + "learning_rate": 1.7450111621130335e-05, + "loss": 0.2871, + "step": 5261 + }, + { + "epoch": 0.49570193824921693, + "grad_norm": 0.8945488333702087, + "learning_rate": 1.7449104275607292e-05, + "loss": 0.3041, + "step": 5262 + }, + { + "epoch": 0.4957961423423848, + "grad_norm": 0.7877565026283264, + "learning_rate": 1.7448096760233143e-05, + "loss": 0.3282, + "step": 5263 + }, + { + "epoch": 0.49589034643555263, + "grad_norm": 1.1505717039108276, + "learning_rate": 1.7447089075030877e-05, + "loss": 0.3351, + "step": 5264 + }, + { + "epoch": 0.4959845505287205, + "grad_norm": 0.8410128951072693, + "learning_rate": 1.744608122002346e-05, + "loss": 0.3485, + "step": 5265 + }, + { + "epoch": 0.49607875462188833, + "grad_norm": 0.717402458190918, + "learning_rate": 1.7445073195233874e-05, + "loss": 0.3289, + "step": 5266 + }, + { + "epoch": 0.4961729587150562, + "grad_norm": 0.6788548827171326, + "learning_rate": 1.7444065000685103e-05, + "loss": 0.353, + "step": 5267 + }, + { + "epoch": 0.496267162808224, + "grad_norm": 0.6685397028923035, + "learning_rate": 1.7443056636400145e-05, + "loss": 0.3076, + "step": 5268 + }, + { + "epoch": 0.4963613669013919, + "grad_norm": 0.7368515729904175, + "learning_rate": 1.7442048102401982e-05, + "loss": 0.2994, + "step": 5269 + }, + { + "epoch": 0.4964555709945597, + "grad_norm": 0.7723667025566101, + "learning_rate": 1.744103939871361e-05, + "loss": 0.3074, + "step": 5270 + }, + { + "epoch": 0.4965497750877276, + "grad_norm": 0.885190486907959, + "learning_rate": 1.744003052535803e-05, + "loss": 0.3465, + "step": 5271 + }, + { + "epoch": 0.4966439791808954, + "grad_norm": 0.7844976782798767, + "learning_rate": 1.7439021482358254e-05, + "loss": 0.38, + "step": 5272 + }, + { + "epoch": 0.49673818327406327, + "grad_norm": 0.7608745098114014, + "learning_rate": 1.743801226973728e-05, + "loss": 0.3054, + "step": 5273 + }, + { + "epoch": 0.4968323873672311, + "grad_norm": 0.9637671113014221, + "learning_rate": 1.7437002887518125e-05, + "loss": 0.3801, + "step": 5274 + }, + { + "epoch": 0.49692659146039897, + "grad_norm": 0.8396672010421753, + "learning_rate": 1.74359933357238e-05, + "loss": 0.3398, + "step": 5275 + }, + { + "epoch": 0.4970207955535668, + "grad_norm": 0.7731151580810547, + "learning_rate": 1.743498361437733e-05, + "loss": 0.28, + "step": 5276 + }, + { + "epoch": 0.49711499964673467, + "grad_norm": 0.7089502215385437, + "learning_rate": 1.7433973723501732e-05, + "loss": 0.3192, + "step": 5277 + }, + { + "epoch": 0.4972092037399025, + "grad_norm": 0.7073197364807129, + "learning_rate": 1.7432963663120037e-05, + "loss": 0.3351, + "step": 5278 + }, + { + "epoch": 0.49730340783307037, + "grad_norm": 0.7932494282722473, + "learning_rate": 1.743195343325528e-05, + "loss": 0.3659, + "step": 5279 + }, + { + "epoch": 0.4973976119262382, + "grad_norm": 0.7352001070976257, + "learning_rate": 1.7430943033930483e-05, + "loss": 0.3492, + "step": 5280 + }, + { + "epoch": 0.49749181601940606, + "grad_norm": 0.8383285403251648, + "learning_rate": 1.7429932465168693e-05, + "loss": 0.3538, + "step": 5281 + }, + { + "epoch": 0.4975860201125739, + "grad_norm": 0.7251850962638855, + "learning_rate": 1.742892172699296e-05, + "loss": 0.3453, + "step": 5282 + }, + { + "epoch": 0.49768022420574176, + "grad_norm": 0.8373103737831116, + "learning_rate": 1.7427910819426318e-05, + "loss": 0.3231, + "step": 5283 + }, + { + "epoch": 0.4977744282989096, + "grad_norm": 0.7442533373832703, + "learning_rate": 1.7426899742491824e-05, + "loss": 0.31, + "step": 5284 + }, + { + "epoch": 0.49786863239207746, + "grad_norm": 0.7947465777397156, + "learning_rate": 1.7425888496212527e-05, + "loss": 0.3506, + "step": 5285 + }, + { + "epoch": 0.4979628364852453, + "grad_norm": 0.9055950045585632, + "learning_rate": 1.742487708061149e-05, + "loss": 0.3811, + "step": 5286 + }, + { + "epoch": 0.49805704057841316, + "grad_norm": 0.7091962695121765, + "learning_rate": 1.742386549571177e-05, + "loss": 0.327, + "step": 5287 + }, + { + "epoch": 0.498151244671581, + "grad_norm": 0.803471565246582, + "learning_rate": 1.7422853741536437e-05, + "loss": 0.3663, + "step": 5288 + }, + { + "epoch": 0.49824544876474886, + "grad_norm": 0.809049129486084, + "learning_rate": 1.742184181810856e-05, + "loss": 0.3592, + "step": 5289 + }, + { + "epoch": 0.4983396528579167, + "grad_norm": 0.7729027271270752, + "learning_rate": 1.742082972545121e-05, + "loss": 0.3076, + "step": 5290 + }, + { + "epoch": 0.4984338569510845, + "grad_norm": 0.8046419620513916, + "learning_rate": 1.7419817463587466e-05, + "loss": 0.3497, + "step": 5291 + }, + { + "epoch": 0.49852806104425235, + "grad_norm": 0.7646002173423767, + "learning_rate": 1.741880503254041e-05, + "loss": 0.3245, + "step": 5292 + }, + { + "epoch": 0.4986222651374202, + "grad_norm": 0.7153975963592529, + "learning_rate": 1.7417792432333124e-05, + "loss": 0.314, + "step": 5293 + }, + { + "epoch": 0.49871646923058804, + "grad_norm": 0.7249170541763306, + "learning_rate": 1.7416779662988702e-05, + "loss": 0.3337, + "step": 5294 + }, + { + "epoch": 0.4988106733237559, + "grad_norm": 1.015845775604248, + "learning_rate": 1.7415766724530232e-05, + "loss": 0.3111, + "step": 5295 + }, + { + "epoch": 0.49890487741692374, + "grad_norm": 0.8393731713294983, + "learning_rate": 1.741475361698081e-05, + "loss": 0.3366, + "step": 5296 + }, + { + "epoch": 0.4989990815100916, + "grad_norm": 0.9029054641723633, + "learning_rate": 1.7413740340363542e-05, + "loss": 0.3352, + "step": 5297 + }, + { + "epoch": 0.49909328560325944, + "grad_norm": 0.7641615867614746, + "learning_rate": 1.7412726894701527e-05, + "loss": 0.3471, + "step": 5298 + }, + { + "epoch": 0.4991874896964273, + "grad_norm": 0.8021811842918396, + "learning_rate": 1.741171328001787e-05, + "loss": 0.3387, + "step": 5299 + }, + { + "epoch": 0.49928169378959514, + "grad_norm": 0.9175771474838257, + "learning_rate": 1.7410699496335693e-05, + "loss": 0.3636, + "step": 5300 + }, + { + "epoch": 0.499375897882763, + "grad_norm": 0.8105636239051819, + "learning_rate": 1.7409685543678105e-05, + "loss": 0.3546, + "step": 5301 + }, + { + "epoch": 0.49947010197593084, + "grad_norm": 0.8636082410812378, + "learning_rate": 1.740867142206823e-05, + "loss": 0.3379, + "step": 5302 + }, + { + "epoch": 0.4995643060690987, + "grad_norm": 0.7709434628486633, + "learning_rate": 1.740765713152919e-05, + "loss": 0.3692, + "step": 5303 + }, + { + "epoch": 0.49965851016226653, + "grad_norm": 0.6728577017784119, + "learning_rate": 1.7406642672084105e-05, + "loss": 0.3277, + "step": 5304 + }, + { + "epoch": 0.4997527142554344, + "grad_norm": 0.7524570822715759, + "learning_rate": 1.7405628043756114e-05, + "loss": 0.3128, + "step": 5305 + }, + { + "epoch": 0.49984691834860223, + "grad_norm": 0.8218162059783936, + "learning_rate": 1.740461324656835e-05, + "loss": 0.3535, + "step": 5306 + }, + { + "epoch": 0.4999411224417701, + "grad_norm": 0.7679923176765442, + "learning_rate": 1.7403598280543955e-05, + "loss": 0.3312, + "step": 5307 + }, + { + "epoch": 0.500035326534938, + "grad_norm": 0.8444740176200867, + "learning_rate": 1.740258314570607e-05, + "loss": 0.3357, + "step": 5308 + }, + { + "epoch": 0.5001295306281058, + "grad_norm": 0.7953502535820007, + "learning_rate": 1.740156784207784e-05, + "loss": 0.3661, + "step": 5309 + }, + { + "epoch": 0.5002237347212737, + "grad_norm": 0.8228588104248047, + "learning_rate": 1.7400552369682417e-05, + "loss": 0.3662, + "step": 5310 + }, + { + "epoch": 0.5003179388144415, + "grad_norm": 0.9413067698478699, + "learning_rate": 1.7399536728542955e-05, + "loss": 0.3763, + "step": 5311 + }, + { + "epoch": 0.5004121429076094, + "grad_norm": 0.9656098484992981, + "learning_rate": 1.739852091868261e-05, + "loss": 0.3424, + "step": 5312 + }, + { + "epoch": 0.5005063470007772, + "grad_norm": 0.8443370461463928, + "learning_rate": 1.7397504940124546e-05, + "loss": 0.3378, + "step": 5313 + }, + { + "epoch": 0.5006005510939451, + "grad_norm": 0.7331113219261169, + "learning_rate": 1.7396488792891932e-05, + "loss": 0.3346, + "step": 5314 + }, + { + "epoch": 0.5006947551871129, + "grad_norm": 0.7605288624763489, + "learning_rate": 1.7395472477007932e-05, + "loss": 0.314, + "step": 5315 + }, + { + "epoch": 0.5007889592802808, + "grad_norm": 0.8171616792678833, + "learning_rate": 1.7394455992495722e-05, + "loss": 0.389, + "step": 5316 + }, + { + "epoch": 0.5008831633734486, + "grad_norm": 0.7240248918533325, + "learning_rate": 1.739343933937848e-05, + "loss": 0.2431, + "step": 5317 + }, + { + "epoch": 0.5009773674666165, + "grad_norm": 0.7218488454818726, + "learning_rate": 1.739242251767939e-05, + "loss": 0.3376, + "step": 5318 + }, + { + "epoch": 0.5010715715597843, + "grad_norm": 0.7530635595321655, + "learning_rate": 1.739140552742163e-05, + "loss": 0.3486, + "step": 5319 + }, + { + "epoch": 0.5011657756529522, + "grad_norm": 0.704440712928772, + "learning_rate": 1.7390388368628396e-05, + "loss": 0.3059, + "step": 5320 + }, + { + "epoch": 0.50125997974612, + "grad_norm": 0.8903000354766846, + "learning_rate": 1.7389371041322872e-05, + "loss": 0.3019, + "step": 5321 + }, + { + "epoch": 0.5013541838392879, + "grad_norm": 0.9734460115432739, + "learning_rate": 1.7388353545528265e-05, + "loss": 0.373, + "step": 5322 + }, + { + "epoch": 0.5014483879324557, + "grad_norm": 0.6763521432876587, + "learning_rate": 1.7387335881267774e-05, + "loss": 0.3321, + "step": 5323 + }, + { + "epoch": 0.5015425920256236, + "grad_norm": 0.7226347923278809, + "learning_rate": 1.7386318048564596e-05, + "loss": 0.2939, + "step": 5324 + }, + { + "epoch": 0.5016367961187913, + "grad_norm": 0.8059448599815369, + "learning_rate": 1.7385300047441944e-05, + "loss": 0.3213, + "step": 5325 + }, + { + "epoch": 0.5017310002119592, + "grad_norm": 0.9217113852500916, + "learning_rate": 1.738428187792303e-05, + "loss": 0.4016, + "step": 5326 + }, + { + "epoch": 0.501825204305127, + "grad_norm": 0.7564734816551208, + "learning_rate": 1.738326354003107e-05, + "loss": 0.3242, + "step": 5327 + }, + { + "epoch": 0.5019194083982949, + "grad_norm": 0.9484173059463501, + "learning_rate": 1.7382245033789277e-05, + "loss": 0.3468, + "step": 5328 + }, + { + "epoch": 0.5020136124914627, + "grad_norm": 0.8150784373283386, + "learning_rate": 1.7381226359220886e-05, + "loss": 0.3703, + "step": 5329 + }, + { + "epoch": 0.5021078165846306, + "grad_norm": 0.878149688243866, + "learning_rate": 1.7380207516349115e-05, + "loss": 0.3925, + "step": 5330 + }, + { + "epoch": 0.5022020206777984, + "grad_norm": 0.7702428102493286, + "learning_rate": 1.73791885051972e-05, + "loss": 0.3257, + "step": 5331 + }, + { + "epoch": 0.5022962247709662, + "grad_norm": 0.799381673336029, + "learning_rate": 1.7378169325788378e-05, + "loss": 0.3442, + "step": 5332 + }, + { + "epoch": 0.5023904288641341, + "grad_norm": 0.7093852162361145, + "learning_rate": 1.7377149978145883e-05, + "loss": 0.3711, + "step": 5333 + }, + { + "epoch": 0.502484632957302, + "grad_norm": 0.8848658800125122, + "learning_rate": 1.737613046229296e-05, + "loss": 0.3682, + "step": 5334 + }, + { + "epoch": 0.5025788370504698, + "grad_norm": 0.7343833446502686, + "learning_rate": 1.7375110778252855e-05, + "loss": 0.3377, + "step": 5335 + }, + { + "epoch": 0.5026730411436376, + "grad_norm": 0.8974533677101135, + "learning_rate": 1.7374090926048815e-05, + "loss": 0.3093, + "step": 5336 + }, + { + "epoch": 0.5027672452368055, + "grad_norm": 0.6666314601898193, + "learning_rate": 1.7373070905704103e-05, + "loss": 0.2988, + "step": 5337 + }, + { + "epoch": 0.5028614493299733, + "grad_norm": 0.8002578616142273, + "learning_rate": 1.737205071724197e-05, + "loss": 0.3454, + "step": 5338 + }, + { + "epoch": 0.5029556534231412, + "grad_norm": 0.9683690071105957, + "learning_rate": 1.7371030360685676e-05, + "loss": 0.3133, + "step": 5339 + }, + { + "epoch": 0.503049857516309, + "grad_norm": 0.908248782157898, + "learning_rate": 1.7370009836058493e-05, + "loss": 0.3499, + "step": 5340 + }, + { + "epoch": 0.5031440616094769, + "grad_norm": 0.7834539413452148, + "learning_rate": 1.736898914338369e-05, + "loss": 0.2971, + "step": 5341 + }, + { + "epoch": 0.5032382657026447, + "grad_norm": 0.6673378944396973, + "learning_rate": 1.7367968282684537e-05, + "loss": 0.2834, + "step": 5342 + }, + { + "epoch": 0.5033324697958126, + "grad_norm": 0.7688459753990173, + "learning_rate": 1.7366947253984313e-05, + "loss": 0.3274, + "step": 5343 + }, + { + "epoch": 0.5034266738889804, + "grad_norm": 3.2844011783599854, + "learning_rate": 1.7365926057306292e-05, + "loss": 0.3038, + "step": 5344 + }, + { + "epoch": 0.5035208779821483, + "grad_norm": 0.7614874839782715, + "learning_rate": 1.736490469267377e-05, + "loss": 0.3166, + "step": 5345 + }, + { + "epoch": 0.5036150820753161, + "grad_norm": 0.8794525265693665, + "learning_rate": 1.7363883160110032e-05, + "loss": 0.3714, + "step": 5346 + }, + { + "epoch": 0.503709286168484, + "grad_norm": 0.7511628270149231, + "learning_rate": 1.736286145963837e-05, + "loss": 0.3322, + "step": 5347 + }, + { + "epoch": 0.5038034902616518, + "grad_norm": 0.7849157452583313, + "learning_rate": 1.7361839591282076e-05, + "loss": 0.337, + "step": 5348 + }, + { + "epoch": 0.5038976943548197, + "grad_norm": 0.6939377784729004, + "learning_rate": 1.736081755506446e-05, + "loss": 0.3224, + "step": 5349 + }, + { + "epoch": 0.5039918984479875, + "grad_norm": 0.8040423393249512, + "learning_rate": 1.7359795351008816e-05, + "loss": 0.3452, + "step": 5350 + }, + { + "epoch": 0.5040861025411554, + "grad_norm": 0.6809116005897522, + "learning_rate": 1.7358772979138453e-05, + "loss": 0.3081, + "step": 5351 + }, + { + "epoch": 0.5041803066343232, + "grad_norm": 0.7361429929733276, + "learning_rate": 1.735775043947669e-05, + "loss": 0.3203, + "step": 5352 + }, + { + "epoch": 0.5042745107274911, + "grad_norm": 0.7542217969894409, + "learning_rate": 1.7356727732046835e-05, + "loss": 0.3544, + "step": 5353 + }, + { + "epoch": 0.5043687148206589, + "grad_norm": 0.7274656891822815, + "learning_rate": 1.7355704856872212e-05, + "loss": 0.3476, + "step": 5354 + }, + { + "epoch": 0.5044629189138268, + "grad_norm": 0.861713707447052, + "learning_rate": 1.7354681813976145e-05, + "loss": 0.3313, + "step": 5355 + }, + { + "epoch": 0.5045571230069946, + "grad_norm": 0.8149729371070862, + "learning_rate": 1.7353658603381956e-05, + "loss": 0.2991, + "step": 5356 + }, + { + "epoch": 0.5046513271001625, + "grad_norm": 0.7789759039878845, + "learning_rate": 1.7352635225112978e-05, + "loss": 0.3453, + "step": 5357 + }, + { + "epoch": 0.5047455311933303, + "grad_norm": 1.1818147897720337, + "learning_rate": 1.7351611679192547e-05, + "loss": 0.2856, + "step": 5358 + }, + { + "epoch": 0.5048397352864982, + "grad_norm": 0.8158087134361267, + "learning_rate": 1.7350587965643998e-05, + "loss": 0.3409, + "step": 5359 + }, + { + "epoch": 0.504933939379666, + "grad_norm": 1.0676349401474, + "learning_rate": 1.7349564084490678e-05, + "loss": 0.3229, + "step": 5360 + }, + { + "epoch": 0.5050281434728339, + "grad_norm": 0.8740731477737427, + "learning_rate": 1.7348540035755928e-05, + "loss": 0.3742, + "step": 5361 + }, + { + "epoch": 0.5051223475660017, + "grad_norm": 0.6558460593223572, + "learning_rate": 1.73475158194631e-05, + "loss": 0.2965, + "step": 5362 + }, + { + "epoch": 0.5052165516591696, + "grad_norm": 0.7562674880027771, + "learning_rate": 1.734649143563555e-05, + "loss": 0.3686, + "step": 5363 + }, + { + "epoch": 0.5053107557523374, + "grad_norm": 0.7583739757537842, + "learning_rate": 1.7345466884296636e-05, + "loss": 0.3373, + "step": 5364 + }, + { + "epoch": 0.5054049598455053, + "grad_norm": 0.965645432472229, + "learning_rate": 1.7344442165469714e-05, + "loss": 0.3581, + "step": 5365 + }, + { + "epoch": 0.5054991639386731, + "grad_norm": 0.7194229960441589, + "learning_rate": 1.7343417279178153e-05, + "loss": 0.2868, + "step": 5366 + }, + { + "epoch": 0.505593368031841, + "grad_norm": 0.812960147857666, + "learning_rate": 1.734239222544532e-05, + "loss": 0.3705, + "step": 5367 + }, + { + "epoch": 0.5056875721250088, + "grad_norm": 0.7716863751411438, + "learning_rate": 1.734136700429459e-05, + "loss": 0.3119, + "step": 5368 + }, + { + "epoch": 0.5057817762181767, + "grad_norm": 0.8303197026252747, + "learning_rate": 1.734034161574934e-05, + "loss": 0.3224, + "step": 5369 + }, + { + "epoch": 0.5058759803113445, + "grad_norm": 0.7828728556632996, + "learning_rate": 1.7339316059832946e-05, + "loss": 0.3984, + "step": 5370 + }, + { + "epoch": 0.5059701844045124, + "grad_norm": 0.8528280258178711, + "learning_rate": 1.7338290336568798e-05, + "loss": 0.3455, + "step": 5371 + }, + { + "epoch": 0.5060643884976802, + "grad_norm": 0.8985176086425781, + "learning_rate": 1.733726444598028e-05, + "loss": 0.311, + "step": 5372 + }, + { + "epoch": 0.506158592590848, + "grad_norm": 0.6802164316177368, + "learning_rate": 1.7336238388090787e-05, + "loss": 0.29, + "step": 5373 + }, + { + "epoch": 0.5062527966840159, + "grad_norm": 0.8211531639099121, + "learning_rate": 1.733521216292371e-05, + "loss": 0.3426, + "step": 5374 + }, + { + "epoch": 0.5063470007771838, + "grad_norm": 0.8165231347084045, + "learning_rate": 1.7334185770502453e-05, + "loss": 0.3192, + "step": 5375 + }, + { + "epoch": 0.5064412048703516, + "grad_norm": 0.8941564559936523, + "learning_rate": 1.7333159210850417e-05, + "loss": 0.3064, + "step": 5376 + }, + { + "epoch": 0.5065354089635195, + "grad_norm": 0.8007015585899353, + "learning_rate": 1.7332132483991015e-05, + "loss": 0.3443, + "step": 5377 + }, + { + "epoch": 0.5066296130566873, + "grad_norm": 0.7286374568939209, + "learning_rate": 1.7331105589947648e-05, + "loss": 0.3131, + "step": 5378 + }, + { + "epoch": 0.5067238171498551, + "grad_norm": 0.6885233521461487, + "learning_rate": 1.7330078528743738e-05, + "loss": 0.3361, + "step": 5379 + }, + { + "epoch": 0.506818021243023, + "grad_norm": 0.7876170873641968, + "learning_rate": 1.73290513004027e-05, + "loss": 0.3917, + "step": 5380 + }, + { + "epoch": 0.5069122253361908, + "grad_norm": 0.6517916917800903, + "learning_rate": 1.7328023904947958e-05, + "loss": 0.3179, + "step": 5381 + }, + { + "epoch": 0.5070064294293587, + "grad_norm": 0.7782155871391296, + "learning_rate": 1.732699634240294e-05, + "loss": 0.2861, + "step": 5382 + }, + { + "epoch": 0.5071006335225265, + "grad_norm": 0.7896504998207092, + "learning_rate": 1.7325968612791074e-05, + "loss": 0.3452, + "step": 5383 + }, + { + "epoch": 0.5071948376156944, + "grad_norm": 0.890192985534668, + "learning_rate": 1.732494071613579e-05, + "loss": 0.3359, + "step": 5384 + }, + { + "epoch": 0.5072890417088622, + "grad_norm": 0.7129732966423035, + "learning_rate": 1.732391265246053e-05, + "loss": 0.3197, + "step": 5385 + }, + { + "epoch": 0.5073832458020301, + "grad_norm": 0.7338051795959473, + "learning_rate": 1.7322884421788736e-05, + "loss": 0.316, + "step": 5386 + }, + { + "epoch": 0.5074774498951979, + "grad_norm": 0.7098434567451477, + "learning_rate": 1.7321856024143855e-05, + "loss": 0.3262, + "step": 5387 + }, + { + "epoch": 0.5075716539883658, + "grad_norm": 0.7533916234970093, + "learning_rate": 1.732082745954933e-05, + "loss": 0.2947, + "step": 5388 + }, + { + "epoch": 0.5076658580815336, + "grad_norm": 1.1245689392089844, + "learning_rate": 1.7319798728028617e-05, + "loss": 0.3883, + "step": 5389 + }, + { + "epoch": 0.5077600621747015, + "grad_norm": 0.8866914510726929, + "learning_rate": 1.7318769829605176e-05, + "loss": 0.3529, + "step": 5390 + }, + { + "epoch": 0.5078542662678693, + "grad_norm": 0.7427348494529724, + "learning_rate": 1.731774076430246e-05, + "loss": 0.3152, + "step": 5391 + }, + { + "epoch": 0.5079484703610372, + "grad_norm": 0.8100770711898804, + "learning_rate": 1.731671153214394e-05, + "loss": 0.3355, + "step": 5392 + }, + { + "epoch": 0.508042674454205, + "grad_norm": 0.6134782433509827, + "learning_rate": 1.731568213315308e-05, + "loss": 0.2811, + "step": 5393 + }, + { + "epoch": 0.5081368785473729, + "grad_norm": 0.7467697262763977, + "learning_rate": 1.7314652567353355e-05, + "loss": 0.3613, + "step": 5394 + }, + { + "epoch": 0.5082310826405407, + "grad_norm": 0.8124426007270813, + "learning_rate": 1.731362283476824e-05, + "loss": 0.3457, + "step": 5395 + }, + { + "epoch": 0.5083252867337086, + "grad_norm": 0.9458872079849243, + "learning_rate": 1.731259293542121e-05, + "loss": 0.318, + "step": 5396 + }, + { + "epoch": 0.5084194908268764, + "grad_norm": 0.8066940903663635, + "learning_rate": 1.7311562869335753e-05, + "loss": 0.3041, + "step": 5397 + }, + { + "epoch": 0.5085136949200443, + "grad_norm": 0.8185582756996155, + "learning_rate": 1.7310532636535357e-05, + "loss": 0.407, + "step": 5398 + }, + { + "epoch": 0.5086078990132121, + "grad_norm": 0.823703944683075, + "learning_rate": 1.7309502237043508e-05, + "loss": 0.3189, + "step": 5399 + }, + { + "epoch": 0.50870210310638, + "grad_norm": 0.7584295868873596, + "learning_rate": 1.7308471670883707e-05, + "loss": 0.3335, + "step": 5400 + }, + { + "epoch": 0.5087963071995478, + "grad_norm": 0.7898347973823547, + "learning_rate": 1.7307440938079447e-05, + "loss": 0.3086, + "step": 5401 + }, + { + "epoch": 0.5088905112927157, + "grad_norm": 0.6594393849372864, + "learning_rate": 1.730641003865423e-05, + "loss": 0.3196, + "step": 5402 + }, + { + "epoch": 0.5089847153858835, + "grad_norm": 0.850841760635376, + "learning_rate": 1.730537897263157e-05, + "loss": 0.3599, + "step": 5403 + }, + { + "epoch": 0.5090789194790514, + "grad_norm": 0.6819915771484375, + "learning_rate": 1.7304347740034968e-05, + "loss": 0.3202, + "step": 5404 + }, + { + "epoch": 0.5091731235722192, + "grad_norm": 0.7515741586685181, + "learning_rate": 1.7303316340887943e-05, + "loss": 0.3524, + "step": 5405 + }, + { + "epoch": 0.5092673276653871, + "grad_norm": 0.8181531429290771, + "learning_rate": 1.730228477521401e-05, + "loss": 0.3905, + "step": 5406 + }, + { + "epoch": 0.5093615317585549, + "grad_norm": 0.7893922924995422, + "learning_rate": 1.730125304303669e-05, + "loss": 0.3236, + "step": 5407 + }, + { + "epoch": 0.5094557358517228, + "grad_norm": 0.698948085308075, + "learning_rate": 1.730022114437951e-05, + "loss": 0.3337, + "step": 5408 + }, + { + "epoch": 0.5095499399448906, + "grad_norm": 0.830158531665802, + "learning_rate": 1.7299189079266e-05, + "loss": 0.3207, + "step": 5409 + }, + { + "epoch": 0.5096441440380585, + "grad_norm": 0.8753250241279602, + "learning_rate": 1.7298156847719687e-05, + "loss": 0.3427, + "step": 5410 + }, + { + "epoch": 0.5097383481312263, + "grad_norm": 0.9487413167953491, + "learning_rate": 1.729712444976411e-05, + "loss": 0.3504, + "step": 5411 + }, + { + "epoch": 0.5098325522243942, + "grad_norm": 0.788690447807312, + "learning_rate": 1.7296091885422816e-05, + "loss": 0.3139, + "step": 5412 + }, + { + "epoch": 0.509926756317562, + "grad_norm": 0.9058326482772827, + "learning_rate": 1.7295059154719337e-05, + "loss": 0.327, + "step": 5413 + }, + { + "epoch": 0.5100209604107299, + "grad_norm": 0.810093104839325, + "learning_rate": 1.7294026257677233e-05, + "loss": 0.3691, + "step": 5414 + }, + { + "epoch": 0.5101151645038977, + "grad_norm": 0.7243669033050537, + "learning_rate": 1.7292993194320047e-05, + "loss": 0.3132, + "step": 5415 + }, + { + "epoch": 0.5102093685970656, + "grad_norm": 0.7211150527000427, + "learning_rate": 1.7291959964671337e-05, + "loss": 0.333, + "step": 5416 + }, + { + "epoch": 0.5103035726902334, + "grad_norm": 0.7501480579376221, + "learning_rate": 1.7290926568754664e-05, + "loss": 0.3353, + "step": 5417 + }, + { + "epoch": 0.5103977767834013, + "grad_norm": 0.7837489247322083, + "learning_rate": 1.728989300659359e-05, + "loss": 0.3502, + "step": 5418 + }, + { + "epoch": 0.5104919808765691, + "grad_norm": 0.7330644726753235, + "learning_rate": 1.7288859278211676e-05, + "loss": 0.3383, + "step": 5419 + }, + { + "epoch": 0.510586184969737, + "grad_norm": 0.6996157765388489, + "learning_rate": 1.7287825383632503e-05, + "loss": 0.3377, + "step": 5420 + }, + { + "epoch": 0.5106803890629048, + "grad_norm": 0.7708149552345276, + "learning_rate": 1.728679132287964e-05, + "loss": 0.33, + "step": 5421 + }, + { + "epoch": 0.5107745931560727, + "grad_norm": 0.8168370127677917, + "learning_rate": 1.728575709597666e-05, + "loss": 0.3027, + "step": 5422 + }, + { + "epoch": 0.5108687972492405, + "grad_norm": 0.7973650097846985, + "learning_rate": 1.7284722702947162e-05, + "loss": 0.3504, + "step": 5423 + }, + { + "epoch": 0.5109630013424084, + "grad_norm": 0.8431777358055115, + "learning_rate": 1.728368814381471e-05, + "loss": 0.3333, + "step": 5424 + }, + { + "epoch": 0.5110572054355762, + "grad_norm": 0.6388592720031738, + "learning_rate": 1.7282653418602908e-05, + "loss": 0.2916, + "step": 5425 + }, + { + "epoch": 0.511151409528744, + "grad_norm": 0.6659934520721436, + "learning_rate": 1.7281618527335347e-05, + "loss": 0.31, + "step": 5426 + }, + { + "epoch": 0.5112456136219119, + "grad_norm": 0.755641520023346, + "learning_rate": 1.728058347003562e-05, + "loss": 0.3206, + "step": 5427 + }, + { + "epoch": 0.5113398177150797, + "grad_norm": 0.7816777229309082, + "learning_rate": 1.727954824672733e-05, + "loss": 0.347, + "step": 5428 + }, + { + "epoch": 0.5114340218082476, + "grad_norm": 0.6937658190727234, + "learning_rate": 1.7278512857434085e-05, + "loss": 0.2998, + "step": 5429 + }, + { + "epoch": 0.5115282259014154, + "grad_norm": 0.7459749579429626, + "learning_rate": 1.7277477302179487e-05, + "loss": 0.3347, + "step": 5430 + }, + { + "epoch": 0.5116224299945833, + "grad_norm": 0.8485836982727051, + "learning_rate": 1.727644158098715e-05, + "loss": 0.3318, + "step": 5431 + }, + { + "epoch": 0.5117166340877511, + "grad_norm": 0.7895025014877319, + "learning_rate": 1.7275405693880697e-05, + "loss": 0.3334, + "step": 5432 + }, + { + "epoch": 0.511810838180919, + "grad_norm": 0.7498315572738647, + "learning_rate": 1.727436964088374e-05, + "loss": 0.288, + "step": 5433 + }, + { + "epoch": 0.5119050422740868, + "grad_norm": 0.7922171950340271, + "learning_rate": 1.7273333422019905e-05, + "loss": 0.3679, + "step": 5434 + }, + { + "epoch": 0.5119992463672547, + "grad_norm": 0.7560186386108398, + "learning_rate": 1.7272297037312817e-05, + "loss": 0.268, + "step": 5435 + }, + { + "epoch": 0.5120934504604225, + "grad_norm": 0.7202543020248413, + "learning_rate": 1.7271260486786114e-05, + "loss": 0.3706, + "step": 5436 + }, + { + "epoch": 0.5121876545535904, + "grad_norm": 0.9651054739952087, + "learning_rate": 1.7270223770463424e-05, + "loss": 0.2877, + "step": 5437 + }, + { + "epoch": 0.5122818586467582, + "grad_norm": 0.7424997091293335, + "learning_rate": 1.726918688836839e-05, + "loss": 0.3793, + "step": 5438 + }, + { + "epoch": 0.5123760627399261, + "grad_norm": 0.7279515862464905, + "learning_rate": 1.7268149840524653e-05, + "loss": 0.3479, + "step": 5439 + }, + { + "epoch": 0.5124702668330939, + "grad_norm": 0.6700050830841064, + "learning_rate": 1.7267112626955856e-05, + "loss": 0.2837, + "step": 5440 + }, + { + "epoch": 0.5125644709262618, + "grad_norm": 0.7173581123352051, + "learning_rate": 1.7266075247685656e-05, + "loss": 0.2837, + "step": 5441 + }, + { + "epoch": 0.5126586750194296, + "grad_norm": 0.8028951287269592, + "learning_rate": 1.7265037702737703e-05, + "loss": 0.3404, + "step": 5442 + }, + { + "epoch": 0.5127528791125975, + "grad_norm": 0.8250238299369812, + "learning_rate": 1.7263999992135654e-05, + "loss": 0.3008, + "step": 5443 + }, + { + "epoch": 0.5128470832057653, + "grad_norm": 0.7309693694114685, + "learning_rate": 1.726296211590317e-05, + "loss": 0.3226, + "step": 5444 + }, + { + "epoch": 0.5129412872989332, + "grad_norm": 0.71340411901474, + "learning_rate": 1.726192407406392e-05, + "loss": 0.3574, + "step": 5445 + }, + { + "epoch": 0.513035491392101, + "grad_norm": 0.7260133624076843, + "learning_rate": 1.726088586664157e-05, + "loss": 0.3388, + "step": 5446 + }, + { + "epoch": 0.5131296954852689, + "grad_norm": 0.9490150213241577, + "learning_rate": 1.7259847493659793e-05, + "loss": 0.3615, + "step": 5447 + }, + { + "epoch": 0.5132238995784367, + "grad_norm": 0.754058837890625, + "learning_rate": 1.725880895514226e-05, + "loss": 0.3652, + "step": 5448 + }, + { + "epoch": 0.5133181036716046, + "grad_norm": 0.787288248538971, + "learning_rate": 1.7257770251112662e-05, + "loss": 0.3023, + "step": 5449 + }, + { + "epoch": 0.5134123077647724, + "grad_norm": 1.0981327295303345, + "learning_rate": 1.7256731381594677e-05, + "loss": 0.3503, + "step": 5450 + }, + { + "epoch": 0.5135065118579403, + "grad_norm": 0.7456867694854736, + "learning_rate": 1.7255692346611994e-05, + "loss": 0.3643, + "step": 5451 + }, + { + "epoch": 0.5136007159511081, + "grad_norm": 0.766739010810852, + "learning_rate": 1.7254653146188306e-05, + "loss": 0.3477, + "step": 5452 + }, + { + "epoch": 0.513694920044276, + "grad_norm": 0.7890262603759766, + "learning_rate": 1.7253613780347303e-05, + "loss": 0.3745, + "step": 5453 + }, + { + "epoch": 0.5137891241374438, + "grad_norm": 0.9555128216743469, + "learning_rate": 1.725257424911269e-05, + "loss": 0.3556, + "step": 5454 + }, + { + "epoch": 0.5138833282306117, + "grad_norm": 0.7513731122016907, + "learning_rate": 1.725153455250817e-05, + "loss": 0.3147, + "step": 5455 + }, + { + "epoch": 0.5139775323237795, + "grad_norm": 1.0011706352233887, + "learning_rate": 1.7250494690557445e-05, + "loss": 0.3655, + "step": 5456 + }, + { + "epoch": 0.5140717364169474, + "grad_norm": 0.7627401947975159, + "learning_rate": 1.7249454663284225e-05, + "loss": 0.3403, + "step": 5457 + }, + { + "epoch": 0.5141659405101152, + "grad_norm": 0.7304606437683105, + "learning_rate": 1.7248414470712232e-05, + "loss": 0.3463, + "step": 5458 + }, + { + "epoch": 0.5142601446032831, + "grad_norm": 0.8432670831680298, + "learning_rate": 1.7247374112865178e-05, + "loss": 0.3694, + "step": 5459 + }, + { + "epoch": 0.5143543486964509, + "grad_norm": 0.8038775324821472, + "learning_rate": 1.7246333589766786e-05, + "loss": 0.3167, + "step": 5460 + }, + { + "epoch": 0.5144485527896188, + "grad_norm": 0.7647266983985901, + "learning_rate": 1.724529290144078e-05, + "loss": 0.3057, + "step": 5461 + }, + { + "epoch": 0.5145427568827866, + "grad_norm": 0.820859968662262, + "learning_rate": 1.7244252047910893e-05, + "loss": 0.3457, + "step": 5462 + }, + { + "epoch": 0.5146369609759545, + "grad_norm": 0.8634036779403687, + "learning_rate": 1.7243211029200852e-05, + "loss": 0.3016, + "step": 5463 + }, + { + "epoch": 0.5147311650691222, + "grad_norm": 0.8282648921012878, + "learning_rate": 1.72421698453344e-05, + "loss": 0.3581, + "step": 5464 + }, + { + "epoch": 0.51482536916229, + "grad_norm": 0.7817854285240173, + "learning_rate": 1.7241128496335276e-05, + "loss": 0.353, + "step": 5465 + }, + { + "epoch": 0.5149195732554579, + "grad_norm": 0.7150627374649048, + "learning_rate": 1.7240086982227225e-05, + "loss": 0.3542, + "step": 5466 + }, + { + "epoch": 0.5150137773486257, + "grad_norm": 0.6410994529724121, + "learning_rate": 1.723904530303399e-05, + "loss": 0.3014, + "step": 5467 + }, + { + "epoch": 0.5151079814417936, + "grad_norm": 0.8309205770492554, + "learning_rate": 1.7238003458779327e-05, + "loss": 0.3426, + "step": 5468 + }, + { + "epoch": 0.5152021855349614, + "grad_norm": 0.8303483724594116, + "learning_rate": 1.7236961449486996e-05, + "loss": 0.3431, + "step": 5469 + }, + { + "epoch": 0.5152963896281293, + "grad_norm": 0.7229982018470764, + "learning_rate": 1.7235919275180748e-05, + "loss": 0.3116, + "step": 5470 + }, + { + "epoch": 0.5153905937212971, + "grad_norm": 0.7184120416641235, + "learning_rate": 1.723487693588435e-05, + "loss": 0.3366, + "step": 5471 + }, + { + "epoch": 0.515484797814465, + "grad_norm": 0.8860011100769043, + "learning_rate": 1.723383443162157e-05, + "loss": 0.299, + "step": 5472 + }, + { + "epoch": 0.5155790019076328, + "grad_norm": 0.7865309119224548, + "learning_rate": 1.7232791762416176e-05, + "loss": 0.3552, + "step": 5473 + }, + { + "epoch": 0.5156732060008007, + "grad_norm": 0.7601330876350403, + "learning_rate": 1.7231748928291946e-05, + "loss": 0.2869, + "step": 5474 + }, + { + "epoch": 0.5157674100939685, + "grad_norm": 0.6787477731704712, + "learning_rate": 1.7230705929272655e-05, + "loss": 0.2832, + "step": 5475 + }, + { + "epoch": 0.5158616141871364, + "grad_norm": 0.7316486239433289, + "learning_rate": 1.722966276538209e-05, + "loss": 0.3329, + "step": 5476 + }, + { + "epoch": 0.5159558182803042, + "grad_norm": 0.9282322525978088, + "learning_rate": 1.7228619436644026e-05, + "loss": 0.3011, + "step": 5477 + }, + { + "epoch": 0.5160500223734721, + "grad_norm": 0.6631028652191162, + "learning_rate": 1.7227575943082268e-05, + "loss": 0.2993, + "step": 5478 + }, + { + "epoch": 0.5161442264666399, + "grad_norm": 0.7901523113250732, + "learning_rate": 1.7226532284720594e-05, + "loss": 0.3189, + "step": 5479 + }, + { + "epoch": 0.5162384305598078, + "grad_norm": 0.8018639087677002, + "learning_rate": 1.7225488461582812e-05, + "loss": 0.3547, + "step": 5480 + }, + { + "epoch": 0.5163326346529756, + "grad_norm": 0.6779786348342896, + "learning_rate": 1.7224444473692718e-05, + "loss": 0.3308, + "step": 5481 + }, + { + "epoch": 0.5164268387461435, + "grad_norm": 0.9510851502418518, + "learning_rate": 1.7223400321074115e-05, + "loss": 0.3882, + "step": 5482 + }, + { + "epoch": 0.5165210428393113, + "grad_norm": 2.2835378646850586, + "learning_rate": 1.7222356003750814e-05, + "loss": 0.3359, + "step": 5483 + }, + { + "epoch": 0.5166152469324792, + "grad_norm": 0.8252720832824707, + "learning_rate": 1.7221311521746628e-05, + "loss": 0.3025, + "step": 5484 + }, + { + "epoch": 0.516709451025647, + "grad_norm": 0.9316931962966919, + "learning_rate": 1.722026687508537e-05, + "loss": 0.3915, + "step": 5485 + }, + { + "epoch": 0.5168036551188149, + "grad_norm": 0.8290546536445618, + "learning_rate": 1.721922206379086e-05, + "loss": 0.387, + "step": 5486 + }, + { + "epoch": 0.5168978592119827, + "grad_norm": 0.7082507610321045, + "learning_rate": 1.7218177087886923e-05, + "loss": 0.3085, + "step": 5487 + }, + { + "epoch": 0.5169920633051506, + "grad_norm": 0.7358608245849609, + "learning_rate": 1.7217131947397386e-05, + "loss": 0.3285, + "step": 5488 + }, + { + "epoch": 0.5170862673983184, + "grad_norm": 0.7969195246696472, + "learning_rate": 1.721608664234608e-05, + "loss": 0.3485, + "step": 5489 + }, + { + "epoch": 0.5171804714914863, + "grad_norm": 0.7457312345504761, + "learning_rate": 1.7215041172756838e-05, + "loss": 0.3061, + "step": 5490 + }, + { + "epoch": 0.5172746755846541, + "grad_norm": 0.7926058173179626, + "learning_rate": 1.7213995538653497e-05, + "loss": 0.3561, + "step": 5491 + }, + { + "epoch": 0.517368879677822, + "grad_norm": 0.7441315650939941, + "learning_rate": 1.7212949740059903e-05, + "loss": 0.3754, + "step": 5492 + }, + { + "epoch": 0.5174630837709898, + "grad_norm": 0.7602240443229675, + "learning_rate": 1.7211903776999903e-05, + "loss": 0.3352, + "step": 5493 + }, + { + "epoch": 0.5175572878641577, + "grad_norm": 0.8453928828239441, + "learning_rate": 1.721085764949734e-05, + "loss": 0.2914, + "step": 5494 + }, + { + "epoch": 0.5176514919573255, + "grad_norm": 0.739108145236969, + "learning_rate": 1.7209811357576066e-05, + "loss": 0.2925, + "step": 5495 + }, + { + "epoch": 0.5177456960504934, + "grad_norm": 0.7192702889442444, + "learning_rate": 1.720876490125995e-05, + "loss": 0.289, + "step": 5496 + }, + { + "epoch": 0.5178399001436612, + "grad_norm": 0.8548442125320435, + "learning_rate": 1.7207718280572844e-05, + "loss": 0.3342, + "step": 5497 + }, + { + "epoch": 0.5179341042368291, + "grad_norm": 0.6716485619544983, + "learning_rate": 1.720667149553861e-05, + "loss": 0.3124, + "step": 5498 + }, + { + "epoch": 0.5180283083299969, + "grad_norm": 0.66901695728302, + "learning_rate": 1.7205624546181128e-05, + "loss": 0.3067, + "step": 5499 + }, + { + "epoch": 0.5181225124231648, + "grad_norm": 0.7227648496627808, + "learning_rate": 1.7204577432524257e-05, + "loss": 0.3009, + "step": 5500 + }, + { + "epoch": 0.5182167165163326, + "grad_norm": 0.8730160593986511, + "learning_rate": 1.7203530154591883e-05, + "loss": 0.3605, + "step": 5501 + }, + { + "epoch": 0.5183109206095005, + "grad_norm": 0.7889743447303772, + "learning_rate": 1.7202482712407876e-05, + "loss": 0.3175, + "step": 5502 + }, + { + "epoch": 0.5184051247026683, + "grad_norm": 0.7347235083580017, + "learning_rate": 1.7201435105996128e-05, + "loss": 0.3111, + "step": 5503 + }, + { + "epoch": 0.5184993287958362, + "grad_norm": 0.7456490993499756, + "learning_rate": 1.720038733538052e-05, + "loss": 0.3193, + "step": 5504 + }, + { + "epoch": 0.518593532889004, + "grad_norm": 0.9792962074279785, + "learning_rate": 1.7199339400584944e-05, + "loss": 0.3732, + "step": 5505 + }, + { + "epoch": 0.5186877369821719, + "grad_norm": 0.8372748494148254, + "learning_rate": 1.7198291301633298e-05, + "loss": 0.3591, + "step": 5506 + }, + { + "epoch": 0.5187819410753397, + "grad_norm": 0.9081077575683594, + "learning_rate": 1.719724303854948e-05, + "loss": 0.4306, + "step": 5507 + }, + { + "epoch": 0.5188761451685076, + "grad_norm": 0.7463353872299194, + "learning_rate": 1.719619461135739e-05, + "loss": 0.3307, + "step": 5508 + }, + { + "epoch": 0.5189703492616754, + "grad_norm": 0.7470934391021729, + "learning_rate": 1.719514602008093e-05, + "loss": 0.315, + "step": 5509 + }, + { + "epoch": 0.5190645533548432, + "grad_norm": 0.7186209559440613, + "learning_rate": 1.7194097264744014e-05, + "loss": 0.3001, + "step": 5510 + }, + { + "epoch": 0.5191587574480111, + "grad_norm": 0.8188936114311218, + "learning_rate": 1.7193048345370553e-05, + "loss": 0.3803, + "step": 5511 + }, + { + "epoch": 0.519252961541179, + "grad_norm": 0.7234528064727783, + "learning_rate": 1.7191999261984466e-05, + "loss": 0.3001, + "step": 5512 + }, + { + "epoch": 0.5193471656343468, + "grad_norm": 0.847127377986908, + "learning_rate": 1.7190950014609677e-05, + "loss": 0.3308, + "step": 5513 + }, + { + "epoch": 0.5194413697275146, + "grad_norm": 0.8823010921478271, + "learning_rate": 1.7189900603270105e-05, + "loss": 0.3513, + "step": 5514 + }, + { + "epoch": 0.5195355738206825, + "grad_norm": 0.7539229989051819, + "learning_rate": 1.718885102798968e-05, + "loss": 0.2975, + "step": 5515 + }, + { + "epoch": 0.5196297779138503, + "grad_norm": 0.8143484592437744, + "learning_rate": 1.718780128879233e-05, + "loss": 0.3359, + "step": 5516 + }, + { + "epoch": 0.5197239820070182, + "grad_norm": 0.8117477893829346, + "learning_rate": 1.7186751385701998e-05, + "loss": 0.3133, + "step": 5517 + }, + { + "epoch": 0.519818186100186, + "grad_norm": 0.8710623383522034, + "learning_rate": 1.718570131874262e-05, + "loss": 0.3412, + "step": 5518 + }, + { + "epoch": 0.5199123901933539, + "grad_norm": 0.9749717116355896, + "learning_rate": 1.7184651087938138e-05, + "loss": 0.3676, + "step": 5519 + }, + { + "epoch": 0.5200065942865217, + "grad_norm": 0.7328717708587646, + "learning_rate": 1.7183600693312503e-05, + "loss": 0.3066, + "step": 5520 + }, + { + "epoch": 0.5201007983796896, + "grad_norm": 0.7747542858123779, + "learning_rate": 1.718255013488966e-05, + "loss": 0.3091, + "step": 5521 + }, + { + "epoch": 0.5201950024728574, + "grad_norm": 0.757550835609436, + "learning_rate": 1.7181499412693563e-05, + "loss": 0.3609, + "step": 5522 + }, + { + "epoch": 0.5202892065660253, + "grad_norm": 0.7175095081329346, + "learning_rate": 1.7180448526748177e-05, + "loss": 0.3076, + "step": 5523 + }, + { + "epoch": 0.5203834106591931, + "grad_norm": 0.830827534198761, + "learning_rate": 1.717939747707746e-05, + "loss": 0.3386, + "step": 5524 + }, + { + "epoch": 0.520477614752361, + "grad_norm": 0.8881201148033142, + "learning_rate": 1.7178346263705372e-05, + "loss": 0.3547, + "step": 5525 + }, + { + "epoch": 0.5205718188455288, + "grad_norm": 0.7616137862205505, + "learning_rate": 1.7177294886655894e-05, + "loss": 0.3291, + "step": 5526 + }, + { + "epoch": 0.5206660229386967, + "grad_norm": 0.8208198547363281, + "learning_rate": 1.717624334595299e-05, + "loss": 0.3096, + "step": 5527 + }, + { + "epoch": 0.5207602270318645, + "grad_norm": 0.7254412770271301, + "learning_rate": 1.7175191641620637e-05, + "loss": 0.3449, + "step": 5528 + }, + { + "epoch": 0.5208544311250324, + "grad_norm": 0.7519254088401794, + "learning_rate": 1.7174139773682824e-05, + "loss": 0.3091, + "step": 5529 + }, + { + "epoch": 0.5209486352182002, + "grad_norm": 0.853268563747406, + "learning_rate": 1.7173087742163527e-05, + "loss": 0.3382, + "step": 5530 + }, + { + "epoch": 0.5210428393113681, + "grad_norm": 0.7265686988830566, + "learning_rate": 1.7172035547086732e-05, + "loss": 0.3032, + "step": 5531 + }, + { + "epoch": 0.5211370434045359, + "grad_norm": 0.674207329750061, + "learning_rate": 1.7170983188476437e-05, + "loss": 0.2717, + "step": 5532 + }, + { + "epoch": 0.5212312474977038, + "grad_norm": 0.874945878982544, + "learning_rate": 1.7169930666356637e-05, + "loss": 0.3788, + "step": 5533 + }, + { + "epoch": 0.5213254515908716, + "grad_norm": 0.8548377752304077, + "learning_rate": 1.716887798075133e-05, + "loss": 0.3248, + "step": 5534 + }, + { + "epoch": 0.5214196556840395, + "grad_norm": 0.7522386312484741, + "learning_rate": 1.7167825131684516e-05, + "loss": 0.3738, + "step": 5535 + }, + { + "epoch": 0.5215138597772073, + "grad_norm": 0.8124439120292664, + "learning_rate": 1.7166772119180202e-05, + "loss": 0.3562, + "step": 5536 + }, + { + "epoch": 0.5216080638703752, + "grad_norm": 0.7803117036819458, + "learning_rate": 1.7165718943262402e-05, + "loss": 0.3614, + "step": 5537 + }, + { + "epoch": 0.521702267963543, + "grad_norm": 0.6467984318733215, + "learning_rate": 1.7164665603955128e-05, + "loss": 0.2744, + "step": 5538 + }, + { + "epoch": 0.5217964720567109, + "grad_norm": 0.7423121929168701, + "learning_rate": 1.7163612101282398e-05, + "loss": 0.297, + "step": 5539 + }, + { + "epoch": 0.5218906761498787, + "grad_norm": 0.8348680734634399, + "learning_rate": 1.7162558435268235e-05, + "loss": 0.3428, + "step": 5540 + }, + { + "epoch": 0.5219848802430466, + "grad_norm": 0.7723286151885986, + "learning_rate": 1.716150460593666e-05, + "loss": 0.3472, + "step": 5541 + }, + { + "epoch": 0.5220790843362144, + "grad_norm": 0.868765652179718, + "learning_rate": 1.7160450613311704e-05, + "loss": 0.3718, + "step": 5542 + }, + { + "epoch": 0.5221732884293823, + "grad_norm": 0.7805017232894897, + "learning_rate": 1.7159396457417405e-05, + "loss": 0.3947, + "step": 5543 + }, + { + "epoch": 0.5222674925225501, + "grad_norm": 0.9394553899765015, + "learning_rate": 1.715834213827779e-05, + "loss": 0.3416, + "step": 5544 + }, + { + "epoch": 0.522361696615718, + "grad_norm": 0.7767390012741089, + "learning_rate": 1.7157287655916904e-05, + "loss": 0.3545, + "step": 5545 + }, + { + "epoch": 0.5224559007088858, + "grad_norm": 0.7393049001693726, + "learning_rate": 1.715623301035879e-05, + "loss": 0.3298, + "step": 5546 + }, + { + "epoch": 0.5225501048020537, + "grad_norm": 0.780881404876709, + "learning_rate": 1.7155178201627497e-05, + "loss": 0.3271, + "step": 5547 + }, + { + "epoch": 0.5226443088952215, + "grad_norm": 0.7244885563850403, + "learning_rate": 1.7154123229747077e-05, + "loss": 0.3464, + "step": 5548 + }, + { + "epoch": 0.5227385129883894, + "grad_norm": 0.7590210437774658, + "learning_rate": 1.715306809474158e-05, + "loss": 0.2862, + "step": 5549 + }, + { + "epoch": 0.5228327170815572, + "grad_norm": 0.8282087445259094, + "learning_rate": 1.715201279663507e-05, + "loss": 0.3589, + "step": 5550 + }, + { + "epoch": 0.5229269211747251, + "grad_norm": 0.7800232172012329, + "learning_rate": 1.715095733545161e-05, + "loss": 0.3254, + "step": 5551 + }, + { + "epoch": 0.5230211252678929, + "grad_norm": 0.8241845369338989, + "learning_rate": 1.714990171121526e-05, + "loss": 0.316, + "step": 5552 + }, + { + "epoch": 0.5231153293610608, + "grad_norm": 1.1471837759017944, + "learning_rate": 1.7148845923950092e-05, + "loss": 0.3573, + "step": 5553 + }, + { + "epoch": 0.5232095334542286, + "grad_norm": 0.7228818535804749, + "learning_rate": 1.7147789973680184e-05, + "loss": 0.3171, + "step": 5554 + }, + { + "epoch": 0.5233037375473965, + "grad_norm": 0.9623462557792664, + "learning_rate": 1.7146733860429614e-05, + "loss": 0.3724, + "step": 5555 + }, + { + "epoch": 0.5233979416405643, + "grad_norm": 0.7974388599395752, + "learning_rate": 1.7145677584222454e-05, + "loss": 0.3478, + "step": 5556 + }, + { + "epoch": 0.5234921457337322, + "grad_norm": 0.9783768057823181, + "learning_rate": 1.7144621145082794e-05, + "loss": 0.3258, + "step": 5557 + }, + { + "epoch": 0.5235863498269, + "grad_norm": 0.7557503581047058, + "learning_rate": 1.7143564543034724e-05, + "loss": 0.3821, + "step": 5558 + }, + { + "epoch": 0.5236805539200678, + "grad_norm": 1.0619620084762573, + "learning_rate": 1.7142507778102334e-05, + "loss": 0.3297, + "step": 5559 + }, + { + "epoch": 0.5237747580132357, + "grad_norm": 0.8709781169891357, + "learning_rate": 1.714145085030972e-05, + "loss": 0.3376, + "step": 5560 + }, + { + "epoch": 0.5238689621064035, + "grad_norm": 0.8476454019546509, + "learning_rate": 1.714039375968098e-05, + "loss": 0.3563, + "step": 5561 + }, + { + "epoch": 0.5239631661995714, + "grad_norm": 0.720432698726654, + "learning_rate": 1.7139336506240227e-05, + "loss": 0.2886, + "step": 5562 + }, + { + "epoch": 0.5240573702927392, + "grad_norm": 0.9466307759284973, + "learning_rate": 1.7138279090011556e-05, + "loss": 0.3622, + "step": 5563 + }, + { + "epoch": 0.5241515743859071, + "grad_norm": 0.6946443915367126, + "learning_rate": 1.7137221511019083e-05, + "loss": 0.3279, + "step": 5564 + }, + { + "epoch": 0.5242457784790749, + "grad_norm": 0.7914426326751709, + "learning_rate": 1.713616376928692e-05, + "loss": 0.3326, + "step": 5565 + }, + { + "epoch": 0.5243399825722428, + "grad_norm": 0.7700616121292114, + "learning_rate": 1.7135105864839187e-05, + "loss": 0.2737, + "step": 5566 + }, + { + "epoch": 0.5244341866654106, + "grad_norm": 0.8591805696487427, + "learning_rate": 1.7134047797700004e-05, + "loss": 0.334, + "step": 5567 + }, + { + "epoch": 0.5245283907585785, + "grad_norm": 0.9132749438285828, + "learning_rate": 1.71329895678935e-05, + "loss": 0.3397, + "step": 5568 + }, + { + "epoch": 0.5246225948517463, + "grad_norm": 0.7847031950950623, + "learning_rate": 1.7131931175443806e-05, + "loss": 0.3108, + "step": 5569 + }, + { + "epoch": 0.5247167989449142, + "grad_norm": 0.927769124507904, + "learning_rate": 1.7130872620375048e-05, + "loss": 0.3748, + "step": 5570 + }, + { + "epoch": 0.524811003038082, + "grad_norm": 0.9591184854507446, + "learning_rate": 1.7129813902711366e-05, + "loss": 0.3548, + "step": 5571 + }, + { + "epoch": 0.5249052071312499, + "grad_norm": 0.797385573387146, + "learning_rate": 1.71287550224769e-05, + "loss": 0.3013, + "step": 5572 + }, + { + "epoch": 0.5249994112244177, + "grad_norm": 0.7265573143959045, + "learning_rate": 1.7127695979695795e-05, + "loss": 0.3252, + "step": 5573 + }, + { + "epoch": 0.5250936153175856, + "grad_norm": 0.9222989082336426, + "learning_rate": 1.71266367743922e-05, + "loss": 0.3214, + "step": 5574 + }, + { + "epoch": 0.5251878194107534, + "grad_norm": 0.906359851360321, + "learning_rate": 1.7125577406590266e-05, + "loss": 0.3792, + "step": 5575 + }, + { + "epoch": 0.5252820235039213, + "grad_norm": 0.8564597964286804, + "learning_rate": 1.7124517876314143e-05, + "loss": 0.3431, + "step": 5576 + }, + { + "epoch": 0.5253762275970891, + "grad_norm": 0.8298361301422119, + "learning_rate": 1.7123458183587996e-05, + "loss": 0.3019, + "step": 5577 + }, + { + "epoch": 0.525470431690257, + "grad_norm": 0.8447403311729431, + "learning_rate": 1.712239832843599e-05, + "loss": 0.3527, + "step": 5578 + }, + { + "epoch": 0.5255646357834248, + "grad_norm": 0.7736579775810242, + "learning_rate": 1.7121338310882283e-05, + "loss": 0.3332, + "step": 5579 + }, + { + "epoch": 0.5256588398765927, + "grad_norm": 0.8325954675674438, + "learning_rate": 1.7120278130951046e-05, + "loss": 0.3545, + "step": 5580 + }, + { + "epoch": 0.5257530439697605, + "grad_norm": 0.8582270741462708, + "learning_rate": 1.7119217788666462e-05, + "loss": 0.3623, + "step": 5581 + }, + { + "epoch": 0.5258472480629284, + "grad_norm": 0.7773718237876892, + "learning_rate": 1.71181572840527e-05, + "loss": 0.352, + "step": 5582 + }, + { + "epoch": 0.5259414521560962, + "grad_norm": 0.8369136452674866, + "learning_rate": 1.7117096617133943e-05, + "loss": 0.368, + "step": 5583 + }, + { + "epoch": 0.5260356562492641, + "grad_norm": 0.8247154951095581, + "learning_rate": 1.7116035787934377e-05, + "loss": 0.3153, + "step": 5584 + }, + { + "epoch": 0.5261298603424319, + "grad_norm": 0.7184444069862366, + "learning_rate": 1.711497479647819e-05, + "loss": 0.3331, + "step": 5585 + }, + { + "epoch": 0.5262240644355998, + "grad_norm": 0.9262751340866089, + "learning_rate": 1.711391364278957e-05, + "loss": 0.3529, + "step": 5586 + }, + { + "epoch": 0.5263182685287676, + "grad_norm": 0.7359481453895569, + "learning_rate": 1.711285232689272e-05, + "loss": 0.3376, + "step": 5587 + }, + { + "epoch": 0.5264124726219355, + "grad_norm": 0.8670516610145569, + "learning_rate": 1.711179084881184e-05, + "loss": 0.3538, + "step": 5588 + }, + { + "epoch": 0.5265066767151033, + "grad_norm": 0.8430225253105164, + "learning_rate": 1.7110729208571128e-05, + "loss": 0.377, + "step": 5589 + }, + { + "epoch": 0.5266008808082712, + "grad_norm": 0.8114784955978394, + "learning_rate": 1.7109667406194792e-05, + "loss": 0.2815, + "step": 5590 + }, + { + "epoch": 0.526695084901439, + "grad_norm": 0.7337326407432556, + "learning_rate": 1.7108605441707046e-05, + "loss": 0.3201, + "step": 5591 + }, + { + "epoch": 0.5267892889946069, + "grad_norm": 1.7015444040298462, + "learning_rate": 1.71075433151321e-05, + "loss": 0.3603, + "step": 5592 + }, + { + "epoch": 0.5268834930877747, + "grad_norm": 0.8098084926605225, + "learning_rate": 1.7106481026494175e-05, + "loss": 0.3353, + "step": 5593 + }, + { + "epoch": 0.5269776971809426, + "grad_norm": 0.772486686706543, + "learning_rate": 1.710541857581749e-05, + "loss": 0.3192, + "step": 5594 + }, + { + "epoch": 0.5270719012741104, + "grad_norm": 0.7702192068099976, + "learning_rate": 1.7104355963126275e-05, + "loss": 0.378, + "step": 5595 + }, + { + "epoch": 0.5271661053672783, + "grad_norm": 0.893916130065918, + "learning_rate": 1.7103293188444756e-05, + "loss": 0.3371, + "step": 5596 + }, + { + "epoch": 0.5272603094604461, + "grad_norm": 0.9102244973182678, + "learning_rate": 1.7102230251797168e-05, + "loss": 0.3498, + "step": 5597 + }, + { + "epoch": 0.527354513553614, + "grad_norm": 0.9307056665420532, + "learning_rate": 1.7101167153207746e-05, + "loss": 0.3123, + "step": 5598 + }, + { + "epoch": 0.5274487176467818, + "grad_norm": 0.765352189540863, + "learning_rate": 1.7100103892700733e-05, + "loss": 0.3247, + "step": 5599 + }, + { + "epoch": 0.5275429217399497, + "grad_norm": 0.7557453513145447, + "learning_rate": 1.7099040470300366e-05, + "loss": 0.363, + "step": 5600 + }, + { + "epoch": 0.5276371258331175, + "grad_norm": 0.8222182393074036, + "learning_rate": 1.7097976886030902e-05, + "loss": 0.3337, + "step": 5601 + }, + { + "epoch": 0.5277313299262852, + "grad_norm": 0.9086328148841858, + "learning_rate": 1.7096913139916583e-05, + "loss": 0.3495, + "step": 5602 + }, + { + "epoch": 0.5278255340194531, + "grad_norm": 0.6892519593238831, + "learning_rate": 1.7095849231981674e-05, + "loss": 0.2742, + "step": 5603 + }, + { + "epoch": 0.5279197381126209, + "grad_norm": 0.875949501991272, + "learning_rate": 1.7094785162250428e-05, + "loss": 0.3633, + "step": 5604 + }, + { + "epoch": 0.5280139422057888, + "grad_norm": 0.784872829914093, + "learning_rate": 1.7093720930747104e-05, + "loss": 0.3159, + "step": 5605 + }, + { + "epoch": 0.5281081462989566, + "grad_norm": 0.8378578424453735, + "learning_rate": 1.7092656537495974e-05, + "loss": 0.3402, + "step": 5606 + }, + { + "epoch": 0.5282023503921245, + "grad_norm": 0.7835084199905396, + "learning_rate": 1.7091591982521305e-05, + "loss": 0.3068, + "step": 5607 + }, + { + "epoch": 0.5282965544852923, + "grad_norm": 1.1599599123001099, + "learning_rate": 1.7090527265847375e-05, + "loss": 0.3309, + "step": 5608 + }, + { + "epoch": 0.5283907585784602, + "grad_norm": 0.8036758303642273, + "learning_rate": 1.7089462387498453e-05, + "loss": 0.3676, + "step": 5609 + }, + { + "epoch": 0.528484962671628, + "grad_norm": 0.8531378507614136, + "learning_rate": 1.708839734749883e-05, + "loss": 0.346, + "step": 5610 + }, + { + "epoch": 0.5285791667647959, + "grad_norm": 0.9988169074058533, + "learning_rate": 1.7087332145872778e-05, + "loss": 0.3574, + "step": 5611 + }, + { + "epoch": 0.5286733708579637, + "grad_norm": 0.8267210125923157, + "learning_rate": 1.70862667826446e-05, + "loss": 0.3186, + "step": 5612 + }, + { + "epoch": 0.5287675749511316, + "grad_norm": 0.9520545601844788, + "learning_rate": 1.7085201257838574e-05, + "loss": 0.3687, + "step": 5613 + }, + { + "epoch": 0.5288617790442994, + "grad_norm": 0.6689392328262329, + "learning_rate": 1.7084135571479005e-05, + "loss": 0.3086, + "step": 5614 + }, + { + "epoch": 0.5289559831374673, + "grad_norm": 0.7221395969390869, + "learning_rate": 1.708306972359019e-05, + "loss": 0.318, + "step": 5615 + }, + { + "epoch": 0.5290501872306351, + "grad_norm": 0.7734705805778503, + "learning_rate": 1.7082003714196428e-05, + "loss": 0.3415, + "step": 5616 + }, + { + "epoch": 0.529144391323803, + "grad_norm": 1.0123803615570068, + "learning_rate": 1.708093754332203e-05, + "loss": 0.3626, + "step": 5617 + }, + { + "epoch": 0.5292385954169708, + "grad_norm": 1.1305488348007202, + "learning_rate": 1.7079871210991306e-05, + "loss": 0.363, + "step": 5618 + }, + { + "epoch": 0.5293327995101387, + "grad_norm": 0.8126332759857178, + "learning_rate": 1.7078804717228568e-05, + "loss": 0.3115, + "step": 5619 + }, + { + "epoch": 0.5294270036033065, + "grad_norm": 0.7479857206344604, + "learning_rate": 1.7077738062058135e-05, + "loss": 0.3584, + "step": 5620 + }, + { + "epoch": 0.5295212076964744, + "grad_norm": 0.86054927110672, + "learning_rate": 1.707667124550433e-05, + "loss": 0.383, + "step": 5621 + }, + { + "epoch": 0.5296154117896422, + "grad_norm": 0.7326262593269348, + "learning_rate": 1.7075604267591475e-05, + "loss": 0.3778, + "step": 5622 + }, + { + "epoch": 0.5297096158828101, + "grad_norm": 0.6459468007087708, + "learning_rate": 1.70745371283439e-05, + "loss": 0.2673, + "step": 5623 + }, + { + "epoch": 0.5298038199759779, + "grad_norm": 0.8367050290107727, + "learning_rate": 1.7073469827785936e-05, + "loss": 0.3682, + "step": 5624 + }, + { + "epoch": 0.5298980240691458, + "grad_norm": 0.9269501566886902, + "learning_rate": 1.7072402365941925e-05, + "loss": 0.3366, + "step": 5625 + }, + { + "epoch": 0.5299922281623136, + "grad_norm": 0.75841224193573, + "learning_rate": 1.70713347428362e-05, + "loss": 0.3442, + "step": 5626 + }, + { + "epoch": 0.5300864322554815, + "grad_norm": 0.7608478665351868, + "learning_rate": 1.7070266958493103e-05, + "loss": 0.3226, + "step": 5627 + }, + { + "epoch": 0.5301806363486493, + "grad_norm": 0.7829641103744507, + "learning_rate": 1.706919901293699e-05, + "loss": 0.3231, + "step": 5628 + }, + { + "epoch": 0.5302748404418172, + "grad_norm": 0.828075110912323, + "learning_rate": 1.7068130906192207e-05, + "loss": 0.3378, + "step": 5629 + }, + { + "epoch": 0.530369044534985, + "grad_norm": 0.7459672689437866, + "learning_rate": 1.7067062638283104e-05, + "loss": 0.341, + "step": 5630 + }, + { + "epoch": 0.5304632486281529, + "grad_norm": 0.8326666951179504, + "learning_rate": 1.7065994209234044e-05, + "loss": 0.3374, + "step": 5631 + }, + { + "epoch": 0.5305574527213207, + "grad_norm": 0.8724443316459656, + "learning_rate": 1.7064925619069393e-05, + "loss": 0.3682, + "step": 5632 + }, + { + "epoch": 0.5306516568144886, + "grad_norm": 0.8615604639053345, + "learning_rate": 1.7063856867813505e-05, + "loss": 0.3717, + "step": 5633 + }, + { + "epoch": 0.5307458609076564, + "grad_norm": 0.7260650992393494, + "learning_rate": 1.7062787955490762e-05, + "loss": 0.3283, + "step": 5634 + }, + { + "epoch": 0.5308400650008243, + "grad_norm": 0.7473740577697754, + "learning_rate": 1.7061718882125528e-05, + "loss": 0.3357, + "step": 5635 + }, + { + "epoch": 0.5309342690939921, + "grad_norm": 0.7821106910705566, + "learning_rate": 1.7060649647742183e-05, + "loss": 0.3039, + "step": 5636 + }, + { + "epoch": 0.53102847318716, + "grad_norm": 0.778623640537262, + "learning_rate": 1.705958025236511e-05, + "loss": 0.3356, + "step": 5637 + }, + { + "epoch": 0.5311226772803278, + "grad_norm": 0.7781974673271179, + "learning_rate": 1.7058510696018686e-05, + "loss": 0.3379, + "step": 5638 + }, + { + "epoch": 0.5312168813734957, + "grad_norm": 0.7608444690704346, + "learning_rate": 1.70574409787273e-05, + "loss": 0.3, + "step": 5639 + }, + { + "epoch": 0.5313110854666635, + "grad_norm": 1.0025386810302734, + "learning_rate": 1.7056371100515345e-05, + "loss": 0.3566, + "step": 5640 + }, + { + "epoch": 0.5314052895598314, + "grad_norm": 0.7889797687530518, + "learning_rate": 1.7055301061407217e-05, + "loss": 0.282, + "step": 5641 + }, + { + "epoch": 0.5314994936529992, + "grad_norm": 0.7105996012687683, + "learning_rate": 1.7054230861427316e-05, + "loss": 0.3223, + "step": 5642 + }, + { + "epoch": 0.531593697746167, + "grad_norm": 0.7551090717315674, + "learning_rate": 1.705316050060004e-05, + "loss": 0.3308, + "step": 5643 + }, + { + "epoch": 0.5316879018393349, + "grad_norm": 0.8662528395652771, + "learning_rate": 1.7052089978949796e-05, + "loss": 0.3789, + "step": 5644 + }, + { + "epoch": 0.5317821059325027, + "grad_norm": 0.7586779594421387, + "learning_rate": 1.7051019296501e-05, + "loss": 0.3023, + "step": 5645 + }, + { + "epoch": 0.5318763100256706, + "grad_norm": 0.7028634548187256, + "learning_rate": 1.7049948453278052e-05, + "loss": 0.3238, + "step": 5646 + }, + { + "epoch": 0.5319705141188384, + "grad_norm": 0.7961217164993286, + "learning_rate": 1.704887744930538e-05, + "loss": 0.3831, + "step": 5647 + }, + { + "epoch": 0.5320647182120063, + "grad_norm": 0.7676875591278076, + "learning_rate": 1.70478062846074e-05, + "loss": 0.3099, + "step": 5648 + }, + { + "epoch": 0.5321589223051741, + "grad_norm": 0.7232643365859985, + "learning_rate": 1.7046734959208536e-05, + "loss": 0.3167, + "step": 5649 + }, + { + "epoch": 0.532253126398342, + "grad_norm": 0.7947850823402405, + "learning_rate": 1.7045663473133215e-05, + "loss": 0.3208, + "step": 5650 + }, + { + "epoch": 0.5323473304915098, + "grad_norm": 0.7122358083724976, + "learning_rate": 1.7044591826405877e-05, + "loss": 0.3155, + "step": 5651 + }, + { + "epoch": 0.5324415345846777, + "grad_norm": 0.9151607751846313, + "learning_rate": 1.7043520019050945e-05, + "loss": 0.3632, + "step": 5652 + }, + { + "epoch": 0.5325357386778455, + "grad_norm": 0.8084571957588196, + "learning_rate": 1.7042448051092867e-05, + "loss": 0.3433, + "step": 5653 + }, + { + "epoch": 0.5326299427710134, + "grad_norm": 0.8168090581893921, + "learning_rate": 1.704137592255608e-05, + "loss": 0.3339, + "step": 5654 + }, + { + "epoch": 0.5327241468641812, + "grad_norm": 0.723678469657898, + "learning_rate": 1.7040303633465033e-05, + "loss": 0.3043, + "step": 5655 + }, + { + "epoch": 0.5328183509573491, + "grad_norm": 0.7119664549827576, + "learning_rate": 1.7039231183844174e-05, + "loss": 0.2625, + "step": 5656 + }, + { + "epoch": 0.5329125550505169, + "grad_norm": 0.686281144618988, + "learning_rate": 1.703815857371796e-05, + "loss": 0.2849, + "step": 5657 + }, + { + "epoch": 0.5330067591436848, + "grad_norm": 0.9660475850105286, + "learning_rate": 1.7037085803110845e-05, + "loss": 0.3211, + "step": 5658 + }, + { + "epoch": 0.5331009632368526, + "grad_norm": 0.7467395067214966, + "learning_rate": 1.703601287204729e-05, + "loss": 0.3489, + "step": 5659 + }, + { + "epoch": 0.5331951673300205, + "grad_norm": 1.1019151210784912, + "learning_rate": 1.703493978055176e-05, + "loss": 0.3479, + "step": 5660 + }, + { + "epoch": 0.5332893714231883, + "grad_norm": 0.768088698387146, + "learning_rate": 1.7033866528648722e-05, + "loss": 0.2789, + "step": 5661 + }, + { + "epoch": 0.5333835755163562, + "grad_norm": 0.971484899520874, + "learning_rate": 1.703279311636265e-05, + "loss": 0.3748, + "step": 5662 + }, + { + "epoch": 0.533477779609524, + "grad_norm": 0.7638828754425049, + "learning_rate": 1.7031719543718018e-05, + "loss": 0.3528, + "step": 5663 + }, + { + "epoch": 0.5335719837026919, + "grad_norm": 0.817284345626831, + "learning_rate": 1.703064581073931e-05, + "loss": 0.3509, + "step": 5664 + }, + { + "epoch": 0.5336661877958597, + "grad_norm": 0.8197344541549683, + "learning_rate": 1.7029571917451e-05, + "loss": 0.3453, + "step": 5665 + }, + { + "epoch": 0.5337603918890276, + "grad_norm": 0.7332201600074768, + "learning_rate": 1.7028497863877576e-05, + "loss": 0.3008, + "step": 5666 + }, + { + "epoch": 0.5338545959821954, + "grad_norm": 0.876661479473114, + "learning_rate": 1.7027423650043538e-05, + "loss": 0.3491, + "step": 5667 + }, + { + "epoch": 0.5339488000753633, + "grad_norm": 0.7736461758613586, + "learning_rate": 1.702634927597337e-05, + "loss": 0.3967, + "step": 5668 + }, + { + "epoch": 0.5340430041685311, + "grad_norm": 1.092462182044983, + "learning_rate": 1.702527474169157e-05, + "loss": 0.3447, + "step": 5669 + }, + { + "epoch": 0.534137208261699, + "grad_norm": 0.7720444798469543, + "learning_rate": 1.7024200047222645e-05, + "loss": 0.3777, + "step": 5670 + }, + { + "epoch": 0.5342314123548668, + "grad_norm": 0.8187174797058105, + "learning_rate": 1.7023125192591092e-05, + "loss": 0.3194, + "step": 5671 + }, + { + "epoch": 0.5343256164480347, + "grad_norm": 0.6948956251144409, + "learning_rate": 1.7022050177821425e-05, + "loss": 0.3183, + "step": 5672 + }, + { + "epoch": 0.5344198205412025, + "grad_norm": 0.8639745116233826, + "learning_rate": 1.702097500293815e-05, + "loss": 0.3483, + "step": 5673 + }, + { + "epoch": 0.5345140246343704, + "grad_norm": 0.7871007919311523, + "learning_rate": 1.7019899667965795e-05, + "loss": 0.3817, + "step": 5674 + }, + { + "epoch": 0.5346082287275382, + "grad_norm": 0.7117692232131958, + "learning_rate": 1.7018824172928864e-05, + "loss": 0.3135, + "step": 5675 + }, + { + "epoch": 0.5347024328207061, + "grad_norm": 0.6765762567520142, + "learning_rate": 1.701774851785189e-05, + "loss": 0.351, + "step": 5676 + }, + { + "epoch": 0.5347966369138739, + "grad_norm": 0.7431235313415527, + "learning_rate": 1.7016672702759397e-05, + "loss": 0.2992, + "step": 5677 + }, + { + "epoch": 0.5348908410070418, + "grad_norm": 0.7420750856399536, + "learning_rate": 1.7015596727675914e-05, + "loss": 0.3344, + "step": 5678 + }, + { + "epoch": 0.5349850451002096, + "grad_norm": 0.6724424958229065, + "learning_rate": 1.7014520592625977e-05, + "loss": 0.2744, + "step": 5679 + }, + { + "epoch": 0.5350792491933775, + "grad_norm": 0.7238250970840454, + "learning_rate": 1.7013444297634122e-05, + "loss": 0.3036, + "step": 5680 + }, + { + "epoch": 0.5351734532865453, + "grad_norm": 0.8361940383911133, + "learning_rate": 1.7012367842724887e-05, + "loss": 0.3544, + "step": 5681 + }, + { + "epoch": 0.5352676573797132, + "grad_norm": 0.7910535335540771, + "learning_rate": 1.7011291227922827e-05, + "loss": 0.3195, + "step": 5682 + }, + { + "epoch": 0.535361861472881, + "grad_norm": 0.9023531675338745, + "learning_rate": 1.7010214453252477e-05, + "loss": 0.3374, + "step": 5683 + }, + { + "epoch": 0.5354560655660489, + "grad_norm": 0.7297065258026123, + "learning_rate": 1.7009137518738397e-05, + "loss": 0.3461, + "step": 5684 + }, + { + "epoch": 0.5355502696592167, + "grad_norm": 0.9916013479232788, + "learning_rate": 1.7008060424405145e-05, + "loss": 0.3538, + "step": 5685 + }, + { + "epoch": 0.5356444737523846, + "grad_norm": 0.8602205514907837, + "learning_rate": 1.7006983170277277e-05, + "loss": 0.3424, + "step": 5686 + }, + { + "epoch": 0.5357386778455524, + "grad_norm": 0.7788354754447937, + "learning_rate": 1.7005905756379354e-05, + "loss": 0.2895, + "step": 5687 + }, + { + "epoch": 0.5358328819387203, + "grad_norm": 0.9086410999298096, + "learning_rate": 1.7004828182735947e-05, + "loss": 0.4563, + "step": 5688 + }, + { + "epoch": 0.5359270860318881, + "grad_norm": 0.7011946439743042, + "learning_rate": 1.7003750449371624e-05, + "loss": 0.3276, + "step": 5689 + }, + { + "epoch": 0.536021290125056, + "grad_norm": 0.7426707148551941, + "learning_rate": 1.7002672556310957e-05, + "loss": 0.3452, + "step": 5690 + }, + { + "epoch": 0.5361154942182238, + "grad_norm": 0.7261567711830139, + "learning_rate": 1.7001594503578526e-05, + "loss": 0.2979, + "step": 5691 + }, + { + "epoch": 0.5362096983113916, + "grad_norm": 0.760969340801239, + "learning_rate": 1.7000516291198914e-05, + "loss": 0.3198, + "step": 5692 + }, + { + "epoch": 0.5363039024045595, + "grad_norm": 0.7361472845077515, + "learning_rate": 1.6999437919196705e-05, + "loss": 0.3361, + "step": 5693 + }, + { + "epoch": 0.5363981064977273, + "grad_norm": 0.818294107913971, + "learning_rate": 1.6998359387596484e-05, + "loss": 0.3582, + "step": 5694 + }, + { + "epoch": 0.5364923105908952, + "grad_norm": 0.8254144191741943, + "learning_rate": 1.699728069642285e-05, + "loss": 0.3582, + "step": 5695 + }, + { + "epoch": 0.536586514684063, + "grad_norm": 0.7636622190475464, + "learning_rate": 1.699620184570039e-05, + "loss": 0.337, + "step": 5696 + }, + { + "epoch": 0.5366807187772309, + "grad_norm": 0.7546584606170654, + "learning_rate": 1.6995122835453708e-05, + "loss": 0.3052, + "step": 5697 + }, + { + "epoch": 0.5367749228703987, + "grad_norm": 0.8165667057037354, + "learning_rate": 1.699404366570741e-05, + "loss": 0.2908, + "step": 5698 + }, + { + "epoch": 0.5368691269635666, + "grad_norm": 0.7652526497840881, + "learning_rate": 1.6992964336486094e-05, + "loss": 0.3535, + "step": 5699 + }, + { + "epoch": 0.5369633310567344, + "grad_norm": 0.786133348941803, + "learning_rate": 1.6991884847814385e-05, + "loss": 0.3139, + "step": 5700 + }, + { + "epoch": 0.5370575351499023, + "grad_norm": 0.7707028985023499, + "learning_rate": 1.6990805199716885e-05, + "loss": 0.3701, + "step": 5701 + }, + { + "epoch": 0.5371517392430701, + "grad_norm": 0.7296610474586487, + "learning_rate": 1.6989725392218213e-05, + "loss": 0.308, + "step": 5702 + }, + { + "epoch": 0.537245943336238, + "grad_norm": 0.6958910226821899, + "learning_rate": 1.6988645425342993e-05, + "loss": 0.3136, + "step": 5703 + }, + { + "epoch": 0.5373401474294058, + "grad_norm": 0.8359389901161194, + "learning_rate": 1.698756529911585e-05, + "loss": 0.3752, + "step": 5704 + }, + { + "epoch": 0.5374343515225737, + "grad_norm": 0.7149270176887512, + "learning_rate": 1.698648501356141e-05, + "loss": 0.3216, + "step": 5705 + }, + { + "epoch": 0.5375285556157415, + "grad_norm": 0.6960687041282654, + "learning_rate": 1.698540456870431e-05, + "loss": 0.295, + "step": 5706 + }, + { + "epoch": 0.5376227597089094, + "grad_norm": 0.7871416807174683, + "learning_rate": 1.698432396456918e-05, + "loss": 0.3146, + "step": 5707 + }, + { + "epoch": 0.5377169638020772, + "grad_norm": 0.7495483160018921, + "learning_rate": 1.6983243201180663e-05, + "loss": 0.3428, + "step": 5708 + }, + { + "epoch": 0.5378111678952451, + "grad_norm": 0.7924083471298218, + "learning_rate": 1.69821622785634e-05, + "loss": 0.4065, + "step": 5709 + }, + { + "epoch": 0.5379053719884129, + "grad_norm": 0.6857657432556152, + "learning_rate": 1.698108119674204e-05, + "loss": 0.3177, + "step": 5710 + }, + { + "epoch": 0.5379995760815808, + "grad_norm": 0.6931462287902832, + "learning_rate": 1.6979999955741234e-05, + "loss": 0.2973, + "step": 5711 + }, + { + "epoch": 0.5380937801747486, + "grad_norm": 0.8428118824958801, + "learning_rate": 1.6978918555585634e-05, + "loss": 0.3352, + "step": 5712 + }, + { + "epoch": 0.5381879842679165, + "grad_norm": 0.7515289187431335, + "learning_rate": 1.6977836996299896e-05, + "loss": 0.3374, + "step": 5713 + }, + { + "epoch": 0.5382821883610843, + "grad_norm": 0.7781171798706055, + "learning_rate": 1.6976755277908684e-05, + "loss": 0.3353, + "step": 5714 + }, + { + "epoch": 0.5383763924542522, + "grad_norm": 0.8441618084907532, + "learning_rate": 1.6975673400436662e-05, + "loss": 0.3592, + "step": 5715 + }, + { + "epoch": 0.53847059654742, + "grad_norm": 0.7104495763778687, + "learning_rate": 1.6974591363908496e-05, + "loss": 0.3033, + "step": 5716 + }, + { + "epoch": 0.5385648006405879, + "grad_norm": 0.83321213722229, + "learning_rate": 1.6973509168348863e-05, + "loss": 0.3641, + "step": 5717 + }, + { + "epoch": 0.5386590047337557, + "grad_norm": 0.8065574169158936, + "learning_rate": 1.6972426813782433e-05, + "loss": 0.3526, + "step": 5718 + }, + { + "epoch": 0.5387532088269236, + "grad_norm": 0.7498657703399658, + "learning_rate": 1.6971344300233893e-05, + "loss": 0.312, + "step": 5719 + }, + { + "epoch": 0.5388474129200914, + "grad_norm": 0.7255940437316895, + "learning_rate": 1.697026162772792e-05, + "loss": 0.2893, + "step": 5720 + }, + { + "epoch": 0.5389416170132593, + "grad_norm": 0.7836167812347412, + "learning_rate": 1.6969178796289202e-05, + "loss": 0.2813, + "step": 5721 + }, + { + "epoch": 0.5390358211064271, + "grad_norm": 0.9158719182014465, + "learning_rate": 1.696809580594243e-05, + "loss": 0.3561, + "step": 5722 + }, + { + "epoch": 0.539130025199595, + "grad_norm": 0.8073228001594543, + "learning_rate": 1.6967012656712296e-05, + "loss": 0.3363, + "step": 5723 + }, + { + "epoch": 0.5392242292927628, + "grad_norm": 0.7554263472557068, + "learning_rate": 1.6965929348623497e-05, + "loss": 0.2851, + "step": 5724 + }, + { + "epoch": 0.5393184333859307, + "grad_norm": 0.760604739189148, + "learning_rate": 1.696484588170074e-05, + "loss": 0.3638, + "step": 5725 + }, + { + "epoch": 0.5394126374790985, + "grad_norm": 0.9967789649963379, + "learning_rate": 1.6963762255968723e-05, + "loss": 0.3422, + "step": 5726 + }, + { + "epoch": 0.5395068415722664, + "grad_norm": 0.7668309807777405, + "learning_rate": 1.6962678471452158e-05, + "loss": 0.3403, + "step": 5727 + }, + { + "epoch": 0.5396010456654342, + "grad_norm": 0.7941800355911255, + "learning_rate": 1.6961594528175757e-05, + "loss": 0.3789, + "step": 5728 + }, + { + "epoch": 0.5396952497586021, + "grad_norm": 0.6772311925888062, + "learning_rate": 1.6960510426164233e-05, + "loss": 0.2775, + "step": 5729 + }, + { + "epoch": 0.5397894538517699, + "grad_norm": 0.7815453410148621, + "learning_rate": 1.6959426165442306e-05, + "loss": 0.3235, + "step": 5730 + }, + { + "epoch": 0.5398836579449378, + "grad_norm": 0.8438002467155457, + "learning_rate": 1.69583417460347e-05, + "loss": 0.3244, + "step": 5731 + }, + { + "epoch": 0.5399778620381056, + "grad_norm": 0.8616735339164734, + "learning_rate": 1.6957257167966142e-05, + "loss": 0.3283, + "step": 5732 + }, + { + "epoch": 0.5400720661312735, + "grad_norm": 0.8114157915115356, + "learning_rate": 1.695617243126136e-05, + "loss": 0.3469, + "step": 5733 + }, + { + "epoch": 0.5401662702244413, + "grad_norm": 0.7440274953842163, + "learning_rate": 1.6955087535945085e-05, + "loss": 0.3141, + "step": 5734 + }, + { + "epoch": 0.5402604743176092, + "grad_norm": 0.7023537755012512, + "learning_rate": 1.6954002482042065e-05, + "loss": 0.3036, + "step": 5735 + }, + { + "epoch": 0.540354678410777, + "grad_norm": 0.6811996102333069, + "learning_rate": 1.6952917269577026e-05, + "loss": 0.3162, + "step": 5736 + }, + { + "epoch": 0.5404488825039448, + "grad_norm": 0.8686399459838867, + "learning_rate": 1.6951831898574727e-05, + "loss": 0.381, + "step": 5737 + }, + { + "epoch": 0.5405430865971127, + "grad_norm": 0.8593584895133972, + "learning_rate": 1.6950746369059908e-05, + "loss": 0.378, + "step": 5738 + }, + { + "epoch": 0.5406372906902805, + "grad_norm": 0.8086289763450623, + "learning_rate": 1.694966068105732e-05, + "loss": 0.3461, + "step": 5739 + }, + { + "epoch": 0.5407314947834483, + "grad_norm": 0.952411413192749, + "learning_rate": 1.6948574834591722e-05, + "loss": 0.3862, + "step": 5740 + }, + { + "epoch": 0.5408256988766161, + "grad_norm": 0.7227944731712341, + "learning_rate": 1.694748882968787e-05, + "loss": 0.3257, + "step": 5741 + }, + { + "epoch": 0.540919902969784, + "grad_norm": 0.6877641081809998, + "learning_rate": 1.6946402666370533e-05, + "loss": 0.2851, + "step": 5742 + }, + { + "epoch": 0.5410141070629518, + "grad_norm": 0.8375235795974731, + "learning_rate": 1.6945316344664468e-05, + "loss": 0.3337, + "step": 5743 + }, + { + "epoch": 0.5411083111561197, + "grad_norm": 0.7709132432937622, + "learning_rate": 1.694422986459445e-05, + "loss": 0.3152, + "step": 5744 + }, + { + "epoch": 0.5412025152492875, + "grad_norm": 0.7222093343734741, + "learning_rate": 1.6943143226185252e-05, + "loss": 0.3001, + "step": 5745 + }, + { + "epoch": 0.5412967193424554, + "grad_norm": 2.080857276916504, + "learning_rate": 1.694205642946165e-05, + "loss": 0.2963, + "step": 5746 + }, + { + "epoch": 0.5413909234356232, + "grad_norm": 0.847663938999176, + "learning_rate": 1.6940969474448427e-05, + "loss": 0.3205, + "step": 5747 + }, + { + "epoch": 0.5414851275287911, + "grad_norm": 0.8542855381965637, + "learning_rate": 1.6939882361170364e-05, + "loss": 0.3426, + "step": 5748 + }, + { + "epoch": 0.5415793316219589, + "grad_norm": 0.8413586616516113, + "learning_rate": 1.693879508965225e-05, + "loss": 0.3558, + "step": 5749 + }, + { + "epoch": 0.5416735357151268, + "grad_norm": 0.7056402564048767, + "learning_rate": 1.693770765991888e-05, + "loss": 0.3113, + "step": 5750 + }, + { + "epoch": 0.5417677398082946, + "grad_norm": 0.7774807214736938, + "learning_rate": 1.6936620071995044e-05, + "loss": 0.3115, + "step": 5751 + }, + { + "epoch": 0.5418619439014625, + "grad_norm": 0.8446057438850403, + "learning_rate": 1.693553232590554e-05, + "loss": 0.4087, + "step": 5752 + }, + { + "epoch": 0.5419561479946303, + "grad_norm": 0.7298682928085327, + "learning_rate": 1.693444442167518e-05, + "loss": 0.3067, + "step": 5753 + }, + { + "epoch": 0.5420503520877982, + "grad_norm": 0.7178559899330139, + "learning_rate": 1.6933356359328756e-05, + "loss": 0.327, + "step": 5754 + }, + { + "epoch": 0.542144556180966, + "grad_norm": 0.7992531061172485, + "learning_rate": 1.693226813889109e-05, + "loss": 0.333, + "step": 5755 + }, + { + "epoch": 0.5422387602741339, + "grad_norm": 0.6976920962333679, + "learning_rate": 1.6931179760386983e-05, + "loss": 0.2899, + "step": 5756 + }, + { + "epoch": 0.5423329643673017, + "grad_norm": 0.778207540512085, + "learning_rate": 1.693009122384126e-05, + "loss": 0.3128, + "step": 5757 + }, + { + "epoch": 0.5424271684604696, + "grad_norm": 0.812389612197876, + "learning_rate": 1.692900252927874e-05, + "loss": 0.3304, + "step": 5758 + }, + { + "epoch": 0.5425213725536374, + "grad_norm": 0.6788195371627808, + "learning_rate": 1.6927913676724247e-05, + "loss": 0.3077, + "step": 5759 + }, + { + "epoch": 0.5426155766468053, + "grad_norm": 0.7889478206634521, + "learning_rate": 1.6926824666202612e-05, + "loss": 0.3277, + "step": 5760 + }, + { + "epoch": 0.5427097807399731, + "grad_norm": 0.8202923536300659, + "learning_rate": 1.6925735497738656e-05, + "loss": 0.3129, + "step": 5761 + }, + { + "epoch": 0.542803984833141, + "grad_norm": 0.6905041337013245, + "learning_rate": 1.6924646171357225e-05, + "loss": 0.3115, + "step": 5762 + }, + { + "epoch": 0.5428981889263088, + "grad_norm": 0.7418622374534607, + "learning_rate": 1.6923556687083147e-05, + "loss": 0.319, + "step": 5763 + }, + { + "epoch": 0.5429923930194767, + "grad_norm": 0.7273638844490051, + "learning_rate": 1.692246704494127e-05, + "loss": 0.3173, + "step": 5764 + }, + { + "epoch": 0.5430865971126445, + "grad_norm": 0.7781940698623657, + "learning_rate": 1.6921377244956444e-05, + "loss": 0.3177, + "step": 5765 + }, + { + "epoch": 0.5431808012058124, + "grad_norm": 0.6877148747444153, + "learning_rate": 1.6920287287153506e-05, + "loss": 0.2994, + "step": 5766 + }, + { + "epoch": 0.5432750052989802, + "grad_norm": 0.7039399743080139, + "learning_rate": 1.691919717155732e-05, + "loss": 0.3828, + "step": 5767 + }, + { + "epoch": 0.5433692093921481, + "grad_norm": 0.825885534286499, + "learning_rate": 1.6918106898192734e-05, + "loss": 0.3283, + "step": 5768 + }, + { + "epoch": 0.5434634134853159, + "grad_norm": 0.7322880029678345, + "learning_rate": 1.6917016467084614e-05, + "loss": 0.3095, + "step": 5769 + }, + { + "epoch": 0.5435576175784838, + "grad_norm": 0.7304609417915344, + "learning_rate": 1.691592587825782e-05, + "loss": 0.3661, + "step": 5770 + }, + { + "epoch": 0.5436518216716516, + "grad_norm": 0.9463625550270081, + "learning_rate": 1.691483513173722e-05, + "loss": 0.3022, + "step": 5771 + }, + { + "epoch": 0.5437460257648195, + "grad_norm": 0.7674868106842041, + "learning_rate": 1.6913744227547687e-05, + "loss": 0.341, + "step": 5772 + }, + { + "epoch": 0.5438402298579873, + "grad_norm": 0.6974307894706726, + "learning_rate": 1.691265316571409e-05, + "loss": 0.3063, + "step": 5773 + }, + { + "epoch": 0.5439344339511551, + "grad_norm": 0.8258277177810669, + "learning_rate": 1.691156194626131e-05, + "loss": 0.3272, + "step": 5774 + }, + { + "epoch": 0.544028638044323, + "grad_norm": 0.8583522439002991, + "learning_rate": 1.6910470569214236e-05, + "loss": 0.3066, + "step": 5775 + }, + { + "epoch": 0.5441228421374908, + "grad_norm": 0.7144218683242798, + "learning_rate": 1.6909379034597742e-05, + "loss": 0.3272, + "step": 5776 + }, + { + "epoch": 0.5442170462306587, + "grad_norm": 0.8724987506866455, + "learning_rate": 1.6908287342436718e-05, + "loss": 0.3839, + "step": 5777 + }, + { + "epoch": 0.5443112503238265, + "grad_norm": 0.7545903325080872, + "learning_rate": 1.690719549275606e-05, + "loss": 0.3492, + "step": 5778 + }, + { + "epoch": 0.5444054544169944, + "grad_norm": 0.8440935611724854, + "learning_rate": 1.690610348558066e-05, + "loss": 0.3367, + "step": 5779 + }, + { + "epoch": 0.5444996585101622, + "grad_norm": 0.7396623492240906, + "learning_rate": 1.6905011320935425e-05, + "loss": 0.3481, + "step": 5780 + }, + { + "epoch": 0.5445938626033301, + "grad_norm": 0.7483841776847839, + "learning_rate": 1.690391899884525e-05, + "loss": 0.3532, + "step": 5781 + }, + { + "epoch": 0.5446880666964979, + "grad_norm": 0.7436134815216064, + "learning_rate": 1.6902826519335048e-05, + "loss": 0.3338, + "step": 5782 + }, + { + "epoch": 0.5447822707896658, + "grad_norm": 0.7555732727050781, + "learning_rate": 1.690173388242972e-05, + "loss": 0.3299, + "step": 5783 + }, + { + "epoch": 0.5448764748828336, + "grad_norm": 0.9253072142601013, + "learning_rate": 1.690064108815419e-05, + "loss": 0.3687, + "step": 5784 + }, + { + "epoch": 0.5449706789760015, + "grad_norm": 0.7201090455055237, + "learning_rate": 1.689954813653337e-05, + "loss": 0.3182, + "step": 5785 + }, + { + "epoch": 0.5450648830691693, + "grad_norm": 0.7178362011909485, + "learning_rate": 1.6898455027592184e-05, + "loss": 0.3253, + "step": 5786 + }, + { + "epoch": 0.5451590871623372, + "grad_norm": 0.7304385304450989, + "learning_rate": 1.689736176135555e-05, + "loss": 0.3179, + "step": 5787 + }, + { + "epoch": 0.545253291255505, + "grad_norm": 0.7304025292396545, + "learning_rate": 1.6896268337848404e-05, + "loss": 0.3228, + "step": 5788 + }, + { + "epoch": 0.5453474953486729, + "grad_norm": 0.7466516494750977, + "learning_rate": 1.6895174757095676e-05, + "loss": 0.3581, + "step": 5789 + }, + { + "epoch": 0.5454416994418407, + "grad_norm": 0.7444651126861572, + "learning_rate": 1.68940810191223e-05, + "loss": 0.3225, + "step": 5790 + }, + { + "epoch": 0.5455359035350086, + "grad_norm": 0.9712061882019043, + "learning_rate": 1.689298712395321e-05, + "loss": 0.2931, + "step": 5791 + }, + { + "epoch": 0.5456301076281764, + "grad_norm": 0.7565516829490662, + "learning_rate": 1.689189307161336e-05, + "loss": 0.3122, + "step": 5792 + }, + { + "epoch": 0.5457243117213443, + "grad_norm": 0.7904927134513855, + "learning_rate": 1.6890798862127683e-05, + "loss": 0.3025, + "step": 5793 + }, + { + "epoch": 0.5458185158145121, + "grad_norm": 0.8687581419944763, + "learning_rate": 1.688970449552114e-05, + "loss": 0.3277, + "step": 5794 + }, + { + "epoch": 0.54591271990768, + "grad_norm": 0.7989944219589233, + "learning_rate": 1.688860997181868e-05, + "loss": 0.3416, + "step": 5795 + }, + { + "epoch": 0.5460069240008478, + "grad_norm": 0.8734157681465149, + "learning_rate": 1.6887515291045255e-05, + "loss": 0.3433, + "step": 5796 + }, + { + "epoch": 0.5461011280940157, + "grad_norm": 0.8354502320289612, + "learning_rate": 1.6886420453225832e-05, + "loss": 0.3027, + "step": 5797 + }, + { + "epoch": 0.5461953321871835, + "grad_norm": 0.8106501698493958, + "learning_rate": 1.6885325458385372e-05, + "loss": 0.3838, + "step": 5798 + }, + { + "epoch": 0.5462895362803514, + "grad_norm": 0.9645668864250183, + "learning_rate": 1.6884230306548842e-05, + "loss": 0.3242, + "step": 5799 + }, + { + "epoch": 0.5463837403735192, + "grad_norm": 1.0720152854919434, + "learning_rate": 1.6883134997741217e-05, + "loss": 0.3666, + "step": 5800 + }, + { + "epoch": 0.5464779444666871, + "grad_norm": 0.7827078104019165, + "learning_rate": 1.6882039531987467e-05, + "loss": 0.3435, + "step": 5801 + }, + { + "epoch": 0.5465721485598549, + "grad_norm": 0.7407355308532715, + "learning_rate": 1.6880943909312573e-05, + "loss": 0.3001, + "step": 5802 + }, + { + "epoch": 0.5466663526530228, + "grad_norm": 0.8197450637817383, + "learning_rate": 1.6879848129741516e-05, + "loss": 0.3273, + "step": 5803 + }, + { + "epoch": 0.5467605567461906, + "grad_norm": 0.7303133606910706, + "learning_rate": 1.6878752193299282e-05, + "loss": 0.3215, + "step": 5804 + }, + { + "epoch": 0.5468547608393585, + "grad_norm": 0.7092004418373108, + "learning_rate": 1.687765610001086e-05, + "loss": 0.3097, + "step": 5805 + }, + { + "epoch": 0.5469489649325263, + "grad_norm": 0.7629405856132507, + "learning_rate": 1.6876559849901243e-05, + "loss": 0.3387, + "step": 5806 + }, + { + "epoch": 0.5470431690256942, + "grad_norm": 0.9411080479621887, + "learning_rate": 1.6875463442995426e-05, + "loss": 0.3637, + "step": 5807 + }, + { + "epoch": 0.547137373118862, + "grad_norm": 0.7682428359985352, + "learning_rate": 1.687436687931841e-05, + "loss": 0.2983, + "step": 5808 + }, + { + "epoch": 0.5472315772120299, + "grad_norm": 0.8321592807769775, + "learning_rate": 1.6873270158895196e-05, + "loss": 0.3313, + "step": 5809 + }, + { + "epoch": 0.5473257813051977, + "grad_norm": 0.7408233880996704, + "learning_rate": 1.6872173281750796e-05, + "loss": 0.3124, + "step": 5810 + }, + { + "epoch": 0.5474199853983656, + "grad_norm": 0.7069203853607178, + "learning_rate": 1.6871076247910216e-05, + "loss": 0.2999, + "step": 5811 + }, + { + "epoch": 0.5475141894915334, + "grad_norm": 0.7806495428085327, + "learning_rate": 1.6869979057398468e-05, + "loss": 0.3431, + "step": 5812 + }, + { + "epoch": 0.5476083935847013, + "grad_norm": 0.8627522587776184, + "learning_rate": 1.6868881710240574e-05, + "loss": 0.3756, + "step": 5813 + }, + { + "epoch": 0.5477025976778691, + "grad_norm": 0.6727634072303772, + "learning_rate": 1.6867784206461554e-05, + "loss": 0.2804, + "step": 5814 + }, + { + "epoch": 0.547796801771037, + "grad_norm": 0.822658360004425, + "learning_rate": 1.6866686546086435e-05, + "loss": 0.3237, + "step": 5815 + }, + { + "epoch": 0.5478910058642048, + "grad_norm": 0.7285251021385193, + "learning_rate": 1.6865588729140242e-05, + "loss": 0.3172, + "step": 5816 + }, + { + "epoch": 0.5479852099573727, + "grad_norm": 0.671876072883606, + "learning_rate": 1.686449075564801e-05, + "loss": 0.2954, + "step": 5817 + }, + { + "epoch": 0.5480794140505405, + "grad_norm": 0.8374348878860474, + "learning_rate": 1.686339262563477e-05, + "loss": 0.3504, + "step": 5818 + }, + { + "epoch": 0.5481736181437084, + "grad_norm": 0.7035629153251648, + "learning_rate": 1.686229433912556e-05, + "loss": 0.3206, + "step": 5819 + }, + { + "epoch": 0.5482678222368762, + "grad_norm": 0.8646016716957092, + "learning_rate": 1.6861195896145436e-05, + "loss": 0.3803, + "step": 5820 + }, + { + "epoch": 0.548362026330044, + "grad_norm": 0.7874694466590881, + "learning_rate": 1.686009729671943e-05, + "loss": 0.3362, + "step": 5821 + }, + { + "epoch": 0.5484562304232119, + "grad_norm": 0.7751681208610535, + "learning_rate": 1.6858998540872594e-05, + "loss": 0.324, + "step": 5822 + }, + { + "epoch": 0.5485504345163797, + "grad_norm": 0.7489470839500427, + "learning_rate": 1.6857899628629985e-05, + "loss": 0.3287, + "step": 5823 + }, + { + "epoch": 0.5486446386095476, + "grad_norm": 0.7112763524055481, + "learning_rate": 1.6856800560016657e-05, + "loss": 0.3521, + "step": 5824 + }, + { + "epoch": 0.5487388427027154, + "grad_norm": 0.8134369850158691, + "learning_rate": 1.6855701335057677e-05, + "loss": 0.3763, + "step": 5825 + }, + { + "epoch": 0.5488330467958833, + "grad_norm": 0.6973388195037842, + "learning_rate": 1.68546019537781e-05, + "loss": 0.332, + "step": 5826 + }, + { + "epoch": 0.5489272508890511, + "grad_norm": 0.7977001667022705, + "learning_rate": 1.6853502416203e-05, + "loss": 0.3845, + "step": 5827 + }, + { + "epoch": 0.549021454982219, + "grad_norm": 0.8567266464233398, + "learning_rate": 1.6852402722357443e-05, + "loss": 0.3562, + "step": 5828 + }, + { + "epoch": 0.5491156590753868, + "grad_norm": 0.7990422248840332, + "learning_rate": 1.6851302872266507e-05, + "loss": 0.318, + "step": 5829 + }, + { + "epoch": 0.5492098631685547, + "grad_norm": 0.701423168182373, + "learning_rate": 1.6850202865955272e-05, + "loss": 0.3355, + "step": 5830 + }, + { + "epoch": 0.5493040672617225, + "grad_norm": 0.7400321960449219, + "learning_rate": 1.6849102703448818e-05, + "loss": 0.3452, + "step": 5831 + }, + { + "epoch": 0.5493982713548904, + "grad_norm": 0.7852572202682495, + "learning_rate": 1.6848002384772225e-05, + "loss": 0.3286, + "step": 5832 + }, + { + "epoch": 0.5494924754480582, + "grad_norm": 0.6592714786529541, + "learning_rate": 1.6846901909950594e-05, + "loss": 0.2842, + "step": 5833 + }, + { + "epoch": 0.5495866795412261, + "grad_norm": 0.7866077423095703, + "learning_rate": 1.6845801279009e-05, + "loss": 0.3133, + "step": 5834 + }, + { + "epoch": 0.5496808836343939, + "grad_norm": 0.730317234992981, + "learning_rate": 1.684470049197256e-05, + "loss": 0.3178, + "step": 5835 + }, + { + "epoch": 0.5497750877275618, + "grad_norm": 1.0602613687515259, + "learning_rate": 1.684359954886636e-05, + "loss": 0.3482, + "step": 5836 + }, + { + "epoch": 0.5498692918207296, + "grad_norm": 0.8464245796203613, + "learning_rate": 1.6842498449715506e-05, + "loss": 0.3677, + "step": 5837 + }, + { + "epoch": 0.5499634959138975, + "grad_norm": 0.8427851796150208, + "learning_rate": 1.6841397194545104e-05, + "loss": 0.368, + "step": 5838 + }, + { + "epoch": 0.5500577000070653, + "grad_norm": 0.7537657618522644, + "learning_rate": 1.6840295783380266e-05, + "loss": 0.3785, + "step": 5839 + }, + { + "epoch": 0.5501519041002332, + "grad_norm": 1.0740464925765991, + "learning_rate": 1.683919421624611e-05, + "loss": 0.3479, + "step": 5840 + }, + { + "epoch": 0.550246108193401, + "grad_norm": 0.6912543773651123, + "learning_rate": 1.683809249316774e-05, + "loss": 0.317, + "step": 5841 + }, + { + "epoch": 0.5503403122865689, + "grad_norm": 0.8974886536598206, + "learning_rate": 1.6836990614170296e-05, + "loss": 0.3481, + "step": 5842 + }, + { + "epoch": 0.5504345163797367, + "grad_norm": 0.6748872995376587, + "learning_rate": 1.6835888579278887e-05, + "loss": 0.2973, + "step": 5843 + }, + { + "epoch": 0.5505287204729046, + "grad_norm": 0.8794151544570923, + "learning_rate": 1.6834786388518646e-05, + "loss": 0.3499, + "step": 5844 + }, + { + "epoch": 0.5506229245660724, + "grad_norm": 0.8693121075630188, + "learning_rate": 1.6833684041914704e-05, + "loss": 0.3676, + "step": 5845 + }, + { + "epoch": 0.5507171286592403, + "grad_norm": 0.7209320664405823, + "learning_rate": 1.6832581539492198e-05, + "loss": 0.3318, + "step": 5846 + }, + { + "epoch": 0.5508113327524081, + "grad_norm": 0.6598643064498901, + "learning_rate": 1.683147888127627e-05, + "loss": 0.3241, + "step": 5847 + }, + { + "epoch": 0.550905536845576, + "grad_norm": 1.2043993473052979, + "learning_rate": 1.6830376067292053e-05, + "loss": 0.3987, + "step": 5848 + }, + { + "epoch": 0.5509997409387438, + "grad_norm": 0.7692755460739136, + "learning_rate": 1.6829273097564702e-05, + "loss": 0.3262, + "step": 5849 + }, + { + "epoch": 0.5510939450319117, + "grad_norm": 0.7148430943489075, + "learning_rate": 1.6828169972119362e-05, + "loss": 0.3101, + "step": 5850 + }, + { + "epoch": 0.5511881491250795, + "grad_norm": 0.7019156217575073, + "learning_rate": 1.6827066690981188e-05, + "loss": 0.3423, + "step": 5851 + }, + { + "epoch": 0.5512823532182474, + "grad_norm": 0.8840952515602112, + "learning_rate": 1.6825963254175333e-05, + "loss": 0.3792, + "step": 5852 + }, + { + "epoch": 0.5513765573114152, + "grad_norm": 0.8111600875854492, + "learning_rate": 1.682485966172696e-05, + "loss": 0.3665, + "step": 5853 + }, + { + "epoch": 0.5514707614045831, + "grad_norm": 0.6971681714057922, + "learning_rate": 1.682375591366123e-05, + "loss": 0.3451, + "step": 5854 + }, + { + "epoch": 0.5515649654977509, + "grad_norm": 0.7528238892555237, + "learning_rate": 1.6822652010003316e-05, + "loss": 0.3123, + "step": 5855 + }, + { + "epoch": 0.5516591695909188, + "grad_norm": 0.7064241170883179, + "learning_rate": 1.682154795077838e-05, + "loss": 0.3132, + "step": 5856 + }, + { + "epoch": 0.5517533736840866, + "grad_norm": 0.8221530914306641, + "learning_rate": 1.6820443736011604e-05, + "loss": 0.3341, + "step": 5857 + }, + { + "epoch": 0.5518475777772545, + "grad_norm": 0.6869242787361145, + "learning_rate": 1.6819339365728162e-05, + "loss": 0.3087, + "step": 5858 + }, + { + "epoch": 0.5519417818704223, + "grad_norm": 0.6711872816085815, + "learning_rate": 1.6818234839953237e-05, + "loss": 0.301, + "step": 5859 + }, + { + "epoch": 0.5520359859635902, + "grad_norm": 1.7455143928527832, + "learning_rate": 1.6817130158712013e-05, + "loss": 0.3433, + "step": 5860 + }, + { + "epoch": 0.552130190056758, + "grad_norm": 0.6875800490379333, + "learning_rate": 1.6816025322029676e-05, + "loss": 0.3371, + "step": 5861 + }, + { + "epoch": 0.5522243941499259, + "grad_norm": 0.7491611242294312, + "learning_rate": 1.6814920329931427e-05, + "loss": 0.3523, + "step": 5862 + }, + { + "epoch": 0.5523185982430937, + "grad_norm": 0.7028124332427979, + "learning_rate": 1.681381518244245e-05, + "loss": 0.3379, + "step": 5863 + }, + { + "epoch": 0.5524128023362616, + "grad_norm": 0.718915581703186, + "learning_rate": 1.6812709879587944e-05, + "loss": 0.3384, + "step": 5864 + }, + { + "epoch": 0.5525070064294294, + "grad_norm": 0.8772738575935364, + "learning_rate": 1.6811604421393126e-05, + "loss": 0.317, + "step": 5865 + }, + { + "epoch": 0.5526012105225973, + "grad_norm": 0.7316091656684875, + "learning_rate": 1.6810498807883185e-05, + "loss": 0.3346, + "step": 5866 + }, + { + "epoch": 0.5526954146157651, + "grad_norm": 0.7291149497032166, + "learning_rate": 1.6809393039083347e-05, + "loss": 0.2907, + "step": 5867 + }, + { + "epoch": 0.552789618708933, + "grad_norm": 1.0712668895721436, + "learning_rate": 1.680828711501881e-05, + "loss": 0.3861, + "step": 5868 + }, + { + "epoch": 0.5528838228021008, + "grad_norm": 0.7991876006126404, + "learning_rate": 1.68071810357148e-05, + "loss": 0.3588, + "step": 5869 + }, + { + "epoch": 0.5529780268952686, + "grad_norm": 0.7969545125961304, + "learning_rate": 1.6806074801196536e-05, + "loss": 0.3586, + "step": 5870 + }, + { + "epoch": 0.5530722309884365, + "grad_norm": 0.7247651815414429, + "learning_rate": 1.6804968411489238e-05, + "loss": 0.3448, + "step": 5871 + }, + { + "epoch": 0.5531664350816043, + "grad_norm": 0.7463715672492981, + "learning_rate": 1.680386186661814e-05, + "loss": 0.2937, + "step": 5872 + }, + { + "epoch": 0.5532606391747722, + "grad_norm": 0.9406419992446899, + "learning_rate": 1.6802755166608465e-05, + "loss": 0.3404, + "step": 5873 + }, + { + "epoch": 0.55335484326794, + "grad_norm": 0.7245320677757263, + "learning_rate": 1.6801648311485453e-05, + "loss": 0.2765, + "step": 5874 + }, + { + "epoch": 0.5534490473611079, + "grad_norm": 0.8365300893783569, + "learning_rate": 1.6800541301274344e-05, + "loss": 0.3799, + "step": 5875 + }, + { + "epoch": 0.5535432514542757, + "grad_norm": 0.7576985359191895, + "learning_rate": 1.679943413600037e-05, + "loss": 0.3251, + "step": 5876 + }, + { + "epoch": 0.5536374555474436, + "grad_norm": 0.8103680610656738, + "learning_rate": 1.6798326815688787e-05, + "loss": 0.3515, + "step": 5877 + }, + { + "epoch": 0.5537316596406114, + "grad_norm": 0.7867361307144165, + "learning_rate": 1.6797219340364836e-05, + "loss": 0.3497, + "step": 5878 + }, + { + "epoch": 0.5538258637337792, + "grad_norm": 0.8220361471176147, + "learning_rate": 1.679611171005377e-05, + "loss": 0.3254, + "step": 5879 + }, + { + "epoch": 0.553920067826947, + "grad_norm": 0.7523766756057739, + "learning_rate": 1.6795003924780854e-05, + "loss": 0.3603, + "step": 5880 + }, + { + "epoch": 0.5540142719201149, + "grad_norm": 0.8207775950431824, + "learning_rate": 1.6793895984571333e-05, + "loss": 0.3432, + "step": 5881 + }, + { + "epoch": 0.5541084760132827, + "grad_norm": 0.7494213581085205, + "learning_rate": 1.679278788945048e-05, + "loss": 0.3571, + "step": 5882 + }, + { + "epoch": 0.5542026801064506, + "grad_norm": 0.7716641426086426, + "learning_rate": 1.6791679639443556e-05, + "loss": 0.3705, + "step": 5883 + }, + { + "epoch": 0.5542968841996184, + "grad_norm": 0.6826990842819214, + "learning_rate": 1.6790571234575833e-05, + "loss": 0.3504, + "step": 5884 + }, + { + "epoch": 0.5543910882927863, + "grad_norm": 0.7369707822799683, + "learning_rate": 1.6789462674872584e-05, + "loss": 0.3452, + "step": 5885 + }, + { + "epoch": 0.5544852923859541, + "grad_norm": 0.8421262502670288, + "learning_rate": 1.6788353960359086e-05, + "loss": 0.3487, + "step": 5886 + }, + { + "epoch": 0.554579496479122, + "grad_norm": 0.8324693441390991, + "learning_rate": 1.678724509106062e-05, + "loss": 0.3402, + "step": 5887 + }, + { + "epoch": 0.5546737005722898, + "grad_norm": 0.9260829091072083, + "learning_rate": 1.678613606700247e-05, + "loss": 0.3887, + "step": 5888 + }, + { + "epoch": 0.5547679046654577, + "grad_norm": 0.756175696849823, + "learning_rate": 1.678502688820992e-05, + "loss": 0.3463, + "step": 5889 + }, + { + "epoch": 0.5548621087586255, + "grad_norm": 0.7201794385910034, + "learning_rate": 1.6783917554708264e-05, + "loss": 0.3544, + "step": 5890 + }, + { + "epoch": 0.5549563128517934, + "grad_norm": 0.7131332159042358, + "learning_rate": 1.6782808066522796e-05, + "loss": 0.3211, + "step": 5891 + }, + { + "epoch": 0.5550505169449612, + "grad_norm": 0.8617600798606873, + "learning_rate": 1.678169842367882e-05, + "loss": 0.3358, + "step": 5892 + }, + { + "epoch": 0.5551447210381291, + "grad_norm": 0.7968863844871521, + "learning_rate": 1.6780588626201626e-05, + "loss": 0.3499, + "step": 5893 + }, + { + "epoch": 0.5552389251312969, + "grad_norm": 0.856134831905365, + "learning_rate": 1.677947867411653e-05, + "loss": 0.2971, + "step": 5894 + }, + { + "epoch": 0.5553331292244648, + "grad_norm": 1.0501809120178223, + "learning_rate": 1.677836856744883e-05, + "loss": 0.3298, + "step": 5895 + }, + { + "epoch": 0.5554273333176326, + "grad_norm": 0.7026547789573669, + "learning_rate": 1.6777258306223845e-05, + "loss": 0.3443, + "step": 5896 + }, + { + "epoch": 0.5555215374108005, + "grad_norm": 0.7354648113250732, + "learning_rate": 1.677614789046689e-05, + "loss": 0.3458, + "step": 5897 + }, + { + "epoch": 0.5556157415039683, + "grad_norm": 0.8148699402809143, + "learning_rate": 1.6775037320203285e-05, + "loss": 0.3603, + "step": 5898 + }, + { + "epoch": 0.5557099455971362, + "grad_norm": 0.810670793056488, + "learning_rate": 1.677392659545835e-05, + "loss": 0.3365, + "step": 5899 + }, + { + "epoch": 0.555804149690304, + "grad_norm": 0.7497707009315491, + "learning_rate": 1.6772815716257414e-05, + "loss": 0.3075, + "step": 5900 + }, + { + "epoch": 0.5558983537834719, + "grad_norm": 1.0751029253005981, + "learning_rate": 1.6771704682625802e-05, + "loss": 0.3328, + "step": 5901 + }, + { + "epoch": 0.5559925578766397, + "grad_norm": 0.7456806898117065, + "learning_rate": 1.6770593494588853e-05, + "loss": 0.3326, + "step": 5902 + }, + { + "epoch": 0.5560867619698076, + "grad_norm": 0.9161959886550903, + "learning_rate": 1.6769482152171902e-05, + "loss": 0.3603, + "step": 5903 + }, + { + "epoch": 0.5561809660629754, + "grad_norm": 0.7455811500549316, + "learning_rate": 1.6768370655400286e-05, + "loss": 0.3131, + "step": 5904 + }, + { + "epoch": 0.5562751701561433, + "grad_norm": 0.8076489567756653, + "learning_rate": 1.6767259004299355e-05, + "loss": 0.3364, + "step": 5905 + }, + { + "epoch": 0.5563693742493111, + "grad_norm": 0.8564948439598083, + "learning_rate": 1.676614719889445e-05, + "loss": 0.3658, + "step": 5906 + }, + { + "epoch": 0.556463578342479, + "grad_norm": 1.1845258474349976, + "learning_rate": 1.6765035239210926e-05, + "loss": 0.3103, + "step": 5907 + }, + { + "epoch": 0.5565577824356468, + "grad_norm": 0.9748694896697998, + "learning_rate": 1.6763923125274137e-05, + "loss": 0.3656, + "step": 5908 + }, + { + "epoch": 0.5566519865288146, + "grad_norm": 0.804319441318512, + "learning_rate": 1.6762810857109436e-05, + "loss": 0.3332, + "step": 5909 + }, + { + "epoch": 0.5567461906219825, + "grad_norm": 0.6985915899276733, + "learning_rate": 1.6761698434742195e-05, + "loss": 0.3004, + "step": 5910 + }, + { + "epoch": 0.5568403947151503, + "grad_norm": 1.3096137046813965, + "learning_rate": 1.676058585819777e-05, + "loss": 0.3829, + "step": 5911 + }, + { + "epoch": 0.5569345988083182, + "grad_norm": 0.8578698635101318, + "learning_rate": 1.6759473127501532e-05, + "loss": 0.3393, + "step": 5912 + }, + { + "epoch": 0.557028802901486, + "grad_norm": 0.7410597801208496, + "learning_rate": 1.6758360242678852e-05, + "loss": 0.309, + "step": 5913 + }, + { + "epoch": 0.5571230069946539, + "grad_norm": 0.7900176048278809, + "learning_rate": 1.675724720375511e-05, + "loss": 0.4037, + "step": 5914 + }, + { + "epoch": 0.5572172110878217, + "grad_norm": 0.7226098775863647, + "learning_rate": 1.6756134010755675e-05, + "loss": 0.2961, + "step": 5915 + }, + { + "epoch": 0.5573114151809896, + "grad_norm": 0.7388399839401245, + "learning_rate": 1.675502066370594e-05, + "loss": 0.2991, + "step": 5916 + }, + { + "epoch": 0.5574056192741574, + "grad_norm": 0.6885120868682861, + "learning_rate": 1.6753907162631286e-05, + "loss": 0.2969, + "step": 5917 + }, + { + "epoch": 0.5574998233673253, + "grad_norm": 1.1873542070388794, + "learning_rate": 1.67527935075571e-05, + "loss": 0.3435, + "step": 5918 + }, + { + "epoch": 0.5575940274604931, + "grad_norm": 0.7631744146347046, + "learning_rate": 1.6751679698508786e-05, + "loss": 0.331, + "step": 5919 + }, + { + "epoch": 0.557688231553661, + "grad_norm": 0.7792918086051941, + "learning_rate": 1.675056573551173e-05, + "loss": 0.3342, + "step": 5920 + }, + { + "epoch": 0.5577824356468288, + "grad_norm": 0.8030377626419067, + "learning_rate": 1.6749451618591335e-05, + "loss": 0.3352, + "step": 5921 + }, + { + "epoch": 0.5578766397399967, + "grad_norm": 0.81960129737854, + "learning_rate": 1.6748337347773003e-05, + "loss": 0.384, + "step": 5922 + }, + { + "epoch": 0.5579708438331645, + "grad_norm": 0.8801048994064331, + "learning_rate": 1.674722292308215e-05, + "loss": 0.3506, + "step": 5923 + }, + { + "epoch": 0.5580650479263324, + "grad_norm": 0.7337673306465149, + "learning_rate": 1.674610834454417e-05, + "loss": 0.352, + "step": 5924 + }, + { + "epoch": 0.5581592520195002, + "grad_norm": 0.7766098976135254, + "learning_rate": 1.674499361218449e-05, + "loss": 0.3359, + "step": 5925 + }, + { + "epoch": 0.5582534561126681, + "grad_norm": 0.95281982421875, + "learning_rate": 1.6743878726028525e-05, + "loss": 0.3893, + "step": 5926 + }, + { + "epoch": 0.5583476602058359, + "grad_norm": 0.7152496576309204, + "learning_rate": 1.67427636861017e-05, + "loss": 0.3264, + "step": 5927 + }, + { + "epoch": 0.5584418642990038, + "grad_norm": 0.7302485108375549, + "learning_rate": 1.674164849242943e-05, + "loss": 0.3053, + "step": 5928 + }, + { + "epoch": 0.5585360683921716, + "grad_norm": 0.7903436422348022, + "learning_rate": 1.6740533145037147e-05, + "loss": 0.3568, + "step": 5929 + }, + { + "epoch": 0.5586302724853395, + "grad_norm": 0.7666296362876892, + "learning_rate": 1.6739417643950287e-05, + "loss": 0.3069, + "step": 5930 + }, + { + "epoch": 0.5587244765785073, + "grad_norm": 0.7227993011474609, + "learning_rate": 1.673830198919428e-05, + "loss": 0.322, + "step": 5931 + }, + { + "epoch": 0.5588186806716752, + "grad_norm": 0.7785037159919739, + "learning_rate": 1.6737186180794573e-05, + "loss": 0.3548, + "step": 5932 + }, + { + "epoch": 0.558912884764843, + "grad_norm": 0.8512129187583923, + "learning_rate": 1.6736070218776594e-05, + "loss": 0.3441, + "step": 5933 + }, + { + "epoch": 0.5590070888580109, + "grad_norm": 0.6713905334472656, + "learning_rate": 1.67349541031658e-05, + "loss": 0.3327, + "step": 5934 + }, + { + "epoch": 0.5591012929511787, + "grad_norm": 0.7852702140808105, + "learning_rate": 1.6733837833987634e-05, + "loss": 0.3128, + "step": 5935 + }, + { + "epoch": 0.5591954970443466, + "grad_norm": 1.1700233221054077, + "learning_rate": 1.673272141126755e-05, + "loss": 0.3093, + "step": 5936 + }, + { + "epoch": 0.5592897011375144, + "grad_norm": 0.7732242941856384, + "learning_rate": 1.673160483503101e-05, + "loss": 0.3331, + "step": 5937 + }, + { + "epoch": 0.5593839052306823, + "grad_norm": 0.7639914751052856, + "learning_rate": 1.6730488105303467e-05, + "loss": 0.3335, + "step": 5938 + }, + { + "epoch": 0.5594781093238501, + "grad_norm": 0.8488541841506958, + "learning_rate": 1.672937122211039e-05, + "loss": 0.3783, + "step": 5939 + }, + { + "epoch": 0.559572313417018, + "grad_norm": 0.7996029257774353, + "learning_rate": 1.672825418547724e-05, + "loss": 0.3387, + "step": 5940 + }, + { + "epoch": 0.5596665175101858, + "grad_norm": 0.847177267074585, + "learning_rate": 1.6727136995429484e-05, + "loss": 0.3859, + "step": 5941 + }, + { + "epoch": 0.5597607216033537, + "grad_norm": 0.8273327946662903, + "learning_rate": 1.6726019651992607e-05, + "loss": 0.3483, + "step": 5942 + }, + { + "epoch": 0.5598549256965215, + "grad_norm": 0.7557797431945801, + "learning_rate": 1.6724902155192077e-05, + "loss": 0.3221, + "step": 5943 + }, + { + "epoch": 0.5599491297896894, + "grad_norm": 0.7209107875823975, + "learning_rate": 1.6723784505053377e-05, + "loss": 0.3196, + "step": 5944 + }, + { + "epoch": 0.5600433338828572, + "grad_norm": 0.7248364686965942, + "learning_rate": 1.6722666701601997e-05, + "loss": 0.348, + "step": 5945 + }, + { + "epoch": 0.5601375379760251, + "grad_norm": 0.7239592671394348, + "learning_rate": 1.6721548744863413e-05, + "loss": 0.2932, + "step": 5946 + }, + { + "epoch": 0.5602317420691929, + "grad_norm": 0.8387351036071777, + "learning_rate": 1.6720430634863126e-05, + "loss": 0.3113, + "step": 5947 + }, + { + "epoch": 0.5603259461623608, + "grad_norm": 0.6595821976661682, + "learning_rate": 1.6719312371626623e-05, + "loss": 0.2771, + "step": 5948 + }, + { + "epoch": 0.5604201502555286, + "grad_norm": 0.8561384677886963, + "learning_rate": 1.671819395517941e-05, + "loss": 0.3539, + "step": 5949 + }, + { + "epoch": 0.5605143543486965, + "grad_norm": 0.7971121072769165, + "learning_rate": 1.6717075385546986e-05, + "loss": 0.3421, + "step": 5950 + }, + { + "epoch": 0.5606085584418643, + "grad_norm": 0.8530870079994202, + "learning_rate": 1.671595666275485e-05, + "loss": 0.3475, + "step": 5951 + }, + { + "epoch": 0.5607027625350322, + "grad_norm": 0.6426548361778259, + "learning_rate": 1.6714837786828525e-05, + "loss": 0.2956, + "step": 5952 + }, + { + "epoch": 0.5607969666282, + "grad_norm": 0.7764912843704224, + "learning_rate": 1.6713718757793508e-05, + "loss": 0.3498, + "step": 5953 + }, + { + "epoch": 0.5608911707213678, + "grad_norm": 0.8872030377388, + "learning_rate": 1.6712599575675318e-05, + "loss": 0.3307, + "step": 5954 + }, + { + "epoch": 0.5609853748145357, + "grad_norm": 0.6706919074058533, + "learning_rate": 1.6711480240499477e-05, + "loss": 0.2947, + "step": 5955 + }, + { + "epoch": 0.5610795789077035, + "grad_norm": 0.9219316840171814, + "learning_rate": 1.671036075229151e-05, + "loss": 0.38, + "step": 5956 + }, + { + "epoch": 0.5611737830008714, + "grad_norm": 0.7901992797851562, + "learning_rate": 1.6709241111076937e-05, + "loss": 0.299, + "step": 5957 + }, + { + "epoch": 0.5612679870940392, + "grad_norm": 0.8954774737358093, + "learning_rate": 1.6708121316881295e-05, + "loss": 0.3843, + "step": 5958 + }, + { + "epoch": 0.5613621911872071, + "grad_norm": 0.7102924585342407, + "learning_rate": 1.670700136973011e-05, + "loss": 0.3078, + "step": 5959 + }, + { + "epoch": 0.5614563952803749, + "grad_norm": 0.658418595790863, + "learning_rate": 1.670588126964892e-05, + "loss": 0.2838, + "step": 5960 + }, + { + "epoch": 0.5615505993735428, + "grad_norm": 0.8780316710472107, + "learning_rate": 1.6704761016663266e-05, + "loss": 0.3387, + "step": 5961 + }, + { + "epoch": 0.5616448034667106, + "grad_norm": 0.7748724222183228, + "learning_rate": 1.6703640610798694e-05, + "loss": 0.3662, + "step": 5962 + }, + { + "epoch": 0.5617390075598785, + "grad_norm": 0.6843838691711426, + "learning_rate": 1.670252005208075e-05, + "loss": 0.3143, + "step": 5963 + }, + { + "epoch": 0.5618332116530463, + "grad_norm": 0.9076165556907654, + "learning_rate": 1.670139934053498e-05, + "loss": 0.3444, + "step": 5964 + }, + { + "epoch": 0.5619274157462142, + "grad_norm": 0.71778404712677, + "learning_rate": 1.670027847618694e-05, + "loss": 0.301, + "step": 5965 + }, + { + "epoch": 0.562021619839382, + "grad_norm": 0.8193926215171814, + "learning_rate": 1.669915745906219e-05, + "loss": 0.3233, + "step": 5966 + }, + { + "epoch": 0.5621158239325499, + "grad_norm": 0.6978916525840759, + "learning_rate": 1.669803628918629e-05, + "loss": 0.3458, + "step": 5967 + }, + { + "epoch": 0.5622100280257177, + "grad_norm": 0.7813271880149841, + "learning_rate": 1.6696914966584805e-05, + "loss": 0.3363, + "step": 5968 + }, + { + "epoch": 0.5623042321188856, + "grad_norm": 0.7352415323257446, + "learning_rate": 1.6695793491283302e-05, + "loss": 0.3238, + "step": 5969 + }, + { + "epoch": 0.5623984362120534, + "grad_norm": 0.7068782448768616, + "learning_rate": 1.6694671863307352e-05, + "loss": 0.3334, + "step": 5970 + }, + { + "epoch": 0.5624926403052213, + "grad_norm": 0.7115263938903809, + "learning_rate": 1.6693550082682527e-05, + "loss": 0.3223, + "step": 5971 + }, + { + "epoch": 0.5625868443983891, + "grad_norm": 0.7842731475830078, + "learning_rate": 1.6692428149434413e-05, + "loss": 0.3312, + "step": 5972 + }, + { + "epoch": 0.562681048491557, + "grad_norm": 0.8012655377388, + "learning_rate": 1.6691306063588583e-05, + "loss": 0.3435, + "step": 5973 + }, + { + "epoch": 0.5627752525847248, + "grad_norm": 0.964786171913147, + "learning_rate": 1.669018382517063e-05, + "loss": 0.3674, + "step": 5974 + }, + { + "epoch": 0.5628694566778927, + "grad_norm": 0.8468067049980164, + "learning_rate": 1.6689061434206134e-05, + "loss": 0.3292, + "step": 5975 + }, + { + "epoch": 0.5629636607710605, + "grad_norm": 0.7377479672431946, + "learning_rate": 1.6687938890720698e-05, + "loss": 0.3522, + "step": 5976 + }, + { + "epoch": 0.5630578648642284, + "grad_norm": 0.783111035823822, + "learning_rate": 1.668681619473991e-05, + "loss": 0.3473, + "step": 5977 + }, + { + "epoch": 0.5631520689573962, + "grad_norm": 0.7521863579750061, + "learning_rate": 1.6685693346289372e-05, + "loss": 0.3193, + "step": 5978 + }, + { + "epoch": 0.5632462730505641, + "grad_norm": 0.7061949968338013, + "learning_rate": 1.6684570345394683e-05, + "loss": 0.2905, + "step": 5979 + }, + { + "epoch": 0.5633404771437319, + "grad_norm": 0.8568319082260132, + "learning_rate": 1.6683447192081457e-05, + "loss": 0.3315, + "step": 5980 + }, + { + "epoch": 0.5634346812368998, + "grad_norm": 0.7928352952003479, + "learning_rate": 1.6682323886375294e-05, + "loss": 0.3603, + "step": 5981 + }, + { + "epoch": 0.5635288853300676, + "grad_norm": 0.8035825490951538, + "learning_rate": 1.6681200428301816e-05, + "loss": 0.3295, + "step": 5982 + }, + { + "epoch": 0.5636230894232355, + "grad_norm": 0.851418673992157, + "learning_rate": 1.6680076817886632e-05, + "loss": 0.3602, + "step": 5983 + }, + { + "epoch": 0.5637172935164033, + "grad_norm": 0.6932213306427002, + "learning_rate": 1.667895305515537e-05, + "loss": 0.3067, + "step": 5984 + }, + { + "epoch": 0.5638114976095712, + "grad_norm": 0.7550251483917236, + "learning_rate": 1.6677829140133647e-05, + "loss": 0.3421, + "step": 5985 + }, + { + "epoch": 0.563905701702739, + "grad_norm": 0.8431921601295471, + "learning_rate": 1.667670507284709e-05, + "loss": 0.3746, + "step": 5986 + }, + { + "epoch": 0.5639999057959069, + "grad_norm": 0.7865931987762451, + "learning_rate": 1.6675580853321335e-05, + "loss": 0.3782, + "step": 5987 + }, + { + "epoch": 0.5640941098890747, + "grad_norm": 0.7720548510551453, + "learning_rate": 1.667445648158201e-05, + "loss": 0.3459, + "step": 5988 + }, + { + "epoch": 0.5641883139822426, + "grad_norm": 0.8384525179862976, + "learning_rate": 1.6673331957654755e-05, + "loss": 0.333, + "step": 5989 + }, + { + "epoch": 0.5642825180754104, + "grad_norm": 0.7852795720100403, + "learning_rate": 1.6672207281565212e-05, + "loss": 0.3709, + "step": 5990 + }, + { + "epoch": 0.5643767221685783, + "grad_norm": 0.8175713419914246, + "learning_rate": 1.6671082453339024e-05, + "loss": 0.3286, + "step": 5991 + }, + { + "epoch": 0.5644709262617461, + "grad_norm": 0.9116515517234802, + "learning_rate": 1.666995747300184e-05, + "loss": 0.3214, + "step": 5992 + }, + { + "epoch": 0.564565130354914, + "grad_norm": 0.7680133581161499, + "learning_rate": 1.666883234057931e-05, + "loss": 0.3157, + "step": 5993 + }, + { + "epoch": 0.5646593344480818, + "grad_norm": 0.8134737610816956, + "learning_rate": 1.666770705609709e-05, + "loss": 0.3543, + "step": 5994 + }, + { + "epoch": 0.5647535385412497, + "grad_norm": 0.7875015139579773, + "learning_rate": 1.6666581619580835e-05, + "loss": 0.3644, + "step": 5995 + }, + { + "epoch": 0.5648477426344175, + "grad_norm": 0.6372561454772949, + "learning_rate": 1.666545603105621e-05, + "loss": 0.3292, + "step": 5996 + }, + { + "epoch": 0.5649419467275854, + "grad_norm": 0.71250319480896, + "learning_rate": 1.666433029054888e-05, + "loss": 0.326, + "step": 5997 + }, + { + "epoch": 0.5650361508207532, + "grad_norm": 0.8104060888290405, + "learning_rate": 1.666320439808451e-05, + "loss": 0.3636, + "step": 5998 + }, + { + "epoch": 0.565130354913921, + "grad_norm": 0.7650859355926514, + "learning_rate": 1.666207835368878e-05, + "loss": 0.3158, + "step": 5999 + }, + { + "epoch": 0.5652245590070889, + "grad_norm": 0.9378272294998169, + "learning_rate": 1.6660952157387355e-05, + "loss": 0.3618, + "step": 6000 + }, + { + "epoch": 0.5653187631002567, + "grad_norm": 0.7213669419288635, + "learning_rate": 1.6659825809205924e-05, + "loss": 0.3188, + "step": 6001 + }, + { + "epoch": 0.5654129671934246, + "grad_norm": 0.8028793931007385, + "learning_rate": 1.6658699309170157e-05, + "loss": 0.35, + "step": 6002 + }, + { + "epoch": 0.5655071712865924, + "grad_norm": 0.6953859329223633, + "learning_rate": 1.6657572657305758e-05, + "loss": 0.3512, + "step": 6003 + }, + { + "epoch": 0.5656013753797603, + "grad_norm": 0.7779675126075745, + "learning_rate": 1.66564458536384e-05, + "loss": 0.3353, + "step": 6004 + }, + { + "epoch": 0.5656955794729281, + "grad_norm": 0.785291314125061, + "learning_rate": 1.6655318898193784e-05, + "loss": 0.325, + "step": 6005 + }, + { + "epoch": 0.565789783566096, + "grad_norm": 0.8910825252532959, + "learning_rate": 1.6654191790997604e-05, + "loss": 0.3489, + "step": 6006 + }, + { + "epoch": 0.5658839876592638, + "grad_norm": 1.6566963195800781, + "learning_rate": 1.6653064532075563e-05, + "loss": 0.3491, + "step": 6007 + }, + { + "epoch": 0.5659781917524317, + "grad_norm": 0.6922672986984253, + "learning_rate": 1.6651937121453357e-05, + "loss": 0.3293, + "step": 6008 + }, + { + "epoch": 0.5660723958455995, + "grad_norm": 0.6288778185844421, + "learning_rate": 1.66508095591567e-05, + "loss": 0.2707, + "step": 6009 + }, + { + "epoch": 0.5661665999387674, + "grad_norm": 0.7408380508422852, + "learning_rate": 1.66496818452113e-05, + "loss": 0.3155, + "step": 6010 + }, + { + "epoch": 0.5662608040319352, + "grad_norm": 1.2888745069503784, + "learning_rate": 1.6648553979642867e-05, + "loss": 0.3063, + "step": 6011 + }, + { + "epoch": 0.5663550081251031, + "grad_norm": 0.7629866600036621, + "learning_rate": 1.6647425962477122e-05, + "loss": 0.3297, + "step": 6012 + }, + { + "epoch": 0.5664492122182709, + "grad_norm": 0.7858827114105225, + "learning_rate": 1.6646297793739784e-05, + "loss": 0.3095, + "step": 6013 + }, + { + "epoch": 0.5665434163114388, + "grad_norm": 0.6547446250915527, + "learning_rate": 1.664516947345658e-05, + "loss": 0.3263, + "step": 6014 + }, + { + "epoch": 0.5666376204046066, + "grad_norm": 0.7657425403594971, + "learning_rate": 1.6644041001653236e-05, + "loss": 0.3195, + "step": 6015 + }, + { + "epoch": 0.5667318244977745, + "grad_norm": 0.7175026535987854, + "learning_rate": 1.6642912378355478e-05, + "loss": 0.3413, + "step": 6016 + }, + { + "epoch": 0.5668260285909422, + "grad_norm": 0.810639500617981, + "learning_rate": 1.6641783603589048e-05, + "loss": 0.3336, + "step": 6017 + }, + { + "epoch": 0.5669202326841101, + "grad_norm": 0.7549232244491577, + "learning_rate": 1.6640654677379677e-05, + "loss": 0.3141, + "step": 6018 + }, + { + "epoch": 0.5670144367772779, + "grad_norm": 0.8682640790939331, + "learning_rate": 1.663952559975311e-05, + "loss": 0.3471, + "step": 6019 + }, + { + "epoch": 0.5671086408704458, + "grad_norm": 0.8188992142677307, + "learning_rate": 1.6638396370735095e-05, + "loss": 0.3525, + "step": 6020 + }, + { + "epoch": 0.5672028449636136, + "grad_norm": 0.7149308919906616, + "learning_rate": 1.663726699035137e-05, + "loss": 0.3279, + "step": 6021 + }, + { + "epoch": 0.5672970490567815, + "grad_norm": 0.7880874872207642, + "learning_rate": 1.6636137458627696e-05, + "loss": 0.3741, + "step": 6022 + }, + { + "epoch": 0.5673912531499493, + "grad_norm": 0.7084752321243286, + "learning_rate": 1.6635007775589826e-05, + "loss": 0.2901, + "step": 6023 + }, + { + "epoch": 0.5674854572431172, + "grad_norm": 0.7272505760192871, + "learning_rate": 1.6633877941263517e-05, + "loss": 0.3339, + "step": 6024 + }, + { + "epoch": 0.567579661336285, + "grad_norm": 0.7028395533561707, + "learning_rate": 1.663274795567453e-05, + "loss": 0.3252, + "step": 6025 + }, + { + "epoch": 0.5676738654294529, + "grad_norm": 0.8396478295326233, + "learning_rate": 1.6631617818848632e-05, + "loss": 0.3106, + "step": 6026 + }, + { + "epoch": 0.5677680695226207, + "grad_norm": 0.8230246901512146, + "learning_rate": 1.6630487530811594e-05, + "loss": 0.3202, + "step": 6027 + }, + { + "epoch": 0.5678622736157886, + "grad_norm": 0.7576961517333984, + "learning_rate": 1.6629357091589184e-05, + "loss": 0.3503, + "step": 6028 + }, + { + "epoch": 0.5679564777089564, + "grad_norm": 0.7407094836235046, + "learning_rate": 1.662822650120718e-05, + "loss": 0.3276, + "step": 6029 + }, + { + "epoch": 0.5680506818021243, + "grad_norm": 0.8200357556343079, + "learning_rate": 1.6627095759691364e-05, + "loss": 0.3449, + "step": 6030 + }, + { + "epoch": 0.5681448858952921, + "grad_norm": 0.9754223823547363, + "learning_rate": 1.6625964867067514e-05, + "loss": 0.3591, + "step": 6031 + }, + { + "epoch": 0.56823908998846, + "grad_norm": 0.8169598579406738, + "learning_rate": 1.6624833823361416e-05, + "loss": 0.3135, + "step": 6032 + }, + { + "epoch": 0.5683332940816278, + "grad_norm": 0.8093365430831909, + "learning_rate": 1.662370262859886e-05, + "loss": 0.323, + "step": 6033 + }, + { + "epoch": 0.5684274981747957, + "grad_norm": 0.8016453981399536, + "learning_rate": 1.6622571282805642e-05, + "loss": 0.3414, + "step": 6034 + }, + { + "epoch": 0.5685217022679635, + "grad_norm": 0.769794762134552, + "learning_rate": 1.6621439786007557e-05, + "loss": 0.3579, + "step": 6035 + }, + { + "epoch": 0.5686159063611314, + "grad_norm": 0.8014020919799805, + "learning_rate": 1.6620308138230406e-05, + "loss": 0.3087, + "step": 6036 + }, + { + "epoch": 0.5687101104542992, + "grad_norm": 0.8015003800392151, + "learning_rate": 1.6619176339499993e-05, + "loss": 0.3561, + "step": 6037 + }, + { + "epoch": 0.568804314547467, + "grad_norm": 0.7012295722961426, + "learning_rate": 1.6618044389842116e-05, + "loss": 0.338, + "step": 6038 + }, + { + "epoch": 0.5688985186406349, + "grad_norm": 0.7357358932495117, + "learning_rate": 1.6616912289282595e-05, + "loss": 0.3359, + "step": 6039 + }, + { + "epoch": 0.5689927227338027, + "grad_norm": 1.0131534337997437, + "learning_rate": 1.661578003784724e-05, + "loss": 0.3035, + "step": 6040 + }, + { + "epoch": 0.5690869268269706, + "grad_norm": 0.752738893032074, + "learning_rate": 1.6614647635561872e-05, + "loss": 0.3494, + "step": 6041 + }, + { + "epoch": 0.5691811309201384, + "grad_norm": 0.7525772452354431, + "learning_rate": 1.661351508245231e-05, + "loss": 0.3618, + "step": 6042 + }, + { + "epoch": 0.5692753350133063, + "grad_norm": 0.8118856549263, + "learning_rate": 1.661238237854437e-05, + "loss": 0.3678, + "step": 6043 + }, + { + "epoch": 0.5693695391064741, + "grad_norm": 0.7494533658027649, + "learning_rate": 1.6611249523863886e-05, + "loss": 0.3157, + "step": 6044 + }, + { + "epoch": 0.569463743199642, + "grad_norm": 0.7700856924057007, + "learning_rate": 1.661011651843669e-05, + "loss": 0.3459, + "step": 6045 + }, + { + "epoch": 0.5695579472928098, + "grad_norm": 0.706672191619873, + "learning_rate": 1.6608983362288612e-05, + "loss": 0.3149, + "step": 6046 + }, + { + "epoch": 0.5696521513859777, + "grad_norm": 0.7745957374572754, + "learning_rate": 1.6607850055445497e-05, + "loss": 0.36, + "step": 6047 + }, + { + "epoch": 0.5697463554791455, + "grad_norm": 0.7428537011146545, + "learning_rate": 1.660671659793318e-05, + "loss": 0.3265, + "step": 6048 + }, + { + "epoch": 0.5698405595723134, + "grad_norm": 0.6797862648963928, + "learning_rate": 1.6605582989777504e-05, + "loss": 0.3167, + "step": 6049 + }, + { + "epoch": 0.5699347636654812, + "grad_norm": 0.6493890881538391, + "learning_rate": 1.6604449231004323e-05, + "loss": 0.3108, + "step": 6050 + }, + { + "epoch": 0.5700289677586491, + "grad_norm": 0.8943933844566345, + "learning_rate": 1.6603315321639486e-05, + "loss": 0.3856, + "step": 6051 + }, + { + "epoch": 0.5701231718518169, + "grad_norm": 0.7026787400245667, + "learning_rate": 1.6602181261708847e-05, + "loss": 0.3084, + "step": 6052 + }, + { + "epoch": 0.5702173759449848, + "grad_norm": 0.7042235732078552, + "learning_rate": 1.6601047051238264e-05, + "loss": 0.3246, + "step": 6053 + }, + { + "epoch": 0.5703115800381526, + "grad_norm": 0.7456470131874084, + "learning_rate": 1.65999126902536e-05, + "loss": 0.3478, + "step": 6054 + }, + { + "epoch": 0.5704057841313205, + "grad_norm": 0.8093039393424988, + "learning_rate": 1.6598778178780718e-05, + "loss": 0.3588, + "step": 6055 + }, + { + "epoch": 0.5704999882244883, + "grad_norm": 0.8204628229141235, + "learning_rate": 1.659764351684549e-05, + "loss": 0.3227, + "step": 6056 + }, + { + "epoch": 0.5705941923176562, + "grad_norm": 0.9243156909942627, + "learning_rate": 1.6596508704473787e-05, + "loss": 0.3555, + "step": 6057 + }, + { + "epoch": 0.570688396410824, + "grad_norm": 0.7073413133621216, + "learning_rate": 1.659537374169148e-05, + "loss": 0.2897, + "step": 6058 + }, + { + "epoch": 0.5707826005039919, + "grad_norm": 0.7905264496803284, + "learning_rate": 1.6594238628524456e-05, + "loss": 0.3473, + "step": 6059 + }, + { + "epoch": 0.5708768045971597, + "grad_norm": 0.7482105493545532, + "learning_rate": 1.6593103364998593e-05, + "loss": 0.3458, + "step": 6060 + }, + { + "epoch": 0.5709710086903276, + "grad_norm": 0.73758864402771, + "learning_rate": 1.6591967951139777e-05, + "loss": 0.3241, + "step": 6061 + }, + { + "epoch": 0.5710652127834954, + "grad_norm": 0.8431020379066467, + "learning_rate": 1.6590832386973895e-05, + "loss": 0.3491, + "step": 6062 + }, + { + "epoch": 0.5711594168766633, + "grad_norm": 0.6449090838432312, + "learning_rate": 1.6589696672526844e-05, + "loss": 0.2917, + "step": 6063 + }, + { + "epoch": 0.5712536209698311, + "grad_norm": 0.7195934653282166, + "learning_rate": 1.6588560807824513e-05, + "loss": 0.3207, + "step": 6064 + }, + { + "epoch": 0.571347825062999, + "grad_norm": 0.8135504722595215, + "learning_rate": 1.658742479289281e-05, + "loss": 0.3832, + "step": 6065 + }, + { + "epoch": 0.5714420291561668, + "grad_norm": 0.7827112078666687, + "learning_rate": 1.6586288627757633e-05, + "loss": 0.3414, + "step": 6066 + }, + { + "epoch": 0.5715362332493347, + "grad_norm": 0.7211251854896545, + "learning_rate": 1.658515231244489e-05, + "loss": 0.3356, + "step": 6067 + }, + { + "epoch": 0.5716304373425025, + "grad_norm": 0.7144401669502258, + "learning_rate": 1.658401584698049e-05, + "loss": 0.2805, + "step": 6068 + }, + { + "epoch": 0.5717246414356704, + "grad_norm": 0.8043273091316223, + "learning_rate": 1.658287923139035e-05, + "loss": 0.3678, + "step": 6069 + }, + { + "epoch": 0.5718188455288382, + "grad_norm": 0.8267249464988708, + "learning_rate": 1.6581742465700377e-05, + "loss": 0.3456, + "step": 6070 + }, + { + "epoch": 0.5719130496220061, + "grad_norm": 0.8214240670204163, + "learning_rate": 1.6580605549936496e-05, + "loss": 0.3052, + "step": 6071 + }, + { + "epoch": 0.5720072537151739, + "grad_norm": 0.7541815638542175, + "learning_rate": 1.6579468484124637e-05, + "loss": 0.355, + "step": 6072 + }, + { + "epoch": 0.5721014578083418, + "grad_norm": 0.7984311580657959, + "learning_rate": 1.657833126829072e-05, + "loss": 0.3192, + "step": 6073 + }, + { + "epoch": 0.5721956619015096, + "grad_norm": 0.7665155529975891, + "learning_rate": 1.6577193902460675e-05, + "loss": 0.4045, + "step": 6074 + }, + { + "epoch": 0.5722898659946775, + "grad_norm": 0.7091019153594971, + "learning_rate": 1.6576056386660435e-05, + "loss": 0.2941, + "step": 6075 + }, + { + "epoch": 0.5723840700878453, + "grad_norm": 0.7414884567260742, + "learning_rate": 1.657491872091594e-05, + "loss": 0.3314, + "step": 6076 + }, + { + "epoch": 0.5724782741810132, + "grad_norm": 0.7666023969650269, + "learning_rate": 1.6573780905253133e-05, + "loss": 0.3601, + "step": 6077 + }, + { + "epoch": 0.572572478274181, + "grad_norm": 0.848470151424408, + "learning_rate": 1.6572642939697955e-05, + "loss": 0.2971, + "step": 6078 + }, + { + "epoch": 0.5726666823673489, + "grad_norm": 0.6963145136833191, + "learning_rate": 1.657150482427635e-05, + "loss": 0.2959, + "step": 6079 + }, + { + "epoch": 0.5727608864605167, + "grad_norm": 0.7042207717895508, + "learning_rate": 1.657036655901427e-05, + "loss": 0.3387, + "step": 6080 + }, + { + "epoch": 0.5728550905536846, + "grad_norm": 0.7728890776634216, + "learning_rate": 1.6569228143937678e-05, + "loss": 0.3307, + "step": 6081 + }, + { + "epoch": 0.5729492946468524, + "grad_norm": 0.8549204468727112, + "learning_rate": 1.656808957907252e-05, + "loss": 0.3461, + "step": 6082 + }, + { + "epoch": 0.5730434987400203, + "grad_norm": 0.7708202600479126, + "learning_rate": 1.6566950864444766e-05, + "loss": 0.3259, + "step": 6083 + }, + { + "epoch": 0.5731377028331881, + "grad_norm": 0.9670395851135254, + "learning_rate": 1.6565812000080373e-05, + "loss": 0.3047, + "step": 6084 + }, + { + "epoch": 0.573231906926356, + "grad_norm": 0.6677764058113098, + "learning_rate": 1.6564672986005312e-05, + "loss": 0.2787, + "step": 6085 + }, + { + "epoch": 0.5733261110195238, + "grad_norm": 0.6614779829978943, + "learning_rate": 1.6563533822245553e-05, + "loss": 0.2757, + "step": 6086 + }, + { + "epoch": 0.5734203151126916, + "grad_norm": 0.8284443020820618, + "learning_rate": 1.6562394508827077e-05, + "loss": 0.3727, + "step": 6087 + }, + { + "epoch": 0.5735145192058595, + "grad_norm": 0.797268271446228, + "learning_rate": 1.6561255045775856e-05, + "loss": 0.3205, + "step": 6088 + }, + { + "epoch": 0.5736087232990273, + "grad_norm": 0.9774706363677979, + "learning_rate": 1.656011543311787e-05, + "loss": 0.3245, + "step": 6089 + }, + { + "epoch": 0.5737029273921952, + "grad_norm": 0.6838274598121643, + "learning_rate": 1.655897567087911e-05, + "loss": 0.2869, + "step": 6090 + }, + { + "epoch": 0.573797131485363, + "grad_norm": 0.8185810446739197, + "learning_rate": 1.6557835759085555e-05, + "loss": 0.3422, + "step": 6091 + }, + { + "epoch": 0.5738913355785309, + "grad_norm": 0.7627773880958557, + "learning_rate": 1.655669569776321e-05, + "loss": 0.3203, + "step": 6092 + }, + { + "epoch": 0.5739855396716987, + "grad_norm": 0.7528077363967896, + "learning_rate": 1.655555548693806e-05, + "loss": 0.3591, + "step": 6093 + }, + { + "epoch": 0.5740797437648666, + "grad_norm": 0.8339890837669373, + "learning_rate": 1.6554415126636104e-05, + "loss": 0.346, + "step": 6094 + }, + { + "epoch": 0.5741739478580344, + "grad_norm": 0.7653105854988098, + "learning_rate": 1.655327461688335e-05, + "loss": 0.3586, + "step": 6095 + }, + { + "epoch": 0.5742681519512023, + "grad_norm": 0.8562196493148804, + "learning_rate": 1.65521339577058e-05, + "loss": 0.364, + "step": 6096 + }, + { + "epoch": 0.5743623560443701, + "grad_norm": 0.7822995781898499, + "learning_rate": 1.6550993149129463e-05, + "loss": 0.3116, + "step": 6097 + }, + { + "epoch": 0.574456560137538, + "grad_norm": 1.4264404773712158, + "learning_rate": 1.6549852191180348e-05, + "loss": 0.3083, + "step": 6098 + }, + { + "epoch": 0.5745507642307058, + "grad_norm": 0.7861554622650146, + "learning_rate": 1.6548711083884474e-05, + "loss": 0.3046, + "step": 6099 + }, + { + "epoch": 0.5746449683238737, + "grad_norm": 0.9693179726600647, + "learning_rate": 1.6547569827267863e-05, + "loss": 0.3648, + "step": 6100 + }, + { + "epoch": 0.5747391724170415, + "grad_norm": 0.8236418962478638, + "learning_rate": 1.6546428421356527e-05, + "loss": 0.3386, + "step": 6101 + }, + { + "epoch": 0.5748333765102094, + "grad_norm": 0.7198533415794373, + "learning_rate": 1.6545286866176506e-05, + "loss": 0.2965, + "step": 6102 + }, + { + "epoch": 0.5749275806033772, + "grad_norm": 1.6166539192199707, + "learning_rate": 1.654414516175382e-05, + "loss": 0.3522, + "step": 6103 + }, + { + "epoch": 0.5750217846965451, + "grad_norm": 0.7453978061676025, + "learning_rate": 1.65430033081145e-05, + "loss": 0.3368, + "step": 6104 + }, + { + "epoch": 0.5751159887897129, + "grad_norm": 0.757904589176178, + "learning_rate": 1.654186130528459e-05, + "loss": 0.3615, + "step": 6105 + }, + { + "epoch": 0.5752101928828808, + "grad_norm": 0.7784548401832581, + "learning_rate": 1.6540719153290125e-05, + "loss": 0.3639, + "step": 6106 + }, + { + "epoch": 0.5753043969760486, + "grad_norm": 0.7553150653839111, + "learning_rate": 1.6539576852157145e-05, + "loss": 0.3607, + "step": 6107 + }, + { + "epoch": 0.5753986010692165, + "grad_norm": 0.7249464988708496, + "learning_rate": 1.65384344019117e-05, + "loss": 0.334, + "step": 6108 + }, + { + "epoch": 0.5754928051623843, + "grad_norm": 0.8580317497253418, + "learning_rate": 1.653729180257984e-05, + "loss": 0.3098, + "step": 6109 + }, + { + "epoch": 0.5755870092555522, + "grad_norm": 0.7214152812957764, + "learning_rate": 1.6536149054187615e-05, + "loss": 0.3158, + "step": 6110 + }, + { + "epoch": 0.57568121334872, + "grad_norm": 0.7895944714546204, + "learning_rate": 1.6535006156761085e-05, + "loss": 0.341, + "step": 6111 + }, + { + "epoch": 0.5757754174418879, + "grad_norm": 0.7869628071784973, + "learning_rate": 1.6533863110326305e-05, + "loss": 0.321, + "step": 6112 + }, + { + "epoch": 0.5758696215350557, + "grad_norm": 0.7153104543685913, + "learning_rate": 1.6532719914909347e-05, + "loss": 0.3335, + "step": 6113 + }, + { + "epoch": 0.5759638256282236, + "grad_norm": 0.779529333114624, + "learning_rate": 1.6531576570536267e-05, + "loss": 0.354, + "step": 6114 + }, + { + "epoch": 0.5760580297213914, + "grad_norm": 0.9581845998764038, + "learning_rate": 1.6530433077233142e-05, + "loss": 0.3407, + "step": 6115 + }, + { + "epoch": 0.5761522338145593, + "grad_norm": 0.7201777696609497, + "learning_rate": 1.6529289435026043e-05, + "loss": 0.32, + "step": 6116 + }, + { + "epoch": 0.5762464379077271, + "grad_norm": 0.7921628355979919, + "learning_rate": 1.652814564394105e-05, + "loss": 0.3061, + "step": 6117 + }, + { + "epoch": 0.576340642000895, + "grad_norm": 0.7956567406654358, + "learning_rate": 1.6527001704004237e-05, + "loss": 0.3992, + "step": 6118 + }, + { + "epoch": 0.5764348460940628, + "grad_norm": 0.8144785165786743, + "learning_rate": 1.6525857615241686e-05, + "loss": 0.349, + "step": 6119 + }, + { + "epoch": 0.5765290501872307, + "grad_norm": 0.7400908470153809, + "learning_rate": 1.6524713377679496e-05, + "loss": 0.3468, + "step": 6120 + }, + { + "epoch": 0.5766232542803985, + "grad_norm": 0.816346287727356, + "learning_rate": 1.6523568991343747e-05, + "loss": 0.3759, + "step": 6121 + }, + { + "epoch": 0.5767174583735664, + "grad_norm": 0.8564147353172302, + "learning_rate": 1.6522424456260537e-05, + "loss": 0.3723, + "step": 6122 + }, + { + "epoch": 0.5768116624667342, + "grad_norm": 0.8647595047950745, + "learning_rate": 1.652127977245596e-05, + "loss": 0.3198, + "step": 6123 + }, + { + "epoch": 0.5769058665599021, + "grad_norm": 0.9383739829063416, + "learning_rate": 1.6520134939956125e-05, + "loss": 0.3527, + "step": 6124 + }, + { + "epoch": 0.5770000706530699, + "grad_norm": 0.8213065266609192, + "learning_rate": 1.6518989958787126e-05, + "loss": 0.3493, + "step": 6125 + }, + { + "epoch": 0.5770942747462378, + "grad_norm": 0.7273536920547485, + "learning_rate": 1.6517844828975076e-05, + "loss": 0.2987, + "step": 6126 + }, + { + "epoch": 0.5771884788394056, + "grad_norm": 0.69364333152771, + "learning_rate": 1.6516699550546084e-05, + "loss": 0.2913, + "step": 6127 + }, + { + "epoch": 0.5772826829325735, + "grad_norm": 0.6990786790847778, + "learning_rate": 1.651555412352626e-05, + "loss": 0.2861, + "step": 6128 + }, + { + "epoch": 0.5773768870257413, + "grad_norm": 0.7258824706077576, + "learning_rate": 1.6514408547941728e-05, + "loss": 0.3566, + "step": 6129 + }, + { + "epoch": 0.5774710911189092, + "grad_norm": 0.8860750198364258, + "learning_rate": 1.651326282381861e-05, + "loss": 0.3253, + "step": 6130 + }, + { + "epoch": 0.577565295212077, + "grad_norm": 0.7322821021080017, + "learning_rate": 1.651211695118302e-05, + "loss": 0.3348, + "step": 6131 + }, + { + "epoch": 0.5776594993052449, + "grad_norm": 0.8280245065689087, + "learning_rate": 1.6510970930061095e-05, + "loss": 0.3294, + "step": 6132 + }, + { + "epoch": 0.5777537033984127, + "grad_norm": 0.7888951301574707, + "learning_rate": 1.650982476047896e-05, + "loss": 0.317, + "step": 6133 + }, + { + "epoch": 0.5778479074915805, + "grad_norm": 0.9588354825973511, + "learning_rate": 1.650867844246276e-05, + "loss": 0.3446, + "step": 6134 + }, + { + "epoch": 0.5779421115847484, + "grad_norm": 0.707567572593689, + "learning_rate": 1.6507531976038622e-05, + "loss": 0.3042, + "step": 6135 + }, + { + "epoch": 0.5780363156779162, + "grad_norm": 0.7950550317764282, + "learning_rate": 1.6506385361232688e-05, + "loss": 0.3341, + "step": 6136 + }, + { + "epoch": 0.5781305197710841, + "grad_norm": 0.6977543830871582, + "learning_rate": 1.6505238598071112e-05, + "loss": 0.3163, + "step": 6137 + }, + { + "epoch": 0.5782247238642519, + "grad_norm": 0.743246853351593, + "learning_rate": 1.650409168658003e-05, + "loss": 0.3252, + "step": 6138 + }, + { + "epoch": 0.5783189279574198, + "grad_norm": 0.6571413278579712, + "learning_rate": 1.65029446267856e-05, + "loss": 0.265, + "step": 6139 + }, + { + "epoch": 0.5784131320505876, + "grad_norm": 0.7845455408096313, + "learning_rate": 1.650179741871398e-05, + "loss": 0.3826, + "step": 6140 + }, + { + "epoch": 0.5785073361437555, + "grad_norm": 0.7896680235862732, + "learning_rate": 1.6500650062391318e-05, + "loss": 0.34, + "step": 6141 + }, + { + "epoch": 0.5786015402369233, + "grad_norm": 0.7778838872909546, + "learning_rate": 1.6499502557843782e-05, + "loss": 0.3443, + "step": 6142 + }, + { + "epoch": 0.5786957443300912, + "grad_norm": 0.7396849393844604, + "learning_rate": 1.649835490509754e-05, + "loss": 0.3063, + "step": 6143 + }, + { + "epoch": 0.578789948423259, + "grad_norm": 0.7858325242996216, + "learning_rate": 1.6497207104178748e-05, + "loss": 0.3322, + "step": 6144 + }, + { + "epoch": 0.5788841525164269, + "grad_norm": 0.7330732345581055, + "learning_rate": 1.649605915511359e-05, + "loss": 0.3113, + "step": 6145 + }, + { + "epoch": 0.5789783566095947, + "grad_norm": 0.8418616652488708, + "learning_rate": 1.649491105792824e-05, + "loss": 0.3616, + "step": 6146 + }, + { + "epoch": 0.5790725607027626, + "grad_norm": 0.7793329954147339, + "learning_rate": 1.6493762812648872e-05, + "loss": 0.3404, + "step": 6147 + }, + { + "epoch": 0.5791667647959304, + "grad_norm": 0.7458708882331848, + "learning_rate": 1.649261441930167e-05, + "loss": 0.3146, + "step": 6148 + }, + { + "epoch": 0.5792609688890983, + "grad_norm": 0.8060473203659058, + "learning_rate": 1.6491465877912815e-05, + "loss": 0.3224, + "step": 6149 + }, + { + "epoch": 0.5793551729822661, + "grad_norm": 0.6975293159484863, + "learning_rate": 1.64903171885085e-05, + "loss": 0.337, + "step": 6150 + }, + { + "epoch": 0.579449377075434, + "grad_norm": 0.7504389882087708, + "learning_rate": 1.6489168351114913e-05, + "loss": 0.3023, + "step": 6151 + }, + { + "epoch": 0.5795435811686018, + "grad_norm": 1.0578118562698364, + "learning_rate": 1.6488019365758256e-05, + "loss": 0.3307, + "step": 6152 + }, + { + "epoch": 0.5796377852617697, + "grad_norm": 0.7639904618263245, + "learning_rate": 1.6486870232464724e-05, + "loss": 0.3828, + "step": 6153 + }, + { + "epoch": 0.5797319893549375, + "grad_norm": 1.1117942333221436, + "learning_rate": 1.6485720951260516e-05, + "loss": 0.3385, + "step": 6154 + }, + { + "epoch": 0.5798261934481053, + "grad_norm": 0.8805582523345947, + "learning_rate": 1.648457152217184e-05, + "loss": 0.3436, + "step": 6155 + }, + { + "epoch": 0.5799203975412731, + "grad_norm": 0.7073836922645569, + "learning_rate": 1.6483421945224906e-05, + "loss": 0.3074, + "step": 6156 + }, + { + "epoch": 0.580014601634441, + "grad_norm": 0.6927123665809631, + "learning_rate": 1.6482272220445925e-05, + "loss": 0.3361, + "step": 6157 + }, + { + "epoch": 0.5801088057276088, + "grad_norm": 1.035823106765747, + "learning_rate": 1.648112234786111e-05, + "loss": 0.3312, + "step": 6158 + }, + { + "epoch": 0.5802030098207767, + "grad_norm": 0.8590176105499268, + "learning_rate": 1.6479972327496685e-05, + "loss": 0.3081, + "step": 6159 + }, + { + "epoch": 0.5802972139139445, + "grad_norm": 0.7897781133651733, + "learning_rate": 1.647882215937887e-05, + "loss": 0.3009, + "step": 6160 + }, + { + "epoch": 0.5803914180071124, + "grad_norm": 0.751808226108551, + "learning_rate": 1.6477671843533885e-05, + "loss": 0.3499, + "step": 6161 + }, + { + "epoch": 0.5804856221002802, + "grad_norm": 0.7967620491981506, + "learning_rate": 1.647652137998797e-05, + "loss": 0.3186, + "step": 6162 + }, + { + "epoch": 0.5805798261934481, + "grad_norm": 0.9034838676452637, + "learning_rate": 1.647537076876735e-05, + "loss": 0.3432, + "step": 6163 + }, + { + "epoch": 0.5806740302866159, + "grad_norm": 0.8102205991744995, + "learning_rate": 1.6474220009898258e-05, + "loss": 0.3468, + "step": 6164 + }, + { + "epoch": 0.5807682343797838, + "grad_norm": 0.834168553352356, + "learning_rate": 1.6473069103406943e-05, + "loss": 0.3292, + "step": 6165 + }, + { + "epoch": 0.5808624384729516, + "grad_norm": 0.7016925811767578, + "learning_rate": 1.647191804931964e-05, + "loss": 0.3116, + "step": 6166 + }, + { + "epoch": 0.5809566425661195, + "grad_norm": 0.7217550873756409, + "learning_rate": 1.6470766847662596e-05, + "loss": 0.308, + "step": 6167 + }, + { + "epoch": 0.5810508466592873, + "grad_norm": 0.8545812368392944, + "learning_rate": 1.6469615498462057e-05, + "loss": 0.3669, + "step": 6168 + }, + { + "epoch": 0.5811450507524552, + "grad_norm": 1.0315021276474, + "learning_rate": 1.6468464001744283e-05, + "loss": 0.3695, + "step": 6169 + }, + { + "epoch": 0.581239254845623, + "grad_norm": 0.9519716501235962, + "learning_rate": 1.6467312357535524e-05, + "loss": 0.3322, + "step": 6170 + }, + { + "epoch": 0.5813334589387908, + "grad_norm": 1.0121030807495117, + "learning_rate": 1.6466160565862043e-05, + "loss": 0.323, + "step": 6171 + }, + { + "epoch": 0.5814276630319587, + "grad_norm": 0.73393315076828, + "learning_rate": 1.6465008626750104e-05, + "loss": 0.3027, + "step": 6172 + }, + { + "epoch": 0.5815218671251265, + "grad_norm": 0.7842426300048828, + "learning_rate": 1.6463856540225965e-05, + "loss": 0.3182, + "step": 6173 + }, + { + "epoch": 0.5816160712182944, + "grad_norm": 0.8784646987915039, + "learning_rate": 1.64627043063159e-05, + "loss": 0.3543, + "step": 6174 + }, + { + "epoch": 0.5817102753114622, + "grad_norm": 0.8059666752815247, + "learning_rate": 1.6461551925046184e-05, + "loss": 0.2835, + "step": 6175 + }, + { + "epoch": 0.5818044794046301, + "grad_norm": 0.9482343792915344, + "learning_rate": 1.646039939644309e-05, + "loss": 0.3429, + "step": 6176 + }, + { + "epoch": 0.5818986834977979, + "grad_norm": 0.7454880475997925, + "learning_rate": 1.6459246720532906e-05, + "loss": 0.3151, + "step": 6177 + }, + { + "epoch": 0.5819928875909658, + "grad_norm": 0.7945920825004578, + "learning_rate": 1.6458093897341898e-05, + "loss": 0.3464, + "step": 6178 + }, + { + "epoch": 0.5820870916841336, + "grad_norm": 0.9061700701713562, + "learning_rate": 1.6456940926896365e-05, + "loss": 0.3755, + "step": 6179 + }, + { + "epoch": 0.5821812957773015, + "grad_norm": 0.7213881015777588, + "learning_rate": 1.645578780922259e-05, + "loss": 0.3295, + "step": 6180 + }, + { + "epoch": 0.5822754998704693, + "grad_norm": 0.7835339307785034, + "learning_rate": 1.6454634544346872e-05, + "loss": 0.3683, + "step": 6181 + }, + { + "epoch": 0.5823697039636372, + "grad_norm": 0.7409759759902954, + "learning_rate": 1.6453481132295507e-05, + "loss": 0.3062, + "step": 6182 + }, + { + "epoch": 0.582463908056805, + "grad_norm": 0.7188201546669006, + "learning_rate": 1.6452327573094783e-05, + "loss": 0.2995, + "step": 6183 + }, + { + "epoch": 0.5825581121499729, + "grad_norm": 0.7926046848297119, + "learning_rate": 1.645117386677102e-05, + "loss": 0.3231, + "step": 6184 + }, + { + "epoch": 0.5826523162431407, + "grad_norm": 0.7814804315567017, + "learning_rate": 1.645002001335051e-05, + "loss": 0.3117, + "step": 6185 + }, + { + "epoch": 0.5827465203363086, + "grad_norm": 0.6480827331542969, + "learning_rate": 1.644886601285957e-05, + "loss": 0.3006, + "step": 6186 + }, + { + "epoch": 0.5828407244294764, + "grad_norm": 0.7361974120140076, + "learning_rate": 1.6447711865324517e-05, + "loss": 0.3355, + "step": 6187 + }, + { + "epoch": 0.5829349285226443, + "grad_norm": 0.8072815537452698, + "learning_rate": 1.6446557570771656e-05, + "loss": 0.3641, + "step": 6188 + }, + { + "epoch": 0.5830291326158121, + "grad_norm": 0.7710379362106323, + "learning_rate": 1.6445403129227316e-05, + "loss": 0.3285, + "step": 6189 + }, + { + "epoch": 0.58312333670898, + "grad_norm": 0.7955127954483032, + "learning_rate": 1.6444248540717812e-05, + "loss": 0.3388, + "step": 6190 + }, + { + "epoch": 0.5832175408021478, + "grad_norm": 0.8016431927680969, + "learning_rate": 1.6443093805269482e-05, + "loss": 0.3347, + "step": 6191 + }, + { + "epoch": 0.5833117448953157, + "grad_norm": 0.7551401853561401, + "learning_rate": 1.6441938922908644e-05, + "loss": 0.3395, + "step": 6192 + }, + { + "epoch": 0.5834059489884835, + "grad_norm": 0.9337490797042847, + "learning_rate": 1.6440783893661637e-05, + "loss": 0.3666, + "step": 6193 + }, + { + "epoch": 0.5835001530816514, + "grad_norm": 0.7654420137405396, + "learning_rate": 1.6439628717554794e-05, + "loss": 0.3571, + "step": 6194 + }, + { + "epoch": 0.5835943571748192, + "grad_norm": 0.8538124561309814, + "learning_rate": 1.643847339461446e-05, + "loss": 0.3574, + "step": 6195 + }, + { + "epoch": 0.5836885612679871, + "grad_norm": 0.7688474059104919, + "learning_rate": 1.6437317924866977e-05, + "loss": 0.3026, + "step": 6196 + }, + { + "epoch": 0.5837827653611549, + "grad_norm": 0.8166373372077942, + "learning_rate": 1.6436162308338685e-05, + "loss": 0.3623, + "step": 6197 + }, + { + "epoch": 0.5838769694543228, + "grad_norm": 0.762607216835022, + "learning_rate": 1.643500654505594e-05, + "loss": 0.3101, + "step": 6198 + }, + { + "epoch": 0.5839711735474906, + "grad_norm": 0.7641329169273376, + "learning_rate": 1.6433850635045095e-05, + "loss": 0.3236, + "step": 6199 + }, + { + "epoch": 0.5840653776406585, + "grad_norm": 0.8614398837089539, + "learning_rate": 1.6432694578332507e-05, + "loss": 0.343, + "step": 6200 + }, + { + "epoch": 0.5841595817338263, + "grad_norm": 0.6884185075759888, + "learning_rate": 1.6431538374944532e-05, + "loss": 0.3076, + "step": 6201 + }, + { + "epoch": 0.5842537858269942, + "grad_norm": 0.7685940265655518, + "learning_rate": 1.6430382024907538e-05, + "loss": 0.3418, + "step": 6202 + }, + { + "epoch": 0.584347989920162, + "grad_norm": 0.7043229937553406, + "learning_rate": 1.6429225528247886e-05, + "loss": 0.2962, + "step": 6203 + }, + { + "epoch": 0.5844421940133299, + "grad_norm": 0.655018150806427, + "learning_rate": 1.642806888499195e-05, + "loss": 0.2674, + "step": 6204 + }, + { + "epoch": 0.5845363981064977, + "grad_norm": 0.7077436447143555, + "learning_rate": 1.64269120951661e-05, + "loss": 0.3026, + "step": 6205 + }, + { + "epoch": 0.5846306021996656, + "grad_norm": 0.6780821681022644, + "learning_rate": 1.642575515879672e-05, + "loss": 0.2931, + "step": 6206 + }, + { + "epoch": 0.5847248062928334, + "grad_norm": 0.7512264251708984, + "learning_rate": 1.642459807591018e-05, + "loss": 0.3367, + "step": 6207 + }, + { + "epoch": 0.5848190103860013, + "grad_norm": 0.7362152934074402, + "learning_rate": 1.642344084653287e-05, + "loss": 0.3548, + "step": 6208 + }, + { + "epoch": 0.5849132144791691, + "grad_norm": 0.6350093483924866, + "learning_rate": 1.6422283470691174e-05, + "loss": 0.3267, + "step": 6209 + }, + { + "epoch": 0.585007418572337, + "grad_norm": 0.8073122501373291, + "learning_rate": 1.6421125948411484e-05, + "loss": 0.3571, + "step": 6210 + }, + { + "epoch": 0.5851016226655048, + "grad_norm": 0.8017840385437012, + "learning_rate": 1.641996827972019e-05, + "loss": 0.3316, + "step": 6211 + }, + { + "epoch": 0.5851958267586727, + "grad_norm": 0.7363123297691345, + "learning_rate": 1.6418810464643694e-05, + "loss": 0.3095, + "step": 6212 + }, + { + "epoch": 0.5852900308518405, + "grad_norm": 0.7336936593055725, + "learning_rate": 1.641765250320839e-05, + "loss": 0.304, + "step": 6213 + }, + { + "epoch": 0.5853842349450084, + "grad_norm": 0.8079828023910522, + "learning_rate": 1.641649439544068e-05, + "loss": 0.3465, + "step": 6214 + }, + { + "epoch": 0.5854784390381762, + "grad_norm": 1.0163007974624634, + "learning_rate": 1.641533614136698e-05, + "loss": 0.3477, + "step": 6215 + }, + { + "epoch": 0.585572643131344, + "grad_norm": 0.7917940616607666, + "learning_rate": 1.6414177741013696e-05, + "loss": 0.3607, + "step": 6216 + }, + { + "epoch": 0.5856668472245119, + "grad_norm": 0.6637675762176514, + "learning_rate": 1.6413019194407237e-05, + "loss": 0.3309, + "step": 6217 + }, + { + "epoch": 0.5857610513176797, + "grad_norm": 0.7806336879730225, + "learning_rate": 1.641186050157402e-05, + "loss": 0.3381, + "step": 6218 + }, + { + "epoch": 0.5858552554108476, + "grad_norm": 0.7391020059585571, + "learning_rate": 1.641070166254047e-05, + "loss": 0.3097, + "step": 6219 + }, + { + "epoch": 0.5859494595040154, + "grad_norm": 0.8411687016487122, + "learning_rate": 1.6409542677333007e-05, + "loss": 0.3638, + "step": 6220 + }, + { + "epoch": 0.5860436635971833, + "grad_norm": 0.7389108538627625, + "learning_rate": 1.6408383545978058e-05, + "loss": 0.2995, + "step": 6221 + }, + { + "epoch": 0.5861378676903511, + "grad_norm": 0.7615322470664978, + "learning_rate": 1.6407224268502055e-05, + "loss": 0.3436, + "step": 6222 + }, + { + "epoch": 0.586232071783519, + "grad_norm": 0.7641750574111938, + "learning_rate": 1.6406064844931428e-05, + "loss": 0.3177, + "step": 6223 + }, + { + "epoch": 0.5863262758766868, + "grad_norm": 0.8718242049217224, + "learning_rate": 1.6404905275292616e-05, + "loss": 0.3555, + "step": 6224 + }, + { + "epoch": 0.5864204799698547, + "grad_norm": 0.7840854525566101, + "learning_rate": 1.6403745559612063e-05, + "loss": 0.3483, + "step": 6225 + }, + { + "epoch": 0.5865146840630225, + "grad_norm": 0.7070127725601196, + "learning_rate": 1.64025856979162e-05, + "loss": 0.3352, + "step": 6226 + }, + { + "epoch": 0.5866088881561904, + "grad_norm": 0.6949003338813782, + "learning_rate": 1.6401425690231486e-05, + "loss": 0.3256, + "step": 6227 + }, + { + "epoch": 0.5867030922493582, + "grad_norm": 0.8621259927749634, + "learning_rate": 1.6400265536584364e-05, + "loss": 0.3317, + "step": 6228 + }, + { + "epoch": 0.5867972963425261, + "grad_norm": 0.7552080154418945, + "learning_rate": 1.639910523700129e-05, + "loss": 0.3409, + "step": 6229 + }, + { + "epoch": 0.5868915004356939, + "grad_norm": 0.7715356945991516, + "learning_rate": 1.639794479150872e-05, + "loss": 0.359, + "step": 6230 + }, + { + "epoch": 0.5869857045288618, + "grad_norm": 0.7047427296638489, + "learning_rate": 1.6396784200133114e-05, + "loss": 0.3242, + "step": 6231 + }, + { + "epoch": 0.5870799086220296, + "grad_norm": 0.7249360084533691, + "learning_rate": 1.639562346290094e-05, + "loss": 0.3225, + "step": 6232 + }, + { + "epoch": 0.5871741127151975, + "grad_norm": 0.7523140907287598, + "learning_rate": 1.6394462579838653e-05, + "loss": 0.3249, + "step": 6233 + }, + { + "epoch": 0.5872683168083653, + "grad_norm": 0.7092387080192566, + "learning_rate": 1.639330155097273e-05, + "loss": 0.3372, + "step": 6234 + }, + { + "epoch": 0.5873625209015332, + "grad_norm": 0.8043521642684937, + "learning_rate": 1.6392140376329652e-05, + "loss": 0.2957, + "step": 6235 + }, + { + "epoch": 0.587456724994701, + "grad_norm": 0.8312609791755676, + "learning_rate": 1.639097905593588e-05, + "loss": 0.3079, + "step": 6236 + }, + { + "epoch": 0.5875509290878689, + "grad_norm": 0.7470102310180664, + "learning_rate": 1.6389817589817908e-05, + "loss": 0.3012, + "step": 6237 + }, + { + "epoch": 0.5876451331810367, + "grad_norm": 0.6481717824935913, + "learning_rate": 1.638865597800221e-05, + "loss": 0.2944, + "step": 6238 + }, + { + "epoch": 0.5877393372742046, + "grad_norm": 0.7260505557060242, + "learning_rate": 1.6387494220515276e-05, + "loss": 0.297, + "step": 6239 + }, + { + "epoch": 0.5878335413673724, + "grad_norm": 0.6912792921066284, + "learning_rate": 1.6386332317383594e-05, + "loss": 0.3345, + "step": 6240 + }, + { + "epoch": 0.5879277454605403, + "grad_norm": 0.7770125865936279, + "learning_rate": 1.6385170268633658e-05, + "loss": 0.3079, + "step": 6241 + }, + { + "epoch": 0.5880219495537081, + "grad_norm": 0.8794979453086853, + "learning_rate": 1.6384008074291965e-05, + "loss": 0.3393, + "step": 6242 + }, + { + "epoch": 0.588116153646876, + "grad_norm": 0.7339308261871338, + "learning_rate": 1.6382845734385014e-05, + "loss": 0.3141, + "step": 6243 + }, + { + "epoch": 0.5882103577400438, + "grad_norm": 0.7595008611679077, + "learning_rate": 1.6381683248939313e-05, + "loss": 0.291, + "step": 6244 + }, + { + "epoch": 0.5883045618332117, + "grad_norm": 0.6484681963920593, + "learning_rate": 1.6380520617981362e-05, + "loss": 0.2828, + "step": 6245 + }, + { + "epoch": 0.5883987659263795, + "grad_norm": 0.7001732587814331, + "learning_rate": 1.6379357841537672e-05, + "loss": 0.2951, + "step": 6246 + }, + { + "epoch": 0.5884929700195474, + "grad_norm": 0.9187387824058533, + "learning_rate": 1.6378194919634762e-05, + "loss": 0.3507, + "step": 6247 + }, + { + "epoch": 0.5885871741127152, + "grad_norm": 0.7903961539268494, + "learning_rate": 1.6377031852299138e-05, + "loss": 0.2709, + "step": 6248 + }, + { + "epoch": 0.5886813782058831, + "grad_norm": 0.7197296023368835, + "learning_rate": 1.637586863955733e-05, + "loss": 0.3396, + "step": 6249 + }, + { + "epoch": 0.5887755822990509, + "grad_norm": 0.8467602133750916, + "learning_rate": 1.6374705281435853e-05, + "loss": 0.3276, + "step": 6250 + }, + { + "epoch": 0.5888697863922188, + "grad_norm": 0.8200076222419739, + "learning_rate": 1.637354177796124e-05, + "loss": 0.3467, + "step": 6251 + }, + { + "epoch": 0.5889639904853866, + "grad_norm": 0.7501189112663269, + "learning_rate": 1.6372378129160015e-05, + "loss": 0.3241, + "step": 6252 + }, + { + "epoch": 0.5890581945785545, + "grad_norm": 0.7274275422096252, + "learning_rate": 1.6371214335058714e-05, + "loss": 0.3151, + "step": 6253 + }, + { + "epoch": 0.5891523986717223, + "grad_norm": 0.8300936818122864, + "learning_rate": 1.6370050395683873e-05, + "loss": 0.3146, + "step": 6254 + }, + { + "epoch": 0.5892466027648902, + "grad_norm": 0.8656488060951233, + "learning_rate": 1.6368886311062032e-05, + "loss": 0.3497, + "step": 6255 + }, + { + "epoch": 0.589340806858058, + "grad_norm": 0.7315640449523926, + "learning_rate": 1.6367722081219736e-05, + "loss": 0.3053, + "step": 6256 + }, + { + "epoch": 0.5894350109512259, + "grad_norm": 0.7211642861366272, + "learning_rate": 1.6366557706183527e-05, + "loss": 0.3306, + "step": 6257 + }, + { + "epoch": 0.5895292150443937, + "grad_norm": 0.6625840663909912, + "learning_rate": 1.6365393185979953e-05, + "loss": 0.2982, + "step": 6258 + }, + { + "epoch": 0.5896234191375616, + "grad_norm": 0.724456250667572, + "learning_rate": 1.6364228520635572e-05, + "loss": 0.3318, + "step": 6259 + }, + { + "epoch": 0.5897176232307294, + "grad_norm": 0.8413934111595154, + "learning_rate": 1.636306371017694e-05, + "loss": 0.3287, + "step": 6260 + }, + { + "epoch": 0.5898118273238973, + "grad_norm": 0.7873420715332031, + "learning_rate": 1.6361898754630616e-05, + "loss": 0.3801, + "step": 6261 + }, + { + "epoch": 0.5899060314170651, + "grad_norm": 0.7733575105667114, + "learning_rate": 1.636073365402316e-05, + "loss": 0.318, + "step": 6262 + }, + { + "epoch": 0.590000235510233, + "grad_norm": 0.7879183292388916, + "learning_rate": 1.6359568408381138e-05, + "loss": 0.3678, + "step": 6263 + }, + { + "epoch": 0.5900944396034008, + "grad_norm": 0.8884425759315491, + "learning_rate": 1.635840301773112e-05, + "loss": 0.3611, + "step": 6264 + }, + { + "epoch": 0.5901886436965686, + "grad_norm": 0.7099341154098511, + "learning_rate": 1.6357237482099682e-05, + "loss": 0.3, + "step": 6265 + }, + { + "epoch": 0.5902828477897365, + "grad_norm": 0.7190153002738953, + "learning_rate": 1.6356071801513395e-05, + "loss": 0.319, + "step": 6266 + }, + { + "epoch": 0.5903770518829043, + "grad_norm": 0.7086362838745117, + "learning_rate": 1.635490597599885e-05, + "loss": 0.3434, + "step": 6267 + }, + { + "epoch": 0.5904712559760722, + "grad_norm": 0.7540826201438904, + "learning_rate": 1.635374000558261e-05, + "loss": 0.3496, + "step": 6268 + }, + { + "epoch": 0.59056546006924, + "grad_norm": 0.7036347985267639, + "learning_rate": 1.6352573890291275e-05, + "loss": 0.3418, + "step": 6269 + }, + { + "epoch": 0.5906596641624079, + "grad_norm": 0.6594241857528687, + "learning_rate": 1.635140763015143e-05, + "loss": 0.2874, + "step": 6270 + }, + { + "epoch": 0.5907538682555757, + "grad_norm": 0.7100725769996643, + "learning_rate": 1.6350241225189672e-05, + "loss": 0.3356, + "step": 6271 + }, + { + "epoch": 0.5908480723487436, + "grad_norm": 0.7776488065719604, + "learning_rate": 1.634907467543259e-05, + "loss": 0.346, + "step": 6272 + }, + { + "epoch": 0.5909422764419114, + "grad_norm": 0.7593851685523987, + "learning_rate": 1.6347907980906788e-05, + "loss": 0.3152, + "step": 6273 + }, + { + "epoch": 0.5910364805350793, + "grad_norm": 0.635486364364624, + "learning_rate": 1.6346741141638863e-05, + "loss": 0.2968, + "step": 6274 + }, + { + "epoch": 0.5911306846282471, + "grad_norm": 0.6391139030456543, + "learning_rate": 1.634557415765543e-05, + "loss": 0.3186, + "step": 6275 + }, + { + "epoch": 0.591224888721415, + "grad_norm": 0.7033796310424805, + "learning_rate": 1.634440702898309e-05, + "loss": 0.3068, + "step": 6276 + }, + { + "epoch": 0.5913190928145828, + "grad_norm": 0.6473323702812195, + "learning_rate": 1.634323975564846e-05, + "loss": 0.3039, + "step": 6277 + }, + { + "epoch": 0.5914132969077507, + "grad_norm": 0.725439727306366, + "learning_rate": 1.6342072337678152e-05, + "loss": 0.3223, + "step": 6278 + }, + { + "epoch": 0.5915075010009185, + "grad_norm": 0.8260787129402161, + "learning_rate": 1.6340904775098785e-05, + "loss": 0.3401, + "step": 6279 + }, + { + "epoch": 0.5916017050940864, + "grad_norm": 0.6736300587654114, + "learning_rate": 1.6339737067936986e-05, + "loss": 0.2882, + "step": 6280 + }, + { + "epoch": 0.5916959091872542, + "grad_norm": 0.6979449987411499, + "learning_rate": 1.6338569216219375e-05, + "loss": 0.2978, + "step": 6281 + }, + { + "epoch": 0.5917901132804221, + "grad_norm": 0.8412538170814514, + "learning_rate": 1.6337401219972583e-05, + "loss": 0.3518, + "step": 6282 + }, + { + "epoch": 0.5918843173735899, + "grad_norm": 0.8842818737030029, + "learning_rate": 1.6336233079223244e-05, + "loss": 0.3375, + "step": 6283 + }, + { + "epoch": 0.5919785214667578, + "grad_norm": 0.7733251452445984, + "learning_rate": 1.633506479399799e-05, + "loss": 0.3652, + "step": 6284 + }, + { + "epoch": 0.5920727255599256, + "grad_norm": 0.7524434924125671, + "learning_rate": 1.6333896364323462e-05, + "loss": 0.3354, + "step": 6285 + }, + { + "epoch": 0.5921669296530935, + "grad_norm": 0.7618670463562012, + "learning_rate": 1.63327277902263e-05, + "loss": 0.3328, + "step": 6286 + }, + { + "epoch": 0.5922611337462613, + "grad_norm": 0.7212899327278137, + "learning_rate": 1.6331559071733153e-05, + "loss": 0.2971, + "step": 6287 + }, + { + "epoch": 0.5923553378394292, + "grad_norm": 0.7447496652603149, + "learning_rate": 1.6330390208870666e-05, + "loss": 0.2942, + "step": 6288 + }, + { + "epoch": 0.592449541932597, + "grad_norm": 0.7702828049659729, + "learning_rate": 1.632922120166549e-05, + "loss": 0.3935, + "step": 6289 + }, + { + "epoch": 0.5925437460257649, + "grad_norm": 0.7589718699455261, + "learning_rate": 1.6328052050144285e-05, + "loss": 0.3181, + "step": 6290 + }, + { + "epoch": 0.5926379501189327, + "grad_norm": 0.7950143814086914, + "learning_rate": 1.6326882754333708e-05, + "loss": 0.3401, + "step": 6291 + }, + { + "epoch": 0.5927321542121006, + "grad_norm": 1.0885834693908691, + "learning_rate": 1.6325713314260415e-05, + "loss": 0.3906, + "step": 6292 + }, + { + "epoch": 0.5928263583052684, + "grad_norm": 1.0690568685531616, + "learning_rate": 1.632454372995108e-05, + "loss": 0.2889, + "step": 6293 + }, + { + "epoch": 0.5929205623984362, + "grad_norm": 0.7887766361236572, + "learning_rate": 1.6323374001432362e-05, + "loss": 0.3554, + "step": 6294 + }, + { + "epoch": 0.593014766491604, + "grad_norm": 0.7963334918022156, + "learning_rate": 1.6322204128730944e-05, + "loss": 0.3442, + "step": 6295 + }, + { + "epoch": 0.5931089705847719, + "grad_norm": 0.7484216690063477, + "learning_rate": 1.6321034111873487e-05, + "loss": 0.3302, + "step": 6296 + }, + { + "epoch": 0.5932031746779397, + "grad_norm": 0.8004947304725647, + "learning_rate": 1.6319863950886683e-05, + "loss": 0.2989, + "step": 6297 + }, + { + "epoch": 0.5932973787711076, + "grad_norm": 0.6810011267662048, + "learning_rate": 1.6318693645797202e-05, + "loss": 0.3102, + "step": 6298 + }, + { + "epoch": 0.5933915828642754, + "grad_norm": 0.829971432685852, + "learning_rate": 1.6317523196631737e-05, + "loss": 0.3402, + "step": 6299 + }, + { + "epoch": 0.5934857869574433, + "grad_norm": 0.6834694743156433, + "learning_rate": 1.6316352603416975e-05, + "loss": 0.3081, + "step": 6300 + }, + { + "epoch": 0.5935799910506111, + "grad_norm": 0.6294878721237183, + "learning_rate": 1.63151818661796e-05, + "loss": 0.2939, + "step": 6301 + }, + { + "epoch": 0.593674195143779, + "grad_norm": 0.7290021777153015, + "learning_rate": 1.6314010984946317e-05, + "loss": 0.3125, + "step": 6302 + }, + { + "epoch": 0.5937683992369468, + "grad_norm": 0.7154480218887329, + "learning_rate": 1.631283995974381e-05, + "loss": 0.3153, + "step": 6303 + }, + { + "epoch": 0.5938626033301146, + "grad_norm": 0.7332636117935181, + "learning_rate": 1.6311668790598795e-05, + "loss": 0.3197, + "step": 6304 + }, + { + "epoch": 0.5939568074232825, + "grad_norm": 0.7051419615745544, + "learning_rate": 1.6310497477537973e-05, + "loss": 0.3229, + "step": 6305 + }, + { + "epoch": 0.5940510115164503, + "grad_norm": 0.6272082328796387, + "learning_rate": 1.6309326020588048e-05, + "loss": 0.2736, + "step": 6306 + }, + { + "epoch": 0.5941452156096182, + "grad_norm": 0.6881904006004333, + "learning_rate": 1.630815441977573e-05, + "loss": 0.3129, + "step": 6307 + }, + { + "epoch": 0.594239419702786, + "grad_norm": 0.7506492733955383, + "learning_rate": 1.6306982675127735e-05, + "loss": 0.3306, + "step": 6308 + }, + { + "epoch": 0.5943336237959539, + "grad_norm": 0.7264650464057922, + "learning_rate": 1.630581078667078e-05, + "loss": 0.3373, + "step": 6309 + }, + { + "epoch": 0.5944278278891217, + "grad_norm": 0.7115740776062012, + "learning_rate": 1.6304638754431592e-05, + "loss": 0.3135, + "step": 6310 + }, + { + "epoch": 0.5945220319822896, + "grad_norm": 0.7792097330093384, + "learning_rate": 1.6303466578436883e-05, + "loss": 0.3129, + "step": 6311 + }, + { + "epoch": 0.5946162360754574, + "grad_norm": 0.7447195053100586, + "learning_rate": 1.6302294258713395e-05, + "loss": 0.3125, + "step": 6312 + }, + { + "epoch": 0.5947104401686253, + "grad_norm": 0.703715443611145, + "learning_rate": 1.6301121795287846e-05, + "loss": 0.305, + "step": 6313 + }, + { + "epoch": 0.5948046442617931, + "grad_norm": 0.7331198453903198, + "learning_rate": 1.6299949188186977e-05, + "loss": 0.3225, + "step": 6314 + }, + { + "epoch": 0.594898848354961, + "grad_norm": 0.719076931476593, + "learning_rate": 1.6298776437437526e-05, + "loss": 0.2862, + "step": 6315 + }, + { + "epoch": 0.5949930524481288, + "grad_norm": 0.7023767232894897, + "learning_rate": 1.6297603543066226e-05, + "loss": 0.312, + "step": 6316 + }, + { + "epoch": 0.5950872565412967, + "grad_norm": 0.8601207733154297, + "learning_rate": 1.6296430505099828e-05, + "loss": 0.3679, + "step": 6317 + }, + { + "epoch": 0.5951814606344645, + "grad_norm": 0.7502859830856323, + "learning_rate": 1.6295257323565077e-05, + "loss": 0.3062, + "step": 6318 + }, + { + "epoch": 0.5952756647276324, + "grad_norm": 0.7650384306907654, + "learning_rate": 1.6294083998488727e-05, + "loss": 0.3732, + "step": 6319 + }, + { + "epoch": 0.5953698688208002, + "grad_norm": 0.6345619559288025, + "learning_rate": 1.6292910529897522e-05, + "loss": 0.2863, + "step": 6320 + }, + { + "epoch": 0.5954640729139681, + "grad_norm": 0.7029299736022949, + "learning_rate": 1.6291736917818227e-05, + "loss": 0.3419, + "step": 6321 + }, + { + "epoch": 0.5955582770071359, + "grad_norm": 0.7231665849685669, + "learning_rate": 1.62905631622776e-05, + "loss": 0.2789, + "step": 6322 + }, + { + "epoch": 0.5956524811003038, + "grad_norm": 0.7480236887931824, + "learning_rate": 1.6289389263302404e-05, + "loss": 0.3443, + "step": 6323 + }, + { + "epoch": 0.5957466851934716, + "grad_norm": 0.7741104960441589, + "learning_rate": 1.6288215220919405e-05, + "loss": 0.3667, + "step": 6324 + }, + { + "epoch": 0.5958408892866395, + "grad_norm": 0.846674919128418, + "learning_rate": 1.6287041035155375e-05, + "loss": 0.3913, + "step": 6325 + }, + { + "epoch": 0.5959350933798073, + "grad_norm": 0.9323556423187256, + "learning_rate": 1.628586670603709e-05, + "loss": 0.3027, + "step": 6326 + }, + { + "epoch": 0.5960292974729752, + "grad_norm": 0.7532998323440552, + "learning_rate": 1.6284692233591316e-05, + "loss": 0.3009, + "step": 6327 + }, + { + "epoch": 0.596123501566143, + "grad_norm": 0.7252593040466309, + "learning_rate": 1.6283517617844843e-05, + "loss": 0.3249, + "step": 6328 + }, + { + "epoch": 0.5962177056593109, + "grad_norm": 0.7854056358337402, + "learning_rate": 1.6282342858824452e-05, + "loss": 0.3284, + "step": 6329 + }, + { + "epoch": 0.5963119097524787, + "grad_norm": 1.1120035648345947, + "learning_rate": 1.6281167956556925e-05, + "loss": 0.354, + "step": 6330 + }, + { + "epoch": 0.5964061138456466, + "grad_norm": 0.7881585955619812, + "learning_rate": 1.627999291106906e-05, + "loss": 0.3359, + "step": 6331 + }, + { + "epoch": 0.5965003179388144, + "grad_norm": 0.7999001741409302, + "learning_rate": 1.6278817722387638e-05, + "loss": 0.3137, + "step": 6332 + }, + { + "epoch": 0.5965945220319823, + "grad_norm": 0.7499207854270935, + "learning_rate": 1.6277642390539465e-05, + "loss": 0.3427, + "step": 6333 + }, + { + "epoch": 0.5966887261251501, + "grad_norm": 0.8090696334838867, + "learning_rate": 1.6276466915551336e-05, + "loss": 0.3378, + "step": 6334 + }, + { + "epoch": 0.596782930218318, + "grad_norm": 0.7112345099449158, + "learning_rate": 1.6275291297450055e-05, + "loss": 0.3045, + "step": 6335 + }, + { + "epoch": 0.5968771343114858, + "grad_norm": 0.8204688429832458, + "learning_rate": 1.6274115536262425e-05, + "loss": 0.3058, + "step": 6336 + }, + { + "epoch": 0.5969713384046537, + "grad_norm": 0.8297174572944641, + "learning_rate": 1.627293963201526e-05, + "loss": 0.4045, + "step": 6337 + }, + { + "epoch": 0.5970655424978215, + "grad_norm": 0.7028781771659851, + "learning_rate": 1.6271763584735373e-05, + "loss": 0.3385, + "step": 6338 + }, + { + "epoch": 0.5971597465909894, + "grad_norm": 0.8586329221725464, + "learning_rate": 1.6270587394449573e-05, + "loss": 0.3884, + "step": 6339 + }, + { + "epoch": 0.5972539506841572, + "grad_norm": 0.789535641670227, + "learning_rate": 1.6269411061184683e-05, + "loss": 0.336, + "step": 6340 + }, + { + "epoch": 0.5973481547773251, + "grad_norm": 0.7452242970466614, + "learning_rate": 1.6268234584967527e-05, + "loss": 0.336, + "step": 6341 + }, + { + "epoch": 0.5974423588704929, + "grad_norm": 0.7482655048370361, + "learning_rate": 1.6267057965824925e-05, + "loss": 0.3672, + "step": 6342 + }, + { + "epoch": 0.5975365629636608, + "grad_norm": 0.8206696510314941, + "learning_rate": 1.626588120378371e-05, + "loss": 0.3469, + "step": 6343 + }, + { + "epoch": 0.5976307670568286, + "grad_norm": 0.7177087664604187, + "learning_rate": 1.6264704298870715e-05, + "loss": 0.3186, + "step": 6344 + }, + { + "epoch": 0.5977249711499965, + "grad_norm": 0.7452403903007507, + "learning_rate": 1.6263527251112775e-05, + "loss": 0.34, + "step": 6345 + }, + { + "epoch": 0.5978191752431643, + "grad_norm": 0.7684333920478821, + "learning_rate": 1.626235006053672e-05, + "loss": 0.3334, + "step": 6346 + }, + { + "epoch": 0.5979133793363322, + "grad_norm": 0.7974861264228821, + "learning_rate": 1.6261172727169406e-05, + "loss": 0.3471, + "step": 6347 + }, + { + "epoch": 0.5980075834295, + "grad_norm": 0.8435362577438354, + "learning_rate": 1.6259995251037672e-05, + "loss": 0.3429, + "step": 6348 + }, + { + "epoch": 0.5981017875226678, + "grad_norm": 0.7124806642532349, + "learning_rate": 1.6258817632168357e-05, + "loss": 0.3262, + "step": 6349 + }, + { + "epoch": 0.5981959916158357, + "grad_norm": 0.7446151375770569, + "learning_rate": 1.6257639870588325e-05, + "loss": 0.3336, + "step": 6350 + }, + { + "epoch": 0.5982901957090035, + "grad_norm": 0.7604463696479797, + "learning_rate": 1.6256461966324425e-05, + "loss": 0.3157, + "step": 6351 + }, + { + "epoch": 0.5983843998021714, + "grad_norm": 0.7514591813087463, + "learning_rate": 1.625528391940352e-05, + "loss": 0.3376, + "step": 6352 + }, + { + "epoch": 0.5984786038953392, + "grad_norm": 0.765207052230835, + "learning_rate": 1.6254105729852466e-05, + "loss": 0.3526, + "step": 6353 + }, + { + "epoch": 0.5985728079885071, + "grad_norm": 0.7503435015678406, + "learning_rate": 1.6252927397698125e-05, + "loss": 0.3441, + "step": 6354 + }, + { + "epoch": 0.5986670120816749, + "grad_norm": 0.7455945611000061, + "learning_rate": 1.6251748922967374e-05, + "loss": 0.3624, + "step": 6355 + }, + { + "epoch": 0.5987612161748428, + "grad_norm": 0.6535062193870544, + "learning_rate": 1.6250570305687077e-05, + "loss": 0.2649, + "step": 6356 + }, + { + "epoch": 0.5988554202680106, + "grad_norm": 0.8072831630706787, + "learning_rate": 1.624939154588411e-05, + "loss": 0.3485, + "step": 6357 + }, + { + "epoch": 0.5989496243611785, + "grad_norm": 1.0439919233322144, + "learning_rate": 1.624821264358535e-05, + "loss": 0.3534, + "step": 6358 + }, + { + "epoch": 0.5990438284543463, + "grad_norm": 0.7842357158660889, + "learning_rate": 1.6247033598817683e-05, + "loss": 0.3284, + "step": 6359 + }, + { + "epoch": 0.5991380325475142, + "grad_norm": 0.740278959274292, + "learning_rate": 1.6245854411607987e-05, + "loss": 0.3337, + "step": 6360 + }, + { + "epoch": 0.599232236640682, + "grad_norm": 0.7467502355575562, + "learning_rate": 1.624467508198315e-05, + "loss": 0.3537, + "step": 6361 + }, + { + "epoch": 0.5993264407338499, + "grad_norm": 0.688685417175293, + "learning_rate": 1.6243495609970063e-05, + "loss": 0.2656, + "step": 6362 + }, + { + "epoch": 0.5994206448270177, + "grad_norm": 0.8290200233459473, + "learning_rate": 1.624231599559562e-05, + "loss": 0.3136, + "step": 6363 + }, + { + "epoch": 0.5995148489201856, + "grad_norm": 0.748266339302063, + "learning_rate": 1.6241136238886718e-05, + "loss": 0.3122, + "step": 6364 + }, + { + "epoch": 0.5996090530133534, + "grad_norm": 0.7195231318473816, + "learning_rate": 1.623995633987026e-05, + "loss": 0.3422, + "step": 6365 + }, + { + "epoch": 0.5997032571065213, + "grad_norm": 0.8352035284042358, + "learning_rate": 1.6238776298573146e-05, + "loss": 0.3844, + "step": 6366 + }, + { + "epoch": 0.5997974611996891, + "grad_norm": 0.7455942630767822, + "learning_rate": 1.6237596115022284e-05, + "loss": 0.3229, + "step": 6367 + }, + { + "epoch": 0.599891665292857, + "grad_norm": 0.8090448379516602, + "learning_rate": 1.6236415789244586e-05, + "loss": 0.37, + "step": 6368 + }, + { + "epoch": 0.5999858693860248, + "grad_norm": 0.8044054508209229, + "learning_rate": 1.6235235321266958e-05, + "loss": 0.3058, + "step": 6369 + }, + { + "epoch": 0.6000800734791927, + "grad_norm": 0.7277327179908752, + "learning_rate": 1.6234054711116325e-05, + "loss": 0.2925, + "step": 6370 + }, + { + "epoch": 0.6001742775723605, + "grad_norm": 0.8373342752456665, + "learning_rate": 1.6232873958819603e-05, + "loss": 0.3469, + "step": 6371 + }, + { + "epoch": 0.6002684816655284, + "grad_norm": 0.7661371827125549, + "learning_rate": 1.6231693064403716e-05, + "loss": 0.3311, + "step": 6372 + }, + { + "epoch": 0.6003626857586962, + "grad_norm": 0.6980554461479187, + "learning_rate": 1.623051202789559e-05, + "loss": 0.2768, + "step": 6373 + }, + { + "epoch": 0.6004568898518641, + "grad_norm": 0.7779518961906433, + "learning_rate": 1.6229330849322155e-05, + "loss": 0.35, + "step": 6374 + }, + { + "epoch": 0.6005510939450319, + "grad_norm": 0.8295838236808777, + "learning_rate": 1.622814952871034e-05, + "loss": 0.3401, + "step": 6375 + }, + { + "epoch": 0.6006452980381998, + "grad_norm": 0.6845333576202393, + "learning_rate": 1.6226968066087084e-05, + "loss": 0.3166, + "step": 6376 + }, + { + "epoch": 0.6007395021313676, + "grad_norm": 0.7462641596794128, + "learning_rate": 1.622578646147933e-05, + "loss": 0.2987, + "step": 6377 + }, + { + "epoch": 0.6008337062245355, + "grad_norm": 0.7424726486206055, + "learning_rate": 1.6224604714914013e-05, + "loss": 0.3264, + "step": 6378 + }, + { + "epoch": 0.6009279103177033, + "grad_norm": 0.7039068937301636, + "learning_rate": 1.622342282641808e-05, + "loss": 0.3031, + "step": 6379 + }, + { + "epoch": 0.6010221144108712, + "grad_norm": 0.7586017847061157, + "learning_rate": 1.6222240796018485e-05, + "loss": 0.3026, + "step": 6380 + }, + { + "epoch": 0.601116318504039, + "grad_norm": 0.6884627938270569, + "learning_rate": 1.6221058623742175e-05, + "loss": 0.298, + "step": 6381 + }, + { + "epoch": 0.6012105225972069, + "grad_norm": 0.6970832943916321, + "learning_rate": 1.621987630961611e-05, + "loss": 0.3049, + "step": 6382 + }, + { + "epoch": 0.6013047266903747, + "grad_norm": 0.7597485780715942, + "learning_rate": 1.6218693853667243e-05, + "loss": 0.3237, + "step": 6383 + }, + { + "epoch": 0.6013989307835426, + "grad_norm": 0.6965933442115784, + "learning_rate": 1.6217511255922537e-05, + "loss": 0.2817, + "step": 6384 + }, + { + "epoch": 0.6014931348767104, + "grad_norm": 0.7090418338775635, + "learning_rate": 1.6216328516408967e-05, + "loss": 0.3308, + "step": 6385 + }, + { + "epoch": 0.6015873389698783, + "grad_norm": 0.7436476349830627, + "learning_rate": 1.6215145635153486e-05, + "loss": 0.2948, + "step": 6386 + }, + { + "epoch": 0.6016815430630461, + "grad_norm": 0.824032723903656, + "learning_rate": 1.6213962612183075e-05, + "loss": 0.3688, + "step": 6387 + }, + { + "epoch": 0.601775747156214, + "grad_norm": 0.8087217807769775, + "learning_rate": 1.6212779447524705e-05, + "loss": 0.3284, + "step": 6388 + }, + { + "epoch": 0.6018699512493818, + "grad_norm": 0.9960329532623291, + "learning_rate": 1.6211596141205354e-05, + "loss": 0.3162, + "step": 6389 + }, + { + "epoch": 0.6019641553425497, + "grad_norm": 1.5578619241714478, + "learning_rate": 1.6210412693252008e-05, + "loss": 0.3386, + "step": 6390 + }, + { + "epoch": 0.6020583594357175, + "grad_norm": 0.8485972285270691, + "learning_rate": 1.6209229103691646e-05, + "loss": 0.3746, + "step": 6391 + }, + { + "epoch": 0.6021525635288854, + "grad_norm": 0.8117714524269104, + "learning_rate": 1.620804537255126e-05, + "loss": 0.3493, + "step": 6392 + }, + { + "epoch": 0.6022467676220532, + "grad_norm": 0.8204595446586609, + "learning_rate": 1.6206861499857834e-05, + "loss": 0.3076, + "step": 6393 + }, + { + "epoch": 0.602340971715221, + "grad_norm": 0.7204444408416748, + "learning_rate": 1.620567748563837e-05, + "loss": 0.3395, + "step": 6394 + }, + { + "epoch": 0.6024351758083889, + "grad_norm": 0.6890771985054016, + "learning_rate": 1.6204493329919863e-05, + "loss": 0.3302, + "step": 6395 + }, + { + "epoch": 0.6025293799015567, + "grad_norm": 0.9710570573806763, + "learning_rate": 1.620330903272931e-05, + "loss": 0.3111, + "step": 6396 + }, + { + "epoch": 0.6026235839947246, + "grad_norm": 0.7515151500701904, + "learning_rate": 1.620212459409372e-05, + "loss": 0.3214, + "step": 6397 + }, + { + "epoch": 0.6027177880878924, + "grad_norm": 0.7595040202140808, + "learning_rate": 1.62009400140401e-05, + "loss": 0.3546, + "step": 6398 + }, + { + "epoch": 0.6028119921810603, + "grad_norm": 0.7797225117683411, + "learning_rate": 1.619975529259545e-05, + "loss": 0.293, + "step": 6399 + }, + { + "epoch": 0.6029061962742281, + "grad_norm": 0.6177393198013306, + "learning_rate": 1.6198570429786797e-05, + "loss": 0.2894, + "step": 6400 + }, + { + "epoch": 0.603000400367396, + "grad_norm": 0.9203518033027649, + "learning_rate": 1.619738542564115e-05, + "loss": 0.3339, + "step": 6401 + }, + { + "epoch": 0.6030946044605638, + "grad_norm": 0.7182189226150513, + "learning_rate": 1.6196200280185532e-05, + "loss": 0.3188, + "step": 6402 + }, + { + "epoch": 0.6031888085537317, + "grad_norm": 0.8205165863037109, + "learning_rate": 1.619501499344696e-05, + "loss": 0.394, + "step": 6403 + }, + { + "epoch": 0.6032830126468995, + "grad_norm": 0.7397306561470032, + "learning_rate": 1.619382956545247e-05, + "loss": 0.3184, + "step": 6404 + }, + { + "epoch": 0.6033772167400674, + "grad_norm": 0.7576934695243835, + "learning_rate": 1.619264399622909e-05, + "loss": 0.3423, + "step": 6405 + }, + { + "epoch": 0.6034714208332352, + "grad_norm": 0.6832389235496521, + "learning_rate": 1.6191458285803845e-05, + "loss": 0.2759, + "step": 6406 + }, + { + "epoch": 0.6035656249264031, + "grad_norm": 0.7263265252113342, + "learning_rate": 1.6190272434203776e-05, + "loss": 0.3262, + "step": 6407 + }, + { + "epoch": 0.6036598290195709, + "grad_norm": 0.7277495265007019, + "learning_rate": 1.6189086441455922e-05, + "loss": 0.3115, + "step": 6408 + }, + { + "epoch": 0.6037540331127388, + "grad_norm": 0.7232794761657715, + "learning_rate": 1.6187900307587328e-05, + "loss": 0.2919, + "step": 6409 + }, + { + "epoch": 0.6038482372059066, + "grad_norm": 0.8191846013069153, + "learning_rate": 1.6186714032625036e-05, + "loss": 0.3251, + "step": 6410 + }, + { + "epoch": 0.6039424412990745, + "grad_norm": 0.7194938659667969, + "learning_rate": 1.6185527616596096e-05, + "loss": 0.3441, + "step": 6411 + }, + { + "epoch": 0.6040366453922423, + "grad_norm": 0.7607441544532776, + "learning_rate": 1.618434105952756e-05, + "loss": 0.3488, + "step": 6412 + }, + { + "epoch": 0.6041308494854102, + "grad_norm": 0.7107058167457581, + "learning_rate": 1.6183154361446484e-05, + "loss": 0.3302, + "step": 6413 + }, + { + "epoch": 0.604225053578578, + "grad_norm": 0.7096536755561829, + "learning_rate": 1.6181967522379925e-05, + "loss": 0.2899, + "step": 6414 + }, + { + "epoch": 0.6043192576717459, + "grad_norm": 0.881682813167572, + "learning_rate": 1.6180780542354947e-05, + "loss": 0.3484, + "step": 6415 + }, + { + "epoch": 0.6044134617649137, + "grad_norm": 0.7957301735877991, + "learning_rate": 1.6179593421398614e-05, + "loss": 0.3169, + "step": 6416 + }, + { + "epoch": 0.6045076658580816, + "grad_norm": 0.737355649471283, + "learning_rate": 1.6178406159537992e-05, + "loss": 0.3238, + "step": 6417 + }, + { + "epoch": 0.6046018699512494, + "grad_norm": 0.697355329990387, + "learning_rate": 1.6177218756800158e-05, + "loss": 0.3201, + "step": 6418 + }, + { + "epoch": 0.6046960740444173, + "grad_norm": 1.2903817892074585, + "learning_rate": 1.617603121321218e-05, + "loss": 0.3525, + "step": 6419 + }, + { + "epoch": 0.6047902781375851, + "grad_norm": 0.8625286221504211, + "learning_rate": 1.617484352880114e-05, + "loss": 0.3398, + "step": 6420 + }, + { + "epoch": 0.604884482230753, + "grad_norm": 0.7835841774940491, + "learning_rate": 1.617365570359412e-05, + "loss": 0.3522, + "step": 6421 + }, + { + "epoch": 0.6049786863239208, + "grad_norm": 0.7972666025161743, + "learning_rate": 1.61724677376182e-05, + "loss": 0.3105, + "step": 6422 + }, + { + "epoch": 0.6050728904170887, + "grad_norm": 0.9149686694145203, + "learning_rate": 1.6171279630900468e-05, + "loss": 0.3529, + "step": 6423 + }, + { + "epoch": 0.6051670945102565, + "grad_norm": 0.8179230093955994, + "learning_rate": 1.6170091383468022e-05, + "loss": 0.3443, + "step": 6424 + }, + { + "epoch": 0.6052612986034244, + "grad_norm": 0.7353527545928955, + "learning_rate": 1.6168902995347944e-05, + "loss": 0.3243, + "step": 6425 + }, + { + "epoch": 0.6053555026965922, + "grad_norm": 0.7327826023101807, + "learning_rate": 1.616771446656734e-05, + "loss": 0.3194, + "step": 6426 + }, + { + "epoch": 0.6054497067897601, + "grad_norm": 0.7549339532852173, + "learning_rate": 1.616652579715331e-05, + "loss": 0.3609, + "step": 6427 + }, + { + "epoch": 0.6055439108829279, + "grad_norm": 0.7450674772262573, + "learning_rate": 1.616533698713295e-05, + "loss": 0.3143, + "step": 6428 + }, + { + "epoch": 0.6056381149760958, + "grad_norm": 0.7472980618476868, + "learning_rate": 1.6164148036533376e-05, + "loss": 0.3383, + "step": 6429 + }, + { + "epoch": 0.6057323190692636, + "grad_norm": 0.683591902256012, + "learning_rate": 1.616295894538169e-05, + "loss": 0.3279, + "step": 6430 + }, + { + "epoch": 0.6058265231624315, + "grad_norm": 0.699081301689148, + "learning_rate": 1.6161769713705015e-05, + "loss": 0.3008, + "step": 6431 + }, + { + "epoch": 0.6059207272555992, + "grad_norm": 0.8056870102882385, + "learning_rate": 1.6160580341530455e-05, + "loss": 0.3514, + "step": 6432 + }, + { + "epoch": 0.606014931348767, + "grad_norm": 0.7246406674385071, + "learning_rate": 1.6159390828885137e-05, + "loss": 0.2994, + "step": 6433 + }, + { + "epoch": 0.6061091354419349, + "grad_norm": 0.6836478114128113, + "learning_rate": 1.6158201175796186e-05, + "loss": 0.3015, + "step": 6434 + }, + { + "epoch": 0.6062033395351027, + "grad_norm": 0.7912698984146118, + "learning_rate": 1.615701138229072e-05, + "loss": 0.3346, + "step": 6435 + }, + { + "epoch": 0.6062975436282706, + "grad_norm": 0.7896254658699036, + "learning_rate": 1.6155821448395874e-05, + "loss": 0.3272, + "step": 6436 + }, + { + "epoch": 0.6063917477214384, + "grad_norm": 0.7898254990577698, + "learning_rate": 1.6154631374138777e-05, + "loss": 0.3083, + "step": 6437 + }, + { + "epoch": 0.6064859518146063, + "grad_norm": 0.7792987823486328, + "learning_rate": 1.615344115954657e-05, + "loss": 0.3989, + "step": 6438 + }, + { + "epoch": 0.6065801559077741, + "grad_norm": 0.7514324188232422, + "learning_rate": 1.6152250804646386e-05, + "loss": 0.3793, + "step": 6439 + }, + { + "epoch": 0.606674360000942, + "grad_norm": 1.2695530652999878, + "learning_rate": 1.615106030946537e-05, + "loss": 0.3289, + "step": 6440 + }, + { + "epoch": 0.6067685640941098, + "grad_norm": 0.8743518590927124, + "learning_rate": 1.6149869674030663e-05, + "loss": 0.3704, + "step": 6441 + }, + { + "epoch": 0.6068627681872777, + "grad_norm": 0.7111954092979431, + "learning_rate": 1.6148678898369422e-05, + "loss": 0.3217, + "step": 6442 + }, + { + "epoch": 0.6069569722804455, + "grad_norm": 0.6936813592910767, + "learning_rate": 1.6147487982508788e-05, + "loss": 0.3209, + "step": 6443 + }, + { + "epoch": 0.6070511763736134, + "grad_norm": 0.7675489187240601, + "learning_rate": 1.614629692647592e-05, + "loss": 0.3221, + "step": 6444 + }, + { + "epoch": 0.6071453804667812, + "grad_norm": 0.7326104044914246, + "learning_rate": 1.6145105730297986e-05, + "loss": 0.3387, + "step": 6445 + }, + { + "epoch": 0.6072395845599491, + "grad_norm": 0.7697705626487732, + "learning_rate": 1.6143914394002127e-05, + "loss": 0.3137, + "step": 6446 + }, + { + "epoch": 0.6073337886531169, + "grad_norm": 0.7528795599937439, + "learning_rate": 1.6142722917615526e-05, + "loss": 0.3744, + "step": 6447 + }, + { + "epoch": 0.6074279927462848, + "grad_norm": 0.7764901518821716, + "learning_rate": 1.6141531301165335e-05, + "loss": 0.31, + "step": 6448 + }, + { + "epoch": 0.6075221968394526, + "grad_norm": 0.8144018650054932, + "learning_rate": 1.6140339544678738e-05, + "loss": 0.3191, + "step": 6449 + }, + { + "epoch": 0.6076164009326205, + "grad_norm": 0.7953997254371643, + "learning_rate": 1.61391476481829e-05, + "loss": 0.3249, + "step": 6450 + }, + { + "epoch": 0.6077106050257883, + "grad_norm": 0.7794833779335022, + "learning_rate": 1.6137955611705003e-05, + "loss": 0.3676, + "step": 6451 + }, + { + "epoch": 0.6078048091189562, + "grad_norm": 0.7058234810829163, + "learning_rate": 1.6136763435272227e-05, + "loss": 0.3165, + "step": 6452 + }, + { + "epoch": 0.607899013212124, + "grad_norm": 0.7798442244529724, + "learning_rate": 1.613557111891175e-05, + "loss": 0.29, + "step": 6453 + }, + { + "epoch": 0.6079932173052919, + "grad_norm": 0.6657100319862366, + "learning_rate": 1.6134378662650763e-05, + "loss": 0.3063, + "step": 6454 + }, + { + "epoch": 0.6080874213984597, + "grad_norm": 0.7678936123847961, + "learning_rate": 1.613318606651646e-05, + "loss": 0.3025, + "step": 6455 + }, + { + "epoch": 0.6081816254916276, + "grad_norm": 0.7438762187957764, + "learning_rate": 1.6131993330536025e-05, + "loss": 0.338, + "step": 6456 + }, + { + "epoch": 0.6082758295847954, + "grad_norm": 0.7807517647743225, + "learning_rate": 1.6130800454736663e-05, + "loss": 0.3099, + "step": 6457 + }, + { + "epoch": 0.6083700336779633, + "grad_norm": 0.7923942804336548, + "learning_rate": 1.6129607439145564e-05, + "loss": 0.338, + "step": 6458 + }, + { + "epoch": 0.6084642377711311, + "grad_norm": 0.7815907597541809, + "learning_rate": 1.6128414283789937e-05, + "loss": 0.3054, + "step": 6459 + }, + { + "epoch": 0.608558441864299, + "grad_norm": 0.7461601495742798, + "learning_rate": 1.612722098869699e-05, + "loss": 0.3402, + "step": 6460 + }, + { + "epoch": 0.6086526459574668, + "grad_norm": 0.7422223687171936, + "learning_rate": 1.6126027553893924e-05, + "loss": 0.3018, + "step": 6461 + }, + { + "epoch": 0.6087468500506347, + "grad_norm": 1.1513397693634033, + "learning_rate": 1.612483397940796e-05, + "loss": 0.3206, + "step": 6462 + }, + { + "epoch": 0.6088410541438025, + "grad_norm": 0.7039524912834167, + "learning_rate": 1.6123640265266306e-05, + "loss": 0.3185, + "step": 6463 + }, + { + "epoch": 0.6089352582369704, + "grad_norm": 0.6909993290901184, + "learning_rate": 1.6122446411496184e-05, + "loss": 0.3022, + "step": 6464 + }, + { + "epoch": 0.6090294623301382, + "grad_norm": 0.7051947116851807, + "learning_rate": 1.612125241812482e-05, + "loss": 0.2992, + "step": 6465 + }, + { + "epoch": 0.6091236664233061, + "grad_norm": 0.675311267375946, + "learning_rate": 1.612005828517943e-05, + "loss": 0.2678, + "step": 6466 + }, + { + "epoch": 0.6092178705164739, + "grad_norm": 0.7527945637702942, + "learning_rate": 1.6118864012687246e-05, + "loss": 0.345, + "step": 6467 + }, + { + "epoch": 0.6093120746096418, + "grad_norm": 0.7540647983551025, + "learning_rate": 1.61176696006755e-05, + "loss": 0.3219, + "step": 6468 + }, + { + "epoch": 0.6094062787028096, + "grad_norm": 0.9244742393493652, + "learning_rate": 1.6116475049171424e-05, + "loss": 0.3276, + "step": 6469 + }, + { + "epoch": 0.6095004827959775, + "grad_norm": 1.0800234079360962, + "learning_rate": 1.6115280358202258e-05, + "loss": 0.3412, + "step": 6470 + }, + { + "epoch": 0.6095946868891453, + "grad_norm": 0.7098768353462219, + "learning_rate": 1.6114085527795243e-05, + "loss": 0.3596, + "step": 6471 + }, + { + "epoch": 0.6096888909823132, + "grad_norm": 0.7159668207168579, + "learning_rate": 1.6112890557977627e-05, + "loss": 0.3013, + "step": 6472 + }, + { + "epoch": 0.609783095075481, + "grad_norm": 0.7337132096290588, + "learning_rate": 1.6111695448776646e-05, + "loss": 0.2818, + "step": 6473 + }, + { + "epoch": 0.6098772991686489, + "grad_norm": 0.9161654114723206, + "learning_rate": 1.6110500200219562e-05, + "loss": 0.351, + "step": 6474 + }, + { + "epoch": 0.6099715032618167, + "grad_norm": 0.7414738535881042, + "learning_rate": 1.610930481233362e-05, + "loss": 0.3271, + "step": 6475 + }, + { + "epoch": 0.6100657073549846, + "grad_norm": 0.7354120016098022, + "learning_rate": 1.610810928514608e-05, + "loss": 0.3817, + "step": 6476 + }, + { + "epoch": 0.6101599114481524, + "grad_norm": 0.6794965267181396, + "learning_rate": 1.6106913618684204e-05, + "loss": 0.3438, + "step": 6477 + }, + { + "epoch": 0.6102541155413203, + "grad_norm": 0.9572599530220032, + "learning_rate": 1.6105717812975254e-05, + "loss": 0.318, + "step": 6478 + }, + { + "epoch": 0.6103483196344881, + "grad_norm": 0.8186508417129517, + "learning_rate": 1.610452186804649e-05, + "loss": 0.3164, + "step": 6479 + }, + { + "epoch": 0.610442523727656, + "grad_norm": 0.7896221280097961, + "learning_rate": 1.610332578392519e-05, + "loss": 0.3159, + "step": 6480 + }, + { + "epoch": 0.6105367278208238, + "grad_norm": 0.6747741103172302, + "learning_rate": 1.610212956063862e-05, + "loss": 0.3161, + "step": 6481 + }, + { + "epoch": 0.6106309319139916, + "grad_norm": 0.728388249874115, + "learning_rate": 1.6100933198214065e-05, + "loss": 0.3035, + "step": 6482 + }, + { + "epoch": 0.6107251360071595, + "grad_norm": 0.7789629697799683, + "learning_rate": 1.6099736696678795e-05, + "loss": 0.3037, + "step": 6483 + }, + { + "epoch": 0.6108193401003273, + "grad_norm": 0.8035514950752258, + "learning_rate": 1.609854005606009e-05, + "loss": 0.3518, + "step": 6484 + }, + { + "epoch": 0.6109135441934952, + "grad_norm": 0.9325544834136963, + "learning_rate": 1.6097343276385248e-05, + "loss": 0.3471, + "step": 6485 + }, + { + "epoch": 0.611007748286663, + "grad_norm": 0.7909505367279053, + "learning_rate": 1.6096146357681545e-05, + "loss": 0.3618, + "step": 6486 + }, + { + "epoch": 0.6111019523798309, + "grad_norm": 0.7790225744247437, + "learning_rate": 1.6094949299976283e-05, + "loss": 0.3519, + "step": 6487 + }, + { + "epoch": 0.6111961564729987, + "grad_norm": 0.7841339111328125, + "learning_rate": 1.6093752103296742e-05, + "loss": 0.3627, + "step": 6488 + }, + { + "epoch": 0.6112903605661666, + "grad_norm": 0.7887298464775085, + "learning_rate": 1.6092554767670236e-05, + "loss": 0.3259, + "step": 6489 + }, + { + "epoch": 0.6113845646593344, + "grad_norm": 0.7558081746101379, + "learning_rate": 1.6091357293124055e-05, + "loss": 0.2976, + "step": 6490 + }, + { + "epoch": 0.6114787687525023, + "grad_norm": 0.7437493801116943, + "learning_rate": 1.6090159679685507e-05, + "loss": 0.3187, + "step": 6491 + }, + { + "epoch": 0.6115729728456701, + "grad_norm": 0.6939466595649719, + "learning_rate": 1.60889619273819e-05, + "loss": 0.3225, + "step": 6492 + }, + { + "epoch": 0.611667176938838, + "grad_norm": 0.7833172082901001, + "learning_rate": 1.6087764036240545e-05, + "loss": 0.3288, + "step": 6493 + }, + { + "epoch": 0.6117613810320058, + "grad_norm": 0.8887497186660767, + "learning_rate": 1.6086566006288755e-05, + "loss": 0.3882, + "step": 6494 + }, + { + "epoch": 0.6118555851251737, + "grad_norm": 0.776712954044342, + "learning_rate": 1.608536783755385e-05, + "loss": 0.2764, + "step": 6495 + }, + { + "epoch": 0.6119497892183415, + "grad_norm": 0.9048894643783569, + "learning_rate": 1.608416953006314e-05, + "loss": 0.3693, + "step": 6496 + }, + { + "epoch": 0.6120439933115094, + "grad_norm": 0.6698115468025208, + "learning_rate": 1.6082971083843962e-05, + "loss": 0.2707, + "step": 6497 + }, + { + "epoch": 0.6121381974046772, + "grad_norm": 0.786310613155365, + "learning_rate": 1.6081772498923635e-05, + "loss": 0.3426, + "step": 6498 + }, + { + "epoch": 0.6122324014978451, + "grad_norm": 0.9830306768417358, + "learning_rate": 1.6080573775329487e-05, + "loss": 0.3652, + "step": 6499 + }, + { + "epoch": 0.6123266055910129, + "grad_norm": 0.6997115612030029, + "learning_rate": 1.6079374913088855e-05, + "loss": 0.3163, + "step": 6500 + }, + { + "epoch": 0.6124208096841808, + "grad_norm": 0.7159838080406189, + "learning_rate": 1.607817591222907e-05, + "loss": 0.3141, + "step": 6501 + }, + { + "epoch": 0.6125150137773486, + "grad_norm": 0.6669812798500061, + "learning_rate": 1.607697677277748e-05, + "loss": 0.2776, + "step": 6502 + }, + { + "epoch": 0.6126092178705165, + "grad_norm": 0.7507301568984985, + "learning_rate": 1.6075777494761418e-05, + "loss": 0.3215, + "step": 6503 + }, + { + "epoch": 0.6127034219636843, + "grad_norm": 0.7340702414512634, + "learning_rate": 1.607457807820823e-05, + "loss": 0.3295, + "step": 6504 + }, + { + "epoch": 0.6127976260568522, + "grad_norm": 0.741966724395752, + "learning_rate": 1.6073378523145272e-05, + "loss": 0.3646, + "step": 6505 + }, + { + "epoch": 0.61289183015002, + "grad_norm": 0.9108197093009949, + "learning_rate": 1.6072178829599886e-05, + "loss": 0.3381, + "step": 6506 + }, + { + "epoch": 0.6129860342431879, + "grad_norm": 0.7761343121528625, + "learning_rate": 1.6070978997599436e-05, + "loss": 0.3381, + "step": 6507 + }, + { + "epoch": 0.6130802383363557, + "grad_norm": 0.6764536499977112, + "learning_rate": 1.6069779027171274e-05, + "loss": 0.3239, + "step": 6508 + }, + { + "epoch": 0.6131744424295236, + "grad_norm": 0.6758368611335754, + "learning_rate": 1.6068578918342764e-05, + "loss": 0.2754, + "step": 6509 + }, + { + "epoch": 0.6132686465226914, + "grad_norm": 0.7585486769676208, + "learning_rate": 1.6067378671141268e-05, + "loss": 0.3488, + "step": 6510 + }, + { + "epoch": 0.6133628506158593, + "grad_norm": 0.7682276368141174, + "learning_rate": 1.6066178285594155e-05, + "loss": 0.3452, + "step": 6511 + }, + { + "epoch": 0.6134570547090271, + "grad_norm": 0.7287978529930115, + "learning_rate": 1.6064977761728798e-05, + "loss": 0.322, + "step": 6512 + }, + { + "epoch": 0.613551258802195, + "grad_norm": 0.7907217144966125, + "learning_rate": 1.606377709957257e-05, + "loss": 0.375, + "step": 6513 + }, + { + "epoch": 0.6136454628953628, + "grad_norm": 0.7274413108825684, + "learning_rate": 1.606257629915284e-05, + "loss": 0.3072, + "step": 6514 + }, + { + "epoch": 0.6137396669885307, + "grad_norm": 0.8911492824554443, + "learning_rate": 1.6061375360497e-05, + "loss": 0.3349, + "step": 6515 + }, + { + "epoch": 0.6138338710816985, + "grad_norm": 0.711392879486084, + "learning_rate": 1.6060174283632423e-05, + "loss": 0.3151, + "step": 6516 + }, + { + "epoch": 0.6139280751748664, + "grad_norm": 0.7808783650398254, + "learning_rate": 1.6058973068586503e-05, + "loss": 0.3171, + "step": 6517 + }, + { + "epoch": 0.6140222792680342, + "grad_norm": 0.717170000076294, + "learning_rate": 1.6057771715386624e-05, + "loss": 0.2443, + "step": 6518 + }, + { + "epoch": 0.6141164833612021, + "grad_norm": 0.6050516366958618, + "learning_rate": 1.6056570224060184e-05, + "loss": 0.2431, + "step": 6519 + }, + { + "epoch": 0.6142106874543699, + "grad_norm": 0.7782800793647766, + "learning_rate": 1.6055368594634576e-05, + "loss": 0.3064, + "step": 6520 + }, + { + "epoch": 0.6143048915475378, + "grad_norm": 0.9019989967346191, + "learning_rate": 1.60541668271372e-05, + "loss": 0.3511, + "step": 6521 + }, + { + "epoch": 0.6143990956407056, + "grad_norm": 0.9744110703468323, + "learning_rate": 1.6052964921595457e-05, + "loss": 0.3424, + "step": 6522 + }, + { + "epoch": 0.6144932997338735, + "grad_norm": 0.7475957870483398, + "learning_rate": 1.605176287803675e-05, + "loss": 0.3177, + "step": 6523 + }, + { + "epoch": 0.6145875038270413, + "grad_norm": 0.856407880783081, + "learning_rate": 1.6050560696488493e-05, + "loss": 0.3021, + "step": 6524 + }, + { + "epoch": 0.6146817079202092, + "grad_norm": 0.7817702293395996, + "learning_rate": 1.6049358376978092e-05, + "loss": 0.3265, + "step": 6525 + }, + { + "epoch": 0.614775912013377, + "grad_norm": 1.1578670740127563, + "learning_rate": 1.6048155919532967e-05, + "loss": 0.3299, + "step": 6526 + }, + { + "epoch": 0.6148701161065449, + "grad_norm": 0.7789450883865356, + "learning_rate": 1.604695332418053e-05, + "loss": 0.3703, + "step": 6527 + }, + { + "epoch": 0.6149643201997127, + "grad_norm": 0.8720627427101135, + "learning_rate": 1.6045750590948214e-05, + "loss": 0.3612, + "step": 6528 + }, + { + "epoch": 0.6150585242928805, + "grad_norm": 0.7772655487060547, + "learning_rate": 1.6044547719863426e-05, + "loss": 0.3449, + "step": 6529 + }, + { + "epoch": 0.6151527283860484, + "grad_norm": 0.7157712578773499, + "learning_rate": 1.6043344710953604e-05, + "loss": 0.3074, + "step": 6530 + }, + { + "epoch": 0.6152469324792162, + "grad_norm": 0.6866297125816345, + "learning_rate": 1.6042141564246175e-05, + "loss": 0.3429, + "step": 6531 + }, + { + "epoch": 0.6153411365723841, + "grad_norm": 0.8729900121688843, + "learning_rate": 1.604093827976858e-05, + "loss": 0.398, + "step": 6532 + }, + { + "epoch": 0.6154353406655519, + "grad_norm": 0.872604489326477, + "learning_rate": 1.6039734857548243e-05, + "loss": 0.3161, + "step": 6533 + }, + { + "epoch": 0.6155295447587198, + "grad_norm": 0.9276250004768372, + "learning_rate": 1.6038531297612613e-05, + "loss": 0.371, + "step": 6534 + }, + { + "epoch": 0.6156237488518876, + "grad_norm": 0.739362895488739, + "learning_rate": 1.603732759998913e-05, + "loss": 0.3207, + "step": 6535 + }, + { + "epoch": 0.6157179529450555, + "grad_norm": 0.7881128787994385, + "learning_rate": 1.6036123764705245e-05, + "loss": 0.3218, + "step": 6536 + }, + { + "epoch": 0.6158121570382233, + "grad_norm": 0.7558095455169678, + "learning_rate": 1.6034919791788398e-05, + "loss": 0.3343, + "step": 6537 + }, + { + "epoch": 0.6159063611313912, + "grad_norm": 1.2704678773880005, + "learning_rate": 1.603371568126605e-05, + "loss": 0.3478, + "step": 6538 + }, + { + "epoch": 0.616000565224559, + "grad_norm": 0.8412241339683533, + "learning_rate": 1.6032511433165652e-05, + "loss": 0.3525, + "step": 6539 + }, + { + "epoch": 0.6160947693177269, + "grad_norm": 0.8343496918678284, + "learning_rate": 1.6031307047514665e-05, + "loss": 0.3353, + "step": 6540 + }, + { + "epoch": 0.6161889734108947, + "grad_norm": 0.6275550723075867, + "learning_rate": 1.6030102524340547e-05, + "loss": 0.2906, + "step": 6541 + }, + { + "epoch": 0.6162831775040626, + "grad_norm": 0.8872958421707153, + "learning_rate": 1.6028897863670768e-05, + "loss": 0.3251, + "step": 6542 + }, + { + "epoch": 0.6163773815972304, + "grad_norm": 0.7549136877059937, + "learning_rate": 1.602769306553279e-05, + "loss": 0.3325, + "step": 6543 + }, + { + "epoch": 0.6164715856903983, + "grad_norm": 0.6889528632164001, + "learning_rate": 1.6026488129954092e-05, + "loss": 0.3275, + "step": 6544 + }, + { + "epoch": 0.6165657897835661, + "grad_norm": 0.7459689378738403, + "learning_rate": 1.6025283056962143e-05, + "loss": 0.33, + "step": 6545 + }, + { + "epoch": 0.616659993876734, + "grad_norm": 0.7205366492271423, + "learning_rate": 1.602407784658442e-05, + "loss": 0.3013, + "step": 6546 + }, + { + "epoch": 0.6167541979699018, + "grad_norm": 0.7754681706428528, + "learning_rate": 1.6022872498848408e-05, + "loss": 0.3339, + "step": 6547 + }, + { + "epoch": 0.6168484020630697, + "grad_norm": 0.899183988571167, + "learning_rate": 1.602166701378159e-05, + "loss": 0.3756, + "step": 6548 + }, + { + "epoch": 0.6169426061562375, + "grad_norm": 0.8031266331672668, + "learning_rate": 1.6020461391411452e-05, + "loss": 0.2941, + "step": 6549 + }, + { + "epoch": 0.6170368102494054, + "grad_norm": 0.711356520652771, + "learning_rate": 1.601925563176548e-05, + "loss": 0.3009, + "step": 6550 + }, + { + "epoch": 0.6171310143425732, + "grad_norm": 0.7564597129821777, + "learning_rate": 1.6018049734871174e-05, + "loss": 0.3229, + "step": 6551 + }, + { + "epoch": 0.6172252184357411, + "grad_norm": 0.8064227104187012, + "learning_rate": 1.6016843700756025e-05, + "loss": 0.3266, + "step": 6552 + }, + { + "epoch": 0.6173194225289089, + "grad_norm": 0.6288996338844299, + "learning_rate": 1.6015637529447533e-05, + "loss": 0.2794, + "step": 6553 + }, + { + "epoch": 0.6174136266220768, + "grad_norm": 0.8485068082809448, + "learning_rate": 1.6014431220973205e-05, + "loss": 0.3633, + "step": 6554 + }, + { + "epoch": 0.6175078307152446, + "grad_norm": 0.7502201199531555, + "learning_rate": 1.601322477536054e-05, + "loss": 0.3425, + "step": 6555 + }, + { + "epoch": 0.6176020348084125, + "grad_norm": 0.7736746072769165, + "learning_rate": 1.6012018192637052e-05, + "loss": 0.3416, + "step": 6556 + }, + { + "epoch": 0.6176962389015803, + "grad_norm": 0.8648883104324341, + "learning_rate": 1.6010811472830253e-05, + "loss": 0.3529, + "step": 6557 + }, + { + "epoch": 0.6177904429947482, + "grad_norm": 0.7279908657073975, + "learning_rate": 1.6009604615967656e-05, + "loss": 0.3293, + "step": 6558 + }, + { + "epoch": 0.617884647087916, + "grad_norm": 0.8421671986579895, + "learning_rate": 1.6008397622076778e-05, + "loss": 0.3428, + "step": 6559 + }, + { + "epoch": 0.6179788511810839, + "grad_norm": 0.8226191401481628, + "learning_rate": 1.6007190491185144e-05, + "loss": 0.3391, + "step": 6560 + }, + { + "epoch": 0.6180730552742517, + "grad_norm": 0.745063066482544, + "learning_rate": 1.6005983223320276e-05, + "loss": 0.3136, + "step": 6561 + }, + { + "epoch": 0.6181672593674196, + "grad_norm": 0.8741940855979919, + "learning_rate": 1.60047758185097e-05, + "loss": 0.3044, + "step": 6562 + }, + { + "epoch": 0.6182614634605874, + "grad_norm": 0.7277724146842957, + "learning_rate": 1.600356827678095e-05, + "loss": 0.2929, + "step": 6563 + }, + { + "epoch": 0.6183556675537553, + "grad_norm": 0.7360935211181641, + "learning_rate": 1.600236059816156e-05, + "loss": 0.2837, + "step": 6564 + }, + { + "epoch": 0.6184498716469231, + "grad_norm": 0.9177241325378418, + "learning_rate": 1.6001152782679067e-05, + "loss": 0.371, + "step": 6565 + }, + { + "epoch": 0.618544075740091, + "grad_norm": 0.7400040626525879, + "learning_rate": 1.5999944830361007e-05, + "loss": 0.3262, + "step": 6566 + }, + { + "epoch": 0.6186382798332588, + "grad_norm": 0.716580867767334, + "learning_rate": 1.5998736741234922e-05, + "loss": 0.3269, + "step": 6567 + }, + { + "epoch": 0.6187324839264267, + "grad_norm": 0.7563260793685913, + "learning_rate": 1.5997528515328367e-05, + "loss": 0.3266, + "step": 6568 + }, + { + "epoch": 0.6188266880195945, + "grad_norm": 0.8368607759475708, + "learning_rate": 1.5996320152668886e-05, + "loss": 0.3438, + "step": 6569 + }, + { + "epoch": 0.6189208921127622, + "grad_norm": 0.7384721040725708, + "learning_rate": 1.5995111653284036e-05, + "loss": 0.3393, + "step": 6570 + }, + { + "epoch": 0.6190150962059301, + "grad_norm": 0.7471886277198792, + "learning_rate": 1.5993903017201363e-05, + "loss": 0.3232, + "step": 6571 + }, + { + "epoch": 0.6191093002990979, + "grad_norm": 0.7168338298797607, + "learning_rate": 1.5992694244448434e-05, + "loss": 0.3572, + "step": 6572 + }, + { + "epoch": 0.6192035043922658, + "grad_norm": 0.8276976346969604, + "learning_rate": 1.5991485335052807e-05, + "loss": 0.3045, + "step": 6573 + }, + { + "epoch": 0.6192977084854336, + "grad_norm": 0.7061718702316284, + "learning_rate": 1.5990276289042047e-05, + "loss": 0.3401, + "step": 6574 + }, + { + "epoch": 0.6193919125786015, + "grad_norm": 0.7345423698425293, + "learning_rate": 1.5989067106443722e-05, + "loss": 0.3714, + "step": 6575 + }, + { + "epoch": 0.6194861166717693, + "grad_norm": 0.7423486113548279, + "learning_rate": 1.598785778728541e-05, + "loss": 0.345, + "step": 6576 + }, + { + "epoch": 0.6195803207649372, + "grad_norm": 0.7642102837562561, + "learning_rate": 1.5986648331594678e-05, + "loss": 0.3387, + "step": 6577 + }, + { + "epoch": 0.619674524858105, + "grad_norm": 0.686305820941925, + "learning_rate": 1.5985438739399107e-05, + "loss": 0.2932, + "step": 6578 + }, + { + "epoch": 0.6197687289512729, + "grad_norm": 0.8475738763809204, + "learning_rate": 1.598422901072627e-05, + "loss": 0.3521, + "step": 6579 + }, + { + "epoch": 0.6198629330444407, + "grad_norm": 0.8894240260124207, + "learning_rate": 1.5983019145603766e-05, + "loss": 0.3937, + "step": 6580 + }, + { + "epoch": 0.6199571371376086, + "grad_norm": 0.7335094213485718, + "learning_rate": 1.5981809144059167e-05, + "loss": 0.3217, + "step": 6581 + }, + { + "epoch": 0.6200513412307764, + "grad_norm": 0.6611246466636658, + "learning_rate": 1.598059900612007e-05, + "loss": 0.2838, + "step": 6582 + }, + { + "epoch": 0.6201455453239443, + "grad_norm": 0.7276688814163208, + "learning_rate": 1.5979388731814063e-05, + "loss": 0.3131, + "step": 6583 + }, + { + "epoch": 0.6202397494171121, + "grad_norm": 0.8888239860534668, + "learning_rate": 1.5978178321168746e-05, + "loss": 0.3382, + "step": 6584 + }, + { + "epoch": 0.62033395351028, + "grad_norm": 0.9546866416931152, + "learning_rate": 1.597696777421172e-05, + "loss": 0.285, + "step": 6585 + }, + { + "epoch": 0.6204281576034478, + "grad_norm": 0.7835800647735596, + "learning_rate": 1.5975757090970586e-05, + "loss": 0.3688, + "step": 6586 + }, + { + "epoch": 0.6205223616966157, + "grad_norm": 1.0066163539886475, + "learning_rate": 1.5974546271472952e-05, + "loss": 0.3065, + "step": 6587 + }, + { + "epoch": 0.6206165657897835, + "grad_norm": 0.7106859087944031, + "learning_rate": 1.5973335315746416e-05, + "loss": 0.3054, + "step": 6588 + }, + { + "epoch": 0.6207107698829514, + "grad_norm": 0.8374449610710144, + "learning_rate": 1.59721242238186e-05, + "loss": 0.3169, + "step": 6589 + }, + { + "epoch": 0.6208049739761192, + "grad_norm": 0.702578067779541, + "learning_rate": 1.597091299571712e-05, + "loss": 0.3196, + "step": 6590 + }, + { + "epoch": 0.6208991780692871, + "grad_norm": 0.7177067995071411, + "learning_rate": 1.596970163146958e-05, + "loss": 0.3182, + "step": 6591 + }, + { + "epoch": 0.6209933821624549, + "grad_norm": 0.7188496589660645, + "learning_rate": 1.5968490131103618e-05, + "loss": 0.2904, + "step": 6592 + }, + { + "epoch": 0.6210875862556228, + "grad_norm": 0.7323099374771118, + "learning_rate": 1.5967278494646847e-05, + "loss": 0.3555, + "step": 6593 + }, + { + "epoch": 0.6211817903487906, + "grad_norm": 0.8324366211891174, + "learning_rate": 1.5966066722126897e-05, + "loss": 0.2937, + "step": 6594 + }, + { + "epoch": 0.6212759944419585, + "grad_norm": 0.752227246761322, + "learning_rate": 1.5964854813571403e-05, + "loss": 0.2981, + "step": 6595 + }, + { + "epoch": 0.6213701985351263, + "grad_norm": 0.7920807003974915, + "learning_rate": 1.596364276900799e-05, + "loss": 0.3655, + "step": 6596 + }, + { + "epoch": 0.6214644026282942, + "grad_norm": 0.7501734495162964, + "learning_rate": 1.5962430588464302e-05, + "loss": 0.3009, + "step": 6597 + }, + { + "epoch": 0.621558606721462, + "grad_norm": 0.8034332394599915, + "learning_rate": 1.5961218271967976e-05, + "loss": 0.3655, + "step": 6598 + }, + { + "epoch": 0.6216528108146299, + "grad_norm": 0.6022971272468567, + "learning_rate": 1.596000581954665e-05, + "loss": 0.2758, + "step": 6599 + }, + { + "epoch": 0.6217470149077977, + "grad_norm": 0.7588047385215759, + "learning_rate": 1.595879323122798e-05, + "loss": 0.3601, + "step": 6600 + }, + { + "epoch": 0.6218412190009656, + "grad_norm": 0.7950819730758667, + "learning_rate": 1.5957580507039604e-05, + "loss": 0.3183, + "step": 6601 + }, + { + "epoch": 0.6219354230941334, + "grad_norm": 0.7252808213233948, + "learning_rate": 1.5956367647009185e-05, + "loss": 0.3409, + "step": 6602 + }, + { + "epoch": 0.6220296271873013, + "grad_norm": 0.7759315371513367, + "learning_rate": 1.5955154651164368e-05, + "loss": 0.3408, + "step": 6603 + }, + { + "epoch": 0.6221238312804691, + "grad_norm": 0.788898766040802, + "learning_rate": 1.595394151953281e-05, + "loss": 0.3177, + "step": 6604 + }, + { + "epoch": 0.622218035373637, + "grad_norm": 0.7779346108436584, + "learning_rate": 1.5952728252142183e-05, + "loss": 0.3513, + "step": 6605 + }, + { + "epoch": 0.6223122394668048, + "grad_norm": 0.779161810874939, + "learning_rate": 1.5951514849020147e-05, + "loss": 0.3713, + "step": 6606 + }, + { + "epoch": 0.6224064435599727, + "grad_norm": 0.7594448924064636, + "learning_rate": 1.5950301310194366e-05, + "loss": 0.3097, + "step": 6607 + }, + { + "epoch": 0.6225006476531405, + "grad_norm": 0.7905697822570801, + "learning_rate": 1.5949087635692512e-05, + "loss": 0.3418, + "step": 6608 + }, + { + "epoch": 0.6225948517463084, + "grad_norm": 0.936529815196991, + "learning_rate": 1.594787382554226e-05, + "loss": 0.3151, + "step": 6609 + }, + { + "epoch": 0.6226890558394762, + "grad_norm": 0.7440918684005737, + "learning_rate": 1.5946659879771288e-05, + "loss": 0.3529, + "step": 6610 + }, + { + "epoch": 0.622783259932644, + "grad_norm": 0.7420088648796082, + "learning_rate": 1.5945445798407274e-05, + "loss": 0.3043, + "step": 6611 + }, + { + "epoch": 0.6228774640258119, + "grad_norm": 0.7822197675704956, + "learning_rate": 1.59442315814779e-05, + "loss": 0.3951, + "step": 6612 + }, + { + "epoch": 0.6229716681189797, + "grad_norm": 0.7788919806480408, + "learning_rate": 1.594301722901085e-05, + "loss": 0.3468, + "step": 6613 + }, + { + "epoch": 0.6230658722121476, + "grad_norm": 0.833311915397644, + "learning_rate": 1.594180274103382e-05, + "loss": 0.3853, + "step": 6614 + }, + { + "epoch": 0.6231600763053154, + "grad_norm": 0.6766711473464966, + "learning_rate": 1.5940588117574497e-05, + "loss": 0.2991, + "step": 6615 + }, + { + "epoch": 0.6232542803984833, + "grad_norm": 0.717984139919281, + "learning_rate": 1.593937335866058e-05, + "loss": 0.2994, + "step": 6616 + }, + { + "epoch": 0.6233484844916511, + "grad_norm": 0.8255095481872559, + "learning_rate": 1.5938158464319763e-05, + "loss": 0.3249, + "step": 6617 + }, + { + "epoch": 0.623442688584819, + "grad_norm": 0.6872636675834656, + "learning_rate": 1.5936943434579748e-05, + "loss": 0.2778, + "step": 6618 + }, + { + "epoch": 0.6235368926779868, + "grad_norm": 0.7614285945892334, + "learning_rate": 1.593572826946824e-05, + "loss": 0.3422, + "step": 6619 + }, + { + "epoch": 0.6236310967711547, + "grad_norm": 0.7388732433319092, + "learning_rate": 1.5934512969012953e-05, + "loss": 0.3095, + "step": 6620 + }, + { + "epoch": 0.6237253008643225, + "grad_norm": 0.6868925094604492, + "learning_rate": 1.593329753324159e-05, + "loss": 0.3295, + "step": 6621 + }, + { + "epoch": 0.6238195049574904, + "grad_norm": 1.0019731521606445, + "learning_rate": 1.5932081962181863e-05, + "loss": 0.3794, + "step": 6622 + }, + { + "epoch": 0.6239137090506582, + "grad_norm": 0.787661075592041, + "learning_rate": 1.5930866255861498e-05, + "loss": 0.3288, + "step": 6623 + }, + { + "epoch": 0.6240079131438261, + "grad_norm": 0.8396860957145691, + "learning_rate": 1.592965041430821e-05, + "loss": 0.3159, + "step": 6624 + }, + { + "epoch": 0.6241021172369939, + "grad_norm": 0.9020205140113831, + "learning_rate": 1.5928434437549724e-05, + "loss": 0.3365, + "step": 6625 + }, + { + "epoch": 0.6241963213301618, + "grad_norm": 0.8161234855651855, + "learning_rate": 1.5927218325613758e-05, + "loss": 0.3024, + "step": 6626 + }, + { + "epoch": 0.6242905254233296, + "grad_norm": 0.8369538187980652, + "learning_rate": 1.592600207852805e-05, + "loss": 0.3211, + "step": 6627 + }, + { + "epoch": 0.6243847295164975, + "grad_norm": 0.7108891606330872, + "learning_rate": 1.592478569632033e-05, + "loss": 0.309, + "step": 6628 + }, + { + "epoch": 0.6244789336096653, + "grad_norm": 0.7067973017692566, + "learning_rate": 1.5923569179018333e-05, + "loss": 0.3026, + "step": 6629 + }, + { + "epoch": 0.6245731377028332, + "grad_norm": 0.8192605972290039, + "learning_rate": 1.5922352526649803e-05, + "loss": 0.3017, + "step": 6630 + }, + { + "epoch": 0.624667341796001, + "grad_norm": 0.8145104050636292, + "learning_rate": 1.5921135739242473e-05, + "loss": 0.3164, + "step": 6631 + }, + { + "epoch": 0.6247615458891689, + "grad_norm": 0.731330931186676, + "learning_rate": 1.591991881682409e-05, + "loss": 0.312, + "step": 6632 + }, + { + "epoch": 0.6248557499823367, + "grad_norm": 0.7608345150947571, + "learning_rate": 1.59187017594224e-05, + "loss": 0.332, + "step": 6633 + }, + { + "epoch": 0.6249499540755046, + "grad_norm": 0.7256740927696228, + "learning_rate": 1.591748456706516e-05, + "loss": 0.3063, + "step": 6634 + }, + { + "epoch": 0.6250441581686724, + "grad_norm": 0.8325484991073608, + "learning_rate": 1.591626723978012e-05, + "loss": 0.3611, + "step": 6635 + }, + { + "epoch": 0.6251383622618403, + "grad_norm": 0.7092111706733704, + "learning_rate": 1.5915049777595036e-05, + "loss": 0.3152, + "step": 6636 + }, + { + "epoch": 0.6252325663550081, + "grad_norm": 0.7213624119758606, + "learning_rate": 1.591383218053767e-05, + "loss": 0.3182, + "step": 6637 + }, + { + "epoch": 0.625326770448176, + "grad_norm": 0.7438649535179138, + "learning_rate": 1.5912614448635784e-05, + "loss": 0.3343, + "step": 6638 + }, + { + "epoch": 0.6254209745413438, + "grad_norm": 1.3518234491348267, + "learning_rate": 1.5911396581917144e-05, + "loss": 0.332, + "step": 6639 + }, + { + "epoch": 0.6255151786345117, + "grad_norm": 0.7216207981109619, + "learning_rate": 1.5910178580409522e-05, + "loss": 0.2506, + "step": 6640 + }, + { + "epoch": 0.6256093827276795, + "grad_norm": 0.7129900455474854, + "learning_rate": 1.5908960444140686e-05, + "loss": 0.2791, + "step": 6641 + }, + { + "epoch": 0.6257035868208474, + "grad_norm": 0.7139818072319031, + "learning_rate": 1.5907742173138415e-05, + "loss": 0.3754, + "step": 6642 + }, + { + "epoch": 0.6257977909140152, + "grad_norm": 0.730042576789856, + "learning_rate": 1.5906523767430485e-05, + "loss": 0.3263, + "step": 6643 + }, + { + "epoch": 0.6258919950071831, + "grad_norm": 0.7480024695396423, + "learning_rate": 1.590530522704468e-05, + "loss": 0.308, + "step": 6644 + }, + { + "epoch": 0.6259861991003509, + "grad_norm": 0.7542113661766052, + "learning_rate": 1.590408655200878e-05, + "loss": 0.3602, + "step": 6645 + }, + { + "epoch": 0.6260804031935188, + "grad_norm": 0.7510566711425781, + "learning_rate": 1.5902867742350578e-05, + "loss": 0.3131, + "step": 6646 + }, + { + "epoch": 0.6261746072866866, + "grad_norm": 0.8571420907974243, + "learning_rate": 1.5901648798097863e-05, + "loss": 0.3479, + "step": 6647 + }, + { + "epoch": 0.6262688113798545, + "grad_norm": 0.6914628744125366, + "learning_rate": 1.5900429719278428e-05, + "loss": 0.2876, + "step": 6648 + }, + { + "epoch": 0.6263630154730223, + "grad_norm": 0.8220464587211609, + "learning_rate": 1.5899210505920066e-05, + "loss": 0.3221, + "step": 6649 + }, + { + "epoch": 0.6264572195661902, + "grad_norm": 0.7591971755027771, + "learning_rate": 1.5897991158050586e-05, + "loss": 0.34, + "step": 6650 + }, + { + "epoch": 0.626551423659358, + "grad_norm": 0.7553174495697021, + "learning_rate": 1.5896771675697786e-05, + "loss": 0.2989, + "step": 6651 + }, + { + "epoch": 0.6266456277525259, + "grad_norm": 0.6980757117271423, + "learning_rate": 1.589555205888947e-05, + "loss": 0.293, + "step": 6652 + }, + { + "epoch": 0.6267398318456937, + "grad_norm": 0.7261850833892822, + "learning_rate": 1.589433230765345e-05, + "loss": 0.3286, + "step": 6653 + }, + { + "epoch": 0.6268340359388616, + "grad_norm": 0.6873316168785095, + "learning_rate": 1.589311242201754e-05, + "loss": 0.2959, + "step": 6654 + }, + { + "epoch": 0.6269282400320294, + "grad_norm": 0.7370021343231201, + "learning_rate": 1.589189240200955e-05, + "loss": 0.3687, + "step": 6655 + }, + { + "epoch": 0.6270224441251973, + "grad_norm": 0.8008817434310913, + "learning_rate": 1.5890672247657303e-05, + "loss": 0.3434, + "step": 6656 + }, + { + "epoch": 0.6271166482183651, + "grad_norm": 0.8398225903511047, + "learning_rate": 1.588945195898862e-05, + "loss": 0.3857, + "step": 6657 + }, + { + "epoch": 0.627210852311533, + "grad_norm": 1.046363115310669, + "learning_rate": 1.588823153603132e-05, + "loss": 0.3579, + "step": 6658 + }, + { + "epoch": 0.6273050564047008, + "grad_norm": 0.7483053803443909, + "learning_rate": 1.5887010978813235e-05, + "loss": 0.3334, + "step": 6659 + }, + { + "epoch": 0.6273992604978686, + "grad_norm": 0.7133119106292725, + "learning_rate": 1.58857902873622e-05, + "loss": 0.3184, + "step": 6660 + }, + { + "epoch": 0.6274934645910365, + "grad_norm": 0.7998007535934448, + "learning_rate": 1.588456946170604e-05, + "loss": 0.3422, + "step": 6661 + }, + { + "epoch": 0.6275876686842043, + "grad_norm": 0.7578026056289673, + "learning_rate": 1.5883348501872597e-05, + "loss": 0.3168, + "step": 6662 + }, + { + "epoch": 0.6276818727773722, + "grad_norm": 0.7887904644012451, + "learning_rate": 1.588212740788971e-05, + "loss": 0.3716, + "step": 6663 + }, + { + "epoch": 0.62777607687054, + "grad_norm": 0.7964813709259033, + "learning_rate": 1.5880906179785222e-05, + "loss": 0.3104, + "step": 6664 + }, + { + "epoch": 0.6278702809637079, + "grad_norm": 0.7783161997795105, + "learning_rate": 1.5879684817586974e-05, + "loss": 0.3619, + "step": 6665 + }, + { + "epoch": 0.6279644850568757, + "grad_norm": 0.9833865761756897, + "learning_rate": 1.5878463321322822e-05, + "loss": 0.3808, + "step": 6666 + }, + { + "epoch": 0.6280586891500436, + "grad_norm": 0.6703827977180481, + "learning_rate": 1.587724169102061e-05, + "loss": 0.3076, + "step": 6667 + }, + { + "epoch": 0.6281528932432114, + "grad_norm": 0.7270950675010681, + "learning_rate": 1.5876019926708207e-05, + "loss": 0.2927, + "step": 6668 + }, + { + "epoch": 0.6282470973363793, + "grad_norm": 0.8654983043670654, + "learning_rate": 1.5874798028413456e-05, + "loss": 0.3234, + "step": 6669 + }, + { + "epoch": 0.6283413014295471, + "grad_norm": 0.7640737891197205, + "learning_rate": 1.587357599616423e-05, + "loss": 0.3389, + "step": 6670 + }, + { + "epoch": 0.628435505522715, + "grad_norm": 2.5828471183776855, + "learning_rate": 1.587235382998838e-05, + "loss": 0.309, + "step": 6671 + }, + { + "epoch": 0.6285297096158828, + "grad_norm": 0.6812258362770081, + "learning_rate": 1.5871131529913782e-05, + "loss": 0.2965, + "step": 6672 + }, + { + "epoch": 0.6286239137090507, + "grad_norm": 0.7365323901176453, + "learning_rate": 1.586990909596831e-05, + "loss": 0.3237, + "step": 6673 + }, + { + "epoch": 0.6287181178022185, + "grad_norm": 0.8122865557670593, + "learning_rate": 1.586868652817983e-05, + "loss": 0.3093, + "step": 6674 + }, + { + "epoch": 0.6288123218953864, + "grad_norm": 0.7173646092414856, + "learning_rate": 1.5867463826576223e-05, + "loss": 0.3315, + "step": 6675 + }, + { + "epoch": 0.6289065259885542, + "grad_norm": 0.6234768629074097, + "learning_rate": 1.5866240991185365e-05, + "loss": 0.2994, + "step": 6676 + }, + { + "epoch": 0.6290007300817221, + "grad_norm": 0.6665681600570679, + "learning_rate": 1.586501802203514e-05, + "loss": 0.3055, + "step": 6677 + }, + { + "epoch": 0.6290949341748899, + "grad_norm": 0.6856829524040222, + "learning_rate": 1.586379491915343e-05, + "loss": 0.2913, + "step": 6678 + }, + { + "epoch": 0.6291891382680578, + "grad_norm": 0.7844224572181702, + "learning_rate": 1.5862571682568135e-05, + "loss": 0.3622, + "step": 6679 + }, + { + "epoch": 0.6292833423612256, + "grad_norm": 0.7180244326591492, + "learning_rate": 1.5861348312307138e-05, + "loss": 0.3065, + "step": 6680 + }, + { + "epoch": 0.6293775464543935, + "grad_norm": 0.7316251993179321, + "learning_rate": 1.586012480839833e-05, + "loss": 0.3329, + "step": 6681 + }, + { + "epoch": 0.6294717505475613, + "grad_norm": 0.6861616373062134, + "learning_rate": 1.5858901170869613e-05, + "loss": 0.3216, + "step": 6682 + }, + { + "epoch": 0.6295659546407292, + "grad_norm": 0.6843136548995972, + "learning_rate": 1.585767739974889e-05, + "loss": 0.3171, + "step": 6683 + }, + { + "epoch": 0.629660158733897, + "grad_norm": 0.7426478266716003, + "learning_rate": 1.5856453495064067e-05, + "loss": 0.3227, + "step": 6684 + }, + { + "epoch": 0.6297543628270649, + "grad_norm": 0.6927346587181091, + "learning_rate": 1.5855229456843045e-05, + "loss": 0.3399, + "step": 6685 + }, + { + "epoch": 0.6298485669202327, + "grad_norm": 0.7568566203117371, + "learning_rate": 1.5854005285113734e-05, + "loss": 0.3248, + "step": 6686 + }, + { + "epoch": 0.6299427710134006, + "grad_norm": 0.703330397605896, + "learning_rate": 1.585278097990405e-05, + "loss": 0.307, + "step": 6687 + }, + { + "epoch": 0.6300369751065684, + "grad_norm": 0.7454832792282104, + "learning_rate": 1.585155654124191e-05, + "loss": 0.305, + "step": 6688 + }, + { + "epoch": 0.6301311791997363, + "grad_norm": 0.7922065258026123, + "learning_rate": 1.5850331969155228e-05, + "loss": 0.3402, + "step": 6689 + }, + { + "epoch": 0.6302253832929041, + "grad_norm": 0.7510750889778137, + "learning_rate": 1.5849107263671934e-05, + "loss": 0.3135, + "step": 6690 + }, + { + "epoch": 0.630319587386072, + "grad_norm": 0.8461636304855347, + "learning_rate": 1.5847882424819944e-05, + "loss": 0.3373, + "step": 6691 + }, + { + "epoch": 0.6304137914792398, + "grad_norm": 0.7858095765113831, + "learning_rate": 1.584665745262719e-05, + "loss": 0.3472, + "step": 6692 + }, + { + "epoch": 0.6305079955724077, + "grad_norm": 0.8112091422080994, + "learning_rate": 1.584543234712161e-05, + "loss": 0.3363, + "step": 6693 + }, + { + "epoch": 0.6306021996655755, + "grad_norm": 0.7483133673667908, + "learning_rate": 1.5844207108331125e-05, + "loss": 0.3419, + "step": 6694 + }, + { + "epoch": 0.6306964037587434, + "grad_norm": 0.7431141138076782, + "learning_rate": 1.5842981736283686e-05, + "loss": 0.3081, + "step": 6695 + }, + { + "epoch": 0.6307906078519112, + "grad_norm": 0.7465755939483643, + "learning_rate": 1.5841756231007224e-05, + "loss": 0.3327, + "step": 6696 + }, + { + "epoch": 0.6308848119450791, + "grad_norm": 0.7578116059303284, + "learning_rate": 1.5840530592529685e-05, + "loss": 0.2924, + "step": 6697 + }, + { + "epoch": 0.6309790160382469, + "grad_norm": 0.7328236103057861, + "learning_rate": 1.5839304820879016e-05, + "loss": 0.3, + "step": 6698 + }, + { + "epoch": 0.6310732201314148, + "grad_norm": 0.6725963950157166, + "learning_rate": 1.5838078916083168e-05, + "loss": 0.3046, + "step": 6699 + }, + { + "epoch": 0.6311674242245826, + "grad_norm": 0.8018659949302673, + "learning_rate": 1.5836852878170095e-05, + "loss": 0.3182, + "step": 6700 + }, + { + "epoch": 0.6312616283177505, + "grad_norm": 0.9003369808197021, + "learning_rate": 1.5835626707167742e-05, + "loss": 0.3683, + "step": 6701 + }, + { + "epoch": 0.6313558324109183, + "grad_norm": 0.6894645094871521, + "learning_rate": 1.583440040310408e-05, + "loss": 0.2916, + "step": 6702 + }, + { + "epoch": 0.6314500365040862, + "grad_norm": 0.7459478378295898, + "learning_rate": 1.583317396600707e-05, + "loss": 0.3374, + "step": 6703 + }, + { + "epoch": 0.631544240597254, + "grad_norm": 0.8423133492469788, + "learning_rate": 1.5831947395904662e-05, + "loss": 0.3395, + "step": 6704 + }, + { + "epoch": 0.6316384446904219, + "grad_norm": 0.8305002450942993, + "learning_rate": 1.583072069282484e-05, + "loss": 0.3492, + "step": 6705 + }, + { + "epoch": 0.6317326487835897, + "grad_norm": 0.8187609314918518, + "learning_rate": 1.582949385679557e-05, + "loss": 0.3322, + "step": 6706 + }, + { + "epoch": 0.6318268528767575, + "grad_norm": 0.7323743104934692, + "learning_rate": 1.582826688784482e-05, + "loss": 0.2975, + "step": 6707 + }, + { + "epoch": 0.6319210569699254, + "grad_norm": 0.7921598553657532, + "learning_rate": 1.5827039786000574e-05, + "loss": 0.3277, + "step": 6708 + }, + { + "epoch": 0.6320152610630931, + "grad_norm": 0.6891117095947266, + "learning_rate": 1.582581255129081e-05, + "loss": 0.3077, + "step": 6709 + }, + { + "epoch": 0.632109465156261, + "grad_norm": 0.7351700067520142, + "learning_rate": 1.582458518374351e-05, + "loss": 0.2881, + "step": 6710 + }, + { + "epoch": 0.6322036692494288, + "grad_norm": 0.8305718302726746, + "learning_rate": 1.582335768338666e-05, + "loss": 0.3465, + "step": 6711 + }, + { + "epoch": 0.6322978733425967, + "grad_norm": 0.759661078453064, + "learning_rate": 1.582213005024825e-05, + "loss": 0.3083, + "step": 6712 + }, + { + "epoch": 0.6323920774357645, + "grad_norm": 0.716044008731842, + "learning_rate": 1.5820902284356267e-05, + "loss": 0.3346, + "step": 6713 + }, + { + "epoch": 0.6324862815289324, + "grad_norm": 0.7560633420944214, + "learning_rate": 1.5819674385738712e-05, + "loss": 0.3015, + "step": 6714 + }, + { + "epoch": 0.6325804856221002, + "grad_norm": 0.9049429893493652, + "learning_rate": 1.581844635442358e-05, + "loss": 0.3334, + "step": 6715 + }, + { + "epoch": 0.6326746897152681, + "grad_norm": 0.7073135375976562, + "learning_rate": 1.5817218190438872e-05, + "loss": 0.2907, + "step": 6716 + }, + { + "epoch": 0.6327688938084359, + "grad_norm": 0.9328410625457764, + "learning_rate": 1.5815989893812594e-05, + "loss": 0.343, + "step": 6717 + }, + { + "epoch": 0.6328630979016038, + "grad_norm": 0.8090577721595764, + "learning_rate": 1.5814761464572753e-05, + "loss": 0.3365, + "step": 6718 + }, + { + "epoch": 0.6329573019947716, + "grad_norm": 0.765785276889801, + "learning_rate": 1.5813532902747354e-05, + "loss": 0.3213, + "step": 6719 + }, + { + "epoch": 0.6330515060879395, + "grad_norm": 0.8419969081878662, + "learning_rate": 1.581230420836442e-05, + "loss": 0.3207, + "step": 6720 + }, + { + "epoch": 0.6331457101811073, + "grad_norm": 0.8804196119308472, + "learning_rate": 1.5811075381451954e-05, + "loss": 0.3305, + "step": 6721 + }, + { + "epoch": 0.6332399142742752, + "grad_norm": 0.7283568978309631, + "learning_rate": 1.5809846422037986e-05, + "loss": 0.2971, + "step": 6722 + }, + { + "epoch": 0.633334118367443, + "grad_norm": 0.7433123588562012, + "learning_rate": 1.5808617330150535e-05, + "loss": 0.3753, + "step": 6723 + }, + { + "epoch": 0.6334283224606109, + "grad_norm": 0.687572181224823, + "learning_rate": 1.5807388105817625e-05, + "loss": 0.304, + "step": 6724 + }, + { + "epoch": 0.6335225265537787, + "grad_norm": 0.8676371574401855, + "learning_rate": 1.5806158749067285e-05, + "loss": 0.3183, + "step": 6725 + }, + { + "epoch": 0.6336167306469466, + "grad_norm": 0.9256935715675354, + "learning_rate": 1.5804929259927545e-05, + "loss": 0.3332, + "step": 6726 + }, + { + "epoch": 0.6337109347401144, + "grad_norm": 0.760909378528595, + "learning_rate": 1.5803699638426442e-05, + "loss": 0.2998, + "step": 6727 + }, + { + "epoch": 0.6338051388332823, + "grad_norm": 0.725662112236023, + "learning_rate": 1.580246988459201e-05, + "loss": 0.3273, + "step": 6728 + }, + { + "epoch": 0.6338993429264501, + "grad_norm": 0.785145103931427, + "learning_rate": 1.5801239998452294e-05, + "loss": 0.2735, + "step": 6729 + }, + { + "epoch": 0.633993547019618, + "grad_norm": 0.8270518183708191, + "learning_rate": 1.5800009980035334e-05, + "loss": 0.3611, + "step": 6730 + }, + { + "epoch": 0.6340877511127858, + "grad_norm": 0.7773357629776001, + "learning_rate": 1.5798779829369174e-05, + "loss": 0.3255, + "step": 6731 + }, + { + "epoch": 0.6341819552059537, + "grad_norm": 2.223407506942749, + "learning_rate": 1.5797549546481866e-05, + "loss": 0.3426, + "step": 6732 + }, + { + "epoch": 0.6342761592991215, + "grad_norm": 0.7606045603752136, + "learning_rate": 1.5796319131401463e-05, + "loss": 0.329, + "step": 6733 + }, + { + "epoch": 0.6343703633922894, + "grad_norm": 0.8077683448791504, + "learning_rate": 1.579508858415602e-05, + "loss": 0.3607, + "step": 6734 + }, + { + "epoch": 0.6344645674854572, + "grad_norm": 0.7511078119277954, + "learning_rate": 1.5793857904773595e-05, + "loss": 0.3202, + "step": 6735 + }, + { + "epoch": 0.6345587715786251, + "grad_norm": 0.7200794219970703, + "learning_rate": 1.5792627093282247e-05, + "loss": 0.3061, + "step": 6736 + }, + { + "epoch": 0.6346529756717929, + "grad_norm": 0.7865471243858337, + "learning_rate": 1.5791396149710046e-05, + "loss": 0.3427, + "step": 6737 + }, + { + "epoch": 0.6347471797649608, + "grad_norm": 0.830402672290802, + "learning_rate": 1.5790165074085057e-05, + "loss": 0.3343, + "step": 6738 + }, + { + "epoch": 0.6348413838581286, + "grad_norm": 0.6368622183799744, + "learning_rate": 1.5788933866435346e-05, + "loss": 0.3058, + "step": 6739 + }, + { + "epoch": 0.6349355879512965, + "grad_norm": 0.8150480389595032, + "learning_rate": 1.5787702526788994e-05, + "loss": 0.3063, + "step": 6740 + }, + { + "epoch": 0.6350297920444643, + "grad_norm": 0.7546366453170776, + "learning_rate": 1.578647105517407e-05, + "loss": 0.3172, + "step": 6741 + }, + { + "epoch": 0.6351239961376322, + "grad_norm": 0.7057252526283264, + "learning_rate": 1.5785239451618654e-05, + "loss": 0.323, + "step": 6742 + }, + { + "epoch": 0.6352182002308, + "grad_norm": 0.8907890319824219, + "learning_rate": 1.578400771615084e-05, + "loss": 0.3618, + "step": 6743 + }, + { + "epoch": 0.6353124043239678, + "grad_norm": 0.8284263014793396, + "learning_rate": 1.5782775848798698e-05, + "loss": 0.3361, + "step": 6744 + }, + { + "epoch": 0.6354066084171357, + "grad_norm": 0.7687682509422302, + "learning_rate": 1.578154384959033e-05, + "loss": 0.3625, + "step": 6745 + }, + { + "epoch": 0.6355008125103035, + "grad_norm": 0.8055520057678223, + "learning_rate": 1.5780311718553813e-05, + "loss": 0.3628, + "step": 6746 + }, + { + "epoch": 0.6355950166034714, + "grad_norm": 0.7241090536117554, + "learning_rate": 1.5779079455717253e-05, + "loss": 0.3248, + "step": 6747 + }, + { + "epoch": 0.6356892206966392, + "grad_norm": 0.7102499008178711, + "learning_rate": 1.5777847061108747e-05, + "loss": 0.3317, + "step": 6748 + }, + { + "epoch": 0.6357834247898071, + "grad_norm": 0.7378541827201843, + "learning_rate": 1.5776614534756388e-05, + "loss": 0.3458, + "step": 6749 + }, + { + "epoch": 0.6358776288829749, + "grad_norm": 0.816027820110321, + "learning_rate": 1.577538187668829e-05, + "loss": 0.3443, + "step": 6750 + }, + { + "epoch": 0.6359718329761428, + "grad_norm": 0.7290892601013184, + "learning_rate": 1.5774149086932546e-05, + "loss": 0.3363, + "step": 6751 + }, + { + "epoch": 0.6360660370693106, + "grad_norm": 0.8670916557312012, + "learning_rate": 1.5772916165517275e-05, + "loss": 0.3243, + "step": 6752 + }, + { + "epoch": 0.6361602411624785, + "grad_norm": 0.6518049240112305, + "learning_rate": 1.5771683112470587e-05, + "loss": 0.3129, + "step": 6753 + }, + { + "epoch": 0.6362544452556463, + "grad_norm": 0.750372052192688, + "learning_rate": 1.57704499278206e-05, + "loss": 0.3366, + "step": 6754 + }, + { + "epoch": 0.6363486493488142, + "grad_norm": 0.7233075499534607, + "learning_rate": 1.5769216611595432e-05, + "loss": 0.305, + "step": 6755 + }, + { + "epoch": 0.636442853441982, + "grad_norm": 0.7097463607788086, + "learning_rate": 1.5767983163823198e-05, + "loss": 0.3388, + "step": 6756 + }, + { + "epoch": 0.6365370575351499, + "grad_norm": 0.7607314586639404, + "learning_rate": 1.576674958453203e-05, + "loss": 0.3238, + "step": 6757 + }, + { + "epoch": 0.6366312616283177, + "grad_norm": 0.7543935179710388, + "learning_rate": 1.5765515873750055e-05, + "loss": 0.3287, + "step": 6758 + }, + { + "epoch": 0.6367254657214856, + "grad_norm": 0.8424161076545715, + "learning_rate": 1.57642820315054e-05, + "loss": 0.3453, + "step": 6759 + }, + { + "epoch": 0.6368196698146534, + "grad_norm": 0.9995113611221313, + "learning_rate": 1.57630480578262e-05, + "loss": 0.3726, + "step": 6760 + }, + { + "epoch": 0.6369138739078213, + "grad_norm": 0.7712142467498779, + "learning_rate": 1.576181395274059e-05, + "loss": 0.3068, + "step": 6761 + }, + { + "epoch": 0.6370080780009891, + "grad_norm": 0.7060720324516296, + "learning_rate": 1.5760579716276714e-05, + "loss": 0.326, + "step": 6762 + }, + { + "epoch": 0.637102282094157, + "grad_norm": 0.8832055330276489, + "learning_rate": 1.575934534846271e-05, + "loss": 0.3783, + "step": 6763 + }, + { + "epoch": 0.6371964861873248, + "grad_norm": 0.6925953030586243, + "learning_rate": 1.5758110849326724e-05, + "loss": 0.2866, + "step": 6764 + }, + { + "epoch": 0.6372906902804927, + "grad_norm": 0.7408279180526733, + "learning_rate": 1.5756876218896908e-05, + "loss": 0.3361, + "step": 6765 + }, + { + "epoch": 0.6373848943736605, + "grad_norm": 0.8043666481971741, + "learning_rate": 1.575564145720141e-05, + "loss": 0.3528, + "step": 6766 + }, + { + "epoch": 0.6374790984668284, + "grad_norm": 0.7212235331535339, + "learning_rate": 1.575440656426838e-05, + "loss": 0.3354, + "step": 6767 + }, + { + "epoch": 0.6375733025599962, + "grad_norm": 0.959936797618866, + "learning_rate": 1.5753171540125993e-05, + "loss": 0.3474, + "step": 6768 + }, + { + "epoch": 0.6376675066531641, + "grad_norm": 0.7940359115600586, + "learning_rate": 1.5751936384802388e-05, + "loss": 0.3192, + "step": 6769 + }, + { + "epoch": 0.6377617107463319, + "grad_norm": 0.7453657388687134, + "learning_rate": 1.575070109832574e-05, + "loss": 0.3333, + "step": 6770 + }, + { + "epoch": 0.6378559148394998, + "grad_norm": 0.7639998197555542, + "learning_rate": 1.5749465680724215e-05, + "loss": 0.3857, + "step": 6771 + }, + { + "epoch": 0.6379501189326676, + "grad_norm": 0.6975438594818115, + "learning_rate": 1.574823013202598e-05, + "loss": 0.2871, + "step": 6772 + }, + { + "epoch": 0.6380443230258355, + "grad_norm": 0.7081899642944336, + "learning_rate": 1.5746994452259206e-05, + "loss": 0.2871, + "step": 6773 + }, + { + "epoch": 0.6381385271190033, + "grad_norm": 0.8343057036399841, + "learning_rate": 1.5745758641452074e-05, + "loss": 0.3262, + "step": 6774 + }, + { + "epoch": 0.6382327312121712, + "grad_norm": 0.6873591542243958, + "learning_rate": 1.5744522699632757e-05, + "loss": 0.3026, + "step": 6775 + }, + { + "epoch": 0.638326935305339, + "grad_norm": 0.8080335259437561, + "learning_rate": 1.5743286626829437e-05, + "loss": 0.3114, + "step": 6776 + }, + { + "epoch": 0.6384211393985069, + "grad_norm": 0.8233070373535156, + "learning_rate": 1.57420504230703e-05, + "loss": 0.3154, + "step": 6777 + }, + { + "epoch": 0.6385153434916747, + "grad_norm": 0.8136635422706604, + "learning_rate": 1.574081408838354e-05, + "loss": 0.3472, + "step": 6778 + }, + { + "epoch": 0.6386095475848426, + "grad_norm": 0.7526406645774841, + "learning_rate": 1.5739577622797334e-05, + "loss": 0.3168, + "step": 6779 + }, + { + "epoch": 0.6387037516780104, + "grad_norm": 0.6697912216186523, + "learning_rate": 1.5738341026339882e-05, + "loss": 0.2986, + "step": 6780 + }, + { + "epoch": 0.6387979557711783, + "grad_norm": 0.8566330671310425, + "learning_rate": 1.573710429903938e-05, + "loss": 0.3363, + "step": 6781 + }, + { + "epoch": 0.6388921598643461, + "grad_norm": 0.8247984647750854, + "learning_rate": 1.5735867440924027e-05, + "loss": 0.3355, + "step": 6782 + }, + { + "epoch": 0.638986363957514, + "grad_norm": 0.7584102153778076, + "learning_rate": 1.5734630452022028e-05, + "loss": 0.3233, + "step": 6783 + }, + { + "epoch": 0.6390805680506818, + "grad_norm": 0.8046115040779114, + "learning_rate": 1.5733393332361585e-05, + "loss": 0.3161, + "step": 6784 + }, + { + "epoch": 0.6391747721438497, + "grad_norm": 0.631939172744751, + "learning_rate": 1.573215608197091e-05, + "loss": 0.2771, + "step": 6785 + }, + { + "epoch": 0.6392689762370175, + "grad_norm": 0.819452166557312, + "learning_rate": 1.5730918700878203e-05, + "loss": 0.3283, + "step": 6786 + }, + { + "epoch": 0.6393631803301854, + "grad_norm": 0.6071840524673462, + "learning_rate": 1.5729681189111694e-05, + "loss": 0.2443, + "step": 6787 + }, + { + "epoch": 0.6394573844233532, + "grad_norm": 1.0969040393829346, + "learning_rate": 1.5728443546699592e-05, + "loss": 0.3384, + "step": 6788 + }, + { + "epoch": 0.639551588516521, + "grad_norm": 0.7380397915840149, + "learning_rate": 1.5727205773670117e-05, + "loss": 0.343, + "step": 6789 + }, + { + "epoch": 0.6396457926096889, + "grad_norm": 0.7145038843154907, + "learning_rate": 1.572596787005149e-05, + "loss": 0.3134, + "step": 6790 + }, + { + "epoch": 0.6397399967028568, + "grad_norm": 0.854451060295105, + "learning_rate": 1.5724729835871945e-05, + "loss": 0.404, + "step": 6791 + }, + { + "epoch": 0.6398342007960246, + "grad_norm": 0.7254519462585449, + "learning_rate": 1.5723491671159703e-05, + "loss": 0.3069, + "step": 6792 + }, + { + "epoch": 0.6399284048891924, + "grad_norm": 0.7702884674072266, + "learning_rate": 1.5722253375943002e-05, + "loss": 0.3559, + "step": 6793 + }, + { + "epoch": 0.6400226089823603, + "grad_norm": 0.8042786717414856, + "learning_rate": 1.5721014950250074e-05, + "loss": 0.3222, + "step": 6794 + }, + { + "epoch": 0.6401168130755281, + "grad_norm": 0.726668119430542, + "learning_rate": 1.5719776394109152e-05, + "loss": 0.2788, + "step": 6795 + }, + { + "epoch": 0.640211017168696, + "grad_norm": 0.6974692940711975, + "learning_rate": 1.5718537707548488e-05, + "loss": 0.3105, + "step": 6796 + }, + { + "epoch": 0.6403052212618638, + "grad_norm": 0.6868124008178711, + "learning_rate": 1.5717298890596317e-05, + "loss": 0.3269, + "step": 6797 + }, + { + "epoch": 0.6403994253550317, + "grad_norm": 0.7715316414833069, + "learning_rate": 1.5716059943280896e-05, + "loss": 0.359, + "step": 6798 + }, + { + "epoch": 0.6404936294481995, + "grad_norm": 0.7154345512390137, + "learning_rate": 1.5714820865630462e-05, + "loss": 0.3162, + "step": 6799 + }, + { + "epoch": 0.6405878335413674, + "grad_norm": 0.7197099328041077, + "learning_rate": 1.5713581657673276e-05, + "loss": 0.283, + "step": 6800 + }, + { + "epoch": 0.6406820376345352, + "grad_norm": 0.7262243628501892, + "learning_rate": 1.5712342319437592e-05, + "loss": 0.327, + "step": 6801 + }, + { + "epoch": 0.6407762417277031, + "grad_norm": 0.7279204726219177, + "learning_rate": 1.571110285095167e-05, + "loss": 0.3027, + "step": 6802 + }, + { + "epoch": 0.6408704458208709, + "grad_norm": 0.8032323718070984, + "learning_rate": 1.5709863252243768e-05, + "loss": 0.349, + "step": 6803 + }, + { + "epoch": 0.6409646499140388, + "grad_norm": 0.7235321402549744, + "learning_rate": 1.5708623523342153e-05, + "loss": 0.3158, + "step": 6804 + }, + { + "epoch": 0.6410588540072066, + "grad_norm": 0.760461151599884, + "learning_rate": 1.5707383664275094e-05, + "loss": 0.3293, + "step": 6805 + }, + { + "epoch": 0.6411530581003745, + "grad_norm": 0.7263423800468445, + "learning_rate": 1.5706143675070862e-05, + "loss": 0.3249, + "step": 6806 + }, + { + "epoch": 0.6412472621935423, + "grad_norm": 0.6820230484008789, + "learning_rate": 1.5704903555757728e-05, + "loss": 0.3317, + "step": 6807 + }, + { + "epoch": 0.6413414662867102, + "grad_norm": 0.8332756161689758, + "learning_rate": 1.5703663306363976e-05, + "loss": 0.3476, + "step": 6808 + }, + { + "epoch": 0.641435670379878, + "grad_norm": 0.7073219418525696, + "learning_rate": 1.5702422926917872e-05, + "loss": 0.321, + "step": 6809 + }, + { + "epoch": 0.6415298744730459, + "grad_norm": 0.7357454895973206, + "learning_rate": 1.570118241744771e-05, + "loss": 0.2712, + "step": 6810 + }, + { + "epoch": 0.6416240785662137, + "grad_norm": 0.7738254070281982, + "learning_rate": 1.5699941777981772e-05, + "loss": 0.3165, + "step": 6811 + }, + { + "epoch": 0.6417182826593816, + "grad_norm": 0.669426441192627, + "learning_rate": 1.5698701008548343e-05, + "loss": 0.331, + "step": 6812 + }, + { + "epoch": 0.6418124867525494, + "grad_norm": 0.7154690623283386, + "learning_rate": 1.569746010917572e-05, + "loss": 0.2842, + "step": 6813 + }, + { + "epoch": 0.6419066908457173, + "grad_norm": 0.7300938963890076, + "learning_rate": 1.5696219079892198e-05, + "loss": 0.3314, + "step": 6814 + }, + { + "epoch": 0.6420008949388851, + "grad_norm": 0.7254599332809448, + "learning_rate": 1.5694977920726066e-05, + "loss": 0.3008, + "step": 6815 + }, + { + "epoch": 0.642095099032053, + "grad_norm": 0.7386945486068726, + "learning_rate": 1.5693736631705632e-05, + "loss": 0.2717, + "step": 6816 + }, + { + "epoch": 0.6421893031252208, + "grad_norm": 0.9841634631156921, + "learning_rate": 1.56924952128592e-05, + "loss": 0.2993, + "step": 6817 + }, + { + "epoch": 0.6422835072183887, + "grad_norm": 0.7504370808601379, + "learning_rate": 1.569125366421507e-05, + "loss": 0.3875, + "step": 6818 + }, + { + "epoch": 0.6423777113115565, + "grad_norm": 0.7812278866767883, + "learning_rate": 1.569001198580156e-05, + "loss": 0.3503, + "step": 6819 + }, + { + "epoch": 0.6424719154047244, + "grad_norm": 0.8756741881370544, + "learning_rate": 1.5688770177646972e-05, + "loss": 0.3034, + "step": 6820 + }, + { + "epoch": 0.6425661194978922, + "grad_norm": 0.7655273079872131, + "learning_rate": 1.5687528239779627e-05, + "loss": 0.3183, + "step": 6821 + }, + { + "epoch": 0.6426603235910601, + "grad_norm": 0.6566729545593262, + "learning_rate": 1.5686286172227844e-05, + "loss": 0.2956, + "step": 6822 + }, + { + "epoch": 0.6427545276842279, + "grad_norm": 0.7181766033172607, + "learning_rate": 1.568504397501994e-05, + "loss": 0.3117, + "step": 6823 + }, + { + "epoch": 0.6428487317773958, + "grad_norm": 0.7318626046180725, + "learning_rate": 1.568380164818424e-05, + "loss": 0.2907, + "step": 6824 + }, + { + "epoch": 0.6429429358705636, + "grad_norm": 0.8867868185043335, + "learning_rate": 1.5682559191749075e-05, + "loss": 0.3467, + "step": 6825 + }, + { + "epoch": 0.6430371399637315, + "grad_norm": 0.7642084360122681, + "learning_rate": 1.5681316605742773e-05, + "loss": 0.3273, + "step": 6826 + }, + { + "epoch": 0.6431313440568993, + "grad_norm": 0.7472430467605591, + "learning_rate": 1.5680073890193662e-05, + "loss": 0.3329, + "step": 6827 + }, + { + "epoch": 0.6432255481500672, + "grad_norm": 0.8168688416481018, + "learning_rate": 1.5678831045130086e-05, + "loss": 0.3099, + "step": 6828 + }, + { + "epoch": 0.643319752243235, + "grad_norm": 0.7869260311126709, + "learning_rate": 1.567758807058038e-05, + "loss": 0.2915, + "step": 6829 + }, + { + "epoch": 0.6434139563364029, + "grad_norm": 0.82233726978302, + "learning_rate": 1.5676344966572882e-05, + "loss": 0.3511, + "step": 6830 + }, + { + "epoch": 0.6435081604295707, + "grad_norm": 0.7085413336753845, + "learning_rate": 1.567510173313594e-05, + "loss": 0.2987, + "step": 6831 + }, + { + "epoch": 0.6436023645227386, + "grad_norm": 0.6723721027374268, + "learning_rate": 1.5673858370297906e-05, + "loss": 0.3143, + "step": 6832 + }, + { + "epoch": 0.6436965686159064, + "grad_norm": 0.7982920408248901, + "learning_rate": 1.5672614878087125e-05, + "loss": 0.3605, + "step": 6833 + }, + { + "epoch": 0.6437907727090743, + "grad_norm": 0.6995309591293335, + "learning_rate": 1.5671371256531952e-05, + "loss": 0.2919, + "step": 6834 + }, + { + "epoch": 0.6438849768022421, + "grad_norm": 0.7752536535263062, + "learning_rate": 1.567012750566074e-05, + "loss": 0.3161, + "step": 6835 + }, + { + "epoch": 0.64397918089541, + "grad_norm": 0.8362805843353271, + "learning_rate": 1.5668883625501856e-05, + "loss": 0.3332, + "step": 6836 + }, + { + "epoch": 0.6440733849885778, + "grad_norm": 0.8919989466667175, + "learning_rate": 1.5667639616083653e-05, + "loss": 0.3331, + "step": 6837 + }, + { + "epoch": 0.6441675890817457, + "grad_norm": 0.7835939526557922, + "learning_rate": 1.5666395477434508e-05, + "loss": 0.3112, + "step": 6838 + }, + { + "epoch": 0.6442617931749135, + "grad_norm": 0.7878260016441345, + "learning_rate": 1.566515120958278e-05, + "loss": 0.3195, + "step": 6839 + }, + { + "epoch": 0.6443559972680813, + "grad_norm": 0.7726975083351135, + "learning_rate": 1.5663906812556843e-05, + "loss": 0.3585, + "step": 6840 + }, + { + "epoch": 0.6444502013612492, + "grad_norm": 0.7149421572685242, + "learning_rate": 1.566266228638507e-05, + "loss": 0.3018, + "step": 6841 + }, + { + "epoch": 0.644544405454417, + "grad_norm": 0.7051762342453003, + "learning_rate": 1.566141763109584e-05, + "loss": 0.3098, + "step": 6842 + }, + { + "epoch": 0.6446386095475849, + "grad_norm": 0.8522613048553467, + "learning_rate": 1.5660172846717536e-05, + "loss": 0.3887, + "step": 6843 + }, + { + "epoch": 0.6447328136407527, + "grad_norm": 0.6800092458724976, + "learning_rate": 1.565892793327853e-05, + "loss": 0.3298, + "step": 6844 + }, + { + "epoch": 0.6448270177339206, + "grad_norm": 0.7485498189926147, + "learning_rate": 1.5657682890807225e-05, + "loss": 0.3623, + "step": 6845 + }, + { + "epoch": 0.6449212218270884, + "grad_norm": 0.6815838813781738, + "learning_rate": 1.5656437719331993e-05, + "loss": 0.3088, + "step": 6846 + }, + { + "epoch": 0.6450154259202562, + "grad_norm": 0.7152532935142517, + "learning_rate": 1.5655192418881235e-05, + "loss": 0.3026, + "step": 6847 + }, + { + "epoch": 0.645109630013424, + "grad_norm": 0.8499749302864075, + "learning_rate": 1.5653946989483345e-05, + "loss": 0.323, + "step": 6848 + }, + { + "epoch": 0.6452038341065919, + "grad_norm": 0.8016695380210876, + "learning_rate": 1.565270143116672e-05, + "loss": 0.2988, + "step": 6849 + }, + { + "epoch": 0.6452980381997597, + "grad_norm": 0.6921725869178772, + "learning_rate": 1.565145574395976e-05, + "loss": 0.2804, + "step": 6850 + }, + { + "epoch": 0.6453922422929276, + "grad_norm": 0.7418727874755859, + "learning_rate": 1.5650209927890868e-05, + "loss": 0.2879, + "step": 6851 + }, + { + "epoch": 0.6454864463860954, + "grad_norm": 0.8103009462356567, + "learning_rate": 1.564896398298845e-05, + "loss": 0.3066, + "step": 6852 + }, + { + "epoch": 0.6455806504792633, + "grad_norm": 0.7640576362609863, + "learning_rate": 1.564771790928092e-05, + "loss": 0.3625, + "step": 6853 + }, + { + "epoch": 0.6456748545724311, + "grad_norm": 0.9308742880821228, + "learning_rate": 1.5646471706796686e-05, + "loss": 0.3705, + "step": 6854 + }, + { + "epoch": 0.645769058665599, + "grad_norm": 0.7717334032058716, + "learning_rate": 1.5645225375564165e-05, + "loss": 0.3456, + "step": 6855 + }, + { + "epoch": 0.6458632627587668, + "grad_norm": 0.7640878558158875, + "learning_rate": 1.564397891561177e-05, + "loss": 0.3292, + "step": 6856 + }, + { + "epoch": 0.6459574668519347, + "grad_norm": 0.7348083853721619, + "learning_rate": 1.5642732326967934e-05, + "loss": 0.332, + "step": 6857 + }, + { + "epoch": 0.6460516709451025, + "grad_norm": 0.6975739002227783, + "learning_rate": 1.5641485609661073e-05, + "loss": 0.2983, + "step": 6858 + }, + { + "epoch": 0.6461458750382704, + "grad_norm": 0.768665611743927, + "learning_rate": 1.5640238763719614e-05, + "loss": 0.3287, + "step": 6859 + }, + { + "epoch": 0.6462400791314382, + "grad_norm": 0.8188949823379517, + "learning_rate": 1.563899178917199e-05, + "loss": 0.3534, + "step": 6860 + }, + { + "epoch": 0.6463342832246061, + "grad_norm": 0.8445432186126709, + "learning_rate": 1.563774468604663e-05, + "loss": 0.3449, + "step": 6861 + }, + { + "epoch": 0.6464284873177739, + "grad_norm": 0.8140099048614502, + "learning_rate": 1.5636497454371973e-05, + "loss": 0.3215, + "step": 6862 + }, + { + "epoch": 0.6465226914109418, + "grad_norm": 0.6855039000511169, + "learning_rate": 1.5635250094176456e-05, + "loss": 0.2879, + "step": 6863 + }, + { + "epoch": 0.6466168955041096, + "grad_norm": 0.7547051906585693, + "learning_rate": 1.5634002605488524e-05, + "loss": 0.3131, + "step": 6864 + }, + { + "epoch": 0.6467110995972775, + "grad_norm": 0.855553388595581, + "learning_rate": 1.563275498833662e-05, + "loss": 0.3354, + "step": 6865 + }, + { + "epoch": 0.6468053036904453, + "grad_norm": 0.6999885439872742, + "learning_rate": 1.5631507242749187e-05, + "loss": 0.3232, + "step": 6866 + }, + { + "epoch": 0.6468995077836132, + "grad_norm": 0.7532494068145752, + "learning_rate": 1.5630259368754682e-05, + "loss": 0.3198, + "step": 6867 + }, + { + "epoch": 0.646993711876781, + "grad_norm": 0.6313682794570923, + "learning_rate": 1.5629011366381556e-05, + "loss": 0.2997, + "step": 6868 + }, + { + "epoch": 0.6470879159699489, + "grad_norm": 0.8592627048492432, + "learning_rate": 1.5627763235658266e-05, + "loss": 0.3376, + "step": 6869 + }, + { + "epoch": 0.6471821200631167, + "grad_norm": 0.7356202006340027, + "learning_rate": 1.5626514976613273e-05, + "loss": 0.328, + "step": 6870 + }, + { + "epoch": 0.6472763241562846, + "grad_norm": 0.7478100061416626, + "learning_rate": 1.5625266589275032e-05, + "loss": 0.318, + "step": 6871 + }, + { + "epoch": 0.6473705282494524, + "grad_norm": 0.8141969442367554, + "learning_rate": 1.5624018073672016e-05, + "loss": 0.3171, + "step": 6872 + }, + { + "epoch": 0.6474647323426203, + "grad_norm": 0.7093797922134399, + "learning_rate": 1.5622769429832687e-05, + "loss": 0.3347, + "step": 6873 + }, + { + "epoch": 0.6475589364357881, + "grad_norm": 0.721733808517456, + "learning_rate": 1.5621520657785523e-05, + "loss": 0.3195, + "step": 6874 + }, + { + "epoch": 0.647653140528956, + "grad_norm": 0.7266020774841309, + "learning_rate": 1.5620271757558994e-05, + "loss": 0.3223, + "step": 6875 + }, + { + "epoch": 0.6477473446221238, + "grad_norm": 0.8292143940925598, + "learning_rate": 1.5619022729181575e-05, + "loss": 0.3082, + "step": 6876 + }, + { + "epoch": 0.6478415487152916, + "grad_norm": 0.7851315140724182, + "learning_rate": 1.5617773572681748e-05, + "loss": 0.3001, + "step": 6877 + }, + { + "epoch": 0.6479357528084595, + "grad_norm": 0.8517122864723206, + "learning_rate": 1.5616524288088e-05, + "loss": 0.3389, + "step": 6878 + }, + { + "epoch": 0.6480299569016273, + "grad_norm": 0.7251204252243042, + "learning_rate": 1.5615274875428807e-05, + "loss": 0.3061, + "step": 6879 + }, + { + "epoch": 0.6481241609947952, + "grad_norm": 0.9605844616889954, + "learning_rate": 1.5614025334732664e-05, + "loss": 0.3732, + "step": 6880 + }, + { + "epoch": 0.648218365087963, + "grad_norm": 0.7894840836524963, + "learning_rate": 1.561277566602806e-05, + "loss": 0.3382, + "step": 6881 + }, + { + "epoch": 0.6483125691811309, + "grad_norm": 0.7801871299743652, + "learning_rate": 1.561152586934349e-05, + "loss": 0.3377, + "step": 6882 + }, + { + "epoch": 0.6484067732742987, + "grad_norm": 0.7582337856292725, + "learning_rate": 1.5610275944707454e-05, + "loss": 0.3575, + "step": 6883 + }, + { + "epoch": 0.6485009773674666, + "grad_norm": 0.6954520344734192, + "learning_rate": 1.560902589214845e-05, + "loss": 0.2911, + "step": 6884 + }, + { + "epoch": 0.6485951814606344, + "grad_norm": 0.667131781578064, + "learning_rate": 1.560777571169498e-05, + "loss": 0.2855, + "step": 6885 + }, + { + "epoch": 0.6486893855538023, + "grad_norm": 0.7147957682609558, + "learning_rate": 1.560652540337555e-05, + "loss": 0.303, + "step": 6886 + }, + { + "epoch": 0.6487835896469701, + "grad_norm": 0.6967618465423584, + "learning_rate": 1.5605274967218672e-05, + "loss": 0.323, + "step": 6887 + }, + { + "epoch": 0.648877793740138, + "grad_norm": 0.7516948580741882, + "learning_rate": 1.5604024403252858e-05, + "loss": 0.3295, + "step": 6888 + }, + { + "epoch": 0.6489719978333058, + "grad_norm": 0.7380844354629517, + "learning_rate": 1.5602773711506617e-05, + "loss": 0.3091, + "step": 6889 + }, + { + "epoch": 0.6490662019264737, + "grad_norm": 0.6943092346191406, + "learning_rate": 1.5601522892008475e-05, + "loss": 0.3289, + "step": 6890 + }, + { + "epoch": 0.6491604060196415, + "grad_norm": 0.7739840745925903, + "learning_rate": 1.5600271944786944e-05, + "loss": 0.3311, + "step": 6891 + }, + { + "epoch": 0.6492546101128094, + "grad_norm": 0.9014347195625305, + "learning_rate": 1.559902086987055e-05, + "loss": 0.359, + "step": 6892 + }, + { + "epoch": 0.6493488142059772, + "grad_norm": 0.632686197757721, + "learning_rate": 1.5597769667287826e-05, + "loss": 0.3407, + "step": 6893 + }, + { + "epoch": 0.6494430182991451, + "grad_norm": 0.7913288474082947, + "learning_rate": 1.5596518337067293e-05, + "loss": 0.3465, + "step": 6894 + }, + { + "epoch": 0.6495372223923129, + "grad_norm": 0.6992575526237488, + "learning_rate": 1.5595266879237492e-05, + "loss": 0.3417, + "step": 6895 + }, + { + "epoch": 0.6496314264854808, + "grad_norm": 0.7911957502365112, + "learning_rate": 1.5594015293826945e-05, + "loss": 0.3579, + "step": 6896 + }, + { + "epoch": 0.6497256305786486, + "grad_norm": 0.6819725036621094, + "learning_rate": 1.5592763580864204e-05, + "loss": 0.3191, + "step": 6897 + }, + { + "epoch": 0.6498198346718165, + "grad_norm": 0.6952462196350098, + "learning_rate": 1.5591511740377802e-05, + "loss": 0.29, + "step": 6898 + }, + { + "epoch": 0.6499140387649843, + "grad_norm": 0.7029697895050049, + "learning_rate": 1.5590259772396287e-05, + "loss": 0.3408, + "step": 6899 + }, + { + "epoch": 0.6500082428581522, + "grad_norm": 0.7309373617172241, + "learning_rate": 1.55890076769482e-05, + "loss": 0.3427, + "step": 6900 + }, + { + "epoch": 0.65010244695132, + "grad_norm": 0.6198153495788574, + "learning_rate": 1.5587755454062095e-05, + "loss": 0.2921, + "step": 6901 + }, + { + "epoch": 0.6501966510444879, + "grad_norm": 0.722285270690918, + "learning_rate": 1.5586503103766526e-05, + "loss": 0.3019, + "step": 6902 + }, + { + "epoch": 0.6502908551376557, + "grad_norm": 0.5886370539665222, + "learning_rate": 1.5585250626090045e-05, + "loss": 0.2599, + "step": 6903 + }, + { + "epoch": 0.6503850592308236, + "grad_norm": 0.8670148849487305, + "learning_rate": 1.5583998021061213e-05, + "loss": 0.3129, + "step": 6904 + }, + { + "epoch": 0.6504792633239914, + "grad_norm": 0.7227097749710083, + "learning_rate": 1.558274528870859e-05, + "loss": 0.3129, + "step": 6905 + }, + { + "epoch": 0.6505734674171593, + "grad_norm": 0.7777684926986694, + "learning_rate": 1.558149242906074e-05, + "loss": 0.3186, + "step": 6906 + }, + { + "epoch": 0.6506676715103271, + "grad_norm": 0.7884363532066345, + "learning_rate": 1.558023944214623e-05, + "loss": 0.3167, + "step": 6907 + }, + { + "epoch": 0.650761875603495, + "grad_norm": 0.7935061454772949, + "learning_rate": 1.5578986327993633e-05, + "loss": 0.2878, + "step": 6908 + }, + { + "epoch": 0.6508560796966628, + "grad_norm": 0.6717670559883118, + "learning_rate": 1.557773308663152e-05, + "loss": 0.3035, + "step": 6909 + }, + { + "epoch": 0.6509502837898307, + "grad_norm": 0.6283624768257141, + "learning_rate": 1.5576479718088466e-05, + "loss": 0.3054, + "step": 6910 + }, + { + "epoch": 0.6510444878829985, + "grad_norm": 0.7556511759757996, + "learning_rate": 1.5575226222393048e-05, + "loss": 0.3377, + "step": 6911 + }, + { + "epoch": 0.6511386919761664, + "grad_norm": 0.7569495439529419, + "learning_rate": 1.557397259957385e-05, + "loss": 0.2915, + "step": 6912 + }, + { + "epoch": 0.6512328960693342, + "grad_norm": 0.7043121457099915, + "learning_rate": 1.5572718849659458e-05, + "loss": 0.3003, + "step": 6913 + }, + { + "epoch": 0.6513271001625021, + "grad_norm": 0.6409788727760315, + "learning_rate": 1.5571464972678457e-05, + "loss": 0.3125, + "step": 6914 + }, + { + "epoch": 0.6514213042556699, + "grad_norm": 0.796539843082428, + "learning_rate": 1.557021096865944e-05, + "loss": 0.3095, + "step": 6915 + }, + { + "epoch": 0.6515155083488378, + "grad_norm": 0.7086146473884583, + "learning_rate": 1.5568956837630996e-05, + "loss": 0.3483, + "step": 6916 + }, + { + "epoch": 0.6516097124420056, + "grad_norm": 1.0475225448608398, + "learning_rate": 1.5567702579621724e-05, + "loss": 0.3078, + "step": 6917 + }, + { + "epoch": 0.6517039165351735, + "grad_norm": 0.8035339117050171, + "learning_rate": 1.5566448194660225e-05, + "loss": 0.3203, + "step": 6918 + }, + { + "epoch": 0.6517981206283413, + "grad_norm": 0.6819237470626831, + "learning_rate": 1.5565193682775097e-05, + "loss": 0.3577, + "step": 6919 + }, + { + "epoch": 0.6518923247215092, + "grad_norm": 0.6923936009407043, + "learning_rate": 1.5563939043994944e-05, + "loss": 0.326, + "step": 6920 + }, + { + "epoch": 0.651986528814677, + "grad_norm": 0.9624119997024536, + "learning_rate": 1.5562684278348378e-05, + "loss": 0.324, + "step": 6921 + }, + { + "epoch": 0.6520807329078449, + "grad_norm": 0.7303101420402527, + "learning_rate": 1.5561429385864005e-05, + "loss": 0.3153, + "step": 6922 + }, + { + "epoch": 0.6521749370010127, + "grad_norm": 0.7050034403800964, + "learning_rate": 1.5560174366570448e-05, + "loss": 0.2999, + "step": 6923 + }, + { + "epoch": 0.6522691410941805, + "grad_norm": 0.6949707865715027, + "learning_rate": 1.555891922049631e-05, + "loss": 0.3143, + "step": 6924 + }, + { + "epoch": 0.6523633451873484, + "grad_norm": 0.7842676043510437, + "learning_rate": 1.555766394767022e-05, + "loss": 0.3172, + "step": 6925 + }, + { + "epoch": 0.6524575492805162, + "grad_norm": 0.8317732214927673, + "learning_rate": 1.5556408548120794e-05, + "loss": 0.3373, + "step": 6926 + }, + { + "epoch": 0.6525517533736841, + "grad_norm": 0.7175168991088867, + "learning_rate": 1.555515302187666e-05, + "loss": 0.345, + "step": 6927 + }, + { + "epoch": 0.652645957466852, + "grad_norm": 0.7620663046836853, + "learning_rate": 1.555389736896645e-05, + "loss": 0.3163, + "step": 6928 + }, + { + "epoch": 0.6527401615600198, + "grad_norm": 0.6921946406364441, + "learning_rate": 1.555264158941879e-05, + "loss": 0.3488, + "step": 6929 + }, + { + "epoch": 0.6528343656531876, + "grad_norm": 0.7068236470222473, + "learning_rate": 1.555138568326231e-05, + "loss": 0.3179, + "step": 6930 + }, + { + "epoch": 0.6529285697463555, + "grad_norm": 1.0250000953674316, + "learning_rate": 1.5550129650525655e-05, + "loss": 0.3019, + "step": 6931 + }, + { + "epoch": 0.6530227738395233, + "grad_norm": 0.8708575367927551, + "learning_rate": 1.5548873491237458e-05, + "loss": 0.2934, + "step": 6932 + }, + { + "epoch": 0.6531169779326912, + "grad_norm": 0.6972542405128479, + "learning_rate": 1.5547617205426367e-05, + "loss": 0.2928, + "step": 6933 + }, + { + "epoch": 0.653211182025859, + "grad_norm": 0.7019140720367432, + "learning_rate": 1.554636079312102e-05, + "loss": 0.335, + "step": 6934 + }, + { + "epoch": 0.6533053861190269, + "grad_norm": 0.6595550179481506, + "learning_rate": 1.5545104254350074e-05, + "loss": 0.3089, + "step": 6935 + }, + { + "epoch": 0.6533995902121947, + "grad_norm": 0.893386960029602, + "learning_rate": 1.5543847589142173e-05, + "loss": 0.3344, + "step": 6936 + }, + { + "epoch": 0.6534937943053626, + "grad_norm": 0.758198082447052, + "learning_rate": 1.554259079752597e-05, + "loss": 0.3077, + "step": 6937 + }, + { + "epoch": 0.6535879983985304, + "grad_norm": 0.8025262355804443, + "learning_rate": 1.5541333879530132e-05, + "loss": 0.3256, + "step": 6938 + }, + { + "epoch": 0.6536822024916983, + "grad_norm": 0.9499841928482056, + "learning_rate": 1.5540076835183307e-05, + "loss": 0.3883, + "step": 6939 + }, + { + "epoch": 0.6537764065848661, + "grad_norm": 0.6841041445732117, + "learning_rate": 1.5538819664514165e-05, + "loss": 0.2785, + "step": 6940 + }, + { + "epoch": 0.653870610678034, + "grad_norm": 0.8491919040679932, + "learning_rate": 1.5537562367551365e-05, + "loss": 0.3484, + "step": 6941 + }, + { + "epoch": 0.6539648147712018, + "grad_norm": 0.7580857872962952, + "learning_rate": 1.553630494432358e-05, + "loss": 0.3264, + "step": 6942 + }, + { + "epoch": 0.6540590188643697, + "grad_norm": 0.7019778490066528, + "learning_rate": 1.553504739485948e-05, + "loss": 0.2818, + "step": 6943 + }, + { + "epoch": 0.6541532229575375, + "grad_norm": 0.7552631497383118, + "learning_rate": 1.553378971918774e-05, + "loss": 0.3703, + "step": 6944 + }, + { + "epoch": 0.6542474270507054, + "grad_norm": 0.7579901218414307, + "learning_rate": 1.553253191733704e-05, + "loss": 0.347, + "step": 6945 + }, + { + "epoch": 0.6543416311438732, + "grad_norm": 0.7338920831680298, + "learning_rate": 1.5531273989336052e-05, + "loss": 0.32, + "step": 6946 + }, + { + "epoch": 0.6544358352370411, + "grad_norm": 0.6904371976852417, + "learning_rate": 1.553001593521346e-05, + "loss": 0.3305, + "step": 6947 + }, + { + "epoch": 0.6545300393302089, + "grad_norm": 0.7427716255187988, + "learning_rate": 1.5528757754997957e-05, + "loss": 0.3337, + "step": 6948 + }, + { + "epoch": 0.6546242434233768, + "grad_norm": 0.7259464263916016, + "learning_rate": 1.5527499448718225e-05, + "loss": 0.2932, + "step": 6949 + }, + { + "epoch": 0.6547184475165446, + "grad_norm": 0.6885246634483337, + "learning_rate": 1.5526241016402962e-05, + "loss": 0.3193, + "step": 6950 + }, + { + "epoch": 0.6548126516097125, + "grad_norm": 0.8501678705215454, + "learning_rate": 1.552498245808085e-05, + "loss": 0.3212, + "step": 6951 + }, + { + "epoch": 0.6549068557028803, + "grad_norm": 0.7948251962661743, + "learning_rate": 1.5523723773780597e-05, + "loss": 0.35, + "step": 6952 + }, + { + "epoch": 0.6550010597960482, + "grad_norm": 0.7340381741523743, + "learning_rate": 1.55224649635309e-05, + "loss": 0.3224, + "step": 6953 + }, + { + "epoch": 0.655095263889216, + "grad_norm": 0.886984646320343, + "learning_rate": 1.5521206027360458e-05, + "loss": 0.3283, + "step": 6954 + }, + { + "epoch": 0.6551894679823839, + "grad_norm": 0.8832926154136658, + "learning_rate": 1.5519946965297984e-05, + "loss": 0.3545, + "step": 6955 + }, + { + "epoch": 0.6552836720755517, + "grad_norm": 0.6827700138092041, + "learning_rate": 1.551868777737218e-05, + "loss": 0.3008, + "step": 6956 + }, + { + "epoch": 0.6553778761687196, + "grad_norm": 0.8592090010643005, + "learning_rate": 1.551742846361176e-05, + "loss": 0.3598, + "step": 6957 + }, + { + "epoch": 0.6554720802618874, + "grad_norm": 0.7674247622489929, + "learning_rate": 1.5516169024045437e-05, + "loss": 0.313, + "step": 6958 + }, + { + "epoch": 0.6555662843550553, + "grad_norm": 0.9212135076522827, + "learning_rate": 1.551490945870193e-05, + "loss": 0.3673, + "step": 6959 + }, + { + "epoch": 0.6556604884482231, + "grad_norm": 0.7794392108917236, + "learning_rate": 1.5513649767609962e-05, + "loss": 0.3913, + "step": 6960 + }, + { + "epoch": 0.655754692541391, + "grad_norm": 0.690242350101471, + "learning_rate": 1.5512389950798248e-05, + "loss": 0.3117, + "step": 6961 + }, + { + "epoch": 0.6558488966345588, + "grad_norm": 0.7565612196922302, + "learning_rate": 1.551113000829552e-05, + "loss": 0.321, + "step": 6962 + }, + { + "epoch": 0.6559431007277267, + "grad_norm": 0.750796377658844, + "learning_rate": 1.55098699401305e-05, + "loss": 0.3157, + "step": 6963 + }, + { + "epoch": 0.6560373048208945, + "grad_norm": 0.7473530769348145, + "learning_rate": 1.550860974633193e-05, + "loss": 0.3469, + "step": 6964 + }, + { + "epoch": 0.6561315089140624, + "grad_norm": 0.7537606954574585, + "learning_rate": 1.550734942692854e-05, + "loss": 0.351, + "step": 6965 + }, + { + "epoch": 0.6562257130072302, + "grad_norm": 0.7087019085884094, + "learning_rate": 1.550608898194906e-05, + "loss": 0.3084, + "step": 6966 + }, + { + "epoch": 0.656319917100398, + "grad_norm": 0.6352601647377014, + "learning_rate": 1.5504828411422237e-05, + "loss": 0.2826, + "step": 6967 + }, + { + "epoch": 0.6564141211935659, + "grad_norm": 0.81820148229599, + "learning_rate": 1.550356771537682e-05, + "loss": 0.2954, + "step": 6968 + }, + { + "epoch": 0.6565083252867338, + "grad_norm": 0.6734491586685181, + "learning_rate": 1.550230689384154e-05, + "loss": 0.2805, + "step": 6969 + }, + { + "epoch": 0.6566025293799016, + "grad_norm": 0.6242235898971558, + "learning_rate": 1.550104594684515e-05, + "loss": 0.2695, + "step": 6970 + }, + { + "epoch": 0.6566967334730694, + "grad_norm": 0.6996893286705017, + "learning_rate": 1.549978487441641e-05, + "loss": 0.2763, + "step": 6971 + }, + { + "epoch": 0.6567909375662373, + "grad_norm": 0.766075611114502, + "learning_rate": 1.549852367658407e-05, + "loss": 0.3404, + "step": 6972 + }, + { + "epoch": 0.6568851416594051, + "grad_norm": 0.8189030289649963, + "learning_rate": 1.5497262353376888e-05, + "loss": 0.3535, + "step": 6973 + }, + { + "epoch": 0.656979345752573, + "grad_norm": 0.8577380180358887, + "learning_rate": 1.5496000904823622e-05, + "loss": 0.3258, + "step": 6974 + }, + { + "epoch": 0.6570735498457408, + "grad_norm": 0.8298494815826416, + "learning_rate": 1.5494739330953034e-05, + "loss": 0.3608, + "step": 6975 + }, + { + "epoch": 0.6571677539389087, + "grad_norm": 0.7967024445533752, + "learning_rate": 1.5493477631793893e-05, + "loss": 0.3569, + "step": 6976 + }, + { + "epoch": 0.6572619580320765, + "grad_norm": 0.7760042548179626, + "learning_rate": 1.549221580737496e-05, + "loss": 0.3331, + "step": 6977 + }, + { + "epoch": 0.6573561621252444, + "grad_norm": 0.7782050967216492, + "learning_rate": 1.5490953857725023e-05, + "loss": 0.3272, + "step": 6978 + }, + { + "epoch": 0.6574503662184122, + "grad_norm": 0.6885071396827698, + "learning_rate": 1.5489691782872838e-05, + "loss": 0.3077, + "step": 6979 + }, + { + "epoch": 0.6575445703115801, + "grad_norm": 0.815770149230957, + "learning_rate": 1.5488429582847194e-05, + "loss": 0.3228, + "step": 6980 + }, + { + "epoch": 0.6576387744047479, + "grad_norm": 0.7214744687080383, + "learning_rate": 1.5487167257676868e-05, + "loss": 0.2933, + "step": 6981 + }, + { + "epoch": 0.6577329784979158, + "grad_norm": 0.7750257849693298, + "learning_rate": 1.5485904807390638e-05, + "loss": 0.3294, + "step": 6982 + }, + { + "epoch": 0.6578271825910836, + "grad_norm": 0.6970694065093994, + "learning_rate": 1.54846422320173e-05, + "loss": 0.3188, + "step": 6983 + }, + { + "epoch": 0.6579213866842515, + "grad_norm": 0.8175873160362244, + "learning_rate": 1.5483379531585634e-05, + "loss": 0.3047, + "step": 6984 + }, + { + "epoch": 0.6580155907774192, + "grad_norm": 0.6365392804145813, + "learning_rate": 1.5482116706124435e-05, + "loss": 0.3088, + "step": 6985 + }, + { + "epoch": 0.6581097948705871, + "grad_norm": 0.8470643162727356, + "learning_rate": 1.5480853755662498e-05, + "loss": 0.3328, + "step": 6986 + }, + { + "epoch": 0.6582039989637549, + "grad_norm": 0.6874431371688843, + "learning_rate": 1.5479590680228612e-05, + "loss": 0.2949, + "step": 6987 + }, + { + "epoch": 0.6582982030569228, + "grad_norm": 0.7445035576820374, + "learning_rate": 1.5478327479851592e-05, + "loss": 0.3493, + "step": 6988 + }, + { + "epoch": 0.6583924071500906, + "grad_norm": 0.6998085975646973, + "learning_rate": 1.5477064154560232e-05, + "loss": 0.3309, + "step": 6989 + }, + { + "epoch": 0.6584866112432585, + "grad_norm": 0.7649194002151489, + "learning_rate": 1.5475800704383338e-05, + "loss": 0.3242, + "step": 6990 + }, + { + "epoch": 0.6585808153364263, + "grad_norm": 0.687041163444519, + "learning_rate": 1.547453712934972e-05, + "loss": 0.298, + "step": 6991 + }, + { + "epoch": 0.6586750194295942, + "grad_norm": 0.8076814413070679, + "learning_rate": 1.5473273429488187e-05, + "loss": 0.329, + "step": 6992 + }, + { + "epoch": 0.658769223522762, + "grad_norm": 0.8376584649085999, + "learning_rate": 1.5472009604827557e-05, + "loss": 0.3438, + "step": 6993 + }, + { + "epoch": 0.6588634276159299, + "grad_norm": 0.9204269647598267, + "learning_rate": 1.5470745655396643e-05, + "loss": 0.3012, + "step": 6994 + }, + { + "epoch": 0.6589576317090977, + "grad_norm": 0.653154194355011, + "learning_rate": 1.5469481581224274e-05, + "loss": 0.2979, + "step": 6995 + }, + { + "epoch": 0.6590518358022656, + "grad_norm": 0.8088932633399963, + "learning_rate": 1.5468217382339256e-05, + "loss": 0.3347, + "step": 6996 + }, + { + "epoch": 0.6591460398954334, + "grad_norm": 0.8216959238052368, + "learning_rate": 1.546695305877043e-05, + "loss": 0.3401, + "step": 6997 + }, + { + "epoch": 0.6592402439886013, + "grad_norm": 0.737947404384613, + "learning_rate": 1.546568861054662e-05, + "loss": 0.3203, + "step": 6998 + }, + { + "epoch": 0.6593344480817691, + "grad_norm": 0.7853481769561768, + "learning_rate": 1.5464424037696655e-05, + "loss": 0.3525, + "step": 6999 + }, + { + "epoch": 0.659428652174937, + "grad_norm": 0.7092281579971313, + "learning_rate": 1.5463159340249377e-05, + "loss": 0.3131, + "step": 7000 + }, + { + "epoch": 0.6595228562681048, + "grad_norm": 0.7580530643463135, + "learning_rate": 1.546189451823361e-05, + "loss": 0.3227, + "step": 7001 + }, + { + "epoch": 0.6596170603612727, + "grad_norm": 0.7429535984992981, + "learning_rate": 1.5460629571678205e-05, + "loss": 0.3315, + "step": 7002 + }, + { + "epoch": 0.6597112644544405, + "grad_norm": 0.7682792544364929, + "learning_rate": 1.5459364500612e-05, + "loss": 0.3303, + "step": 7003 + }, + { + "epoch": 0.6598054685476084, + "grad_norm": 0.6755010485649109, + "learning_rate": 1.545809930506384e-05, + "loss": 0.2796, + "step": 7004 + }, + { + "epoch": 0.6598996726407762, + "grad_norm": 0.776313304901123, + "learning_rate": 1.5456833985062574e-05, + "loss": 0.3469, + "step": 7005 + }, + { + "epoch": 0.659993876733944, + "grad_norm": 0.7985623478889465, + "learning_rate": 1.5455568540637055e-05, + "loss": 0.3586, + "step": 7006 + }, + { + "epoch": 0.6600880808271119, + "grad_norm": 0.7103636264801025, + "learning_rate": 1.5454302971816138e-05, + "loss": 0.3139, + "step": 7007 + }, + { + "epoch": 0.6601822849202797, + "grad_norm": 0.7411162853240967, + "learning_rate": 1.5453037278628676e-05, + "loss": 0.3214, + "step": 7008 + }, + { + "epoch": 0.6602764890134476, + "grad_norm": 0.7773886322975159, + "learning_rate": 1.545177146110353e-05, + "loss": 0.3126, + "step": 7009 + }, + { + "epoch": 0.6603706931066154, + "grad_norm": 0.7389939427375793, + "learning_rate": 1.5450505519269568e-05, + "loss": 0.3177, + "step": 7010 + }, + { + "epoch": 0.6604648971997833, + "grad_norm": 0.7481390833854675, + "learning_rate": 1.544923945315565e-05, + "loss": 0.298, + "step": 7011 + }, + { + "epoch": 0.6605591012929511, + "grad_norm": 0.699995756149292, + "learning_rate": 1.5447973262790638e-05, + "loss": 0.3108, + "step": 7012 + }, + { + "epoch": 0.660653305386119, + "grad_norm": 0.7394351959228516, + "learning_rate": 1.5446706948203415e-05, + "loss": 0.3307, + "step": 7013 + }, + { + "epoch": 0.6607475094792868, + "grad_norm": 0.7475301623344421, + "learning_rate": 1.544544050942285e-05, + "loss": 0.3158, + "step": 7014 + }, + { + "epoch": 0.6608417135724547, + "grad_norm": 0.8901613354682922, + "learning_rate": 1.544417394647782e-05, + "loss": 0.3282, + "step": 7015 + }, + { + "epoch": 0.6609359176656225, + "grad_norm": 0.72362220287323, + "learning_rate": 1.5442907259397203e-05, + "loss": 0.3231, + "step": 7016 + }, + { + "epoch": 0.6610301217587904, + "grad_norm": 0.7319814562797546, + "learning_rate": 1.5441640448209884e-05, + "loss": 0.36, + "step": 7017 + }, + { + "epoch": 0.6611243258519582, + "grad_norm": 0.7211571335792542, + "learning_rate": 1.544037351294475e-05, + "loss": 0.3455, + "step": 7018 + }, + { + "epoch": 0.6612185299451261, + "grad_norm": 0.6436552405357361, + "learning_rate": 1.5439106453630683e-05, + "loss": 0.2964, + "step": 7019 + }, + { + "epoch": 0.6613127340382939, + "grad_norm": 0.6992090940475464, + "learning_rate": 1.5437839270296575e-05, + "loss": 0.3117, + "step": 7020 + }, + { + "epoch": 0.6614069381314618, + "grad_norm": 0.7446430921554565, + "learning_rate": 1.5436571962971325e-05, + "loss": 0.3103, + "step": 7021 + }, + { + "epoch": 0.6615011422246296, + "grad_norm": 0.8667694330215454, + "learning_rate": 1.5435304531683827e-05, + "loss": 0.299, + "step": 7022 + }, + { + "epoch": 0.6615953463177975, + "grad_norm": 0.8172776699066162, + "learning_rate": 1.5434036976462977e-05, + "loss": 0.3013, + "step": 7023 + }, + { + "epoch": 0.6616895504109653, + "grad_norm": 0.8228224515914917, + "learning_rate": 1.543276929733768e-05, + "loss": 0.3266, + "step": 7024 + }, + { + "epoch": 0.6617837545041332, + "grad_norm": 0.8509236574172974, + "learning_rate": 1.5431501494336843e-05, + "loss": 0.3442, + "step": 7025 + }, + { + "epoch": 0.661877958597301, + "grad_norm": 0.7069663405418396, + "learning_rate": 1.5430233567489375e-05, + "loss": 0.279, + "step": 7026 + }, + { + "epoch": 0.6619721626904689, + "grad_norm": 0.7457709908485413, + "learning_rate": 1.5428965516824178e-05, + "loss": 0.3255, + "step": 7027 + }, + { + "epoch": 0.6620663667836367, + "grad_norm": 0.8237126469612122, + "learning_rate": 1.5427697342370175e-05, + "loss": 0.3547, + "step": 7028 + }, + { + "epoch": 0.6621605708768046, + "grad_norm": 0.9309609532356262, + "learning_rate": 1.5426429044156276e-05, + "loss": 0.2992, + "step": 7029 + }, + { + "epoch": 0.6622547749699724, + "grad_norm": 0.7035329341888428, + "learning_rate": 1.5425160622211402e-05, + "loss": 0.3505, + "step": 7030 + }, + { + "epoch": 0.6623489790631403, + "grad_norm": 0.6925642490386963, + "learning_rate": 1.542389207656448e-05, + "loss": 0.3032, + "step": 7031 + }, + { + "epoch": 0.6624431831563081, + "grad_norm": 0.8220372200012207, + "learning_rate": 1.5422623407244425e-05, + "loss": 0.3311, + "step": 7032 + }, + { + "epoch": 0.662537387249476, + "grad_norm": 0.7475167512893677, + "learning_rate": 1.5421354614280174e-05, + "loss": 0.2834, + "step": 7033 + }, + { + "epoch": 0.6626315913426438, + "grad_norm": 0.7504574060440063, + "learning_rate": 1.542008569770065e-05, + "loss": 0.2699, + "step": 7034 + }, + { + "epoch": 0.6627257954358117, + "grad_norm": 0.6945213079452515, + "learning_rate": 1.5418816657534793e-05, + "loss": 0.3236, + "step": 7035 + }, + { + "epoch": 0.6628199995289795, + "grad_norm": 0.9477180242538452, + "learning_rate": 1.5417547493811533e-05, + "loss": 0.3594, + "step": 7036 + }, + { + "epoch": 0.6629142036221474, + "grad_norm": 0.7390096783638, + "learning_rate": 1.5416278206559816e-05, + "loss": 0.365, + "step": 7037 + }, + { + "epoch": 0.6630084077153152, + "grad_norm": 0.7738800048828125, + "learning_rate": 1.5415008795808578e-05, + "loss": 0.3057, + "step": 7038 + }, + { + "epoch": 0.6631026118084831, + "grad_norm": 0.7573422193527222, + "learning_rate": 1.541373926158676e-05, + "loss": 0.3143, + "step": 7039 + }, + { + "epoch": 0.6631968159016509, + "grad_norm": 0.6920155882835388, + "learning_rate": 1.541246960392332e-05, + "loss": 0.2913, + "step": 7040 + }, + { + "epoch": 0.6632910199948188, + "grad_norm": 0.8231220245361328, + "learning_rate": 1.5411199822847202e-05, + "loss": 0.318, + "step": 7041 + }, + { + "epoch": 0.6633852240879866, + "grad_norm": 0.7004749774932861, + "learning_rate": 1.5409929918387357e-05, + "loss": 0.284, + "step": 7042 + }, + { + "epoch": 0.6634794281811545, + "grad_norm": 0.7746490240097046, + "learning_rate": 1.5408659890572746e-05, + "loss": 0.3166, + "step": 7043 + }, + { + "epoch": 0.6635736322743223, + "grad_norm": 0.8276163935661316, + "learning_rate": 1.540738973943232e-05, + "loss": 0.3065, + "step": 7044 + }, + { + "epoch": 0.6636678363674902, + "grad_norm": 0.6804275512695312, + "learning_rate": 1.540611946499505e-05, + "loss": 0.2782, + "step": 7045 + }, + { + "epoch": 0.663762040460658, + "grad_norm": 0.9253482818603516, + "learning_rate": 1.5404849067289896e-05, + "loss": 0.3574, + "step": 7046 + }, + { + "epoch": 0.6638562445538259, + "grad_norm": 0.7378069162368774, + "learning_rate": 1.540357854634582e-05, + "loss": 0.3055, + "step": 7047 + }, + { + "epoch": 0.6639504486469937, + "grad_norm": 0.7682768106460571, + "learning_rate": 1.5402307902191803e-05, + "loss": 0.3325, + "step": 7048 + }, + { + "epoch": 0.6640446527401616, + "grad_norm": 0.7268179059028625, + "learning_rate": 1.54010371348568e-05, + "loss": 0.2796, + "step": 7049 + }, + { + "epoch": 0.6641388568333294, + "grad_norm": 0.8494754433631897, + "learning_rate": 1.5399766244369806e-05, + "loss": 0.282, + "step": 7050 + }, + { + "epoch": 0.6642330609264973, + "grad_norm": 0.7781804800033569, + "learning_rate": 1.5398495230759793e-05, + "loss": 0.3255, + "step": 7051 + }, + { + "epoch": 0.6643272650196651, + "grad_norm": 0.6691144108772278, + "learning_rate": 1.5397224094055732e-05, + "loss": 0.2812, + "step": 7052 + }, + { + "epoch": 0.664421469112833, + "grad_norm": 0.784238338470459, + "learning_rate": 1.539595283428662e-05, + "loss": 0.3432, + "step": 7053 + }, + { + "epoch": 0.6645156732060008, + "grad_norm": 0.7589924335479736, + "learning_rate": 1.5394681451481437e-05, + "loss": 0.3392, + "step": 7054 + }, + { + "epoch": 0.6646098772991686, + "grad_norm": 0.6793816089630127, + "learning_rate": 1.5393409945669177e-05, + "loss": 0.3015, + "step": 7055 + }, + { + "epoch": 0.6647040813923365, + "grad_norm": 0.7202833890914917, + "learning_rate": 1.5392138316878826e-05, + "loss": 0.3121, + "step": 7056 + }, + { + "epoch": 0.6647982854855043, + "grad_norm": 0.8880021572113037, + "learning_rate": 1.539086656513938e-05, + "loss": 0.2888, + "step": 7057 + }, + { + "epoch": 0.6648924895786722, + "grad_norm": 0.7612547278404236, + "learning_rate": 1.538959469047984e-05, + "loss": 0.3234, + "step": 7058 + }, + { + "epoch": 0.66498669367184, + "grad_norm": 0.8277795314788818, + "learning_rate": 1.5388322692929207e-05, + "loss": 0.3174, + "step": 7059 + }, + { + "epoch": 0.6650808977650079, + "grad_norm": 0.8115323781967163, + "learning_rate": 1.5387050572516488e-05, + "loss": 0.2976, + "step": 7060 + }, + { + "epoch": 0.6651751018581757, + "grad_norm": 0.7044195532798767, + "learning_rate": 1.5385778329270676e-05, + "loss": 0.3068, + "step": 7061 + }, + { + "epoch": 0.6652693059513436, + "grad_norm": 0.9168974161148071, + "learning_rate": 1.5384505963220794e-05, + "loss": 0.3771, + "step": 7062 + }, + { + "epoch": 0.6653635100445114, + "grad_norm": 0.7239624857902527, + "learning_rate": 1.5383233474395848e-05, + "loss": 0.3074, + "step": 7063 + }, + { + "epoch": 0.6654577141376793, + "grad_norm": 0.7223944664001465, + "learning_rate": 1.5381960862824853e-05, + "loss": 0.3603, + "step": 7064 + }, + { + "epoch": 0.6655519182308471, + "grad_norm": 0.7156306505203247, + "learning_rate": 1.5380688128536827e-05, + "loss": 0.3039, + "step": 7065 + }, + { + "epoch": 0.665646122324015, + "grad_norm": 0.8616565465927124, + "learning_rate": 1.5379415271560794e-05, + "loss": 0.3457, + "step": 7066 + }, + { + "epoch": 0.6657403264171828, + "grad_norm": 0.8814892172813416, + "learning_rate": 1.5378142291925768e-05, + "loss": 0.3634, + "step": 7067 + }, + { + "epoch": 0.6658345305103507, + "grad_norm": 0.8556869626045227, + "learning_rate": 1.5376869189660784e-05, + "loss": 0.3858, + "step": 7068 + }, + { + "epoch": 0.6659287346035185, + "grad_norm": 0.758821964263916, + "learning_rate": 1.5375595964794862e-05, + "loss": 0.3542, + "step": 7069 + }, + { + "epoch": 0.6660229386966864, + "grad_norm": 0.915661096572876, + "learning_rate": 1.5374322617357046e-05, + "loss": 0.3306, + "step": 7070 + }, + { + "epoch": 0.6661171427898542, + "grad_norm": 0.7214668393135071, + "learning_rate": 1.5373049147376358e-05, + "loss": 0.3163, + "step": 7071 + }, + { + "epoch": 0.6662113468830221, + "grad_norm": 0.9709456562995911, + "learning_rate": 1.5371775554881837e-05, + "loss": 0.2997, + "step": 7072 + }, + { + "epoch": 0.6663055509761899, + "grad_norm": 0.7502962350845337, + "learning_rate": 1.5370501839902533e-05, + "loss": 0.3331, + "step": 7073 + }, + { + "epoch": 0.6663997550693578, + "grad_norm": 0.8238278031349182, + "learning_rate": 1.5369228002467477e-05, + "loss": 0.324, + "step": 7074 + }, + { + "epoch": 0.6664939591625256, + "grad_norm": 0.698924720287323, + "learning_rate": 1.536795404260572e-05, + "loss": 0.2914, + "step": 7075 + }, + { + "epoch": 0.6665881632556935, + "grad_norm": 0.7339386343955994, + "learning_rate": 1.5366679960346307e-05, + "loss": 0.3572, + "step": 7076 + }, + { + "epoch": 0.6666823673488613, + "grad_norm": 0.7864017486572266, + "learning_rate": 1.5365405755718293e-05, + "loss": 0.3427, + "step": 7077 + }, + { + "epoch": 0.6667765714420292, + "grad_norm": 0.8368102312088013, + "learning_rate": 1.536413142875073e-05, + "loss": 0.3304, + "step": 7078 + }, + { + "epoch": 0.666870775535197, + "grad_norm": 0.6254177093505859, + "learning_rate": 1.5362856979472672e-05, + "loss": 0.2907, + "step": 7079 + }, + { + "epoch": 0.6669649796283649, + "grad_norm": 0.7711076140403748, + "learning_rate": 1.5361582407913188e-05, + "loss": 0.3246, + "step": 7080 + }, + { + "epoch": 0.6670591837215327, + "grad_norm": 0.7080456018447876, + "learning_rate": 1.5360307714101326e-05, + "loss": 0.3072, + "step": 7081 + }, + { + "epoch": 0.6671533878147006, + "grad_norm": 0.7083311080932617, + "learning_rate": 1.535903289806616e-05, + "loss": 0.2914, + "step": 7082 + }, + { + "epoch": 0.6672475919078684, + "grad_norm": 0.7401914596557617, + "learning_rate": 1.535775795983676e-05, + "loss": 0.2759, + "step": 7083 + }, + { + "epoch": 0.6673417960010363, + "grad_norm": 0.8789626955986023, + "learning_rate": 1.5356482899442188e-05, + "loss": 0.3096, + "step": 7084 + }, + { + "epoch": 0.6674360000942041, + "grad_norm": 0.7936314940452576, + "learning_rate": 1.5355207716911523e-05, + "loss": 0.3301, + "step": 7085 + }, + { + "epoch": 0.667530204187372, + "grad_norm": 0.6872920393943787, + "learning_rate": 1.535393241227384e-05, + "loss": 0.3099, + "step": 7086 + }, + { + "epoch": 0.6676244082805398, + "grad_norm": 0.7527125477790833, + "learning_rate": 1.535265698555822e-05, + "loss": 0.3669, + "step": 7087 + }, + { + "epoch": 0.6677186123737077, + "grad_norm": 0.8189874887466431, + "learning_rate": 1.535138143679374e-05, + "loss": 0.3399, + "step": 7088 + }, + { + "epoch": 0.6678128164668755, + "grad_norm": 0.8073145151138306, + "learning_rate": 1.535010576600949e-05, + "loss": 0.3524, + "step": 7089 + }, + { + "epoch": 0.6679070205600434, + "grad_norm": 0.760679304599762, + "learning_rate": 1.534882997323455e-05, + "loss": 0.3643, + "step": 7090 + }, + { + "epoch": 0.6680012246532112, + "grad_norm": 0.968233048915863, + "learning_rate": 1.534755405849802e-05, + "loss": 0.3642, + "step": 7091 + }, + { + "epoch": 0.6680954287463791, + "grad_norm": 0.7697311043739319, + "learning_rate": 1.5346278021828983e-05, + "loss": 0.3609, + "step": 7092 + }, + { + "epoch": 0.6681896328395469, + "grad_norm": 0.824440062046051, + "learning_rate": 1.534500186325654e-05, + "loss": 0.3532, + "step": 7093 + }, + { + "epoch": 0.6682838369327148, + "grad_norm": 0.7930930852890015, + "learning_rate": 1.5343725582809793e-05, + "loss": 0.3141, + "step": 7094 + }, + { + "epoch": 0.6683780410258826, + "grad_norm": 0.6535442471504211, + "learning_rate": 1.5342449180517834e-05, + "loss": 0.3, + "step": 7095 + }, + { + "epoch": 0.6684722451190505, + "grad_norm": 1.4270353317260742, + "learning_rate": 1.534117265640977e-05, + "loss": 0.329, + "step": 7096 + }, + { + "epoch": 0.6685664492122183, + "grad_norm": 0.7029897570610046, + "learning_rate": 1.533989601051471e-05, + "loss": 0.2952, + "step": 7097 + }, + { + "epoch": 0.6686606533053862, + "grad_norm": 0.8326965570449829, + "learning_rate": 1.5338619242861766e-05, + "loss": 0.323, + "step": 7098 + }, + { + "epoch": 0.668754857398554, + "grad_norm": 0.9096085429191589, + "learning_rate": 1.5337342353480044e-05, + "loss": 0.3177, + "step": 7099 + }, + { + "epoch": 0.6688490614917219, + "grad_norm": 0.9957058429718018, + "learning_rate": 1.5336065342398664e-05, + "loss": 0.3176, + "step": 7100 + }, + { + "epoch": 0.6689432655848897, + "grad_norm": 0.7245864272117615, + "learning_rate": 1.5334788209646738e-05, + "loss": 0.2801, + "step": 7101 + }, + { + "epoch": 0.6690374696780576, + "grad_norm": 0.8069775700569153, + "learning_rate": 1.5333510955253396e-05, + "loss": 0.341, + "step": 7102 + }, + { + "epoch": 0.6691316737712254, + "grad_norm": 0.7672380805015564, + "learning_rate": 1.533223357924775e-05, + "loss": 0.3182, + "step": 7103 + }, + { + "epoch": 0.6692258778643932, + "grad_norm": 0.692295253276825, + "learning_rate": 1.5330956081658932e-05, + "loss": 0.3094, + "step": 7104 + }, + { + "epoch": 0.6693200819575611, + "grad_norm": 0.757405698299408, + "learning_rate": 1.5329678462516073e-05, + "loss": 0.3011, + "step": 7105 + }, + { + "epoch": 0.669414286050729, + "grad_norm": 0.6376831531524658, + "learning_rate": 1.5328400721848305e-05, + "loss": 0.3108, + "step": 7106 + }, + { + "epoch": 0.6695084901438968, + "grad_norm": 0.6972873210906982, + "learning_rate": 1.5327122859684758e-05, + "loss": 0.3065, + "step": 7107 + }, + { + "epoch": 0.6696026942370646, + "grad_norm": 0.7769290804862976, + "learning_rate": 1.532584487605457e-05, + "loss": 0.3314, + "step": 7108 + }, + { + "epoch": 0.6696968983302325, + "grad_norm": 0.8072546124458313, + "learning_rate": 1.5324566770986884e-05, + "loss": 0.3751, + "step": 7109 + }, + { + "epoch": 0.6697911024234003, + "grad_norm": 0.7501782178878784, + "learning_rate": 1.532328854451084e-05, + "loss": 0.341, + "step": 7110 + }, + { + "epoch": 0.6698853065165682, + "grad_norm": 0.6741506457328796, + "learning_rate": 1.532201019665559e-05, + "loss": 0.3058, + "step": 7111 + }, + { + "epoch": 0.669979510609736, + "grad_norm": 0.71759033203125, + "learning_rate": 1.5320731727450268e-05, + "loss": 0.3151, + "step": 7112 + }, + { + "epoch": 0.6700737147029039, + "grad_norm": 0.7359454035758972, + "learning_rate": 1.5319453136924037e-05, + "loss": 0.3131, + "step": 7113 + }, + { + "epoch": 0.6701679187960717, + "grad_norm": 0.737964928150177, + "learning_rate": 1.531817442510605e-05, + "loss": 0.3149, + "step": 7114 + }, + { + "epoch": 0.6702621228892396, + "grad_norm": 0.8473628163337708, + "learning_rate": 1.5316895592025458e-05, + "loss": 0.3736, + "step": 7115 + }, + { + "epoch": 0.6703563269824074, + "grad_norm": 0.672426700592041, + "learning_rate": 1.5315616637711424e-05, + "loss": 0.3112, + "step": 7116 + }, + { + "epoch": 0.6704505310755753, + "grad_norm": 0.6875741481781006, + "learning_rate": 1.5314337562193112e-05, + "loss": 0.3068, + "step": 7117 + }, + { + "epoch": 0.6705447351687431, + "grad_norm": 0.9028030633926392, + "learning_rate": 1.5313058365499686e-05, + "loss": 0.3761, + "step": 7118 + }, + { + "epoch": 0.670638939261911, + "grad_norm": 0.7115257382392883, + "learning_rate": 1.5311779047660312e-05, + "loss": 0.3361, + "step": 7119 + }, + { + "epoch": 0.6707331433550788, + "grad_norm": 0.8416838049888611, + "learning_rate": 1.531049960870416e-05, + "loss": 0.3459, + "step": 7120 + }, + { + "epoch": 0.6708273474482467, + "grad_norm": 0.8257519006729126, + "learning_rate": 1.5309220048660403e-05, + "loss": 0.3544, + "step": 7121 + }, + { + "epoch": 0.6709215515414145, + "grad_norm": 0.6752427220344543, + "learning_rate": 1.5307940367558217e-05, + "loss": 0.303, + "step": 7122 + }, + { + "epoch": 0.6710157556345823, + "grad_norm": 0.6976400017738342, + "learning_rate": 1.530666056542679e-05, + "loss": 0.3069, + "step": 7123 + }, + { + "epoch": 0.6711099597277501, + "grad_norm": 0.7103172540664673, + "learning_rate": 1.5305380642295285e-05, + "loss": 0.3026, + "step": 7124 + }, + { + "epoch": 0.671204163820918, + "grad_norm": 0.7587864995002747, + "learning_rate": 1.53041005981929e-05, + "loss": 0.3254, + "step": 7125 + }, + { + "epoch": 0.6712983679140858, + "grad_norm": 0.7170512676239014, + "learning_rate": 1.5302820433148817e-05, + "loss": 0.3277, + "step": 7126 + }, + { + "epoch": 0.6713925720072537, + "grad_norm": 0.7051998972892761, + "learning_rate": 1.5301540147192227e-05, + "loss": 0.2846, + "step": 7127 + }, + { + "epoch": 0.6714867761004215, + "grad_norm": 0.7642733454704285, + "learning_rate": 1.5300259740352327e-05, + "loss": 0.3373, + "step": 7128 + }, + { + "epoch": 0.6715809801935894, + "grad_norm": 0.6517353653907776, + "learning_rate": 1.5298979212658304e-05, + "loss": 0.2974, + "step": 7129 + }, + { + "epoch": 0.6716751842867572, + "grad_norm": 0.7439292669296265, + "learning_rate": 1.5297698564139364e-05, + "loss": 0.3111, + "step": 7130 + }, + { + "epoch": 0.6717693883799251, + "grad_norm": 0.6721612811088562, + "learning_rate": 1.52964177948247e-05, + "loss": 0.3028, + "step": 7131 + }, + { + "epoch": 0.6718635924730929, + "grad_norm": 0.7699888944625854, + "learning_rate": 1.5295136904743518e-05, + "loss": 0.3321, + "step": 7132 + }, + { + "epoch": 0.6719577965662608, + "grad_norm": 0.7608176469802856, + "learning_rate": 1.529385589392503e-05, + "loss": 0.2732, + "step": 7133 + }, + { + "epoch": 0.6720520006594286, + "grad_norm": 0.7948134541511536, + "learning_rate": 1.529257476239844e-05, + "loss": 0.3372, + "step": 7134 + }, + { + "epoch": 0.6721462047525965, + "grad_norm": 0.7166592478752136, + "learning_rate": 1.5291293510192957e-05, + "loss": 0.3339, + "step": 7135 + }, + { + "epoch": 0.6722404088457643, + "grad_norm": 0.7006956338882446, + "learning_rate": 1.52900121373378e-05, + "loss": 0.3186, + "step": 7136 + }, + { + "epoch": 0.6723346129389322, + "grad_norm": 0.7558178305625916, + "learning_rate": 1.5288730643862185e-05, + "loss": 0.3647, + "step": 7137 + }, + { + "epoch": 0.6724288170321, + "grad_norm": 0.8109870553016663, + "learning_rate": 1.5287449029795335e-05, + "loss": 0.3012, + "step": 7138 + }, + { + "epoch": 0.6725230211252679, + "grad_norm": 0.8044179081916809, + "learning_rate": 1.5286167295166468e-05, + "loss": 0.3522, + "step": 7139 + }, + { + "epoch": 0.6726172252184357, + "grad_norm": 0.8509412407875061, + "learning_rate": 1.528488544000481e-05, + "loss": 0.3154, + "step": 7140 + }, + { + "epoch": 0.6727114293116035, + "grad_norm": 0.7896276116371155, + "learning_rate": 1.528360346433959e-05, + "loss": 0.3271, + "step": 7141 + }, + { + "epoch": 0.6728056334047714, + "grad_norm": 0.9728242754936218, + "learning_rate": 1.528232136820004e-05, + "loss": 0.3971, + "step": 7142 + }, + { + "epoch": 0.6728998374979392, + "grad_norm": 0.8134274482727051, + "learning_rate": 1.52810391516154e-05, + "loss": 0.3393, + "step": 7143 + }, + { + "epoch": 0.6729940415911071, + "grad_norm": 1.1658309698104858, + "learning_rate": 1.527975681461489e-05, + "loss": 0.3121, + "step": 7144 + }, + { + "epoch": 0.6730882456842749, + "grad_norm": 0.7035720944404602, + "learning_rate": 1.5278474357227765e-05, + "loss": 0.2972, + "step": 7145 + }, + { + "epoch": 0.6731824497774428, + "grad_norm": 0.7104215025901794, + "learning_rate": 1.527719177948326e-05, + "loss": 0.3073, + "step": 7146 + }, + { + "epoch": 0.6732766538706106, + "grad_norm": 0.7597449421882629, + "learning_rate": 1.5275909081410622e-05, + "loss": 0.3584, + "step": 7147 + }, + { + "epoch": 0.6733708579637785, + "grad_norm": 0.6931965947151184, + "learning_rate": 1.52746262630391e-05, + "loss": 0.3201, + "step": 7148 + }, + { + "epoch": 0.6734650620569463, + "grad_norm": 0.7502024173736572, + "learning_rate": 1.5273343324397938e-05, + "loss": 0.2716, + "step": 7149 + }, + { + "epoch": 0.6735592661501142, + "grad_norm": 0.7594082951545715, + "learning_rate": 1.5272060265516392e-05, + "loss": 0.2708, + "step": 7150 + }, + { + "epoch": 0.673653470243282, + "grad_norm": 0.8175405859947205, + "learning_rate": 1.5270777086423724e-05, + "loss": 0.3482, + "step": 7151 + }, + { + "epoch": 0.6737476743364499, + "grad_norm": 0.8257627487182617, + "learning_rate": 1.5269493787149183e-05, + "loss": 0.3398, + "step": 7152 + }, + { + "epoch": 0.6738418784296177, + "grad_norm": 0.6777796149253845, + "learning_rate": 1.5268210367722035e-05, + "loss": 0.3117, + "step": 7153 + }, + { + "epoch": 0.6739360825227856, + "grad_norm": 0.638866126537323, + "learning_rate": 1.5266926828171542e-05, + "loss": 0.3195, + "step": 7154 + }, + { + "epoch": 0.6740302866159534, + "grad_norm": 0.6855704188346863, + "learning_rate": 1.5265643168526972e-05, + "loss": 0.2833, + "step": 7155 + }, + { + "epoch": 0.6741244907091213, + "grad_norm": 0.7688138484954834, + "learning_rate": 1.5264359388817596e-05, + "loss": 0.3279, + "step": 7156 + }, + { + "epoch": 0.6742186948022891, + "grad_norm": 0.6711253523826599, + "learning_rate": 1.526307548907268e-05, + "loss": 0.287, + "step": 7157 + }, + { + "epoch": 0.674312898895457, + "grad_norm": 0.8004896640777588, + "learning_rate": 1.526179146932151e-05, + "loss": 0.3616, + "step": 7158 + }, + { + "epoch": 0.6744071029886248, + "grad_norm": 0.6729282736778259, + "learning_rate": 1.5260507329593354e-05, + "loss": 0.3272, + "step": 7159 + }, + { + "epoch": 0.6745013070817927, + "grad_norm": 0.7034173011779785, + "learning_rate": 1.52592230699175e-05, + "loss": 0.3348, + "step": 7160 + }, + { + "epoch": 0.6745955111749605, + "grad_norm": 0.6225616931915283, + "learning_rate": 1.5257938690323218e-05, + "loss": 0.2642, + "step": 7161 + }, + { + "epoch": 0.6746897152681284, + "grad_norm": 0.7345744967460632, + "learning_rate": 1.5256654190839806e-05, + "loss": 0.2824, + "step": 7162 + }, + { + "epoch": 0.6747839193612962, + "grad_norm": 0.6629753112792969, + "learning_rate": 1.525536957149655e-05, + "loss": 0.3058, + "step": 7163 + }, + { + "epoch": 0.6748781234544641, + "grad_norm": 0.7448536157608032, + "learning_rate": 1.5254084832322736e-05, + "loss": 0.3332, + "step": 7164 + }, + { + "epoch": 0.6749723275476319, + "grad_norm": 0.7381179332733154, + "learning_rate": 1.5252799973347667e-05, + "loss": 0.3314, + "step": 7165 + }, + { + "epoch": 0.6750665316407998, + "grad_norm": 0.7234967350959778, + "learning_rate": 1.5251514994600633e-05, + "loss": 0.3454, + "step": 7166 + }, + { + "epoch": 0.6751607357339676, + "grad_norm": 0.6849322319030762, + "learning_rate": 1.5250229896110935e-05, + "loss": 0.3421, + "step": 7167 + }, + { + "epoch": 0.6752549398271355, + "grad_norm": 0.7592267990112305, + "learning_rate": 1.5248944677907877e-05, + "loss": 0.3175, + "step": 7168 + }, + { + "epoch": 0.6753491439203033, + "grad_norm": 0.707671046257019, + "learning_rate": 1.5247659340020763e-05, + "loss": 0.3274, + "step": 7169 + }, + { + "epoch": 0.6754433480134712, + "grad_norm": 0.7229474186897278, + "learning_rate": 1.5246373882478899e-05, + "loss": 0.3219, + "step": 7170 + }, + { + "epoch": 0.675537552106639, + "grad_norm": 0.8195897936820984, + "learning_rate": 1.5245088305311599e-05, + "loss": 0.3377, + "step": 7171 + }, + { + "epoch": 0.6756317561998069, + "grad_norm": 0.720527172088623, + "learning_rate": 1.524380260854817e-05, + "loss": 0.3536, + "step": 7172 + }, + { + "epoch": 0.6757259602929747, + "grad_norm": 0.7520124912261963, + "learning_rate": 1.5242516792217936e-05, + "loss": 0.2899, + "step": 7173 + }, + { + "epoch": 0.6758201643861426, + "grad_norm": 0.8100510835647583, + "learning_rate": 1.5241230856350209e-05, + "loss": 0.3314, + "step": 7174 + }, + { + "epoch": 0.6759143684793104, + "grad_norm": 0.7205991744995117, + "learning_rate": 1.5239944800974313e-05, + "loss": 0.3313, + "step": 7175 + }, + { + "epoch": 0.6760085725724783, + "grad_norm": 0.8324742317199707, + "learning_rate": 1.5238658626119574e-05, + "loss": 0.3426, + "step": 7176 + }, + { + "epoch": 0.6761027766656461, + "grad_norm": 0.7149767279624939, + "learning_rate": 1.5237372331815312e-05, + "loss": 0.3028, + "step": 7177 + }, + { + "epoch": 0.676196980758814, + "grad_norm": 0.7355038523674011, + "learning_rate": 1.5236085918090867e-05, + "loss": 0.3039, + "step": 7178 + }, + { + "epoch": 0.6762911848519818, + "grad_norm": 0.7257593870162964, + "learning_rate": 1.5234799384975562e-05, + "loss": 0.3449, + "step": 7179 + }, + { + "epoch": 0.6763853889451497, + "grad_norm": 0.7089222073554993, + "learning_rate": 1.5233512732498735e-05, + "loss": 0.2887, + "step": 7180 + }, + { + "epoch": 0.6764795930383175, + "grad_norm": 0.7266278266906738, + "learning_rate": 1.5232225960689722e-05, + "loss": 0.284, + "step": 7181 + }, + { + "epoch": 0.6765737971314854, + "grad_norm": 0.7619205713272095, + "learning_rate": 1.523093906957787e-05, + "loss": 0.3695, + "step": 7182 + }, + { + "epoch": 0.6766680012246532, + "grad_norm": 0.7840532660484314, + "learning_rate": 1.5229652059192515e-05, + "loss": 0.3703, + "step": 7183 + }, + { + "epoch": 0.676762205317821, + "grad_norm": 0.7276999950408936, + "learning_rate": 1.5228364929563004e-05, + "loss": 0.2923, + "step": 7184 + }, + { + "epoch": 0.6768564094109889, + "grad_norm": 0.9476488828659058, + "learning_rate": 1.522707768071869e-05, + "loss": 0.308, + "step": 7185 + }, + { + "epoch": 0.6769506135041568, + "grad_norm": 0.7192996740341187, + "learning_rate": 1.522579031268892e-05, + "loss": 0.3442, + "step": 7186 + }, + { + "epoch": 0.6770448175973246, + "grad_norm": 0.6776466369628906, + "learning_rate": 1.5224502825503045e-05, + "loss": 0.305, + "step": 7187 + }, + { + "epoch": 0.6771390216904924, + "grad_norm": 0.6640827655792236, + "learning_rate": 1.522321521919043e-05, + "loss": 0.2844, + "step": 7188 + }, + { + "epoch": 0.6772332257836603, + "grad_norm": 0.7528949975967407, + "learning_rate": 1.5221927493780428e-05, + "loss": 0.335, + "step": 7189 + }, + { + "epoch": 0.6773274298768281, + "grad_norm": 0.6359009146690369, + "learning_rate": 1.5220639649302404e-05, + "loss": 0.3238, + "step": 7190 + }, + { + "epoch": 0.677421633969996, + "grad_norm": 0.6911217570304871, + "learning_rate": 1.5219351685785724e-05, + "loss": 0.269, + "step": 7191 + }, + { + "epoch": 0.6775158380631638, + "grad_norm": 0.7504013180732727, + "learning_rate": 1.5218063603259751e-05, + "loss": 0.353, + "step": 7192 + }, + { + "epoch": 0.6776100421563317, + "grad_norm": 0.7562568187713623, + "learning_rate": 1.5216775401753859e-05, + "loss": 0.3469, + "step": 7193 + }, + { + "epoch": 0.6777042462494995, + "grad_norm": 0.7544463872909546, + "learning_rate": 1.5215487081297422e-05, + "loss": 0.3095, + "step": 7194 + }, + { + "epoch": 0.6777984503426674, + "grad_norm": 0.7292487621307373, + "learning_rate": 1.5214198641919811e-05, + "loss": 0.335, + "step": 7195 + }, + { + "epoch": 0.6778926544358352, + "grad_norm": 0.7718842625617981, + "learning_rate": 1.5212910083650404e-05, + "loss": 0.3387, + "step": 7196 + }, + { + "epoch": 0.6779868585290031, + "grad_norm": 0.6888427734375, + "learning_rate": 1.521162140651859e-05, + "loss": 0.3307, + "step": 7197 + }, + { + "epoch": 0.6780810626221709, + "grad_norm": 0.7067251205444336, + "learning_rate": 1.5210332610553749e-05, + "loss": 0.3215, + "step": 7198 + }, + { + "epoch": 0.6781752667153388, + "grad_norm": 0.6551802158355713, + "learning_rate": 1.5209043695785264e-05, + "loss": 0.2828, + "step": 7199 + }, + { + "epoch": 0.6782694708085066, + "grad_norm": 0.7964441776275635, + "learning_rate": 1.5207754662242528e-05, + "loss": 0.3374, + "step": 7200 + }, + { + "epoch": 0.6783636749016745, + "grad_norm": 0.7138524055480957, + "learning_rate": 1.520646550995493e-05, + "loss": 0.3223, + "step": 7201 + }, + { + "epoch": 0.6784578789948423, + "grad_norm": 0.7613831162452698, + "learning_rate": 1.520517623895187e-05, + "loss": 0.3431, + "step": 7202 + }, + { + "epoch": 0.6785520830880102, + "grad_norm": 0.7344279289245605, + "learning_rate": 1.5203886849262736e-05, + "loss": 0.3366, + "step": 7203 + }, + { + "epoch": 0.678646287181178, + "grad_norm": 0.7466939687728882, + "learning_rate": 1.5202597340916935e-05, + "loss": 0.3003, + "step": 7204 + }, + { + "epoch": 0.6787404912743459, + "grad_norm": 0.8112344145774841, + "learning_rate": 1.5201307713943875e-05, + "loss": 0.3652, + "step": 7205 + }, + { + "epoch": 0.6788346953675137, + "grad_norm": 0.8347423076629639, + "learning_rate": 1.5200017968372951e-05, + "loss": 0.3301, + "step": 7206 + }, + { + "epoch": 0.6789288994606816, + "grad_norm": 0.6893008351325989, + "learning_rate": 1.5198728104233573e-05, + "loss": 0.3412, + "step": 7207 + }, + { + "epoch": 0.6790231035538494, + "grad_norm": 0.8349971175193787, + "learning_rate": 1.5197438121555159e-05, + "loss": 0.3159, + "step": 7208 + }, + { + "epoch": 0.6791173076470173, + "grad_norm": 0.67628413438797, + "learning_rate": 1.5196148020367112e-05, + "loss": 0.334, + "step": 7209 + }, + { + "epoch": 0.6792115117401851, + "grad_norm": 0.7063706517219543, + "learning_rate": 1.519485780069886e-05, + "loss": 0.3581, + "step": 7210 + }, + { + "epoch": 0.679305715833353, + "grad_norm": 0.861177384853363, + "learning_rate": 1.5193567462579815e-05, + "loss": 0.3792, + "step": 7211 + }, + { + "epoch": 0.6793999199265208, + "grad_norm": 0.6842749714851379, + "learning_rate": 1.5192277006039395e-05, + "loss": 0.2947, + "step": 7212 + }, + { + "epoch": 0.6794941240196887, + "grad_norm": 0.717474639415741, + "learning_rate": 1.5190986431107033e-05, + "loss": 0.3059, + "step": 7213 + }, + { + "epoch": 0.6795883281128565, + "grad_norm": 1.0629812479019165, + "learning_rate": 1.5189695737812153e-05, + "loss": 0.3097, + "step": 7214 + }, + { + "epoch": 0.6796825322060244, + "grad_norm": 0.850768506526947, + "learning_rate": 1.5188404926184184e-05, + "loss": 0.3584, + "step": 7215 + }, + { + "epoch": 0.6797767362991922, + "grad_norm": 0.7694965600967407, + "learning_rate": 1.5187113996252556e-05, + "loss": 0.3311, + "step": 7216 + }, + { + "epoch": 0.6798709403923601, + "grad_norm": 0.7983599901199341, + "learning_rate": 1.518582294804671e-05, + "loss": 0.3123, + "step": 7217 + }, + { + "epoch": 0.6799651444855279, + "grad_norm": 0.7268584966659546, + "learning_rate": 1.5184531781596079e-05, + "loss": 0.318, + "step": 7218 + }, + { + "epoch": 0.6800593485786958, + "grad_norm": 0.868614137172699, + "learning_rate": 1.5183240496930105e-05, + "loss": 0.3533, + "step": 7219 + }, + { + "epoch": 0.6801535526718636, + "grad_norm": 0.7720600366592407, + "learning_rate": 1.5181949094078235e-05, + "loss": 0.307, + "step": 7220 + }, + { + "epoch": 0.6802477567650315, + "grad_norm": 0.7273695468902588, + "learning_rate": 1.5180657573069906e-05, + "loss": 0.3076, + "step": 7221 + }, + { + "epoch": 0.6803419608581993, + "grad_norm": 0.6686651110649109, + "learning_rate": 1.5179365933934577e-05, + "loss": 0.2642, + "step": 7222 + }, + { + "epoch": 0.6804361649513672, + "grad_norm": 0.7749326229095459, + "learning_rate": 1.5178074176701691e-05, + "loss": 0.3221, + "step": 7223 + }, + { + "epoch": 0.680530369044535, + "grad_norm": 0.7839756608009338, + "learning_rate": 1.5176782301400707e-05, + "loss": 0.2697, + "step": 7224 + }, + { + "epoch": 0.6806245731377029, + "grad_norm": 0.7855985760688782, + "learning_rate": 1.5175490308061085e-05, + "loss": 0.3313, + "step": 7225 + }, + { + "epoch": 0.6807187772308707, + "grad_norm": 0.692952036857605, + "learning_rate": 1.5174198196712273e-05, + "loss": 0.3358, + "step": 7226 + }, + { + "epoch": 0.6808129813240386, + "grad_norm": 0.7354058027267456, + "learning_rate": 1.5172905967383744e-05, + "loss": 0.342, + "step": 7227 + }, + { + "epoch": 0.6809071854172064, + "grad_norm": 0.6863634586334229, + "learning_rate": 1.5171613620104961e-05, + "loss": 0.3144, + "step": 7228 + }, + { + "epoch": 0.6810013895103743, + "grad_norm": 0.7194997668266296, + "learning_rate": 1.5170321154905386e-05, + "loss": 0.294, + "step": 7229 + }, + { + "epoch": 0.6810955936035421, + "grad_norm": 0.874332070350647, + "learning_rate": 1.516902857181449e-05, + "loss": 0.4257, + "step": 7230 + }, + { + "epoch": 0.68118979769671, + "grad_norm": 0.748012363910675, + "learning_rate": 1.5167735870861756e-05, + "loss": 0.3238, + "step": 7231 + }, + { + "epoch": 0.6812840017898778, + "grad_norm": 0.7189104557037354, + "learning_rate": 1.5166443052076645e-05, + "loss": 0.3173, + "step": 7232 + }, + { + "epoch": 0.6813782058830457, + "grad_norm": 0.6643146276473999, + "learning_rate": 1.5165150115488644e-05, + "loss": 0.2946, + "step": 7233 + }, + { + "epoch": 0.6814724099762135, + "grad_norm": 0.7028785943984985, + "learning_rate": 1.5163857061127233e-05, + "loss": 0.3132, + "step": 7234 + }, + { + "epoch": 0.6815666140693813, + "grad_norm": 1.397181749343872, + "learning_rate": 1.5162563889021897e-05, + "loss": 0.3317, + "step": 7235 + }, + { + "epoch": 0.6816608181625492, + "grad_norm": 0.6844511032104492, + "learning_rate": 1.516127059920212e-05, + "loss": 0.3198, + "step": 7236 + }, + { + "epoch": 0.681755022255717, + "grad_norm": 0.7008311152458191, + "learning_rate": 1.5159977191697386e-05, + "loss": 0.2861, + "step": 7237 + }, + { + "epoch": 0.6818492263488849, + "grad_norm": 0.7851535081863403, + "learning_rate": 1.5158683666537197e-05, + "loss": 0.3309, + "step": 7238 + }, + { + "epoch": 0.6819434304420527, + "grad_norm": 0.7332495450973511, + "learning_rate": 1.5157390023751042e-05, + "loss": 0.3277, + "step": 7239 + }, + { + "epoch": 0.6820376345352206, + "grad_norm": 0.767930805683136, + "learning_rate": 1.5156096263368418e-05, + "loss": 0.3209, + "step": 7240 + }, + { + "epoch": 0.6821318386283884, + "grad_norm": 0.9117291569709778, + "learning_rate": 1.5154802385418825e-05, + "loss": 0.3244, + "step": 7241 + }, + { + "epoch": 0.6822260427215563, + "grad_norm": 0.7538485527038574, + "learning_rate": 1.5153508389931766e-05, + "loss": 0.3462, + "step": 7242 + }, + { + "epoch": 0.6823202468147241, + "grad_norm": 0.7208590507507324, + "learning_rate": 1.5152214276936743e-05, + "loss": 0.3632, + "step": 7243 + }, + { + "epoch": 0.682414450907892, + "grad_norm": 0.7393494844436646, + "learning_rate": 1.5150920046463266e-05, + "loss": 0.3261, + "step": 7244 + }, + { + "epoch": 0.6825086550010598, + "grad_norm": 1.5232760906219482, + "learning_rate": 1.5149625698540853e-05, + "loss": 0.3299, + "step": 7245 + }, + { + "epoch": 0.6826028590942277, + "grad_norm": 0.6693358421325684, + "learning_rate": 1.5148331233199002e-05, + "loss": 0.3022, + "step": 7246 + }, + { + "epoch": 0.6826970631873955, + "grad_norm": 0.8321530818939209, + "learning_rate": 1.514703665046724e-05, + "loss": 0.3431, + "step": 7247 + }, + { + "epoch": 0.6827912672805634, + "grad_norm": 0.7812127470970154, + "learning_rate": 1.5145741950375085e-05, + "loss": 0.3076, + "step": 7248 + }, + { + "epoch": 0.6828854713737312, + "grad_norm": 0.750748872756958, + "learning_rate": 1.5144447132952052e-05, + "loss": 0.3458, + "step": 7249 + }, + { + "epoch": 0.6829796754668991, + "grad_norm": 0.7384516596794128, + "learning_rate": 1.5143152198227668e-05, + "loss": 0.3273, + "step": 7250 + }, + { + "epoch": 0.6830738795600669, + "grad_norm": 0.6384652853012085, + "learning_rate": 1.5141857146231462e-05, + "loss": 0.3068, + "step": 7251 + }, + { + "epoch": 0.6831680836532348, + "grad_norm": 1.020987629890442, + "learning_rate": 1.5140561976992959e-05, + "loss": 0.3177, + "step": 7252 + }, + { + "epoch": 0.6832622877464026, + "grad_norm": 0.778069019317627, + "learning_rate": 1.5139266690541695e-05, + "loss": 0.3769, + "step": 7253 + }, + { + "epoch": 0.6833564918395705, + "grad_norm": 0.7732622623443604, + "learning_rate": 1.5137971286907202e-05, + "loss": 0.2927, + "step": 7254 + }, + { + "epoch": 0.6834506959327383, + "grad_norm": 0.7108065485954285, + "learning_rate": 1.5136675766119017e-05, + "loss": 0.3024, + "step": 7255 + }, + { + "epoch": 0.6835449000259062, + "grad_norm": 0.745226263999939, + "learning_rate": 1.5135380128206683e-05, + "loss": 0.291, + "step": 7256 + }, + { + "epoch": 0.683639104119074, + "grad_norm": 0.9556676149368286, + "learning_rate": 1.5134084373199733e-05, + "loss": 0.2916, + "step": 7257 + }, + { + "epoch": 0.6837333082122419, + "grad_norm": 0.6682932376861572, + "learning_rate": 1.5132788501127727e-05, + "loss": 0.2971, + "step": 7258 + }, + { + "epoch": 0.6838275123054097, + "grad_norm": 0.7709806561470032, + "learning_rate": 1.51314925120202e-05, + "loss": 0.3476, + "step": 7259 + }, + { + "epoch": 0.6839217163985776, + "grad_norm": 0.9079115390777588, + "learning_rate": 1.5130196405906711e-05, + "loss": 0.3558, + "step": 7260 + }, + { + "epoch": 0.6840159204917454, + "grad_norm": 0.685491144657135, + "learning_rate": 1.5128900182816808e-05, + "loss": 0.283, + "step": 7261 + }, + { + "epoch": 0.6841101245849132, + "grad_norm": 0.6666446328163147, + "learning_rate": 1.512760384278005e-05, + "loss": 0.3051, + "step": 7262 + }, + { + "epoch": 0.684204328678081, + "grad_norm": 0.6125575304031372, + "learning_rate": 1.5126307385825993e-05, + "loss": 0.2523, + "step": 7263 + }, + { + "epoch": 0.6842985327712489, + "grad_norm": 0.7506864070892334, + "learning_rate": 1.51250108119842e-05, + "loss": 0.2981, + "step": 7264 + }, + { + "epoch": 0.6843927368644167, + "grad_norm": 0.7479058504104614, + "learning_rate": 1.512371412128424e-05, + "loss": 0.3377, + "step": 7265 + }, + { + "epoch": 0.6844869409575846, + "grad_norm": 0.6979084014892578, + "learning_rate": 1.5122417313755667e-05, + "loss": 0.2938, + "step": 7266 + }, + { + "epoch": 0.6845811450507524, + "grad_norm": 0.7295845746994019, + "learning_rate": 1.5121120389428058e-05, + "loss": 0.2793, + "step": 7267 + }, + { + "epoch": 0.6846753491439203, + "grad_norm": 0.7008985280990601, + "learning_rate": 1.5119823348330988e-05, + "loss": 0.3245, + "step": 7268 + }, + { + "epoch": 0.6847695532370881, + "grad_norm": 0.7741862535476685, + "learning_rate": 1.5118526190494022e-05, + "loss": 0.357, + "step": 7269 + }, + { + "epoch": 0.684863757330256, + "grad_norm": 0.696915864944458, + "learning_rate": 1.511722891594675e-05, + "loss": 0.3251, + "step": 7270 + }, + { + "epoch": 0.6849579614234238, + "grad_norm": 0.8568238019943237, + "learning_rate": 1.5115931524718739e-05, + "loss": 0.3236, + "step": 7271 + }, + { + "epoch": 0.6850521655165916, + "grad_norm": 0.7071682810783386, + "learning_rate": 1.5114634016839578e-05, + "loss": 0.3299, + "step": 7272 + }, + { + "epoch": 0.6851463696097595, + "grad_norm": 0.7535279393196106, + "learning_rate": 1.5113336392338853e-05, + "loss": 0.3423, + "step": 7273 + }, + { + "epoch": 0.6852405737029273, + "grad_norm": 0.7997384071350098, + "learning_rate": 1.511203865124615e-05, + "loss": 0.3216, + "step": 7274 + }, + { + "epoch": 0.6853347777960952, + "grad_norm": 0.8931795358657837, + "learning_rate": 1.511074079359106e-05, + "loss": 0.3887, + "step": 7275 + }, + { + "epoch": 0.685428981889263, + "grad_norm": 0.7333723306655884, + "learning_rate": 1.5109442819403178e-05, + "loss": 0.3081, + "step": 7276 + }, + { + "epoch": 0.6855231859824309, + "grad_norm": 0.6808123588562012, + "learning_rate": 1.5108144728712093e-05, + "loss": 0.2971, + "step": 7277 + }, + { + "epoch": 0.6856173900755987, + "grad_norm": 0.6848199367523193, + "learning_rate": 1.5106846521547413e-05, + "loss": 0.3307, + "step": 7278 + }, + { + "epoch": 0.6857115941687666, + "grad_norm": 0.7820274233818054, + "learning_rate": 1.5105548197938733e-05, + "loss": 0.3739, + "step": 7279 + }, + { + "epoch": 0.6858057982619344, + "grad_norm": 0.7594008445739746, + "learning_rate": 1.5104249757915658e-05, + "loss": 0.3421, + "step": 7280 + }, + { + "epoch": 0.6859000023551023, + "grad_norm": 0.6797794103622437, + "learning_rate": 1.5102951201507792e-05, + "loss": 0.2652, + "step": 7281 + }, + { + "epoch": 0.6859942064482701, + "grad_norm": 0.8875043988227844, + "learning_rate": 1.510165252874475e-05, + "loss": 0.3433, + "step": 7282 + }, + { + "epoch": 0.686088410541438, + "grad_norm": 0.7918300032615662, + "learning_rate": 1.5100353739656139e-05, + "loss": 0.3706, + "step": 7283 + }, + { + "epoch": 0.6861826146346058, + "grad_norm": 0.7903196215629578, + "learning_rate": 1.5099054834271574e-05, + "loss": 0.3153, + "step": 7284 + }, + { + "epoch": 0.6862768187277737, + "grad_norm": 0.7844611406326294, + "learning_rate": 1.5097755812620679e-05, + "loss": 0.3556, + "step": 7285 + }, + { + "epoch": 0.6863710228209415, + "grad_norm": 0.7814464569091797, + "learning_rate": 1.5096456674733059e-05, + "loss": 0.3168, + "step": 7286 + }, + { + "epoch": 0.6864652269141094, + "grad_norm": 0.7347399592399597, + "learning_rate": 1.5095157420638349e-05, + "loss": 0.3406, + "step": 7287 + }, + { + "epoch": 0.6865594310072772, + "grad_norm": 0.7254319190979004, + "learning_rate": 1.509385805036617e-05, + "loss": 0.328, + "step": 7288 + }, + { + "epoch": 0.6866536351004451, + "grad_norm": 0.7397040128707886, + "learning_rate": 1.509255856394615e-05, + "loss": 0.2663, + "step": 7289 + }, + { + "epoch": 0.6867478391936129, + "grad_norm": 0.7593182325363159, + "learning_rate": 1.509125896140792e-05, + "loss": 0.3113, + "step": 7290 + }, + { + "epoch": 0.6868420432867808, + "grad_norm": 0.7546846270561218, + "learning_rate": 1.5089959242781109e-05, + "loss": 0.3033, + "step": 7291 + }, + { + "epoch": 0.6869362473799486, + "grad_norm": 0.6999562978744507, + "learning_rate": 1.5088659408095356e-05, + "loss": 0.3063, + "step": 7292 + }, + { + "epoch": 0.6870304514731165, + "grad_norm": 0.6722689867019653, + "learning_rate": 1.5087359457380299e-05, + "loss": 0.2836, + "step": 7293 + }, + { + "epoch": 0.6871246555662843, + "grad_norm": 0.7416934370994568, + "learning_rate": 1.5086059390665582e-05, + "loss": 0.3114, + "step": 7294 + }, + { + "epoch": 0.6872188596594522, + "grad_norm": 0.7684811949729919, + "learning_rate": 1.5084759207980844e-05, + "loss": 0.3342, + "step": 7295 + }, + { + "epoch": 0.68731306375262, + "grad_norm": 0.9004353880882263, + "learning_rate": 1.5083458909355728e-05, + "loss": 0.3416, + "step": 7296 + }, + { + "epoch": 0.6874072678457879, + "grad_norm": 0.7631049752235413, + "learning_rate": 1.508215849481989e-05, + "loss": 0.3257, + "step": 7297 + }, + { + "epoch": 0.6875014719389557, + "grad_norm": 0.8043209910392761, + "learning_rate": 1.5080857964402978e-05, + "loss": 0.3137, + "step": 7298 + }, + { + "epoch": 0.6875956760321236, + "grad_norm": 0.8755385279655457, + "learning_rate": 1.507955731813465e-05, + "loss": 0.3667, + "step": 7299 + }, + { + "epoch": 0.6876898801252914, + "grad_norm": 0.6808297038078308, + "learning_rate": 1.5078256556044557e-05, + "loss": 0.301, + "step": 7300 + }, + { + "epoch": 0.6877840842184593, + "grad_norm": 0.839055061340332, + "learning_rate": 1.5076955678162359e-05, + "loss": 0.3008, + "step": 7301 + }, + { + "epoch": 0.6878782883116271, + "grad_norm": 0.7584748864173889, + "learning_rate": 1.5075654684517722e-05, + "loss": 0.304, + "step": 7302 + }, + { + "epoch": 0.687972492404795, + "grad_norm": 0.7207547426223755, + "learning_rate": 1.5074353575140307e-05, + "loss": 0.307, + "step": 7303 + }, + { + "epoch": 0.6880666964979628, + "grad_norm": 0.8173823356628418, + "learning_rate": 1.5073052350059783e-05, + "loss": 0.3126, + "step": 7304 + }, + { + "epoch": 0.6881609005911307, + "grad_norm": 1.3754568099975586, + "learning_rate": 1.5071751009305824e-05, + "loss": 0.3187, + "step": 7305 + }, + { + "epoch": 0.6882551046842985, + "grad_norm": 0.9424414038658142, + "learning_rate": 1.5070449552908093e-05, + "loss": 0.3687, + "step": 7306 + }, + { + "epoch": 0.6883493087774664, + "grad_norm": 0.8139634728431702, + "learning_rate": 1.5069147980896272e-05, + "loss": 0.3351, + "step": 7307 + }, + { + "epoch": 0.6884435128706342, + "grad_norm": 0.6309536695480347, + "learning_rate": 1.506784629330004e-05, + "loss": 0.299, + "step": 7308 + }, + { + "epoch": 0.6885377169638021, + "grad_norm": 0.7700256705284119, + "learning_rate": 1.5066544490149068e-05, + "loss": 0.3735, + "step": 7309 + }, + { + "epoch": 0.6886319210569699, + "grad_norm": 0.7104466557502747, + "learning_rate": 1.5065242571473054e-05, + "loss": 0.3123, + "step": 7310 + }, + { + "epoch": 0.6887261251501378, + "grad_norm": 0.6783331036567688, + "learning_rate": 1.5063940537301673e-05, + "loss": 0.3288, + "step": 7311 + }, + { + "epoch": 0.6888203292433056, + "grad_norm": 0.6386809945106506, + "learning_rate": 1.5062638387664614e-05, + "loss": 0.2988, + "step": 7312 + }, + { + "epoch": 0.6889145333364735, + "grad_norm": 0.6671721935272217, + "learning_rate": 1.5061336122591574e-05, + "loss": 0.3652, + "step": 7313 + }, + { + "epoch": 0.6890087374296413, + "grad_norm": 0.9433803558349609, + "learning_rate": 1.5060033742112242e-05, + "loss": 0.3975, + "step": 7314 + }, + { + "epoch": 0.6891029415228092, + "grad_norm": 0.6702458262443542, + "learning_rate": 1.5058731246256315e-05, + "loss": 0.2997, + "step": 7315 + }, + { + "epoch": 0.689197145615977, + "grad_norm": 0.6700741052627563, + "learning_rate": 1.505742863505349e-05, + "loss": 0.2819, + "step": 7316 + }, + { + "epoch": 0.6892913497091449, + "grad_norm": 0.7831470370292664, + "learning_rate": 1.5056125908533472e-05, + "loss": 0.2924, + "step": 7317 + }, + { + "epoch": 0.6893855538023127, + "grad_norm": 0.7220161557197571, + "learning_rate": 1.5054823066725966e-05, + "loss": 0.3258, + "step": 7318 + }, + { + "epoch": 0.6894797578954805, + "grad_norm": 0.6773331761360168, + "learning_rate": 1.5053520109660674e-05, + "loss": 0.3416, + "step": 7319 + }, + { + "epoch": 0.6895739619886484, + "grad_norm": 0.7731783390045166, + "learning_rate": 1.5052217037367309e-05, + "loss": 0.3591, + "step": 7320 + }, + { + "epoch": 0.6896681660818162, + "grad_norm": 0.7107616662979126, + "learning_rate": 1.5050913849875582e-05, + "loss": 0.3189, + "step": 7321 + }, + { + "epoch": 0.6897623701749841, + "grad_norm": 0.9149519801139832, + "learning_rate": 1.5049610547215205e-05, + "loss": 0.3726, + "step": 7322 + }, + { + "epoch": 0.689856574268152, + "grad_norm": 0.7625017762184143, + "learning_rate": 1.5048307129415903e-05, + "loss": 0.3269, + "step": 7323 + }, + { + "epoch": 0.6899507783613198, + "grad_norm": 0.7767654061317444, + "learning_rate": 1.5047003596507388e-05, + "loss": 0.3448, + "step": 7324 + }, + { + "epoch": 0.6900449824544876, + "grad_norm": 0.6901801824569702, + "learning_rate": 1.5045699948519388e-05, + "loss": 0.3182, + "step": 7325 + }, + { + "epoch": 0.6901391865476555, + "grad_norm": 0.7489223480224609, + "learning_rate": 1.5044396185481622e-05, + "loss": 0.3471, + "step": 7326 + }, + { + "epoch": 0.6902333906408233, + "grad_norm": 0.7715135812759399, + "learning_rate": 1.5043092307423825e-05, + "loss": 0.3547, + "step": 7327 + }, + { + "epoch": 0.6903275947339912, + "grad_norm": 0.6974141001701355, + "learning_rate": 1.5041788314375722e-05, + "loss": 0.3095, + "step": 7328 + }, + { + "epoch": 0.690421798827159, + "grad_norm": 0.7149832248687744, + "learning_rate": 1.5040484206367049e-05, + "loss": 0.3247, + "step": 7329 + }, + { + "epoch": 0.6905160029203269, + "grad_norm": 0.8140485286712646, + "learning_rate": 1.5039179983427543e-05, + "loss": 0.3222, + "step": 7330 + }, + { + "epoch": 0.6906102070134947, + "grad_norm": 0.8052679896354675, + "learning_rate": 1.5037875645586937e-05, + "loss": 0.3354, + "step": 7331 + }, + { + "epoch": 0.6907044111066626, + "grad_norm": 0.9338856935501099, + "learning_rate": 1.5036571192874977e-05, + "loss": 0.3547, + "step": 7332 + }, + { + "epoch": 0.6907986151998304, + "grad_norm": 0.7552533149719238, + "learning_rate": 1.5035266625321403e-05, + "loss": 0.3038, + "step": 7333 + }, + { + "epoch": 0.6908928192929983, + "grad_norm": 0.9233847856521606, + "learning_rate": 1.5033961942955966e-05, + "loss": 0.3901, + "step": 7334 + }, + { + "epoch": 0.6909870233861661, + "grad_norm": 0.7393520474433899, + "learning_rate": 1.5032657145808409e-05, + "loss": 0.2992, + "step": 7335 + }, + { + "epoch": 0.691081227479334, + "grad_norm": 0.68341463804245, + "learning_rate": 1.5031352233908486e-05, + "loss": 0.3029, + "step": 7336 + }, + { + "epoch": 0.6911754315725018, + "grad_norm": 0.7289931178092957, + "learning_rate": 1.5030047207285949e-05, + "loss": 0.2857, + "step": 7337 + }, + { + "epoch": 0.6912696356656697, + "grad_norm": 0.6364055275917053, + "learning_rate": 1.502874206597056e-05, + "loss": 0.2769, + "step": 7338 + }, + { + "epoch": 0.6913638397588375, + "grad_norm": 0.7573159337043762, + "learning_rate": 1.5027436809992076e-05, + "loss": 0.3168, + "step": 7339 + }, + { + "epoch": 0.6914580438520054, + "grad_norm": 0.7118650078773499, + "learning_rate": 1.5026131439380254e-05, + "loss": 0.3248, + "step": 7340 + }, + { + "epoch": 0.6915522479451732, + "grad_norm": 0.7071848511695862, + "learning_rate": 1.5024825954164862e-05, + "loss": 0.3323, + "step": 7341 + }, + { + "epoch": 0.6916464520383411, + "grad_norm": 0.8578891754150391, + "learning_rate": 1.5023520354375669e-05, + "loss": 0.292, + "step": 7342 + }, + { + "epoch": 0.6917406561315089, + "grad_norm": 0.6838341951370239, + "learning_rate": 1.5022214640042444e-05, + "loss": 0.2881, + "step": 7343 + }, + { + "epoch": 0.6918348602246768, + "grad_norm": 0.6806995868682861, + "learning_rate": 1.5020908811194952e-05, + "loss": 0.3118, + "step": 7344 + }, + { + "epoch": 0.6919290643178446, + "grad_norm": 0.7868961095809937, + "learning_rate": 1.5019602867862983e-05, + "loss": 0.2921, + "step": 7345 + }, + { + "epoch": 0.6920232684110125, + "grad_norm": 0.8781580924987793, + "learning_rate": 1.5018296810076299e-05, + "loss": 0.3064, + "step": 7346 + }, + { + "epoch": 0.6921174725041803, + "grad_norm": 0.7523723840713501, + "learning_rate": 1.501699063786469e-05, + "loss": 0.3128, + "step": 7347 + }, + { + "epoch": 0.6922116765973482, + "grad_norm": 0.9119831919670105, + "learning_rate": 1.5015684351257935e-05, + "loss": 0.2864, + "step": 7348 + }, + { + "epoch": 0.692305880690516, + "grad_norm": 0.6982825994491577, + "learning_rate": 1.5014377950285818e-05, + "loss": 0.3008, + "step": 7349 + }, + { + "epoch": 0.6924000847836839, + "grad_norm": 0.6579086780548096, + "learning_rate": 1.501307143497813e-05, + "loss": 0.2484, + "step": 7350 + }, + { + "epoch": 0.6924942888768517, + "grad_norm": 0.8615376353263855, + "learning_rate": 1.5011764805364663e-05, + "loss": 0.3572, + "step": 7351 + }, + { + "epoch": 0.6925884929700196, + "grad_norm": 0.7610296010971069, + "learning_rate": 1.5010458061475203e-05, + "loss": 0.33, + "step": 7352 + }, + { + "epoch": 0.6926826970631874, + "grad_norm": 0.6841700673103333, + "learning_rate": 1.5009151203339556e-05, + "loss": 0.3228, + "step": 7353 + }, + { + "epoch": 0.6927769011563553, + "grad_norm": 0.7239129543304443, + "learning_rate": 1.500784423098751e-05, + "loss": 0.3231, + "step": 7354 + }, + { + "epoch": 0.6928711052495231, + "grad_norm": 0.7602190971374512, + "learning_rate": 1.5006537144448871e-05, + "loss": 0.3059, + "step": 7355 + }, + { + "epoch": 0.692965309342691, + "grad_norm": 1.425072193145752, + "learning_rate": 1.5005229943753445e-05, + "loss": 0.3425, + "step": 7356 + }, + { + "epoch": 0.6930595134358588, + "grad_norm": 0.6205198168754578, + "learning_rate": 1.500392262893103e-05, + "loss": 0.2982, + "step": 7357 + }, + { + "epoch": 0.6931537175290267, + "grad_norm": 0.7715321779251099, + "learning_rate": 1.5002615200011447e-05, + "loss": 0.3352, + "step": 7358 + }, + { + "epoch": 0.6932479216221945, + "grad_norm": 0.7210916876792908, + "learning_rate": 1.50013076570245e-05, + "loss": 0.3328, + "step": 7359 + }, + { + "epoch": 0.6933421257153624, + "grad_norm": 0.652855634689331, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.3075, + "step": 7360 + }, + { + "epoch": 0.6934363298085302, + "grad_norm": 0.7345904111862183, + "learning_rate": 1.499869222896777e-05, + "loss": 0.3021, + "step": 7361 + }, + { + "epoch": 0.693530533901698, + "grad_norm": 0.7462295293807983, + "learning_rate": 1.4997384343957626e-05, + "loss": 0.3683, + "step": 7362 + }, + { + "epoch": 0.6936247379948659, + "grad_norm": 0.7322737574577332, + "learning_rate": 1.499607634499939e-05, + "loss": 0.3329, + "step": 7363 + }, + { + "epoch": 0.6937189420880338, + "grad_norm": 0.7705998420715332, + "learning_rate": 1.4994768232122887e-05, + "loss": 0.3383, + "step": 7364 + }, + { + "epoch": 0.6938131461812016, + "grad_norm": 0.7012845873832703, + "learning_rate": 1.4993460005357948e-05, + "loss": 0.322, + "step": 7365 + }, + { + "epoch": 0.6939073502743694, + "grad_norm": 0.6502741575241089, + "learning_rate": 1.499215166473439e-05, + "loss": 0.2975, + "step": 7366 + }, + { + "epoch": 0.6940015543675373, + "grad_norm": 0.6985697150230408, + "learning_rate": 1.4990843210282058e-05, + "loss": 0.3163, + "step": 7367 + }, + { + "epoch": 0.6940957584607051, + "grad_norm": 0.7667263150215149, + "learning_rate": 1.4989534642030783e-05, + "loss": 0.3301, + "step": 7368 + }, + { + "epoch": 0.694189962553873, + "grad_norm": 0.7916030287742615, + "learning_rate": 1.4988225960010398e-05, + "loss": 0.3283, + "step": 7369 + }, + { + "epoch": 0.6942841666470408, + "grad_norm": 0.681128740310669, + "learning_rate": 1.4986917164250751e-05, + "loss": 0.3103, + "step": 7370 + }, + { + "epoch": 0.6943783707402087, + "grad_norm": 0.7746285200119019, + "learning_rate": 1.498560825478168e-05, + "loss": 0.3573, + "step": 7371 + }, + { + "epoch": 0.6944725748333765, + "grad_norm": 0.8283178210258484, + "learning_rate": 1.4984299231633027e-05, + "loss": 0.2894, + "step": 7372 + }, + { + "epoch": 0.6945667789265444, + "grad_norm": 0.7182471752166748, + "learning_rate": 1.4982990094834647e-05, + "loss": 0.2693, + "step": 7373 + }, + { + "epoch": 0.6946609830197122, + "grad_norm": 1.1338497400283813, + "learning_rate": 1.4981680844416384e-05, + "loss": 0.3941, + "step": 7374 + }, + { + "epoch": 0.6947551871128801, + "grad_norm": 0.7589165568351746, + "learning_rate": 1.4980371480408094e-05, + "loss": 0.3794, + "step": 7375 + }, + { + "epoch": 0.6948493912060479, + "grad_norm": 0.7595975399017334, + "learning_rate": 1.4979062002839634e-05, + "loss": 0.3811, + "step": 7376 + }, + { + "epoch": 0.6949435952992158, + "grad_norm": 0.7198370695114136, + "learning_rate": 1.4977752411740855e-05, + "loss": 0.3589, + "step": 7377 + }, + { + "epoch": 0.6950377993923836, + "grad_norm": 0.6901041865348816, + "learning_rate": 1.4976442707141625e-05, + "loss": 0.2827, + "step": 7378 + }, + { + "epoch": 0.6951320034855515, + "grad_norm": 0.7376834154129028, + "learning_rate": 1.4975132889071808e-05, + "loss": 0.3498, + "step": 7379 + }, + { + "epoch": 0.6952262075787193, + "grad_norm": 0.8776727914810181, + "learning_rate": 1.4973822957561266e-05, + "loss": 0.3504, + "step": 7380 + }, + { + "epoch": 0.6953204116718872, + "grad_norm": 0.7100030183792114, + "learning_rate": 1.4972512912639864e-05, + "loss": 0.3353, + "step": 7381 + }, + { + "epoch": 0.695414615765055, + "grad_norm": 0.8575757145881653, + "learning_rate": 1.4971202754337482e-05, + "loss": 0.3404, + "step": 7382 + }, + { + "epoch": 0.6955088198582229, + "grad_norm": 0.7626842260360718, + "learning_rate": 1.496989248268399e-05, + "loss": 0.3046, + "step": 7383 + }, + { + "epoch": 0.6956030239513907, + "grad_norm": 0.7086209058761597, + "learning_rate": 1.4968582097709259e-05, + "loss": 0.3093, + "step": 7384 + }, + { + "epoch": 0.6956972280445586, + "grad_norm": 0.6811178922653198, + "learning_rate": 1.4967271599443176e-05, + "loss": 0.2899, + "step": 7385 + }, + { + "epoch": 0.6957914321377264, + "grad_norm": 1.1253029108047485, + "learning_rate": 1.4965960987915615e-05, + "loss": 0.3812, + "step": 7386 + }, + { + "epoch": 0.6958856362308943, + "grad_norm": 0.762640655040741, + "learning_rate": 1.4964650263156466e-05, + "loss": 0.3376, + "step": 7387 + }, + { + "epoch": 0.6959798403240621, + "grad_norm": 0.7089758515357971, + "learning_rate": 1.4963339425195611e-05, + "loss": 0.3254, + "step": 7388 + }, + { + "epoch": 0.69607404441723, + "grad_norm": 0.6183210015296936, + "learning_rate": 1.4962028474062941e-05, + "loss": 0.2624, + "step": 7389 + }, + { + "epoch": 0.6961682485103978, + "grad_norm": 0.8221989274024963, + "learning_rate": 1.496071740978835e-05, + "loss": 0.331, + "step": 7390 + }, + { + "epoch": 0.6962624526035657, + "grad_norm": 0.6767950057983398, + "learning_rate": 1.495940623240173e-05, + "loss": 0.3022, + "step": 7391 + }, + { + "epoch": 0.6963566566967335, + "grad_norm": 0.7246094346046448, + "learning_rate": 1.4958094941932977e-05, + "loss": 0.3315, + "step": 7392 + }, + { + "epoch": 0.6964508607899014, + "grad_norm": 0.8228355050086975, + "learning_rate": 1.4956783538411994e-05, + "loss": 0.3194, + "step": 7393 + }, + { + "epoch": 0.6965450648830692, + "grad_norm": 0.7368044853210449, + "learning_rate": 1.4955472021868676e-05, + "loss": 0.3127, + "step": 7394 + }, + { + "epoch": 0.6966392689762371, + "grad_norm": 0.6888823509216309, + "learning_rate": 1.4954160392332938e-05, + "loss": 0.2869, + "step": 7395 + }, + { + "epoch": 0.6967334730694049, + "grad_norm": 0.80766761302948, + "learning_rate": 1.4952848649834676e-05, + "loss": 0.3349, + "step": 7396 + }, + { + "epoch": 0.6968276771625728, + "grad_norm": 0.7352944612503052, + "learning_rate": 1.4951536794403805e-05, + "loss": 0.3244, + "step": 7397 + }, + { + "epoch": 0.6969218812557406, + "grad_norm": 0.7246667742729187, + "learning_rate": 1.495022482607024e-05, + "loss": 0.3178, + "step": 7398 + }, + { + "epoch": 0.6970160853489085, + "grad_norm": 0.601959764957428, + "learning_rate": 1.4948912744863892e-05, + "loss": 0.2314, + "step": 7399 + }, + { + "epoch": 0.6971102894420762, + "grad_norm": 0.6760597825050354, + "learning_rate": 1.4947600550814683e-05, + "loss": 0.3039, + "step": 7400 + }, + { + "epoch": 0.697204493535244, + "grad_norm": 0.7061398029327393, + "learning_rate": 1.4946288243952524e-05, + "loss": 0.3108, + "step": 7401 + }, + { + "epoch": 0.6972986976284119, + "grad_norm": 0.7306122779846191, + "learning_rate": 1.4944975824307348e-05, + "loss": 0.3273, + "step": 7402 + }, + { + "epoch": 0.6973929017215797, + "grad_norm": 0.8416723012924194, + "learning_rate": 1.4943663291909074e-05, + "loss": 0.3187, + "step": 7403 + }, + { + "epoch": 0.6974871058147476, + "grad_norm": 0.6917863488197327, + "learning_rate": 1.494235064678763e-05, + "loss": 0.3189, + "step": 7404 + }, + { + "epoch": 0.6975813099079154, + "grad_norm": 0.8066960573196411, + "learning_rate": 1.4941037888972953e-05, + "loss": 0.3375, + "step": 7405 + }, + { + "epoch": 0.6976755140010833, + "grad_norm": 0.6849872469902039, + "learning_rate": 1.4939725018494966e-05, + "loss": 0.3212, + "step": 7406 + }, + { + "epoch": 0.6977697180942511, + "grad_norm": 0.8231762647628784, + "learning_rate": 1.493841203538361e-05, + "loss": 0.3257, + "step": 7407 + }, + { + "epoch": 0.697863922187419, + "grad_norm": 0.7129079103469849, + "learning_rate": 1.4937098939668823e-05, + "loss": 0.3392, + "step": 7408 + }, + { + "epoch": 0.6979581262805868, + "grad_norm": 0.7407151460647583, + "learning_rate": 1.4935785731380543e-05, + "loss": 0.3138, + "step": 7409 + }, + { + "epoch": 0.6980523303737547, + "grad_norm": 0.7718027830123901, + "learning_rate": 1.4934472410548717e-05, + "loss": 0.2729, + "step": 7410 + }, + { + "epoch": 0.6981465344669225, + "grad_norm": 0.8110967874526978, + "learning_rate": 1.4933158977203288e-05, + "loss": 0.3182, + "step": 7411 + }, + { + "epoch": 0.6982407385600904, + "grad_norm": 0.6419220566749573, + "learning_rate": 1.4931845431374203e-05, + "loss": 0.2866, + "step": 7412 + }, + { + "epoch": 0.6983349426532582, + "grad_norm": 0.7150808572769165, + "learning_rate": 1.493053177309142e-05, + "loss": 0.3134, + "step": 7413 + }, + { + "epoch": 0.6984291467464261, + "grad_norm": 0.7292571067810059, + "learning_rate": 1.4929218002384884e-05, + "loss": 0.333, + "step": 7414 + }, + { + "epoch": 0.6985233508395939, + "grad_norm": 0.8620499968528748, + "learning_rate": 1.4927904119284552e-05, + "loss": 0.3092, + "step": 7415 + }, + { + "epoch": 0.6986175549327618, + "grad_norm": 0.766682505607605, + "learning_rate": 1.492659012382039e-05, + "loss": 0.326, + "step": 7416 + }, + { + "epoch": 0.6987117590259296, + "grad_norm": 0.6765879392623901, + "learning_rate": 1.492527601602235e-05, + "loss": 0.2916, + "step": 7417 + }, + { + "epoch": 0.6988059631190975, + "grad_norm": 0.762783944606781, + "learning_rate": 1.4923961795920402e-05, + "loss": 0.3447, + "step": 7418 + }, + { + "epoch": 0.6989001672122653, + "grad_norm": 0.7239778637886047, + "learning_rate": 1.492264746354451e-05, + "loss": 0.3173, + "step": 7419 + }, + { + "epoch": 0.6989943713054332, + "grad_norm": 0.7437894940376282, + "learning_rate": 1.4921333018924645e-05, + "loss": 0.3443, + "step": 7420 + }, + { + "epoch": 0.699088575398601, + "grad_norm": 1.151016354560852, + "learning_rate": 1.4920018462090771e-05, + "loss": 0.2907, + "step": 7421 + }, + { + "epoch": 0.6991827794917689, + "grad_norm": 0.6117764115333557, + "learning_rate": 1.4918703793072869e-05, + "loss": 0.2701, + "step": 7422 + }, + { + "epoch": 0.6992769835849367, + "grad_norm": 0.7207898497581482, + "learning_rate": 1.4917389011900913e-05, + "loss": 0.3036, + "step": 7423 + }, + { + "epoch": 0.6993711876781046, + "grad_norm": 0.843279242515564, + "learning_rate": 1.4916074118604887e-05, + "loss": 0.3656, + "step": 7424 + }, + { + "epoch": 0.6994653917712724, + "grad_norm": 0.9610438942909241, + "learning_rate": 1.4914759113214765e-05, + "loss": 0.3661, + "step": 7425 + }, + { + "epoch": 0.6995595958644403, + "grad_norm": 0.6298871040344238, + "learning_rate": 1.4913443995760533e-05, + "loss": 0.2975, + "step": 7426 + }, + { + "epoch": 0.6996537999576081, + "grad_norm": 0.7482747435569763, + "learning_rate": 1.491212876627218e-05, + "loss": 0.3257, + "step": 7427 + }, + { + "epoch": 0.699748004050776, + "grad_norm": 1.1633214950561523, + "learning_rate": 1.4910813424779695e-05, + "loss": 0.3578, + "step": 7428 + }, + { + "epoch": 0.6998422081439438, + "grad_norm": 0.6494668126106262, + "learning_rate": 1.4909497971313069e-05, + "loss": 0.2765, + "step": 7429 + }, + { + "epoch": 0.6999364122371117, + "grad_norm": 0.7671998739242554, + "learning_rate": 1.4908182405902296e-05, + "loss": 0.33, + "step": 7430 + }, + { + "epoch": 0.7000306163302795, + "grad_norm": 0.7501470446586609, + "learning_rate": 1.4906866728577373e-05, + "loss": 0.3344, + "step": 7431 + }, + { + "epoch": 0.7001248204234474, + "grad_norm": 3.6615259647369385, + "learning_rate": 1.49055509393683e-05, + "loss": 0.3136, + "step": 7432 + }, + { + "epoch": 0.7002190245166152, + "grad_norm": 0.8734865188598633, + "learning_rate": 1.4904235038305084e-05, + "loss": 0.3371, + "step": 7433 + }, + { + "epoch": 0.7003132286097831, + "grad_norm": 0.8595309853553772, + "learning_rate": 1.4902919025417718e-05, + "loss": 0.3185, + "step": 7434 + }, + { + "epoch": 0.7004074327029509, + "grad_norm": 0.6879934072494507, + "learning_rate": 1.490160290073622e-05, + "loss": 0.2974, + "step": 7435 + }, + { + "epoch": 0.7005016367961188, + "grad_norm": 0.7653011679649353, + "learning_rate": 1.4900286664290593e-05, + "loss": 0.3238, + "step": 7436 + }, + { + "epoch": 0.7005958408892866, + "grad_norm": 0.6484694480895996, + "learning_rate": 1.4898970316110852e-05, + "loss": 0.3273, + "step": 7437 + }, + { + "epoch": 0.7006900449824545, + "grad_norm": 0.6532779335975647, + "learning_rate": 1.4897653856227012e-05, + "loss": 0.3061, + "step": 7438 + }, + { + "epoch": 0.7007842490756223, + "grad_norm": 0.910477876663208, + "learning_rate": 1.4896337284669091e-05, + "loss": 0.2835, + "step": 7439 + }, + { + "epoch": 0.7008784531687902, + "grad_norm": 0.6949968338012695, + "learning_rate": 1.4895020601467107e-05, + "loss": 0.3215, + "step": 7440 + }, + { + "epoch": 0.700972657261958, + "grad_norm": 0.7668197154998779, + "learning_rate": 1.4893703806651081e-05, + "loss": 0.3319, + "step": 7441 + }, + { + "epoch": 0.7010668613551259, + "grad_norm": 0.6469756364822388, + "learning_rate": 1.4892386900251041e-05, + "loss": 0.3026, + "step": 7442 + }, + { + "epoch": 0.7011610654482937, + "grad_norm": 0.9849398732185364, + "learning_rate": 1.4891069882297013e-05, + "loss": 0.3075, + "step": 7443 + }, + { + "epoch": 0.7012552695414616, + "grad_norm": 0.787672221660614, + "learning_rate": 1.4889752752819029e-05, + "loss": 0.2772, + "step": 7444 + }, + { + "epoch": 0.7013494736346294, + "grad_norm": 0.859749436378479, + "learning_rate": 1.488843551184712e-05, + "loss": 0.331, + "step": 7445 + }, + { + "epoch": 0.7014436777277973, + "grad_norm": 0.747424840927124, + "learning_rate": 1.4887118159411317e-05, + "loss": 0.3107, + "step": 7446 + }, + { + "epoch": 0.7015378818209651, + "grad_norm": 0.7198090553283691, + "learning_rate": 1.4885800695541668e-05, + "loss": 0.3098, + "step": 7447 + }, + { + "epoch": 0.701632085914133, + "grad_norm": 0.7659379243850708, + "learning_rate": 1.4884483120268202e-05, + "loss": 0.3099, + "step": 7448 + }, + { + "epoch": 0.7017262900073008, + "grad_norm": 0.7211584448814392, + "learning_rate": 1.488316543362097e-05, + "loss": 0.3165, + "step": 7449 + }, + { + "epoch": 0.7018204941004687, + "grad_norm": 0.668483555316925, + "learning_rate": 1.4881847635630014e-05, + "loss": 0.2987, + "step": 7450 + }, + { + "epoch": 0.7019146981936365, + "grad_norm": 0.7471227645874023, + "learning_rate": 1.488052972632538e-05, + "loss": 0.3173, + "step": 7451 + }, + { + "epoch": 0.7020089022868043, + "grad_norm": 0.6653876900672913, + "learning_rate": 1.4879211705737121e-05, + "loss": 0.2566, + "step": 7452 + }, + { + "epoch": 0.7021031063799722, + "grad_norm": 0.6835277676582336, + "learning_rate": 1.4877893573895292e-05, + "loss": 0.2719, + "step": 7453 + }, + { + "epoch": 0.70219731047314, + "grad_norm": 0.6887438893318176, + "learning_rate": 1.4876575330829943e-05, + "loss": 0.3095, + "step": 7454 + }, + { + "epoch": 0.7022915145663079, + "grad_norm": 0.7306256890296936, + "learning_rate": 1.4875256976571135e-05, + "loss": 0.3151, + "step": 7455 + }, + { + "epoch": 0.7023857186594757, + "grad_norm": 0.7601445913314819, + "learning_rate": 1.487393851114893e-05, + "loss": 0.3356, + "step": 7456 + }, + { + "epoch": 0.7024799227526436, + "grad_norm": 0.7967188358306885, + "learning_rate": 1.4872619934593387e-05, + "loss": 0.3189, + "step": 7457 + }, + { + "epoch": 0.7025741268458114, + "grad_norm": 0.7820927500724792, + "learning_rate": 1.4871301246934574e-05, + "loss": 0.3021, + "step": 7458 + }, + { + "epoch": 0.7026683309389793, + "grad_norm": 0.7008146047592163, + "learning_rate": 1.486998244820256e-05, + "loss": 0.2988, + "step": 7459 + }, + { + "epoch": 0.7027625350321471, + "grad_norm": 0.7662867903709412, + "learning_rate": 1.4868663538427415e-05, + "loss": 0.3522, + "step": 7460 + }, + { + "epoch": 0.702856739125315, + "grad_norm": 0.9097483158111572, + "learning_rate": 1.4867344517639208e-05, + "loss": 0.3585, + "step": 7461 + }, + { + "epoch": 0.7029509432184828, + "grad_norm": 1.0366448163986206, + "learning_rate": 1.4866025385868024e-05, + "loss": 0.372, + "step": 7462 + }, + { + "epoch": 0.7030451473116507, + "grad_norm": 0.8925309777259827, + "learning_rate": 1.4864706143143931e-05, + "loss": 0.327, + "step": 7463 + }, + { + "epoch": 0.7031393514048185, + "grad_norm": 0.7557048201560974, + "learning_rate": 1.4863386789497019e-05, + "loss": 0.3275, + "step": 7464 + }, + { + "epoch": 0.7032335554979864, + "grad_norm": 0.7582609057426453, + "learning_rate": 1.4862067324957364e-05, + "loss": 0.3, + "step": 7465 + }, + { + "epoch": 0.7033277595911542, + "grad_norm": 0.9192906618118286, + "learning_rate": 1.4860747749555054e-05, + "loss": 0.3507, + "step": 7466 + }, + { + "epoch": 0.7034219636843221, + "grad_norm": 0.7212841510772705, + "learning_rate": 1.485942806332018e-05, + "loss": 0.3203, + "step": 7467 + }, + { + "epoch": 0.7035161677774899, + "grad_norm": 0.7245615124702454, + "learning_rate": 1.4858108266282831e-05, + "loss": 0.3122, + "step": 7468 + }, + { + "epoch": 0.7036103718706578, + "grad_norm": 0.7031428217887878, + "learning_rate": 1.4856788358473097e-05, + "loss": 0.3309, + "step": 7469 + }, + { + "epoch": 0.7037045759638256, + "grad_norm": 0.7170652747154236, + "learning_rate": 1.4855468339921082e-05, + "loss": 0.2901, + "step": 7470 + }, + { + "epoch": 0.7037987800569935, + "grad_norm": 0.7371145486831665, + "learning_rate": 1.4854148210656876e-05, + "loss": 0.3079, + "step": 7471 + }, + { + "epoch": 0.7038929841501613, + "grad_norm": 0.7147948145866394, + "learning_rate": 1.4852827970710584e-05, + "loss": 0.2954, + "step": 7472 + }, + { + "epoch": 0.7039871882433292, + "grad_norm": 0.7179796099662781, + "learning_rate": 1.4851507620112313e-05, + "loss": 0.3479, + "step": 7473 + }, + { + "epoch": 0.704081392336497, + "grad_norm": 0.7083032727241516, + "learning_rate": 1.4850187158892163e-05, + "loss": 0.3177, + "step": 7474 + }, + { + "epoch": 0.7041755964296649, + "grad_norm": 0.7225296497344971, + "learning_rate": 1.4848866587080245e-05, + "loss": 0.2992, + "step": 7475 + }, + { + "epoch": 0.7042698005228327, + "grad_norm": 0.6736642122268677, + "learning_rate": 1.4847545904706668e-05, + "loss": 0.3126, + "step": 7476 + }, + { + "epoch": 0.7043640046160006, + "grad_norm": 0.7275652885437012, + "learning_rate": 1.4846225111801546e-05, + "loss": 0.3571, + "step": 7477 + }, + { + "epoch": 0.7044582087091684, + "grad_norm": 0.8346661925315857, + "learning_rate": 1.4844904208395001e-05, + "loss": 0.3115, + "step": 7478 + }, + { + "epoch": 0.7045524128023363, + "grad_norm": 0.7387918829917908, + "learning_rate": 1.4843583194517145e-05, + "loss": 0.297, + "step": 7479 + }, + { + "epoch": 0.7046466168955041, + "grad_norm": 0.8029735088348389, + "learning_rate": 1.4842262070198104e-05, + "loss": 0.297, + "step": 7480 + }, + { + "epoch": 0.704740820988672, + "grad_norm": 0.7478859424591064, + "learning_rate": 1.4840940835467996e-05, + "loss": 0.3135, + "step": 7481 + }, + { + "epoch": 0.7048350250818398, + "grad_norm": 0.6728004217147827, + "learning_rate": 1.4839619490356953e-05, + "loss": 0.2493, + "step": 7482 + }, + { + "epoch": 0.7049292291750077, + "grad_norm": 0.7483137845993042, + "learning_rate": 1.4838298034895097e-05, + "loss": 0.3107, + "step": 7483 + }, + { + "epoch": 0.7050234332681755, + "grad_norm": 0.8089814782142639, + "learning_rate": 1.483697646911257e-05, + "loss": 0.3221, + "step": 7484 + }, + { + "epoch": 0.7051176373613434, + "grad_norm": 0.7896853685379028, + "learning_rate": 1.4835654793039493e-05, + "loss": 0.3284, + "step": 7485 + }, + { + "epoch": 0.7052118414545112, + "grad_norm": 1.2581260204315186, + "learning_rate": 1.483433300670601e-05, + "loss": 0.3294, + "step": 7486 + }, + { + "epoch": 0.7053060455476791, + "grad_norm": 0.732612669467926, + "learning_rate": 1.4833011110142257e-05, + "loss": 0.343, + "step": 7487 + }, + { + "epoch": 0.7054002496408469, + "grad_norm": 0.819580078125, + "learning_rate": 1.4831689103378378e-05, + "loss": 0.3698, + "step": 7488 + }, + { + "epoch": 0.7054944537340148, + "grad_norm": 0.69870924949646, + "learning_rate": 1.4830366986444513e-05, + "loss": 0.292, + "step": 7489 + }, + { + "epoch": 0.7055886578271826, + "grad_norm": 0.7503359317779541, + "learning_rate": 1.4829044759370814e-05, + "loss": 0.2922, + "step": 7490 + }, + { + "epoch": 0.7056828619203505, + "grad_norm": 0.9934459924697876, + "learning_rate": 1.4827722422187424e-05, + "loss": 0.3177, + "step": 7491 + }, + { + "epoch": 0.7057770660135183, + "grad_norm": 0.8618832230567932, + "learning_rate": 1.4826399974924495e-05, + "loss": 0.3352, + "step": 7492 + }, + { + "epoch": 0.7058712701066862, + "grad_norm": 0.7464866042137146, + "learning_rate": 1.4825077417612187e-05, + "loss": 0.3037, + "step": 7493 + }, + { + "epoch": 0.705965474199854, + "grad_norm": 0.7409743070602417, + "learning_rate": 1.4823754750280646e-05, + "loss": 0.3414, + "step": 7494 + }, + { + "epoch": 0.7060596782930219, + "grad_norm": 0.7519456148147583, + "learning_rate": 1.482243197296004e-05, + "loss": 0.3243, + "step": 7495 + }, + { + "epoch": 0.7061538823861897, + "grad_norm": 0.7602671384811401, + "learning_rate": 1.4821109085680528e-05, + "loss": 0.3124, + "step": 7496 + }, + { + "epoch": 0.7062480864793576, + "grad_norm": 0.7375609278678894, + "learning_rate": 1.4819786088472268e-05, + "loss": 0.2839, + "step": 7497 + }, + { + "epoch": 0.7063422905725254, + "grad_norm": 0.6524137258529663, + "learning_rate": 1.4818462981365435e-05, + "loss": 0.2732, + "step": 7498 + }, + { + "epoch": 0.7064364946656932, + "grad_norm": 0.8175925612449646, + "learning_rate": 1.4817139764390193e-05, + "loss": 0.3312, + "step": 7499 + }, + { + "epoch": 0.7065306987588611, + "grad_norm": 1.2809901237487793, + "learning_rate": 1.4815816437576716e-05, + "loss": 0.323, + "step": 7500 + }, + { + "epoch": 0.706624902852029, + "grad_norm": 0.7297874093055725, + "learning_rate": 1.4814493000955175e-05, + "loss": 0.3283, + "step": 7501 + }, + { + "epoch": 0.7067191069451968, + "grad_norm": 0.6727867126464844, + "learning_rate": 1.481316945455575e-05, + "loss": 0.294, + "step": 7502 + }, + { + "epoch": 0.7068133110383646, + "grad_norm": 0.7023646831512451, + "learning_rate": 1.4811845798408615e-05, + "loss": 0.3109, + "step": 7503 + }, + { + "epoch": 0.7069075151315325, + "grad_norm": 0.8942722082138062, + "learning_rate": 1.4810522032543956e-05, + "loss": 0.3004, + "step": 7504 + }, + { + "epoch": 0.7070017192247003, + "grad_norm": 0.7658995389938354, + "learning_rate": 1.4809198156991954e-05, + "loss": 0.2923, + "step": 7505 + }, + { + "epoch": 0.7070959233178682, + "grad_norm": 0.8051748275756836, + "learning_rate": 1.4807874171782795e-05, + "loss": 0.3487, + "step": 7506 + }, + { + "epoch": 0.707190127411036, + "grad_norm": 0.7508425712585449, + "learning_rate": 1.4806550076946673e-05, + "loss": 0.3322, + "step": 7507 + }, + { + "epoch": 0.7072843315042039, + "grad_norm": 0.7547079920768738, + "learning_rate": 1.4805225872513774e-05, + "loss": 0.2869, + "step": 7508 + }, + { + "epoch": 0.7073785355973717, + "grad_norm": 1.0497976541519165, + "learning_rate": 1.4803901558514291e-05, + "loss": 0.3009, + "step": 7509 + }, + { + "epoch": 0.7074727396905396, + "grad_norm": 0.7662302255630493, + "learning_rate": 1.4802577134978429e-05, + "loss": 0.3124, + "step": 7510 + }, + { + "epoch": 0.7075669437837074, + "grad_norm": 0.7299298644065857, + "learning_rate": 1.4801252601936375e-05, + "loss": 0.3711, + "step": 7511 + }, + { + "epoch": 0.7076611478768753, + "grad_norm": 0.7905606031417847, + "learning_rate": 1.4799927959418338e-05, + "loss": 0.2966, + "step": 7512 + }, + { + "epoch": 0.7077553519700431, + "grad_norm": 0.7510884404182434, + "learning_rate": 1.4798603207454524e-05, + "loss": 0.3529, + "step": 7513 + }, + { + "epoch": 0.707849556063211, + "grad_norm": 0.754071831703186, + "learning_rate": 1.479727834607513e-05, + "loss": 0.3493, + "step": 7514 + }, + { + "epoch": 0.7079437601563788, + "grad_norm": 0.7773962616920471, + "learning_rate": 1.4795953375310375e-05, + "loss": 0.3261, + "step": 7515 + }, + { + "epoch": 0.7080379642495467, + "grad_norm": 0.6722862720489502, + "learning_rate": 1.4794628295190466e-05, + "loss": 0.2935, + "step": 7516 + }, + { + "epoch": 0.7081321683427145, + "grad_norm": 0.6952345371246338, + "learning_rate": 1.4793303105745613e-05, + "loss": 0.2584, + "step": 7517 + }, + { + "epoch": 0.7082263724358824, + "grad_norm": 0.8048527240753174, + "learning_rate": 1.479197780700604e-05, + "loss": 0.3083, + "step": 7518 + }, + { + "epoch": 0.7083205765290502, + "grad_norm": 0.8349993824958801, + "learning_rate": 1.4790652399001962e-05, + "loss": 0.3216, + "step": 7519 + }, + { + "epoch": 0.7084147806222181, + "grad_norm": 0.8492235541343689, + "learning_rate": 1.4789326881763597e-05, + "loss": 0.3474, + "step": 7520 + }, + { + "epoch": 0.7085089847153859, + "grad_norm": 0.7065709829330444, + "learning_rate": 1.4788001255321174e-05, + "loss": 0.2937, + "step": 7521 + }, + { + "epoch": 0.7086031888085538, + "grad_norm": 0.8696237802505493, + "learning_rate": 1.4786675519704919e-05, + "loss": 0.3609, + "step": 7522 + }, + { + "epoch": 0.7086973929017216, + "grad_norm": 0.7493220567703247, + "learning_rate": 1.4785349674945058e-05, + "loss": 0.343, + "step": 7523 + }, + { + "epoch": 0.7087915969948895, + "grad_norm": 0.7842844128608704, + "learning_rate": 1.4784023721071825e-05, + "loss": 0.3472, + "step": 7524 + }, + { + "epoch": 0.7088858010880573, + "grad_norm": 0.7957137823104858, + "learning_rate": 1.478269765811545e-05, + "loss": 0.3187, + "step": 7525 + }, + { + "epoch": 0.7089800051812252, + "grad_norm": 0.7527223825454712, + "learning_rate": 1.4781371486106174e-05, + "loss": 0.2964, + "step": 7526 + }, + { + "epoch": 0.709074209274393, + "grad_norm": 0.8188031315803528, + "learning_rate": 1.4780045205074231e-05, + "loss": 0.3618, + "step": 7527 + }, + { + "epoch": 0.7091684133675609, + "grad_norm": 0.7470711469650269, + "learning_rate": 1.4778718815049868e-05, + "loss": 0.3106, + "step": 7528 + }, + { + "epoch": 0.7092626174607287, + "grad_norm": 0.8296229839324951, + "learning_rate": 1.4777392316063324e-05, + "loss": 0.3047, + "step": 7529 + }, + { + "epoch": 0.7093568215538966, + "grad_norm": 0.6684839725494385, + "learning_rate": 1.4776065708144848e-05, + "loss": 0.2667, + "step": 7530 + }, + { + "epoch": 0.7094510256470644, + "grad_norm": 0.7411872744560242, + "learning_rate": 1.4774738991324686e-05, + "loss": 0.3176, + "step": 7531 + }, + { + "epoch": 0.7095452297402323, + "grad_norm": 0.689228355884552, + "learning_rate": 1.477341216563309e-05, + "loss": 0.311, + "step": 7532 + }, + { + "epoch": 0.7096394338334001, + "grad_norm": 0.7025071978569031, + "learning_rate": 1.477208523110032e-05, + "loss": 0.3029, + "step": 7533 + }, + { + "epoch": 0.709733637926568, + "grad_norm": 0.6913065910339355, + "learning_rate": 1.4770758187756619e-05, + "loss": 0.3309, + "step": 7534 + }, + { + "epoch": 0.7098278420197358, + "grad_norm": 0.9382787942886353, + "learning_rate": 1.4769431035632258e-05, + "loss": 0.3401, + "step": 7535 + }, + { + "epoch": 0.7099220461129037, + "grad_norm": 0.8482984304428101, + "learning_rate": 1.4768103774757491e-05, + "loss": 0.3835, + "step": 7536 + }, + { + "epoch": 0.7100162502060715, + "grad_norm": 0.9082434773445129, + "learning_rate": 1.4766776405162584e-05, + "loss": 0.3392, + "step": 7537 + }, + { + "epoch": 0.7101104542992392, + "grad_norm": 0.7156729698181152, + "learning_rate": 1.4765448926877804e-05, + "loss": 0.334, + "step": 7538 + }, + { + "epoch": 0.7102046583924071, + "grad_norm": 0.6966708302497864, + "learning_rate": 1.476412133993342e-05, + "loss": 0.2901, + "step": 7539 + }, + { + "epoch": 0.7102988624855749, + "grad_norm": 0.8167742490768433, + "learning_rate": 1.4762793644359699e-05, + "loss": 0.3767, + "step": 7540 + }, + { + "epoch": 0.7103930665787428, + "grad_norm": 0.7934980988502502, + "learning_rate": 1.4761465840186919e-05, + "loss": 0.3093, + "step": 7541 + }, + { + "epoch": 0.7104872706719106, + "grad_norm": 0.8151422142982483, + "learning_rate": 1.476013792744535e-05, + "loss": 0.3036, + "step": 7542 + }, + { + "epoch": 0.7105814747650785, + "grad_norm": 0.7666172981262207, + "learning_rate": 1.4758809906165281e-05, + "loss": 0.3427, + "step": 7543 + }, + { + "epoch": 0.7106756788582463, + "grad_norm": 0.7245919108390808, + "learning_rate": 1.4757481776376983e-05, + "loss": 0.3168, + "step": 7544 + }, + { + "epoch": 0.7107698829514142, + "grad_norm": 0.8002803921699524, + "learning_rate": 1.4756153538110742e-05, + "loss": 0.2777, + "step": 7545 + }, + { + "epoch": 0.710864087044582, + "grad_norm": 0.7824395895004272, + "learning_rate": 1.4754825191396847e-05, + "loss": 0.3185, + "step": 7546 + }, + { + "epoch": 0.7109582911377499, + "grad_norm": 0.7184972167015076, + "learning_rate": 1.4753496736265582e-05, + "loss": 0.3018, + "step": 7547 + }, + { + "epoch": 0.7110524952309177, + "grad_norm": 0.8208193778991699, + "learning_rate": 1.4752168172747242e-05, + "loss": 0.3566, + "step": 7548 + }, + { + "epoch": 0.7111466993240856, + "grad_norm": 0.8324835300445557, + "learning_rate": 1.4750839500872116e-05, + "loss": 0.29, + "step": 7549 + }, + { + "epoch": 0.7112409034172534, + "grad_norm": 0.8856258392333984, + "learning_rate": 1.4749510720670506e-05, + "loss": 0.3169, + "step": 7550 + }, + { + "epoch": 0.7113351075104213, + "grad_norm": 0.7524356245994568, + "learning_rate": 1.4748181832172703e-05, + "loss": 0.3383, + "step": 7551 + }, + { + "epoch": 0.7114293116035891, + "grad_norm": 0.6797780990600586, + "learning_rate": 1.4746852835409009e-05, + "loss": 0.2675, + "step": 7552 + }, + { + "epoch": 0.711523515696757, + "grad_norm": 0.7996805310249329, + "learning_rate": 1.4745523730409734e-05, + "loss": 0.3425, + "step": 7553 + }, + { + "epoch": 0.7116177197899248, + "grad_norm": 0.7375102639198303, + "learning_rate": 1.4744194517205177e-05, + "loss": 0.2872, + "step": 7554 + }, + { + "epoch": 0.7117119238830927, + "grad_norm": 0.7449827194213867, + "learning_rate": 1.4742865195825649e-05, + "loss": 0.2979, + "step": 7555 + }, + { + "epoch": 0.7118061279762605, + "grad_norm": 0.7717209458351135, + "learning_rate": 1.4741535766301458e-05, + "loss": 0.3293, + "step": 7556 + }, + { + "epoch": 0.7119003320694284, + "grad_norm": 0.7713416814804077, + "learning_rate": 1.4740206228662917e-05, + "loss": 0.3874, + "step": 7557 + }, + { + "epoch": 0.7119945361625962, + "grad_norm": 0.8193777799606323, + "learning_rate": 1.4738876582940344e-05, + "loss": 0.323, + "step": 7558 + }, + { + "epoch": 0.7120887402557641, + "grad_norm": 0.7421199083328247, + "learning_rate": 1.4737546829164057e-05, + "loss": 0.3599, + "step": 7559 + }, + { + "epoch": 0.7121829443489319, + "grad_norm": 0.7332062125205994, + "learning_rate": 1.4736216967364379e-05, + "loss": 0.3498, + "step": 7560 + }, + { + "epoch": 0.7122771484420998, + "grad_norm": 0.7626232504844666, + "learning_rate": 1.4734886997571627e-05, + "loss": 0.3051, + "step": 7561 + }, + { + "epoch": 0.7123713525352676, + "grad_norm": 1.1218847036361694, + "learning_rate": 1.4733556919816126e-05, + "loss": 0.2813, + "step": 7562 + }, + { + "epoch": 0.7124655566284355, + "grad_norm": 0.7519378066062927, + "learning_rate": 1.4732226734128208e-05, + "loss": 0.3451, + "step": 7563 + }, + { + "epoch": 0.7125597607216033, + "grad_norm": 0.7056507468223572, + "learning_rate": 1.4730896440538203e-05, + "loss": 0.3118, + "step": 7564 + }, + { + "epoch": 0.7126539648147712, + "grad_norm": 0.77479487657547, + "learning_rate": 1.4729566039076444e-05, + "loss": 0.3676, + "step": 7565 + }, + { + "epoch": 0.712748168907939, + "grad_norm": 0.7520031929016113, + "learning_rate": 1.472823552977326e-05, + "loss": 0.3103, + "step": 7566 + }, + { + "epoch": 0.7128423730011069, + "grad_norm": 0.7875931859016418, + "learning_rate": 1.4726904912658997e-05, + "loss": 0.3242, + "step": 7567 + }, + { + "epoch": 0.7129365770942747, + "grad_norm": 0.772013247013092, + "learning_rate": 1.4725574187763991e-05, + "loss": 0.3427, + "step": 7568 + }, + { + "epoch": 0.7130307811874426, + "grad_norm": 0.7841171026229858, + "learning_rate": 1.4724243355118587e-05, + "loss": 0.3181, + "step": 7569 + }, + { + "epoch": 0.7131249852806104, + "grad_norm": 0.8140650987625122, + "learning_rate": 1.4722912414753127e-05, + "loss": 0.3338, + "step": 7570 + }, + { + "epoch": 0.7132191893737783, + "grad_norm": 0.7578203678131104, + "learning_rate": 1.4721581366697959e-05, + "loss": 0.3208, + "step": 7571 + }, + { + "epoch": 0.7133133934669461, + "grad_norm": 0.7792476415634155, + "learning_rate": 1.4720250210983434e-05, + "loss": 0.3746, + "step": 7572 + }, + { + "epoch": 0.713407597560114, + "grad_norm": 0.9261788129806519, + "learning_rate": 1.4718918947639907e-05, + "loss": 0.3038, + "step": 7573 + }, + { + "epoch": 0.7135018016532818, + "grad_norm": 0.8140808939933777, + "learning_rate": 1.4717587576697727e-05, + "loss": 0.3144, + "step": 7574 + }, + { + "epoch": 0.7135960057464497, + "grad_norm": 0.6864731907844543, + "learning_rate": 1.4716256098187254e-05, + "loss": 0.3152, + "step": 7575 + }, + { + "epoch": 0.7136902098396175, + "grad_norm": 0.8876187801361084, + "learning_rate": 1.471492451213885e-05, + "loss": 0.3289, + "step": 7576 + }, + { + "epoch": 0.7137844139327854, + "grad_norm": 0.7731730937957764, + "learning_rate": 1.4713592818582873e-05, + "loss": 0.3409, + "step": 7577 + }, + { + "epoch": 0.7138786180259532, + "grad_norm": 0.8989897966384888, + "learning_rate": 1.4712261017549692e-05, + "loss": 0.3412, + "step": 7578 + }, + { + "epoch": 0.713972822119121, + "grad_norm": 0.7042735815048218, + "learning_rate": 1.4710929109069674e-05, + "loss": 0.2827, + "step": 7579 + }, + { + "epoch": 0.7140670262122889, + "grad_norm": 0.7563043832778931, + "learning_rate": 1.4709597093173184e-05, + "loss": 0.3589, + "step": 7580 + }, + { + "epoch": 0.7141612303054568, + "grad_norm": 0.7748274207115173, + "learning_rate": 1.4708264969890598e-05, + "loss": 0.3215, + "step": 7581 + }, + { + "epoch": 0.7142554343986246, + "grad_norm": 0.6974570155143738, + "learning_rate": 1.4706932739252286e-05, + "loss": 0.2845, + "step": 7582 + }, + { + "epoch": 0.7143496384917924, + "grad_norm": 0.7702315449714661, + "learning_rate": 1.4705600401288632e-05, + "loss": 0.3051, + "step": 7583 + }, + { + "epoch": 0.7144438425849603, + "grad_norm": 0.7599518299102783, + "learning_rate": 1.4704267956030011e-05, + "loss": 0.3312, + "step": 7584 + }, + { + "epoch": 0.7145380466781281, + "grad_norm": 0.7633971571922302, + "learning_rate": 1.4702935403506804e-05, + "loss": 0.3311, + "step": 7585 + }, + { + "epoch": 0.714632250771296, + "grad_norm": 0.7205565571784973, + "learning_rate": 1.4701602743749397e-05, + "loss": 0.3354, + "step": 7586 + }, + { + "epoch": 0.7147264548644638, + "grad_norm": 0.7502606511116028, + "learning_rate": 1.4700269976788175e-05, + "loss": 0.3379, + "step": 7587 + }, + { + "epoch": 0.7148206589576317, + "grad_norm": 0.7521887421607971, + "learning_rate": 1.469893710265353e-05, + "loss": 0.3116, + "step": 7588 + }, + { + "epoch": 0.7149148630507995, + "grad_norm": 0.7757728099822998, + "learning_rate": 1.469760412137585e-05, + "loss": 0.2945, + "step": 7589 + }, + { + "epoch": 0.7150090671439674, + "grad_norm": 0.8067355751991272, + "learning_rate": 1.4696271032985536e-05, + "loss": 0.3169, + "step": 7590 + }, + { + "epoch": 0.7151032712371352, + "grad_norm": 0.8102144002914429, + "learning_rate": 1.4694937837512975e-05, + "loss": 0.3197, + "step": 7591 + }, + { + "epoch": 0.7151974753303031, + "grad_norm": 0.7835685610771179, + "learning_rate": 1.469360453498857e-05, + "loss": 0.3038, + "step": 7592 + }, + { + "epoch": 0.7152916794234709, + "grad_norm": 0.7127066254615784, + "learning_rate": 1.4692271125442727e-05, + "loss": 0.2981, + "step": 7593 + }, + { + "epoch": 0.7153858835166388, + "grad_norm": 0.7197777628898621, + "learning_rate": 1.4690937608905843e-05, + "loss": 0.3397, + "step": 7594 + }, + { + "epoch": 0.7154800876098066, + "grad_norm": 0.6157107949256897, + "learning_rate": 1.4689603985408324e-05, + "loss": 0.2464, + "step": 7595 + }, + { + "epoch": 0.7155742917029745, + "grad_norm": 0.8053467869758606, + "learning_rate": 1.4688270254980588e-05, + "loss": 0.3336, + "step": 7596 + }, + { + "epoch": 0.7156684957961423, + "grad_norm": 1.9933987855911255, + "learning_rate": 1.4686936417653032e-05, + "loss": 0.3317, + "step": 7597 + }, + { + "epoch": 0.7157626998893102, + "grad_norm": 0.6925097107887268, + "learning_rate": 1.4685602473456084e-05, + "loss": 0.3272, + "step": 7598 + }, + { + "epoch": 0.715856903982478, + "grad_norm": 0.6546260118484497, + "learning_rate": 1.4684268422420149e-05, + "loss": 0.2834, + "step": 7599 + }, + { + "epoch": 0.7159511080756459, + "grad_norm": 0.7096473574638367, + "learning_rate": 1.4682934264575655e-05, + "loss": 0.3105, + "step": 7600 + }, + { + "epoch": 0.7160453121688137, + "grad_norm": 0.8337863683700562, + "learning_rate": 1.4681599999953014e-05, + "loss": 0.3549, + "step": 7601 + }, + { + "epoch": 0.7161395162619816, + "grad_norm": 0.7449379563331604, + "learning_rate": 1.4680265628582649e-05, + "loss": 0.3275, + "step": 7602 + }, + { + "epoch": 0.7162337203551494, + "grad_norm": 0.6516759395599365, + "learning_rate": 1.4678931150494992e-05, + "loss": 0.2657, + "step": 7603 + }, + { + "epoch": 0.7163279244483173, + "grad_norm": 0.7168840765953064, + "learning_rate": 1.4677596565720472e-05, + "loss": 0.3175, + "step": 7604 + }, + { + "epoch": 0.7164221285414851, + "grad_norm": 0.7934387922286987, + "learning_rate": 1.4676261874289512e-05, + "loss": 0.3344, + "step": 7605 + }, + { + "epoch": 0.716516332634653, + "grad_norm": 0.6381176710128784, + "learning_rate": 1.4674927076232553e-05, + "loss": 0.2709, + "step": 7606 + }, + { + "epoch": 0.7166105367278208, + "grad_norm": 0.7143204808235168, + "learning_rate": 1.4673592171580025e-05, + "loss": 0.3133, + "step": 7607 + }, + { + "epoch": 0.7167047408209887, + "grad_norm": 0.7504051923751831, + "learning_rate": 1.4672257160362367e-05, + "loss": 0.3191, + "step": 7608 + }, + { + "epoch": 0.7167989449141565, + "grad_norm": 0.7678667902946472, + "learning_rate": 1.4670922042610018e-05, + "loss": 0.3171, + "step": 7609 + }, + { + "epoch": 0.7168931490073244, + "grad_norm": 0.8258387446403503, + "learning_rate": 1.4669586818353427e-05, + "loss": 0.4037, + "step": 7610 + }, + { + "epoch": 0.7169873531004922, + "grad_norm": 0.8272293210029602, + "learning_rate": 1.4668251487623033e-05, + "loss": 0.356, + "step": 7611 + }, + { + "epoch": 0.7170815571936601, + "grad_norm": 0.9082145690917969, + "learning_rate": 1.4666916050449285e-05, + "loss": 0.2972, + "step": 7612 + }, + { + "epoch": 0.7171757612868279, + "grad_norm": 0.9436091184616089, + "learning_rate": 1.4665580506862636e-05, + "loss": 0.3203, + "step": 7613 + }, + { + "epoch": 0.7172699653799958, + "grad_norm": 0.7845689058303833, + "learning_rate": 1.4664244856893532e-05, + "loss": 0.3522, + "step": 7614 + }, + { + "epoch": 0.7173641694731636, + "grad_norm": 0.7713825106620789, + "learning_rate": 1.4662909100572433e-05, + "loss": 0.298, + "step": 7615 + }, + { + "epoch": 0.7174583735663315, + "grad_norm": 0.7790085077285767, + "learning_rate": 1.4661573237929798e-05, + "loss": 0.3182, + "step": 7616 + }, + { + "epoch": 0.7175525776594993, + "grad_norm": 0.7305101752281189, + "learning_rate": 1.466023726899608e-05, + "loss": 0.3121, + "step": 7617 + }, + { + "epoch": 0.7176467817526672, + "grad_norm": 0.7902939319610596, + "learning_rate": 1.4658901193801747e-05, + "loss": 0.3211, + "step": 7618 + }, + { + "epoch": 0.717740985845835, + "grad_norm": 0.7859717011451721, + "learning_rate": 1.4657565012377263e-05, + "loss": 0.3065, + "step": 7619 + }, + { + "epoch": 0.7178351899390029, + "grad_norm": 0.8295626044273376, + "learning_rate": 1.4656228724753093e-05, + "loss": 0.2733, + "step": 7620 + }, + { + "epoch": 0.7179293940321707, + "grad_norm": 0.7178802490234375, + "learning_rate": 1.4654892330959707e-05, + "loss": 0.289, + "step": 7621 + }, + { + "epoch": 0.7180235981253386, + "grad_norm": 0.8516690135002136, + "learning_rate": 1.4653555831027575e-05, + "loss": 0.316, + "step": 7622 + }, + { + "epoch": 0.7181178022185064, + "grad_norm": 0.7444583773612976, + "learning_rate": 1.4652219224987174e-05, + "loss": 0.2749, + "step": 7623 + }, + { + "epoch": 0.7182120063116743, + "grad_norm": 0.7659174799919128, + "learning_rate": 1.4650882512868983e-05, + "loss": 0.3405, + "step": 7624 + }, + { + "epoch": 0.7183062104048421, + "grad_norm": 5.310520648956299, + "learning_rate": 1.4649545694703476e-05, + "loss": 0.3339, + "step": 7625 + }, + { + "epoch": 0.71840041449801, + "grad_norm": 0.756412148475647, + "learning_rate": 1.4648208770521134e-05, + "loss": 0.366, + "step": 7626 + }, + { + "epoch": 0.7184946185911778, + "grad_norm": 0.7343206405639648, + "learning_rate": 1.4646871740352444e-05, + "loss": 0.3252, + "step": 7627 + }, + { + "epoch": 0.7185888226843457, + "grad_norm": 0.6697603464126587, + "learning_rate": 1.4645534604227894e-05, + "loss": 0.3041, + "step": 7628 + }, + { + "epoch": 0.7186830267775135, + "grad_norm": 0.665007472038269, + "learning_rate": 1.4644197362177967e-05, + "loss": 0.2974, + "step": 7629 + }, + { + "epoch": 0.7187772308706813, + "grad_norm": 0.6484889388084412, + "learning_rate": 1.4642860014233164e-05, + "loss": 0.2805, + "step": 7630 + }, + { + "epoch": 0.7188714349638492, + "grad_norm": 0.8215962648391724, + "learning_rate": 1.4641522560423966e-05, + "loss": 0.3275, + "step": 7631 + }, + { + "epoch": 0.718965639057017, + "grad_norm": 0.8847222328186035, + "learning_rate": 1.464018500078088e-05, + "loss": 0.3599, + "step": 7632 + }, + { + "epoch": 0.7190598431501849, + "grad_norm": 0.8452063202857971, + "learning_rate": 1.4638847335334399e-05, + "loss": 0.342, + "step": 7633 + }, + { + "epoch": 0.7191540472433527, + "grad_norm": 0.8655790686607361, + "learning_rate": 1.4637509564115022e-05, + "loss": 0.397, + "step": 7634 + }, + { + "epoch": 0.7192482513365206, + "grad_norm": 0.6889362931251526, + "learning_rate": 1.4636171687153255e-05, + "loss": 0.2995, + "step": 7635 + }, + { + "epoch": 0.7193424554296884, + "grad_norm": 1.0323363542556763, + "learning_rate": 1.4634833704479607e-05, + "loss": 0.3485, + "step": 7636 + }, + { + "epoch": 0.7194366595228563, + "grad_norm": 0.6997215747833252, + "learning_rate": 1.463349561612458e-05, + "loss": 0.2629, + "step": 7637 + }, + { + "epoch": 0.7195308636160241, + "grad_norm": 0.7511755228042603, + "learning_rate": 1.4632157422118687e-05, + "loss": 0.3361, + "step": 7638 + }, + { + "epoch": 0.719625067709192, + "grad_norm": 0.7879907488822937, + "learning_rate": 1.4630819122492444e-05, + "loss": 0.2914, + "step": 7639 + }, + { + "epoch": 0.7197192718023598, + "grad_norm": 0.837253212928772, + "learning_rate": 1.4629480717276361e-05, + "loss": 0.334, + "step": 7640 + }, + { + "epoch": 0.7198134758955277, + "grad_norm": 0.7338079810142517, + "learning_rate": 1.4628142206500959e-05, + "loss": 0.2808, + "step": 7641 + }, + { + "epoch": 0.7199076799886955, + "grad_norm": 0.7047969698905945, + "learning_rate": 1.4626803590196754e-05, + "loss": 0.3102, + "step": 7642 + }, + { + "epoch": 0.7200018840818634, + "grad_norm": 0.6892857551574707, + "learning_rate": 1.4625464868394275e-05, + "loss": 0.3119, + "step": 7643 + }, + { + "epoch": 0.7200960881750312, + "grad_norm": 0.8218495845794678, + "learning_rate": 1.4624126041124043e-05, + "loss": 0.3265, + "step": 7644 + }, + { + "epoch": 0.7201902922681991, + "grad_norm": 0.7993860244750977, + "learning_rate": 1.4622787108416585e-05, + "loss": 0.3039, + "step": 7645 + }, + { + "epoch": 0.7202844963613669, + "grad_norm": 0.7169907093048096, + "learning_rate": 1.462144807030243e-05, + "loss": 0.3169, + "step": 7646 + }, + { + "epoch": 0.7203787004545348, + "grad_norm": 0.704494833946228, + "learning_rate": 1.4620108926812115e-05, + "loss": 0.3111, + "step": 7647 + }, + { + "epoch": 0.7204729045477026, + "grad_norm": 0.7978274822235107, + "learning_rate": 1.461876967797617e-05, + "loss": 0.3151, + "step": 7648 + }, + { + "epoch": 0.7205671086408705, + "grad_norm": 0.5734168291091919, + "learning_rate": 1.4617430323825133e-05, + "loss": 0.2606, + "step": 7649 + }, + { + "epoch": 0.7206613127340383, + "grad_norm": 0.6860144734382629, + "learning_rate": 1.4616090864389545e-05, + "loss": 0.2641, + "step": 7650 + }, + { + "epoch": 0.7207555168272062, + "grad_norm": 0.6949218511581421, + "learning_rate": 1.4614751299699946e-05, + "loss": 0.3033, + "step": 7651 + }, + { + "epoch": 0.720849720920374, + "grad_norm": 0.7663481831550598, + "learning_rate": 1.461341162978688e-05, + "loss": 0.3392, + "step": 7652 + }, + { + "epoch": 0.7209439250135419, + "grad_norm": 0.7189931273460388, + "learning_rate": 1.4612071854680893e-05, + "loss": 0.3315, + "step": 7653 + }, + { + "epoch": 0.7210381291067097, + "grad_norm": 0.7467421293258667, + "learning_rate": 1.4610731974412535e-05, + "loss": 0.2719, + "step": 7654 + }, + { + "epoch": 0.7211323331998776, + "grad_norm": 0.7532447576522827, + "learning_rate": 1.460939198901236e-05, + "loss": 0.321, + "step": 7655 + }, + { + "epoch": 0.7212265372930454, + "grad_norm": 0.8318077921867371, + "learning_rate": 1.4608051898510918e-05, + "loss": 0.324, + "step": 7656 + }, + { + "epoch": 0.7213207413862133, + "grad_norm": 0.9194123148918152, + "learning_rate": 1.4606711702938765e-05, + "loss": 0.318, + "step": 7657 + }, + { + "epoch": 0.7214149454793811, + "grad_norm": 0.8693779110908508, + "learning_rate": 1.4605371402326462e-05, + "loss": 0.3017, + "step": 7658 + }, + { + "epoch": 0.721509149572549, + "grad_norm": 1.0532331466674805, + "learning_rate": 1.4604030996704568e-05, + "loss": 0.2838, + "step": 7659 + }, + { + "epoch": 0.7216033536657168, + "grad_norm": 0.7592741847038269, + "learning_rate": 1.4602690486103648e-05, + "loss": 0.3202, + "step": 7660 + }, + { + "epoch": 0.7216975577588847, + "grad_norm": 0.907318115234375, + "learning_rate": 1.4601349870554264e-05, + "loss": 0.3529, + "step": 7661 + }, + { + "epoch": 0.7217917618520525, + "grad_norm": 0.8160556554794312, + "learning_rate": 1.460000915008699e-05, + "loss": 0.3145, + "step": 7662 + }, + { + "epoch": 0.7218859659452204, + "grad_norm": 0.8162074089050293, + "learning_rate": 1.4598668324732392e-05, + "loss": 0.3461, + "step": 7663 + }, + { + "epoch": 0.7219801700383882, + "grad_norm": 0.7328223586082458, + "learning_rate": 1.4597327394521044e-05, + "loss": 0.3273, + "step": 7664 + }, + { + "epoch": 0.7220743741315561, + "grad_norm": 0.6975407600402832, + "learning_rate": 1.4595986359483523e-05, + "loss": 0.3584, + "step": 7665 + }, + { + "epoch": 0.7221685782247239, + "grad_norm": 0.7907822132110596, + "learning_rate": 1.45946452196504e-05, + "loss": 0.3562, + "step": 7666 + }, + { + "epoch": 0.7222627823178918, + "grad_norm": 1.0598143339157104, + "learning_rate": 1.4593303975052266e-05, + "loss": 0.3115, + "step": 7667 + }, + { + "epoch": 0.7223569864110596, + "grad_norm": 0.7048850655555725, + "learning_rate": 1.4591962625719696e-05, + "loss": 0.2968, + "step": 7668 + }, + { + "epoch": 0.7224511905042275, + "grad_norm": 0.9282791614532471, + "learning_rate": 1.4590621171683275e-05, + "loss": 0.3706, + "step": 7669 + }, + { + "epoch": 0.7225453945973953, + "grad_norm": 1.0115927457809448, + "learning_rate": 1.4589279612973597e-05, + "loss": 0.375, + "step": 7670 + }, + { + "epoch": 0.7226395986905632, + "grad_norm": 0.69843590259552, + "learning_rate": 1.458793794962124e-05, + "loss": 0.3144, + "step": 7671 + }, + { + "epoch": 0.722733802783731, + "grad_norm": 0.6993424892425537, + "learning_rate": 1.4586596181656804e-05, + "loss": 0.2873, + "step": 7672 + }, + { + "epoch": 0.7228280068768989, + "grad_norm": 0.7413098216056824, + "learning_rate": 1.4585254309110886e-05, + "loss": 0.2961, + "step": 7673 + }, + { + "epoch": 0.7229222109700667, + "grad_norm": 0.7417903542518616, + "learning_rate": 1.4583912332014071e-05, + "loss": 0.2996, + "step": 7674 + }, + { + "epoch": 0.7230164150632346, + "grad_norm": 0.7722974419593811, + "learning_rate": 1.4582570250396972e-05, + "loss": 0.3328, + "step": 7675 + }, + { + "epoch": 0.7231106191564024, + "grad_norm": 0.8128108978271484, + "learning_rate": 1.4581228064290182e-05, + "loss": 0.2928, + "step": 7676 + }, + { + "epoch": 0.7232048232495701, + "grad_norm": 0.8373945355415344, + "learning_rate": 1.4579885773724305e-05, + "loss": 0.3435, + "step": 7677 + }, + { + "epoch": 0.723299027342738, + "grad_norm": 0.8415419459342957, + "learning_rate": 1.4578543378729954e-05, + "loss": 0.3078, + "step": 7678 + }, + { + "epoch": 0.7233932314359058, + "grad_norm": 0.8667798638343811, + "learning_rate": 1.457720087933773e-05, + "loss": 0.3105, + "step": 7679 + }, + { + "epoch": 0.7234874355290737, + "grad_norm": 0.7439168095588684, + "learning_rate": 1.4575858275578248e-05, + "loss": 0.2982, + "step": 7680 + }, + { + "epoch": 0.7235816396222415, + "grad_norm": 0.6972314119338989, + "learning_rate": 1.4574515567482122e-05, + "loss": 0.2696, + "step": 7681 + }, + { + "epoch": 0.7236758437154094, + "grad_norm": 0.8637144565582275, + "learning_rate": 1.4573172755079963e-05, + "loss": 0.3105, + "step": 7682 + }, + { + "epoch": 0.7237700478085772, + "grad_norm": 0.7399028539657593, + "learning_rate": 1.4571829838402394e-05, + "loss": 0.3514, + "step": 7683 + }, + { + "epoch": 0.7238642519017451, + "grad_norm": 0.7735146284103394, + "learning_rate": 1.4570486817480036e-05, + "loss": 0.3345, + "step": 7684 + }, + { + "epoch": 0.7239584559949129, + "grad_norm": 0.7760829329490662, + "learning_rate": 1.456914369234351e-05, + "loss": 0.3542, + "step": 7685 + }, + { + "epoch": 0.7240526600880808, + "grad_norm": 0.7597216367721558, + "learning_rate": 1.4567800463023438e-05, + "loss": 0.3513, + "step": 7686 + }, + { + "epoch": 0.7241468641812486, + "grad_norm": 0.631736695766449, + "learning_rate": 1.4566457129550453e-05, + "loss": 0.283, + "step": 7687 + }, + { + "epoch": 0.7242410682744165, + "grad_norm": 0.7289599180221558, + "learning_rate": 1.4565113691955187e-05, + "loss": 0.2733, + "step": 7688 + }, + { + "epoch": 0.7243352723675843, + "grad_norm": 0.8164040446281433, + "learning_rate": 1.4563770150268264e-05, + "loss": 0.3156, + "step": 7689 + }, + { + "epoch": 0.7244294764607522, + "grad_norm": 0.7564865350723267, + "learning_rate": 1.4562426504520326e-05, + "loss": 0.3268, + "step": 7690 + }, + { + "epoch": 0.72452368055392, + "grad_norm": 0.6564485430717468, + "learning_rate": 1.4561082754742007e-05, + "loss": 0.3047, + "step": 7691 + }, + { + "epoch": 0.7246178846470879, + "grad_norm": 0.7447567582130432, + "learning_rate": 1.4559738900963946e-05, + "loss": 0.2957, + "step": 7692 + }, + { + "epoch": 0.7247120887402557, + "grad_norm": 0.8177298307418823, + "learning_rate": 1.4558394943216789e-05, + "loss": 0.3118, + "step": 7693 + }, + { + "epoch": 0.7248062928334236, + "grad_norm": 0.6390464901924133, + "learning_rate": 1.4557050881531174e-05, + "loss": 0.2806, + "step": 7694 + }, + { + "epoch": 0.7249004969265914, + "grad_norm": 0.7485120892524719, + "learning_rate": 1.4555706715937755e-05, + "loss": 0.2938, + "step": 7695 + }, + { + "epoch": 0.7249947010197593, + "grad_norm": 0.7831445336341858, + "learning_rate": 1.4554362446467175e-05, + "loss": 0.3013, + "step": 7696 + }, + { + "epoch": 0.7250889051129271, + "grad_norm": 0.7116540670394897, + "learning_rate": 1.4553018073150088e-05, + "loss": 0.325, + "step": 7697 + }, + { + "epoch": 0.725183109206095, + "grad_norm": 0.7577566504478455, + "learning_rate": 1.455167359601715e-05, + "loss": 0.2896, + "step": 7698 + }, + { + "epoch": 0.7252773132992628, + "grad_norm": 0.8203549981117249, + "learning_rate": 1.4550329015099012e-05, + "loss": 0.3543, + "step": 7699 + }, + { + "epoch": 0.7253715173924307, + "grad_norm": 0.9177870154380798, + "learning_rate": 1.4548984330426337e-05, + "loss": 0.3804, + "step": 7700 + }, + { + "epoch": 0.7254657214855985, + "grad_norm": 0.6643106937408447, + "learning_rate": 1.4547639542029784e-05, + "loss": 0.3008, + "step": 7701 + }, + { + "epoch": 0.7255599255787664, + "grad_norm": 1.296820044517517, + "learning_rate": 1.4546294649940014e-05, + "loss": 0.2951, + "step": 7702 + }, + { + "epoch": 0.7256541296719342, + "grad_norm": 0.7863390445709229, + "learning_rate": 1.4544949654187697e-05, + "loss": 0.2886, + "step": 7703 + }, + { + "epoch": 0.7257483337651021, + "grad_norm": 0.8272736668586731, + "learning_rate": 1.4543604554803499e-05, + "loss": 0.3391, + "step": 7704 + }, + { + "epoch": 0.7258425378582699, + "grad_norm": 0.7474284768104553, + "learning_rate": 1.4542259351818092e-05, + "loss": 0.3191, + "step": 7705 + }, + { + "epoch": 0.7259367419514378, + "grad_norm": 0.8520966172218323, + "learning_rate": 1.4540914045262142e-05, + "loss": 0.3483, + "step": 7706 + }, + { + "epoch": 0.7260309460446056, + "grad_norm": 0.83542799949646, + "learning_rate": 1.4539568635166332e-05, + "loss": 0.3682, + "step": 7707 + }, + { + "epoch": 0.7261251501377735, + "grad_norm": 0.787520170211792, + "learning_rate": 1.4538223121561338e-05, + "loss": 0.286, + "step": 7708 + }, + { + "epoch": 0.7262193542309413, + "grad_norm": 0.7447176575660706, + "learning_rate": 1.4536877504477836e-05, + "loss": 0.3206, + "step": 7709 + }, + { + "epoch": 0.7263135583241092, + "grad_norm": 0.6937974691390991, + "learning_rate": 1.4535531783946513e-05, + "loss": 0.3101, + "step": 7710 + }, + { + "epoch": 0.726407762417277, + "grad_norm": 0.6858431696891785, + "learning_rate": 1.4534185959998048e-05, + "loss": 0.3008, + "step": 7711 + }, + { + "epoch": 0.7265019665104449, + "grad_norm": 0.6721896529197693, + "learning_rate": 1.453284003266313e-05, + "loss": 0.295, + "step": 7712 + }, + { + "epoch": 0.7265961706036127, + "grad_norm": 0.8485425114631653, + "learning_rate": 1.4531494001972455e-05, + "loss": 0.2685, + "step": 7713 + }, + { + "epoch": 0.7266903746967805, + "grad_norm": 0.8477091193199158, + "learning_rate": 1.4530147867956703e-05, + "loss": 0.3375, + "step": 7714 + }, + { + "epoch": 0.7267845787899484, + "grad_norm": 0.6675555109977722, + "learning_rate": 1.4528801630646577e-05, + "loss": 0.3135, + "step": 7715 + }, + { + "epoch": 0.7268787828831162, + "grad_norm": 0.6150081157684326, + "learning_rate": 1.4527455290072767e-05, + "loss": 0.2665, + "step": 7716 + }, + { + "epoch": 0.7269729869762841, + "grad_norm": 0.7910028696060181, + "learning_rate": 1.4526108846265976e-05, + "loss": 0.3216, + "step": 7717 + }, + { + "epoch": 0.727067191069452, + "grad_norm": 0.8232051730155945, + "learning_rate": 1.4524762299256904e-05, + "loss": 0.3288, + "step": 7718 + }, + { + "epoch": 0.7271613951626198, + "grad_norm": 0.8236272931098938, + "learning_rate": 1.4523415649076253e-05, + "loss": 0.2982, + "step": 7719 + }, + { + "epoch": 0.7272555992557876, + "grad_norm": 0.8505282402038574, + "learning_rate": 1.452206889575473e-05, + "loss": 0.3502, + "step": 7720 + }, + { + "epoch": 0.7273498033489555, + "grad_norm": 0.7677298188209534, + "learning_rate": 1.4520722039323045e-05, + "loss": 0.327, + "step": 7721 + }, + { + "epoch": 0.7274440074421233, + "grad_norm": 0.7097388505935669, + "learning_rate": 1.4519375079811902e-05, + "loss": 0.3, + "step": 7722 + }, + { + "epoch": 0.7275382115352912, + "grad_norm": 0.6577833890914917, + "learning_rate": 1.4518028017252019e-05, + "loss": 0.2954, + "step": 7723 + }, + { + "epoch": 0.727632415628459, + "grad_norm": 1.1290863752365112, + "learning_rate": 1.4516680851674112e-05, + "loss": 0.3226, + "step": 7724 + }, + { + "epoch": 0.7277266197216269, + "grad_norm": 0.8429285883903503, + "learning_rate": 1.4515333583108896e-05, + "loss": 0.3298, + "step": 7725 + }, + { + "epoch": 0.7278208238147947, + "grad_norm": 0.7749203443527222, + "learning_rate": 1.4513986211587087e-05, + "loss": 0.3388, + "step": 7726 + }, + { + "epoch": 0.7279150279079626, + "grad_norm": 0.9690777659416199, + "learning_rate": 1.4512638737139417e-05, + "loss": 0.3394, + "step": 7727 + }, + { + "epoch": 0.7280092320011304, + "grad_norm": 0.7415863275527954, + "learning_rate": 1.45112911597966e-05, + "loss": 0.3133, + "step": 7728 + }, + { + "epoch": 0.7281034360942983, + "grad_norm": 0.6323605179786682, + "learning_rate": 1.4509943479589373e-05, + "loss": 0.2861, + "step": 7729 + }, + { + "epoch": 0.7281976401874661, + "grad_norm": 0.7535349130630493, + "learning_rate": 1.4508595696548457e-05, + "loss": 0.3172, + "step": 7730 + }, + { + "epoch": 0.728291844280634, + "grad_norm": 0.7147517204284668, + "learning_rate": 1.4507247810704586e-05, + "loss": 0.3317, + "step": 7731 + }, + { + "epoch": 0.7283860483738018, + "grad_norm": 0.644232451915741, + "learning_rate": 1.4505899822088494e-05, + "loss": 0.2731, + "step": 7732 + }, + { + "epoch": 0.7284802524669697, + "grad_norm": 0.7214465737342834, + "learning_rate": 1.450455173073092e-05, + "loss": 0.3029, + "step": 7733 + }, + { + "epoch": 0.7285744565601375, + "grad_norm": 0.7252299189567566, + "learning_rate": 1.4503203536662596e-05, + "loss": 0.3139, + "step": 7734 + }, + { + "epoch": 0.7286686606533054, + "grad_norm": 0.6736034750938416, + "learning_rate": 1.450185523991427e-05, + "loss": 0.2972, + "step": 7735 + }, + { + "epoch": 0.7287628647464732, + "grad_norm": 0.741563081741333, + "learning_rate": 1.4500506840516683e-05, + "loss": 0.334, + "step": 7736 + }, + { + "epoch": 0.7288570688396411, + "grad_norm": 0.8350456953048706, + "learning_rate": 1.4499158338500578e-05, + "loss": 0.3538, + "step": 7737 + }, + { + "epoch": 0.7289512729328089, + "grad_norm": 0.6199936866760254, + "learning_rate": 1.4497809733896708e-05, + "loss": 0.2771, + "step": 7738 + }, + { + "epoch": 0.7290454770259768, + "grad_norm": 0.7002807259559631, + "learning_rate": 1.4496461026735815e-05, + "loss": 0.3028, + "step": 7739 + }, + { + "epoch": 0.7291396811191446, + "grad_norm": 0.8033571243286133, + "learning_rate": 1.449511221704866e-05, + "loss": 0.3532, + "step": 7740 + }, + { + "epoch": 0.7292338852123125, + "grad_norm": 0.9143756628036499, + "learning_rate": 1.4493763304865995e-05, + "loss": 0.3265, + "step": 7741 + }, + { + "epoch": 0.7293280893054803, + "grad_norm": 0.6929057240486145, + "learning_rate": 1.4492414290218575e-05, + "loss": 0.3382, + "step": 7742 + }, + { + "epoch": 0.7294222933986482, + "grad_norm": 0.8464141488075256, + "learning_rate": 1.4491065173137162e-05, + "loss": 0.3094, + "step": 7743 + }, + { + "epoch": 0.729516497491816, + "grad_norm": 0.7139920592308044, + "learning_rate": 1.4489715953652517e-05, + "loss": 0.3127, + "step": 7744 + }, + { + "epoch": 0.7296107015849839, + "grad_norm": 0.7467449307441711, + "learning_rate": 1.4488366631795408e-05, + "loss": 0.3303, + "step": 7745 + }, + { + "epoch": 0.7297049056781517, + "grad_norm": 0.7776731848716736, + "learning_rate": 1.4487017207596595e-05, + "loss": 0.2987, + "step": 7746 + }, + { + "epoch": 0.7297991097713196, + "grad_norm": 0.8799088001251221, + "learning_rate": 1.448566768108685e-05, + "loss": 0.2734, + "step": 7747 + }, + { + "epoch": 0.7298933138644874, + "grad_norm": 0.835666298866272, + "learning_rate": 1.4484318052296946e-05, + "loss": 0.3238, + "step": 7748 + }, + { + "epoch": 0.7299875179576553, + "grad_norm": 1.0648915767669678, + "learning_rate": 1.4482968321257658e-05, + "loss": 0.3295, + "step": 7749 + }, + { + "epoch": 0.7300817220508231, + "grad_norm": 0.8137198090553284, + "learning_rate": 1.4481618487999755e-05, + "loss": 0.3468, + "step": 7750 + }, + { + "epoch": 0.730175926143991, + "grad_norm": 0.7838937044143677, + "learning_rate": 1.448026855255402e-05, + "loss": 0.2977, + "step": 7751 + }, + { + "epoch": 0.7302701302371588, + "grad_norm": 0.6746060848236084, + "learning_rate": 1.4478918514951235e-05, + "loss": 0.2868, + "step": 7752 + }, + { + "epoch": 0.7303643343303267, + "grad_norm": 0.8358673453330994, + "learning_rate": 1.447756837522218e-05, + "loss": 0.3153, + "step": 7753 + }, + { + "epoch": 0.7304585384234945, + "grad_norm": 0.7528150677680969, + "learning_rate": 1.447621813339764e-05, + "loss": 0.3058, + "step": 7754 + }, + { + "epoch": 0.7305527425166624, + "grad_norm": 0.6820029616355896, + "learning_rate": 1.447486778950841e-05, + "loss": 0.2789, + "step": 7755 + }, + { + "epoch": 0.7306469466098302, + "grad_norm": 0.8257085084915161, + "learning_rate": 1.4473517343585265e-05, + "loss": 0.3562, + "step": 7756 + }, + { + "epoch": 0.730741150702998, + "grad_norm": 0.710114598274231, + "learning_rate": 1.447216679565901e-05, + "loss": 0.3109, + "step": 7757 + }, + { + "epoch": 0.7308353547961659, + "grad_norm": 0.7254622578620911, + "learning_rate": 1.4470816145760437e-05, + "loss": 0.3128, + "step": 7758 + }, + { + "epoch": 0.7309295588893338, + "grad_norm": 0.7184939980506897, + "learning_rate": 1.4469465393920341e-05, + "loss": 0.2997, + "step": 7759 + }, + { + "epoch": 0.7310237629825016, + "grad_norm": 0.6432626843452454, + "learning_rate": 1.4468114540169522e-05, + "loss": 0.272, + "step": 7760 + }, + { + "epoch": 0.7311179670756695, + "grad_norm": 0.7790955305099487, + "learning_rate": 1.4466763584538783e-05, + "loss": 0.3428, + "step": 7761 + }, + { + "epoch": 0.7312121711688373, + "grad_norm": 0.7339643836021423, + "learning_rate": 1.4465412527058921e-05, + "loss": 0.3384, + "step": 7762 + }, + { + "epoch": 0.7313063752620051, + "grad_norm": 0.8110752701759338, + "learning_rate": 1.4464061367760753e-05, + "loss": 0.2799, + "step": 7763 + }, + { + "epoch": 0.731400579355173, + "grad_norm": 0.7033218145370483, + "learning_rate": 1.4462710106675079e-05, + "loss": 0.31, + "step": 7764 + }, + { + "epoch": 0.7314947834483408, + "grad_norm": 0.7438430190086365, + "learning_rate": 1.4461358743832716e-05, + "loss": 0.341, + "step": 7765 + }, + { + "epoch": 0.7315889875415087, + "grad_norm": 1.0014957189559937, + "learning_rate": 1.446000727926447e-05, + "loss": 0.3457, + "step": 7766 + }, + { + "epoch": 0.7316831916346765, + "grad_norm": 0.9046600461006165, + "learning_rate": 1.4458655713001162e-05, + "loss": 0.3285, + "step": 7767 + }, + { + "epoch": 0.7317773957278444, + "grad_norm": 0.7668585777282715, + "learning_rate": 1.445730404507361e-05, + "loss": 0.2884, + "step": 7768 + }, + { + "epoch": 0.7318715998210122, + "grad_norm": 0.6925591230392456, + "learning_rate": 1.4455952275512632e-05, + "loss": 0.3028, + "step": 7769 + }, + { + "epoch": 0.7319658039141801, + "grad_norm": 0.8074665665626526, + "learning_rate": 1.445460040434905e-05, + "loss": 0.3269, + "step": 7770 + }, + { + "epoch": 0.7320600080073479, + "grad_norm": 0.848996639251709, + "learning_rate": 1.4453248431613687e-05, + "loss": 0.3035, + "step": 7771 + }, + { + "epoch": 0.7321542121005158, + "grad_norm": 0.7105998396873474, + "learning_rate": 1.4451896357337376e-05, + "loss": 0.3296, + "step": 7772 + }, + { + "epoch": 0.7322484161936836, + "grad_norm": 0.6984401345252991, + "learning_rate": 1.4450544181550943e-05, + "loss": 0.2914, + "step": 7773 + }, + { + "epoch": 0.7323426202868515, + "grad_norm": 0.7580311894416809, + "learning_rate": 1.4449191904285218e-05, + "loss": 0.289, + "step": 7774 + }, + { + "epoch": 0.7324368243800193, + "grad_norm": 0.7013496160507202, + "learning_rate": 1.444783952557104e-05, + "loss": 0.2993, + "step": 7775 + }, + { + "epoch": 0.7325310284731872, + "grad_norm": 0.6538723111152649, + "learning_rate": 1.4446487045439237e-05, + "loss": 0.2664, + "step": 7776 + }, + { + "epoch": 0.732625232566355, + "grad_norm": 0.7903271317481995, + "learning_rate": 1.4445134463920656e-05, + "loss": 0.3435, + "step": 7777 + }, + { + "epoch": 0.7327194366595229, + "grad_norm": 0.8297932147979736, + "learning_rate": 1.4443781781046135e-05, + "loss": 0.3158, + "step": 7778 + }, + { + "epoch": 0.7328136407526907, + "grad_norm": 0.7473205327987671, + "learning_rate": 1.4442428996846515e-05, + "loss": 0.35, + "step": 7779 + }, + { + "epoch": 0.7329078448458586, + "grad_norm": 1.1932625770568848, + "learning_rate": 1.4441076111352646e-05, + "loss": 0.3148, + "step": 7780 + }, + { + "epoch": 0.7330020489390264, + "grad_norm": 0.7813129425048828, + "learning_rate": 1.4439723124595373e-05, + "loss": 0.3275, + "step": 7781 + }, + { + "epoch": 0.7330962530321943, + "grad_norm": 0.7652055621147156, + "learning_rate": 1.4438370036605545e-05, + "loss": 0.3187, + "step": 7782 + }, + { + "epoch": 0.7331904571253621, + "grad_norm": 0.7172680497169495, + "learning_rate": 1.4437016847414017e-05, + "loss": 0.2769, + "step": 7783 + }, + { + "epoch": 0.73328466121853, + "grad_norm": 0.7182568907737732, + "learning_rate": 1.4435663557051643e-05, + "loss": 0.3502, + "step": 7784 + }, + { + "epoch": 0.7333788653116978, + "grad_norm": 0.6757748126983643, + "learning_rate": 1.4434310165549282e-05, + "loss": 0.2798, + "step": 7785 + }, + { + "epoch": 0.7334730694048657, + "grad_norm": 0.7448100447654724, + "learning_rate": 1.4432956672937785e-05, + "loss": 0.3146, + "step": 7786 + }, + { + "epoch": 0.7335672734980335, + "grad_norm": 0.762901246547699, + "learning_rate": 1.4431603079248024e-05, + "loss": 0.2983, + "step": 7787 + }, + { + "epoch": 0.7336614775912014, + "grad_norm": 0.8209456205368042, + "learning_rate": 1.443024938451086e-05, + "loss": 0.3238, + "step": 7788 + }, + { + "epoch": 0.7337556816843692, + "grad_norm": 0.6932891607284546, + "learning_rate": 1.4428895588757162e-05, + "loss": 0.3441, + "step": 7789 + }, + { + "epoch": 0.7338498857775371, + "grad_norm": 0.7887647151947021, + "learning_rate": 1.442754169201779e-05, + "loss": 0.3272, + "step": 7790 + }, + { + "epoch": 0.7339440898707049, + "grad_norm": 0.7699485421180725, + "learning_rate": 1.4426187694323619e-05, + "loss": 0.3537, + "step": 7791 + }, + { + "epoch": 0.7340382939638728, + "grad_norm": 0.8260576128959656, + "learning_rate": 1.4424833595705527e-05, + "loss": 0.338, + "step": 7792 + }, + { + "epoch": 0.7341324980570406, + "grad_norm": 0.869855523109436, + "learning_rate": 1.4423479396194387e-05, + "loss": 0.2982, + "step": 7793 + }, + { + "epoch": 0.7342267021502085, + "grad_norm": 0.8493530750274658, + "learning_rate": 1.4422125095821068e-05, + "loss": 0.3599, + "step": 7794 + }, + { + "epoch": 0.7343209062433763, + "grad_norm": 0.7826700806617737, + "learning_rate": 1.4420770694616469e-05, + "loss": 0.3199, + "step": 7795 + }, + { + "epoch": 0.7344151103365442, + "grad_norm": 0.8466247916221619, + "learning_rate": 1.4419416192611453e-05, + "loss": 0.3228, + "step": 7796 + }, + { + "epoch": 0.734509314429712, + "grad_norm": 0.7265615463256836, + "learning_rate": 1.4418061589836914e-05, + "loss": 0.3111, + "step": 7797 + }, + { + "epoch": 0.7346035185228799, + "grad_norm": 0.6740742921829224, + "learning_rate": 1.4416706886323741e-05, + "loss": 0.2788, + "step": 7798 + }, + { + "epoch": 0.7346977226160477, + "grad_norm": 0.7846463918685913, + "learning_rate": 1.4415352082102818e-05, + "loss": 0.2909, + "step": 7799 + }, + { + "epoch": 0.7347919267092156, + "grad_norm": 0.8173394203186035, + "learning_rate": 1.4413997177205042e-05, + "loss": 0.3649, + "step": 7800 + }, + { + "epoch": 0.7348861308023834, + "grad_norm": 0.7414736747741699, + "learning_rate": 1.4412642171661301e-05, + "loss": 0.3457, + "step": 7801 + }, + { + "epoch": 0.7349803348955513, + "grad_norm": 0.8608464002609253, + "learning_rate": 1.4411287065502494e-05, + "loss": 0.3097, + "step": 7802 + }, + { + "epoch": 0.7350745389887191, + "grad_norm": 0.7352781891822815, + "learning_rate": 1.4409931858759523e-05, + "loss": 0.2752, + "step": 7803 + }, + { + "epoch": 0.735168743081887, + "grad_norm": 1.0024056434631348, + "learning_rate": 1.4408576551463283e-05, + "loss": 0.3473, + "step": 7804 + }, + { + "epoch": 0.7352629471750548, + "grad_norm": 0.8175967931747437, + "learning_rate": 1.4407221143644681e-05, + "loss": 0.3221, + "step": 7805 + }, + { + "epoch": 0.7353571512682227, + "grad_norm": 0.6681419014930725, + "learning_rate": 1.4405865635334619e-05, + "loss": 0.2863, + "step": 7806 + }, + { + "epoch": 0.7354513553613905, + "grad_norm": 0.7162423133850098, + "learning_rate": 1.4404510026564007e-05, + "loss": 0.3276, + "step": 7807 + }, + { + "epoch": 0.7355455594545584, + "grad_norm": 0.7024120688438416, + "learning_rate": 1.4403154317363757e-05, + "loss": 0.3066, + "step": 7808 + }, + { + "epoch": 0.7356397635477262, + "grad_norm": 0.8371044993400574, + "learning_rate": 1.4401798507764777e-05, + "loss": 0.3513, + "step": 7809 + }, + { + "epoch": 0.735733967640894, + "grad_norm": 0.7337832450866699, + "learning_rate": 1.4400442597797985e-05, + "loss": 0.3478, + "step": 7810 + }, + { + "epoch": 0.7358281717340619, + "grad_norm": 0.831731379032135, + "learning_rate": 1.4399086587494292e-05, + "loss": 0.3103, + "step": 7811 + }, + { + "epoch": 0.7359223758272297, + "grad_norm": 0.7630707621574402, + "learning_rate": 1.4397730476884628e-05, + "loss": 0.3501, + "step": 7812 + }, + { + "epoch": 0.7360165799203976, + "grad_norm": 0.5945831537246704, + "learning_rate": 1.4396374265999905e-05, + "loss": 0.2768, + "step": 7813 + }, + { + "epoch": 0.7361107840135654, + "grad_norm": 0.7229190468788147, + "learning_rate": 1.4395017954871046e-05, + "loss": 0.3436, + "step": 7814 + }, + { + "epoch": 0.7362049881067332, + "grad_norm": 0.802777886390686, + "learning_rate": 1.4393661543528988e-05, + "loss": 0.362, + "step": 7815 + }, + { + "epoch": 0.736299192199901, + "grad_norm": 0.6775187253952026, + "learning_rate": 1.4392305032004648e-05, + "loss": 0.3228, + "step": 7816 + }, + { + "epoch": 0.7363933962930689, + "grad_norm": 0.8231747150421143, + "learning_rate": 1.439094842032896e-05, + "loss": 0.335, + "step": 7817 + }, + { + "epoch": 0.7364876003862367, + "grad_norm": 0.7906420826911926, + "learning_rate": 1.4389591708532858e-05, + "loss": 0.3352, + "step": 7818 + }, + { + "epoch": 0.7365818044794046, + "grad_norm": 0.7793775200843811, + "learning_rate": 1.4388234896647272e-05, + "loss": 0.3572, + "step": 7819 + }, + { + "epoch": 0.7366760085725724, + "grad_norm": 0.7064700722694397, + "learning_rate": 1.4386877984703152e-05, + "loss": 0.3197, + "step": 7820 + }, + { + "epoch": 0.7367702126657403, + "grad_norm": 0.7348716855049133, + "learning_rate": 1.4385520972731423e-05, + "loss": 0.3316, + "step": 7821 + }, + { + "epoch": 0.7368644167589081, + "grad_norm": 0.750260055065155, + "learning_rate": 1.4384163860763037e-05, + "loss": 0.3037, + "step": 7822 + }, + { + "epoch": 0.736958620852076, + "grad_norm": 0.777546763420105, + "learning_rate": 1.4382806648828934e-05, + "loss": 0.2928, + "step": 7823 + }, + { + "epoch": 0.7370528249452438, + "grad_norm": 0.6131889820098877, + "learning_rate": 1.438144933696006e-05, + "loss": 0.2752, + "step": 7824 + }, + { + "epoch": 0.7371470290384117, + "grad_norm": 0.8008215427398682, + "learning_rate": 1.4380091925187368e-05, + "loss": 0.3215, + "step": 7825 + }, + { + "epoch": 0.7372412331315795, + "grad_norm": 0.8012502193450928, + "learning_rate": 1.4378734413541802e-05, + "loss": 0.3252, + "step": 7826 + }, + { + "epoch": 0.7373354372247474, + "grad_norm": 0.75214684009552, + "learning_rate": 1.4377376802054322e-05, + "loss": 0.3336, + "step": 7827 + }, + { + "epoch": 0.7374296413179152, + "grad_norm": 0.797979474067688, + "learning_rate": 1.4376019090755882e-05, + "loss": 0.3361, + "step": 7828 + }, + { + "epoch": 0.7375238454110831, + "grad_norm": 1.650551438331604, + "learning_rate": 1.4374661279677438e-05, + "loss": 0.3175, + "step": 7829 + }, + { + "epoch": 0.7376180495042509, + "grad_norm": 1.420734167098999, + "learning_rate": 1.437330336884995e-05, + "loss": 0.3828, + "step": 7830 + }, + { + "epoch": 0.7377122535974188, + "grad_norm": 0.777060866355896, + "learning_rate": 1.4371945358304383e-05, + "loss": 0.3606, + "step": 7831 + }, + { + "epoch": 0.7378064576905866, + "grad_norm": 0.8682255744934082, + "learning_rate": 1.4370587248071698e-05, + "loss": 0.3288, + "step": 7832 + }, + { + "epoch": 0.7379006617837545, + "grad_norm": 0.6950915455818176, + "learning_rate": 1.436922903818287e-05, + "loss": 0.297, + "step": 7833 + }, + { + "epoch": 0.7379948658769223, + "grad_norm": 0.6280592679977417, + "learning_rate": 1.4367870728668858e-05, + "loss": 0.291, + "step": 7834 + }, + { + "epoch": 0.7380890699700902, + "grad_norm": 0.7080531120300293, + "learning_rate": 1.4366512319560642e-05, + "loss": 0.3251, + "step": 7835 + }, + { + "epoch": 0.738183274063258, + "grad_norm": 0.7126514315605164, + "learning_rate": 1.4365153810889188e-05, + "loss": 0.3244, + "step": 7836 + }, + { + "epoch": 0.7382774781564259, + "grad_norm": 0.6826251149177551, + "learning_rate": 1.4363795202685478e-05, + "loss": 0.309, + "step": 7837 + }, + { + "epoch": 0.7383716822495937, + "grad_norm": 0.7983651757240295, + "learning_rate": 1.436243649498049e-05, + "loss": 0.3098, + "step": 7838 + }, + { + "epoch": 0.7384658863427616, + "grad_norm": 1.0099413394927979, + "learning_rate": 1.4361077687805201e-05, + "loss": 0.3366, + "step": 7839 + }, + { + "epoch": 0.7385600904359294, + "grad_norm": 0.7486034631729126, + "learning_rate": 1.43597187811906e-05, + "loss": 0.3248, + "step": 7840 + }, + { + "epoch": 0.7386542945290973, + "grad_norm": 0.8024405241012573, + "learning_rate": 1.4358359775167666e-05, + "loss": 0.3657, + "step": 7841 + }, + { + "epoch": 0.7387484986222651, + "grad_norm": 0.7130158543586731, + "learning_rate": 1.4357000669767386e-05, + "loss": 0.3575, + "step": 7842 + }, + { + "epoch": 0.738842702715433, + "grad_norm": 0.8017832636833191, + "learning_rate": 1.4355641465020755e-05, + "loss": 0.3025, + "step": 7843 + }, + { + "epoch": 0.7389369068086008, + "grad_norm": 0.7155521512031555, + "learning_rate": 1.4354282160958764e-05, + "loss": 0.3482, + "step": 7844 + }, + { + "epoch": 0.7390311109017687, + "grad_norm": 0.7501134276390076, + "learning_rate": 1.4352922757612407e-05, + "loss": 0.2926, + "step": 7845 + }, + { + "epoch": 0.7391253149949365, + "grad_norm": 0.7163833379745483, + "learning_rate": 1.4351563255012674e-05, + "loss": 0.304, + "step": 7846 + }, + { + "epoch": 0.7392195190881043, + "grad_norm": 0.7612854838371277, + "learning_rate": 1.435020365319057e-05, + "loss": 0.3363, + "step": 7847 + }, + { + "epoch": 0.7393137231812722, + "grad_norm": 0.7598935961723328, + "learning_rate": 1.4348843952177098e-05, + "loss": 0.3056, + "step": 7848 + }, + { + "epoch": 0.73940792727444, + "grad_norm": 0.6794955730438232, + "learning_rate": 1.4347484152003256e-05, + "loss": 0.3472, + "step": 7849 + }, + { + "epoch": 0.7395021313676079, + "grad_norm": 0.7308415174484253, + "learning_rate": 1.4346124252700056e-05, + "loss": 0.3234, + "step": 7850 + }, + { + "epoch": 0.7395963354607757, + "grad_norm": 0.7182336449623108, + "learning_rate": 1.4344764254298495e-05, + "loss": 0.3104, + "step": 7851 + }, + { + "epoch": 0.7396905395539436, + "grad_norm": 0.6178147196769714, + "learning_rate": 1.4343404156829595e-05, + "loss": 0.2624, + "step": 7852 + }, + { + "epoch": 0.7397847436471114, + "grad_norm": 0.7868710160255432, + "learning_rate": 1.4342043960324361e-05, + "loss": 0.3175, + "step": 7853 + }, + { + "epoch": 0.7398789477402793, + "grad_norm": 0.7279520034790039, + "learning_rate": 1.4340683664813809e-05, + "loss": 0.3327, + "step": 7854 + }, + { + "epoch": 0.7399731518334471, + "grad_norm": 0.6738916039466858, + "learning_rate": 1.4339323270328957e-05, + "loss": 0.2856, + "step": 7855 + }, + { + "epoch": 0.740067355926615, + "grad_norm": 1.0068432092666626, + "learning_rate": 1.4337962776900822e-05, + "loss": 0.3266, + "step": 7856 + }, + { + "epoch": 0.7401615600197828, + "grad_norm": 0.6624978184700012, + "learning_rate": 1.4336602184560429e-05, + "loss": 0.301, + "step": 7857 + }, + { + "epoch": 0.7402557641129507, + "grad_norm": 0.9040200114250183, + "learning_rate": 1.43352414933388e-05, + "loss": 0.4003, + "step": 7858 + }, + { + "epoch": 0.7403499682061185, + "grad_norm": 0.7967157959938049, + "learning_rate": 1.4333880703266959e-05, + "loss": 0.319, + "step": 7859 + }, + { + "epoch": 0.7404441722992864, + "grad_norm": 0.7224143147468567, + "learning_rate": 1.4332519814375937e-05, + "loss": 0.3178, + "step": 7860 + }, + { + "epoch": 0.7405383763924542, + "grad_norm": 1.0972416400909424, + "learning_rate": 1.4331158826696761e-05, + "loss": 0.3998, + "step": 7861 + }, + { + "epoch": 0.7406325804856221, + "grad_norm": 0.8151473999023438, + "learning_rate": 1.4329797740260465e-05, + "loss": 0.3415, + "step": 7862 + }, + { + "epoch": 0.7407267845787899, + "grad_norm": 0.6831167936325073, + "learning_rate": 1.4328436555098083e-05, + "loss": 0.2768, + "step": 7863 + }, + { + "epoch": 0.7408209886719578, + "grad_norm": 0.8053088188171387, + "learning_rate": 1.432707527124066e-05, + "loss": 0.3417, + "step": 7864 + }, + { + "epoch": 0.7409151927651256, + "grad_norm": 0.6614615321159363, + "learning_rate": 1.4325713888719224e-05, + "loss": 0.2893, + "step": 7865 + }, + { + "epoch": 0.7410093968582935, + "grad_norm": 0.9234987497329712, + "learning_rate": 1.4324352407564824e-05, + "loss": 0.3428, + "step": 7866 + }, + { + "epoch": 0.7411036009514613, + "grad_norm": 0.74969881772995, + "learning_rate": 1.4322990827808499e-05, + "loss": 0.2816, + "step": 7867 + }, + { + "epoch": 0.7411978050446292, + "grad_norm": 0.7505182027816772, + "learning_rate": 1.4321629149481298e-05, + "loss": 0.3518, + "step": 7868 + }, + { + "epoch": 0.741292009137797, + "grad_norm": 0.6784083247184753, + "learning_rate": 1.432026737261427e-05, + "loss": 0.3529, + "step": 7869 + }, + { + "epoch": 0.7413862132309649, + "grad_norm": 0.8002153038978577, + "learning_rate": 1.4318905497238462e-05, + "loss": 0.3351, + "step": 7870 + }, + { + "epoch": 0.7414804173241327, + "grad_norm": 0.8567597270011902, + "learning_rate": 1.4317543523384928e-05, + "loss": 0.2924, + "step": 7871 + }, + { + "epoch": 0.7415746214173006, + "grad_norm": 0.7616846561431885, + "learning_rate": 1.431618145108473e-05, + "loss": 0.3456, + "step": 7872 + }, + { + "epoch": 0.7416688255104684, + "grad_norm": 0.849391520023346, + "learning_rate": 1.4314819280368916e-05, + "loss": 0.3267, + "step": 7873 + }, + { + "epoch": 0.7417630296036363, + "grad_norm": 0.7345016598701477, + "learning_rate": 1.4313457011268552e-05, + "loss": 0.3301, + "step": 7874 + }, + { + "epoch": 0.7418572336968041, + "grad_norm": 0.5992431044578552, + "learning_rate": 1.4312094643814698e-05, + "loss": 0.2562, + "step": 7875 + }, + { + "epoch": 0.741951437789972, + "grad_norm": 0.8267952799797058, + "learning_rate": 1.4310732178038413e-05, + "loss": 0.3047, + "step": 7876 + }, + { + "epoch": 0.7420456418831398, + "grad_norm": 0.679075300693512, + "learning_rate": 1.4309369613970767e-05, + "loss": 0.2938, + "step": 7877 + }, + { + "epoch": 0.7421398459763077, + "grad_norm": 0.758065402507782, + "learning_rate": 1.4308006951642837e-05, + "loss": 0.2984, + "step": 7878 + }, + { + "epoch": 0.7422340500694755, + "grad_norm": 0.7927003502845764, + "learning_rate": 1.430664419108568e-05, + "loss": 0.3213, + "step": 7879 + }, + { + "epoch": 0.7423282541626434, + "grad_norm": 0.7506650686264038, + "learning_rate": 1.4305281332330375e-05, + "loss": 0.2948, + "step": 7880 + }, + { + "epoch": 0.7424224582558112, + "grad_norm": 0.883560061454773, + "learning_rate": 1.4303918375407999e-05, + "loss": 0.3047, + "step": 7881 + }, + { + "epoch": 0.7425166623489791, + "grad_norm": 0.7973462343215942, + "learning_rate": 1.4302555320349627e-05, + "loss": 0.3665, + "step": 7882 + }, + { + "epoch": 0.7426108664421469, + "grad_norm": 0.6922618746757507, + "learning_rate": 1.430119216718634e-05, + "loss": 0.294, + "step": 7883 + }, + { + "epoch": 0.7427050705353148, + "grad_norm": 0.6867779493331909, + "learning_rate": 1.4299828915949221e-05, + "loss": 0.3169, + "step": 7884 + }, + { + "epoch": 0.7427992746284826, + "grad_norm": 0.7345553040504456, + "learning_rate": 1.4298465566669353e-05, + "loss": 0.3628, + "step": 7885 + }, + { + "epoch": 0.7428934787216505, + "grad_norm": 0.6013640761375427, + "learning_rate": 1.4297102119377821e-05, + "loss": 0.2683, + "step": 7886 + }, + { + "epoch": 0.7429876828148183, + "grad_norm": 0.7088180780410767, + "learning_rate": 1.4295738574105713e-05, + "loss": 0.3391, + "step": 7887 + }, + { + "epoch": 0.7430818869079862, + "grad_norm": 0.7746397256851196, + "learning_rate": 1.4294374930884126e-05, + "loss": 0.3309, + "step": 7888 + }, + { + "epoch": 0.743176091001154, + "grad_norm": 0.8246579766273499, + "learning_rate": 1.4293011189744146e-05, + "loss": 0.34, + "step": 7889 + }, + { + "epoch": 0.7432702950943219, + "grad_norm": 0.8670780062675476, + "learning_rate": 1.4291647350716876e-05, + "loss": 0.3149, + "step": 7890 + }, + { + "epoch": 0.7433644991874897, + "grad_norm": 0.7426936626434326, + "learning_rate": 1.4290283413833404e-05, + "loss": 0.3235, + "step": 7891 + }, + { + "epoch": 0.7434587032806576, + "grad_norm": 0.6389971971511841, + "learning_rate": 1.4288919379124837e-05, + "loss": 0.2535, + "step": 7892 + }, + { + "epoch": 0.7435529073738254, + "grad_norm": 0.6910268068313599, + "learning_rate": 1.4287555246622276e-05, + "loss": 0.3008, + "step": 7893 + }, + { + "epoch": 0.7436471114669932, + "grad_norm": 1.02792227268219, + "learning_rate": 1.4286191016356822e-05, + "loss": 0.347, + "step": 7894 + }, + { + "epoch": 0.7437413155601611, + "grad_norm": 0.6576209664344788, + "learning_rate": 1.428482668835959e-05, + "loss": 0.29, + "step": 7895 + }, + { + "epoch": 0.743835519653329, + "grad_norm": 0.7317492365837097, + "learning_rate": 1.4283462262661675e-05, + "loss": 0.3104, + "step": 7896 + }, + { + "epoch": 0.7439297237464968, + "grad_norm": 0.6787400245666504, + "learning_rate": 1.42820977392942e-05, + "loss": 0.3396, + "step": 7897 + }, + { + "epoch": 0.7440239278396646, + "grad_norm": 0.7138457894325256, + "learning_rate": 1.4280733118288277e-05, + "loss": 0.2857, + "step": 7898 + }, + { + "epoch": 0.7441181319328325, + "grad_norm": 0.7572120428085327, + "learning_rate": 1.4279368399675015e-05, + "loss": 0.3215, + "step": 7899 + }, + { + "epoch": 0.7442123360260003, + "grad_norm": 0.6458143591880798, + "learning_rate": 1.4278003583485537e-05, + "loss": 0.2809, + "step": 7900 + }, + { + "epoch": 0.7443065401191682, + "grad_norm": 0.7505171298980713, + "learning_rate": 1.427663866975096e-05, + "loss": 0.3363, + "step": 7901 + }, + { + "epoch": 0.744400744212336, + "grad_norm": 0.7094796895980835, + "learning_rate": 1.4275273658502407e-05, + "loss": 0.3119, + "step": 7902 + }, + { + "epoch": 0.7444949483055039, + "grad_norm": 0.8144106864929199, + "learning_rate": 1.4273908549771003e-05, + "loss": 0.3166, + "step": 7903 + }, + { + "epoch": 0.7445891523986717, + "grad_norm": 0.6488919258117676, + "learning_rate": 1.4272543343587875e-05, + "loss": 0.309, + "step": 7904 + }, + { + "epoch": 0.7446833564918396, + "grad_norm": 0.8431639671325684, + "learning_rate": 1.4271178039984153e-05, + "loss": 0.3034, + "step": 7905 + }, + { + "epoch": 0.7447775605850074, + "grad_norm": 0.7071564197540283, + "learning_rate": 1.4269812638990965e-05, + "loss": 0.2825, + "step": 7906 + }, + { + "epoch": 0.7448717646781753, + "grad_norm": 0.6395066380500793, + "learning_rate": 1.4268447140639444e-05, + "loss": 0.291, + "step": 7907 + }, + { + "epoch": 0.7449659687713431, + "grad_norm": 0.6347860097885132, + "learning_rate": 1.4267081544960726e-05, + "loss": 0.2977, + "step": 7908 + }, + { + "epoch": 0.745060172864511, + "grad_norm": 0.6679201722145081, + "learning_rate": 1.4265715851985951e-05, + "loss": 0.2718, + "step": 7909 + }, + { + "epoch": 0.7451543769576788, + "grad_norm": 0.7665024995803833, + "learning_rate": 1.4264350061746259e-05, + "loss": 0.3337, + "step": 7910 + }, + { + "epoch": 0.7452485810508467, + "grad_norm": 0.7200942039489746, + "learning_rate": 1.4262984174272787e-05, + "loss": 0.3243, + "step": 7911 + }, + { + "epoch": 0.7453427851440145, + "grad_norm": 0.8476040959358215, + "learning_rate": 1.4261618189596687e-05, + "loss": 0.3242, + "step": 7912 + }, + { + "epoch": 0.7454369892371824, + "grad_norm": 0.7364603877067566, + "learning_rate": 1.42602521077491e-05, + "loss": 0.342, + "step": 7913 + }, + { + "epoch": 0.7455311933303502, + "grad_norm": 0.8330252170562744, + "learning_rate": 1.4258885928761175e-05, + "loss": 0.316, + "step": 7914 + }, + { + "epoch": 0.7456253974235181, + "grad_norm": 0.7311208844184875, + "learning_rate": 1.4257519652664069e-05, + "loss": 0.3249, + "step": 7915 + }, + { + "epoch": 0.7457196015166859, + "grad_norm": 0.7431167960166931, + "learning_rate": 1.4256153279488925e-05, + "loss": 0.3321, + "step": 7916 + }, + { + "epoch": 0.7458138056098538, + "grad_norm": 0.8051790595054626, + "learning_rate": 1.4254786809266907e-05, + "loss": 0.3327, + "step": 7917 + }, + { + "epoch": 0.7459080097030216, + "grad_norm": 0.7629981637001038, + "learning_rate": 1.4253420242029168e-05, + "loss": 0.3202, + "step": 7918 + }, + { + "epoch": 0.7460022137961895, + "grad_norm": 0.7642293572425842, + "learning_rate": 1.4252053577806867e-05, + "loss": 0.3112, + "step": 7919 + }, + { + "epoch": 0.7460964178893573, + "grad_norm": 0.6936383843421936, + "learning_rate": 1.4250686816631174e-05, + "loss": 0.3177, + "step": 7920 + }, + { + "epoch": 0.7461906219825252, + "grad_norm": 0.7215296030044556, + "learning_rate": 1.4249319958533245e-05, + "loss": 0.2934, + "step": 7921 + }, + { + "epoch": 0.746284826075693, + "grad_norm": 0.7291421294212341, + "learning_rate": 1.4247953003544248e-05, + "loss": 0.3014, + "step": 7922 + }, + { + "epoch": 0.7463790301688609, + "grad_norm": 0.7604468464851379, + "learning_rate": 1.4246585951695356e-05, + "loss": 0.3054, + "step": 7923 + }, + { + "epoch": 0.7464732342620287, + "grad_norm": 0.6813427209854126, + "learning_rate": 1.4245218803017735e-05, + "loss": 0.3009, + "step": 7924 + }, + { + "epoch": 0.7465674383551966, + "grad_norm": 0.6189952492713928, + "learning_rate": 1.4243851557542561e-05, + "loss": 0.2692, + "step": 7925 + }, + { + "epoch": 0.7466616424483644, + "grad_norm": 0.8515956997871399, + "learning_rate": 1.4242484215301009e-05, + "loss": 0.2973, + "step": 7926 + }, + { + "epoch": 0.7467558465415323, + "grad_norm": 0.7358274459838867, + "learning_rate": 1.4241116776324253e-05, + "loss": 0.2999, + "step": 7927 + }, + { + "epoch": 0.7468500506347001, + "grad_norm": 0.7132756114006042, + "learning_rate": 1.4239749240643477e-05, + "loss": 0.3015, + "step": 7928 + }, + { + "epoch": 0.746944254727868, + "grad_norm": 0.7790988087654114, + "learning_rate": 1.4238381608289862e-05, + "loss": 0.2955, + "step": 7929 + }, + { + "epoch": 0.7470384588210358, + "grad_norm": 0.6831228137016296, + "learning_rate": 1.423701387929459e-05, + "loss": 0.3137, + "step": 7930 + }, + { + "epoch": 0.7471326629142037, + "grad_norm": 0.8106114268302917, + "learning_rate": 1.4235646053688847e-05, + "loss": 0.3298, + "step": 7931 + }, + { + "epoch": 0.7472268670073715, + "grad_norm": 0.8043280243873596, + "learning_rate": 1.4234278131503829e-05, + "loss": 0.3127, + "step": 7932 + }, + { + "epoch": 0.7473210711005394, + "grad_norm": 0.7510353922843933, + "learning_rate": 1.4232910112770717e-05, + "loss": 0.3049, + "step": 7933 + }, + { + "epoch": 0.7474152751937072, + "grad_norm": 0.7884879112243652, + "learning_rate": 1.4231541997520708e-05, + "loss": 0.3412, + "step": 7934 + }, + { + "epoch": 0.7475094792868751, + "grad_norm": 0.7577821612358093, + "learning_rate": 1.4230173785785002e-05, + "loss": 0.297, + "step": 7935 + }, + { + "epoch": 0.7476036833800429, + "grad_norm": 0.7543067336082458, + "learning_rate": 1.4228805477594788e-05, + "loss": 0.314, + "step": 7936 + }, + { + "epoch": 0.7476978874732108, + "grad_norm": 0.7481685876846313, + "learning_rate": 1.422743707298127e-05, + "loss": 0.31, + "step": 7937 + }, + { + "epoch": 0.7477920915663786, + "grad_norm": 0.7791463136672974, + "learning_rate": 1.4226068571975647e-05, + "loss": 0.3088, + "step": 7938 + }, + { + "epoch": 0.7478862956595465, + "grad_norm": 0.7431942820549011, + "learning_rate": 1.4224699974609125e-05, + "loss": 0.2799, + "step": 7939 + }, + { + "epoch": 0.7479804997527143, + "grad_norm": 0.7565886974334717, + "learning_rate": 1.4223331280912912e-05, + "loss": 0.3445, + "step": 7940 + }, + { + "epoch": 0.7480747038458821, + "grad_norm": 0.7745346426963806, + "learning_rate": 1.4221962490918214e-05, + "loss": 0.3155, + "step": 7941 + }, + { + "epoch": 0.74816890793905, + "grad_norm": 0.6992605328559875, + "learning_rate": 1.422059360465624e-05, + "loss": 0.2928, + "step": 7942 + }, + { + "epoch": 0.7482631120322178, + "grad_norm": 0.7950454354286194, + "learning_rate": 1.4219224622158204e-05, + "loss": 0.3356, + "step": 7943 + }, + { + "epoch": 0.7483573161253857, + "grad_norm": 0.8491213917732239, + "learning_rate": 1.4217855543455323e-05, + "loss": 0.3467, + "step": 7944 + }, + { + "epoch": 0.7484515202185535, + "grad_norm": 0.6545068621635437, + "learning_rate": 1.4216486368578815e-05, + "loss": 0.3091, + "step": 7945 + }, + { + "epoch": 0.7485457243117214, + "grad_norm": 0.7261474132537842, + "learning_rate": 1.4215117097559893e-05, + "loss": 0.34, + "step": 7946 + }, + { + "epoch": 0.7486399284048892, + "grad_norm": 0.7189820408821106, + "learning_rate": 1.4213747730429783e-05, + "loss": 0.3038, + "step": 7947 + }, + { + "epoch": 0.7487341324980571, + "grad_norm": 0.9007904529571533, + "learning_rate": 1.421237826721971e-05, + "loss": 0.3348, + "step": 7948 + }, + { + "epoch": 0.7488283365912249, + "grad_norm": 0.8184025883674622, + "learning_rate": 1.4211008707960897e-05, + "loss": 0.3194, + "step": 7949 + }, + { + "epoch": 0.7489225406843928, + "grad_norm": 0.7761973738670349, + "learning_rate": 1.4209639052684574e-05, + "loss": 0.317, + "step": 7950 + }, + { + "epoch": 0.7490167447775606, + "grad_norm": 0.8261517286300659, + "learning_rate": 1.4208269301421966e-05, + "loss": 0.3124, + "step": 7951 + }, + { + "epoch": 0.7491109488707285, + "grad_norm": 0.7534788250923157, + "learning_rate": 1.4206899454204315e-05, + "loss": 0.3432, + "step": 7952 + }, + { + "epoch": 0.7492051529638962, + "grad_norm": 0.7461051940917969, + "learning_rate": 1.4205529511062847e-05, + "loss": 0.3091, + "step": 7953 + }, + { + "epoch": 0.7492993570570641, + "grad_norm": 0.7263821363449097, + "learning_rate": 1.4204159472028801e-05, + "loss": 0.3569, + "step": 7954 + }, + { + "epoch": 0.7493935611502319, + "grad_norm": 0.7321131229400635, + "learning_rate": 1.4202789337133425e-05, + "loss": 0.308, + "step": 7955 + }, + { + "epoch": 0.7494877652433998, + "grad_norm": 0.7263550162315369, + "learning_rate": 1.4201419106407946e-05, + "loss": 0.3249, + "step": 7956 + }, + { + "epoch": 0.7495819693365676, + "grad_norm": 0.7975026369094849, + "learning_rate": 1.4200048779883613e-05, + "loss": 0.2992, + "step": 7957 + }, + { + "epoch": 0.7496761734297355, + "grad_norm": 0.7149685025215149, + "learning_rate": 1.4198678357591676e-05, + "loss": 0.3195, + "step": 7958 + }, + { + "epoch": 0.7497703775229033, + "grad_norm": 1.5166430473327637, + "learning_rate": 1.4197307839563375e-05, + "loss": 0.3252, + "step": 7959 + }, + { + "epoch": 0.7498645816160712, + "grad_norm": 0.9258027076721191, + "learning_rate": 1.4195937225829965e-05, + "loss": 0.3329, + "step": 7960 + }, + { + "epoch": 0.749958785709239, + "grad_norm": 0.7562490105628967, + "learning_rate": 1.4194566516422698e-05, + "loss": 0.3796, + "step": 7961 + }, + { + "epoch": 0.7500529898024069, + "grad_norm": 0.6452712416648865, + "learning_rate": 1.4193195711372828e-05, + "loss": 0.2851, + "step": 7962 + }, + { + "epoch": 0.7501471938955747, + "grad_norm": 0.7906914949417114, + "learning_rate": 1.4191824810711615e-05, + "loss": 0.3526, + "step": 7963 + }, + { + "epoch": 0.7502413979887426, + "grad_norm": 0.7486674189567566, + "learning_rate": 1.4190453814470306e-05, + "loss": 0.2646, + "step": 7964 + }, + { + "epoch": 0.7503356020819104, + "grad_norm": 0.8381386995315552, + "learning_rate": 1.4189082722680173e-05, + "loss": 0.375, + "step": 7965 + }, + { + "epoch": 0.7504298061750783, + "grad_norm": 0.7577159404754639, + "learning_rate": 1.4187711535372475e-05, + "loss": 0.2916, + "step": 7966 + }, + { + "epoch": 0.7505240102682461, + "grad_norm": 0.6354668140411377, + "learning_rate": 1.4186340252578475e-05, + "loss": 0.2829, + "step": 7967 + }, + { + "epoch": 0.750618214361414, + "grad_norm": 0.865922212600708, + "learning_rate": 1.4184968874329446e-05, + "loss": 0.3279, + "step": 7968 + }, + { + "epoch": 0.7507124184545818, + "grad_norm": 0.7015376091003418, + "learning_rate": 1.4183597400656655e-05, + "loss": 0.2861, + "step": 7969 + }, + { + "epoch": 0.7508066225477497, + "grad_norm": 0.6503175497055054, + "learning_rate": 1.4182225831591372e-05, + "loss": 0.2547, + "step": 7970 + }, + { + "epoch": 0.7509008266409175, + "grad_norm": 0.7912094593048096, + "learning_rate": 1.418085416716487e-05, + "loss": 0.3194, + "step": 7971 + }, + { + "epoch": 0.7509950307340854, + "grad_norm": 0.7017026543617249, + "learning_rate": 1.4179482407408429e-05, + "loss": 0.2929, + "step": 7972 + }, + { + "epoch": 0.7510892348272532, + "grad_norm": 0.7565056681632996, + "learning_rate": 1.4178110552353329e-05, + "loss": 0.2862, + "step": 7973 + }, + { + "epoch": 0.751183438920421, + "grad_norm": 0.7025262713432312, + "learning_rate": 1.4176738602030842e-05, + "loss": 0.3332, + "step": 7974 + }, + { + "epoch": 0.7512776430135889, + "grad_norm": 0.6589545607566833, + "learning_rate": 1.4175366556472259e-05, + "loss": 0.2889, + "step": 7975 + }, + { + "epoch": 0.7513718471067568, + "grad_norm": 0.8647730946540833, + "learning_rate": 1.4173994415708856e-05, + "loss": 0.3268, + "step": 7976 + }, + { + "epoch": 0.7514660511999246, + "grad_norm": 0.7150059342384338, + "learning_rate": 1.4172622179771929e-05, + "loss": 0.3157, + "step": 7977 + }, + { + "epoch": 0.7515602552930924, + "grad_norm": 0.6853088140487671, + "learning_rate": 1.4171249848692762e-05, + "loss": 0.2909, + "step": 7978 + }, + { + "epoch": 0.7516544593862603, + "grad_norm": 0.6837041974067688, + "learning_rate": 1.4169877422502646e-05, + "loss": 0.3078, + "step": 7979 + }, + { + "epoch": 0.7517486634794281, + "grad_norm": 0.7358826994895935, + "learning_rate": 1.416850490123288e-05, + "loss": 0.2862, + "step": 7980 + }, + { + "epoch": 0.751842867572596, + "grad_norm": 0.6721023917198181, + "learning_rate": 1.4167132284914752e-05, + "loss": 0.3298, + "step": 7981 + }, + { + "epoch": 0.7519370716657638, + "grad_norm": 0.747821569442749, + "learning_rate": 1.4165759573579565e-05, + "loss": 0.3282, + "step": 7982 + }, + { + "epoch": 0.7520312757589317, + "grad_norm": 0.7575122714042664, + "learning_rate": 1.416438676725862e-05, + "loss": 0.2959, + "step": 7983 + }, + { + "epoch": 0.7521254798520995, + "grad_norm": 0.6453376412391663, + "learning_rate": 1.416301386598321e-05, + "loss": 0.2999, + "step": 7984 + }, + { + "epoch": 0.7522196839452674, + "grad_norm": 0.748228907585144, + "learning_rate": 1.4161640869784651e-05, + "loss": 0.3322, + "step": 7985 + }, + { + "epoch": 0.7523138880384352, + "grad_norm": 0.8601110577583313, + "learning_rate": 1.4160267778694241e-05, + "loss": 0.3335, + "step": 7986 + }, + { + "epoch": 0.7524080921316031, + "grad_norm": 0.7255735993385315, + "learning_rate": 1.4158894592743293e-05, + "loss": 0.3292, + "step": 7987 + }, + { + "epoch": 0.7525022962247709, + "grad_norm": 0.7679718136787415, + "learning_rate": 1.4157521311963116e-05, + "loss": 0.3068, + "step": 7988 + }, + { + "epoch": 0.7525965003179388, + "grad_norm": 0.6139155626296997, + "learning_rate": 1.4156147936385023e-05, + "loss": 0.2903, + "step": 7989 + }, + { + "epoch": 0.7526907044111066, + "grad_norm": 0.7174772620201111, + "learning_rate": 1.415477446604033e-05, + "loss": 0.2985, + "step": 7990 + }, + { + "epoch": 0.7527849085042745, + "grad_norm": 0.7099175453186035, + "learning_rate": 1.4153400900960353e-05, + "loss": 0.3357, + "step": 7991 + }, + { + "epoch": 0.7528791125974423, + "grad_norm": 0.6271936893463135, + "learning_rate": 1.4152027241176414e-05, + "loss": 0.2822, + "step": 7992 + }, + { + "epoch": 0.7529733166906102, + "grad_norm": 0.7048296928405762, + "learning_rate": 1.4150653486719832e-05, + "loss": 0.2922, + "step": 7993 + }, + { + "epoch": 0.753067520783778, + "grad_norm": 0.8052453398704529, + "learning_rate": 1.414927963762193e-05, + "loss": 0.3411, + "step": 7994 + }, + { + "epoch": 0.7531617248769459, + "grad_norm": 0.763670802116394, + "learning_rate": 1.4147905693914037e-05, + "loss": 0.3061, + "step": 7995 + }, + { + "epoch": 0.7532559289701137, + "grad_norm": 0.6347185969352722, + "learning_rate": 1.4146531655627476e-05, + "loss": 0.2963, + "step": 7996 + }, + { + "epoch": 0.7533501330632816, + "grad_norm": 0.7523487210273743, + "learning_rate": 1.4145157522793584e-05, + "loss": 0.3171, + "step": 7997 + }, + { + "epoch": 0.7534443371564494, + "grad_norm": 0.7609498500823975, + "learning_rate": 1.414378329544369e-05, + "loss": 0.3208, + "step": 7998 + }, + { + "epoch": 0.7535385412496173, + "grad_norm": 0.8232473731040955, + "learning_rate": 1.4142408973609125e-05, + "loss": 0.3439, + "step": 7999 + }, + { + "epoch": 0.7536327453427851, + "grad_norm": 0.7161168456077576, + "learning_rate": 1.4141034557321232e-05, + "loss": 0.3228, + "step": 8000 + }, + { + "epoch": 0.753726949435953, + "grad_norm": 0.711948812007904, + "learning_rate": 1.4139660046611346e-05, + "loss": 0.319, + "step": 8001 + }, + { + "epoch": 0.7538211535291208, + "grad_norm": 0.8147966265678406, + "learning_rate": 1.4138285441510808e-05, + "loss": 0.3277, + "step": 8002 + }, + { + "epoch": 0.7539153576222887, + "grad_norm": 0.6981361508369446, + "learning_rate": 1.4136910742050965e-05, + "loss": 0.3249, + "step": 8003 + }, + { + "epoch": 0.7540095617154565, + "grad_norm": 0.754050612449646, + "learning_rate": 1.4135535948263155e-05, + "loss": 0.32, + "step": 8004 + }, + { + "epoch": 0.7541037658086244, + "grad_norm": 0.7808064818382263, + "learning_rate": 1.4134161060178732e-05, + "loss": 0.3342, + "step": 8005 + }, + { + "epoch": 0.7541979699017922, + "grad_norm": 0.7009022831916809, + "learning_rate": 1.4132786077829044e-05, + "loss": 0.3126, + "step": 8006 + }, + { + "epoch": 0.7542921739949601, + "grad_norm": 0.7169830203056335, + "learning_rate": 1.4131411001245438e-05, + "loss": 0.3044, + "step": 8007 + }, + { + "epoch": 0.7543863780881279, + "grad_norm": 0.7986016273498535, + "learning_rate": 1.4130035830459276e-05, + "loss": 0.3823, + "step": 8008 + }, + { + "epoch": 0.7544805821812958, + "grad_norm": 0.7309896349906921, + "learning_rate": 1.4128660565501911e-05, + "loss": 0.3422, + "step": 8009 + }, + { + "epoch": 0.7545747862744636, + "grad_norm": 0.7200166583061218, + "learning_rate": 1.4127285206404697e-05, + "loss": 0.3264, + "step": 8010 + }, + { + "epoch": 0.7546689903676315, + "grad_norm": 0.805809497833252, + "learning_rate": 1.4125909753198996e-05, + "loss": 0.3373, + "step": 8011 + }, + { + "epoch": 0.7547631944607993, + "grad_norm": 0.6786282658576965, + "learning_rate": 1.4124534205916174e-05, + "loss": 0.3372, + "step": 8012 + }, + { + "epoch": 0.7548573985539672, + "grad_norm": 0.6856420040130615, + "learning_rate": 1.4123158564587594e-05, + "loss": 0.3215, + "step": 8013 + }, + { + "epoch": 0.754951602647135, + "grad_norm": 0.676866888999939, + "learning_rate": 1.4121782829244618e-05, + "loss": 0.2929, + "step": 8014 + }, + { + "epoch": 0.7550458067403029, + "grad_norm": 0.7994716167449951, + "learning_rate": 1.4120406999918626e-05, + "loss": 0.3283, + "step": 8015 + }, + { + "epoch": 0.7551400108334707, + "grad_norm": 0.8341429829597473, + "learning_rate": 1.4119031076640977e-05, + "loss": 0.3011, + "step": 8016 + }, + { + "epoch": 0.7552342149266386, + "grad_norm": 0.6932470798492432, + "learning_rate": 1.4117655059443052e-05, + "loss": 0.2757, + "step": 8017 + }, + { + "epoch": 0.7553284190198064, + "grad_norm": 0.9134334921836853, + "learning_rate": 1.4116278948356222e-05, + "loss": 0.3036, + "step": 8018 + }, + { + "epoch": 0.7554226231129743, + "grad_norm": 0.7064512968063354, + "learning_rate": 1.4114902743411864e-05, + "loss": 0.3207, + "step": 8019 + }, + { + "epoch": 0.7555168272061421, + "grad_norm": 0.691676914691925, + "learning_rate": 1.4113526444641363e-05, + "loss": 0.3261, + "step": 8020 + }, + { + "epoch": 0.75561103129931, + "grad_norm": 0.6851295232772827, + "learning_rate": 1.4112150052076094e-05, + "loss": 0.3128, + "step": 8021 + }, + { + "epoch": 0.7557052353924778, + "grad_norm": 1.0884650945663452, + "learning_rate": 1.4110773565747446e-05, + "loss": 0.3101, + "step": 8022 + }, + { + "epoch": 0.7557994394856457, + "grad_norm": 0.754021167755127, + "learning_rate": 1.4109396985686808e-05, + "loss": 0.3529, + "step": 8023 + }, + { + "epoch": 0.7558936435788135, + "grad_norm": 0.6890600919723511, + "learning_rate": 1.4108020311925557e-05, + "loss": 0.2677, + "step": 8024 + }, + { + "epoch": 0.7559878476719814, + "grad_norm": 0.673433244228363, + "learning_rate": 1.4106643544495092e-05, + "loss": 0.2949, + "step": 8025 + }, + { + "epoch": 0.7560820517651492, + "grad_norm": 0.7391211986541748, + "learning_rate": 1.4105266683426804e-05, + "loss": 0.2777, + "step": 8026 + }, + { + "epoch": 0.756176255858317, + "grad_norm": 0.7063663601875305, + "learning_rate": 1.4103889728752083e-05, + "loss": 0.3192, + "step": 8027 + }, + { + "epoch": 0.7562704599514849, + "grad_norm": 0.6587368845939636, + "learning_rate": 1.4102512680502333e-05, + "loss": 0.2801, + "step": 8028 + }, + { + "epoch": 0.7563646640446527, + "grad_norm": 0.7968171238899231, + "learning_rate": 1.4101135538708948e-05, + "loss": 0.3284, + "step": 8029 + }, + { + "epoch": 0.7564588681378206, + "grad_norm": 0.6723277568817139, + "learning_rate": 1.4099758303403333e-05, + "loss": 0.2703, + "step": 8030 + }, + { + "epoch": 0.7565530722309884, + "grad_norm": 1.5148507356643677, + "learning_rate": 1.4098380974616882e-05, + "loss": 0.3164, + "step": 8031 + }, + { + "epoch": 0.7566472763241563, + "grad_norm": 0.7872427701950073, + "learning_rate": 1.4097003552381012e-05, + "loss": 0.2998, + "step": 8032 + }, + { + "epoch": 0.7567414804173241, + "grad_norm": 0.6773476004600525, + "learning_rate": 1.4095626036727124e-05, + "loss": 0.3328, + "step": 8033 + }, + { + "epoch": 0.756835684510492, + "grad_norm": 0.7695625424385071, + "learning_rate": 1.4094248427686628e-05, + "loss": 0.3296, + "step": 8034 + }, + { + "epoch": 0.7569298886036598, + "grad_norm": 0.8502200841903687, + "learning_rate": 1.4092870725290934e-05, + "loss": 0.3237, + "step": 8035 + }, + { + "epoch": 0.7570240926968277, + "grad_norm": 0.7770158052444458, + "learning_rate": 1.4091492929571458e-05, + "loss": 0.3165, + "step": 8036 + }, + { + "epoch": 0.7571182967899955, + "grad_norm": 0.8137102127075195, + "learning_rate": 1.4090115040559617e-05, + "loss": 0.3679, + "step": 8037 + }, + { + "epoch": 0.7572125008831634, + "grad_norm": 0.7285508513450623, + "learning_rate": 1.4088737058286828e-05, + "loss": 0.3147, + "step": 8038 + }, + { + "epoch": 0.7573067049763312, + "grad_norm": 0.793168842792511, + "learning_rate": 1.4087358982784509e-05, + "loss": 0.3697, + "step": 8039 + }, + { + "epoch": 0.7574009090694991, + "grad_norm": 0.6035233736038208, + "learning_rate": 1.4085980814084086e-05, + "loss": 0.2632, + "step": 8040 + }, + { + "epoch": 0.7574951131626669, + "grad_norm": 0.6877842545509338, + "learning_rate": 1.408460255221698e-05, + "loss": 0.2853, + "step": 8041 + }, + { + "epoch": 0.7575893172558348, + "grad_norm": 0.6556431651115417, + "learning_rate": 1.4083224197214618e-05, + "loss": 0.3135, + "step": 8042 + }, + { + "epoch": 0.7576835213490026, + "grad_norm": 0.7101203203201294, + "learning_rate": 1.4081845749108433e-05, + "loss": 0.3361, + "step": 8043 + }, + { + "epoch": 0.7577777254421705, + "grad_norm": 0.8247424960136414, + "learning_rate": 1.408046720792985e-05, + "loss": 0.3474, + "step": 8044 + }, + { + "epoch": 0.7578719295353383, + "grad_norm": 0.8068971633911133, + "learning_rate": 1.4079088573710302e-05, + "loss": 0.3401, + "step": 8045 + }, + { + "epoch": 0.7579661336285062, + "grad_norm": 0.82241290807724, + "learning_rate": 1.407770984648123e-05, + "loss": 0.3199, + "step": 8046 + }, + { + "epoch": 0.758060337721674, + "grad_norm": 0.8863838911056519, + "learning_rate": 1.4076331026274063e-05, + "loss": 0.33, + "step": 8047 + }, + { + "epoch": 0.7581545418148419, + "grad_norm": 0.8461202383041382, + "learning_rate": 1.4074952113120248e-05, + "loss": 0.3091, + "step": 8048 + }, + { + "epoch": 0.7582487459080097, + "grad_norm": 0.8029640913009644, + "learning_rate": 1.4073573107051222e-05, + "loss": 0.3681, + "step": 8049 + }, + { + "epoch": 0.7583429500011776, + "grad_norm": 0.7179197072982788, + "learning_rate": 1.4072194008098427e-05, + "loss": 0.3133, + "step": 8050 + }, + { + "epoch": 0.7584371540943454, + "grad_norm": 0.688663899898529, + "learning_rate": 1.4070814816293313e-05, + "loss": 0.2863, + "step": 8051 + }, + { + "epoch": 0.7585313581875133, + "grad_norm": 0.732281506061554, + "learning_rate": 1.4069435531667326e-05, + "loss": 0.2834, + "step": 8052 + }, + { + "epoch": 0.7586255622806811, + "grad_norm": 0.6937088370323181, + "learning_rate": 1.4068056154251914e-05, + "loss": 0.2947, + "step": 8053 + }, + { + "epoch": 0.758719766373849, + "grad_norm": 0.7696384191513062, + "learning_rate": 1.4066676684078533e-05, + "loss": 0.322, + "step": 8054 + }, + { + "epoch": 0.7588139704670168, + "grad_norm": 0.6650841236114502, + "learning_rate": 1.4065297121178631e-05, + "loss": 0.3047, + "step": 8055 + }, + { + "epoch": 0.7589081745601847, + "grad_norm": 0.7577145099639893, + "learning_rate": 1.4063917465583668e-05, + "loss": 0.3068, + "step": 8056 + }, + { + "epoch": 0.7590023786533525, + "grad_norm": 0.8542148470878601, + "learning_rate": 1.4062537717325104e-05, + "loss": 0.3257, + "step": 8057 + }, + { + "epoch": 0.7590965827465204, + "grad_norm": 0.6929702758789062, + "learning_rate": 1.4061157876434395e-05, + "loss": 0.3078, + "step": 8058 + }, + { + "epoch": 0.7591907868396882, + "grad_norm": 0.678442120552063, + "learning_rate": 1.4059777942943005e-05, + "loss": 0.2982, + "step": 8059 + }, + { + "epoch": 0.7592849909328561, + "grad_norm": 0.6912717223167419, + "learning_rate": 1.4058397916882402e-05, + "loss": 0.3069, + "step": 8060 + }, + { + "epoch": 0.7593791950260239, + "grad_norm": 0.7928740382194519, + "learning_rate": 1.4057017798284049e-05, + "loss": 0.3178, + "step": 8061 + }, + { + "epoch": 0.7594733991191918, + "grad_norm": 0.861893892288208, + "learning_rate": 1.4055637587179413e-05, + "loss": 0.3265, + "step": 8062 + }, + { + "epoch": 0.7595676032123596, + "grad_norm": 0.7354323267936707, + "learning_rate": 1.4054257283599974e-05, + "loss": 0.3189, + "step": 8063 + }, + { + "epoch": 0.7596618073055275, + "grad_norm": 0.8310542702674866, + "learning_rate": 1.4052876887577194e-05, + "loss": 0.3373, + "step": 8064 + }, + { + "epoch": 0.7597560113986953, + "grad_norm": 0.7149031162261963, + "learning_rate": 1.4051496399142557e-05, + "loss": 0.3172, + "step": 8065 + }, + { + "epoch": 0.7598502154918632, + "grad_norm": 0.8128882646560669, + "learning_rate": 1.4050115818327531e-05, + "loss": 0.2976, + "step": 8066 + }, + { + "epoch": 0.759944419585031, + "grad_norm": 0.6709223985671997, + "learning_rate": 1.4048735145163604e-05, + "loss": 0.3164, + "step": 8067 + }, + { + "epoch": 0.7600386236781989, + "grad_norm": 0.7269225716590881, + "learning_rate": 1.4047354379682254e-05, + "loss": 0.2663, + "step": 8068 + }, + { + "epoch": 0.7601328277713667, + "grad_norm": 0.7139459252357483, + "learning_rate": 1.4045973521914967e-05, + "loss": 0.2883, + "step": 8069 + }, + { + "epoch": 0.7602270318645346, + "grad_norm": 0.8169942498207092, + "learning_rate": 1.4044592571893223e-05, + "loss": 0.345, + "step": 8070 + }, + { + "epoch": 0.7603212359577024, + "grad_norm": 0.7426708340644836, + "learning_rate": 1.4043211529648512e-05, + "loss": 0.2864, + "step": 8071 + }, + { + "epoch": 0.7604154400508703, + "grad_norm": 0.8026089668273926, + "learning_rate": 1.4041830395212328e-05, + "loss": 0.3014, + "step": 8072 + }, + { + "epoch": 0.7605096441440381, + "grad_norm": 0.7458574771881104, + "learning_rate": 1.4040449168616161e-05, + "loss": 0.3015, + "step": 8073 + }, + { + "epoch": 0.760603848237206, + "grad_norm": 0.7134662866592407, + "learning_rate": 1.4039067849891503e-05, + "loss": 0.3032, + "step": 8074 + }, + { + "epoch": 0.7606980523303738, + "grad_norm": 0.6305004954338074, + "learning_rate": 1.4037686439069853e-05, + "loss": 0.2523, + "step": 8075 + }, + { + "epoch": 0.7607922564235416, + "grad_norm": 0.8049094080924988, + "learning_rate": 1.4036304936182705e-05, + "loss": 0.3237, + "step": 8076 + }, + { + "epoch": 0.7608864605167095, + "grad_norm": 0.7716397047042847, + "learning_rate": 1.4034923341261565e-05, + "loss": 0.342, + "step": 8077 + }, + { + "epoch": 0.7609806646098773, + "grad_norm": 0.7197418212890625, + "learning_rate": 1.403354165433793e-05, + "loss": 0.3002, + "step": 8078 + }, + { + "epoch": 0.7610748687030452, + "grad_norm": 0.77171790599823, + "learning_rate": 1.4032159875443307e-05, + "loss": 0.341, + "step": 8079 + }, + { + "epoch": 0.761169072796213, + "grad_norm": 0.6065251231193542, + "learning_rate": 1.4030778004609209e-05, + "loss": 0.2757, + "step": 8080 + }, + { + "epoch": 0.7612632768893809, + "grad_norm": 0.7310432195663452, + "learning_rate": 1.4029396041867132e-05, + "loss": 0.3066, + "step": 8081 + }, + { + "epoch": 0.7613574809825487, + "grad_norm": 0.7769371867179871, + "learning_rate": 1.4028013987248595e-05, + "loss": 0.3343, + "step": 8082 + }, + { + "epoch": 0.7614516850757166, + "grad_norm": 0.7630373239517212, + "learning_rate": 1.4026631840785112e-05, + "loss": 0.3188, + "step": 8083 + }, + { + "epoch": 0.7615458891688844, + "grad_norm": 0.794040322303772, + "learning_rate": 1.4025249602508193e-05, + "loss": 0.3359, + "step": 8084 + }, + { + "epoch": 0.7616400932620523, + "grad_norm": 0.9124943017959595, + "learning_rate": 1.4023867272449358e-05, + "loss": 0.3374, + "step": 8085 + }, + { + "epoch": 0.7617342973552201, + "grad_norm": 0.736742377281189, + "learning_rate": 1.4022484850640128e-05, + "loss": 0.3346, + "step": 8086 + }, + { + "epoch": 0.761828501448388, + "grad_norm": 0.7343044877052307, + "learning_rate": 1.402110233711202e-05, + "loss": 0.3091, + "step": 8087 + }, + { + "epoch": 0.7619227055415558, + "grad_norm": 0.6946993470191956, + "learning_rate": 1.4019719731896564e-05, + "loss": 0.3206, + "step": 8088 + }, + { + "epoch": 0.7620169096347237, + "grad_norm": 0.7589704394340515, + "learning_rate": 1.401833703502528e-05, + "loss": 0.3133, + "step": 8089 + }, + { + "epoch": 0.7621111137278915, + "grad_norm": 0.7888718843460083, + "learning_rate": 1.4016954246529697e-05, + "loss": 0.33, + "step": 8090 + }, + { + "epoch": 0.7622053178210594, + "grad_norm": 0.731242299079895, + "learning_rate": 1.4015571366441343e-05, + "loss": 0.3199, + "step": 8091 + }, + { + "epoch": 0.7622995219142271, + "grad_norm": 0.6867145299911499, + "learning_rate": 1.4014188394791753e-05, + "loss": 0.3337, + "step": 8092 + }, + { + "epoch": 0.762393726007395, + "grad_norm": 0.8416774272918701, + "learning_rate": 1.401280533161246e-05, + "loss": 0.3111, + "step": 8093 + }, + { + "epoch": 0.7624879301005628, + "grad_norm": 0.7183943390846252, + "learning_rate": 1.4011422176935e-05, + "loss": 0.3122, + "step": 8094 + }, + { + "epoch": 0.7625821341937307, + "grad_norm": 0.7674915194511414, + "learning_rate": 1.401003893079091e-05, + "loss": 0.3151, + "step": 8095 + }, + { + "epoch": 0.7626763382868985, + "grad_norm": 0.6793264746665955, + "learning_rate": 1.4008655593211729e-05, + "loss": 0.306, + "step": 8096 + }, + { + "epoch": 0.7627705423800664, + "grad_norm": 0.7976694703102112, + "learning_rate": 1.4007272164229003e-05, + "loss": 0.3329, + "step": 8097 + }, + { + "epoch": 0.7628647464732342, + "grad_norm": 0.8111255168914795, + "learning_rate": 1.4005888643874273e-05, + "loss": 0.3308, + "step": 8098 + }, + { + "epoch": 0.7629589505664021, + "grad_norm": 0.6630674600601196, + "learning_rate": 1.4004505032179085e-05, + "loss": 0.3016, + "step": 8099 + }, + { + "epoch": 0.7630531546595699, + "grad_norm": 0.6879538297653198, + "learning_rate": 1.4003121329174993e-05, + "loss": 0.3092, + "step": 8100 + }, + { + "epoch": 0.7631473587527378, + "grad_norm": 0.6774730086326599, + "learning_rate": 1.4001737534893542e-05, + "loss": 0.3023, + "step": 8101 + }, + { + "epoch": 0.7632415628459056, + "grad_norm": 0.7137449979782104, + "learning_rate": 1.4000353649366285e-05, + "loss": 0.2802, + "step": 8102 + }, + { + "epoch": 0.7633357669390735, + "grad_norm": 0.6669721007347107, + "learning_rate": 1.3998969672624782e-05, + "loss": 0.2965, + "step": 8103 + }, + { + "epoch": 0.7634299710322413, + "grad_norm": 0.6711835861206055, + "learning_rate": 1.3997585604700584e-05, + "loss": 0.2843, + "step": 8104 + }, + { + "epoch": 0.7635241751254092, + "grad_norm": 0.6854143738746643, + "learning_rate": 1.3996201445625251e-05, + "loss": 0.2827, + "step": 8105 + }, + { + "epoch": 0.763618379218577, + "grad_norm": 0.6946225166320801, + "learning_rate": 1.3994817195430345e-05, + "loss": 0.2896, + "step": 8106 + }, + { + "epoch": 0.7637125833117449, + "grad_norm": 0.7499455809593201, + "learning_rate": 1.3993432854147429e-05, + "loss": 0.3348, + "step": 8107 + }, + { + "epoch": 0.7638067874049127, + "grad_norm": 0.8228910565376282, + "learning_rate": 1.3992048421808066e-05, + "loss": 0.2826, + "step": 8108 + }, + { + "epoch": 0.7639009914980806, + "grad_norm": 0.6496267914772034, + "learning_rate": 1.399066389844383e-05, + "loss": 0.3002, + "step": 8109 + }, + { + "epoch": 0.7639951955912484, + "grad_norm": 0.8050598502159119, + "learning_rate": 1.3989279284086283e-05, + "loss": 0.3685, + "step": 8110 + }, + { + "epoch": 0.7640893996844162, + "grad_norm": 0.7404391169548035, + "learning_rate": 1.3987894578766997e-05, + "loss": 0.3167, + "step": 8111 + }, + { + "epoch": 0.7641836037775841, + "grad_norm": 0.7374035716056824, + "learning_rate": 1.3986509782517548e-05, + "loss": 0.3142, + "step": 8112 + }, + { + "epoch": 0.764277807870752, + "grad_norm": 0.7885564565658569, + "learning_rate": 1.3985124895369513e-05, + "loss": 0.3198, + "step": 8113 + }, + { + "epoch": 0.7643720119639198, + "grad_norm": 0.7686183452606201, + "learning_rate": 1.3983739917354466e-05, + "loss": 0.3225, + "step": 8114 + }, + { + "epoch": 0.7644662160570876, + "grad_norm": 0.7687329649925232, + "learning_rate": 1.3982354848503987e-05, + "loss": 0.3247, + "step": 8115 + }, + { + "epoch": 0.7645604201502555, + "grad_norm": 0.7419228553771973, + "learning_rate": 1.3980969688849659e-05, + "loss": 0.3078, + "step": 8116 + }, + { + "epoch": 0.7646546242434233, + "grad_norm": 0.7465457320213318, + "learning_rate": 1.3979584438423066e-05, + "loss": 0.3261, + "step": 8117 + }, + { + "epoch": 0.7647488283365912, + "grad_norm": 0.678605318069458, + "learning_rate": 1.3978199097255791e-05, + "loss": 0.3024, + "step": 8118 + }, + { + "epoch": 0.764843032429759, + "grad_norm": 0.8090999126434326, + "learning_rate": 1.3976813665379427e-05, + "loss": 0.2924, + "step": 8119 + }, + { + "epoch": 0.7649372365229269, + "grad_norm": 0.7168018817901611, + "learning_rate": 1.3975428142825562e-05, + "loss": 0.303, + "step": 8120 + }, + { + "epoch": 0.7650314406160947, + "grad_norm": 0.7721872925758362, + "learning_rate": 1.3974042529625784e-05, + "loss": 0.2712, + "step": 8121 + }, + { + "epoch": 0.7651256447092626, + "grad_norm": 0.8302431702613831, + "learning_rate": 1.3972656825811691e-05, + "loss": 0.2959, + "step": 8122 + }, + { + "epoch": 0.7652198488024304, + "grad_norm": 0.7172962427139282, + "learning_rate": 1.3971271031414882e-05, + "loss": 0.3422, + "step": 8123 + }, + { + "epoch": 0.7653140528955983, + "grad_norm": 0.8304644823074341, + "learning_rate": 1.3969885146466946e-05, + "loss": 0.3443, + "step": 8124 + }, + { + "epoch": 0.7654082569887661, + "grad_norm": 0.6628880500793457, + "learning_rate": 1.3968499170999495e-05, + "loss": 0.2508, + "step": 8125 + }, + { + "epoch": 0.765502461081934, + "grad_norm": 0.7547474503517151, + "learning_rate": 1.3967113105044121e-05, + "loss": 0.3166, + "step": 8126 + }, + { + "epoch": 0.7655966651751018, + "grad_norm": 2.869410991668701, + "learning_rate": 1.3965726948632434e-05, + "loss": 0.3228, + "step": 8127 + }, + { + "epoch": 0.7656908692682697, + "grad_norm": 0.715911865234375, + "learning_rate": 1.396434070179604e-05, + "loss": 0.3164, + "step": 8128 + }, + { + "epoch": 0.7657850733614375, + "grad_norm": 0.6984981298446655, + "learning_rate": 1.3962954364566547e-05, + "loss": 0.308, + "step": 8129 + }, + { + "epoch": 0.7658792774546054, + "grad_norm": 0.7362462282180786, + "learning_rate": 1.3961567936975566e-05, + "loss": 0.2996, + "step": 8130 + }, + { + "epoch": 0.7659734815477732, + "grad_norm": 0.8271350860595703, + "learning_rate": 1.3960181419054708e-05, + "loss": 0.3011, + "step": 8131 + }, + { + "epoch": 0.7660676856409411, + "grad_norm": 0.7668757438659668, + "learning_rate": 1.3958794810835592e-05, + "loss": 0.3077, + "step": 8132 + }, + { + "epoch": 0.7661618897341089, + "grad_norm": 0.916616678237915, + "learning_rate": 1.395740811234983e-05, + "loss": 0.3014, + "step": 8133 + }, + { + "epoch": 0.7662560938272768, + "grad_norm": 0.6834691762924194, + "learning_rate": 1.3956021323629043e-05, + "loss": 0.3064, + "step": 8134 + }, + { + "epoch": 0.7663502979204446, + "grad_norm": 0.7731265425682068, + "learning_rate": 1.3954634444704854e-05, + "loss": 0.3148, + "step": 8135 + }, + { + "epoch": 0.7664445020136125, + "grad_norm": 0.805877149105072, + "learning_rate": 1.395324747560888e-05, + "loss": 0.3069, + "step": 8136 + }, + { + "epoch": 0.7665387061067803, + "grad_norm": 0.6751599311828613, + "learning_rate": 1.3951860416372748e-05, + "loss": 0.3253, + "step": 8137 + }, + { + "epoch": 0.7666329101999482, + "grad_norm": 0.6992517113685608, + "learning_rate": 1.3950473267028093e-05, + "loss": 0.305, + "step": 8138 + }, + { + "epoch": 0.766727114293116, + "grad_norm": 0.7160972356796265, + "learning_rate": 1.3949086027606533e-05, + "loss": 0.2926, + "step": 8139 + }, + { + "epoch": 0.7668213183862839, + "grad_norm": 0.8054567575454712, + "learning_rate": 1.3947698698139708e-05, + "loss": 0.3314, + "step": 8140 + }, + { + "epoch": 0.7669155224794517, + "grad_norm": 0.7271167635917664, + "learning_rate": 1.3946311278659246e-05, + "loss": 0.3636, + "step": 8141 + }, + { + "epoch": 0.7670097265726196, + "grad_norm": 0.68764328956604, + "learning_rate": 1.3944923769196781e-05, + "loss": 0.3025, + "step": 8142 + }, + { + "epoch": 0.7671039306657874, + "grad_norm": 0.9515770673751831, + "learning_rate": 1.3943536169783958e-05, + "loss": 0.3, + "step": 8143 + }, + { + "epoch": 0.7671981347589553, + "grad_norm": 0.7493038773536682, + "learning_rate": 1.3942148480452407e-05, + "loss": 0.3265, + "step": 8144 + }, + { + "epoch": 0.7672923388521231, + "grad_norm": 0.8873345851898193, + "learning_rate": 1.3940760701233775e-05, + "loss": 0.3252, + "step": 8145 + }, + { + "epoch": 0.767386542945291, + "grad_norm": 0.7618383169174194, + "learning_rate": 1.3939372832159709e-05, + "loss": 0.3099, + "step": 8146 + }, + { + "epoch": 0.7674807470384588, + "grad_norm": 0.7449729442596436, + "learning_rate": 1.3937984873261843e-05, + "loss": 0.3375, + "step": 8147 + }, + { + "epoch": 0.7675749511316267, + "grad_norm": 0.6962200403213501, + "learning_rate": 1.3936596824571838e-05, + "loss": 0.362, + "step": 8148 + }, + { + "epoch": 0.7676691552247945, + "grad_norm": 0.6698456406593323, + "learning_rate": 1.3935208686121333e-05, + "loss": 0.2944, + "step": 8149 + }, + { + "epoch": 0.7677633593179624, + "grad_norm": 0.6792502999305725, + "learning_rate": 1.3933820457941986e-05, + "loss": 0.3251, + "step": 8150 + }, + { + "epoch": 0.7678575634111302, + "grad_norm": 0.7187953591346741, + "learning_rate": 1.3932432140065451e-05, + "loss": 0.3096, + "step": 8151 + }, + { + "epoch": 0.767951767504298, + "grad_norm": 0.6905596256256104, + "learning_rate": 1.3931043732523377e-05, + "loss": 0.2816, + "step": 8152 + }, + { + "epoch": 0.7680459715974659, + "grad_norm": 0.8274813890457153, + "learning_rate": 1.3929655235347429e-05, + "loss": 0.3219, + "step": 8153 + }, + { + "epoch": 0.7681401756906338, + "grad_norm": 0.7077164053916931, + "learning_rate": 1.3928266648569264e-05, + "loss": 0.2866, + "step": 8154 + }, + { + "epoch": 0.7682343797838016, + "grad_norm": 0.7026647329330444, + "learning_rate": 1.3926877972220543e-05, + "loss": 0.2728, + "step": 8155 + }, + { + "epoch": 0.7683285838769695, + "grad_norm": 0.736984372138977, + "learning_rate": 1.3925489206332932e-05, + "loss": 0.3352, + "step": 8156 + }, + { + "epoch": 0.7684227879701373, + "grad_norm": 0.8198491334915161, + "learning_rate": 1.3924100350938097e-05, + "loss": 0.3118, + "step": 8157 + }, + { + "epoch": 0.7685169920633051, + "grad_norm": 0.6261926293373108, + "learning_rate": 1.3922711406067703e-05, + "loss": 0.2887, + "step": 8158 + }, + { + "epoch": 0.768611196156473, + "grad_norm": 0.7135415077209473, + "learning_rate": 1.3921322371753423e-05, + "loss": 0.3018, + "step": 8159 + }, + { + "epoch": 0.7687054002496408, + "grad_norm": 0.7526355385780334, + "learning_rate": 1.3919933248026933e-05, + "loss": 0.3307, + "step": 8160 + }, + { + "epoch": 0.7687996043428087, + "grad_norm": 0.7028264403343201, + "learning_rate": 1.3918544034919897e-05, + "loss": 0.3125, + "step": 8161 + }, + { + "epoch": 0.7688938084359765, + "grad_norm": 0.8565718531608582, + "learning_rate": 1.3917154732463998e-05, + "loss": 0.3268, + "step": 8162 + }, + { + "epoch": 0.7689880125291444, + "grad_norm": 0.7672988176345825, + "learning_rate": 1.3915765340690916e-05, + "loss": 0.326, + "step": 8163 + }, + { + "epoch": 0.7690822166223122, + "grad_norm": 0.8035504817962646, + "learning_rate": 1.3914375859632325e-05, + "loss": 0.3036, + "step": 8164 + }, + { + "epoch": 0.7691764207154801, + "grad_norm": 0.7635858058929443, + "learning_rate": 1.3912986289319914e-05, + "loss": 0.3715, + "step": 8165 + }, + { + "epoch": 0.7692706248086479, + "grad_norm": 0.7464389204978943, + "learning_rate": 1.3911596629785362e-05, + "loss": 0.3249, + "step": 8166 + }, + { + "epoch": 0.7693648289018158, + "grad_norm": 0.8326256275177002, + "learning_rate": 1.3910206881060355e-05, + "loss": 0.2854, + "step": 8167 + }, + { + "epoch": 0.7694590329949836, + "grad_norm": 0.7256987690925598, + "learning_rate": 1.3908817043176588e-05, + "loss": 0.3221, + "step": 8168 + }, + { + "epoch": 0.7695532370881515, + "grad_norm": 0.825480580329895, + "learning_rate": 1.3907427116165746e-05, + "loss": 0.3515, + "step": 8169 + }, + { + "epoch": 0.7696474411813193, + "grad_norm": 0.8091462850570679, + "learning_rate": 1.3906037100059524e-05, + "loss": 0.275, + "step": 8170 + }, + { + "epoch": 0.7697416452744872, + "grad_norm": 0.9617534279823303, + "learning_rate": 1.3904646994889614e-05, + "loss": 0.3468, + "step": 8171 + }, + { + "epoch": 0.769835849367655, + "grad_norm": 0.7480612397193909, + "learning_rate": 1.3903256800687711e-05, + "loss": 0.2869, + "step": 8172 + }, + { + "epoch": 0.7699300534608229, + "grad_norm": 0.8497846722602844, + "learning_rate": 1.3901866517485521e-05, + "loss": 0.3008, + "step": 8173 + }, + { + "epoch": 0.7700242575539907, + "grad_norm": 0.7676090598106384, + "learning_rate": 1.3900476145314739e-05, + "loss": 0.3214, + "step": 8174 + }, + { + "epoch": 0.7701184616471586, + "grad_norm": 0.6978567838668823, + "learning_rate": 1.3899085684207068e-05, + "loss": 0.3002, + "step": 8175 + }, + { + "epoch": 0.7702126657403264, + "grad_norm": 0.7182883024215698, + "learning_rate": 1.389769513419421e-05, + "loss": 0.3341, + "step": 8176 + }, + { + "epoch": 0.7703068698334943, + "grad_norm": 0.7043347358703613, + "learning_rate": 1.3896304495307881e-05, + "loss": 0.2765, + "step": 8177 + }, + { + "epoch": 0.7704010739266621, + "grad_norm": 1.1452966928482056, + "learning_rate": 1.389491376757978e-05, + "loss": 0.3302, + "step": 8178 + }, + { + "epoch": 0.77049527801983, + "grad_norm": 0.8262236714363098, + "learning_rate": 1.3893522951041622e-05, + "loss": 0.2965, + "step": 8179 + }, + { + "epoch": 0.7705894821129978, + "grad_norm": 0.6995968222618103, + "learning_rate": 1.3892132045725122e-05, + "loss": 0.301, + "step": 8180 + }, + { + "epoch": 0.7706836862061657, + "grad_norm": 0.7085097432136536, + "learning_rate": 1.3890741051661989e-05, + "loss": 0.2831, + "step": 8181 + }, + { + "epoch": 0.7707778902993335, + "grad_norm": 0.7411379814147949, + "learning_rate": 1.3889349968883943e-05, + "loss": 0.3106, + "step": 8182 + }, + { + "epoch": 0.7708720943925014, + "grad_norm": 0.7735549211502075, + "learning_rate": 1.3887958797422707e-05, + "loss": 0.3126, + "step": 8183 + }, + { + "epoch": 0.7709662984856692, + "grad_norm": 0.69361811876297, + "learning_rate": 1.3886567537309992e-05, + "loss": 0.312, + "step": 8184 + }, + { + "epoch": 0.7710605025788371, + "grad_norm": 0.7850237488746643, + "learning_rate": 1.388517618857753e-05, + "loss": 0.3094, + "step": 8185 + }, + { + "epoch": 0.7711547066720049, + "grad_norm": 0.7504292130470276, + "learning_rate": 1.3883784751257042e-05, + "loss": 0.3169, + "step": 8186 + }, + { + "epoch": 0.7712489107651728, + "grad_norm": 0.7509244680404663, + "learning_rate": 1.3882393225380251e-05, + "loss": 0.2529, + "step": 8187 + }, + { + "epoch": 0.7713431148583406, + "grad_norm": 0.7746277451515198, + "learning_rate": 1.3881001610978894e-05, + "loss": 0.3651, + "step": 8188 + }, + { + "epoch": 0.7714373189515085, + "grad_norm": 0.8231454491615295, + "learning_rate": 1.3879609908084697e-05, + "loss": 0.3469, + "step": 8189 + }, + { + "epoch": 0.7715315230446763, + "grad_norm": 0.6980456709861755, + "learning_rate": 1.3878218116729397e-05, + "loss": 0.3341, + "step": 8190 + }, + { + "epoch": 0.7716257271378442, + "grad_norm": 0.750406801700592, + "learning_rate": 1.3876826236944724e-05, + "loss": 0.3357, + "step": 8191 + }, + { + "epoch": 0.771719931231012, + "grad_norm": 0.696100115776062, + "learning_rate": 1.3875434268762415e-05, + "loss": 0.3139, + "step": 8192 + }, + { + "epoch": 0.7718141353241799, + "grad_norm": 0.7752167582511902, + "learning_rate": 1.3874042212214215e-05, + "loss": 0.3647, + "step": 8193 + }, + { + "epoch": 0.7719083394173477, + "grad_norm": 0.8316193222999573, + "learning_rate": 1.3872650067331859e-05, + "loss": 0.295, + "step": 8194 + }, + { + "epoch": 0.7720025435105156, + "grad_norm": 0.6846539378166199, + "learning_rate": 1.3871257834147094e-05, + "loss": 0.2856, + "step": 8195 + }, + { + "epoch": 0.7720967476036834, + "grad_norm": 0.8855867385864258, + "learning_rate": 1.386986551269166e-05, + "loss": 0.384, + "step": 8196 + }, + { + "epoch": 0.7721909516968513, + "grad_norm": 0.6644063591957092, + "learning_rate": 1.3868473102997311e-05, + "loss": 0.2704, + "step": 8197 + }, + { + "epoch": 0.7722851557900191, + "grad_norm": 1.0443947315216064, + "learning_rate": 1.386708060509579e-05, + "loss": 0.3466, + "step": 8198 + }, + { + "epoch": 0.772379359883187, + "grad_norm": 0.7246731519699097, + "learning_rate": 1.386568801901885e-05, + "loss": 0.3448, + "step": 8199 + }, + { + "epoch": 0.7724735639763548, + "grad_norm": 0.6464845538139343, + "learning_rate": 1.3864295344798251e-05, + "loss": 0.2988, + "step": 8200 + }, + { + "epoch": 0.7725677680695227, + "grad_norm": 0.7232339382171631, + "learning_rate": 1.3862902582465736e-05, + "loss": 0.3653, + "step": 8201 + }, + { + "epoch": 0.7726619721626905, + "grad_norm": 0.8247326016426086, + "learning_rate": 1.3861509732053067e-05, + "loss": 0.3376, + "step": 8202 + }, + { + "epoch": 0.7727561762558584, + "grad_norm": 0.7704737782478333, + "learning_rate": 1.3860116793592009e-05, + "loss": 0.3394, + "step": 8203 + }, + { + "epoch": 0.7728503803490262, + "grad_norm": 0.716284990310669, + "learning_rate": 1.3858723767114314e-05, + "loss": 0.3142, + "step": 8204 + }, + { + "epoch": 0.772944584442194, + "grad_norm": 0.7100145220756531, + "learning_rate": 1.3857330652651753e-05, + "loss": 0.3099, + "step": 8205 + }, + { + "epoch": 0.7730387885353619, + "grad_norm": 0.789027988910675, + "learning_rate": 1.3855937450236087e-05, + "loss": 0.3277, + "step": 8206 + }, + { + "epoch": 0.7731329926285297, + "grad_norm": 0.7239149808883667, + "learning_rate": 1.3854544159899081e-05, + "loss": 0.3365, + "step": 8207 + }, + { + "epoch": 0.7732271967216976, + "grad_norm": 0.6745103001594543, + "learning_rate": 1.3853150781672508e-05, + "loss": 0.3027, + "step": 8208 + }, + { + "epoch": 0.7733214008148654, + "grad_norm": 0.6679433584213257, + "learning_rate": 1.3851757315588141e-05, + "loss": 0.3138, + "step": 8209 + }, + { + "epoch": 0.7734156049080333, + "grad_norm": 0.7353999614715576, + "learning_rate": 1.385036376167775e-05, + "loss": 0.2889, + "step": 8210 + }, + { + "epoch": 0.7735098090012011, + "grad_norm": 0.7928226590156555, + "learning_rate": 1.3848970119973108e-05, + "loss": 0.3359, + "step": 8211 + }, + { + "epoch": 0.773604013094369, + "grad_norm": 0.6852613091468811, + "learning_rate": 1.3847576390505994e-05, + "loss": 0.3217, + "step": 8212 + }, + { + "epoch": 0.7736982171875368, + "grad_norm": 0.6255519986152649, + "learning_rate": 1.3846182573308191e-05, + "loss": 0.2603, + "step": 8213 + }, + { + "epoch": 0.7737924212807047, + "grad_norm": 0.7862254977226257, + "learning_rate": 1.3844788668411475e-05, + "loss": 0.3375, + "step": 8214 + }, + { + "epoch": 0.7738866253738725, + "grad_norm": 0.7256548404693604, + "learning_rate": 1.3843394675847635e-05, + "loss": 0.3236, + "step": 8215 + }, + { + "epoch": 0.7739808294670404, + "grad_norm": 0.7552547454833984, + "learning_rate": 1.3842000595648446e-05, + "loss": 0.3212, + "step": 8216 + }, + { + "epoch": 0.7740750335602082, + "grad_norm": 0.5907566547393799, + "learning_rate": 1.3840606427845707e-05, + "loss": 0.3019, + "step": 8217 + }, + { + "epoch": 0.7741692376533761, + "grad_norm": 0.7375285029411316, + "learning_rate": 1.38392121724712e-05, + "loss": 0.3383, + "step": 8218 + }, + { + "epoch": 0.7742634417465439, + "grad_norm": 0.7448163032531738, + "learning_rate": 1.3837817829556716e-05, + "loss": 0.3322, + "step": 8219 + }, + { + "epoch": 0.7743576458397118, + "grad_norm": 0.7693195343017578, + "learning_rate": 1.3836423399134056e-05, + "loss": 0.2978, + "step": 8220 + }, + { + "epoch": 0.7744518499328796, + "grad_norm": 0.7602647542953491, + "learning_rate": 1.3835028881235001e-05, + "loss": 0.3288, + "step": 8221 + }, + { + "epoch": 0.7745460540260475, + "grad_norm": 0.7578790187835693, + "learning_rate": 1.3833634275891364e-05, + "loss": 0.3453, + "step": 8222 + }, + { + "epoch": 0.7746402581192153, + "grad_norm": 0.7210747599601746, + "learning_rate": 1.3832239583134934e-05, + "loss": 0.298, + "step": 8223 + }, + { + "epoch": 0.7747344622123832, + "grad_norm": 0.6661896705627441, + "learning_rate": 1.3830844802997514e-05, + "loss": 0.304, + "step": 8224 + }, + { + "epoch": 0.774828666305551, + "grad_norm": 0.8601593375205994, + "learning_rate": 1.3829449935510908e-05, + "loss": 0.333, + "step": 8225 + }, + { + "epoch": 0.7749228703987189, + "grad_norm": 0.6311641335487366, + "learning_rate": 1.3828054980706921e-05, + "loss": 0.2966, + "step": 8226 + }, + { + "epoch": 0.7750170744918867, + "grad_norm": 0.6895314455032349, + "learning_rate": 1.382665993861736e-05, + "loss": 0.3055, + "step": 8227 + }, + { + "epoch": 0.7751112785850546, + "grad_norm": 0.6774254441261292, + "learning_rate": 1.3825264809274037e-05, + "loss": 0.2659, + "step": 8228 + }, + { + "epoch": 0.7752054826782224, + "grad_norm": 0.7692509889602661, + "learning_rate": 1.3823869592708759e-05, + "loss": 0.3441, + "step": 8229 + }, + { + "epoch": 0.7752996867713902, + "grad_norm": 0.7428672909736633, + "learning_rate": 1.382247428895334e-05, + "loss": 0.3138, + "step": 8230 + }, + { + "epoch": 0.775393890864558, + "grad_norm": 0.7270107865333557, + "learning_rate": 1.3821078898039598e-05, + "loss": 0.3359, + "step": 8231 + }, + { + "epoch": 0.7754880949577259, + "grad_norm": 0.7511183619499207, + "learning_rate": 1.3819683419999343e-05, + "loss": 0.3063, + "step": 8232 + }, + { + "epoch": 0.7755822990508937, + "grad_norm": 0.7697188854217529, + "learning_rate": 1.3818287854864401e-05, + "loss": 0.3548, + "step": 8233 + }, + { + "epoch": 0.7756765031440616, + "grad_norm": 0.6531904935836792, + "learning_rate": 1.3816892202666591e-05, + "loss": 0.2833, + "step": 8234 + }, + { + "epoch": 0.7757707072372294, + "grad_norm": 0.6312221884727478, + "learning_rate": 1.3815496463437739e-05, + "loss": 0.2818, + "step": 8235 + }, + { + "epoch": 0.7758649113303973, + "grad_norm": 0.847802996635437, + "learning_rate": 1.3814100637209663e-05, + "loss": 0.337, + "step": 8236 + }, + { + "epoch": 0.7759591154235651, + "grad_norm": 0.7020447254180908, + "learning_rate": 1.3812704724014192e-05, + "loss": 0.31, + "step": 8237 + }, + { + "epoch": 0.776053319516733, + "grad_norm": 0.7162377834320068, + "learning_rate": 1.381130872388316e-05, + "loss": 0.2889, + "step": 8238 + }, + { + "epoch": 0.7761475236099008, + "grad_norm": 0.8712856769561768, + "learning_rate": 1.3809912636848395e-05, + "loss": 0.3241, + "step": 8239 + }, + { + "epoch": 0.7762417277030687, + "grad_norm": 0.6571957468986511, + "learning_rate": 1.380851646294173e-05, + "loss": 0.2901, + "step": 8240 + }, + { + "epoch": 0.7763359317962365, + "grad_norm": 0.8193198442459106, + "learning_rate": 1.3807120202195e-05, + "loss": 0.354, + "step": 8241 + }, + { + "epoch": 0.7764301358894043, + "grad_norm": 1.451332926750183, + "learning_rate": 1.3805723854640039e-05, + "loss": 0.354, + "step": 8242 + }, + { + "epoch": 0.7765243399825722, + "grad_norm": 0.7328004240989685, + "learning_rate": 1.380432742030869e-05, + "loss": 0.3538, + "step": 8243 + }, + { + "epoch": 0.77661854407574, + "grad_norm": 0.7978833913803101, + "learning_rate": 1.3802930899232791e-05, + "loss": 0.3297, + "step": 8244 + }, + { + "epoch": 0.7767127481689079, + "grad_norm": 0.7007066011428833, + "learning_rate": 1.3801534291444187e-05, + "loss": 0.2975, + "step": 8245 + }, + { + "epoch": 0.7768069522620757, + "grad_norm": 0.6987437605857849, + "learning_rate": 1.3800137596974723e-05, + "loss": 0.3003, + "step": 8246 + }, + { + "epoch": 0.7769011563552436, + "grad_norm": 0.6815653443336487, + "learning_rate": 1.3798740815856241e-05, + "loss": 0.2912, + "step": 8247 + }, + { + "epoch": 0.7769953604484114, + "grad_norm": 0.7572231292724609, + "learning_rate": 1.3797343948120599e-05, + "loss": 0.3054, + "step": 8248 + }, + { + "epoch": 0.7770895645415793, + "grad_norm": 0.7254391312599182, + "learning_rate": 1.379594699379964e-05, + "loss": 0.2941, + "step": 8249 + }, + { + "epoch": 0.7771837686347471, + "grad_norm": 0.7043079733848572, + "learning_rate": 1.3794549952925217e-05, + "loss": 0.3013, + "step": 8250 + }, + { + "epoch": 0.777277972727915, + "grad_norm": 1.0014352798461914, + "learning_rate": 1.379315282552919e-05, + "loss": 0.3082, + "step": 8251 + }, + { + "epoch": 0.7773721768210828, + "grad_norm": 0.8136102557182312, + "learning_rate": 1.3791755611643409e-05, + "loss": 0.3184, + "step": 8252 + }, + { + "epoch": 0.7774663809142507, + "grad_norm": 0.7315954566001892, + "learning_rate": 1.3790358311299739e-05, + "loss": 0.3245, + "step": 8253 + }, + { + "epoch": 0.7775605850074185, + "grad_norm": 0.7224061489105225, + "learning_rate": 1.3788960924530037e-05, + "loss": 0.3155, + "step": 8254 + }, + { + "epoch": 0.7776547891005864, + "grad_norm": 0.9970704913139343, + "learning_rate": 1.3787563451366167e-05, + "loss": 0.2995, + "step": 8255 + }, + { + "epoch": 0.7777489931937542, + "grad_norm": 0.7501780986785889, + "learning_rate": 1.3786165891839988e-05, + "loss": 0.3284, + "step": 8256 + }, + { + "epoch": 0.7778431972869221, + "grad_norm": 0.7430704236030579, + "learning_rate": 1.3784768245983377e-05, + "loss": 0.2966, + "step": 8257 + }, + { + "epoch": 0.7779374013800899, + "grad_norm": 0.759501039981842, + "learning_rate": 1.3783370513828194e-05, + "loss": 0.3284, + "step": 8258 + }, + { + "epoch": 0.7780316054732578, + "grad_norm": 0.7739242911338806, + "learning_rate": 1.378197269540631e-05, + "loss": 0.3237, + "step": 8259 + }, + { + "epoch": 0.7781258095664256, + "grad_norm": 0.8120517134666443, + "learning_rate": 1.3780574790749606e-05, + "loss": 0.2965, + "step": 8260 + }, + { + "epoch": 0.7782200136595935, + "grad_norm": 0.7086346745491028, + "learning_rate": 1.3779176799889944e-05, + "loss": 0.2946, + "step": 8261 + }, + { + "epoch": 0.7783142177527613, + "grad_norm": 0.723177433013916, + "learning_rate": 1.377777872285921e-05, + "loss": 0.3502, + "step": 8262 + }, + { + "epoch": 0.7784084218459292, + "grad_norm": 0.7041103839874268, + "learning_rate": 1.3776380559689278e-05, + "loss": 0.3152, + "step": 8263 + }, + { + "epoch": 0.778502625939097, + "grad_norm": 0.6800200343132019, + "learning_rate": 1.3774982310412025e-05, + "loss": 0.2653, + "step": 8264 + }, + { + "epoch": 0.7785968300322649, + "grad_norm": 0.664059042930603, + "learning_rate": 1.3773583975059341e-05, + "loss": 0.3084, + "step": 8265 + }, + { + "epoch": 0.7786910341254327, + "grad_norm": 0.8002199530601501, + "learning_rate": 1.3772185553663107e-05, + "loss": 0.3038, + "step": 8266 + }, + { + "epoch": 0.7787852382186006, + "grad_norm": 0.7921991348266602, + "learning_rate": 1.3770787046255205e-05, + "loss": 0.3117, + "step": 8267 + }, + { + "epoch": 0.7788794423117684, + "grad_norm": 0.7738434672355652, + "learning_rate": 1.3769388452867532e-05, + "loss": 0.345, + "step": 8268 + }, + { + "epoch": 0.7789736464049363, + "grad_norm": 0.7458887100219727, + "learning_rate": 1.3767989773531969e-05, + "loss": 0.2633, + "step": 8269 + }, + { + "epoch": 0.7790678504981041, + "grad_norm": 0.7952979207038879, + "learning_rate": 1.3766591008280413e-05, + "loss": 0.2914, + "step": 8270 + }, + { + "epoch": 0.779162054591272, + "grad_norm": 0.7417327165603638, + "learning_rate": 1.3765192157144757e-05, + "loss": 0.3492, + "step": 8271 + }, + { + "epoch": 0.7792562586844398, + "grad_norm": 0.6867636442184448, + "learning_rate": 1.3763793220156893e-05, + "loss": 0.3074, + "step": 8272 + }, + { + "epoch": 0.7793504627776077, + "grad_norm": 0.6547208428382874, + "learning_rate": 1.3762394197348727e-05, + "loss": 0.2581, + "step": 8273 + }, + { + "epoch": 0.7794446668707755, + "grad_norm": 0.7010359764099121, + "learning_rate": 1.3760995088752153e-05, + "loss": 0.2769, + "step": 8274 + }, + { + "epoch": 0.7795388709639434, + "grad_norm": 0.7501020431518555, + "learning_rate": 1.3759595894399078e-05, + "loss": 0.3148, + "step": 8275 + }, + { + "epoch": 0.7796330750571112, + "grad_norm": 0.6372538208961487, + "learning_rate": 1.3758196614321398e-05, + "loss": 0.2668, + "step": 8276 + }, + { + "epoch": 0.7797272791502791, + "grad_norm": 0.7269129753112793, + "learning_rate": 1.3756797248551026e-05, + "loss": 0.2931, + "step": 8277 + }, + { + "epoch": 0.7798214832434469, + "grad_norm": 0.6892969012260437, + "learning_rate": 1.3755397797119864e-05, + "loss": 0.3268, + "step": 8278 + }, + { + "epoch": 0.7799156873366148, + "grad_norm": 0.6834263801574707, + "learning_rate": 1.3753998260059825e-05, + "loss": 0.3344, + "step": 8279 + }, + { + "epoch": 0.7800098914297826, + "grad_norm": 0.683588445186615, + "learning_rate": 1.3752598637402824e-05, + "loss": 0.2749, + "step": 8280 + }, + { + "epoch": 0.7801040955229505, + "grad_norm": 0.6910967230796814, + "learning_rate": 1.3751198929180765e-05, + "loss": 0.3284, + "step": 8281 + }, + { + "epoch": 0.7801982996161183, + "grad_norm": 0.8028631806373596, + "learning_rate": 1.3749799135425573e-05, + "loss": 0.3242, + "step": 8282 + }, + { + "epoch": 0.7802925037092862, + "grad_norm": 0.7768567800521851, + "learning_rate": 1.3748399256169158e-05, + "loss": 0.3309, + "step": 8283 + }, + { + "epoch": 0.780386707802454, + "grad_norm": 0.7080867886543274, + "learning_rate": 1.3746999291443444e-05, + "loss": 0.3029, + "step": 8284 + }, + { + "epoch": 0.7804809118956219, + "grad_norm": 0.8380313515663147, + "learning_rate": 1.3745599241280352e-05, + "loss": 0.3342, + "step": 8285 + }, + { + "epoch": 0.7805751159887897, + "grad_norm": 0.8456419706344604, + "learning_rate": 1.3744199105711806e-05, + "loss": 0.3106, + "step": 8286 + }, + { + "epoch": 0.7806693200819576, + "grad_norm": 0.6792260408401489, + "learning_rate": 1.3742798884769727e-05, + "loss": 0.3044, + "step": 8287 + }, + { + "epoch": 0.7807635241751254, + "grad_norm": 0.6739451885223389, + "learning_rate": 1.3741398578486049e-05, + "loss": 0.3054, + "step": 8288 + }, + { + "epoch": 0.7808577282682932, + "grad_norm": 0.737349271774292, + "learning_rate": 1.3739998186892694e-05, + "loss": 0.299, + "step": 8289 + }, + { + "epoch": 0.7809519323614611, + "grad_norm": 0.7201666831970215, + "learning_rate": 1.3738597710021598e-05, + "loss": 0.3134, + "step": 8290 + }, + { + "epoch": 0.781046136454629, + "grad_norm": 0.698501467704773, + "learning_rate": 1.373719714790469e-05, + "loss": 0.3177, + "step": 8291 + }, + { + "epoch": 0.7811403405477968, + "grad_norm": 0.8547974228858948, + "learning_rate": 1.373579650057391e-05, + "loss": 0.3312, + "step": 8292 + }, + { + "epoch": 0.7812345446409646, + "grad_norm": 0.7456308007240295, + "learning_rate": 1.373439576806119e-05, + "loss": 0.3104, + "step": 8293 + }, + { + "epoch": 0.7813287487341325, + "grad_norm": 0.7419720888137817, + "learning_rate": 1.3732994950398472e-05, + "loss": 0.3395, + "step": 8294 + }, + { + "epoch": 0.7814229528273003, + "grad_norm": 0.7196950912475586, + "learning_rate": 1.3731594047617698e-05, + "loss": 0.2703, + "step": 8295 + }, + { + "epoch": 0.7815171569204682, + "grad_norm": 0.7468432188034058, + "learning_rate": 1.3730193059750806e-05, + "loss": 0.3043, + "step": 8296 + }, + { + "epoch": 0.781611361013636, + "grad_norm": 0.7866313457489014, + "learning_rate": 1.3728791986829743e-05, + "loss": 0.3058, + "step": 8297 + }, + { + "epoch": 0.7817055651068039, + "grad_norm": 0.6918613910675049, + "learning_rate": 1.372739082888646e-05, + "loss": 0.319, + "step": 8298 + }, + { + "epoch": 0.7817997691999717, + "grad_norm": 0.727741003036499, + "learning_rate": 1.3725989585952897e-05, + "loss": 0.2761, + "step": 8299 + }, + { + "epoch": 0.7818939732931396, + "grad_norm": 0.7165054678916931, + "learning_rate": 1.3724588258061013e-05, + "loss": 0.3093, + "step": 8300 + }, + { + "epoch": 0.7819881773863074, + "grad_norm": 0.7405712008476257, + "learning_rate": 1.3723186845242753e-05, + "loss": 0.3038, + "step": 8301 + }, + { + "epoch": 0.7820823814794753, + "grad_norm": 0.6232830286026001, + "learning_rate": 1.3721785347530077e-05, + "loss": 0.2679, + "step": 8302 + }, + { + "epoch": 0.7821765855726431, + "grad_norm": 1.404122233390808, + "learning_rate": 1.3720383764954938e-05, + "loss": 0.3374, + "step": 8303 + }, + { + "epoch": 0.782270789665811, + "grad_norm": 0.6597297787666321, + "learning_rate": 1.3718982097549296e-05, + "loss": 0.2817, + "step": 8304 + }, + { + "epoch": 0.7823649937589788, + "grad_norm": 0.6930121779441833, + "learning_rate": 1.3717580345345112e-05, + "loss": 0.2841, + "step": 8305 + }, + { + "epoch": 0.7824591978521467, + "grad_norm": 1.024834156036377, + "learning_rate": 1.3716178508374344e-05, + "loss": 0.3172, + "step": 8306 + }, + { + "epoch": 0.7825534019453145, + "grad_norm": 0.6708495020866394, + "learning_rate": 1.371477658666896e-05, + "loss": 0.2735, + "step": 8307 + }, + { + "epoch": 0.7826476060384824, + "grad_norm": 0.7648048400878906, + "learning_rate": 1.3713374580260927e-05, + "loss": 0.309, + "step": 8308 + }, + { + "epoch": 0.7827418101316502, + "grad_norm": 0.7775411009788513, + "learning_rate": 1.3711972489182208e-05, + "loss": 0.3023, + "step": 8309 + }, + { + "epoch": 0.7828360142248181, + "grad_norm": 0.9280102849006653, + "learning_rate": 1.3710570313464778e-05, + "loss": 0.2923, + "step": 8310 + }, + { + "epoch": 0.7829302183179859, + "grad_norm": 0.9284132122993469, + "learning_rate": 1.3709168053140604e-05, + "loss": 0.3423, + "step": 8311 + }, + { + "epoch": 0.7830244224111538, + "grad_norm": 0.7038367986679077, + "learning_rate": 1.3707765708241663e-05, + "loss": 0.2897, + "step": 8312 + }, + { + "epoch": 0.7831186265043216, + "grad_norm": 0.7105676531791687, + "learning_rate": 1.3706363278799931e-05, + "loss": 0.2623, + "step": 8313 + }, + { + "epoch": 0.7832128305974895, + "grad_norm": 0.9595640301704407, + "learning_rate": 1.370496076484738e-05, + "loss": 0.3136, + "step": 8314 + }, + { + "epoch": 0.7833070346906573, + "grad_norm": 0.7618606686592102, + "learning_rate": 1.3703558166415998e-05, + "loss": 0.3421, + "step": 8315 + }, + { + "epoch": 0.7834012387838252, + "grad_norm": 0.666176438331604, + "learning_rate": 1.370215548353776e-05, + "loss": 0.2788, + "step": 8316 + }, + { + "epoch": 0.783495442876993, + "grad_norm": 0.6845123171806335, + "learning_rate": 1.3700752716244651e-05, + "loss": 0.312, + "step": 8317 + }, + { + "epoch": 0.7835896469701609, + "grad_norm": 0.7459732890129089, + "learning_rate": 1.369934986456866e-05, + "loss": 0.3177, + "step": 8318 + }, + { + "epoch": 0.7836838510633287, + "grad_norm": 0.7005225419998169, + "learning_rate": 1.3697946928541768e-05, + "loss": 0.2864, + "step": 8319 + }, + { + "epoch": 0.7837780551564966, + "grad_norm": 0.6945435404777527, + "learning_rate": 1.369654390819597e-05, + "loss": 0.2965, + "step": 8320 + }, + { + "epoch": 0.7838722592496644, + "grad_norm": 0.7020549178123474, + "learning_rate": 1.369514080356325e-05, + "loss": 0.3204, + "step": 8321 + }, + { + "epoch": 0.7839664633428323, + "grad_norm": 0.7289341688156128, + "learning_rate": 1.3693737614675608e-05, + "loss": 0.3118, + "step": 8322 + }, + { + "epoch": 0.7840606674360001, + "grad_norm": 0.7745868563652039, + "learning_rate": 1.3692334341565037e-05, + "loss": 0.2994, + "step": 8323 + }, + { + "epoch": 0.784154871529168, + "grad_norm": 0.7368453145027161, + "learning_rate": 1.3690930984263528e-05, + "loss": 0.3034, + "step": 8324 + }, + { + "epoch": 0.7842490756223358, + "grad_norm": 0.685530424118042, + "learning_rate": 1.3689527542803087e-05, + "loss": 0.2837, + "step": 8325 + }, + { + "epoch": 0.7843432797155037, + "grad_norm": 0.8169264793395996, + "learning_rate": 1.3688124017215714e-05, + "loss": 0.3502, + "step": 8326 + }, + { + "epoch": 0.7844374838086715, + "grad_norm": 0.9533312916755676, + "learning_rate": 1.3686720407533404e-05, + "loss": 0.35, + "step": 8327 + }, + { + "epoch": 0.7845316879018394, + "grad_norm": 0.721153199672699, + "learning_rate": 1.3685316713788174e-05, + "loss": 0.2947, + "step": 8328 + }, + { + "epoch": 0.7846258919950072, + "grad_norm": 0.7245532870292664, + "learning_rate": 1.3683912936012021e-05, + "loss": 0.3427, + "step": 8329 + }, + { + "epoch": 0.7847200960881751, + "grad_norm": 0.684908926486969, + "learning_rate": 1.3682509074236954e-05, + "loss": 0.302, + "step": 8330 + }, + { + "epoch": 0.7848143001813429, + "grad_norm": 0.7864391207695007, + "learning_rate": 1.3681105128494987e-05, + "loss": 0.3273, + "step": 8331 + }, + { + "epoch": 0.7849085042745108, + "grad_norm": 0.6541042923927307, + "learning_rate": 1.3679701098818128e-05, + "loss": 0.2963, + "step": 8332 + }, + { + "epoch": 0.7850027083676786, + "grad_norm": 0.6770807504653931, + "learning_rate": 1.3678296985238395e-05, + "loss": 0.3088, + "step": 8333 + }, + { + "epoch": 0.7850969124608465, + "grad_norm": 0.823836088180542, + "learning_rate": 1.3676892787787801e-05, + "loss": 0.2892, + "step": 8334 + }, + { + "epoch": 0.7851911165540143, + "grad_norm": 0.6440860629081726, + "learning_rate": 1.3675488506498367e-05, + "loss": 0.2827, + "step": 8335 + }, + { + "epoch": 0.7852853206471822, + "grad_norm": 0.6993657946586609, + "learning_rate": 1.3674084141402108e-05, + "loss": 0.3128, + "step": 8336 + }, + { + "epoch": 0.78537952474035, + "grad_norm": 0.7165569067001343, + "learning_rate": 1.367267969253105e-05, + "loss": 0.3159, + "step": 8337 + }, + { + "epoch": 0.7854737288335178, + "grad_norm": 0.7132493257522583, + "learning_rate": 1.3671275159917214e-05, + "loss": 0.3267, + "step": 8338 + }, + { + "epoch": 0.7855679329266857, + "grad_norm": 0.6765950322151184, + "learning_rate": 1.3669870543592629e-05, + "loss": 0.2855, + "step": 8339 + }, + { + "epoch": 0.7856621370198535, + "grad_norm": 0.6541544795036316, + "learning_rate": 1.3668465843589318e-05, + "loss": 0.298, + "step": 8340 + }, + { + "epoch": 0.7857563411130214, + "grad_norm": 0.836005449295044, + "learning_rate": 1.3667061059939312e-05, + "loss": 0.3522, + "step": 8341 + }, + { + "epoch": 0.7858505452061892, + "grad_norm": 0.8120798468589783, + "learning_rate": 1.3665656192674645e-05, + "loss": 0.3356, + "step": 8342 + }, + { + "epoch": 0.7859447492993571, + "grad_norm": 0.6587315201759338, + "learning_rate": 1.3664251241827347e-05, + "loss": 0.2633, + "step": 8343 + }, + { + "epoch": 0.7860389533925249, + "grad_norm": 0.7285856008529663, + "learning_rate": 1.3662846207429453e-05, + "loss": 0.3178, + "step": 8344 + }, + { + "epoch": 0.7861331574856928, + "grad_norm": 0.8499427437782288, + "learning_rate": 1.3661441089512998e-05, + "loss": 0.2918, + "step": 8345 + }, + { + "epoch": 0.7862273615788606, + "grad_norm": 0.6641467809677124, + "learning_rate": 1.3660035888110027e-05, + "loss": 0.2999, + "step": 8346 + }, + { + "epoch": 0.7863215656720285, + "grad_norm": 0.8487431406974792, + "learning_rate": 1.3658630603252578e-05, + "loss": 0.3024, + "step": 8347 + }, + { + "epoch": 0.7864157697651963, + "grad_norm": 0.7532932162284851, + "learning_rate": 1.3657225234972695e-05, + "loss": 0.3307, + "step": 8348 + }, + { + "epoch": 0.7865099738583642, + "grad_norm": 0.7543956637382507, + "learning_rate": 1.3655819783302415e-05, + "loss": 0.3076, + "step": 8349 + }, + { + "epoch": 0.786604177951532, + "grad_norm": 0.7056576609611511, + "learning_rate": 1.3654414248273792e-05, + "loss": 0.2803, + "step": 8350 + }, + { + "epoch": 0.7866983820446999, + "grad_norm": 0.6470280289649963, + "learning_rate": 1.3653008629918875e-05, + "loss": 0.3051, + "step": 8351 + }, + { + "epoch": 0.7867925861378677, + "grad_norm": 0.6129191517829895, + "learning_rate": 1.365160292826971e-05, + "loss": 0.2892, + "step": 8352 + }, + { + "epoch": 0.7868867902310356, + "grad_norm": 0.7020622491836548, + "learning_rate": 1.365019714335835e-05, + "loss": 0.2817, + "step": 8353 + }, + { + "epoch": 0.7869809943242034, + "grad_norm": 0.710928201675415, + "learning_rate": 1.364879127521685e-05, + "loss": 0.3287, + "step": 8354 + }, + { + "epoch": 0.7870751984173713, + "grad_norm": 0.698966920375824, + "learning_rate": 1.3647385323877269e-05, + "loss": 0.2689, + "step": 8355 + }, + { + "epoch": 0.7871694025105391, + "grad_norm": 0.731090784072876, + "learning_rate": 1.3645979289371658e-05, + "loss": 0.2874, + "step": 8356 + }, + { + "epoch": 0.787263606603707, + "grad_norm": 0.7392199635505676, + "learning_rate": 1.3644573171732082e-05, + "loss": 0.3158, + "step": 8357 + }, + { + "epoch": 0.7873578106968748, + "grad_norm": 0.6243274211883545, + "learning_rate": 1.3643166970990604e-05, + "loss": 0.2703, + "step": 8358 + }, + { + "epoch": 0.7874520147900427, + "grad_norm": 0.7213075757026672, + "learning_rate": 1.3641760687179281e-05, + "loss": 0.3139, + "step": 8359 + }, + { + "epoch": 0.7875462188832105, + "grad_norm": 0.8094550371170044, + "learning_rate": 1.3640354320330185e-05, + "loss": 0.3348, + "step": 8360 + }, + { + "epoch": 0.7876404229763784, + "grad_norm": 0.7065017819404602, + "learning_rate": 1.3638947870475376e-05, + "loss": 0.2956, + "step": 8361 + }, + { + "epoch": 0.7877346270695462, + "grad_norm": 0.7342098355293274, + "learning_rate": 1.3637541337646933e-05, + "loss": 0.3074, + "step": 8362 + }, + { + "epoch": 0.7878288311627141, + "grad_norm": 0.7462935447692871, + "learning_rate": 1.3636134721876922e-05, + "loss": 0.3157, + "step": 8363 + }, + { + "epoch": 0.7879230352558819, + "grad_norm": 0.7283497452735901, + "learning_rate": 1.3634728023197412e-05, + "loss": 0.3114, + "step": 8364 + }, + { + "epoch": 0.7880172393490498, + "grad_norm": 0.7347304224967957, + "learning_rate": 1.3633321241640485e-05, + "loss": 0.3235, + "step": 8365 + }, + { + "epoch": 0.7881114434422176, + "grad_norm": 0.7423493266105652, + "learning_rate": 1.3631914377238213e-05, + "loss": 0.3015, + "step": 8366 + }, + { + "epoch": 0.7882056475353855, + "grad_norm": 0.719247043132782, + "learning_rate": 1.3630507430022674e-05, + "loss": 0.3318, + "step": 8367 + }, + { + "epoch": 0.7882998516285532, + "grad_norm": 0.6581484079360962, + "learning_rate": 1.3629100400025956e-05, + "loss": 0.2722, + "step": 8368 + }, + { + "epoch": 0.788394055721721, + "grad_norm": 0.722379744052887, + "learning_rate": 1.3627693287280132e-05, + "loss": 0.2994, + "step": 8369 + }, + { + "epoch": 0.7884882598148889, + "grad_norm": 0.6473997235298157, + "learning_rate": 1.3626286091817293e-05, + "loss": 0.2885, + "step": 8370 + }, + { + "epoch": 0.7885824639080568, + "grad_norm": 0.8524295687675476, + "learning_rate": 1.3624878813669524e-05, + "loss": 0.3142, + "step": 8371 + }, + { + "epoch": 0.7886766680012246, + "grad_norm": 0.6791268587112427, + "learning_rate": 1.3623471452868908e-05, + "loss": 0.3375, + "step": 8372 + }, + { + "epoch": 0.7887708720943925, + "grad_norm": 0.805366575717926, + "learning_rate": 1.3622064009447544e-05, + "loss": 0.3437, + "step": 8373 + }, + { + "epoch": 0.7888650761875603, + "grad_norm": 0.7245519757270813, + "learning_rate": 1.3620656483437518e-05, + "loss": 0.339, + "step": 8374 + }, + { + "epoch": 0.7889592802807281, + "grad_norm": 0.7727537155151367, + "learning_rate": 1.3619248874870924e-05, + "loss": 0.3093, + "step": 8375 + }, + { + "epoch": 0.789053484373896, + "grad_norm": 0.8279337286949158, + "learning_rate": 1.3617841183779853e-05, + "loss": 0.3432, + "step": 8376 + }, + { + "epoch": 0.7891476884670638, + "grad_norm": 0.7298142910003662, + "learning_rate": 1.3616433410196414e-05, + "loss": 0.3138, + "step": 8377 + }, + { + "epoch": 0.7892418925602317, + "grad_norm": 0.6630862951278687, + "learning_rate": 1.36150255541527e-05, + "loss": 0.2768, + "step": 8378 + }, + { + "epoch": 0.7893360966533995, + "grad_norm": 0.645504891872406, + "learning_rate": 1.3613617615680812e-05, + "loss": 0.293, + "step": 8379 + }, + { + "epoch": 0.7894303007465674, + "grad_norm": 0.6487581133842468, + "learning_rate": 1.3612209594812856e-05, + "loss": 0.287, + "step": 8380 + }, + { + "epoch": 0.7895245048397352, + "grad_norm": 0.8455442786216736, + "learning_rate": 1.3610801491580929e-05, + "loss": 0.3331, + "step": 8381 + }, + { + "epoch": 0.7896187089329031, + "grad_norm": 0.697076141834259, + "learning_rate": 1.3609393306017149e-05, + "loss": 0.3141, + "step": 8382 + }, + { + "epoch": 0.7897129130260709, + "grad_norm": 0.724602222442627, + "learning_rate": 1.3607985038153616e-05, + "loss": 0.2976, + "step": 8383 + }, + { + "epoch": 0.7898071171192388, + "grad_norm": 0.7992532849311829, + "learning_rate": 1.3606576688022446e-05, + "loss": 0.3487, + "step": 8384 + }, + { + "epoch": 0.7899013212124066, + "grad_norm": 0.6527953743934631, + "learning_rate": 1.3605168255655752e-05, + "loss": 0.3096, + "step": 8385 + }, + { + "epoch": 0.7899955253055745, + "grad_norm": 0.8105137944221497, + "learning_rate": 1.3603759741085642e-05, + "loss": 0.3776, + "step": 8386 + }, + { + "epoch": 0.7900897293987423, + "grad_norm": 0.7063913941383362, + "learning_rate": 1.360235114434424e-05, + "loss": 0.3311, + "step": 8387 + }, + { + "epoch": 0.7901839334919102, + "grad_norm": 0.7864614725112915, + "learning_rate": 1.360094246546366e-05, + "loss": 0.3316, + "step": 8388 + }, + { + "epoch": 0.790278137585078, + "grad_norm": 0.7231094837188721, + "learning_rate": 1.359953370447602e-05, + "loss": 0.3261, + "step": 8389 + }, + { + "epoch": 0.7903723416782459, + "grad_norm": 0.9381780028343201, + "learning_rate": 1.359812486141345e-05, + "loss": 0.321, + "step": 8390 + }, + { + "epoch": 0.7904665457714137, + "grad_norm": 1.0008454322814941, + "learning_rate": 1.3596715936308064e-05, + "loss": 0.3944, + "step": 8391 + }, + { + "epoch": 0.7905607498645816, + "grad_norm": 0.8007410764694214, + "learning_rate": 1.3595306929191994e-05, + "loss": 0.2822, + "step": 8392 + }, + { + "epoch": 0.7906549539577494, + "grad_norm": 0.9686805009841919, + "learning_rate": 1.3593897840097366e-05, + "loss": 0.3272, + "step": 8393 + }, + { + "epoch": 0.7907491580509173, + "grad_norm": 0.6351611018180847, + "learning_rate": 1.359248866905631e-05, + "loss": 0.3036, + "step": 8394 + }, + { + "epoch": 0.7908433621440851, + "grad_norm": 0.7274337410926819, + "learning_rate": 1.3591079416100955e-05, + "loss": 0.3087, + "step": 8395 + }, + { + "epoch": 0.790937566237253, + "grad_norm": 0.7062101364135742, + "learning_rate": 1.3589670081263432e-05, + "loss": 0.292, + "step": 8396 + }, + { + "epoch": 0.7910317703304208, + "grad_norm": 0.6788089871406555, + "learning_rate": 1.3588260664575885e-05, + "loss": 0.3035, + "step": 8397 + }, + { + "epoch": 0.7911259744235887, + "grad_norm": 0.6554300785064697, + "learning_rate": 1.3586851166070445e-05, + "loss": 0.2749, + "step": 8398 + }, + { + "epoch": 0.7912201785167565, + "grad_norm": 0.7202410697937012, + "learning_rate": 1.358544158577925e-05, + "loss": 0.3206, + "step": 8399 + }, + { + "epoch": 0.7913143826099244, + "grad_norm": 0.7300735712051392, + "learning_rate": 1.3584031923734443e-05, + "loss": 0.304, + "step": 8400 + }, + { + "epoch": 0.7914085867030922, + "grad_norm": 0.8400070667266846, + "learning_rate": 1.3582622179968161e-05, + "loss": 0.3198, + "step": 8401 + }, + { + "epoch": 0.7915027907962601, + "grad_norm": 0.7179476618766785, + "learning_rate": 1.3581212354512558e-05, + "loss": 0.2923, + "step": 8402 + }, + { + "epoch": 0.7915969948894279, + "grad_norm": 0.7091476917266846, + "learning_rate": 1.3579802447399773e-05, + "loss": 0.3092, + "step": 8403 + }, + { + "epoch": 0.7916911989825958, + "grad_norm": 0.7615411281585693, + "learning_rate": 1.3578392458661956e-05, + "loss": 0.3169, + "step": 8404 + }, + { + "epoch": 0.7917854030757636, + "grad_norm": 0.5997017621994019, + "learning_rate": 1.3576982388331258e-05, + "loss": 0.2648, + "step": 8405 + }, + { + "epoch": 0.7918796071689315, + "grad_norm": 0.7050553560256958, + "learning_rate": 1.3575572236439828e-05, + "loss": 0.3208, + "step": 8406 + }, + { + "epoch": 0.7919738112620993, + "grad_norm": 0.8503166437149048, + "learning_rate": 1.3574162003019819e-05, + "loss": 0.3104, + "step": 8407 + }, + { + "epoch": 0.7920680153552672, + "grad_norm": 0.6925212144851685, + "learning_rate": 1.3572751688103394e-05, + "loss": 0.303, + "step": 8408 + }, + { + "epoch": 0.792162219448435, + "grad_norm": 0.7317957282066345, + "learning_rate": 1.3571341291722701e-05, + "loss": 0.3057, + "step": 8409 + }, + { + "epoch": 0.7922564235416029, + "grad_norm": 0.7509272694587708, + "learning_rate": 1.3569930813909904e-05, + "loss": 0.2666, + "step": 8410 + }, + { + "epoch": 0.7923506276347707, + "grad_norm": 0.7998015880584717, + "learning_rate": 1.3568520254697166e-05, + "loss": 0.3611, + "step": 8411 + }, + { + "epoch": 0.7924448317279386, + "grad_norm": 0.77497398853302, + "learning_rate": 1.3567109614116643e-05, + "loss": 0.2746, + "step": 8412 + }, + { + "epoch": 0.7925390358211064, + "grad_norm": 0.7679684162139893, + "learning_rate": 1.3565698892200507e-05, + "loss": 0.311, + "step": 8413 + }, + { + "epoch": 0.7926332399142743, + "grad_norm": 0.8640990853309631, + "learning_rate": 1.3564288088980923e-05, + "loss": 0.3185, + "step": 8414 + }, + { + "epoch": 0.7927274440074421, + "grad_norm": 0.6967403292655945, + "learning_rate": 1.3562877204490058e-05, + "loss": 0.2685, + "step": 8415 + }, + { + "epoch": 0.79282164810061, + "grad_norm": 0.7341375350952148, + "learning_rate": 1.356146623876008e-05, + "loss": 0.3051, + "step": 8416 + }, + { + "epoch": 0.7929158521937778, + "grad_norm": 0.706219494342804, + "learning_rate": 1.3560055191823165e-05, + "loss": 0.2888, + "step": 8417 + }, + { + "epoch": 0.7930100562869457, + "grad_norm": 0.7508575320243835, + "learning_rate": 1.3558644063711489e-05, + "loss": 0.2669, + "step": 8418 + }, + { + "epoch": 0.7931042603801135, + "grad_norm": 0.7251358032226562, + "learning_rate": 1.355723285445722e-05, + "loss": 0.3131, + "step": 8419 + }, + { + "epoch": 0.7931984644732814, + "grad_norm": 0.667508065700531, + "learning_rate": 1.3555821564092544e-05, + "loss": 0.2753, + "step": 8420 + }, + { + "epoch": 0.7932926685664492, + "grad_norm": 0.7205508351325989, + "learning_rate": 1.3554410192649634e-05, + "loss": 0.2861, + "step": 8421 + }, + { + "epoch": 0.793386872659617, + "grad_norm": 0.7302700877189636, + "learning_rate": 1.3552998740160676e-05, + "loss": 0.297, + "step": 8422 + }, + { + "epoch": 0.7934810767527849, + "grad_norm": 0.9575607180595398, + "learning_rate": 1.3551587206657855e-05, + "loss": 0.2917, + "step": 8423 + }, + { + "epoch": 0.7935752808459527, + "grad_norm": 0.7603209614753723, + "learning_rate": 1.3550175592173347e-05, + "loss": 0.2802, + "step": 8424 + }, + { + "epoch": 0.7936694849391206, + "grad_norm": 0.7659693360328674, + "learning_rate": 1.3548763896739351e-05, + "loss": 0.2844, + "step": 8425 + }, + { + "epoch": 0.7937636890322884, + "grad_norm": 0.8542771339416504, + "learning_rate": 1.3547352120388046e-05, + "loss": 0.3169, + "step": 8426 + }, + { + "epoch": 0.7938578931254563, + "grad_norm": 0.9261520504951477, + "learning_rate": 1.3545940263151627e-05, + "loss": 0.3304, + "step": 8427 + }, + { + "epoch": 0.7939520972186241, + "grad_norm": 0.885272741317749, + "learning_rate": 1.3544528325062289e-05, + "loss": 0.3193, + "step": 8428 + }, + { + "epoch": 0.794046301311792, + "grad_norm": 0.749832272529602, + "learning_rate": 1.354311630615222e-05, + "loss": 0.2886, + "step": 8429 + }, + { + "epoch": 0.7941405054049598, + "grad_norm": 0.8688532710075378, + "learning_rate": 1.354170420645362e-05, + "loss": 0.3752, + "step": 8430 + }, + { + "epoch": 0.7942347094981277, + "grad_norm": 0.7473974227905273, + "learning_rate": 1.354029202599869e-05, + "loss": 0.3138, + "step": 8431 + }, + { + "epoch": 0.7943289135912955, + "grad_norm": 0.6640375256538391, + "learning_rate": 1.3538879764819624e-05, + "loss": 0.3087, + "step": 8432 + }, + { + "epoch": 0.7944231176844634, + "grad_norm": 0.7127041816711426, + "learning_rate": 1.3537467422948626e-05, + "loss": 0.3381, + "step": 8433 + }, + { + "epoch": 0.7945173217776312, + "grad_norm": 0.7646892666816711, + "learning_rate": 1.3536055000417903e-05, + "loss": 0.338, + "step": 8434 + }, + { + "epoch": 0.7946115258707991, + "grad_norm": 0.6835283637046814, + "learning_rate": 1.3534642497259656e-05, + "loss": 0.3147, + "step": 8435 + }, + { + "epoch": 0.7947057299639669, + "grad_norm": 0.7504387497901917, + "learning_rate": 1.353322991350609e-05, + "loss": 0.3455, + "step": 8436 + }, + { + "epoch": 0.7947999340571348, + "grad_norm": 0.8048112392425537, + "learning_rate": 1.353181724918942e-05, + "loss": 0.3455, + "step": 8437 + }, + { + "epoch": 0.7948941381503026, + "grad_norm": 1.639927625656128, + "learning_rate": 1.3530404504341856e-05, + "loss": 0.3101, + "step": 8438 + }, + { + "epoch": 0.7949883422434705, + "grad_norm": 0.9135071635246277, + "learning_rate": 1.352899167899561e-05, + "loss": 0.3383, + "step": 8439 + }, + { + "epoch": 0.7950825463366383, + "grad_norm": 0.8039232492446899, + "learning_rate": 1.3527578773182895e-05, + "loss": 0.335, + "step": 8440 + }, + { + "epoch": 0.7951767504298062, + "grad_norm": 0.626367449760437, + "learning_rate": 1.3526165786935926e-05, + "loss": 0.2672, + "step": 8441 + }, + { + "epoch": 0.795270954522974, + "grad_norm": 0.8004292249679565, + "learning_rate": 1.3524752720286927e-05, + "loss": 0.296, + "step": 8442 + }, + { + "epoch": 0.7953651586161419, + "grad_norm": 0.7011231780052185, + "learning_rate": 1.3523339573268116e-05, + "loss": 0.3, + "step": 8443 + }, + { + "epoch": 0.7954593627093097, + "grad_norm": 0.6995598077774048, + "learning_rate": 1.352192634591171e-05, + "loss": 0.3172, + "step": 8444 + }, + { + "epoch": 0.7955535668024776, + "grad_norm": 0.897178590297699, + "learning_rate": 1.3520513038249939e-05, + "loss": 0.3036, + "step": 8445 + }, + { + "epoch": 0.7956477708956454, + "grad_norm": 0.6888857483863831, + "learning_rate": 1.3519099650315023e-05, + "loss": 0.3188, + "step": 8446 + }, + { + "epoch": 0.7957419749888133, + "grad_norm": 0.7783874273300171, + "learning_rate": 1.3517686182139193e-05, + "loss": 0.2787, + "step": 8447 + }, + { + "epoch": 0.7958361790819811, + "grad_norm": 0.7472442984580994, + "learning_rate": 1.3516272633754684e-05, + "loss": 0.3276, + "step": 8448 + }, + { + "epoch": 0.795930383175149, + "grad_norm": 0.8034288287162781, + "learning_rate": 1.3514859005193714e-05, + "loss": 0.3242, + "step": 8449 + }, + { + "epoch": 0.7960245872683168, + "grad_norm": 0.6877335906028748, + "learning_rate": 1.3513445296488525e-05, + "loss": 0.3051, + "step": 8450 + }, + { + "epoch": 0.7961187913614847, + "grad_norm": 0.7368878126144409, + "learning_rate": 1.351203150767135e-05, + "loss": 0.317, + "step": 8451 + }, + { + "epoch": 0.7962129954546525, + "grad_norm": 0.8141829967498779, + "learning_rate": 1.3510617638774424e-05, + "loss": 0.3725, + "step": 8452 + }, + { + "epoch": 0.7963071995478204, + "grad_norm": 0.7533358335494995, + "learning_rate": 1.3509203689829988e-05, + "loss": 0.2992, + "step": 8453 + }, + { + "epoch": 0.7964014036409882, + "grad_norm": 0.7177354693412781, + "learning_rate": 1.3507789660870282e-05, + "loss": 0.3515, + "step": 8454 + }, + { + "epoch": 0.7964956077341561, + "grad_norm": 1.0493124723434448, + "learning_rate": 1.3506375551927546e-05, + "loss": 0.3108, + "step": 8455 + }, + { + "epoch": 0.7965898118273239, + "grad_norm": 0.8071884512901306, + "learning_rate": 1.3504961363034024e-05, + "loss": 0.334, + "step": 8456 + }, + { + "epoch": 0.7966840159204918, + "grad_norm": 0.7486411333084106, + "learning_rate": 1.350354709422196e-05, + "loss": 0.3259, + "step": 8457 + }, + { + "epoch": 0.7967782200136596, + "grad_norm": 0.659052312374115, + "learning_rate": 1.3502132745523609e-05, + "loss": 0.3128, + "step": 8458 + }, + { + "epoch": 0.7968724241068275, + "grad_norm": 0.7426510453224182, + "learning_rate": 1.3500718316971213e-05, + "loss": 0.2573, + "step": 8459 + }, + { + "epoch": 0.7969666281999953, + "grad_norm": 0.7538797855377197, + "learning_rate": 1.3499303808597024e-05, + "loss": 0.3087, + "step": 8460 + }, + { + "epoch": 0.7970608322931632, + "grad_norm": 0.8201055526733398, + "learning_rate": 1.3497889220433296e-05, + "loss": 0.3069, + "step": 8461 + }, + { + "epoch": 0.797155036386331, + "grad_norm": 0.7548934817314148, + "learning_rate": 1.3496474552512286e-05, + "loss": 0.2948, + "step": 8462 + }, + { + "epoch": 0.7972492404794989, + "grad_norm": 0.7931002974510193, + "learning_rate": 1.3495059804866251e-05, + "loss": 0.3261, + "step": 8463 + }, + { + "epoch": 0.7973434445726667, + "grad_norm": 0.7292978763580322, + "learning_rate": 1.349364497752744e-05, + "loss": 0.3252, + "step": 8464 + }, + { + "epoch": 0.7974376486658346, + "grad_norm": 0.6160940527915955, + "learning_rate": 1.349223007052813e-05, + "loss": 0.3014, + "step": 8465 + }, + { + "epoch": 0.7975318527590024, + "grad_norm": 1.3209818601608276, + "learning_rate": 1.3490815083900566e-05, + "loss": 0.3094, + "step": 8466 + }, + { + "epoch": 0.7976260568521703, + "grad_norm": 0.7098581790924072, + "learning_rate": 1.3489400017677022e-05, + "loss": 0.2818, + "step": 8467 + }, + { + "epoch": 0.7977202609453381, + "grad_norm": 0.6893822550773621, + "learning_rate": 1.3487984871889763e-05, + "loss": 0.3562, + "step": 8468 + }, + { + "epoch": 0.797814465038506, + "grad_norm": 0.7777463793754578, + "learning_rate": 1.3486569646571051e-05, + "loss": 0.3225, + "step": 8469 + }, + { + "epoch": 0.7979086691316738, + "grad_norm": 0.7464541792869568, + "learning_rate": 1.3485154341753161e-05, + "loss": 0.3276, + "step": 8470 + }, + { + "epoch": 0.7980028732248416, + "grad_norm": 0.675532877445221, + "learning_rate": 1.3483738957468364e-05, + "loss": 0.2811, + "step": 8471 + }, + { + "epoch": 0.7980970773180095, + "grad_norm": 0.8053826093673706, + "learning_rate": 1.3482323493748926e-05, + "loss": 0.3372, + "step": 8472 + }, + { + "epoch": 0.7981912814111773, + "grad_norm": 0.7255682945251465, + "learning_rate": 1.348090795062713e-05, + "loss": 0.2961, + "step": 8473 + }, + { + "epoch": 0.7982854855043452, + "grad_norm": 0.9220722317695618, + "learning_rate": 1.3479492328135251e-05, + "loss": 0.2961, + "step": 8474 + }, + { + "epoch": 0.798379689597513, + "grad_norm": 0.7277863025665283, + "learning_rate": 1.3478076626305563e-05, + "loss": 0.2953, + "step": 8475 + }, + { + "epoch": 0.7984738936906809, + "grad_norm": 0.9244139790534973, + "learning_rate": 1.347666084517035e-05, + "loss": 0.3297, + "step": 8476 + }, + { + "epoch": 0.7985680977838487, + "grad_norm": 0.6993528008460999, + "learning_rate": 1.3475244984761893e-05, + "loss": 0.3188, + "step": 8477 + }, + { + "epoch": 0.7986623018770166, + "grad_norm": 0.7611246705055237, + "learning_rate": 1.3473829045112476e-05, + "loss": 0.2533, + "step": 8478 + }, + { + "epoch": 0.7987565059701844, + "grad_norm": 0.7787326574325562, + "learning_rate": 1.3472413026254385e-05, + "loss": 0.3388, + "step": 8479 + }, + { + "epoch": 0.7988507100633523, + "grad_norm": 0.7947128415107727, + "learning_rate": 1.3470996928219906e-05, + "loss": 0.3487, + "step": 8480 + }, + { + "epoch": 0.7989449141565201, + "grad_norm": 0.881558895111084, + "learning_rate": 1.3469580751041327e-05, + "loss": 0.356, + "step": 8481 + }, + { + "epoch": 0.799039118249688, + "grad_norm": 0.7980902791023254, + "learning_rate": 1.3468164494750944e-05, + "loss": 0.3097, + "step": 8482 + }, + { + "epoch": 0.7991333223428558, + "grad_norm": 0.6231949925422668, + "learning_rate": 1.3466748159381047e-05, + "loss": 0.2444, + "step": 8483 + }, + { + "epoch": 0.7992275264360237, + "grad_norm": 0.7653826475143433, + "learning_rate": 1.3465331744963928e-05, + "loss": 0.3333, + "step": 8484 + }, + { + "epoch": 0.7993217305291915, + "grad_norm": 0.7707959413528442, + "learning_rate": 1.346391525153189e-05, + "loss": 0.2909, + "step": 8485 + }, + { + "epoch": 0.7994159346223594, + "grad_norm": 0.8047236204147339, + "learning_rate": 1.3462498679117224e-05, + "loss": 0.3081, + "step": 8486 + }, + { + "epoch": 0.7995101387155272, + "grad_norm": 0.7369197010993958, + "learning_rate": 1.346108202775223e-05, + "loss": 0.3547, + "step": 8487 + }, + { + "epoch": 0.7996043428086951, + "grad_norm": 0.7479953169822693, + "learning_rate": 1.3459665297469221e-05, + "loss": 0.3355, + "step": 8488 + }, + { + "epoch": 0.7996985469018629, + "grad_norm": 0.8746562004089355, + "learning_rate": 1.3458248488300487e-05, + "loss": 0.3311, + "step": 8489 + }, + { + "epoch": 0.7997927509950308, + "grad_norm": 0.7730000615119934, + "learning_rate": 1.3456831600278341e-05, + "loss": 0.2851, + "step": 8490 + }, + { + "epoch": 0.7998869550881986, + "grad_norm": 0.6304970979690552, + "learning_rate": 1.3455414633435089e-05, + "loss": 0.2694, + "step": 8491 + }, + { + "epoch": 0.7999811591813665, + "grad_norm": 0.7707965970039368, + "learning_rate": 1.3453997587803039e-05, + "loss": 0.297, + "step": 8492 + }, + { + "epoch": 0.8000753632745343, + "grad_norm": 0.6389234066009521, + "learning_rate": 1.3452580463414501e-05, + "loss": 0.2608, + "step": 8493 + }, + { + "epoch": 0.8001695673677022, + "grad_norm": 0.8090130090713501, + "learning_rate": 1.3451163260301791e-05, + "loss": 0.331, + "step": 8494 + }, + { + "epoch": 0.80026377146087, + "grad_norm": 0.8407273292541504, + "learning_rate": 1.344974597849722e-05, + "loss": 0.3259, + "step": 8495 + }, + { + "epoch": 0.8003579755540379, + "grad_norm": 0.8513500690460205, + "learning_rate": 1.3448328618033109e-05, + "loss": 0.2912, + "step": 8496 + }, + { + "epoch": 0.8004521796472057, + "grad_norm": 0.7660210728645325, + "learning_rate": 1.3446911178941766e-05, + "loss": 0.3085, + "step": 8497 + }, + { + "epoch": 0.8005463837403736, + "grad_norm": 0.725410521030426, + "learning_rate": 1.3445493661255523e-05, + "loss": 0.2573, + "step": 8498 + }, + { + "epoch": 0.8006405878335414, + "grad_norm": 0.8393821120262146, + "learning_rate": 1.3444076065006692e-05, + "loss": 0.3604, + "step": 8499 + }, + { + "epoch": 0.8007347919267093, + "grad_norm": 0.7441098093986511, + "learning_rate": 1.3442658390227604e-05, + "loss": 0.3247, + "step": 8500 + }, + { + "epoch": 0.8008289960198771, + "grad_norm": 0.7958167195320129, + "learning_rate": 1.3441240636950577e-05, + "loss": 0.356, + "step": 8501 + }, + { + "epoch": 0.800923200113045, + "grad_norm": 0.7425439357757568, + "learning_rate": 1.3439822805207942e-05, + "loss": 0.2755, + "step": 8502 + }, + { + "epoch": 0.8010174042062128, + "grad_norm": 1.0281422138214111, + "learning_rate": 1.3438404895032032e-05, + "loss": 0.3532, + "step": 8503 + }, + { + "epoch": 0.8011116082993807, + "grad_norm": 0.8411215543746948, + "learning_rate": 1.3436986906455167e-05, + "loss": 0.3284, + "step": 8504 + }, + { + "epoch": 0.8012058123925485, + "grad_norm": 0.7676801681518555, + "learning_rate": 1.343556883950969e-05, + "loss": 0.2912, + "step": 8505 + }, + { + "epoch": 0.8013000164857164, + "grad_norm": 0.7142693400382996, + "learning_rate": 1.3434150694227925e-05, + "loss": 0.3041, + "step": 8506 + }, + { + "epoch": 0.8013942205788841, + "grad_norm": 0.7693013548851013, + "learning_rate": 1.3432732470642216e-05, + "loss": 0.3037, + "step": 8507 + }, + { + "epoch": 0.801488424672052, + "grad_norm": 0.6982400417327881, + "learning_rate": 1.3431314168784899e-05, + "loss": 0.2771, + "step": 8508 + }, + { + "epoch": 0.8015826287652198, + "grad_norm": 0.8211315274238586, + "learning_rate": 1.342989578868831e-05, + "loss": 0.3382, + "step": 8509 + }, + { + "epoch": 0.8016768328583876, + "grad_norm": 0.6667990684509277, + "learning_rate": 1.3428477330384792e-05, + "loss": 0.2864, + "step": 8510 + }, + { + "epoch": 0.8017710369515555, + "grad_norm": 0.8006913065910339, + "learning_rate": 1.3427058793906693e-05, + "loss": 0.3313, + "step": 8511 + }, + { + "epoch": 0.8018652410447233, + "grad_norm": 0.7486048936843872, + "learning_rate": 1.3425640179286348e-05, + "loss": 0.3317, + "step": 8512 + }, + { + "epoch": 0.8019594451378912, + "grad_norm": 0.7044640183448792, + "learning_rate": 1.3424221486556113e-05, + "loss": 0.3234, + "step": 8513 + }, + { + "epoch": 0.802053649231059, + "grad_norm": 0.6485394835472107, + "learning_rate": 1.3422802715748331e-05, + "loss": 0.3069, + "step": 8514 + }, + { + "epoch": 0.8021478533242269, + "grad_norm": 0.575886607170105, + "learning_rate": 1.3421383866895355e-05, + "loss": 0.2625, + "step": 8515 + }, + { + "epoch": 0.8022420574173947, + "grad_norm": 0.6978061199188232, + "learning_rate": 1.3419964940029533e-05, + "loss": 0.2913, + "step": 8516 + }, + { + "epoch": 0.8023362615105626, + "grad_norm": 1.1715340614318848, + "learning_rate": 1.3418545935183222e-05, + "loss": 0.3217, + "step": 8517 + }, + { + "epoch": 0.8024304656037304, + "grad_norm": 1.9922049045562744, + "learning_rate": 1.3417126852388777e-05, + "loss": 0.295, + "step": 8518 + }, + { + "epoch": 0.8025246696968983, + "grad_norm": 0.7390188574790955, + "learning_rate": 1.3415707691678557e-05, + "loss": 0.3365, + "step": 8519 + }, + { + "epoch": 0.8026188737900661, + "grad_norm": 0.8231778740882874, + "learning_rate": 1.3414288453084918e-05, + "loss": 0.3827, + "step": 8520 + }, + { + "epoch": 0.802713077883234, + "grad_norm": 0.6996579170227051, + "learning_rate": 1.3412869136640221e-05, + "loss": 0.3223, + "step": 8521 + }, + { + "epoch": 0.8028072819764018, + "grad_norm": 1.0390639305114746, + "learning_rate": 1.341144974237683e-05, + "loss": 0.3384, + "step": 8522 + }, + { + "epoch": 0.8029014860695697, + "grad_norm": 0.6792234182357788, + "learning_rate": 1.341003027032711e-05, + "loss": 0.2906, + "step": 8523 + }, + { + "epoch": 0.8029956901627375, + "grad_norm": 0.6705979108810425, + "learning_rate": 1.3408610720523423e-05, + "loss": 0.2696, + "step": 8524 + }, + { + "epoch": 0.8030898942559054, + "grad_norm": 0.8168100118637085, + "learning_rate": 1.3407191092998146e-05, + "loss": 0.3198, + "step": 8525 + }, + { + "epoch": 0.8031840983490732, + "grad_norm": 0.6742059588432312, + "learning_rate": 1.3405771387783637e-05, + "loss": 0.2927, + "step": 8526 + }, + { + "epoch": 0.8032783024422411, + "grad_norm": 0.6609785556793213, + "learning_rate": 1.3404351604912275e-05, + "loss": 0.2444, + "step": 8527 + }, + { + "epoch": 0.8033725065354089, + "grad_norm": 0.9732012748718262, + "learning_rate": 1.3402931744416432e-05, + "loss": 0.3582, + "step": 8528 + }, + { + "epoch": 0.8034667106285768, + "grad_norm": 0.7385396957397461, + "learning_rate": 1.3401511806328483e-05, + "loss": 0.3011, + "step": 8529 + }, + { + "epoch": 0.8035609147217446, + "grad_norm": 0.6966911554336548, + "learning_rate": 1.3400091790680802e-05, + "loss": 0.2904, + "step": 8530 + }, + { + "epoch": 0.8036551188149125, + "grad_norm": 0.6504485011100769, + "learning_rate": 1.3398671697505772e-05, + "loss": 0.2724, + "step": 8531 + }, + { + "epoch": 0.8037493229080803, + "grad_norm": 0.8288382291793823, + "learning_rate": 1.3397251526835771e-05, + "loss": 0.3318, + "step": 8532 + }, + { + "epoch": 0.8038435270012482, + "grad_norm": 0.6640185117721558, + "learning_rate": 1.339583127870318e-05, + "loss": 0.2726, + "step": 8533 + }, + { + "epoch": 0.803937731094416, + "grad_norm": 0.7489379644393921, + "learning_rate": 1.3394410953140384e-05, + "loss": 0.2984, + "step": 8534 + }, + { + "epoch": 0.8040319351875839, + "grad_norm": 0.6930230855941772, + "learning_rate": 1.3392990550179773e-05, + "loss": 0.2948, + "step": 8535 + }, + { + "epoch": 0.8041261392807517, + "grad_norm": 0.6925107836723328, + "learning_rate": 1.3391570069853725e-05, + "loss": 0.3007, + "step": 8536 + }, + { + "epoch": 0.8042203433739196, + "grad_norm": 0.6697828769683838, + "learning_rate": 1.3390149512194635e-05, + "loss": 0.2751, + "step": 8537 + }, + { + "epoch": 0.8043145474670874, + "grad_norm": 0.8221737146377563, + "learning_rate": 1.3388728877234894e-05, + "loss": 0.3089, + "step": 8538 + }, + { + "epoch": 0.8044087515602553, + "grad_norm": 0.7250677943229675, + "learning_rate": 1.3387308165006894e-05, + "loss": 0.3314, + "step": 8539 + }, + { + "epoch": 0.8045029556534231, + "grad_norm": 0.7382446527481079, + "learning_rate": 1.3385887375543029e-05, + "loss": 0.3181, + "step": 8540 + }, + { + "epoch": 0.804597159746591, + "grad_norm": 0.7235196828842163, + "learning_rate": 1.3384466508875696e-05, + "loss": 0.2801, + "step": 8541 + }, + { + "epoch": 0.8046913638397588, + "grad_norm": 0.7349414825439453, + "learning_rate": 1.3383045565037292e-05, + "loss": 0.3033, + "step": 8542 + }, + { + "epoch": 0.8047855679329267, + "grad_norm": 0.7318891882896423, + "learning_rate": 1.3381624544060219e-05, + "loss": 0.32, + "step": 8543 + }, + { + "epoch": 0.8048797720260945, + "grad_norm": 0.7649834156036377, + "learning_rate": 1.3380203445976871e-05, + "loss": 0.3422, + "step": 8544 + }, + { + "epoch": 0.8049739761192624, + "grad_norm": 0.80607008934021, + "learning_rate": 1.3378782270819663e-05, + "loss": 0.3185, + "step": 8545 + }, + { + "epoch": 0.8050681802124302, + "grad_norm": 0.7321194410324097, + "learning_rate": 1.3377361018620991e-05, + "loss": 0.2997, + "step": 8546 + }, + { + "epoch": 0.8051623843055981, + "grad_norm": 0.7480506896972656, + "learning_rate": 1.3375939689413264e-05, + "loss": 0.3253, + "step": 8547 + }, + { + "epoch": 0.8052565883987659, + "grad_norm": 0.7666994333267212, + "learning_rate": 1.3374518283228895e-05, + "loss": 0.3142, + "step": 8548 + }, + { + "epoch": 0.8053507924919338, + "grad_norm": 0.8464433550834656, + "learning_rate": 1.3373096800100285e-05, + "loss": 0.3648, + "step": 8549 + }, + { + "epoch": 0.8054449965851016, + "grad_norm": 0.8145337700843811, + "learning_rate": 1.3371675240059853e-05, + "loss": 0.3464, + "step": 8550 + }, + { + "epoch": 0.8055392006782695, + "grad_norm": 0.7238718271255493, + "learning_rate": 1.3370253603140013e-05, + "loss": 0.2839, + "step": 8551 + }, + { + "epoch": 0.8056334047714373, + "grad_norm": 0.6761792302131653, + "learning_rate": 1.3368831889373178e-05, + "loss": 0.3055, + "step": 8552 + }, + { + "epoch": 0.8057276088646051, + "grad_norm": 0.745002031326294, + "learning_rate": 1.3367410098791764e-05, + "loss": 0.305, + "step": 8553 + }, + { + "epoch": 0.805821812957773, + "grad_norm": 0.7277778387069702, + "learning_rate": 1.3365988231428194e-05, + "loss": 0.2918, + "step": 8554 + }, + { + "epoch": 0.8059160170509408, + "grad_norm": 0.8258783221244812, + "learning_rate": 1.3364566287314888e-05, + "loss": 0.2675, + "step": 8555 + }, + { + "epoch": 0.8060102211441087, + "grad_norm": 0.6759886741638184, + "learning_rate": 1.3363144266484268e-05, + "loss": 0.2788, + "step": 8556 + }, + { + "epoch": 0.8061044252372765, + "grad_norm": 0.7083367705345154, + "learning_rate": 1.3361722168968751e-05, + "loss": 0.3005, + "step": 8557 + }, + { + "epoch": 0.8061986293304444, + "grad_norm": 0.7281975150108337, + "learning_rate": 1.3360299994800774e-05, + "loss": 0.3118, + "step": 8558 + }, + { + "epoch": 0.8062928334236122, + "grad_norm": 0.6623421907424927, + "learning_rate": 1.3358877744012762e-05, + "loss": 0.2769, + "step": 8559 + }, + { + "epoch": 0.8063870375167801, + "grad_norm": 0.6906394362449646, + "learning_rate": 1.3357455416637139e-05, + "loss": 0.2747, + "step": 8560 + }, + { + "epoch": 0.8064812416099479, + "grad_norm": 0.8033696413040161, + "learning_rate": 1.3356033012706342e-05, + "loss": 0.3488, + "step": 8561 + }, + { + "epoch": 0.8065754457031158, + "grad_norm": 0.7539247274398804, + "learning_rate": 1.3354610532252803e-05, + "loss": 0.3444, + "step": 8562 + }, + { + "epoch": 0.8066696497962836, + "grad_norm": 0.7532894611358643, + "learning_rate": 1.3353187975308954e-05, + "loss": 0.2898, + "step": 8563 + }, + { + "epoch": 0.8067638538894515, + "grad_norm": 0.7262154817581177, + "learning_rate": 1.3351765341907232e-05, + "loss": 0.3176, + "step": 8564 + }, + { + "epoch": 0.8068580579826193, + "grad_norm": 0.7816476225852966, + "learning_rate": 1.3350342632080081e-05, + "loss": 0.2781, + "step": 8565 + }, + { + "epoch": 0.8069522620757872, + "grad_norm": 0.8794928789138794, + "learning_rate": 1.3348919845859934e-05, + "loss": 0.3208, + "step": 8566 + }, + { + "epoch": 0.807046466168955, + "grad_norm": 0.7433298826217651, + "learning_rate": 1.3347496983279235e-05, + "loss": 0.2907, + "step": 8567 + }, + { + "epoch": 0.8071406702621229, + "grad_norm": 0.893782377243042, + "learning_rate": 1.334607404437043e-05, + "loss": 0.2912, + "step": 8568 + }, + { + "epoch": 0.8072348743552907, + "grad_norm": 0.7018021941184998, + "learning_rate": 1.3344651029165959e-05, + "loss": 0.2955, + "step": 8569 + }, + { + "epoch": 0.8073290784484586, + "grad_norm": 0.7630308270454407, + "learning_rate": 1.3343227937698273e-05, + "loss": 0.307, + "step": 8570 + }, + { + "epoch": 0.8074232825416264, + "grad_norm": 0.7988154888153076, + "learning_rate": 1.334180476999982e-05, + "loss": 0.3109, + "step": 8571 + }, + { + "epoch": 0.8075174866347943, + "grad_norm": 0.7222198843955994, + "learning_rate": 1.334038152610305e-05, + "loss": 0.3285, + "step": 8572 + }, + { + "epoch": 0.8076116907279621, + "grad_norm": 0.6700325012207031, + "learning_rate": 1.3338958206040418e-05, + "loss": 0.3099, + "step": 8573 + }, + { + "epoch": 0.80770589482113, + "grad_norm": 0.7429363131523132, + "learning_rate": 1.3337534809844371e-05, + "loss": 0.3587, + "step": 8574 + }, + { + "epoch": 0.8078000989142978, + "grad_norm": 0.7255023717880249, + "learning_rate": 1.3336111337547369e-05, + "loss": 0.2952, + "step": 8575 + }, + { + "epoch": 0.8078943030074657, + "grad_norm": 0.8426859378814697, + "learning_rate": 1.333468778918187e-05, + "loss": 0.3165, + "step": 8576 + }, + { + "epoch": 0.8079885071006335, + "grad_norm": 0.6773214340209961, + "learning_rate": 1.333326416478033e-05, + "loss": 0.3051, + "step": 8577 + }, + { + "epoch": 0.8080827111938014, + "grad_norm": 0.8392713665962219, + "learning_rate": 1.3331840464375216e-05, + "loss": 0.3193, + "step": 8578 + }, + { + "epoch": 0.8081769152869692, + "grad_norm": 0.7934224605560303, + "learning_rate": 1.3330416687998987e-05, + "loss": 0.3303, + "step": 8579 + }, + { + "epoch": 0.8082711193801371, + "grad_norm": 0.7700300812721252, + "learning_rate": 1.3328992835684105e-05, + "loss": 0.3536, + "step": 8580 + }, + { + "epoch": 0.8083653234733049, + "grad_norm": 0.7183011174201965, + "learning_rate": 1.3327568907463036e-05, + "loss": 0.3046, + "step": 8581 + }, + { + "epoch": 0.8084595275664728, + "grad_norm": 0.7285114526748657, + "learning_rate": 1.3326144903368253e-05, + "loss": 0.2879, + "step": 8582 + }, + { + "epoch": 0.8085537316596406, + "grad_norm": 0.6812309622764587, + "learning_rate": 1.332472082343222e-05, + "loss": 0.2987, + "step": 8583 + }, + { + "epoch": 0.8086479357528085, + "grad_norm": 0.8201087117195129, + "learning_rate": 1.332329666768741e-05, + "loss": 0.315, + "step": 8584 + }, + { + "epoch": 0.8087421398459763, + "grad_norm": 0.768406331539154, + "learning_rate": 1.33218724361663e-05, + "loss": 0.2689, + "step": 8585 + }, + { + "epoch": 0.8088363439391442, + "grad_norm": 0.7975949048995972, + "learning_rate": 1.3320448128901357e-05, + "loss": 0.3409, + "step": 8586 + }, + { + "epoch": 0.808930548032312, + "grad_norm": 0.6615134477615356, + "learning_rate": 1.3319023745925064e-05, + "loss": 0.2684, + "step": 8587 + }, + { + "epoch": 0.8090247521254799, + "grad_norm": 0.7283914089202881, + "learning_rate": 1.3317599287269896e-05, + "loss": 0.3284, + "step": 8588 + }, + { + "epoch": 0.8091189562186477, + "grad_norm": 0.6497752070426941, + "learning_rate": 1.3316174752968331e-05, + "loss": 0.2859, + "step": 8589 + }, + { + "epoch": 0.8092131603118156, + "grad_norm": 0.8077973127365112, + "learning_rate": 1.3314750143052857e-05, + "loss": 0.2659, + "step": 8590 + }, + { + "epoch": 0.8093073644049834, + "grad_norm": 0.645475447177887, + "learning_rate": 1.3313325457555953e-05, + "loss": 0.2922, + "step": 8591 + }, + { + "epoch": 0.8094015684981513, + "grad_norm": 0.7311770915985107, + "learning_rate": 1.33119006965101e-05, + "loss": 0.3266, + "step": 8592 + }, + { + "epoch": 0.8094957725913191, + "grad_norm": 0.8049769997596741, + "learning_rate": 1.3310475859947796e-05, + "loss": 0.3088, + "step": 8593 + }, + { + "epoch": 0.809589976684487, + "grad_norm": 0.8716092705726624, + "learning_rate": 1.3309050947901518e-05, + "loss": 0.3068, + "step": 8594 + }, + { + "epoch": 0.8096841807776548, + "grad_norm": 0.7258637547492981, + "learning_rate": 1.3307625960403763e-05, + "loss": 0.2921, + "step": 8595 + }, + { + "epoch": 0.8097783848708227, + "grad_norm": 0.7388122081756592, + "learning_rate": 1.330620089748702e-05, + "loss": 0.2809, + "step": 8596 + }, + { + "epoch": 0.8098725889639905, + "grad_norm": 0.6835913062095642, + "learning_rate": 1.330477575918378e-05, + "loss": 0.3199, + "step": 8597 + }, + { + "epoch": 0.8099667930571584, + "grad_norm": 0.7428380250930786, + "learning_rate": 1.3303350545526545e-05, + "loss": 0.3051, + "step": 8598 + }, + { + "epoch": 0.8100609971503262, + "grad_norm": 0.7595810294151306, + "learning_rate": 1.330192525654781e-05, + "loss": 0.3717, + "step": 8599 + }, + { + "epoch": 0.810155201243494, + "grad_norm": 1.198453426361084, + "learning_rate": 1.3300499892280071e-05, + "loss": 0.3035, + "step": 8600 + }, + { + "epoch": 0.8102494053366619, + "grad_norm": 0.6743183135986328, + "learning_rate": 1.3299074452755829e-05, + "loss": 0.3328, + "step": 8601 + }, + { + "epoch": 0.8103436094298297, + "grad_norm": 0.7611492276191711, + "learning_rate": 1.329764893800759e-05, + "loss": 0.2831, + "step": 8602 + }, + { + "epoch": 0.8104378135229976, + "grad_norm": 0.7529149055480957, + "learning_rate": 1.3296223348067855e-05, + "loss": 0.2884, + "step": 8603 + }, + { + "epoch": 0.8105320176161654, + "grad_norm": 0.6950092911720276, + "learning_rate": 1.3294797682969127e-05, + "loss": 0.2806, + "step": 8604 + }, + { + "epoch": 0.8106262217093333, + "grad_norm": 0.7328523993492126, + "learning_rate": 1.329337194274392e-05, + "loss": 0.3027, + "step": 8605 + }, + { + "epoch": 0.8107204258025011, + "grad_norm": 0.7506288290023804, + "learning_rate": 1.3291946127424738e-05, + "loss": 0.3624, + "step": 8606 + }, + { + "epoch": 0.810814629895669, + "grad_norm": 0.8609302639961243, + "learning_rate": 1.3290520237044094e-05, + "loss": 0.3084, + "step": 8607 + }, + { + "epoch": 0.8109088339888368, + "grad_norm": 0.876285195350647, + "learning_rate": 1.3289094271634498e-05, + "loss": 0.3229, + "step": 8608 + }, + { + "epoch": 0.8110030380820047, + "grad_norm": 0.720151960849762, + "learning_rate": 1.3287668231228465e-05, + "loss": 0.2708, + "step": 8609 + }, + { + "epoch": 0.8110972421751725, + "grad_norm": 0.783524751663208, + "learning_rate": 1.3286242115858515e-05, + "loss": 0.3274, + "step": 8610 + }, + { + "epoch": 0.8111914462683404, + "grad_norm": 0.7985210418701172, + "learning_rate": 1.328481592555716e-05, + "loss": 0.3124, + "step": 8611 + }, + { + "epoch": 0.8112856503615082, + "grad_norm": 0.7318972945213318, + "learning_rate": 1.328338966035692e-05, + "loss": 0.3238, + "step": 8612 + }, + { + "epoch": 0.8113798544546761, + "grad_norm": 0.7627847790718079, + "learning_rate": 1.3281963320290325e-05, + "loss": 0.2968, + "step": 8613 + }, + { + "epoch": 0.8114740585478439, + "grad_norm": 0.7643986940383911, + "learning_rate": 1.3280536905389885e-05, + "loss": 0.289, + "step": 8614 + }, + { + "epoch": 0.8115682626410118, + "grad_norm": 0.6229587197303772, + "learning_rate": 1.3279110415688133e-05, + "loss": 0.2873, + "step": 8615 + }, + { + "epoch": 0.8116624667341796, + "grad_norm": 0.8171026110649109, + "learning_rate": 1.3277683851217588e-05, + "loss": 0.3554, + "step": 8616 + }, + { + "epoch": 0.8117566708273475, + "grad_norm": 0.663148045539856, + "learning_rate": 1.3276257212010784e-05, + "loss": 0.2949, + "step": 8617 + }, + { + "epoch": 0.8118508749205153, + "grad_norm": 0.943187415599823, + "learning_rate": 1.3274830498100251e-05, + "loss": 0.3426, + "step": 8618 + }, + { + "epoch": 0.8119450790136832, + "grad_norm": 0.8217980265617371, + "learning_rate": 1.3273403709518518e-05, + "loss": 0.2635, + "step": 8619 + }, + { + "epoch": 0.812039283106851, + "grad_norm": 0.678004801273346, + "learning_rate": 1.3271976846298116e-05, + "loss": 0.2893, + "step": 8620 + }, + { + "epoch": 0.8121334872000189, + "grad_norm": 0.7480604648590088, + "learning_rate": 1.3270549908471581e-05, + "loss": 0.3442, + "step": 8621 + }, + { + "epoch": 0.8122276912931867, + "grad_norm": 0.6902352571487427, + "learning_rate": 1.3269122896071452e-05, + "loss": 0.3241, + "step": 8622 + }, + { + "epoch": 0.8123218953863546, + "grad_norm": 0.7468249797821045, + "learning_rate": 1.3267695809130264e-05, + "loss": 0.3032, + "step": 8623 + }, + { + "epoch": 0.8124160994795224, + "grad_norm": 0.7557992935180664, + "learning_rate": 1.3266268647680558e-05, + "loss": 0.2954, + "step": 8624 + }, + { + "epoch": 0.8125103035726903, + "grad_norm": 0.6958978176116943, + "learning_rate": 1.3264841411754878e-05, + "loss": 0.3289, + "step": 8625 + }, + { + "epoch": 0.8126045076658581, + "grad_norm": 0.7431649565696716, + "learning_rate": 1.326341410138576e-05, + "loss": 0.3093, + "step": 8626 + }, + { + "epoch": 0.812698711759026, + "grad_norm": 0.6600211262702942, + "learning_rate": 1.3261986716605756e-05, + "loss": 0.295, + "step": 8627 + }, + { + "epoch": 0.8127929158521938, + "grad_norm": 0.787713348865509, + "learning_rate": 1.3260559257447412e-05, + "loss": 0.3667, + "step": 8628 + }, + { + "epoch": 0.8128871199453617, + "grad_norm": 0.7798208594322205, + "learning_rate": 1.3259131723943273e-05, + "loss": 0.3057, + "step": 8629 + }, + { + "epoch": 0.8129813240385295, + "grad_norm": 0.7302875518798828, + "learning_rate": 1.3257704116125889e-05, + "loss": 0.2969, + "step": 8630 + }, + { + "epoch": 0.8130755281316974, + "grad_norm": 0.7487370371818542, + "learning_rate": 1.3256276434027815e-05, + "loss": 0.3595, + "step": 8631 + }, + { + "epoch": 0.8131697322248652, + "grad_norm": 0.658248245716095, + "learning_rate": 1.32548486776816e-05, + "loss": 0.3326, + "step": 8632 + }, + { + "epoch": 0.8132639363180331, + "grad_norm": 0.797067403793335, + "learning_rate": 1.3253420847119804e-05, + "loss": 0.3373, + "step": 8633 + }, + { + "epoch": 0.8133581404112009, + "grad_norm": 0.6556969881057739, + "learning_rate": 1.3251992942374978e-05, + "loss": 0.2896, + "step": 8634 + }, + { + "epoch": 0.8134523445043688, + "grad_norm": 0.665752112865448, + "learning_rate": 1.3250564963479686e-05, + "loss": 0.2911, + "step": 8635 + }, + { + "epoch": 0.8135465485975366, + "grad_norm": 0.8131089210510254, + "learning_rate": 1.3249136910466487e-05, + "loss": 0.3108, + "step": 8636 + }, + { + "epoch": 0.8136407526907045, + "grad_norm": 0.7381729483604431, + "learning_rate": 1.3247708783367939e-05, + "loss": 0.3565, + "step": 8637 + }, + { + "epoch": 0.8137349567838723, + "grad_norm": 0.6386496424674988, + "learning_rate": 1.3246280582216608e-05, + "loss": 0.2983, + "step": 8638 + }, + { + "epoch": 0.8138291608770402, + "grad_norm": 0.6995457410812378, + "learning_rate": 1.3244852307045062e-05, + "loss": 0.3003, + "step": 8639 + }, + { + "epoch": 0.813923364970208, + "grad_norm": 0.7102161049842834, + "learning_rate": 1.3243423957885864e-05, + "loss": 0.3335, + "step": 8640 + }, + { + "epoch": 0.8140175690633759, + "grad_norm": 0.6372678279876709, + "learning_rate": 1.324199553477158e-05, + "loss": 0.303, + "step": 8641 + }, + { + "epoch": 0.8141117731565437, + "grad_norm": 0.726729154586792, + "learning_rate": 1.3240567037734789e-05, + "loss": 0.2963, + "step": 8642 + }, + { + "epoch": 0.8142059772497116, + "grad_norm": 0.8117896914482117, + "learning_rate": 1.3239138466808055e-05, + "loss": 0.3094, + "step": 8643 + }, + { + "epoch": 0.8143001813428794, + "grad_norm": 0.7660683393478394, + "learning_rate": 1.3237709822023956e-05, + "loss": 0.3232, + "step": 8644 + }, + { + "epoch": 0.8143943854360471, + "grad_norm": 0.7786475419998169, + "learning_rate": 1.3236281103415064e-05, + "loss": 0.3104, + "step": 8645 + }, + { + "epoch": 0.814488589529215, + "grad_norm": 0.6333548426628113, + "learning_rate": 1.3234852311013959e-05, + "loss": 0.3038, + "step": 8646 + }, + { + "epoch": 0.8145827936223828, + "grad_norm": 0.6839102506637573, + "learning_rate": 1.3233423444853219e-05, + "loss": 0.2659, + "step": 8647 + }, + { + "epoch": 0.8146769977155507, + "grad_norm": 0.7871906757354736, + "learning_rate": 1.3231994504965424e-05, + "loss": 0.2786, + "step": 8648 + }, + { + "epoch": 0.8147712018087185, + "grad_norm": 0.8053258061408997, + "learning_rate": 1.3230565491383153e-05, + "loss": 0.3204, + "step": 8649 + }, + { + "epoch": 0.8148654059018864, + "grad_norm": 0.9789053201675415, + "learning_rate": 1.3229136404138996e-05, + "loss": 0.3266, + "step": 8650 + }, + { + "epoch": 0.8149596099950542, + "grad_norm": 0.6746540069580078, + "learning_rate": 1.3227707243265534e-05, + "loss": 0.2996, + "step": 8651 + }, + { + "epoch": 0.8150538140882221, + "grad_norm": 0.6631419658660889, + "learning_rate": 1.3226278008795355e-05, + "loss": 0.3128, + "step": 8652 + }, + { + "epoch": 0.8151480181813899, + "grad_norm": 0.679245114326477, + "learning_rate": 1.322484870076105e-05, + "loss": 0.2672, + "step": 8653 + }, + { + "epoch": 0.8152422222745578, + "grad_norm": 0.6654112339019775, + "learning_rate": 1.3223419319195206e-05, + "loss": 0.3015, + "step": 8654 + }, + { + "epoch": 0.8153364263677256, + "grad_norm": 0.7208624482154846, + "learning_rate": 1.3221989864130414e-05, + "loss": 0.3241, + "step": 8655 + }, + { + "epoch": 0.8154306304608935, + "grad_norm": 0.6933826804161072, + "learning_rate": 1.3220560335599272e-05, + "loss": 0.3262, + "step": 8656 + }, + { + "epoch": 0.8155248345540613, + "grad_norm": 0.6548870801925659, + "learning_rate": 1.3219130733634374e-05, + "loss": 0.2805, + "step": 8657 + }, + { + "epoch": 0.8156190386472292, + "grad_norm": 0.81364905834198, + "learning_rate": 1.3217701058268315e-05, + "loss": 0.318, + "step": 8658 + }, + { + "epoch": 0.815713242740397, + "grad_norm": 0.7878438830375671, + "learning_rate": 1.32162713095337e-05, + "loss": 0.3259, + "step": 8659 + }, + { + "epoch": 0.8158074468335649, + "grad_norm": 0.8270474672317505, + "learning_rate": 1.321484148746312e-05, + "loss": 0.3321, + "step": 8660 + }, + { + "epoch": 0.8159016509267327, + "grad_norm": 0.7580147981643677, + "learning_rate": 1.3213411592089184e-05, + "loss": 0.3335, + "step": 8661 + }, + { + "epoch": 0.8159958550199006, + "grad_norm": 0.7953311204910278, + "learning_rate": 1.3211981623444494e-05, + "loss": 0.2963, + "step": 8662 + }, + { + "epoch": 0.8160900591130684, + "grad_norm": 0.6340729594230652, + "learning_rate": 1.3210551581561657e-05, + "loss": 0.2878, + "step": 8663 + }, + { + "epoch": 0.8161842632062363, + "grad_norm": 0.6745175123214722, + "learning_rate": 1.3209121466473278e-05, + "loss": 0.311, + "step": 8664 + }, + { + "epoch": 0.8162784672994041, + "grad_norm": 1.4273256063461304, + "learning_rate": 1.3207691278211967e-05, + "loss": 0.3401, + "step": 8665 + }, + { + "epoch": 0.816372671392572, + "grad_norm": 0.7651469111442566, + "learning_rate": 1.320626101681033e-05, + "loss": 0.3327, + "step": 8666 + }, + { + "epoch": 0.8164668754857398, + "grad_norm": 0.6671456694602966, + "learning_rate": 1.3204830682300988e-05, + "loss": 0.3076, + "step": 8667 + }, + { + "epoch": 0.8165610795789077, + "grad_norm": 1.3122570514678955, + "learning_rate": 1.3203400274716549e-05, + "loss": 0.3336, + "step": 8668 + }, + { + "epoch": 0.8166552836720755, + "grad_norm": 0.6474205255508423, + "learning_rate": 1.3201969794089629e-05, + "loss": 0.3058, + "step": 8669 + }, + { + "epoch": 0.8167494877652434, + "grad_norm": 0.6456495523452759, + "learning_rate": 1.3200539240452847e-05, + "loss": 0.2962, + "step": 8670 + }, + { + "epoch": 0.8168436918584112, + "grad_norm": 0.6807658076286316, + "learning_rate": 1.3199108613838818e-05, + "loss": 0.3159, + "step": 8671 + }, + { + "epoch": 0.8169378959515791, + "grad_norm": 0.9739202260971069, + "learning_rate": 1.3197677914280166e-05, + "loss": 0.359, + "step": 8672 + }, + { + "epoch": 0.8170321000447469, + "grad_norm": 0.8745432496070862, + "learning_rate": 1.3196247141809515e-05, + "loss": 0.3103, + "step": 8673 + }, + { + "epoch": 0.8171263041379148, + "grad_norm": 0.843333899974823, + "learning_rate": 1.3194816296459483e-05, + "loss": 0.3447, + "step": 8674 + }, + { + "epoch": 0.8172205082310826, + "grad_norm": 0.8933060765266418, + "learning_rate": 1.31933853782627e-05, + "loss": 0.3144, + "step": 8675 + }, + { + "epoch": 0.8173147123242505, + "grad_norm": 0.7140727043151855, + "learning_rate": 1.319195438725179e-05, + "loss": 0.3006, + "step": 8676 + }, + { + "epoch": 0.8174089164174183, + "grad_norm": 0.8899981379508972, + "learning_rate": 1.3190523323459385e-05, + "loss": 0.3025, + "step": 8677 + }, + { + "epoch": 0.8175031205105862, + "grad_norm": 0.7381685376167297, + "learning_rate": 1.3189092186918113e-05, + "loss": 0.2817, + "step": 8678 + }, + { + "epoch": 0.817597324603754, + "grad_norm": 0.7613934874534607, + "learning_rate": 1.3187660977660608e-05, + "loss": 0.3594, + "step": 8679 + }, + { + "epoch": 0.8176915286969219, + "grad_norm": 0.7203506231307983, + "learning_rate": 1.3186229695719504e-05, + "loss": 0.3037, + "step": 8680 + }, + { + "epoch": 0.8177857327900897, + "grad_norm": 0.7194360494613647, + "learning_rate": 1.3184798341127435e-05, + "loss": 0.3096, + "step": 8681 + }, + { + "epoch": 0.8178799368832576, + "grad_norm": 0.807916522026062, + "learning_rate": 1.3183366913917036e-05, + "loss": 0.3621, + "step": 8682 + }, + { + "epoch": 0.8179741409764254, + "grad_norm": 0.6928552389144897, + "learning_rate": 1.318193541412095e-05, + "loss": 0.3188, + "step": 8683 + }, + { + "epoch": 0.8180683450695933, + "grad_norm": 0.7741749882698059, + "learning_rate": 1.3180503841771817e-05, + "loss": 0.3321, + "step": 8684 + }, + { + "epoch": 0.8181625491627611, + "grad_norm": 0.7837627530097961, + "learning_rate": 1.3179072196902274e-05, + "loss": 0.3519, + "step": 8685 + }, + { + "epoch": 0.818256753255929, + "grad_norm": 0.6680743098258972, + "learning_rate": 1.317764047954497e-05, + "loss": 0.3106, + "step": 8686 + }, + { + "epoch": 0.8183509573490968, + "grad_norm": 0.7467548847198486, + "learning_rate": 1.317620868973255e-05, + "loss": 0.3074, + "step": 8687 + }, + { + "epoch": 0.8184451614422646, + "grad_norm": 0.6809502840042114, + "learning_rate": 1.317477682749766e-05, + "loss": 0.2725, + "step": 8688 + }, + { + "epoch": 0.8185393655354325, + "grad_norm": 0.7166069746017456, + "learning_rate": 1.3173344892872946e-05, + "loss": 0.3012, + "step": 8689 + }, + { + "epoch": 0.8186335696286003, + "grad_norm": 0.7136755585670471, + "learning_rate": 1.3171912885891063e-05, + "loss": 0.2999, + "step": 8690 + }, + { + "epoch": 0.8187277737217682, + "grad_norm": 0.7500886917114258, + "learning_rate": 1.3170480806584658e-05, + "loss": 0.2897, + "step": 8691 + }, + { + "epoch": 0.818821977814936, + "grad_norm": 0.7573025822639465, + "learning_rate": 1.3169048654986387e-05, + "loss": 0.2996, + "step": 8692 + }, + { + "epoch": 0.8189161819081039, + "grad_norm": 0.897589385509491, + "learning_rate": 1.316761643112891e-05, + "loss": 0.3356, + "step": 8693 + }, + { + "epoch": 0.8190103860012717, + "grad_norm": 0.7774412035942078, + "learning_rate": 1.3166184135044877e-05, + "loss": 0.322, + "step": 8694 + }, + { + "epoch": 0.8191045900944396, + "grad_norm": 0.7652751207351685, + "learning_rate": 1.316475176676695e-05, + "loss": 0.3587, + "step": 8695 + }, + { + "epoch": 0.8191987941876074, + "grad_norm": 0.7835428714752197, + "learning_rate": 1.3163319326327788e-05, + "loss": 0.3127, + "step": 8696 + }, + { + "epoch": 0.8192929982807753, + "grad_norm": 0.8188920021057129, + "learning_rate": 1.3161886813760052e-05, + "loss": 0.2686, + "step": 8697 + }, + { + "epoch": 0.8193872023739431, + "grad_norm": 0.7695569396018982, + "learning_rate": 1.316045422909641e-05, + "loss": 0.3348, + "step": 8698 + }, + { + "epoch": 0.819481406467111, + "grad_norm": 0.6132376790046692, + "learning_rate": 1.3159021572369522e-05, + "loss": 0.2984, + "step": 8699 + }, + { + "epoch": 0.8195756105602788, + "grad_norm": 0.7349730730056763, + "learning_rate": 1.3157588843612055e-05, + "loss": 0.3424, + "step": 8700 + }, + { + "epoch": 0.8196698146534467, + "grad_norm": 0.8512101173400879, + "learning_rate": 1.3156156042856678e-05, + "loss": 0.3698, + "step": 8701 + }, + { + "epoch": 0.8197640187466145, + "grad_norm": 0.7094159722328186, + "learning_rate": 1.3154723170136065e-05, + "loss": 0.2975, + "step": 8702 + }, + { + "epoch": 0.8198582228397824, + "grad_norm": 0.7288559079170227, + "learning_rate": 1.3153290225482884e-05, + "loss": 0.3033, + "step": 8703 + }, + { + "epoch": 0.8199524269329502, + "grad_norm": 1.3191355466842651, + "learning_rate": 1.315185720892981e-05, + "loss": 0.2933, + "step": 8704 + }, + { + "epoch": 0.8200466310261181, + "grad_norm": 0.6795741319656372, + "learning_rate": 1.3150424120509518e-05, + "loss": 0.3179, + "step": 8705 + }, + { + "epoch": 0.8201408351192859, + "grad_norm": 0.8080680966377258, + "learning_rate": 1.3148990960254683e-05, + "loss": 0.2983, + "step": 8706 + }, + { + "epoch": 0.8202350392124538, + "grad_norm": 0.6550801396369934, + "learning_rate": 1.3147557728197984e-05, + "loss": 0.2597, + "step": 8707 + }, + { + "epoch": 0.8203292433056216, + "grad_norm": 0.7870469093322754, + "learning_rate": 1.31461244243721e-05, + "loss": 0.323, + "step": 8708 + }, + { + "epoch": 0.8204234473987895, + "grad_norm": 0.9124608635902405, + "learning_rate": 1.3144691048809713e-05, + "loss": 0.3393, + "step": 8709 + }, + { + "epoch": 0.8205176514919573, + "grad_norm": 0.6336477398872375, + "learning_rate": 1.314325760154351e-05, + "loss": 0.2425, + "step": 8710 + }, + { + "epoch": 0.8206118555851252, + "grad_norm": 0.7198314070701599, + "learning_rate": 1.3141824082606167e-05, + "loss": 0.3006, + "step": 8711 + }, + { + "epoch": 0.820706059678293, + "grad_norm": 0.7337607741355896, + "learning_rate": 1.314039049203038e-05, + "loss": 0.314, + "step": 8712 + }, + { + "epoch": 0.8208002637714609, + "grad_norm": 0.7488465309143066, + "learning_rate": 1.3138956829848834e-05, + "loss": 0.2873, + "step": 8713 + }, + { + "epoch": 0.8208944678646287, + "grad_norm": 0.6986730694770813, + "learning_rate": 1.3137523096094216e-05, + "loss": 0.3213, + "step": 8714 + }, + { + "epoch": 0.8209886719577966, + "grad_norm": 0.76261967420578, + "learning_rate": 1.3136089290799219e-05, + "loss": 0.3603, + "step": 8715 + }, + { + "epoch": 0.8210828760509644, + "grad_norm": 0.7061699628829956, + "learning_rate": 1.3134655413996538e-05, + "loss": 0.3344, + "step": 8716 + }, + { + "epoch": 0.8211770801441323, + "grad_norm": 0.6306097507476807, + "learning_rate": 1.3133221465718861e-05, + "loss": 0.295, + "step": 8717 + }, + { + "epoch": 0.8212712842373001, + "grad_norm": 0.7545305490493774, + "learning_rate": 1.3131787445998894e-05, + "loss": 0.3605, + "step": 8718 + }, + { + "epoch": 0.821365488330468, + "grad_norm": 0.6534841060638428, + "learning_rate": 1.3130353354869327e-05, + "loss": 0.3048, + "step": 8719 + }, + { + "epoch": 0.8214596924236358, + "grad_norm": 0.7317593693733215, + "learning_rate": 1.3128919192362864e-05, + "loss": 0.2834, + "step": 8720 + }, + { + "epoch": 0.8215538965168037, + "grad_norm": 0.709783673286438, + "learning_rate": 1.3127484958512202e-05, + "loss": 0.2992, + "step": 8721 + }, + { + "epoch": 0.8216481006099715, + "grad_norm": 0.782183825969696, + "learning_rate": 1.3126050653350049e-05, + "loss": 0.3334, + "step": 8722 + }, + { + "epoch": 0.8217423047031394, + "grad_norm": 0.7888424396514893, + "learning_rate": 1.3124616276909106e-05, + "loss": 0.3249, + "step": 8723 + }, + { + "epoch": 0.8218365087963072, + "grad_norm": 0.7548894286155701, + "learning_rate": 1.312318182922208e-05, + "loss": 0.3512, + "step": 8724 + }, + { + "epoch": 0.8219307128894751, + "grad_norm": 0.7767762541770935, + "learning_rate": 1.3121747310321678e-05, + "loss": 0.3204, + "step": 8725 + }, + { + "epoch": 0.8220249169826429, + "grad_norm": 0.8024447560310364, + "learning_rate": 1.3120312720240607e-05, + "loss": 0.3089, + "step": 8726 + }, + { + "epoch": 0.8221191210758108, + "grad_norm": 0.7439122796058655, + "learning_rate": 1.3118878059011583e-05, + "loss": 0.3111, + "step": 8727 + }, + { + "epoch": 0.8222133251689786, + "grad_norm": 0.7834168076515198, + "learning_rate": 1.3117443326667316e-05, + "loss": 0.2829, + "step": 8728 + }, + { + "epoch": 0.8223075292621465, + "grad_norm": 0.7352764010429382, + "learning_rate": 1.3116008523240518e-05, + "loss": 0.2919, + "step": 8729 + }, + { + "epoch": 0.8224017333553143, + "grad_norm": 0.6868788599967957, + "learning_rate": 1.311457364876391e-05, + "loss": 0.2825, + "step": 8730 + }, + { + "epoch": 0.8224959374484822, + "grad_norm": 0.6694072484970093, + "learning_rate": 1.3113138703270201e-05, + "loss": 0.3106, + "step": 8731 + }, + { + "epoch": 0.82259014154165, + "grad_norm": 0.6664419174194336, + "learning_rate": 1.3111703686792115e-05, + "loss": 0.2988, + "step": 8732 + }, + { + "epoch": 0.8226843456348178, + "grad_norm": 0.7029130458831787, + "learning_rate": 1.3110268599362378e-05, + "loss": 0.2944, + "step": 8733 + }, + { + "epoch": 0.8227785497279857, + "grad_norm": 0.7045284509658813, + "learning_rate": 1.3108833441013702e-05, + "loss": 0.3049, + "step": 8734 + }, + { + "epoch": 0.8228727538211535, + "grad_norm": 0.9626039862632751, + "learning_rate": 1.3107398211778818e-05, + "loss": 0.3167, + "step": 8735 + }, + { + "epoch": 0.8229669579143214, + "grad_norm": 0.7633686661720276, + "learning_rate": 1.3105962911690449e-05, + "loss": 0.3412, + "step": 8736 + }, + { + "epoch": 0.8230611620074892, + "grad_norm": 0.6560431122779846, + "learning_rate": 1.3104527540781323e-05, + "loss": 0.3063, + "step": 8737 + }, + { + "epoch": 0.8231553661006571, + "grad_norm": 0.7915542721748352, + "learning_rate": 1.3103092099084166e-05, + "loss": 0.3123, + "step": 8738 + }, + { + "epoch": 0.8232495701938249, + "grad_norm": 0.8178728818893433, + "learning_rate": 1.310165658663171e-05, + "loss": 0.3104, + "step": 8739 + }, + { + "epoch": 0.8233437742869928, + "grad_norm": 0.6345885992050171, + "learning_rate": 1.3100221003456688e-05, + "loss": 0.2837, + "step": 8740 + }, + { + "epoch": 0.8234379783801606, + "grad_norm": 0.6778290271759033, + "learning_rate": 1.3098785349591832e-05, + "loss": 0.3121, + "step": 8741 + }, + { + "epoch": 0.8235321824733285, + "grad_norm": 0.7523201107978821, + "learning_rate": 1.3097349625069878e-05, + "loss": 0.2865, + "step": 8742 + }, + { + "epoch": 0.8236263865664963, + "grad_norm": 0.6583874225616455, + "learning_rate": 1.3095913829923563e-05, + "loss": 0.2946, + "step": 8743 + }, + { + "epoch": 0.8237205906596642, + "grad_norm": 0.6852019429206848, + "learning_rate": 1.3094477964185624e-05, + "loss": 0.3193, + "step": 8744 + }, + { + "epoch": 0.823814794752832, + "grad_norm": 0.758893609046936, + "learning_rate": 1.3093042027888803e-05, + "loss": 0.3296, + "step": 8745 + }, + { + "epoch": 0.8239089988459999, + "grad_norm": 0.8637267351150513, + "learning_rate": 1.3091606021065836e-05, + "loss": 0.2976, + "step": 8746 + }, + { + "epoch": 0.8240032029391677, + "grad_norm": 0.7250096797943115, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.3296, + "step": 8747 + }, + { + "epoch": 0.8240974070323356, + "grad_norm": 0.6529019474983215, + "learning_rate": 1.3088733795972459e-05, + "loss": 0.3255, + "step": 8748 + }, + { + "epoch": 0.8241916111255034, + "grad_norm": 0.6772639751434326, + "learning_rate": 1.3087297577767536e-05, + "loss": 0.2874, + "step": 8749 + }, + { + "epoch": 0.8242858152186713, + "grad_norm": 0.7299693822860718, + "learning_rate": 1.3085861289167454e-05, + "loss": 0.3123, + "step": 8750 + }, + { + "epoch": 0.8243800193118391, + "grad_norm": 0.8791813254356384, + "learning_rate": 1.3084424930204959e-05, + "loss": 0.3366, + "step": 8751 + }, + { + "epoch": 0.824474223405007, + "grad_norm": 0.7638731598854065, + "learning_rate": 1.3082988500912807e-05, + "loss": 0.289, + "step": 8752 + }, + { + "epoch": 0.8245684274981748, + "grad_norm": 0.6731742024421692, + "learning_rate": 1.3081552001323754e-05, + "loss": 0.3075, + "step": 8753 + }, + { + "epoch": 0.8246626315913427, + "grad_norm": 0.7070082426071167, + "learning_rate": 1.3080115431470543e-05, + "loss": 0.2748, + "step": 8754 + }, + { + "epoch": 0.8247568356845105, + "grad_norm": 0.7427701950073242, + "learning_rate": 1.307867879138594e-05, + "loss": 0.3052, + "step": 8755 + }, + { + "epoch": 0.8248510397776784, + "grad_norm": 0.6710155010223389, + "learning_rate": 1.3077242081102699e-05, + "loss": 0.2963, + "step": 8756 + }, + { + "epoch": 0.8249452438708462, + "grad_norm": 0.7390181422233582, + "learning_rate": 1.3075805300653577e-05, + "loss": 0.2995, + "step": 8757 + }, + { + "epoch": 0.8250394479640141, + "grad_norm": 0.715648353099823, + "learning_rate": 1.3074368450071342e-05, + "loss": 0.257, + "step": 8758 + }, + { + "epoch": 0.8251336520571819, + "grad_norm": 0.8312877416610718, + "learning_rate": 1.3072931529388748e-05, + "loss": 0.3267, + "step": 8759 + }, + { + "epoch": 0.8252278561503498, + "grad_norm": 0.8177542090415955, + "learning_rate": 1.3071494538638565e-05, + "loss": 0.3304, + "step": 8760 + }, + { + "epoch": 0.8253220602435176, + "grad_norm": 0.7219865918159485, + "learning_rate": 1.3070057477853557e-05, + "loss": 0.2874, + "step": 8761 + }, + { + "epoch": 0.8254162643366855, + "grad_norm": 0.6797481775283813, + "learning_rate": 1.3068620347066485e-05, + "loss": 0.2984, + "step": 8762 + }, + { + "epoch": 0.8255104684298533, + "grad_norm": 0.7699027061462402, + "learning_rate": 1.306718314631013e-05, + "loss": 0.3144, + "step": 8763 + }, + { + "epoch": 0.8256046725230212, + "grad_norm": 0.7195324897766113, + "learning_rate": 1.3065745875617252e-05, + "loss": 0.3282, + "step": 8764 + }, + { + "epoch": 0.825698876616189, + "grad_norm": 0.7257832884788513, + "learning_rate": 1.306430853502063e-05, + "loss": 0.3201, + "step": 8765 + }, + { + "epoch": 0.8257930807093569, + "grad_norm": 0.7184802889823914, + "learning_rate": 1.306287112455303e-05, + "loss": 0.3083, + "step": 8766 + }, + { + "epoch": 0.8258872848025247, + "grad_norm": 0.6480834484100342, + "learning_rate": 1.3061433644247234e-05, + "loss": 0.2621, + "step": 8767 + }, + { + "epoch": 0.8259814888956926, + "grad_norm": 0.977313756942749, + "learning_rate": 1.3059996094136016e-05, + "loss": 0.3171, + "step": 8768 + }, + { + "epoch": 0.8260756929888604, + "grad_norm": 0.9768969416618347, + "learning_rate": 1.3058558474252154e-05, + "loss": 0.3278, + "step": 8769 + }, + { + "epoch": 0.8261698970820283, + "grad_norm": 0.8026149868965149, + "learning_rate": 1.3057120784628432e-05, + "loss": 0.3038, + "step": 8770 + }, + { + "epoch": 0.8262641011751961, + "grad_norm": 0.7975835204124451, + "learning_rate": 1.3055683025297623e-05, + "loss": 0.3161, + "step": 8771 + }, + { + "epoch": 0.826358305268364, + "grad_norm": 1.1126796007156372, + "learning_rate": 1.3054245196292517e-05, + "loss": 0.3402, + "step": 8772 + }, + { + "epoch": 0.8264525093615318, + "grad_norm": 0.6256473064422607, + "learning_rate": 1.30528072976459e-05, + "loss": 0.3191, + "step": 8773 + }, + { + "epoch": 0.8265467134546997, + "grad_norm": 0.7434393763542175, + "learning_rate": 1.3051369329390551e-05, + "loss": 0.2777, + "step": 8774 + }, + { + "epoch": 0.8266409175478675, + "grad_norm": 0.6980218291282654, + "learning_rate": 1.3049931291559266e-05, + "loss": 0.3074, + "step": 8775 + }, + { + "epoch": 0.8267351216410354, + "grad_norm": 0.8950267434120178, + "learning_rate": 1.3048493184184829e-05, + "loss": 0.3538, + "step": 8776 + }, + { + "epoch": 0.8268293257342032, + "grad_norm": 0.7111105918884277, + "learning_rate": 1.3047055007300031e-05, + "loss": 0.2921, + "step": 8777 + }, + { + "epoch": 0.826923529827371, + "grad_norm": 0.8025537133216858, + "learning_rate": 1.3045616760937669e-05, + "loss": 0.3031, + "step": 8778 + }, + { + "epoch": 0.8270177339205389, + "grad_norm": 0.7283792495727539, + "learning_rate": 1.3044178445130535e-05, + "loss": 0.3289, + "step": 8779 + }, + { + "epoch": 0.8271119380137067, + "grad_norm": 0.7437872886657715, + "learning_rate": 1.3042740059911425e-05, + "loss": 0.3069, + "step": 8780 + }, + { + "epoch": 0.8272061421068746, + "grad_norm": 0.8354877829551697, + "learning_rate": 1.3041301605313135e-05, + "loss": 0.3501, + "step": 8781 + }, + { + "epoch": 0.8273003462000424, + "grad_norm": 1.0320316553115845, + "learning_rate": 1.3039863081368464e-05, + "loss": 0.3382, + "step": 8782 + }, + { + "epoch": 0.8273945502932102, + "grad_norm": 0.8121195435523987, + "learning_rate": 1.3038424488110215e-05, + "loss": 0.3437, + "step": 8783 + }, + { + "epoch": 0.827488754386378, + "grad_norm": 0.6380713582038879, + "learning_rate": 1.3036985825571189e-05, + "loss": 0.275, + "step": 8784 + }, + { + "epoch": 0.8275829584795459, + "grad_norm": 0.9364820122718811, + "learning_rate": 1.3035547093784187e-05, + "loss": 0.3129, + "step": 8785 + }, + { + "epoch": 0.8276771625727137, + "grad_norm": 0.6679378747940063, + "learning_rate": 1.3034108292782017e-05, + "loss": 0.3092, + "step": 8786 + }, + { + "epoch": 0.8277713666658816, + "grad_norm": 0.8502500057220459, + "learning_rate": 1.3032669422597485e-05, + "loss": 0.3505, + "step": 8787 + }, + { + "epoch": 0.8278655707590494, + "grad_norm": 0.823781430721283, + "learning_rate": 1.3031230483263405e-05, + "loss": 0.3118, + "step": 8788 + }, + { + "epoch": 0.8279597748522173, + "grad_norm": 0.7144771218299866, + "learning_rate": 1.3029791474812576e-05, + "loss": 0.2487, + "step": 8789 + }, + { + "epoch": 0.8280539789453851, + "grad_norm": 0.8531436920166016, + "learning_rate": 1.3028352397277821e-05, + "loss": 0.2593, + "step": 8790 + }, + { + "epoch": 0.828148183038553, + "grad_norm": 0.678350031375885, + "learning_rate": 1.3026913250691943e-05, + "loss": 0.3018, + "step": 8791 + }, + { + "epoch": 0.8282423871317208, + "grad_norm": 0.749647319316864, + "learning_rate": 1.3025474035087764e-05, + "loss": 0.3294, + "step": 8792 + }, + { + "epoch": 0.8283365912248887, + "grad_norm": 0.7394024729728699, + "learning_rate": 1.30240347504981e-05, + "loss": 0.2794, + "step": 8793 + }, + { + "epoch": 0.8284307953180565, + "grad_norm": 0.7589126825332642, + "learning_rate": 1.3022595396955761e-05, + "loss": 0.3293, + "step": 8794 + }, + { + "epoch": 0.8285249994112244, + "grad_norm": 0.6819366216659546, + "learning_rate": 1.3021155974493578e-05, + "loss": 0.2904, + "step": 8795 + }, + { + "epoch": 0.8286192035043922, + "grad_norm": 0.7277462482452393, + "learning_rate": 1.3019716483144365e-05, + "loss": 0.3042, + "step": 8796 + }, + { + "epoch": 0.8287134075975601, + "grad_norm": 0.984160840511322, + "learning_rate": 1.3018276922940945e-05, + "loss": 0.3633, + "step": 8797 + }, + { + "epoch": 0.8288076116907279, + "grad_norm": 0.6747549176216125, + "learning_rate": 1.3016837293916145e-05, + "loss": 0.2695, + "step": 8798 + }, + { + "epoch": 0.8289018157838958, + "grad_norm": 0.7443104386329651, + "learning_rate": 1.3015397596102788e-05, + "loss": 0.2958, + "step": 8799 + }, + { + "epoch": 0.8289960198770636, + "grad_norm": 0.7657442092895508, + "learning_rate": 1.3013957829533702e-05, + "loss": 0.3293, + "step": 8800 + }, + { + "epoch": 0.8290902239702315, + "grad_norm": 0.7322391271591187, + "learning_rate": 1.301251799424172e-05, + "loss": 0.2865, + "step": 8801 + }, + { + "epoch": 0.8291844280633993, + "grad_norm": 0.7862289547920227, + "learning_rate": 1.3011078090259663e-05, + "loss": 0.283, + "step": 8802 + }, + { + "epoch": 0.8292786321565672, + "grad_norm": 0.7371636033058167, + "learning_rate": 1.3009638117620371e-05, + "loss": 0.3134, + "step": 8803 + }, + { + "epoch": 0.829372836249735, + "grad_norm": 0.7526691555976868, + "learning_rate": 1.3008198076356678e-05, + "loss": 0.2759, + "step": 8804 + }, + { + "epoch": 0.8294670403429029, + "grad_norm": 0.7201641201972961, + "learning_rate": 1.3006757966501415e-05, + "loss": 0.3202, + "step": 8805 + }, + { + "epoch": 0.8295612444360707, + "grad_norm": 0.7679572701454163, + "learning_rate": 1.300531778808742e-05, + "loss": 0.3211, + "step": 8806 + }, + { + "epoch": 0.8296554485292386, + "grad_norm": 0.898105263710022, + "learning_rate": 1.3003877541147532e-05, + "loss": 0.3182, + "step": 8807 + }, + { + "epoch": 0.8297496526224064, + "grad_norm": 0.7773075103759766, + "learning_rate": 1.3002437225714588e-05, + "loss": 0.3266, + "step": 8808 + }, + { + "epoch": 0.8298438567155743, + "grad_norm": 0.7979598045349121, + "learning_rate": 1.3000996841821433e-05, + "loss": 0.2846, + "step": 8809 + }, + { + "epoch": 0.8299380608087421, + "grad_norm": 0.7142004370689392, + "learning_rate": 1.2999556389500914e-05, + "loss": 0.3376, + "step": 8810 + }, + { + "epoch": 0.83003226490191, + "grad_norm": 0.7944036722183228, + "learning_rate": 1.2998115868785864e-05, + "loss": 0.3459, + "step": 8811 + }, + { + "epoch": 0.8301264689950778, + "grad_norm": 0.6650295257568359, + "learning_rate": 1.2996675279709135e-05, + "loss": 0.3046, + "step": 8812 + }, + { + "epoch": 0.8302206730882457, + "grad_norm": 0.667598307132721, + "learning_rate": 1.299523462230358e-05, + "loss": 0.2687, + "step": 8813 + }, + { + "epoch": 0.8303148771814135, + "grad_norm": 0.7344732880592346, + "learning_rate": 1.299379389660204e-05, + "loss": 0.3375, + "step": 8814 + }, + { + "epoch": 0.8304090812745814, + "grad_norm": 0.7699588537216187, + "learning_rate": 1.2992353102637372e-05, + "loss": 0.3441, + "step": 8815 + }, + { + "epoch": 0.8305032853677492, + "grad_norm": 0.9087995886802673, + "learning_rate": 1.2990912240442424e-05, + "loss": 0.2961, + "step": 8816 + }, + { + "epoch": 0.830597489460917, + "grad_norm": 0.7107911705970764, + "learning_rate": 1.298947131005005e-05, + "loss": 0.2929, + "step": 8817 + }, + { + "epoch": 0.8306916935540849, + "grad_norm": 0.7627518773078918, + "learning_rate": 1.2988030311493107e-05, + "loss": 0.2942, + "step": 8818 + }, + { + "epoch": 0.8307858976472527, + "grad_norm": 0.9217396378517151, + "learning_rate": 1.2986589244804455e-05, + "loss": 0.3642, + "step": 8819 + }, + { + "epoch": 0.8308801017404206, + "grad_norm": 0.7422599792480469, + "learning_rate": 1.2985148110016947e-05, + "loss": 0.3382, + "step": 8820 + }, + { + "epoch": 0.8309743058335884, + "grad_norm": 0.7851346135139465, + "learning_rate": 1.2983706907163447e-05, + "loss": 0.3288, + "step": 8821 + }, + { + "epoch": 0.8310685099267563, + "grad_norm": 1.0454436540603638, + "learning_rate": 1.2982265636276812e-05, + "loss": 0.3543, + "step": 8822 + }, + { + "epoch": 0.8311627140199241, + "grad_norm": 0.6407907605171204, + "learning_rate": 1.2980824297389912e-05, + "loss": 0.2743, + "step": 8823 + }, + { + "epoch": 0.831256918113092, + "grad_norm": 0.6632653474807739, + "learning_rate": 1.2979382890535606e-05, + "loss": 0.272, + "step": 8824 + }, + { + "epoch": 0.8313511222062598, + "grad_norm": 0.7241531014442444, + "learning_rate": 1.2977941415746763e-05, + "loss": 0.3037, + "step": 8825 + }, + { + "epoch": 0.8314453262994277, + "grad_norm": 1.239408254623413, + "learning_rate": 1.2976499873056248e-05, + "loss": 0.3521, + "step": 8826 + }, + { + "epoch": 0.8315395303925955, + "grad_norm": 0.8416794538497925, + "learning_rate": 1.2975058262496936e-05, + "loss": 0.3393, + "step": 8827 + }, + { + "epoch": 0.8316337344857634, + "grad_norm": 0.6784628033638, + "learning_rate": 1.2973616584101694e-05, + "loss": 0.2875, + "step": 8828 + }, + { + "epoch": 0.8317279385789312, + "grad_norm": 0.6785228848457336, + "learning_rate": 1.2972174837903392e-05, + "loss": 0.3017, + "step": 8829 + }, + { + "epoch": 0.8318221426720991, + "grad_norm": 0.9164506196975708, + "learning_rate": 1.2970733023934911e-05, + "loss": 0.3663, + "step": 8830 + }, + { + "epoch": 0.8319163467652669, + "grad_norm": 0.7244361042976379, + "learning_rate": 1.296929114222912e-05, + "loss": 0.3027, + "step": 8831 + }, + { + "epoch": 0.8320105508584348, + "grad_norm": 0.7114465236663818, + "learning_rate": 1.2967849192818899e-05, + "loss": 0.3005, + "step": 8832 + }, + { + "epoch": 0.8321047549516026, + "grad_norm": 0.6782700419425964, + "learning_rate": 1.296640717573713e-05, + "loss": 0.2951, + "step": 8833 + }, + { + "epoch": 0.8321989590447705, + "grad_norm": 0.8049957156181335, + "learning_rate": 1.2964965091016687e-05, + "loss": 0.2946, + "step": 8834 + }, + { + "epoch": 0.8322931631379383, + "grad_norm": 0.7354930639266968, + "learning_rate": 1.2963522938690454e-05, + "loss": 0.3217, + "step": 8835 + }, + { + "epoch": 0.8323873672311062, + "grad_norm": 0.78761887550354, + "learning_rate": 1.2962080718791316e-05, + "loss": 0.3156, + "step": 8836 + }, + { + "epoch": 0.832481571324274, + "grad_norm": 0.7231392860412598, + "learning_rate": 1.2960638431352155e-05, + "loss": 0.2999, + "step": 8837 + }, + { + "epoch": 0.8325757754174419, + "grad_norm": 0.8216593265533447, + "learning_rate": 1.295919607640586e-05, + "loss": 0.3143, + "step": 8838 + }, + { + "epoch": 0.8326699795106097, + "grad_norm": 0.7374890446662903, + "learning_rate": 1.2957753653985319e-05, + "loss": 0.3323, + "step": 8839 + }, + { + "epoch": 0.8327641836037776, + "grad_norm": 0.7227234840393066, + "learning_rate": 1.2956311164123421e-05, + "loss": 0.2892, + "step": 8840 + }, + { + "epoch": 0.8328583876969454, + "grad_norm": 0.9002649188041687, + "learning_rate": 1.2954868606853058e-05, + "loss": 0.364, + "step": 8841 + }, + { + "epoch": 0.8329525917901133, + "grad_norm": 0.6368039846420288, + "learning_rate": 1.2953425982207116e-05, + "loss": 0.2662, + "step": 8842 + }, + { + "epoch": 0.8330467958832811, + "grad_norm": 0.6957955360412598, + "learning_rate": 1.2951983290218495e-05, + "loss": 0.3073, + "step": 8843 + }, + { + "epoch": 0.833140999976449, + "grad_norm": 0.6631066799163818, + "learning_rate": 1.2950540530920092e-05, + "loss": 0.2702, + "step": 8844 + }, + { + "epoch": 0.8332352040696168, + "grad_norm": 0.6305803060531616, + "learning_rate": 1.2949097704344802e-05, + "loss": 0.3124, + "step": 8845 + }, + { + "epoch": 0.8333294081627847, + "grad_norm": 0.7423282861709595, + "learning_rate": 1.2947654810525521e-05, + "loss": 0.3166, + "step": 8846 + }, + { + "epoch": 0.8334236122559525, + "grad_norm": 0.8736042380332947, + "learning_rate": 1.2946211849495152e-05, + "loss": 0.3018, + "step": 8847 + }, + { + "epoch": 0.8335178163491204, + "grad_norm": 0.7889606952667236, + "learning_rate": 1.2944768821286597e-05, + "loss": 0.2897, + "step": 8848 + }, + { + "epoch": 0.8336120204422882, + "grad_norm": 0.6764425039291382, + "learning_rate": 1.2943325725932759e-05, + "loss": 0.2907, + "step": 8849 + }, + { + "epoch": 0.8337062245354561, + "grad_norm": 0.6257145404815674, + "learning_rate": 1.2941882563466543e-05, + "loss": 0.2429, + "step": 8850 + }, + { + "epoch": 0.8338004286286239, + "grad_norm": 0.6930012106895447, + "learning_rate": 1.2940439333920853e-05, + "loss": 0.3016, + "step": 8851 + }, + { + "epoch": 0.8338946327217918, + "grad_norm": 0.8111011981964111, + "learning_rate": 1.2938996037328601e-05, + "loss": 0.2841, + "step": 8852 + }, + { + "epoch": 0.8339888368149596, + "grad_norm": 0.7202403545379639, + "learning_rate": 1.2937552673722695e-05, + "loss": 0.3125, + "step": 8853 + }, + { + "epoch": 0.8340830409081275, + "grad_norm": 0.7113397717475891, + "learning_rate": 1.2936109243136041e-05, + "loss": 0.2947, + "step": 8854 + }, + { + "epoch": 0.8341772450012953, + "grad_norm": 0.7299506068229675, + "learning_rate": 1.2934665745601557e-05, + "loss": 0.2489, + "step": 8855 + }, + { + "epoch": 0.8342714490944632, + "grad_norm": 0.7287810444831848, + "learning_rate": 1.2933222181152158e-05, + "loss": 0.3338, + "step": 8856 + }, + { + "epoch": 0.834365653187631, + "grad_norm": 0.7398520112037659, + "learning_rate": 1.2931778549820753e-05, + "loss": 0.2905, + "step": 8857 + }, + { + "epoch": 0.8344598572807989, + "grad_norm": 0.8753147125244141, + "learning_rate": 1.2930334851640268e-05, + "loss": 0.3433, + "step": 8858 + }, + { + "epoch": 0.8345540613739667, + "grad_norm": 0.7349756360054016, + "learning_rate": 1.2928891086643611e-05, + "loss": 0.3046, + "step": 8859 + }, + { + "epoch": 0.8346482654671346, + "grad_norm": 0.7209146618843079, + "learning_rate": 1.2927447254863712e-05, + "loss": 0.3194, + "step": 8860 + }, + { + "epoch": 0.8347424695603024, + "grad_norm": 0.7802855372428894, + "learning_rate": 1.2926003356333487e-05, + "loss": 0.3245, + "step": 8861 + }, + { + "epoch": 0.8348366736534703, + "grad_norm": 0.781139075756073, + "learning_rate": 1.2924559391085858e-05, + "loss": 0.338, + "step": 8862 + }, + { + "epoch": 0.8349308777466381, + "grad_norm": 0.7366265654563904, + "learning_rate": 1.2923115359153755e-05, + "loss": 0.3095, + "step": 8863 + }, + { + "epoch": 0.835025081839806, + "grad_norm": 0.7250460386276245, + "learning_rate": 1.2921671260570099e-05, + "loss": 0.29, + "step": 8864 + }, + { + "epoch": 0.8351192859329738, + "grad_norm": 0.6194658279418945, + "learning_rate": 1.2920227095367822e-05, + "loss": 0.2863, + "step": 8865 + }, + { + "epoch": 0.8352134900261416, + "grad_norm": 0.788934588432312, + "learning_rate": 1.2918782863579846e-05, + "loss": 0.29, + "step": 8866 + }, + { + "epoch": 0.8353076941193095, + "grad_norm": 0.8553847074508667, + "learning_rate": 1.2917338565239112e-05, + "loss": 0.3084, + "step": 8867 + }, + { + "epoch": 0.8354018982124773, + "grad_norm": 0.722250759601593, + "learning_rate": 1.2915894200378547e-05, + "loss": 0.302, + "step": 8868 + }, + { + "epoch": 0.8354961023056452, + "grad_norm": 0.6709815263748169, + "learning_rate": 1.2914449769031081e-05, + "loss": 0.2819, + "step": 8869 + }, + { + "epoch": 0.835590306398813, + "grad_norm": 0.6778731346130371, + "learning_rate": 1.2913005271229657e-05, + "loss": 0.2895, + "step": 8870 + }, + { + "epoch": 0.8356845104919809, + "grad_norm": 0.7274365425109863, + "learning_rate": 1.2911560707007204e-05, + "loss": 0.32, + "step": 8871 + }, + { + "epoch": 0.8357787145851487, + "grad_norm": 0.7306736707687378, + "learning_rate": 1.2910116076396669e-05, + "loss": 0.3174, + "step": 8872 + }, + { + "epoch": 0.8358729186783166, + "grad_norm": 0.7256240248680115, + "learning_rate": 1.2908671379430982e-05, + "loss": 0.2735, + "step": 8873 + }, + { + "epoch": 0.8359671227714844, + "grad_norm": 0.656222939491272, + "learning_rate": 1.2907226616143089e-05, + "loss": 0.3148, + "step": 8874 + }, + { + "epoch": 0.8360613268646523, + "grad_norm": 0.7254332304000854, + "learning_rate": 1.2905781786565936e-05, + "loss": 0.3441, + "step": 8875 + }, + { + "epoch": 0.8361555309578201, + "grad_norm": 0.637316107749939, + "learning_rate": 1.2904336890732462e-05, + "loss": 0.2629, + "step": 8876 + }, + { + "epoch": 0.836249735050988, + "grad_norm": 0.7101510763168335, + "learning_rate": 1.2902891928675616e-05, + "loss": 0.3213, + "step": 8877 + }, + { + "epoch": 0.8363439391441558, + "grad_norm": 0.6917797327041626, + "learning_rate": 1.290144690042835e-05, + "loss": 0.3181, + "step": 8878 + }, + { + "epoch": 0.8364381432373237, + "grad_norm": 0.7629390358924866, + "learning_rate": 1.29000018060236e-05, + "loss": 0.286, + "step": 8879 + }, + { + "epoch": 0.8365323473304915, + "grad_norm": 0.663982093334198, + "learning_rate": 1.2898556645494327e-05, + "loss": 0.2563, + "step": 8880 + }, + { + "epoch": 0.8366265514236594, + "grad_norm": 0.7203670144081116, + "learning_rate": 1.289711141887348e-05, + "loss": 0.3328, + "step": 8881 + }, + { + "epoch": 0.8367207555168272, + "grad_norm": 0.8644741773605347, + "learning_rate": 1.2895666126194009e-05, + "loss": 0.3524, + "step": 8882 + }, + { + "epoch": 0.8368149596099951, + "grad_norm": 0.7667208909988403, + "learning_rate": 1.2894220767488877e-05, + "loss": 0.3135, + "step": 8883 + }, + { + "epoch": 0.8369091637031629, + "grad_norm": 0.7790444493293762, + "learning_rate": 1.2892775342791033e-05, + "loss": 0.2839, + "step": 8884 + }, + { + "epoch": 0.8370033677963308, + "grad_norm": 0.6937950849533081, + "learning_rate": 1.2891329852133438e-05, + "loss": 0.2866, + "step": 8885 + }, + { + "epoch": 0.8370975718894986, + "grad_norm": 0.6426435708999634, + "learning_rate": 1.288988429554905e-05, + "loss": 0.3224, + "step": 8886 + }, + { + "epoch": 0.8371917759826665, + "grad_norm": 0.7044891119003296, + "learning_rate": 1.288843867307083e-05, + "loss": 0.2921, + "step": 8887 + }, + { + "epoch": 0.8372859800758343, + "grad_norm": 0.7194440960884094, + "learning_rate": 1.2886992984731743e-05, + "loss": 0.2857, + "step": 8888 + }, + { + "epoch": 0.8373801841690022, + "grad_norm": 0.746876060962677, + "learning_rate": 1.2885547230564748e-05, + "loss": 0.2876, + "step": 8889 + }, + { + "epoch": 0.83747438826217, + "grad_norm": 0.9977391362190247, + "learning_rate": 1.2884101410602821e-05, + "loss": 0.373, + "step": 8890 + }, + { + "epoch": 0.8375685923553379, + "grad_norm": 0.662267804145813, + "learning_rate": 1.2882655524878914e-05, + "loss": 0.2887, + "step": 8891 + }, + { + "epoch": 0.8376627964485057, + "grad_norm": 0.6523311138153076, + "learning_rate": 1.2881209573426005e-05, + "loss": 0.2587, + "step": 8892 + }, + { + "epoch": 0.8377570005416736, + "grad_norm": 0.7460758686065674, + "learning_rate": 1.2879763556277062e-05, + "loss": 0.3022, + "step": 8893 + }, + { + "epoch": 0.8378512046348414, + "grad_norm": 0.7284391522407532, + "learning_rate": 1.2878317473465056e-05, + "loss": 0.3235, + "step": 8894 + }, + { + "epoch": 0.8379454087280093, + "grad_norm": 0.6337546706199646, + "learning_rate": 1.2876871325022962e-05, + "loss": 0.3178, + "step": 8895 + }, + { + "epoch": 0.8380396128211771, + "grad_norm": 0.7316828370094299, + "learning_rate": 1.2875425110983753e-05, + "loss": 0.3159, + "step": 8896 + }, + { + "epoch": 0.838133816914345, + "grad_norm": 0.6527437567710876, + "learning_rate": 1.28739788313804e-05, + "loss": 0.3163, + "step": 8897 + }, + { + "epoch": 0.8382280210075128, + "grad_norm": 0.649264395236969, + "learning_rate": 1.287253248624589e-05, + "loss": 0.2581, + "step": 8898 + }, + { + "epoch": 0.8383222251006807, + "grad_norm": 0.9922779202461243, + "learning_rate": 1.2871086075613196e-05, + "loss": 0.3281, + "step": 8899 + }, + { + "epoch": 0.8384164291938485, + "grad_norm": 0.7026650309562683, + "learning_rate": 1.2869639599515295e-05, + "loss": 0.3276, + "step": 8900 + }, + { + "epoch": 0.8385106332870164, + "grad_norm": 0.7148386240005493, + "learning_rate": 1.286819305798518e-05, + "loss": 0.3371, + "step": 8901 + }, + { + "epoch": 0.8386048373801842, + "grad_norm": 0.6871752142906189, + "learning_rate": 1.2866746451055821e-05, + "loss": 0.315, + "step": 8902 + }, + { + "epoch": 0.8386990414733521, + "grad_norm": 0.7981586456298828, + "learning_rate": 1.2865299778760212e-05, + "loss": 0.3586, + "step": 8903 + }, + { + "epoch": 0.8387932455665199, + "grad_norm": 0.8237510323524475, + "learning_rate": 1.2863853041131338e-05, + "loss": 0.346, + "step": 8904 + }, + { + "epoch": 0.8388874496596878, + "grad_norm": 0.7409459948539734, + "learning_rate": 1.2862406238202186e-05, + "loss": 0.3541, + "step": 8905 + }, + { + "epoch": 0.8389816537528556, + "grad_norm": 0.768153965473175, + "learning_rate": 1.286095937000574e-05, + "loss": 0.293, + "step": 8906 + }, + { + "epoch": 0.8390758578460235, + "grad_norm": 0.6621012091636658, + "learning_rate": 1.2859512436574998e-05, + "loss": 0.2943, + "step": 8907 + }, + { + "epoch": 0.8391700619391913, + "grad_norm": 0.6803906559944153, + "learning_rate": 1.2858065437942955e-05, + "loss": 0.2384, + "step": 8908 + }, + { + "epoch": 0.8392642660323592, + "grad_norm": 0.6534746289253235, + "learning_rate": 1.2856618374142594e-05, + "loss": 0.2841, + "step": 8909 + }, + { + "epoch": 0.839358470125527, + "grad_norm": 0.7702908515930176, + "learning_rate": 1.2855171245206922e-05, + "loss": 0.3456, + "step": 8910 + }, + { + "epoch": 0.8394526742186948, + "grad_norm": 0.7710719704627991, + "learning_rate": 1.2853724051168922e-05, + "loss": 0.3285, + "step": 8911 + }, + { + "epoch": 0.8395468783118627, + "grad_norm": 0.6994277238845825, + "learning_rate": 1.2852276792061607e-05, + "loss": 0.3521, + "step": 8912 + }, + { + "epoch": 0.8396410824050305, + "grad_norm": 0.7324875593185425, + "learning_rate": 1.285082946791797e-05, + "loss": 0.3128, + "step": 8913 + }, + { + "epoch": 0.8397352864981984, + "grad_norm": 0.7418799996376038, + "learning_rate": 1.2849382078771006e-05, + "loss": 0.338, + "step": 8914 + }, + { + "epoch": 0.8398294905913662, + "grad_norm": 0.7456151247024536, + "learning_rate": 1.2847934624653728e-05, + "loss": 0.2961, + "step": 8915 + }, + { + "epoch": 0.8399236946845341, + "grad_norm": 0.7044605016708374, + "learning_rate": 1.284648710559914e-05, + "loss": 0.3607, + "step": 8916 + }, + { + "epoch": 0.8400178987777019, + "grad_norm": 0.7057334184646606, + "learning_rate": 1.2845039521640236e-05, + "loss": 0.3021, + "step": 8917 + }, + { + "epoch": 0.8401121028708698, + "grad_norm": 0.6334710121154785, + "learning_rate": 1.2843591872810039e-05, + "loss": 0.267, + "step": 8918 + }, + { + "epoch": 0.8402063069640376, + "grad_norm": 0.6703156232833862, + "learning_rate": 1.2842144159141543e-05, + "loss": 0.2582, + "step": 8919 + }, + { + "epoch": 0.8403005110572055, + "grad_norm": 0.8019157648086548, + "learning_rate": 1.2840696380667768e-05, + "loss": 0.3043, + "step": 8920 + }, + { + "epoch": 0.8403947151503732, + "grad_norm": 0.7276331782341003, + "learning_rate": 1.2839248537421722e-05, + "loss": 0.2852, + "step": 8921 + }, + { + "epoch": 0.8404889192435411, + "grad_norm": 0.6563223004341125, + "learning_rate": 1.2837800629436417e-05, + "loss": 0.2811, + "step": 8922 + }, + { + "epoch": 0.8405831233367089, + "grad_norm": 0.7013585567474365, + "learning_rate": 1.283635265674487e-05, + "loss": 0.3154, + "step": 8923 + }, + { + "epoch": 0.8406773274298768, + "grad_norm": 0.6967423558235168, + "learning_rate": 1.2834904619380097e-05, + "loss": 0.283, + "step": 8924 + }, + { + "epoch": 0.8407715315230446, + "grad_norm": 0.757985532283783, + "learning_rate": 1.2833456517375113e-05, + "loss": 0.2977, + "step": 8925 + }, + { + "epoch": 0.8408657356162125, + "grad_norm": 0.6783421635627747, + "learning_rate": 1.2832008350762937e-05, + "loss": 0.3367, + "step": 8926 + }, + { + "epoch": 0.8409599397093803, + "grad_norm": 0.6910302639007568, + "learning_rate": 1.2830560119576591e-05, + "loss": 0.3217, + "step": 8927 + }, + { + "epoch": 0.8410541438025482, + "grad_norm": 0.6359944939613342, + "learning_rate": 1.28291118238491e-05, + "loss": 0.2843, + "step": 8928 + }, + { + "epoch": 0.841148347895716, + "grad_norm": 0.8597601056098938, + "learning_rate": 1.2827663463613482e-05, + "loss": 0.3409, + "step": 8929 + }, + { + "epoch": 0.8412425519888839, + "grad_norm": 0.690007746219635, + "learning_rate": 1.2826215038902765e-05, + "loss": 0.3109, + "step": 8930 + }, + { + "epoch": 0.8413367560820517, + "grad_norm": 0.7544068694114685, + "learning_rate": 1.2824766549749972e-05, + "loss": 0.3121, + "step": 8931 + }, + { + "epoch": 0.8414309601752196, + "grad_norm": 0.6776837706565857, + "learning_rate": 1.2823317996188136e-05, + "loss": 0.3134, + "step": 8932 + }, + { + "epoch": 0.8415251642683874, + "grad_norm": 0.7241583466529846, + "learning_rate": 1.2821869378250283e-05, + "loss": 0.2893, + "step": 8933 + }, + { + "epoch": 0.8416193683615553, + "grad_norm": 0.6826990246772766, + "learning_rate": 1.282042069596944e-05, + "loss": 0.2992, + "step": 8934 + }, + { + "epoch": 0.8417135724547231, + "grad_norm": 0.7980566620826721, + "learning_rate": 1.2818971949378647e-05, + "loss": 0.3343, + "step": 8935 + }, + { + "epoch": 0.841807776547891, + "grad_norm": 0.7511221766471863, + "learning_rate": 1.2817523138510934e-05, + "loss": 0.2972, + "step": 8936 + }, + { + "epoch": 0.8419019806410588, + "grad_norm": 0.7395924925804138, + "learning_rate": 1.2816074263399335e-05, + "loss": 0.3441, + "step": 8937 + }, + { + "epoch": 0.8419961847342267, + "grad_norm": 0.6874242424964905, + "learning_rate": 1.281462532407689e-05, + "loss": 0.2941, + "step": 8938 + }, + { + "epoch": 0.8420903888273945, + "grad_norm": 0.7330230474472046, + "learning_rate": 1.281317632057663e-05, + "loss": 0.3146, + "step": 8939 + }, + { + "epoch": 0.8421845929205624, + "grad_norm": 0.6668033599853516, + "learning_rate": 1.2811727252931602e-05, + "loss": 0.2824, + "step": 8940 + }, + { + "epoch": 0.8422787970137302, + "grad_norm": 0.7982563972473145, + "learning_rate": 1.2810278121174844e-05, + "loss": 0.2901, + "step": 8941 + }, + { + "epoch": 0.8423730011068981, + "grad_norm": 0.6179084777832031, + "learning_rate": 1.2808828925339398e-05, + "loss": 0.2847, + "step": 8942 + }, + { + "epoch": 0.8424672052000659, + "grad_norm": 0.8197647333145142, + "learning_rate": 1.280737966545831e-05, + "loss": 0.3355, + "step": 8943 + }, + { + "epoch": 0.8425614092932338, + "grad_norm": 0.6531780958175659, + "learning_rate": 1.2805930341564622e-05, + "loss": 0.2772, + "step": 8944 + }, + { + "epoch": 0.8426556133864016, + "grad_norm": 0.6549702286720276, + "learning_rate": 1.2804480953691388e-05, + "loss": 0.2578, + "step": 8945 + }, + { + "epoch": 0.8427498174795695, + "grad_norm": 0.7435251474380493, + "learning_rate": 1.2803031501871643e-05, + "loss": 0.2965, + "step": 8946 + }, + { + "epoch": 0.8428440215727373, + "grad_norm": 0.8284091353416443, + "learning_rate": 1.280158198613845e-05, + "loss": 0.3444, + "step": 8947 + }, + { + "epoch": 0.8429382256659051, + "grad_norm": 0.8054177165031433, + "learning_rate": 1.2800132406524854e-05, + "loss": 0.3605, + "step": 8948 + }, + { + "epoch": 0.843032429759073, + "grad_norm": 0.7610667943954468, + "learning_rate": 1.2798682763063907e-05, + "loss": 0.2951, + "step": 8949 + }, + { + "epoch": 0.8431266338522408, + "grad_norm": 0.8656819462776184, + "learning_rate": 1.279723305578867e-05, + "loss": 0.3427, + "step": 8950 + }, + { + "epoch": 0.8432208379454087, + "grad_norm": 0.7053451538085938, + "learning_rate": 1.2795783284732186e-05, + "loss": 0.2875, + "step": 8951 + }, + { + "epoch": 0.8433150420385765, + "grad_norm": 0.7782831192016602, + "learning_rate": 1.2794333449927522e-05, + "loss": 0.3648, + "step": 8952 + }, + { + "epoch": 0.8434092461317444, + "grad_norm": 0.6942363381385803, + "learning_rate": 1.2792883551407738e-05, + "loss": 0.2661, + "step": 8953 + }, + { + "epoch": 0.8435034502249122, + "grad_norm": 0.7131900191307068, + "learning_rate": 1.2791433589205884e-05, + "loss": 0.2847, + "step": 8954 + }, + { + "epoch": 0.8435976543180801, + "grad_norm": 0.7252687811851501, + "learning_rate": 1.2789983563355031e-05, + "loss": 0.2764, + "step": 8955 + }, + { + "epoch": 0.8436918584112479, + "grad_norm": 0.7174544334411621, + "learning_rate": 1.2788533473888235e-05, + "loss": 0.3268, + "step": 8956 + }, + { + "epoch": 0.8437860625044158, + "grad_norm": 0.6812805533409119, + "learning_rate": 1.2787083320838566e-05, + "loss": 0.2781, + "step": 8957 + }, + { + "epoch": 0.8438802665975836, + "grad_norm": 0.7570432424545288, + "learning_rate": 1.2785633104239085e-05, + "loss": 0.3372, + "step": 8958 + }, + { + "epoch": 0.8439744706907515, + "grad_norm": 0.7210245132446289, + "learning_rate": 1.2784182824122862e-05, + "loss": 0.3028, + "step": 8959 + }, + { + "epoch": 0.8440686747839193, + "grad_norm": 0.8742713332176208, + "learning_rate": 1.2782732480522966e-05, + "loss": 0.309, + "step": 8960 + }, + { + "epoch": 0.8441628788770872, + "grad_norm": 0.9560702443122864, + "learning_rate": 1.2781282073472463e-05, + "loss": 0.3315, + "step": 8961 + }, + { + "epoch": 0.844257082970255, + "grad_norm": 0.8701736330986023, + "learning_rate": 1.2779831603004426e-05, + "loss": 0.2895, + "step": 8962 + }, + { + "epoch": 0.8443512870634229, + "grad_norm": 0.6817653775215149, + "learning_rate": 1.2778381069151935e-05, + "loss": 0.283, + "step": 8963 + }, + { + "epoch": 0.8444454911565907, + "grad_norm": 0.674261212348938, + "learning_rate": 1.2776930471948057e-05, + "loss": 0.2854, + "step": 8964 + }, + { + "epoch": 0.8445396952497586, + "grad_norm": 0.7767412662506104, + "learning_rate": 1.2775479811425868e-05, + "loss": 0.3446, + "step": 8965 + }, + { + "epoch": 0.8446338993429264, + "grad_norm": 0.7883347272872925, + "learning_rate": 1.2774029087618448e-05, + "loss": 0.3551, + "step": 8966 + }, + { + "epoch": 0.8447281034360943, + "grad_norm": 0.7268330454826355, + "learning_rate": 1.2772578300558874e-05, + "loss": 0.2994, + "step": 8967 + }, + { + "epoch": 0.8448223075292621, + "grad_norm": 1.1079233884811401, + "learning_rate": 1.2771127450280227e-05, + "loss": 0.3112, + "step": 8968 + }, + { + "epoch": 0.84491651162243, + "grad_norm": 0.815112292766571, + "learning_rate": 1.2769676536815589e-05, + "loss": 0.3286, + "step": 8969 + }, + { + "epoch": 0.8450107157155978, + "grad_norm": 0.7777073383331299, + "learning_rate": 1.2768225560198043e-05, + "loss": 0.3416, + "step": 8970 + }, + { + "epoch": 0.8451049198087657, + "grad_norm": 0.7789881229400635, + "learning_rate": 1.2766774520460672e-05, + "loss": 0.3076, + "step": 8971 + }, + { + "epoch": 0.8451991239019335, + "grad_norm": 0.6999005675315857, + "learning_rate": 1.2765323417636561e-05, + "loss": 0.3362, + "step": 8972 + }, + { + "epoch": 0.8452933279951014, + "grad_norm": 0.7440067529678345, + "learning_rate": 1.2763872251758804e-05, + "loss": 0.3184, + "step": 8973 + }, + { + "epoch": 0.8453875320882692, + "grad_norm": 0.6921179294586182, + "learning_rate": 1.276242102286048e-05, + "loss": 0.3165, + "step": 8974 + }, + { + "epoch": 0.8454817361814371, + "grad_norm": 0.7694404125213623, + "learning_rate": 1.2760969730974692e-05, + "loss": 0.3311, + "step": 8975 + }, + { + "epoch": 0.8455759402746049, + "grad_norm": 0.7318397760391235, + "learning_rate": 1.2759518376134516e-05, + "loss": 0.3249, + "step": 8976 + }, + { + "epoch": 0.8456701443677728, + "grad_norm": 0.7305333018302917, + "learning_rate": 1.2758066958373056e-05, + "loss": 0.2975, + "step": 8977 + }, + { + "epoch": 0.8457643484609406, + "grad_norm": 0.8558382391929626, + "learning_rate": 1.2756615477723408e-05, + "loss": 0.3352, + "step": 8978 + }, + { + "epoch": 0.8458585525541085, + "grad_norm": 0.7120863199234009, + "learning_rate": 1.275516393421866e-05, + "loss": 0.2938, + "step": 8979 + }, + { + "epoch": 0.8459527566472763, + "grad_norm": 0.7065699100494385, + "learning_rate": 1.2753712327891915e-05, + "loss": 0.301, + "step": 8980 + }, + { + "epoch": 0.8460469607404442, + "grad_norm": 0.7297618985176086, + "learning_rate": 1.275226065877627e-05, + "loss": 0.2715, + "step": 8981 + }, + { + "epoch": 0.846141164833612, + "grad_norm": 0.7805727124214172, + "learning_rate": 1.2750808926904822e-05, + "loss": 0.3078, + "step": 8982 + }, + { + "epoch": 0.8462353689267799, + "grad_norm": 0.6316404938697815, + "learning_rate": 1.2749357132310683e-05, + "loss": 0.3178, + "step": 8983 + }, + { + "epoch": 0.8463295730199477, + "grad_norm": 0.7475721836090088, + "learning_rate": 1.2747905275026943e-05, + "loss": 0.2786, + "step": 8984 + }, + { + "epoch": 0.8464237771131156, + "grad_norm": 0.8101813793182373, + "learning_rate": 1.2746453355086719e-05, + "loss": 0.33, + "step": 8985 + }, + { + "epoch": 0.8465179812062834, + "grad_norm": 0.9295817017555237, + "learning_rate": 1.2745001372523105e-05, + "loss": 0.3421, + "step": 8986 + }, + { + "epoch": 0.8466121852994513, + "grad_norm": 0.7078453302383423, + "learning_rate": 1.2743549327369218e-05, + "loss": 0.3213, + "step": 8987 + }, + { + "epoch": 0.8467063893926191, + "grad_norm": 0.7733709812164307, + "learning_rate": 1.2742097219658162e-05, + "loss": 0.3138, + "step": 8988 + }, + { + "epoch": 0.846800593485787, + "grad_norm": 0.7479760050773621, + "learning_rate": 1.274064504942305e-05, + "loss": 0.3532, + "step": 8989 + }, + { + "epoch": 0.8468947975789548, + "grad_norm": 0.7153126001358032, + "learning_rate": 1.2739192816696992e-05, + "loss": 0.3314, + "step": 8990 + }, + { + "epoch": 0.8469890016721227, + "grad_norm": 0.7419144511222839, + "learning_rate": 1.27377405215131e-05, + "loss": 0.3355, + "step": 8991 + }, + { + "epoch": 0.8470832057652905, + "grad_norm": 0.6762439608573914, + "learning_rate": 1.2736288163904493e-05, + "loss": 0.3082, + "step": 8992 + }, + { + "epoch": 0.8471774098584584, + "grad_norm": 0.8105480074882507, + "learning_rate": 1.2734835743904283e-05, + "loss": 0.3407, + "step": 8993 + }, + { + "epoch": 0.8472716139516262, + "grad_norm": 0.6770057082176208, + "learning_rate": 1.2733383261545586e-05, + "loss": 0.2747, + "step": 8994 + }, + { + "epoch": 0.847365818044794, + "grad_norm": 0.7751746773719788, + "learning_rate": 1.2731930716861527e-05, + "loss": 0.3293, + "step": 8995 + }, + { + "epoch": 0.8474600221379619, + "grad_norm": 0.8555806279182434, + "learning_rate": 1.2730478109885221e-05, + "loss": 0.3222, + "step": 8996 + }, + { + "epoch": 0.8475542262311297, + "grad_norm": 0.7186029553413391, + "learning_rate": 1.272902544064979e-05, + "loss": 0.299, + "step": 8997 + }, + { + "epoch": 0.8476484303242976, + "grad_norm": 0.8223687410354614, + "learning_rate": 1.2727572709188362e-05, + "loss": 0.3534, + "step": 8998 + }, + { + "epoch": 0.8477426344174654, + "grad_norm": 0.958514392375946, + "learning_rate": 1.2726119915534054e-05, + "loss": 0.323, + "step": 8999 + }, + { + "epoch": 0.8478368385106333, + "grad_norm": 0.6684522032737732, + "learning_rate": 1.2724667059719997e-05, + "loss": 0.2721, + "step": 9000 + }, + { + "epoch": 0.8479310426038011, + "grad_norm": 0.6773232221603394, + "learning_rate": 1.272321414177932e-05, + "loss": 0.3213, + "step": 9001 + }, + { + "epoch": 0.848025246696969, + "grad_norm": 0.8274404406547546, + "learning_rate": 1.2721761161745145e-05, + "loss": 0.2966, + "step": 9002 + }, + { + "epoch": 0.8481194507901368, + "grad_norm": 0.7804051041603088, + "learning_rate": 1.2720308119650608e-05, + "loss": 0.3115, + "step": 9003 + }, + { + "epoch": 0.8482136548833047, + "grad_norm": 0.779718816280365, + "learning_rate": 1.271885501552884e-05, + "loss": 0.3141, + "step": 9004 + }, + { + "epoch": 0.8483078589764725, + "grad_norm": 0.6873429417610168, + "learning_rate": 1.2717401849412972e-05, + "loss": 0.2849, + "step": 9005 + }, + { + "epoch": 0.8484020630696404, + "grad_norm": 0.7158425450325012, + "learning_rate": 1.2715948621336139e-05, + "loss": 0.2973, + "step": 9006 + }, + { + "epoch": 0.8484962671628082, + "grad_norm": 0.7210104465484619, + "learning_rate": 1.2714495331331475e-05, + "loss": 0.305, + "step": 9007 + }, + { + "epoch": 0.8485904712559761, + "grad_norm": 0.7635478973388672, + "learning_rate": 1.2713041979432124e-05, + "loss": 0.32, + "step": 9008 + }, + { + "epoch": 0.8486846753491439, + "grad_norm": 0.660316526889801, + "learning_rate": 1.2711588565671217e-05, + "loss": 0.2652, + "step": 9009 + }, + { + "epoch": 0.8487788794423118, + "grad_norm": 0.7859675288200378, + "learning_rate": 1.27101350900819e-05, + "loss": 0.317, + "step": 9010 + }, + { + "epoch": 0.8488730835354796, + "grad_norm": 0.6476109623908997, + "learning_rate": 1.2708681552697306e-05, + "loss": 0.2452, + "step": 9011 + }, + { + "epoch": 0.8489672876286475, + "grad_norm": 0.7139787077903748, + "learning_rate": 1.270722795355059e-05, + "loss": 0.287, + "step": 9012 + }, + { + "epoch": 0.8490614917218153, + "grad_norm": 0.8316739797592163, + "learning_rate": 1.2705774292674886e-05, + "loss": 0.3582, + "step": 9013 + }, + { + "epoch": 0.8491556958149832, + "grad_norm": 0.6787524819374084, + "learning_rate": 1.2704320570103343e-05, + "loss": 0.3222, + "step": 9014 + }, + { + "epoch": 0.849249899908151, + "grad_norm": 0.7387852668762207, + "learning_rate": 1.2702866785869112e-05, + "loss": 0.3029, + "step": 9015 + }, + { + "epoch": 0.8493441040013189, + "grad_norm": 0.6867208480834961, + "learning_rate": 1.2701412940005335e-05, + "loss": 0.286, + "step": 9016 + }, + { + "epoch": 0.8494383080944867, + "grad_norm": 0.7119094133377075, + "learning_rate": 1.2699959032545164e-05, + "loss": 0.2965, + "step": 9017 + }, + { + "epoch": 0.8495325121876546, + "grad_norm": 0.7823882102966309, + "learning_rate": 1.2698505063521758e-05, + "loss": 0.3153, + "step": 9018 + }, + { + "epoch": 0.8496267162808224, + "grad_norm": 0.6950855851173401, + "learning_rate": 1.2697051032968257e-05, + "loss": 0.2875, + "step": 9019 + }, + { + "epoch": 0.8497209203739903, + "grad_norm": 0.7047410011291504, + "learning_rate": 1.2695596940917825e-05, + "loss": 0.2879, + "step": 9020 + }, + { + "epoch": 0.8498151244671581, + "grad_norm": 0.6160603761672974, + "learning_rate": 1.2694142787403614e-05, + "loss": 0.3105, + "step": 9021 + }, + { + "epoch": 0.849909328560326, + "grad_norm": 0.6546329259872437, + "learning_rate": 1.2692688572458776e-05, + "loss": 0.2806, + "step": 9022 + }, + { + "epoch": 0.8500035326534938, + "grad_norm": 0.6774280667304993, + "learning_rate": 1.2691234296116479e-05, + "loss": 0.2833, + "step": 9023 + }, + { + "epoch": 0.8500977367466617, + "grad_norm": 0.664818286895752, + "learning_rate": 1.2689779958409876e-05, + "loss": 0.2724, + "step": 9024 + }, + { + "epoch": 0.8501919408398295, + "grad_norm": 0.7551313042640686, + "learning_rate": 1.2688325559372132e-05, + "loss": 0.3082, + "step": 9025 + }, + { + "epoch": 0.8502861449329974, + "grad_norm": 0.8147274255752563, + "learning_rate": 1.2686871099036405e-05, + "loss": 0.3745, + "step": 9026 + }, + { + "epoch": 0.8503803490261652, + "grad_norm": 0.5757870674133301, + "learning_rate": 1.2685416577435863e-05, + "loss": 0.2709, + "step": 9027 + }, + { + "epoch": 0.8504745531193331, + "grad_norm": 0.7894459962844849, + "learning_rate": 1.268396199460367e-05, + "loss": 0.2726, + "step": 9028 + }, + { + "epoch": 0.8505687572125009, + "grad_norm": 0.6999770402908325, + "learning_rate": 1.2682507350572995e-05, + "loss": 0.2746, + "step": 9029 + }, + { + "epoch": 0.8506629613056688, + "grad_norm": 0.7287282943725586, + "learning_rate": 1.2681052645377001e-05, + "loss": 0.3125, + "step": 9030 + }, + { + "epoch": 0.8507571653988366, + "grad_norm": 0.7133800983428955, + "learning_rate": 1.267959787904886e-05, + "loss": 0.3326, + "step": 9031 + }, + { + "epoch": 0.8508513694920045, + "grad_norm": 1.2192795276641846, + "learning_rate": 1.2678143051621743e-05, + "loss": 0.3675, + "step": 9032 + }, + { + "epoch": 0.8509455735851723, + "grad_norm": 0.6774299144744873, + "learning_rate": 1.2676688163128826e-05, + "loss": 0.305, + "step": 9033 + }, + { + "epoch": 0.8510397776783402, + "grad_norm": 0.6948447823524475, + "learning_rate": 1.2675233213603275e-05, + "loss": 0.295, + "step": 9034 + }, + { + "epoch": 0.851133981771508, + "grad_norm": 0.6044574975967407, + "learning_rate": 1.2673778203078274e-05, + "loss": 0.298, + "step": 9035 + }, + { + "epoch": 0.8512281858646759, + "grad_norm": 0.7736443281173706, + "learning_rate": 1.267232313158699e-05, + "loss": 0.3249, + "step": 9036 + }, + { + "epoch": 0.8513223899578437, + "grad_norm": 0.6680286526679993, + "learning_rate": 1.2670867999162608e-05, + "loss": 0.2785, + "step": 9037 + }, + { + "epoch": 0.8514165940510116, + "grad_norm": 0.7220253944396973, + "learning_rate": 1.2669412805838305e-05, + "loss": 0.3349, + "step": 9038 + }, + { + "epoch": 0.8515107981441794, + "grad_norm": 0.8169474005699158, + "learning_rate": 1.2667957551647263e-05, + "loss": 0.3097, + "step": 9039 + }, + { + "epoch": 0.8516050022373473, + "grad_norm": 0.7603136897087097, + "learning_rate": 1.2666502236622662e-05, + "loss": 0.3081, + "step": 9040 + }, + { + "epoch": 0.8516992063305151, + "grad_norm": 0.6987825632095337, + "learning_rate": 1.2665046860797686e-05, + "loss": 0.3067, + "step": 9041 + }, + { + "epoch": 0.851793410423683, + "grad_norm": 0.8207119107246399, + "learning_rate": 1.2663591424205519e-05, + "loss": 0.3267, + "step": 9042 + }, + { + "epoch": 0.8518876145168508, + "grad_norm": 0.7657036185264587, + "learning_rate": 1.266213592687935e-05, + "loss": 0.2911, + "step": 9043 + }, + { + "epoch": 0.8519818186100186, + "grad_norm": 0.7158383131027222, + "learning_rate": 1.2660680368852363e-05, + "loss": 0.2853, + "step": 9044 + }, + { + "epoch": 0.8520760227031865, + "grad_norm": 0.8265485167503357, + "learning_rate": 1.265922475015775e-05, + "loss": 0.3354, + "step": 9045 + }, + { + "epoch": 0.8521702267963543, + "grad_norm": 0.6983233690261841, + "learning_rate": 1.2657769070828698e-05, + "loss": 0.2999, + "step": 9046 + }, + { + "epoch": 0.8522644308895222, + "grad_norm": 0.6697626113891602, + "learning_rate": 1.2656313330898401e-05, + "loss": 0.3032, + "step": 9047 + }, + { + "epoch": 0.85235863498269, + "grad_norm": 0.8416416645050049, + "learning_rate": 1.2654857530400055e-05, + "loss": 0.2701, + "step": 9048 + }, + { + "epoch": 0.8524528390758579, + "grad_norm": 0.6764236092567444, + "learning_rate": 1.2653401669366852e-05, + "loss": 0.3055, + "step": 9049 + }, + { + "epoch": 0.8525470431690257, + "grad_norm": 0.6430860757827759, + "learning_rate": 1.2651945747831987e-05, + "loss": 0.3017, + "step": 9050 + }, + { + "epoch": 0.8526412472621936, + "grad_norm": 0.6810445189476013, + "learning_rate": 1.2650489765828653e-05, + "loss": 0.2986, + "step": 9051 + }, + { + "epoch": 0.8527354513553614, + "grad_norm": 0.6868563890457153, + "learning_rate": 1.2649033723390056e-05, + "loss": 0.2996, + "step": 9052 + }, + { + "epoch": 0.8528296554485293, + "grad_norm": 0.8139315843582153, + "learning_rate": 1.2647577620549396e-05, + "loss": 0.2883, + "step": 9053 + }, + { + "epoch": 0.8529238595416971, + "grad_norm": 0.6818740963935852, + "learning_rate": 1.2646121457339866e-05, + "loss": 0.3122, + "step": 9054 + }, + { + "epoch": 0.853018063634865, + "grad_norm": 0.6979965567588806, + "learning_rate": 1.2644665233794682e-05, + "loss": 0.2644, + "step": 9055 + }, + { + "epoch": 0.8531122677280328, + "grad_norm": 1.5787235498428345, + "learning_rate": 1.2643208949947035e-05, + "loss": 0.2998, + "step": 9056 + }, + { + "epoch": 0.8532064718212007, + "grad_norm": 0.6594725251197815, + "learning_rate": 1.2641752605830136e-05, + "loss": 0.3081, + "step": 9057 + }, + { + "epoch": 0.8533006759143685, + "grad_norm": 0.6624385714530945, + "learning_rate": 1.2640296201477195e-05, + "loss": 0.2871, + "step": 9058 + }, + { + "epoch": 0.8533948800075364, + "grad_norm": 0.6775038242340088, + "learning_rate": 1.2638839736921415e-05, + "loss": 0.2817, + "step": 9059 + }, + { + "epoch": 0.8534890841007041, + "grad_norm": 0.7073119878768921, + "learning_rate": 1.2637383212196008e-05, + "loss": 0.2938, + "step": 9060 + }, + { + "epoch": 0.853583288193872, + "grad_norm": 0.6494279503822327, + "learning_rate": 1.2635926627334188e-05, + "loss": 0.2903, + "step": 9061 + }, + { + "epoch": 0.8536774922870398, + "grad_norm": 0.8409245014190674, + "learning_rate": 1.263446998236916e-05, + "loss": 0.3619, + "step": 9062 + }, + { + "epoch": 0.8537716963802077, + "grad_norm": 0.6711735129356384, + "learning_rate": 1.2633013277334145e-05, + "loss": 0.2936, + "step": 9063 + }, + { + "epoch": 0.8538659004733755, + "grad_norm": 0.6291319131851196, + "learning_rate": 1.2631556512262356e-05, + "loss": 0.2554, + "step": 9064 + }, + { + "epoch": 0.8539601045665434, + "grad_norm": 0.6612778306007385, + "learning_rate": 1.2630099687187007e-05, + "loss": 0.2804, + "step": 9065 + }, + { + "epoch": 0.8540543086597112, + "grad_norm": 0.7076035737991333, + "learning_rate": 1.2628642802141317e-05, + "loss": 0.3447, + "step": 9066 + }, + { + "epoch": 0.8541485127528791, + "grad_norm": 0.7418968677520752, + "learning_rate": 1.2627185857158507e-05, + "loss": 0.2744, + "step": 9067 + }, + { + "epoch": 0.8542427168460469, + "grad_norm": 0.7586404085159302, + "learning_rate": 1.2625728852271795e-05, + "loss": 0.3439, + "step": 9068 + }, + { + "epoch": 0.8543369209392148, + "grad_norm": 0.9554126262664795, + "learning_rate": 1.2624271787514406e-05, + "loss": 0.3124, + "step": 9069 + }, + { + "epoch": 0.8544311250323826, + "grad_norm": 0.7535426616668701, + "learning_rate": 1.2622814662919562e-05, + "loss": 0.3191, + "step": 9070 + }, + { + "epoch": 0.8545253291255505, + "grad_norm": 0.8399681448936462, + "learning_rate": 1.2621357478520486e-05, + "loss": 0.3096, + "step": 9071 + }, + { + "epoch": 0.8546195332187183, + "grad_norm": 0.7496296167373657, + "learning_rate": 1.2619900234350406e-05, + "loss": 0.3137, + "step": 9072 + }, + { + "epoch": 0.8547137373118862, + "grad_norm": 0.7252075672149658, + "learning_rate": 1.2618442930442549e-05, + "loss": 0.3052, + "step": 9073 + }, + { + "epoch": 0.854807941405054, + "grad_norm": 0.6654471158981323, + "learning_rate": 1.2616985566830142e-05, + "loss": 0.3134, + "step": 9074 + }, + { + "epoch": 0.8549021454982219, + "grad_norm": 0.7226504683494568, + "learning_rate": 1.2615528143546423e-05, + "loss": 0.3024, + "step": 9075 + }, + { + "epoch": 0.8549963495913897, + "grad_norm": 0.8002542853355408, + "learning_rate": 1.261407066062461e-05, + "loss": 0.3179, + "step": 9076 + }, + { + "epoch": 0.8550905536845576, + "grad_norm": 0.8656417727470398, + "learning_rate": 1.2612613118097945e-05, + "loss": 0.3597, + "step": 9077 + }, + { + "epoch": 0.8551847577777254, + "grad_norm": 0.6418734192848206, + "learning_rate": 1.2611155515999665e-05, + "loss": 0.3104, + "step": 9078 + }, + { + "epoch": 0.8552789618708933, + "grad_norm": 0.6727639436721802, + "learning_rate": 1.2609697854362995e-05, + "loss": 0.2858, + "step": 9079 + }, + { + "epoch": 0.8553731659640611, + "grad_norm": 0.8004004955291748, + "learning_rate": 1.2608240133221181e-05, + "loss": 0.3094, + "step": 9080 + }, + { + "epoch": 0.855467370057229, + "grad_norm": 0.6943681836128235, + "learning_rate": 1.2606782352607458e-05, + "loss": 0.2923, + "step": 9081 + }, + { + "epoch": 0.8555615741503968, + "grad_norm": 0.6713337302207947, + "learning_rate": 1.2605324512555064e-05, + "loss": 0.2749, + "step": 9082 + }, + { + "epoch": 0.8556557782435646, + "grad_norm": 0.6937925815582275, + "learning_rate": 1.2603866613097245e-05, + "loss": 0.287, + "step": 9083 + }, + { + "epoch": 0.8557499823367325, + "grad_norm": 0.7038596868515015, + "learning_rate": 1.2602408654267237e-05, + "loss": 0.3168, + "step": 9084 + }, + { + "epoch": 0.8558441864299003, + "grad_norm": 0.7447677850723267, + "learning_rate": 1.2600950636098292e-05, + "loss": 0.32, + "step": 9085 + }, + { + "epoch": 0.8559383905230682, + "grad_norm": 0.7252728343009949, + "learning_rate": 1.2599492558623646e-05, + "loss": 0.3038, + "step": 9086 + }, + { + "epoch": 0.856032594616236, + "grad_norm": 0.8138175010681152, + "learning_rate": 1.2598034421876548e-05, + "loss": 0.3457, + "step": 9087 + }, + { + "epoch": 0.8561267987094039, + "grad_norm": 0.6639115214347839, + "learning_rate": 1.2596576225890251e-05, + "loss": 0.3187, + "step": 9088 + }, + { + "epoch": 0.8562210028025717, + "grad_norm": 0.6821640133857727, + "learning_rate": 1.2595117970697998e-05, + "loss": 0.2966, + "step": 9089 + }, + { + "epoch": 0.8563152068957396, + "grad_norm": 0.7528424859046936, + "learning_rate": 1.2593659656333044e-05, + "loss": 0.3216, + "step": 9090 + }, + { + "epoch": 0.8564094109889074, + "grad_norm": 0.8091188073158264, + "learning_rate": 1.2592201282828635e-05, + "loss": 0.3225, + "step": 9091 + }, + { + "epoch": 0.8565036150820753, + "grad_norm": 0.6671088933944702, + "learning_rate": 1.2590742850218031e-05, + "loss": 0.2696, + "step": 9092 + }, + { + "epoch": 0.8565978191752431, + "grad_norm": 0.7548508048057556, + "learning_rate": 1.2589284358534486e-05, + "loss": 0.2939, + "step": 9093 + }, + { + "epoch": 0.856692023268411, + "grad_norm": 0.7009268999099731, + "learning_rate": 1.2587825807811248e-05, + "loss": 0.3123, + "step": 9094 + }, + { + "epoch": 0.8567862273615788, + "grad_norm": 0.7987003922462463, + "learning_rate": 1.2586367198081582e-05, + "loss": 0.34, + "step": 9095 + }, + { + "epoch": 0.8568804314547467, + "grad_norm": 0.9772810935974121, + "learning_rate": 1.2584908529378743e-05, + "loss": 0.3126, + "step": 9096 + }, + { + "epoch": 0.8569746355479145, + "grad_norm": 0.8526176810264587, + "learning_rate": 1.258344980173599e-05, + "loss": 0.3678, + "step": 9097 + }, + { + "epoch": 0.8570688396410824, + "grad_norm": 0.8225265145301819, + "learning_rate": 1.2581991015186592e-05, + "loss": 0.2985, + "step": 9098 + }, + { + "epoch": 0.8571630437342502, + "grad_norm": 0.739586353302002, + "learning_rate": 1.2580532169763799e-05, + "loss": 0.2975, + "step": 9099 + }, + { + "epoch": 0.8572572478274181, + "grad_norm": 0.8607017397880554, + "learning_rate": 1.2579073265500886e-05, + "loss": 0.2715, + "step": 9100 + }, + { + "epoch": 0.8573514519205859, + "grad_norm": 0.7922728061676025, + "learning_rate": 1.2577614302431114e-05, + "loss": 0.2931, + "step": 9101 + }, + { + "epoch": 0.8574456560137538, + "grad_norm": 0.7574107646942139, + "learning_rate": 1.2576155280587745e-05, + "loss": 0.3322, + "step": 9102 + }, + { + "epoch": 0.8575398601069216, + "grad_norm": 0.7002633810043335, + "learning_rate": 1.2574696200004055e-05, + "loss": 0.3099, + "step": 9103 + }, + { + "epoch": 0.8576340642000895, + "grad_norm": 0.8383626341819763, + "learning_rate": 1.257323706071331e-05, + "loss": 0.3548, + "step": 9104 + }, + { + "epoch": 0.8577282682932573, + "grad_norm": 0.8669114112854004, + "learning_rate": 1.2571777862748782e-05, + "loss": 0.3364, + "step": 9105 + }, + { + "epoch": 0.8578224723864252, + "grad_norm": 0.706563413143158, + "learning_rate": 1.257031860614374e-05, + "loss": 0.2956, + "step": 9106 + }, + { + "epoch": 0.857916676479593, + "grad_norm": 0.7623336911201477, + "learning_rate": 1.2568859290931454e-05, + "loss": 0.3228, + "step": 9107 + }, + { + "epoch": 0.8580108805727609, + "grad_norm": 0.7400816082954407, + "learning_rate": 1.2567399917145208e-05, + "loss": 0.3163, + "step": 9108 + }, + { + "epoch": 0.8581050846659287, + "grad_norm": 0.7918370366096497, + "learning_rate": 1.2565940484818273e-05, + "loss": 0.3487, + "step": 9109 + }, + { + "epoch": 0.8581992887590966, + "grad_norm": 0.7184942364692688, + "learning_rate": 1.2564480993983928e-05, + "loss": 0.2998, + "step": 9110 + }, + { + "epoch": 0.8582934928522644, + "grad_norm": 0.7348983287811279, + "learning_rate": 1.2563021444675447e-05, + "loss": 0.3106, + "step": 9111 + }, + { + "epoch": 0.8583876969454323, + "grad_norm": 0.6605958342552185, + "learning_rate": 1.2561561836926115e-05, + "loss": 0.2437, + "step": 9112 + }, + { + "epoch": 0.8584819010386001, + "grad_norm": 0.7503715753555298, + "learning_rate": 1.2560102170769212e-05, + "loss": 0.3322, + "step": 9113 + }, + { + "epoch": 0.858576105131768, + "grad_norm": 0.703746497631073, + "learning_rate": 1.255864244623802e-05, + "loss": 0.3332, + "step": 9114 + }, + { + "epoch": 0.8586703092249358, + "grad_norm": 0.6568114161491394, + "learning_rate": 1.2557182663365823e-05, + "loss": 0.3007, + "step": 9115 + }, + { + "epoch": 0.8587645133181037, + "grad_norm": 0.6698923110961914, + "learning_rate": 1.2555722822185906e-05, + "loss": 0.2909, + "step": 9116 + }, + { + "epoch": 0.8588587174112715, + "grad_norm": 0.9063276648521423, + "learning_rate": 1.2554262922731555e-05, + "loss": 0.3045, + "step": 9117 + }, + { + "epoch": 0.8589529215044394, + "grad_norm": 0.7718333005905151, + "learning_rate": 1.2552802965036063e-05, + "loss": 0.2899, + "step": 9118 + }, + { + "epoch": 0.8590471255976072, + "grad_norm": 0.7325762510299683, + "learning_rate": 1.2551342949132713e-05, + "loss": 0.2974, + "step": 9119 + }, + { + "epoch": 0.8591413296907751, + "grad_norm": 0.7019739747047424, + "learning_rate": 1.2549882875054797e-05, + "loss": 0.3174, + "step": 9120 + }, + { + "epoch": 0.8592355337839429, + "grad_norm": 0.8704683780670166, + "learning_rate": 1.2548422742835608e-05, + "loss": 0.2874, + "step": 9121 + }, + { + "epoch": 0.8593297378771108, + "grad_norm": 0.7827633023262024, + "learning_rate": 1.254696255250844e-05, + "loss": 0.3136, + "step": 9122 + }, + { + "epoch": 0.8594239419702786, + "grad_norm": 0.7730802893638611, + "learning_rate": 1.2545502304106588e-05, + "loss": 0.3098, + "step": 9123 + }, + { + "epoch": 0.8595181460634465, + "grad_norm": 0.7378262281417847, + "learning_rate": 1.2544041997663348e-05, + "loss": 0.3058, + "step": 9124 + }, + { + "epoch": 0.8596123501566143, + "grad_norm": 0.6844967603683472, + "learning_rate": 1.2542581633212015e-05, + "loss": 0.3135, + "step": 9125 + }, + { + "epoch": 0.8597065542497822, + "grad_norm": 0.6816369891166687, + "learning_rate": 1.2541121210785887e-05, + "loss": 0.2725, + "step": 9126 + }, + { + "epoch": 0.85980075834295, + "grad_norm": 0.7545812129974365, + "learning_rate": 1.2539660730418264e-05, + "loss": 0.3462, + "step": 9127 + }, + { + "epoch": 0.8598949624361178, + "grad_norm": 0.6996229887008667, + "learning_rate": 1.2538200192142451e-05, + "loss": 0.2869, + "step": 9128 + }, + { + "epoch": 0.8599891665292857, + "grad_norm": 0.7229344248771667, + "learning_rate": 1.253673959599175e-05, + "loss": 0.3101, + "step": 9129 + }, + { + "epoch": 0.8600833706224535, + "grad_norm": 0.637445330619812, + "learning_rate": 1.253527894199946e-05, + "loss": 0.3117, + "step": 9130 + }, + { + "epoch": 0.8601775747156214, + "grad_norm": 0.7283216714859009, + "learning_rate": 1.2533818230198889e-05, + "loss": 0.3119, + "step": 9131 + }, + { + "epoch": 0.8602717788087892, + "grad_norm": 0.69550621509552, + "learning_rate": 1.2532357460623345e-05, + "loss": 0.3136, + "step": 9132 + }, + { + "epoch": 0.8603659829019571, + "grad_norm": 0.7138452529907227, + "learning_rate": 1.2530896633306136e-05, + "loss": 0.2971, + "step": 9133 + }, + { + "epoch": 0.8604601869951249, + "grad_norm": 0.7830208539962769, + "learning_rate": 1.2529435748280566e-05, + "loss": 0.3536, + "step": 9134 + }, + { + "epoch": 0.8605543910882928, + "grad_norm": 0.8075652718544006, + "learning_rate": 1.2527974805579954e-05, + "loss": 0.3704, + "step": 9135 + }, + { + "epoch": 0.8606485951814606, + "grad_norm": 0.7236867547035217, + "learning_rate": 1.2526513805237604e-05, + "loss": 0.2996, + "step": 9136 + }, + { + "epoch": 0.8607427992746285, + "grad_norm": 0.6777334809303284, + "learning_rate": 1.2525052747286832e-05, + "loss": 0.2915, + "step": 9137 + }, + { + "epoch": 0.8608370033677963, + "grad_norm": 0.6908157467842102, + "learning_rate": 1.2523591631760952e-05, + "loss": 0.3125, + "step": 9138 + }, + { + "epoch": 0.8609312074609642, + "grad_norm": 0.7357950806617737, + "learning_rate": 1.2522130458693278e-05, + "loss": 0.3328, + "step": 9139 + }, + { + "epoch": 0.861025411554132, + "grad_norm": 0.861672580242157, + "learning_rate": 1.2520669228117132e-05, + "loss": 0.3602, + "step": 9140 + }, + { + "epoch": 0.8611196156472999, + "grad_norm": 0.6655526161193848, + "learning_rate": 1.251920794006583e-05, + "loss": 0.2671, + "step": 9141 + }, + { + "epoch": 0.8612138197404677, + "grad_norm": 0.6929822564125061, + "learning_rate": 1.2517746594572688e-05, + "loss": 0.3119, + "step": 9142 + }, + { + "epoch": 0.8613080238336356, + "grad_norm": 0.7310613393783569, + "learning_rate": 1.2516285191671031e-05, + "loss": 0.3053, + "step": 9143 + }, + { + "epoch": 0.8614022279268034, + "grad_norm": 0.7642453908920288, + "learning_rate": 1.2514823731394182e-05, + "loss": 0.285, + "step": 9144 + }, + { + "epoch": 0.8614964320199713, + "grad_norm": 0.6509019732475281, + "learning_rate": 1.2513362213775462e-05, + "loss": 0.3139, + "step": 9145 + }, + { + "epoch": 0.8615906361131391, + "grad_norm": 0.6920650005340576, + "learning_rate": 1.2511900638848196e-05, + "loss": 0.334, + "step": 9146 + }, + { + "epoch": 0.861684840206307, + "grad_norm": 0.6638849973678589, + "learning_rate": 1.2510439006645707e-05, + "loss": 0.3166, + "step": 9147 + }, + { + "epoch": 0.8617790442994748, + "grad_norm": 0.727704644203186, + "learning_rate": 1.2508977317201332e-05, + "loss": 0.3115, + "step": 9148 + }, + { + "epoch": 0.8618732483926427, + "grad_norm": 0.7568182349205017, + "learning_rate": 1.2507515570548392e-05, + "loss": 0.3262, + "step": 9149 + }, + { + "epoch": 0.8619674524858105, + "grad_norm": 0.7067240476608276, + "learning_rate": 1.250605376672022e-05, + "loss": 0.3153, + "step": 9150 + }, + { + "epoch": 0.8620616565789784, + "grad_norm": 0.7335924506187439, + "learning_rate": 1.2504591905750143e-05, + "loss": 0.3367, + "step": 9151 + }, + { + "epoch": 0.8621558606721462, + "grad_norm": 0.6466672420501709, + "learning_rate": 1.25031299876715e-05, + "loss": 0.2883, + "step": 9152 + }, + { + "epoch": 0.8622500647653141, + "grad_norm": 0.7279981970787048, + "learning_rate": 1.250166801251762e-05, + "loss": 0.3255, + "step": 9153 + }, + { + "epoch": 0.8623442688584819, + "grad_norm": 0.6738430261611938, + "learning_rate": 1.250020598032184e-05, + "loss": 0.2918, + "step": 9154 + }, + { + "epoch": 0.8624384729516498, + "grad_norm": 0.7319638729095459, + "learning_rate": 1.2498743891117502e-05, + "loss": 0.2955, + "step": 9155 + }, + { + "epoch": 0.8625326770448176, + "grad_norm": 0.8512725830078125, + "learning_rate": 1.2497281744937934e-05, + "loss": 0.3254, + "step": 9156 + }, + { + "epoch": 0.8626268811379855, + "grad_norm": 0.7730419635772705, + "learning_rate": 1.2495819541816483e-05, + "loss": 0.2859, + "step": 9157 + }, + { + "epoch": 0.8627210852311533, + "grad_norm": 0.6937309503555298, + "learning_rate": 1.2494357281786487e-05, + "loss": 0.3012, + "step": 9158 + }, + { + "epoch": 0.8628152893243212, + "grad_norm": 0.8749970197677612, + "learning_rate": 1.2492894964881282e-05, + "loss": 0.3223, + "step": 9159 + }, + { + "epoch": 0.862909493417489, + "grad_norm": 0.6235557198524475, + "learning_rate": 1.249143259113422e-05, + "loss": 0.2971, + "step": 9160 + }, + { + "epoch": 0.8630036975106569, + "grad_norm": 0.6645923256874084, + "learning_rate": 1.2489970160578645e-05, + "loss": 0.3051, + "step": 9161 + }, + { + "epoch": 0.8630979016038247, + "grad_norm": 0.8053779602050781, + "learning_rate": 1.2488507673247894e-05, + "loss": 0.3286, + "step": 9162 + }, + { + "epoch": 0.8631921056969926, + "grad_norm": 0.8143567442893982, + "learning_rate": 1.2487045129175322e-05, + "loss": 0.3026, + "step": 9163 + }, + { + "epoch": 0.8632863097901604, + "grad_norm": 0.6536929607391357, + "learning_rate": 1.2485582528394276e-05, + "loss": 0.2902, + "step": 9164 + }, + { + "epoch": 0.8633805138833283, + "grad_norm": 0.7779171466827393, + "learning_rate": 1.2484119870938102e-05, + "loss": 0.3174, + "step": 9165 + }, + { + "epoch": 0.8634747179764961, + "grad_norm": 0.697607159614563, + "learning_rate": 1.2482657156840157e-05, + "loss": 0.3127, + "step": 9166 + }, + { + "epoch": 0.863568922069664, + "grad_norm": 0.880972683429718, + "learning_rate": 1.2481194386133784e-05, + "loss": 0.3126, + "step": 9167 + }, + { + "epoch": 0.8636631261628318, + "grad_norm": 0.8075266480445862, + "learning_rate": 1.2479731558852345e-05, + "loss": 0.3384, + "step": 9168 + }, + { + "epoch": 0.8637573302559997, + "grad_norm": 0.7963753342628479, + "learning_rate": 1.2478268675029193e-05, + "loss": 0.3422, + "step": 9169 + }, + { + "epoch": 0.8638515343491675, + "grad_norm": 0.751092255115509, + "learning_rate": 1.2476805734697679e-05, + "loss": 0.292, + "step": 9170 + }, + { + "epoch": 0.8639457384423354, + "grad_norm": 0.7625300884246826, + "learning_rate": 1.2475342737891164e-05, + "loss": 0.313, + "step": 9171 + }, + { + "epoch": 0.8640399425355032, + "grad_norm": 0.6691493988037109, + "learning_rate": 1.2473879684643006e-05, + "loss": 0.2766, + "step": 9172 + }, + { + "epoch": 0.864134146628671, + "grad_norm": 0.7864203453063965, + "learning_rate": 1.247241657498657e-05, + "loss": 0.3185, + "step": 9173 + }, + { + "epoch": 0.8642283507218389, + "grad_norm": 0.8323874473571777, + "learning_rate": 1.2470953408955206e-05, + "loss": 0.2968, + "step": 9174 + }, + { + "epoch": 0.8643225548150067, + "grad_norm": 0.6916950941085815, + "learning_rate": 1.2469490186582289e-05, + "loss": 0.2846, + "step": 9175 + }, + { + "epoch": 0.8644167589081746, + "grad_norm": 0.7014223337173462, + "learning_rate": 1.2468026907901171e-05, + "loss": 0.3205, + "step": 9176 + }, + { + "epoch": 0.8645109630013424, + "grad_norm": 0.6320071220397949, + "learning_rate": 1.2466563572945228e-05, + "loss": 0.2632, + "step": 9177 + }, + { + "epoch": 0.8646051670945103, + "grad_norm": 0.8776123523712158, + "learning_rate": 1.2465100181747817e-05, + "loss": 0.3685, + "step": 9178 + }, + { + "epoch": 0.8646993711876781, + "grad_norm": 0.8309136629104614, + "learning_rate": 1.246363673434231e-05, + "loss": 0.3196, + "step": 9179 + }, + { + "epoch": 0.864793575280846, + "grad_norm": 0.8322733640670776, + "learning_rate": 1.2462173230762078e-05, + "loss": 0.2985, + "step": 9180 + }, + { + "epoch": 0.8648877793740138, + "grad_norm": 0.6591123938560486, + "learning_rate": 1.2460709671040486e-05, + "loss": 0.3048, + "step": 9181 + }, + { + "epoch": 0.8649819834671817, + "grad_norm": 0.7137584686279297, + "learning_rate": 1.2459246055210907e-05, + "loss": 0.2903, + "step": 9182 + }, + { + "epoch": 0.8650761875603495, + "grad_norm": 0.6407827734947205, + "learning_rate": 1.2457782383306719e-05, + "loss": 0.304, + "step": 9183 + }, + { + "epoch": 0.8651703916535174, + "grad_norm": 0.7770898938179016, + "learning_rate": 1.2456318655361288e-05, + "loss": 0.2805, + "step": 9184 + }, + { + "epoch": 0.8652645957466852, + "grad_norm": 0.8176446557044983, + "learning_rate": 1.2454854871407993e-05, + "loss": 0.3625, + "step": 9185 + }, + { + "epoch": 0.8653587998398531, + "grad_norm": 0.7962294816970825, + "learning_rate": 1.2453391031480214e-05, + "loss": 0.299, + "step": 9186 + }, + { + "epoch": 0.8654530039330209, + "grad_norm": 0.6298891305923462, + "learning_rate": 1.2451927135611319e-05, + "loss": 0.2982, + "step": 9187 + }, + { + "epoch": 0.8655472080261888, + "grad_norm": 0.7367516160011292, + "learning_rate": 1.2450463183834697e-05, + "loss": 0.301, + "step": 9188 + }, + { + "epoch": 0.8656414121193566, + "grad_norm": 0.7628706097602844, + "learning_rate": 1.2448999176183725e-05, + "loss": 0.3391, + "step": 9189 + }, + { + "epoch": 0.8657356162125245, + "grad_norm": 0.7715069055557251, + "learning_rate": 1.2447535112691784e-05, + "loss": 0.3473, + "step": 9190 + }, + { + "epoch": 0.8658298203056923, + "grad_norm": 0.6850812435150146, + "learning_rate": 1.2446070993392257e-05, + "loss": 0.3036, + "step": 9191 + }, + { + "epoch": 0.8659240243988602, + "grad_norm": 0.7510053515434265, + "learning_rate": 1.2444606818318528e-05, + "loss": 0.337, + "step": 9192 + }, + { + "epoch": 0.866018228492028, + "grad_norm": 0.76163250207901, + "learning_rate": 1.2443142587503983e-05, + "loss": 0.3318, + "step": 9193 + }, + { + "epoch": 0.8661124325851959, + "grad_norm": 0.671679675579071, + "learning_rate": 1.2441678300982007e-05, + "loss": 0.3011, + "step": 9194 + }, + { + "epoch": 0.8662066366783637, + "grad_norm": 0.6736225485801697, + "learning_rate": 1.2440213958785994e-05, + "loss": 0.2872, + "step": 9195 + }, + { + "epoch": 0.8663008407715316, + "grad_norm": 0.779719352722168, + "learning_rate": 1.2438749560949325e-05, + "loss": 0.3442, + "step": 9196 + }, + { + "epoch": 0.8663950448646994, + "grad_norm": 0.7523724436759949, + "learning_rate": 1.2437285107505397e-05, + "loss": 0.2835, + "step": 9197 + }, + { + "epoch": 0.8664892489578672, + "grad_norm": 0.7877782583236694, + "learning_rate": 1.2435820598487599e-05, + "loss": 0.3011, + "step": 9198 + }, + { + "epoch": 0.866583453051035, + "grad_norm": 0.8221269249916077, + "learning_rate": 1.243435603392932e-05, + "loss": 0.3554, + "step": 9199 + }, + { + "epoch": 0.8666776571442029, + "grad_norm": 0.6900723576545715, + "learning_rate": 1.2432891413863964e-05, + "loss": 0.2813, + "step": 9200 + }, + { + "epoch": 0.8667718612373707, + "grad_norm": 0.6873626112937927, + "learning_rate": 1.2431426738324919e-05, + "loss": 0.2665, + "step": 9201 + }, + { + "epoch": 0.8668660653305386, + "grad_norm": 0.8400952219963074, + "learning_rate": 1.2429962007345584e-05, + "loss": 0.3231, + "step": 9202 + }, + { + "epoch": 0.8669602694237064, + "grad_norm": 0.7722880840301514, + "learning_rate": 1.2428497220959359e-05, + "loss": 0.3236, + "step": 9203 + }, + { + "epoch": 0.8670544735168743, + "grad_norm": 0.6867839694023132, + "learning_rate": 1.242703237919964e-05, + "loss": 0.3303, + "step": 9204 + }, + { + "epoch": 0.8671486776100421, + "grad_norm": 0.7867183685302734, + "learning_rate": 1.242556748209983e-05, + "loss": 0.3048, + "step": 9205 + }, + { + "epoch": 0.86724288170321, + "grad_norm": 0.7227653861045837, + "learning_rate": 1.242410252969333e-05, + "loss": 0.2925, + "step": 9206 + }, + { + "epoch": 0.8673370857963778, + "grad_norm": 0.6544963717460632, + "learning_rate": 1.242263752201354e-05, + "loss": 0.3005, + "step": 9207 + }, + { + "epoch": 0.8674312898895457, + "grad_norm": 0.7285255789756775, + "learning_rate": 1.242117245909387e-05, + "loss": 0.2526, + "step": 9208 + }, + { + "epoch": 0.8675254939827135, + "grad_norm": 0.7026352286338806, + "learning_rate": 1.2419707340967726e-05, + "loss": 0.287, + "step": 9209 + }, + { + "epoch": 0.8676196980758814, + "grad_norm": 0.7239752411842346, + "learning_rate": 1.241824216766851e-05, + "loss": 0.3454, + "step": 9210 + }, + { + "epoch": 0.8677139021690492, + "grad_norm": 0.7591168880462646, + "learning_rate": 1.2416776939229633e-05, + "loss": 0.3133, + "step": 9211 + }, + { + "epoch": 0.867808106262217, + "grad_norm": 0.8220389485359192, + "learning_rate": 1.2415311655684506e-05, + "loss": 0.3254, + "step": 9212 + }, + { + "epoch": 0.8679023103553849, + "grad_norm": 0.7573966383934021, + "learning_rate": 1.2413846317066535e-05, + "loss": 0.3084, + "step": 9213 + }, + { + "epoch": 0.8679965144485527, + "grad_norm": 0.7685338854789734, + "learning_rate": 1.2412380923409138e-05, + "loss": 0.3149, + "step": 9214 + }, + { + "epoch": 0.8680907185417206, + "grad_norm": 0.6167116165161133, + "learning_rate": 1.2410915474745724e-05, + "loss": 0.2629, + "step": 9215 + }, + { + "epoch": 0.8681849226348884, + "grad_norm": 0.823922872543335, + "learning_rate": 1.2409449971109705e-05, + "loss": 0.2957, + "step": 9216 + }, + { + "epoch": 0.8682791267280563, + "grad_norm": 0.8153708577156067, + "learning_rate": 1.2407984412534507e-05, + "loss": 0.3213, + "step": 9217 + }, + { + "epoch": 0.8683733308212241, + "grad_norm": 0.8279568552970886, + "learning_rate": 1.2406518799053538e-05, + "loss": 0.3536, + "step": 9218 + }, + { + "epoch": 0.868467534914392, + "grad_norm": 0.7307692766189575, + "learning_rate": 1.2405053130700215e-05, + "loss": 0.2734, + "step": 9219 + }, + { + "epoch": 0.8685617390075598, + "grad_norm": 0.7823532819747925, + "learning_rate": 1.2403587407507965e-05, + "loss": 0.3167, + "step": 9220 + }, + { + "epoch": 0.8686559431007277, + "grad_norm": 0.7097548246383667, + "learning_rate": 1.2402121629510202e-05, + "loss": 0.3035, + "step": 9221 + }, + { + "epoch": 0.8687501471938955, + "grad_norm": 0.7290064096450806, + "learning_rate": 1.240065579674035e-05, + "loss": 0.2687, + "step": 9222 + }, + { + "epoch": 0.8688443512870634, + "grad_norm": 0.642393946647644, + "learning_rate": 1.2399189909231838e-05, + "loss": 0.2765, + "step": 9223 + }, + { + "epoch": 0.8689385553802312, + "grad_norm": 0.7616701722145081, + "learning_rate": 1.2397723967018083e-05, + "loss": 0.2903, + "step": 9224 + }, + { + "epoch": 0.8690327594733991, + "grad_norm": 0.6706497073173523, + "learning_rate": 1.2396257970132514e-05, + "loss": 0.3465, + "step": 9225 + }, + { + "epoch": 0.8691269635665669, + "grad_norm": 0.6539916396141052, + "learning_rate": 1.2394791918608557e-05, + "loss": 0.2923, + "step": 9226 + }, + { + "epoch": 0.8692211676597348, + "grad_norm": 0.7657091617584229, + "learning_rate": 1.239332581247964e-05, + "loss": 0.3147, + "step": 9227 + }, + { + "epoch": 0.8693153717529026, + "grad_norm": 0.8091147541999817, + "learning_rate": 1.2391859651779195e-05, + "loss": 0.3214, + "step": 9228 + }, + { + "epoch": 0.8694095758460705, + "grad_norm": 0.7738662362098694, + "learning_rate": 1.2390393436540649e-05, + "loss": 0.336, + "step": 9229 + }, + { + "epoch": 0.8695037799392383, + "grad_norm": 0.6700285077095032, + "learning_rate": 1.2388927166797438e-05, + "loss": 0.2857, + "step": 9230 + }, + { + "epoch": 0.8695979840324062, + "grad_norm": 0.7972094416618347, + "learning_rate": 1.238746084258299e-05, + "loss": 0.3035, + "step": 9231 + }, + { + "epoch": 0.869692188125574, + "grad_norm": 0.8680013418197632, + "learning_rate": 1.2385994463930743e-05, + "loss": 0.3114, + "step": 9232 + }, + { + "epoch": 0.8697863922187419, + "grad_norm": 0.7896369099617004, + "learning_rate": 1.2384528030874134e-05, + "loss": 0.2947, + "step": 9233 + }, + { + "epoch": 0.8698805963119097, + "grad_norm": 0.6831520795822144, + "learning_rate": 1.2383061543446596e-05, + "loss": 0.3035, + "step": 9234 + }, + { + "epoch": 0.8699748004050776, + "grad_norm": 0.8190691471099854, + "learning_rate": 1.2381595001681574e-05, + "loss": 0.2804, + "step": 9235 + }, + { + "epoch": 0.8700690044982454, + "grad_norm": 0.615748941898346, + "learning_rate": 1.23801284056125e-05, + "loss": 0.2636, + "step": 9236 + }, + { + "epoch": 0.8701632085914133, + "grad_norm": 0.7815951704978943, + "learning_rate": 1.2378661755272817e-05, + "loss": 0.3178, + "step": 9237 + }, + { + "epoch": 0.8702574126845811, + "grad_norm": 0.71673184633255, + "learning_rate": 1.2377195050695967e-05, + "loss": 0.3242, + "step": 9238 + }, + { + "epoch": 0.870351616777749, + "grad_norm": 0.835582971572876, + "learning_rate": 1.2375728291915391e-05, + "loss": 0.3205, + "step": 9239 + }, + { + "epoch": 0.8704458208709168, + "grad_norm": 0.6454624533653259, + "learning_rate": 1.237426147896454e-05, + "loss": 0.3111, + "step": 9240 + }, + { + "epoch": 0.8705400249640847, + "grad_norm": 0.7328464984893799, + "learning_rate": 1.2372794611876855e-05, + "loss": 0.2991, + "step": 9241 + }, + { + "epoch": 0.8706342290572525, + "grad_norm": 0.7155019640922546, + "learning_rate": 1.237132769068578e-05, + "loss": 0.3049, + "step": 9242 + }, + { + "epoch": 0.8707284331504204, + "grad_norm": 1.338820219039917, + "learning_rate": 1.236986071542477e-05, + "loss": 0.3122, + "step": 9243 + }, + { + "epoch": 0.8708226372435882, + "grad_norm": 0.9869731664657593, + "learning_rate": 1.236839368612727e-05, + "loss": 0.2854, + "step": 9244 + }, + { + "epoch": 0.8709168413367561, + "grad_norm": 0.7557142376899719, + "learning_rate": 1.236692660282673e-05, + "loss": 0.3155, + "step": 9245 + }, + { + "epoch": 0.8710110454299239, + "grad_norm": 0.7705155611038208, + "learning_rate": 1.2365459465556603e-05, + "loss": 0.3244, + "step": 9246 + }, + { + "epoch": 0.8711052495230918, + "grad_norm": 0.7242844700813293, + "learning_rate": 1.236399227435034e-05, + "loss": 0.3464, + "step": 9247 + }, + { + "epoch": 0.8711994536162596, + "grad_norm": 0.6580681204795837, + "learning_rate": 1.2362525029241401e-05, + "loss": 0.2681, + "step": 9248 + }, + { + "epoch": 0.8712936577094275, + "grad_norm": 0.79903244972229, + "learning_rate": 1.2361057730263235e-05, + "loss": 0.3381, + "step": 9249 + }, + { + "epoch": 0.8713878618025953, + "grad_norm": 0.7240850329399109, + "learning_rate": 1.2359590377449301e-05, + "loss": 0.3111, + "step": 9250 + }, + { + "epoch": 0.8714820658957632, + "grad_norm": 0.7803201079368591, + "learning_rate": 1.235812297083306e-05, + "loss": 0.2922, + "step": 9251 + }, + { + "epoch": 0.871576269988931, + "grad_norm": 0.801443874835968, + "learning_rate": 1.2356655510447966e-05, + "loss": 0.3139, + "step": 9252 + }, + { + "epoch": 0.8716704740820989, + "grad_norm": 0.7281584143638611, + "learning_rate": 1.2355187996327484e-05, + "loss": 0.3238, + "step": 9253 + }, + { + "epoch": 0.8717646781752667, + "grad_norm": 0.6576111912727356, + "learning_rate": 1.2353720428505072e-05, + "loss": 0.2689, + "step": 9254 + }, + { + "epoch": 0.8718588822684346, + "grad_norm": 0.7188935875892639, + "learning_rate": 1.2352252807014194e-05, + "loss": 0.3289, + "step": 9255 + }, + { + "epoch": 0.8719530863616024, + "grad_norm": 0.7349696755409241, + "learning_rate": 1.2350785131888311e-05, + "loss": 0.3006, + "step": 9256 + }, + { + "epoch": 0.8720472904547703, + "grad_norm": 0.7604920268058777, + "learning_rate": 1.2349317403160897e-05, + "loss": 0.302, + "step": 9257 + }, + { + "epoch": 0.8721414945479381, + "grad_norm": 0.6442971229553223, + "learning_rate": 1.234784962086541e-05, + "loss": 0.2962, + "step": 9258 + }, + { + "epoch": 0.872235698641106, + "grad_norm": 0.6648150682449341, + "learning_rate": 1.234638178503532e-05, + "loss": 0.2951, + "step": 9259 + }, + { + "epoch": 0.8723299027342738, + "grad_norm": 0.7383939623832703, + "learning_rate": 1.2344913895704099e-05, + "loss": 0.2839, + "step": 9260 + }, + { + "epoch": 0.8724241068274416, + "grad_norm": 0.7782507538795471, + "learning_rate": 1.2343445952905212e-05, + "loss": 0.3066, + "step": 9261 + }, + { + "epoch": 0.8725183109206095, + "grad_norm": 0.66548752784729, + "learning_rate": 1.2341977956672135e-05, + "loss": 0.2874, + "step": 9262 + }, + { + "epoch": 0.8726125150137773, + "grad_norm": 0.7760818600654602, + "learning_rate": 1.2340509907038341e-05, + "loss": 0.3039, + "step": 9263 + }, + { + "epoch": 0.8727067191069452, + "grad_norm": 0.7823361754417419, + "learning_rate": 1.2339041804037294e-05, + "loss": 0.292, + "step": 9264 + }, + { + "epoch": 0.872800923200113, + "grad_norm": 0.7515829205513, + "learning_rate": 1.2337573647702483e-05, + "loss": 0.3131, + "step": 9265 + }, + { + "epoch": 0.8728951272932809, + "grad_norm": 0.7954232096672058, + "learning_rate": 1.2336105438067376e-05, + "loss": 0.324, + "step": 9266 + }, + { + "epoch": 0.8729893313864487, + "grad_norm": 0.8027656078338623, + "learning_rate": 1.2334637175165451e-05, + "loss": 0.3211, + "step": 9267 + }, + { + "epoch": 0.8730835354796166, + "grad_norm": 0.7220860719680786, + "learning_rate": 1.233316885903019e-05, + "loss": 0.27, + "step": 9268 + }, + { + "epoch": 0.8731777395727844, + "grad_norm": 0.7339596748352051, + "learning_rate": 1.2331700489695069e-05, + "loss": 0.2881, + "step": 9269 + }, + { + "epoch": 0.8732719436659523, + "grad_norm": 0.7125227451324463, + "learning_rate": 1.2330232067193572e-05, + "loss": 0.2795, + "step": 9270 + }, + { + "epoch": 0.8733661477591201, + "grad_norm": 0.6848975419998169, + "learning_rate": 1.2328763591559176e-05, + "loss": 0.2748, + "step": 9271 + }, + { + "epoch": 0.873460351852288, + "grad_norm": 0.7000641226768494, + "learning_rate": 1.2327295062825373e-05, + "loss": 0.2804, + "step": 9272 + }, + { + "epoch": 0.8735545559454558, + "grad_norm": 0.7253985404968262, + "learning_rate": 1.2325826481025644e-05, + "loss": 0.32, + "step": 9273 + }, + { + "epoch": 0.8736487600386237, + "grad_norm": 0.7008634209632874, + "learning_rate": 1.2324357846193472e-05, + "loss": 0.2772, + "step": 9274 + }, + { + "epoch": 0.8737429641317915, + "grad_norm": 0.6910400390625, + "learning_rate": 1.2322889158362347e-05, + "loss": 0.2928, + "step": 9275 + }, + { + "epoch": 0.8738371682249594, + "grad_norm": 0.8178048729896545, + "learning_rate": 1.2321420417565757e-05, + "loss": 0.3169, + "step": 9276 + }, + { + "epoch": 0.8739313723181272, + "grad_norm": 0.7599455118179321, + "learning_rate": 1.2319951623837189e-05, + "loss": 0.3185, + "step": 9277 + }, + { + "epoch": 0.8740255764112951, + "grad_norm": 0.6750515699386597, + "learning_rate": 1.231848277721014e-05, + "loss": 0.3254, + "step": 9278 + }, + { + "epoch": 0.8741197805044629, + "grad_norm": 0.7708296179771423, + "learning_rate": 1.2317013877718096e-05, + "loss": 0.3207, + "step": 9279 + }, + { + "epoch": 0.8742139845976308, + "grad_norm": 0.8172353506088257, + "learning_rate": 1.2315544925394553e-05, + "loss": 0.302, + "step": 9280 + }, + { + "epoch": 0.8743081886907986, + "grad_norm": 0.7476603388786316, + "learning_rate": 1.2314075920273002e-05, + "loss": 0.3016, + "step": 9281 + }, + { + "epoch": 0.8744023927839665, + "grad_norm": 0.767647385597229, + "learning_rate": 1.2312606862386942e-05, + "loss": 0.3304, + "step": 9282 + }, + { + "epoch": 0.8744965968771343, + "grad_norm": 0.7349490523338318, + "learning_rate": 1.2311137751769875e-05, + "loss": 0.2832, + "step": 9283 + }, + { + "epoch": 0.8745908009703022, + "grad_norm": 0.6973784565925598, + "learning_rate": 1.2309668588455285e-05, + "loss": 0.2557, + "step": 9284 + }, + { + "epoch": 0.87468500506347, + "grad_norm": 0.8286645412445068, + "learning_rate": 1.2308199372476683e-05, + "loss": 0.3596, + "step": 9285 + }, + { + "epoch": 0.8747792091566379, + "grad_norm": 0.744421660900116, + "learning_rate": 1.2306730103867565e-05, + "loss": 0.2555, + "step": 9286 + }, + { + "epoch": 0.8748734132498057, + "grad_norm": 0.7113873958587646, + "learning_rate": 1.2305260782661433e-05, + "loss": 0.2527, + "step": 9287 + }, + { + "epoch": 0.8749676173429736, + "grad_norm": 0.8271864056587219, + "learning_rate": 1.2303791408891792e-05, + "loss": 0.3028, + "step": 9288 + }, + { + "epoch": 0.8750618214361414, + "grad_norm": 0.7463380098342896, + "learning_rate": 1.2302321982592142e-05, + "loss": 0.2915, + "step": 9289 + }, + { + "epoch": 0.8751560255293093, + "grad_norm": 0.8846245408058167, + "learning_rate": 1.2300852503795993e-05, + "loss": 0.3512, + "step": 9290 + }, + { + "epoch": 0.8752502296224771, + "grad_norm": 0.7288370728492737, + "learning_rate": 1.2299382972536842e-05, + "loss": 0.3388, + "step": 9291 + }, + { + "epoch": 0.875344433715645, + "grad_norm": 0.6647352576255798, + "learning_rate": 1.2297913388848208e-05, + "loss": 0.3287, + "step": 9292 + }, + { + "epoch": 0.8754386378088128, + "grad_norm": 0.8051317930221558, + "learning_rate": 1.2296443752763597e-05, + "loss": 0.3185, + "step": 9293 + }, + { + "epoch": 0.8755328419019807, + "grad_norm": 0.8401761651039124, + "learning_rate": 1.2294974064316513e-05, + "loss": 0.3296, + "step": 9294 + }, + { + "epoch": 0.8756270459951485, + "grad_norm": 0.8169759511947632, + "learning_rate": 1.2293504323540473e-05, + "loss": 0.3165, + "step": 9295 + }, + { + "epoch": 0.8757212500883164, + "grad_norm": 0.7035739421844482, + "learning_rate": 1.2292034530468986e-05, + "loss": 0.2821, + "step": 9296 + }, + { + "epoch": 0.8758154541814842, + "grad_norm": 0.6984587907791138, + "learning_rate": 1.2290564685135566e-05, + "loss": 0.2905, + "step": 9297 + }, + { + "epoch": 0.8759096582746521, + "grad_norm": 0.7788722515106201, + "learning_rate": 1.2289094787573732e-05, + "loss": 0.3047, + "step": 9298 + }, + { + "epoch": 0.8760038623678199, + "grad_norm": 0.6994571685791016, + "learning_rate": 1.2287624837816993e-05, + "loss": 0.3134, + "step": 9299 + }, + { + "epoch": 0.8760980664609878, + "grad_norm": 0.9961441159248352, + "learning_rate": 1.2286154835898876e-05, + "loss": 0.3159, + "step": 9300 + }, + { + "epoch": 0.8761922705541556, + "grad_norm": 0.8256969451904297, + "learning_rate": 1.2284684781852887e-05, + "loss": 0.3122, + "step": 9301 + }, + { + "epoch": 0.8762864746473235, + "grad_norm": 0.8606594800949097, + "learning_rate": 1.228321467571255e-05, + "loss": 0.3509, + "step": 9302 + }, + { + "epoch": 0.8763806787404913, + "grad_norm": 0.7667705416679382, + "learning_rate": 1.2281744517511396e-05, + "loss": 0.338, + "step": 9303 + }, + { + "epoch": 0.8764748828336592, + "grad_norm": 0.6364054083824158, + "learning_rate": 1.2280274307282932e-05, + "loss": 0.2827, + "step": 9304 + }, + { + "epoch": 0.876569086926827, + "grad_norm": 0.8485642671585083, + "learning_rate": 1.2278804045060688e-05, + "loss": 0.2519, + "step": 9305 + }, + { + "epoch": 0.8766632910199949, + "grad_norm": 0.8302154541015625, + "learning_rate": 1.227733373087819e-05, + "loss": 0.3256, + "step": 9306 + }, + { + "epoch": 0.8767574951131627, + "grad_norm": 0.8487189412117004, + "learning_rate": 1.2275863364768956e-05, + "loss": 0.2902, + "step": 9307 + }, + { + "epoch": 0.8768516992063305, + "grad_norm": 0.7642054557800293, + "learning_rate": 1.2274392946766522e-05, + "loss": 0.3033, + "step": 9308 + }, + { + "epoch": 0.8769459032994984, + "grad_norm": 0.6982212066650391, + "learning_rate": 1.2272922476904409e-05, + "loss": 0.3033, + "step": 9309 + }, + { + "epoch": 0.8770401073926662, + "grad_norm": 0.6717838644981384, + "learning_rate": 1.2271451955216151e-05, + "loss": 0.2991, + "step": 9310 + }, + { + "epoch": 0.8771343114858341, + "grad_norm": 0.7980400919914246, + "learning_rate": 1.2269981381735272e-05, + "loss": 0.2955, + "step": 9311 + }, + { + "epoch": 0.8772285155790019, + "grad_norm": 0.7617857456207275, + "learning_rate": 1.2268510756495312e-05, + "loss": 0.2952, + "step": 9312 + }, + { + "epoch": 0.8773227196721698, + "grad_norm": 0.9512116312980652, + "learning_rate": 1.2267040079529794e-05, + "loss": 0.3152, + "step": 9313 + }, + { + "epoch": 0.8774169237653376, + "grad_norm": 0.7181690335273743, + "learning_rate": 1.2265569350872257e-05, + "loss": 0.2613, + "step": 9314 + }, + { + "epoch": 0.8775111278585055, + "grad_norm": 0.8266440629959106, + "learning_rate": 1.2264098570556238e-05, + "loss": 0.2696, + "step": 9315 + }, + { + "epoch": 0.8776053319516733, + "grad_norm": 0.7346266508102417, + "learning_rate": 1.2262627738615265e-05, + "loss": 0.2991, + "step": 9316 + }, + { + "epoch": 0.8776995360448412, + "grad_norm": 0.8172141909599304, + "learning_rate": 1.2261156855082882e-05, + "loss": 0.3409, + "step": 9317 + }, + { + "epoch": 0.877793740138009, + "grad_norm": 0.6860228776931763, + "learning_rate": 1.225968591999263e-05, + "loss": 0.2855, + "step": 9318 + }, + { + "epoch": 0.8778879442311769, + "grad_norm": 0.6955971121788025, + "learning_rate": 1.2258214933378038e-05, + "loss": 0.3146, + "step": 9319 + }, + { + "epoch": 0.8779821483243447, + "grad_norm": 0.7971096038818359, + "learning_rate": 1.225674389527266e-05, + "loss": 0.3222, + "step": 9320 + }, + { + "epoch": 0.8780763524175126, + "grad_norm": 0.7157099843025208, + "learning_rate": 1.2255272805710026e-05, + "loss": 0.3392, + "step": 9321 + }, + { + "epoch": 0.8781705565106804, + "grad_norm": 0.6945052146911621, + "learning_rate": 1.2253801664723683e-05, + "loss": 0.3118, + "step": 9322 + }, + { + "epoch": 0.8782647606038483, + "grad_norm": 0.7462865114212036, + "learning_rate": 1.2252330472347183e-05, + "loss": 0.3189, + "step": 9323 + }, + { + "epoch": 0.8783589646970161, + "grad_norm": 0.7465896010398865, + "learning_rate": 1.2250859228614058e-05, + "loss": 0.283, + "step": 9324 + }, + { + "epoch": 0.878453168790184, + "grad_norm": 0.6610503792762756, + "learning_rate": 1.2249387933557864e-05, + "loss": 0.2933, + "step": 9325 + }, + { + "epoch": 0.8785473728833518, + "grad_norm": 0.6051380634307861, + "learning_rate": 1.2247916587212147e-05, + "loss": 0.2686, + "step": 9326 + }, + { + "epoch": 0.8786415769765197, + "grad_norm": 0.8131787180900574, + "learning_rate": 1.2246445189610455e-05, + "loss": 0.2976, + "step": 9327 + }, + { + "epoch": 0.8787357810696875, + "grad_norm": 0.6534878015518188, + "learning_rate": 1.2244973740786338e-05, + "loss": 0.2913, + "step": 9328 + }, + { + "epoch": 0.8788299851628554, + "grad_norm": 0.6280990242958069, + "learning_rate": 1.2243502240773348e-05, + "loss": 0.2646, + "step": 9329 + }, + { + "epoch": 0.8789241892560232, + "grad_norm": 0.6751523613929749, + "learning_rate": 1.224203068960504e-05, + "loss": 0.3356, + "step": 9330 + }, + { + "epoch": 0.8790183933491911, + "grad_norm": 0.8297321796417236, + "learning_rate": 1.224055908731496e-05, + "loss": 0.2952, + "step": 9331 + }, + { + "epoch": 0.8791125974423589, + "grad_norm": 0.6817976832389832, + "learning_rate": 1.2239087433936672e-05, + "loss": 0.2778, + "step": 9332 + }, + { + "epoch": 0.8792068015355268, + "grad_norm": 0.7363234162330627, + "learning_rate": 1.2237615729503726e-05, + "loss": 0.3439, + "step": 9333 + }, + { + "epoch": 0.8793010056286946, + "grad_norm": 0.7076244354248047, + "learning_rate": 1.2236143974049682e-05, + "loss": 0.2871, + "step": 9334 + }, + { + "epoch": 0.8793952097218625, + "grad_norm": 0.7310746312141418, + "learning_rate": 1.2234672167608095e-05, + "loss": 0.2654, + "step": 9335 + }, + { + "epoch": 0.8794894138150302, + "grad_norm": 0.6136433482170105, + "learning_rate": 1.2233200310212527e-05, + "loss": 0.263, + "step": 9336 + }, + { + "epoch": 0.8795836179081981, + "grad_norm": 0.8622340559959412, + "learning_rate": 1.2231728401896539e-05, + "loss": 0.2987, + "step": 9337 + }, + { + "epoch": 0.8796778220013659, + "grad_norm": 1.2962377071380615, + "learning_rate": 1.2230256442693693e-05, + "loss": 0.3027, + "step": 9338 + }, + { + "epoch": 0.8797720260945338, + "grad_norm": 1.405983567237854, + "learning_rate": 1.2228784432637548e-05, + "loss": 0.318, + "step": 9339 + }, + { + "epoch": 0.8798662301877016, + "grad_norm": 0.692185640335083, + "learning_rate": 1.2227312371761675e-05, + "loss": 0.2841, + "step": 9340 + }, + { + "epoch": 0.8799604342808695, + "grad_norm": 0.740404486656189, + "learning_rate": 1.222584026009963e-05, + "loss": 0.3323, + "step": 9341 + }, + { + "epoch": 0.8800546383740373, + "grad_norm": 0.7664980888366699, + "learning_rate": 1.2224368097684986e-05, + "loss": 0.3248, + "step": 9342 + }, + { + "epoch": 0.8801488424672052, + "grad_norm": 0.6764711141586304, + "learning_rate": 1.2222895884551315e-05, + "loss": 0.266, + "step": 9343 + }, + { + "epoch": 0.880243046560373, + "grad_norm": 0.6800090670585632, + "learning_rate": 1.2221423620732175e-05, + "loss": 0.298, + "step": 9344 + }, + { + "epoch": 0.8803372506535408, + "grad_norm": 0.695341169834137, + "learning_rate": 1.2219951306261141e-05, + "loss": 0.3057, + "step": 9345 + }, + { + "epoch": 0.8804314547467087, + "grad_norm": 0.7294071316719055, + "learning_rate": 1.2218478941171787e-05, + "loss": 0.2919, + "step": 9346 + }, + { + "epoch": 0.8805256588398765, + "grad_norm": 0.8023799061775208, + "learning_rate": 1.2217006525497678e-05, + "loss": 0.3484, + "step": 9347 + }, + { + "epoch": 0.8806198629330444, + "grad_norm": 0.8050774931907654, + "learning_rate": 1.2215534059272396e-05, + "loss": 0.2917, + "step": 9348 + }, + { + "epoch": 0.8807140670262122, + "grad_norm": 0.6966261267662048, + "learning_rate": 1.221406154252951e-05, + "loss": 0.3082, + "step": 9349 + }, + { + "epoch": 0.8808082711193801, + "grad_norm": 0.7030807733535767, + "learning_rate": 1.2212588975302595e-05, + "loss": 0.2577, + "step": 9350 + }, + { + "epoch": 0.8809024752125479, + "grad_norm": 0.7408086657524109, + "learning_rate": 1.2211116357625228e-05, + "loss": 0.2876, + "step": 9351 + }, + { + "epoch": 0.8809966793057158, + "grad_norm": 0.9203299880027771, + "learning_rate": 1.2209643689530993e-05, + "loss": 0.324, + "step": 9352 + }, + { + "epoch": 0.8810908833988836, + "grad_norm": 0.6748857498168945, + "learning_rate": 1.220817097105346e-05, + "loss": 0.3035, + "step": 9353 + }, + { + "epoch": 0.8811850874920515, + "grad_norm": 0.655755341053009, + "learning_rate": 1.220669820222622e-05, + "loss": 0.2817, + "step": 9354 + }, + { + "epoch": 0.8812792915852193, + "grad_norm": 0.6666610836982727, + "learning_rate": 1.2205225383082844e-05, + "loss": 0.2832, + "step": 9355 + }, + { + "epoch": 0.8813734956783872, + "grad_norm": 0.7367080450057983, + "learning_rate": 1.2203752513656917e-05, + "loss": 0.296, + "step": 9356 + }, + { + "epoch": 0.881467699771555, + "grad_norm": 0.7991181015968323, + "learning_rate": 1.2202279593982026e-05, + "loss": 0.3193, + "step": 9357 + }, + { + "epoch": 0.8815619038647229, + "grad_norm": 0.816676914691925, + "learning_rate": 1.2200806624091756e-05, + "loss": 0.2916, + "step": 9358 + }, + { + "epoch": 0.8816561079578907, + "grad_norm": 0.656477689743042, + "learning_rate": 1.219933360401969e-05, + "loss": 0.2861, + "step": 9359 + }, + { + "epoch": 0.8817503120510586, + "grad_norm": 0.7736837267875671, + "learning_rate": 1.2197860533799419e-05, + "loss": 0.3243, + "step": 9360 + }, + { + "epoch": 0.8818445161442264, + "grad_norm": 0.60536128282547, + "learning_rate": 1.2196387413464525e-05, + "loss": 0.286, + "step": 9361 + }, + { + "epoch": 0.8819387202373943, + "grad_norm": 0.7745570540428162, + "learning_rate": 1.2194914243048602e-05, + "loss": 0.3209, + "step": 9362 + }, + { + "epoch": 0.8820329243305621, + "grad_norm": 0.69176185131073, + "learning_rate": 1.2193441022585244e-05, + "loss": 0.2918, + "step": 9363 + }, + { + "epoch": 0.88212712842373, + "grad_norm": 0.8644979000091553, + "learning_rate": 1.219196775210803e-05, + "loss": 0.3146, + "step": 9364 + }, + { + "epoch": 0.8822213325168978, + "grad_norm": 0.657992959022522, + "learning_rate": 1.219049443165057e-05, + "loss": 0.3122, + "step": 9365 + }, + { + "epoch": 0.8823155366100657, + "grad_norm": 0.7901890277862549, + "learning_rate": 1.2189021061246447e-05, + "loss": 0.3294, + "step": 9366 + }, + { + "epoch": 0.8824097407032335, + "grad_norm": 0.7417951822280884, + "learning_rate": 1.2187547640929253e-05, + "loss": 0.2868, + "step": 9367 + }, + { + "epoch": 0.8825039447964014, + "grad_norm": 0.7628073692321777, + "learning_rate": 1.2186074170732596e-05, + "loss": 0.3154, + "step": 9368 + }, + { + "epoch": 0.8825981488895692, + "grad_norm": 0.8040744066238403, + "learning_rate": 1.2184600650690067e-05, + "loss": 0.3613, + "step": 9369 + }, + { + "epoch": 0.8826923529827371, + "grad_norm": 0.7684886455535889, + "learning_rate": 1.2183127080835262e-05, + "loss": 0.3367, + "step": 9370 + }, + { + "epoch": 0.8827865570759049, + "grad_norm": 1.0408079624176025, + "learning_rate": 1.2181653461201783e-05, + "loss": 0.3462, + "step": 9371 + }, + { + "epoch": 0.8828807611690728, + "grad_norm": 0.643016517162323, + "learning_rate": 1.218017979182323e-05, + "loss": 0.2872, + "step": 9372 + }, + { + "epoch": 0.8829749652622406, + "grad_norm": 0.7180635929107666, + "learning_rate": 1.2178706072733209e-05, + "loss": 0.311, + "step": 9373 + }, + { + "epoch": 0.8830691693554085, + "grad_norm": 1.2569615840911865, + "learning_rate": 1.217723230396532e-05, + "loss": 0.2956, + "step": 9374 + }, + { + "epoch": 0.8831633734485763, + "grad_norm": 1.8318754434585571, + "learning_rate": 1.2175758485553166e-05, + "loss": 0.2862, + "step": 9375 + }, + { + "epoch": 0.8832575775417442, + "grad_norm": 0.8067878484725952, + "learning_rate": 1.2174284617530354e-05, + "loss": 0.28, + "step": 9376 + }, + { + "epoch": 0.883351781634912, + "grad_norm": 0.652657687664032, + "learning_rate": 1.217281069993049e-05, + "loss": 0.3104, + "step": 9377 + }, + { + "epoch": 0.8834459857280799, + "grad_norm": 0.7842876315116882, + "learning_rate": 1.2171336732787183e-05, + "loss": 0.3545, + "step": 9378 + }, + { + "epoch": 0.8835401898212477, + "grad_norm": 0.8068026304244995, + "learning_rate": 1.2169862716134037e-05, + "loss": 0.2847, + "step": 9379 + }, + { + "epoch": 0.8836343939144156, + "grad_norm": 1.1152421236038208, + "learning_rate": 1.216838865000467e-05, + "loss": 0.3525, + "step": 9380 + }, + { + "epoch": 0.8837285980075834, + "grad_norm": 0.7238123416900635, + "learning_rate": 1.2166914534432686e-05, + "loss": 0.2845, + "step": 9381 + }, + { + "epoch": 0.8838228021007513, + "grad_norm": 0.7315871715545654, + "learning_rate": 1.2165440369451695e-05, + "loss": 0.2881, + "step": 9382 + }, + { + "epoch": 0.8839170061939191, + "grad_norm": 0.7550876140594482, + "learning_rate": 1.2163966155095323e-05, + "loss": 0.3484, + "step": 9383 + }, + { + "epoch": 0.884011210287087, + "grad_norm": 0.6491653919219971, + "learning_rate": 1.216249189139717e-05, + "loss": 0.3005, + "step": 9384 + }, + { + "epoch": 0.8841054143802548, + "grad_norm": 0.7400387525558472, + "learning_rate": 1.2161017578390862e-05, + "loss": 0.2821, + "step": 9385 + }, + { + "epoch": 0.8841996184734227, + "grad_norm": 0.7085525393486023, + "learning_rate": 1.2159543216110008e-05, + "loss": 0.2992, + "step": 9386 + }, + { + "epoch": 0.8842938225665905, + "grad_norm": 0.7039757370948792, + "learning_rate": 1.2158068804588228e-05, + "loss": 0.2984, + "step": 9387 + }, + { + "epoch": 0.8843880266597584, + "grad_norm": 0.7835156917572021, + "learning_rate": 1.2156594343859146e-05, + "loss": 0.3119, + "step": 9388 + }, + { + "epoch": 0.8844822307529262, + "grad_norm": 0.6807800531387329, + "learning_rate": 1.2155119833956373e-05, + "loss": 0.2852, + "step": 9389 + }, + { + "epoch": 0.884576434846094, + "grad_norm": 0.6613379120826721, + "learning_rate": 1.2153645274913537e-05, + "loss": 0.3071, + "step": 9390 + }, + { + "epoch": 0.8846706389392619, + "grad_norm": 0.8164822459220886, + "learning_rate": 1.2152170666764258e-05, + "loss": 0.3069, + "step": 9391 + }, + { + "epoch": 0.8847648430324297, + "grad_norm": 1.0489635467529297, + "learning_rate": 1.215069600954216e-05, + "loss": 0.2741, + "step": 9392 + }, + { + "epoch": 0.8848590471255976, + "grad_norm": 0.6539410948753357, + "learning_rate": 1.2149221303280865e-05, + "loss": 0.3041, + "step": 9393 + }, + { + "epoch": 0.8849532512187654, + "grad_norm": 0.6886880397796631, + "learning_rate": 1.2147746548014003e-05, + "loss": 0.2711, + "step": 9394 + }, + { + "epoch": 0.8850474553119333, + "grad_norm": 0.7714412808418274, + "learning_rate": 1.2146271743775198e-05, + "loss": 0.2941, + "step": 9395 + }, + { + "epoch": 0.8851416594051011, + "grad_norm": 0.7667918801307678, + "learning_rate": 1.2144796890598074e-05, + "loss": 0.2733, + "step": 9396 + }, + { + "epoch": 0.885235863498269, + "grad_norm": 0.9158183932304382, + "learning_rate": 1.2143321988516267e-05, + "loss": 0.3071, + "step": 9397 + }, + { + "epoch": 0.8853300675914368, + "grad_norm": 0.7339330315589905, + "learning_rate": 1.2141847037563405e-05, + "loss": 0.2645, + "step": 9398 + }, + { + "epoch": 0.8854242716846047, + "grad_norm": 1.1290115118026733, + "learning_rate": 1.2140372037773114e-05, + "loss": 0.3165, + "step": 9399 + }, + { + "epoch": 0.8855184757777725, + "grad_norm": 0.7589887380599976, + "learning_rate": 1.2138896989179037e-05, + "loss": 0.3021, + "step": 9400 + }, + { + "epoch": 0.8856126798709404, + "grad_norm": 0.8948603272438049, + "learning_rate": 1.2137421891814796e-05, + "loss": 0.3435, + "step": 9401 + }, + { + "epoch": 0.8857068839641082, + "grad_norm": 0.9510778188705444, + "learning_rate": 1.2135946745714028e-05, + "loss": 0.3555, + "step": 9402 + }, + { + "epoch": 0.8858010880572761, + "grad_norm": 0.8120291233062744, + "learning_rate": 1.2134471550910379e-05, + "loss": 0.3195, + "step": 9403 + }, + { + "epoch": 0.8858952921504439, + "grad_norm": 0.7990771532058716, + "learning_rate": 1.213299630743747e-05, + "loss": 0.2977, + "step": 9404 + }, + { + "epoch": 0.8859894962436118, + "grad_norm": 0.7180115580558777, + "learning_rate": 1.2131521015328948e-05, + "loss": 0.3232, + "step": 9405 + }, + { + "epoch": 0.8860837003367796, + "grad_norm": 0.7164108157157898, + "learning_rate": 1.2130045674618453e-05, + "loss": 0.284, + "step": 9406 + }, + { + "epoch": 0.8861779044299475, + "grad_norm": 0.7136685848236084, + "learning_rate": 1.212857028533962e-05, + "loss": 0.3216, + "step": 9407 + }, + { + "epoch": 0.8862721085231153, + "grad_norm": 0.7712353467941284, + "learning_rate": 1.2127094847526093e-05, + "loss": 0.3209, + "step": 9408 + }, + { + "epoch": 0.8863663126162832, + "grad_norm": 0.7543731331825256, + "learning_rate": 1.2125619361211517e-05, + "loss": 0.318, + "step": 9409 + }, + { + "epoch": 0.886460516709451, + "grad_norm": 0.6535468101501465, + "learning_rate": 1.2124143826429529e-05, + "loss": 0.3081, + "step": 9410 + }, + { + "epoch": 0.8865547208026189, + "grad_norm": 0.7212768197059631, + "learning_rate": 1.2122668243213779e-05, + "loss": 0.2945, + "step": 9411 + }, + { + "epoch": 0.8866489248957867, + "grad_norm": 0.9539259076118469, + "learning_rate": 1.2121192611597905e-05, + "loss": 0.3176, + "step": 9412 + }, + { + "epoch": 0.8867431289889546, + "grad_norm": 0.707371175289154, + "learning_rate": 1.2119716931615564e-05, + "loss": 0.2729, + "step": 9413 + }, + { + "epoch": 0.8868373330821224, + "grad_norm": 0.7178236842155457, + "learning_rate": 1.2118241203300398e-05, + "loss": 0.2651, + "step": 9414 + }, + { + "epoch": 0.8869315371752903, + "grad_norm": 0.7159601449966431, + "learning_rate": 1.2116765426686057e-05, + "loss": 0.2959, + "step": 9415 + }, + { + "epoch": 0.8870257412684581, + "grad_norm": 0.7148156762123108, + "learning_rate": 1.2115289601806186e-05, + "loss": 0.273, + "step": 9416 + }, + { + "epoch": 0.887119945361626, + "grad_norm": 0.6918371319770813, + "learning_rate": 1.2113813728694447e-05, + "loss": 0.2962, + "step": 9417 + }, + { + "epoch": 0.8872141494547938, + "grad_norm": 0.6912872791290283, + "learning_rate": 1.2112337807384482e-05, + "loss": 0.293, + "step": 9418 + }, + { + "epoch": 0.8873083535479617, + "grad_norm": 0.8349743485450745, + "learning_rate": 1.2110861837909948e-05, + "loss": 0.2838, + "step": 9419 + }, + { + "epoch": 0.8874025576411295, + "grad_norm": 0.6753384470939636, + "learning_rate": 1.2109385820304504e-05, + "loss": 0.2874, + "step": 9420 + }, + { + "epoch": 0.8874967617342974, + "grad_norm": 0.7217723727226257, + "learning_rate": 1.2107909754601796e-05, + "loss": 0.3223, + "step": 9421 + }, + { + "epoch": 0.8875909658274652, + "grad_norm": 0.8470667004585266, + "learning_rate": 1.2106433640835487e-05, + "loss": 0.3289, + "step": 9422 + }, + { + "epoch": 0.8876851699206331, + "grad_norm": 0.7165049314498901, + "learning_rate": 1.2104957479039237e-05, + "loss": 0.3103, + "step": 9423 + }, + { + "epoch": 0.8877793740138009, + "grad_norm": 0.7034558057785034, + "learning_rate": 1.2103481269246697e-05, + "loss": 0.3056, + "step": 9424 + }, + { + "epoch": 0.8878735781069688, + "grad_norm": 0.8210559487342834, + "learning_rate": 1.2102005011491534e-05, + "loss": 0.3153, + "step": 9425 + }, + { + "epoch": 0.8879677822001366, + "grad_norm": 0.6928260922431946, + "learning_rate": 1.2100528705807402e-05, + "loss": 0.2682, + "step": 9426 + }, + { + "epoch": 0.8880619862933045, + "grad_norm": 0.8357961177825928, + "learning_rate": 1.209905235222797e-05, + "loss": 0.3144, + "step": 9427 + }, + { + "epoch": 0.8881561903864723, + "grad_norm": 0.6881080865859985, + "learning_rate": 1.2097575950786898e-05, + "loss": 0.289, + "step": 9428 + }, + { + "epoch": 0.8882503944796402, + "grad_norm": 0.7533964514732361, + "learning_rate": 1.2096099501517849e-05, + "loss": 0.3189, + "step": 9429 + }, + { + "epoch": 0.888344598572808, + "grad_norm": 0.8324847221374512, + "learning_rate": 1.209462300445449e-05, + "loss": 0.3248, + "step": 9430 + }, + { + "epoch": 0.8884388026659759, + "grad_norm": 0.7079195380210876, + "learning_rate": 1.2093146459630488e-05, + "loss": 0.2863, + "step": 9431 + }, + { + "epoch": 0.8885330067591437, + "grad_norm": 0.7742490172386169, + "learning_rate": 1.2091669867079507e-05, + "loss": 0.3061, + "step": 9432 + }, + { + "epoch": 0.8886272108523116, + "grad_norm": 0.721615195274353, + "learning_rate": 1.209019322683522e-05, + "loss": 0.3128, + "step": 9433 + }, + { + "epoch": 0.8887214149454794, + "grad_norm": 0.7562946081161499, + "learning_rate": 1.2088716538931296e-05, + "loss": 0.2989, + "step": 9434 + }, + { + "epoch": 0.8888156190386473, + "grad_norm": 0.6650596261024475, + "learning_rate": 1.2087239803401404e-05, + "loss": 0.2734, + "step": 9435 + }, + { + "epoch": 0.8889098231318151, + "grad_norm": 0.6545528173446655, + "learning_rate": 1.2085763020279215e-05, + "loss": 0.2617, + "step": 9436 + }, + { + "epoch": 0.889004027224983, + "grad_norm": 0.752198338508606, + "learning_rate": 1.2084286189598404e-05, + "loss": 0.3128, + "step": 9437 + }, + { + "epoch": 0.8890982313181508, + "grad_norm": 0.6593581438064575, + "learning_rate": 1.2082809311392647e-05, + "loss": 0.2906, + "step": 9438 + }, + { + "epoch": 0.8891924354113186, + "grad_norm": 0.7499755620956421, + "learning_rate": 1.2081332385695612e-05, + "loss": 0.3219, + "step": 9439 + }, + { + "epoch": 0.8892866395044865, + "grad_norm": 0.6921376585960388, + "learning_rate": 1.2079855412540986e-05, + "loss": 0.3054, + "step": 9440 + }, + { + "epoch": 0.8893808435976543, + "grad_norm": 0.8174970149993896, + "learning_rate": 1.2078378391962436e-05, + "loss": 0.3361, + "step": 9441 + }, + { + "epoch": 0.8894750476908222, + "grad_norm": 0.831191897392273, + "learning_rate": 1.2076901323993644e-05, + "loss": 0.318, + "step": 9442 + }, + { + "epoch": 0.88956925178399, + "grad_norm": 0.764427125453949, + "learning_rate": 1.2075424208668291e-05, + "loss": 0.2995, + "step": 9443 + }, + { + "epoch": 0.8896634558771579, + "grad_norm": 0.7526426315307617, + "learning_rate": 1.2073947046020056e-05, + "loss": 0.3039, + "step": 9444 + }, + { + "epoch": 0.8897576599703257, + "grad_norm": 0.8434804677963257, + "learning_rate": 1.207246983608262e-05, + "loss": 0.3452, + "step": 9445 + }, + { + "epoch": 0.8898518640634936, + "grad_norm": 0.7737138867378235, + "learning_rate": 1.2070992578889668e-05, + "loss": 0.3529, + "step": 9446 + }, + { + "epoch": 0.8899460681566614, + "grad_norm": 0.7623361945152283, + "learning_rate": 1.2069515274474882e-05, + "loss": 0.3127, + "step": 9447 + }, + { + "epoch": 0.8900402722498293, + "grad_norm": 0.9720628261566162, + "learning_rate": 1.2068037922871947e-05, + "loss": 0.3202, + "step": 9448 + }, + { + "epoch": 0.8901344763429971, + "grad_norm": 0.7618057131767273, + "learning_rate": 1.206656052411455e-05, + "loss": 0.3202, + "step": 9449 + }, + { + "epoch": 0.890228680436165, + "grad_norm": 0.739187479019165, + "learning_rate": 1.2065083078236375e-05, + "loss": 0.3048, + "step": 9450 + }, + { + "epoch": 0.8903228845293328, + "grad_norm": 0.7619025111198425, + "learning_rate": 1.2063605585271114e-05, + "loss": 0.301, + "step": 9451 + }, + { + "epoch": 0.8904170886225007, + "grad_norm": 0.7123521566390991, + "learning_rate": 1.2062128045252453e-05, + "loss": 0.2904, + "step": 9452 + }, + { + "epoch": 0.8905112927156685, + "grad_norm": 0.8152437806129456, + "learning_rate": 1.2060650458214085e-05, + "loss": 0.2973, + "step": 9453 + }, + { + "epoch": 0.8906054968088364, + "grad_norm": 0.6467706561088562, + "learning_rate": 1.2059172824189698e-05, + "loss": 0.2951, + "step": 9454 + }, + { + "epoch": 0.8906997009020042, + "grad_norm": 0.7300913333892822, + "learning_rate": 1.2057695143212986e-05, + "loss": 0.3186, + "step": 9455 + }, + { + "epoch": 0.8907939049951721, + "grad_norm": 0.6898703575134277, + "learning_rate": 1.2056217415317643e-05, + "loss": 0.3181, + "step": 9456 + }, + { + "epoch": 0.8908881090883399, + "grad_norm": 0.8182839155197144, + "learning_rate": 1.2054739640537363e-05, + "loss": 0.3176, + "step": 9457 + }, + { + "epoch": 0.8909823131815078, + "grad_norm": 0.8536497354507446, + "learning_rate": 1.2053261818905843e-05, + "loss": 0.3706, + "step": 9458 + }, + { + "epoch": 0.8910765172746756, + "grad_norm": 0.7779721617698669, + "learning_rate": 1.2051783950456775e-05, + "loss": 0.3076, + "step": 9459 + }, + { + "epoch": 0.8911707213678435, + "grad_norm": 0.6434833407402039, + "learning_rate": 1.2050306035223864e-05, + "loss": 0.2862, + "step": 9460 + }, + { + "epoch": 0.8912649254610113, + "grad_norm": 0.662363588809967, + "learning_rate": 1.20488280732408e-05, + "loss": 0.2584, + "step": 9461 + }, + { + "epoch": 0.8913591295541792, + "grad_norm": 0.8131331205368042, + "learning_rate": 1.204735006454129e-05, + "loss": 0.3348, + "step": 9462 + }, + { + "epoch": 0.891453333647347, + "grad_norm": 0.7505772113800049, + "learning_rate": 1.2045872009159033e-05, + "loss": 0.2982, + "step": 9463 + }, + { + "epoch": 0.8915475377405149, + "grad_norm": 0.7602881789207458, + "learning_rate": 1.2044393907127728e-05, + "loss": 0.3084, + "step": 9464 + }, + { + "epoch": 0.8916417418336827, + "grad_norm": 0.7289739847183228, + "learning_rate": 1.204291575848108e-05, + "loss": 0.2774, + "step": 9465 + }, + { + "epoch": 0.8917359459268506, + "grad_norm": 0.683515727519989, + "learning_rate": 1.2041437563252794e-05, + "loss": 0.2551, + "step": 9466 + }, + { + "epoch": 0.8918301500200184, + "grad_norm": 0.7214856147766113, + "learning_rate": 1.2039959321476574e-05, + "loss": 0.3255, + "step": 9467 + }, + { + "epoch": 0.8919243541131863, + "grad_norm": 0.8039005994796753, + "learning_rate": 1.2038481033186127e-05, + "loss": 0.3097, + "step": 9468 + }, + { + "epoch": 0.8920185582063541, + "grad_norm": 0.7016351222991943, + "learning_rate": 1.2037002698415161e-05, + "loss": 0.3161, + "step": 9469 + }, + { + "epoch": 0.892112762299522, + "grad_norm": 0.8027306199073792, + "learning_rate": 1.2035524317197382e-05, + "loss": 0.3336, + "step": 9470 + }, + { + "epoch": 0.8922069663926898, + "grad_norm": 0.7177056670188904, + "learning_rate": 1.2034045889566502e-05, + "loss": 0.2785, + "step": 9471 + }, + { + "epoch": 0.8923011704858577, + "grad_norm": 0.879801869392395, + "learning_rate": 1.2032567415556226e-05, + "loss": 0.3042, + "step": 9472 + }, + { + "epoch": 0.8923953745790255, + "grad_norm": 0.8894309401512146, + "learning_rate": 1.2031088895200273e-05, + "loss": 0.3238, + "step": 9473 + }, + { + "epoch": 0.8924895786721934, + "grad_norm": 0.8481342196464539, + "learning_rate": 1.2029610328532354e-05, + "loss": 0.2825, + "step": 9474 + }, + { + "epoch": 0.8925837827653611, + "grad_norm": 0.9710609912872314, + "learning_rate": 1.2028131715586177e-05, + "loss": 0.3322, + "step": 9475 + }, + { + "epoch": 0.892677986858529, + "grad_norm": 0.7773035764694214, + "learning_rate": 1.2026653056395461e-05, + "loss": 0.3022, + "step": 9476 + }, + { + "epoch": 0.8927721909516968, + "grad_norm": 0.7003985643386841, + "learning_rate": 1.2025174350993923e-05, + "loss": 0.2831, + "step": 9477 + }, + { + "epoch": 0.8928663950448646, + "grad_norm": 0.6839850544929504, + "learning_rate": 1.2023695599415275e-05, + "loss": 0.2878, + "step": 9478 + }, + { + "epoch": 0.8929605991380325, + "grad_norm": 1.114830732345581, + "learning_rate": 1.2022216801693239e-05, + "loss": 0.3468, + "step": 9479 + }, + { + "epoch": 0.8930548032312003, + "grad_norm": 0.7672768235206604, + "learning_rate": 1.2020737957861534e-05, + "loss": 0.3178, + "step": 9480 + }, + { + "epoch": 0.8931490073243682, + "grad_norm": 0.888733983039856, + "learning_rate": 1.2019259067953875e-05, + "loss": 0.3392, + "step": 9481 + }, + { + "epoch": 0.893243211417536, + "grad_norm": 0.6489064693450928, + "learning_rate": 1.2017780132003989e-05, + "loss": 0.292, + "step": 9482 + }, + { + "epoch": 0.8933374155107039, + "grad_norm": 0.8030021786689758, + "learning_rate": 1.2016301150045595e-05, + "loss": 0.2649, + "step": 9483 + }, + { + "epoch": 0.8934316196038717, + "grad_norm": 0.7990246415138245, + "learning_rate": 1.2014822122112416e-05, + "loss": 0.2964, + "step": 9484 + }, + { + "epoch": 0.8935258236970396, + "grad_norm": 1.6609044075012207, + "learning_rate": 1.2013343048238176e-05, + "loss": 0.3307, + "step": 9485 + }, + { + "epoch": 0.8936200277902074, + "grad_norm": 0.7960792779922485, + "learning_rate": 1.2011863928456601e-05, + "loss": 0.28, + "step": 9486 + }, + { + "epoch": 0.8937142318833753, + "grad_norm": 0.7090703845024109, + "learning_rate": 1.2010384762801417e-05, + "loss": 0.2901, + "step": 9487 + }, + { + "epoch": 0.8938084359765431, + "grad_norm": 0.7948774099349976, + "learning_rate": 1.2008905551306356e-05, + "loss": 0.3213, + "step": 9488 + }, + { + "epoch": 0.893902640069711, + "grad_norm": 0.796562135219574, + "learning_rate": 1.2007426294005135e-05, + "loss": 0.2879, + "step": 9489 + }, + { + "epoch": 0.8939968441628788, + "grad_norm": 1.003172516822815, + "learning_rate": 1.2005946990931492e-05, + "loss": 0.304, + "step": 9490 + }, + { + "epoch": 0.8940910482560467, + "grad_norm": 0.7436158061027527, + "learning_rate": 1.2004467642119158e-05, + "loss": 0.3429, + "step": 9491 + }, + { + "epoch": 0.8941852523492145, + "grad_norm": 0.8177036046981812, + "learning_rate": 1.2002988247601856e-05, + "loss": 0.3, + "step": 9492 + }, + { + "epoch": 0.8942794564423824, + "grad_norm": 0.724937915802002, + "learning_rate": 1.2001508807413329e-05, + "loss": 0.2238, + "step": 9493 + }, + { + "epoch": 0.8943736605355502, + "grad_norm": 0.8515589237213135, + "learning_rate": 1.2000029321587305e-05, + "loss": 0.3237, + "step": 9494 + }, + { + "epoch": 0.8944678646287181, + "grad_norm": 0.7247291207313538, + "learning_rate": 1.199854979015752e-05, + "loss": 0.2979, + "step": 9495 + }, + { + "epoch": 0.8945620687218859, + "grad_norm": 1.1423425674438477, + "learning_rate": 1.1997070213157707e-05, + "loss": 0.2829, + "step": 9496 + }, + { + "epoch": 0.8946562728150538, + "grad_norm": 0.6843664646148682, + "learning_rate": 1.1995590590621607e-05, + "loss": 0.3035, + "step": 9497 + }, + { + "epoch": 0.8947504769082216, + "grad_norm": 3.6836884021759033, + "learning_rate": 1.1994110922582953e-05, + "loss": 0.308, + "step": 9498 + }, + { + "epoch": 0.8948446810013895, + "grad_norm": 0.6240441203117371, + "learning_rate": 1.1992631209075484e-05, + "loss": 0.2731, + "step": 9499 + }, + { + "epoch": 0.8949388850945573, + "grad_norm": 0.7034563422203064, + "learning_rate": 1.199115145013295e-05, + "loss": 0.3324, + "step": 9500 + }, + { + "epoch": 0.8950330891877252, + "grad_norm": 1.343197226524353, + "learning_rate": 1.1989671645789077e-05, + "loss": 0.3416, + "step": 9501 + }, + { + "epoch": 0.895127293280893, + "grad_norm": 0.759353518486023, + "learning_rate": 1.1988191796077615e-05, + "loss": 0.3248, + "step": 9502 + }, + { + "epoch": 0.8952214973740609, + "grad_norm": 0.6849160194396973, + "learning_rate": 1.1986711901032304e-05, + "loss": 0.3282, + "step": 9503 + }, + { + "epoch": 0.8953157014672287, + "grad_norm": 0.6803606152534485, + "learning_rate": 1.198523196068689e-05, + "loss": 0.3005, + "step": 9504 + }, + { + "epoch": 0.8954099055603966, + "grad_norm": 0.7187288999557495, + "learning_rate": 1.1983751975075118e-05, + "loss": 0.2735, + "step": 9505 + }, + { + "epoch": 0.8955041096535644, + "grad_norm": 0.716555655002594, + "learning_rate": 1.198227194423073e-05, + "loss": 0.3009, + "step": 9506 + }, + { + "epoch": 0.8955983137467323, + "grad_norm": 0.7791658639907837, + "learning_rate": 1.1980791868187477e-05, + "loss": 0.3438, + "step": 9507 + }, + { + "epoch": 0.8956925178399001, + "grad_norm": 0.6820909380912781, + "learning_rate": 1.197931174697911e-05, + "loss": 0.3183, + "step": 9508 + }, + { + "epoch": 0.895786721933068, + "grad_norm": 0.7278851866722107, + "learning_rate": 1.197783158063937e-05, + "loss": 0.3451, + "step": 9509 + }, + { + "epoch": 0.8958809260262358, + "grad_norm": 0.9999712705612183, + "learning_rate": 1.1976351369202013e-05, + "loss": 0.312, + "step": 9510 + }, + { + "epoch": 0.8959751301194037, + "grad_norm": 0.8277131915092468, + "learning_rate": 1.1974871112700788e-05, + "loss": 0.2992, + "step": 9511 + }, + { + "epoch": 0.8960693342125715, + "grad_norm": 0.7437383532524109, + "learning_rate": 1.1973390811169447e-05, + "loss": 0.2881, + "step": 9512 + }, + { + "epoch": 0.8961635383057394, + "grad_norm": 0.7816392183303833, + "learning_rate": 1.1971910464641745e-05, + "loss": 0.3304, + "step": 9513 + }, + { + "epoch": 0.8962577423989072, + "grad_norm": 0.7216968536376953, + "learning_rate": 1.1970430073151434e-05, + "loss": 0.3244, + "step": 9514 + }, + { + "epoch": 0.8963519464920751, + "grad_norm": 0.9721806645393372, + "learning_rate": 1.196894963673227e-05, + "loss": 0.2979, + "step": 9515 + }, + { + "epoch": 0.8964461505852429, + "grad_norm": 0.6242602467536926, + "learning_rate": 1.1967469155418005e-05, + "loss": 0.284, + "step": 9516 + }, + { + "epoch": 0.8965403546784108, + "grad_norm": 0.762058675289154, + "learning_rate": 1.1965988629242407e-05, + "loss": 0.3289, + "step": 9517 + }, + { + "epoch": 0.8966345587715786, + "grad_norm": 0.7527956366539001, + "learning_rate": 1.1964508058239226e-05, + "loss": 0.3258, + "step": 9518 + }, + { + "epoch": 0.8967287628647465, + "grad_norm": 0.7868747115135193, + "learning_rate": 1.1963027442442221e-05, + "loss": 0.3116, + "step": 9519 + }, + { + "epoch": 0.8968229669579143, + "grad_norm": 1.0974806547164917, + "learning_rate": 1.1961546781885156e-05, + "loss": 0.3058, + "step": 9520 + }, + { + "epoch": 0.8969171710510822, + "grad_norm": 0.7916780114173889, + "learning_rate": 1.196006607660179e-05, + "loss": 0.291, + "step": 9521 + }, + { + "epoch": 0.89701137514425, + "grad_norm": 0.8335264921188354, + "learning_rate": 1.1958585326625886e-05, + "loss": 0.3464, + "step": 9522 + }, + { + "epoch": 0.8971055792374178, + "grad_norm": 0.6414207220077515, + "learning_rate": 1.195710453199121e-05, + "loss": 0.3112, + "step": 9523 + }, + { + "epoch": 0.8971997833305857, + "grad_norm": 2.4155995845794678, + "learning_rate": 1.195562369273152e-05, + "loss": 0.2633, + "step": 9524 + }, + { + "epoch": 0.8972939874237535, + "grad_norm": 0.6785614490509033, + "learning_rate": 1.1954142808880589e-05, + "loss": 0.3065, + "step": 9525 + }, + { + "epoch": 0.8973881915169214, + "grad_norm": 0.7296954989433289, + "learning_rate": 1.1952661880472178e-05, + "loss": 0.2965, + "step": 9526 + }, + { + "epoch": 0.8974823956100892, + "grad_norm": 0.7682005763053894, + "learning_rate": 1.1951180907540057e-05, + "loss": 0.3251, + "step": 9527 + }, + { + "epoch": 0.8975765997032571, + "grad_norm": 0.7406534552574158, + "learning_rate": 1.1949699890117994e-05, + "loss": 0.2952, + "step": 9528 + }, + { + "epoch": 0.8976708037964249, + "grad_norm": 0.660403847694397, + "learning_rate": 1.1948218828239757e-05, + "loss": 0.3052, + "step": 9529 + }, + { + "epoch": 0.8977650078895928, + "grad_norm": 0.7532671093940735, + "learning_rate": 1.1946737721939118e-05, + "loss": 0.2863, + "step": 9530 + }, + { + "epoch": 0.8978592119827606, + "grad_norm": 0.6479833126068115, + "learning_rate": 1.194525657124985e-05, + "loss": 0.274, + "step": 9531 + }, + { + "epoch": 0.8979534160759285, + "grad_norm": 0.9556598663330078, + "learning_rate": 1.194377537620572e-05, + "loss": 0.346, + "step": 9532 + }, + { + "epoch": 0.8980476201690963, + "grad_norm": 0.6388131976127625, + "learning_rate": 1.1942294136840508e-05, + "loss": 0.2835, + "step": 9533 + }, + { + "epoch": 0.8981418242622642, + "grad_norm": 0.7453655004501343, + "learning_rate": 1.1940812853187987e-05, + "loss": 0.3264, + "step": 9534 + }, + { + "epoch": 0.898236028355432, + "grad_norm": 3.139777898788452, + "learning_rate": 1.193933152528193e-05, + "loss": 0.3175, + "step": 9535 + }, + { + "epoch": 0.8983302324485999, + "grad_norm": 0.7302778959274292, + "learning_rate": 1.1937850153156115e-05, + "loss": 0.3122, + "step": 9536 + }, + { + "epoch": 0.8984244365417677, + "grad_norm": 0.7076626420021057, + "learning_rate": 1.1936368736844319e-05, + "loss": 0.322, + "step": 9537 + }, + { + "epoch": 0.8985186406349356, + "grad_norm": 0.7167888283729553, + "learning_rate": 1.1934887276380323e-05, + "loss": 0.3075, + "step": 9538 + }, + { + "epoch": 0.8986128447281034, + "grad_norm": 0.6941359639167786, + "learning_rate": 1.19334057717979e-05, + "loss": 0.2818, + "step": 9539 + }, + { + "epoch": 0.8987070488212713, + "grad_norm": 0.6347949504852295, + "learning_rate": 1.1931924223130842e-05, + "loss": 0.2934, + "step": 9540 + }, + { + "epoch": 0.8988012529144391, + "grad_norm": 0.7852635979652405, + "learning_rate": 1.193044263041292e-05, + "loss": 0.3103, + "step": 9541 + }, + { + "epoch": 0.898895457007607, + "grad_norm": 0.8677423000335693, + "learning_rate": 1.1928960993677921e-05, + "loss": 0.3197, + "step": 9542 + }, + { + "epoch": 0.8989896611007748, + "grad_norm": 0.7845224142074585, + "learning_rate": 1.1927479312959629e-05, + "loss": 0.3129, + "step": 9543 + }, + { + "epoch": 0.8990838651939427, + "grad_norm": 0.757025420665741, + "learning_rate": 1.1925997588291827e-05, + "loss": 0.3196, + "step": 9544 + }, + { + "epoch": 0.8991780692871105, + "grad_norm": 0.6933902502059937, + "learning_rate": 1.19245158197083e-05, + "loss": 0.3045, + "step": 9545 + }, + { + "epoch": 0.8992722733802784, + "grad_norm": 0.6712954044342041, + "learning_rate": 1.192303400724284e-05, + "loss": 0.3038, + "step": 9546 + }, + { + "epoch": 0.8993664774734462, + "grad_norm": 0.7381793856620789, + "learning_rate": 1.1921552150929225e-05, + "loss": 0.3107, + "step": 9547 + }, + { + "epoch": 0.8994606815666141, + "grad_norm": 0.6141394972801208, + "learning_rate": 1.1920070250801254e-05, + "loss": 0.3125, + "step": 9548 + }, + { + "epoch": 0.8995548856597819, + "grad_norm": 0.7098128795623779, + "learning_rate": 1.1918588306892709e-05, + "loss": 0.3016, + "step": 9549 + }, + { + "epoch": 0.8996490897529498, + "grad_norm": 0.6733649373054504, + "learning_rate": 1.1917106319237386e-05, + "loss": 0.3008, + "step": 9550 + }, + { + "epoch": 0.8997432938461176, + "grad_norm": 1.0957714319229126, + "learning_rate": 1.1915624287869072e-05, + "loss": 0.2848, + "step": 9551 + }, + { + "epoch": 0.8998374979392855, + "grad_norm": 0.8120837211608887, + "learning_rate": 1.1914142212821563e-05, + "loss": 0.3224, + "step": 9552 + }, + { + "epoch": 0.8999317020324533, + "grad_norm": 1.230957269668579, + "learning_rate": 1.191266009412865e-05, + "loss": 0.3266, + "step": 9553 + }, + { + "epoch": 0.9000259061256212, + "grad_norm": 0.6974923014640808, + "learning_rate": 1.191117793182413e-05, + "loss": 0.2877, + "step": 9554 + }, + { + "epoch": 0.900120110218789, + "grad_norm": 0.7688418030738831, + "learning_rate": 1.1909695725941797e-05, + "loss": 0.3275, + "step": 9555 + }, + { + "epoch": 0.9002143143119569, + "grad_norm": 0.9805521368980408, + "learning_rate": 1.1908213476515447e-05, + "loss": 0.3278, + "step": 9556 + }, + { + "epoch": 0.9003085184051247, + "grad_norm": 0.727298378944397, + "learning_rate": 1.190673118357888e-05, + "loss": 0.2928, + "step": 9557 + }, + { + "epoch": 0.9004027224982926, + "grad_norm": 0.8055543303489685, + "learning_rate": 1.1905248847165893e-05, + "loss": 0.337, + "step": 9558 + }, + { + "epoch": 0.9004969265914604, + "grad_norm": 0.776944637298584, + "learning_rate": 1.1903766467310288e-05, + "loss": 0.3403, + "step": 9559 + }, + { + "epoch": 0.9005911306846283, + "grad_norm": 0.7132166028022766, + "learning_rate": 1.190228404404586e-05, + "loss": 0.3153, + "step": 9560 + }, + { + "epoch": 0.9006853347777961, + "grad_norm": 0.7273151278495789, + "learning_rate": 1.1900801577406413e-05, + "loss": 0.3289, + "step": 9561 + }, + { + "epoch": 0.900779538870964, + "grad_norm": 0.6589033007621765, + "learning_rate": 1.1899319067425752e-05, + "loss": 0.2663, + "step": 9562 + }, + { + "epoch": 0.9008737429641318, + "grad_norm": 0.8022077083587646, + "learning_rate": 1.189783651413768e-05, + "loss": 0.3546, + "step": 9563 + }, + { + "epoch": 0.9009679470572997, + "grad_norm": 0.73952317237854, + "learning_rate": 1.1896353917575997e-05, + "loss": 0.327, + "step": 9564 + }, + { + "epoch": 0.9010621511504675, + "grad_norm": 0.8198251128196716, + "learning_rate": 1.1894871277774515e-05, + "loss": 0.3466, + "step": 9565 + }, + { + "epoch": 0.9011563552436354, + "grad_norm": 0.743465781211853, + "learning_rate": 1.189338859476704e-05, + "loss": 0.3166, + "step": 9566 + }, + { + "epoch": 0.9012505593368032, + "grad_norm": 0.6905034780502319, + "learning_rate": 1.189190586858737e-05, + "loss": 0.2999, + "step": 9567 + }, + { + "epoch": 0.901344763429971, + "grad_norm": 1.2066441774368286, + "learning_rate": 1.1890423099269327e-05, + "loss": 0.3278, + "step": 9568 + }, + { + "epoch": 0.9014389675231389, + "grad_norm": 0.717666506767273, + "learning_rate": 1.1888940286846708e-05, + "loss": 0.3042, + "step": 9569 + }, + { + "epoch": 0.9015331716163067, + "grad_norm": 1.3240638971328735, + "learning_rate": 1.1887457431353333e-05, + "loss": 0.3384, + "step": 9570 + }, + { + "epoch": 0.9016273757094746, + "grad_norm": 0.6661431789398193, + "learning_rate": 1.188597453282301e-05, + "loss": 0.3076, + "step": 9571 + }, + { + "epoch": 0.9017215798026424, + "grad_norm": 0.7865625619888306, + "learning_rate": 1.1884491591289546e-05, + "loss": 0.3067, + "step": 9572 + }, + { + "epoch": 0.9018157838958103, + "grad_norm": 0.7785285115242004, + "learning_rate": 1.1883008606786763e-05, + "loss": 0.269, + "step": 9573 + }, + { + "epoch": 0.9019099879889781, + "grad_norm": 0.8562577962875366, + "learning_rate": 1.1881525579348474e-05, + "loss": 0.3158, + "step": 9574 + }, + { + "epoch": 0.902004192082146, + "grad_norm": 0.712498664855957, + "learning_rate": 1.188004250900849e-05, + "loss": 0.3268, + "step": 9575 + }, + { + "epoch": 0.9020983961753138, + "grad_norm": 0.6795501708984375, + "learning_rate": 1.1878559395800627e-05, + "loss": 0.3064, + "step": 9576 + }, + { + "epoch": 0.9021926002684817, + "grad_norm": 0.7840029001235962, + "learning_rate": 1.1877076239758704e-05, + "loss": 0.3238, + "step": 9577 + }, + { + "epoch": 0.9022868043616495, + "grad_norm": 0.7358947396278381, + "learning_rate": 1.1875593040916544e-05, + "loss": 0.2751, + "step": 9578 + }, + { + "epoch": 0.9023810084548174, + "grad_norm": 0.7333950400352478, + "learning_rate": 1.187410979930796e-05, + "loss": 0.3044, + "step": 9579 + }, + { + "epoch": 0.9024752125479852, + "grad_norm": 0.6706371307373047, + "learning_rate": 1.1872626514966774e-05, + "loss": 0.2949, + "step": 9580 + }, + { + "epoch": 0.9025694166411531, + "grad_norm": 0.6611641645431519, + "learning_rate": 1.1871143187926805e-05, + "loss": 0.2739, + "step": 9581 + }, + { + "epoch": 0.9026636207343209, + "grad_norm": 0.7098549604415894, + "learning_rate": 1.1869659818221881e-05, + "loss": 0.3084, + "step": 9582 + }, + { + "epoch": 0.9027578248274888, + "grad_norm": 0.6501350402832031, + "learning_rate": 1.1868176405885824e-05, + "loss": 0.2853, + "step": 9583 + }, + { + "epoch": 0.9028520289206566, + "grad_norm": 0.6925662755966187, + "learning_rate": 1.1866692950952447e-05, + "loss": 0.2668, + "step": 9584 + }, + { + "epoch": 0.9029462330138245, + "grad_norm": 0.6863833665847778, + "learning_rate": 1.1865209453455593e-05, + "loss": 0.2608, + "step": 9585 + }, + { + "epoch": 0.9030404371069923, + "grad_norm": 0.7631823420524597, + "learning_rate": 1.1863725913429075e-05, + "loss": 0.3226, + "step": 9586 + }, + { + "epoch": 0.9031346412001602, + "grad_norm": 0.821094810962677, + "learning_rate": 1.1862242330906722e-05, + "loss": 0.2749, + "step": 9587 + }, + { + "epoch": 0.903228845293328, + "grad_norm": 0.7188436985015869, + "learning_rate": 1.1860758705922368e-05, + "loss": 0.2917, + "step": 9588 + }, + { + "epoch": 0.9033230493864959, + "grad_norm": 0.6913095116615295, + "learning_rate": 1.1859275038509833e-05, + "loss": 0.2685, + "step": 9589 + }, + { + "epoch": 0.9034172534796637, + "grad_norm": 0.7232502698898315, + "learning_rate": 1.1857791328702955e-05, + "loss": 0.3107, + "step": 9590 + }, + { + "epoch": 0.9035114575728316, + "grad_norm": 0.669734537601471, + "learning_rate": 1.1856307576535562e-05, + "loss": 0.2923, + "step": 9591 + }, + { + "epoch": 0.9036056616659994, + "grad_norm": 0.6539066433906555, + "learning_rate": 1.1854823782041483e-05, + "loss": 0.2882, + "step": 9592 + }, + { + "epoch": 0.9036998657591673, + "grad_norm": 0.6612775921821594, + "learning_rate": 1.1853339945254555e-05, + "loss": 0.3008, + "step": 9593 + }, + { + "epoch": 0.9037940698523351, + "grad_norm": 0.8304771184921265, + "learning_rate": 1.1851856066208609e-05, + "loss": 0.3396, + "step": 9594 + }, + { + "epoch": 0.903888273945503, + "grad_norm": 0.7078601121902466, + "learning_rate": 1.185037214493748e-05, + "loss": 0.2715, + "step": 9595 + }, + { + "epoch": 0.9039824780386708, + "grad_norm": 0.8279368281364441, + "learning_rate": 1.1848888181475005e-05, + "loss": 0.3121, + "step": 9596 + }, + { + "epoch": 0.9040766821318387, + "grad_norm": 0.9535421133041382, + "learning_rate": 1.1847404175855021e-05, + "loss": 0.2851, + "step": 9597 + }, + { + "epoch": 0.9041708862250065, + "grad_norm": 0.732578694820404, + "learning_rate": 1.1845920128111366e-05, + "loss": 0.309, + "step": 9598 + }, + { + "epoch": 0.9042650903181744, + "grad_norm": 0.7540335655212402, + "learning_rate": 1.1844436038277876e-05, + "loss": 0.2867, + "step": 9599 + }, + { + "epoch": 0.9043592944113422, + "grad_norm": 0.8256245851516724, + "learning_rate": 1.184295190638839e-05, + "loss": 0.3496, + "step": 9600 + }, + { + "epoch": 0.9044534985045101, + "grad_norm": 1.0604828596115112, + "learning_rate": 1.1841467732476752e-05, + "loss": 0.3124, + "step": 9601 + }, + { + "epoch": 0.9045477025976779, + "grad_norm": 0.7529759407043457, + "learning_rate": 1.1839983516576802e-05, + "loss": 0.3051, + "step": 9602 + }, + { + "epoch": 0.9046419066908458, + "grad_norm": 0.7639031410217285, + "learning_rate": 1.1838499258722383e-05, + "loss": 0.3123, + "step": 9603 + }, + { + "epoch": 0.9047361107840136, + "grad_norm": 0.6991181969642639, + "learning_rate": 1.1837014958947335e-05, + "loss": 0.3201, + "step": 9604 + }, + { + "epoch": 0.9048303148771815, + "grad_norm": 0.6521792411804199, + "learning_rate": 1.1835530617285509e-05, + "loss": 0.2865, + "step": 9605 + }, + { + "epoch": 0.9049245189703493, + "grad_norm": 0.8181825876235962, + "learning_rate": 1.1834046233770742e-05, + "loss": 0.2623, + "step": 9606 + }, + { + "epoch": 0.9050187230635172, + "grad_norm": 0.7247796654701233, + "learning_rate": 1.1832561808436887e-05, + "loss": 0.3361, + "step": 9607 + }, + { + "epoch": 0.905112927156685, + "grad_norm": 0.6888623237609863, + "learning_rate": 1.1831077341317789e-05, + "loss": 0.2539, + "step": 9608 + }, + { + "epoch": 0.9052071312498529, + "grad_norm": 0.6608344316482544, + "learning_rate": 1.1829592832447295e-05, + "loss": 0.2463, + "step": 9609 + }, + { + "epoch": 0.9053013353430207, + "grad_norm": 0.785529375076294, + "learning_rate": 1.1828108281859252e-05, + "loss": 0.3544, + "step": 9610 + }, + { + "epoch": 0.9053955394361886, + "grad_norm": 0.7120524048805237, + "learning_rate": 1.1826623689587518e-05, + "loss": 0.308, + "step": 9611 + }, + { + "epoch": 0.9054897435293564, + "grad_norm": 0.7383785843849182, + "learning_rate": 1.1825139055665934e-05, + "loss": 0.2889, + "step": 9612 + }, + { + "epoch": 0.9055839476225241, + "grad_norm": 0.6540294885635376, + "learning_rate": 1.182365438012836e-05, + "loss": 0.3068, + "step": 9613 + }, + { + "epoch": 0.905678151715692, + "grad_norm": 0.6447115540504456, + "learning_rate": 1.1822169663008646e-05, + "loss": 0.2977, + "step": 9614 + }, + { + "epoch": 0.9057723558088598, + "grad_norm": 0.6795375347137451, + "learning_rate": 1.1820684904340645e-05, + "loss": 0.2884, + "step": 9615 + }, + { + "epoch": 0.9058665599020277, + "grad_norm": 0.7206788659095764, + "learning_rate": 1.181920010415821e-05, + "loss": 0.2875, + "step": 9616 + }, + { + "epoch": 0.9059607639951955, + "grad_norm": 0.7569628953933716, + "learning_rate": 1.1817715262495202e-05, + "loss": 0.3527, + "step": 9617 + }, + { + "epoch": 0.9060549680883634, + "grad_norm": 0.6905651688575745, + "learning_rate": 1.1816230379385475e-05, + "loss": 0.2894, + "step": 9618 + }, + { + "epoch": 0.9061491721815312, + "grad_norm": 0.7624897360801697, + "learning_rate": 1.1814745454862887e-05, + "loss": 0.3582, + "step": 9619 + }, + { + "epoch": 0.9062433762746991, + "grad_norm": 0.7739545106887817, + "learning_rate": 1.1813260488961295e-05, + "loss": 0.3259, + "step": 9620 + }, + { + "epoch": 0.9063375803678669, + "grad_norm": 0.6178655624389648, + "learning_rate": 1.1811775481714558e-05, + "loss": 0.3016, + "step": 9621 + }, + { + "epoch": 0.9064317844610348, + "grad_norm": 0.6559436917304993, + "learning_rate": 1.1810290433156539e-05, + "loss": 0.3, + "step": 9622 + }, + { + "epoch": 0.9065259885542026, + "grad_norm": 0.7467542290687561, + "learning_rate": 1.1808805343321102e-05, + "loss": 0.3143, + "step": 9623 + }, + { + "epoch": 0.9066201926473705, + "grad_norm": 0.7773847579956055, + "learning_rate": 1.1807320212242099e-05, + "loss": 0.3232, + "step": 9624 + }, + { + "epoch": 0.9067143967405383, + "grad_norm": 0.8095197677612305, + "learning_rate": 1.1805835039953408e-05, + "loss": 0.3258, + "step": 9625 + }, + { + "epoch": 0.9068086008337062, + "grad_norm": 0.6602053642272949, + "learning_rate": 1.1804349826488879e-05, + "loss": 0.2945, + "step": 9626 + }, + { + "epoch": 0.906902804926874, + "grad_norm": 0.6551075577735901, + "learning_rate": 1.1802864571882383e-05, + "loss": 0.2817, + "step": 9627 + }, + { + "epoch": 0.9069970090200419, + "grad_norm": 0.706739068031311, + "learning_rate": 1.1801379276167792e-05, + "loss": 0.3067, + "step": 9628 + }, + { + "epoch": 0.9070912131132097, + "grad_norm": 0.6379954814910889, + "learning_rate": 1.1799893939378964e-05, + "loss": 0.2902, + "step": 9629 + }, + { + "epoch": 0.9071854172063776, + "grad_norm": 0.7236230373382568, + "learning_rate": 1.1798408561549773e-05, + "loss": 0.3103, + "step": 9630 + }, + { + "epoch": 0.9072796212995454, + "grad_norm": 0.6442583203315735, + "learning_rate": 1.1796923142714083e-05, + "loss": 0.2837, + "step": 9631 + }, + { + "epoch": 0.9073738253927133, + "grad_norm": 0.7341998815536499, + "learning_rate": 1.1795437682905765e-05, + "loss": 0.3241, + "step": 9632 + }, + { + "epoch": 0.9074680294858811, + "grad_norm": 0.8652843832969666, + "learning_rate": 1.1793952182158694e-05, + "loss": 0.3223, + "step": 9633 + }, + { + "epoch": 0.907562233579049, + "grad_norm": 0.7407996654510498, + "learning_rate": 1.1792466640506741e-05, + "loss": 0.2794, + "step": 9634 + }, + { + "epoch": 0.9076564376722168, + "grad_norm": 0.6820136308670044, + "learning_rate": 1.1790981057983772e-05, + "loss": 0.2754, + "step": 9635 + }, + { + "epoch": 0.9077506417653847, + "grad_norm": 0.7229933738708496, + "learning_rate": 1.1789495434623665e-05, + "loss": 0.3104, + "step": 9636 + }, + { + "epoch": 0.9078448458585525, + "grad_norm": 0.7318028211593628, + "learning_rate": 1.1788009770460297e-05, + "loss": 0.3201, + "step": 9637 + }, + { + "epoch": 0.9079390499517204, + "grad_norm": 0.7525824904441833, + "learning_rate": 1.1786524065527543e-05, + "loss": 0.2861, + "step": 9638 + }, + { + "epoch": 0.9080332540448882, + "grad_norm": 0.6937665343284607, + "learning_rate": 1.1785038319859274e-05, + "loss": 0.2864, + "step": 9639 + }, + { + "epoch": 0.9081274581380561, + "grad_norm": 0.6146497130393982, + "learning_rate": 1.1783552533489372e-05, + "loss": 0.3043, + "step": 9640 + }, + { + "epoch": 0.9082216622312239, + "grad_norm": 0.960955023765564, + "learning_rate": 1.1782066706451713e-05, + "loss": 0.3027, + "step": 9641 + }, + { + "epoch": 0.9083158663243918, + "grad_norm": 0.8236584663391113, + "learning_rate": 1.1780580838780177e-05, + "loss": 0.2882, + "step": 9642 + }, + { + "epoch": 0.9084100704175596, + "grad_norm": 0.8055413365364075, + "learning_rate": 1.1779094930508646e-05, + "loss": 0.3456, + "step": 9643 + }, + { + "epoch": 0.9085042745107275, + "grad_norm": 0.6914135813713074, + "learning_rate": 1.1777608981670997e-05, + "loss": 0.287, + "step": 9644 + }, + { + "epoch": 0.9085984786038953, + "grad_norm": 0.668573260307312, + "learning_rate": 1.1776122992301118e-05, + "loss": 0.3079, + "step": 9645 + }, + { + "epoch": 0.9086926826970632, + "grad_norm": 0.7232000231742859, + "learning_rate": 1.1774636962432881e-05, + "loss": 0.3582, + "step": 9646 + }, + { + "epoch": 0.908786886790231, + "grad_norm": 0.6752644181251526, + "learning_rate": 1.177315089210018e-05, + "loss": 0.3008, + "step": 9647 + }, + { + "epoch": 0.9088810908833989, + "grad_norm": 0.6755600571632385, + "learning_rate": 1.17716647813369e-05, + "loss": 0.2911, + "step": 9648 + }, + { + "epoch": 0.9089752949765667, + "grad_norm": 0.7653055191040039, + "learning_rate": 1.1770178630176918e-05, + "loss": 0.3149, + "step": 9649 + }, + { + "epoch": 0.9090694990697346, + "grad_norm": 0.7899324893951416, + "learning_rate": 1.1768692438654128e-05, + "loss": 0.3856, + "step": 9650 + }, + { + "epoch": 0.9091637031629024, + "grad_norm": 0.6736056208610535, + "learning_rate": 1.1767206206802416e-05, + "loss": 0.3078, + "step": 9651 + }, + { + "epoch": 0.9092579072560703, + "grad_norm": 0.6566187143325806, + "learning_rate": 1.1765719934655667e-05, + "loss": 0.3031, + "step": 9652 + }, + { + "epoch": 0.9093521113492381, + "grad_norm": 0.7438850402832031, + "learning_rate": 1.1764233622247774e-05, + "loss": 0.3434, + "step": 9653 + }, + { + "epoch": 0.909446315442406, + "grad_norm": 0.7408274412155151, + "learning_rate": 1.1762747269612627e-05, + "loss": 0.3291, + "step": 9654 + }, + { + "epoch": 0.9095405195355738, + "grad_norm": 0.7000565528869629, + "learning_rate": 1.1761260876784115e-05, + "loss": 0.3327, + "step": 9655 + }, + { + "epoch": 0.9096347236287416, + "grad_norm": 0.8495182394981384, + "learning_rate": 1.175977444379613e-05, + "loss": 0.3426, + "step": 9656 + }, + { + "epoch": 0.9097289277219095, + "grad_norm": 0.7792713046073914, + "learning_rate": 1.1758287970682566e-05, + "loss": 0.2978, + "step": 9657 + }, + { + "epoch": 0.9098231318150773, + "grad_norm": 0.6760193109512329, + "learning_rate": 1.1756801457477321e-05, + "loss": 0.2846, + "step": 9658 + }, + { + "epoch": 0.9099173359082452, + "grad_norm": 0.6466975808143616, + "learning_rate": 1.1755314904214284e-05, + "loss": 0.2558, + "step": 9659 + }, + { + "epoch": 0.910011540001413, + "grad_norm": 0.6765708327293396, + "learning_rate": 1.175382831092735e-05, + "loss": 0.3029, + "step": 9660 + }, + { + "epoch": 0.9101057440945809, + "grad_norm": 0.6689856648445129, + "learning_rate": 1.175234167765042e-05, + "loss": 0.2646, + "step": 9661 + }, + { + "epoch": 0.9101999481877487, + "grad_norm": 0.829488217830658, + "learning_rate": 1.175085500441739e-05, + "loss": 0.3299, + "step": 9662 + }, + { + "epoch": 0.9102941522809166, + "grad_norm": 0.6252883672714233, + "learning_rate": 1.1749368291262158e-05, + "loss": 0.2917, + "step": 9663 + }, + { + "epoch": 0.9103883563740844, + "grad_norm": 0.7971799969673157, + "learning_rate": 1.1747881538218622e-05, + "loss": 0.3722, + "step": 9664 + }, + { + "epoch": 0.9104825604672523, + "grad_norm": 0.7582257390022278, + "learning_rate": 1.1746394745320689e-05, + "loss": 0.3084, + "step": 9665 + }, + { + "epoch": 0.9105767645604201, + "grad_norm": 0.8081191778182983, + "learning_rate": 1.1744907912602248e-05, + "loss": 0.2696, + "step": 9666 + }, + { + "epoch": 0.910670968653588, + "grad_norm": 0.6928661465644836, + "learning_rate": 1.1743421040097209e-05, + "loss": 0.2898, + "step": 9667 + }, + { + "epoch": 0.9107651727467558, + "grad_norm": 0.7181206345558167, + "learning_rate": 1.1741934127839479e-05, + "loss": 0.2697, + "step": 9668 + }, + { + "epoch": 0.9108593768399237, + "grad_norm": 0.6876206994056702, + "learning_rate": 1.1740447175862953e-05, + "loss": 0.2896, + "step": 9669 + }, + { + "epoch": 0.9109535809330915, + "grad_norm": 0.7033278346061707, + "learning_rate": 1.173896018420154e-05, + "loss": 0.2986, + "step": 9670 + }, + { + "epoch": 0.9110477850262594, + "grad_norm": 0.6977961659431458, + "learning_rate": 1.1737473152889147e-05, + "loss": 0.2923, + "step": 9671 + }, + { + "epoch": 0.9111419891194272, + "grad_norm": 0.6762495040893555, + "learning_rate": 1.1735986081959676e-05, + "loss": 0.3171, + "step": 9672 + }, + { + "epoch": 0.9112361932125951, + "grad_norm": 0.7551417350769043, + "learning_rate": 1.1734498971447041e-05, + "loss": 0.3043, + "step": 9673 + }, + { + "epoch": 0.9113303973057629, + "grad_norm": 0.6652315258979797, + "learning_rate": 1.1733011821385148e-05, + "loss": 0.2951, + "step": 9674 + }, + { + "epoch": 0.9114246013989308, + "grad_norm": 0.7785511612892151, + "learning_rate": 1.1731524631807903e-05, + "loss": 0.3246, + "step": 9675 + }, + { + "epoch": 0.9115188054920986, + "grad_norm": 0.7023170590400696, + "learning_rate": 1.1730037402749219e-05, + "loss": 0.3026, + "step": 9676 + }, + { + "epoch": 0.9116130095852665, + "grad_norm": 0.7303873300552368, + "learning_rate": 1.1728550134243004e-05, + "loss": 0.3279, + "step": 9677 + }, + { + "epoch": 0.9117072136784343, + "grad_norm": 0.7390693426132202, + "learning_rate": 1.1727062826323174e-05, + "loss": 0.3266, + "step": 9678 + }, + { + "epoch": 0.9118014177716022, + "grad_norm": 0.7931835651397705, + "learning_rate": 1.1725575479023644e-05, + "loss": 0.338, + "step": 9679 + }, + { + "epoch": 0.91189562186477, + "grad_norm": 0.705696165561676, + "learning_rate": 1.1724088092378324e-05, + "loss": 0.3047, + "step": 9680 + }, + { + "epoch": 0.9119898259579379, + "grad_norm": 0.7783174514770508, + "learning_rate": 1.1722600666421125e-05, + "loss": 0.3069, + "step": 9681 + }, + { + "epoch": 0.9120840300511057, + "grad_norm": 0.6609805822372437, + "learning_rate": 1.1721113201185967e-05, + "loss": 0.2575, + "step": 9682 + }, + { + "epoch": 0.9121782341442736, + "grad_norm": 0.7529018521308899, + "learning_rate": 1.1719625696706772e-05, + "loss": 0.2972, + "step": 9683 + }, + { + "epoch": 0.9122724382374414, + "grad_norm": 0.7474442720413208, + "learning_rate": 1.1718138153017445e-05, + "loss": 0.3023, + "step": 9684 + }, + { + "epoch": 0.9123666423306093, + "grad_norm": 0.7671797275543213, + "learning_rate": 1.1716650570151915e-05, + "loss": 0.3207, + "step": 9685 + }, + { + "epoch": 0.9124608464237771, + "grad_norm": 0.6554548740386963, + "learning_rate": 1.1715162948144094e-05, + "loss": 0.3088, + "step": 9686 + }, + { + "epoch": 0.912555050516945, + "grad_norm": 0.6867275238037109, + "learning_rate": 1.1713675287027906e-05, + "loss": 0.2612, + "step": 9687 + }, + { + "epoch": 0.9126492546101128, + "grad_norm": 0.7825465798377991, + "learning_rate": 1.1712187586837276e-05, + "loss": 0.3124, + "step": 9688 + }, + { + "epoch": 0.9127434587032807, + "grad_norm": 0.736897349357605, + "learning_rate": 1.1710699847606116e-05, + "loss": 0.2912, + "step": 9689 + }, + { + "epoch": 0.9128376627964485, + "grad_norm": 0.7599862813949585, + "learning_rate": 1.1709212069368357e-05, + "loss": 0.2825, + "step": 9690 + }, + { + "epoch": 0.9129318668896164, + "grad_norm": 0.7896125912666321, + "learning_rate": 1.1707724252157917e-05, + "loss": 0.2976, + "step": 9691 + }, + { + "epoch": 0.9130260709827842, + "grad_norm": 0.7773582935333252, + "learning_rate": 1.1706236396008723e-05, + "loss": 0.3227, + "step": 9692 + }, + { + "epoch": 0.9131202750759521, + "grad_norm": 0.6425527334213257, + "learning_rate": 1.1704748500954702e-05, + "loss": 0.2724, + "step": 9693 + }, + { + "epoch": 0.9132144791691199, + "grad_norm": 0.90715092420578, + "learning_rate": 1.1703260567029777e-05, + "loss": 0.2973, + "step": 9694 + }, + { + "epoch": 0.9133086832622878, + "grad_norm": 0.6834738254547119, + "learning_rate": 1.1701772594267879e-05, + "loss": 0.2708, + "step": 9695 + }, + { + "epoch": 0.9134028873554556, + "grad_norm": 0.8027157783508301, + "learning_rate": 1.1700284582702933e-05, + "loss": 0.3065, + "step": 9696 + }, + { + "epoch": 0.9134970914486235, + "grad_norm": 0.716576337814331, + "learning_rate": 1.1698796532368869e-05, + "loss": 0.3185, + "step": 9697 + }, + { + "epoch": 0.9135912955417913, + "grad_norm": 0.7953327894210815, + "learning_rate": 1.1697308443299615e-05, + "loss": 0.3345, + "step": 9698 + }, + { + "epoch": 0.9136854996349592, + "grad_norm": 0.7189897298812866, + "learning_rate": 1.1695820315529108e-05, + "loss": 0.3063, + "step": 9699 + }, + { + "epoch": 0.913779703728127, + "grad_norm": 0.5961300730705261, + "learning_rate": 1.1694332149091272e-05, + "loss": 0.253, + "step": 9700 + }, + { + "epoch": 0.9138739078212949, + "grad_norm": 0.7950000166893005, + "learning_rate": 1.1692843944020041e-05, + "loss": 0.2983, + "step": 9701 + }, + { + "epoch": 0.9139681119144627, + "grad_norm": 0.691615104675293, + "learning_rate": 1.1691355700349351e-05, + "loss": 0.3202, + "step": 9702 + }, + { + "epoch": 0.9140623160076305, + "grad_norm": 0.6892995238304138, + "learning_rate": 1.1689867418113138e-05, + "loss": 0.3255, + "step": 9703 + }, + { + "epoch": 0.9141565201007984, + "grad_norm": 0.7660693526268005, + "learning_rate": 1.168837909734533e-05, + "loss": 0.2855, + "step": 9704 + }, + { + "epoch": 0.9142507241939662, + "grad_norm": 0.7879043817520142, + "learning_rate": 1.1686890738079874e-05, + "loss": 0.2495, + "step": 9705 + }, + { + "epoch": 0.9143449282871341, + "grad_norm": 0.7355771064758301, + "learning_rate": 1.1685402340350695e-05, + "loss": 0.3274, + "step": 9706 + }, + { + "epoch": 0.9144391323803019, + "grad_norm": 0.645781934261322, + "learning_rate": 1.1683913904191737e-05, + "loss": 0.2694, + "step": 9707 + }, + { + "epoch": 0.9145333364734698, + "grad_norm": 0.6447874307632446, + "learning_rate": 1.168242542963694e-05, + "loss": 0.2997, + "step": 9708 + }, + { + "epoch": 0.9146275405666376, + "grad_norm": 0.6921257972717285, + "learning_rate": 1.168093691672024e-05, + "loss": 0.2729, + "step": 9709 + }, + { + "epoch": 0.9147217446598055, + "grad_norm": 0.5812404751777649, + "learning_rate": 1.1679448365475579e-05, + "loss": 0.2846, + "step": 9710 + }, + { + "epoch": 0.9148159487529733, + "grad_norm": 0.8316332697868347, + "learning_rate": 1.1677959775936898e-05, + "loss": 0.299, + "step": 9711 + }, + { + "epoch": 0.9149101528461412, + "grad_norm": 0.6069605946540833, + "learning_rate": 1.1676471148138136e-05, + "loss": 0.2762, + "step": 9712 + }, + { + "epoch": 0.915004356939309, + "grad_norm": 0.6823188066482544, + "learning_rate": 1.1674982482113242e-05, + "loss": 0.2947, + "step": 9713 + }, + { + "epoch": 0.9150985610324769, + "grad_norm": 0.8961073160171509, + "learning_rate": 1.1673493777896157e-05, + "loss": 0.2727, + "step": 9714 + }, + { + "epoch": 0.9151927651256447, + "grad_norm": 0.7483460307121277, + "learning_rate": 1.1672005035520826e-05, + "loss": 0.2909, + "step": 9715 + }, + { + "epoch": 0.9152869692188126, + "grad_norm": 0.6980855464935303, + "learning_rate": 1.1670516255021193e-05, + "loss": 0.2999, + "step": 9716 + }, + { + "epoch": 0.9153811733119804, + "grad_norm": 0.732695460319519, + "learning_rate": 1.1669027436431205e-05, + "loss": 0.3461, + "step": 9717 + }, + { + "epoch": 0.9154753774051483, + "grad_norm": 0.9831451773643494, + "learning_rate": 1.1667538579784813e-05, + "loss": 0.3111, + "step": 9718 + }, + { + "epoch": 0.9155695814983161, + "grad_norm": 0.6631169319152832, + "learning_rate": 1.1666049685115963e-05, + "loss": 0.317, + "step": 9719 + }, + { + "epoch": 0.915663785591484, + "grad_norm": 0.6973643898963928, + "learning_rate": 1.1664560752458602e-05, + "loss": 0.2831, + "step": 9720 + }, + { + "epoch": 0.9157579896846518, + "grad_norm": 0.7669511437416077, + "learning_rate": 1.166307178184668e-05, + "loss": 0.3111, + "step": 9721 + }, + { + "epoch": 0.9158521937778197, + "grad_norm": 0.6711779236793518, + "learning_rate": 1.1661582773314151e-05, + "loss": 0.3339, + "step": 9722 + }, + { + "epoch": 0.9159463978709875, + "grad_norm": 0.7318465709686279, + "learning_rate": 1.1660093726894966e-05, + "loss": 0.3181, + "step": 9723 + }, + { + "epoch": 0.9160406019641554, + "grad_norm": 0.6497591733932495, + "learning_rate": 1.1658604642623075e-05, + "loss": 0.2568, + "step": 9724 + }, + { + "epoch": 0.9161348060573232, + "grad_norm": 0.7274473309516907, + "learning_rate": 1.1657115520532436e-05, + "loss": 0.3035, + "step": 9725 + }, + { + "epoch": 0.9162290101504911, + "grad_norm": 0.7577809691429138, + "learning_rate": 1.1655626360656998e-05, + "loss": 0.3076, + "step": 9726 + }, + { + "epoch": 0.9163232142436589, + "grad_norm": 0.6475155353546143, + "learning_rate": 1.1654137163030714e-05, + "loss": 0.2598, + "step": 9727 + }, + { + "epoch": 0.9164174183368268, + "grad_norm": 0.7633470892906189, + "learning_rate": 1.1652647927687553e-05, + "loss": 0.2993, + "step": 9728 + }, + { + "epoch": 0.9165116224299946, + "grad_norm": 0.6230948567390442, + "learning_rate": 1.1651158654661458e-05, + "loss": 0.2753, + "step": 9729 + }, + { + "epoch": 0.9166058265231625, + "grad_norm": 0.7631728053092957, + "learning_rate": 1.1649669343986393e-05, + "loss": 0.3044, + "step": 9730 + }, + { + "epoch": 0.9167000306163303, + "grad_norm": 0.6901342868804932, + "learning_rate": 1.1648179995696319e-05, + "loss": 0.3139, + "step": 9731 + }, + { + "epoch": 0.9167942347094982, + "grad_norm": 0.720503032207489, + "learning_rate": 1.1646690609825186e-05, + "loss": 0.359, + "step": 9732 + }, + { + "epoch": 0.916888438802666, + "grad_norm": 0.6577574610710144, + "learning_rate": 1.1645201186406965e-05, + "loss": 0.2636, + "step": 9733 + }, + { + "epoch": 0.9169826428958339, + "grad_norm": 0.7542081475257874, + "learning_rate": 1.1643711725475613e-05, + "loss": 0.3102, + "step": 9734 + }, + { + "epoch": 0.9170768469890017, + "grad_norm": 0.6242924928665161, + "learning_rate": 1.164222222706509e-05, + "loss": 0.2623, + "step": 9735 + }, + { + "epoch": 0.9171710510821696, + "grad_norm": 0.7153643369674683, + "learning_rate": 1.164073269120936e-05, + "loss": 0.3032, + "step": 9736 + }, + { + "epoch": 0.9172652551753374, + "grad_norm": 0.8726444840431213, + "learning_rate": 1.1639243117942387e-05, + "loss": 0.3244, + "step": 9737 + }, + { + "epoch": 0.9173594592685053, + "grad_norm": 0.7901492714881897, + "learning_rate": 1.1637753507298138e-05, + "loss": 0.2913, + "step": 9738 + }, + { + "epoch": 0.9174536633616731, + "grad_norm": 0.799914538860321, + "learning_rate": 1.1636263859310572e-05, + "loss": 0.3358, + "step": 9739 + }, + { + "epoch": 0.917547867454841, + "grad_norm": 0.7553713321685791, + "learning_rate": 1.1634774174013664e-05, + "loss": 0.2947, + "step": 9740 + }, + { + "epoch": 0.9176420715480088, + "grad_norm": 0.7834179401397705, + "learning_rate": 1.1633284451441373e-05, + "loss": 0.3046, + "step": 9741 + }, + { + "epoch": 0.9177362756411767, + "grad_norm": 0.9598224759101868, + "learning_rate": 1.1631794691627673e-05, + "loss": 0.2837, + "step": 9742 + }, + { + "epoch": 0.9178304797343445, + "grad_norm": 0.7087194919586182, + "learning_rate": 1.163030489460653e-05, + "loss": 0.3184, + "step": 9743 + }, + { + "epoch": 0.9179246838275124, + "grad_norm": 0.7266285419464111, + "learning_rate": 1.1628815060411913e-05, + "loss": 0.3478, + "step": 9744 + }, + { + "epoch": 0.9180188879206802, + "grad_norm": 0.7172000408172607, + "learning_rate": 1.1627325189077796e-05, + "loss": 0.3551, + "step": 9745 + }, + { + "epoch": 0.918113092013848, + "grad_norm": 0.7453349232673645, + "learning_rate": 1.1625835280638147e-05, + "loss": 0.2823, + "step": 9746 + }, + { + "epoch": 0.9182072961070159, + "grad_norm": 0.672122061252594, + "learning_rate": 1.1624345335126939e-05, + "loss": 0.2861, + "step": 9747 + }, + { + "epoch": 0.9183015002001838, + "grad_norm": 0.7447899580001831, + "learning_rate": 1.1622855352578144e-05, + "loss": 0.3103, + "step": 9748 + }, + { + "epoch": 0.9183957042933516, + "grad_norm": 0.6708574295043945, + "learning_rate": 1.1621365333025736e-05, + "loss": 0.3195, + "step": 9749 + }, + { + "epoch": 0.9184899083865194, + "grad_norm": 0.6426267027854919, + "learning_rate": 1.1619875276503694e-05, + "loss": 0.3075, + "step": 9750 + }, + { + "epoch": 0.9185841124796872, + "grad_norm": 0.628358781337738, + "learning_rate": 1.1618385183045991e-05, + "loss": 0.3084, + "step": 9751 + }, + { + "epoch": 0.918678316572855, + "grad_norm": 0.7134820818901062, + "learning_rate": 1.1616895052686598e-05, + "loss": 0.302, + "step": 9752 + }, + { + "epoch": 0.9187725206660229, + "grad_norm": 1.484278917312622, + "learning_rate": 1.1615404885459503e-05, + "loss": 0.3166, + "step": 9753 + }, + { + "epoch": 0.9188667247591907, + "grad_norm": 0.7471532225608826, + "learning_rate": 1.1613914681398677e-05, + "loss": 0.3484, + "step": 9754 + }, + { + "epoch": 0.9189609288523586, + "grad_norm": 0.7318947315216064, + "learning_rate": 1.1612424440538099e-05, + "loss": 0.2967, + "step": 9755 + }, + { + "epoch": 0.9190551329455264, + "grad_norm": 0.8191625475883484, + "learning_rate": 1.1610934162911751e-05, + "loss": 0.287, + "step": 9756 + }, + { + "epoch": 0.9191493370386943, + "grad_norm": 0.6804456114768982, + "learning_rate": 1.160944384855361e-05, + "loss": 0.3196, + "step": 9757 + }, + { + "epoch": 0.9192435411318621, + "grad_norm": 0.6854345798492432, + "learning_rate": 1.1607953497497664e-05, + "loss": 0.3422, + "step": 9758 + }, + { + "epoch": 0.91933774522503, + "grad_norm": 0.6621114611625671, + "learning_rate": 1.160646310977789e-05, + "loss": 0.3121, + "step": 9759 + }, + { + "epoch": 0.9194319493181978, + "grad_norm": 0.7258914113044739, + "learning_rate": 1.1604972685428273e-05, + "loss": 0.2967, + "step": 9760 + }, + { + "epoch": 0.9195261534113657, + "grad_norm": 0.7967356443405151, + "learning_rate": 1.1603482224482793e-05, + "loss": 0.2794, + "step": 9761 + }, + { + "epoch": 0.9196203575045335, + "grad_norm": 0.7581629157066345, + "learning_rate": 1.1601991726975443e-05, + "loss": 0.3259, + "step": 9762 + }, + { + "epoch": 0.9197145615977014, + "grad_norm": 0.8769176006317139, + "learning_rate": 1.1600501192940203e-05, + "loss": 0.3529, + "step": 9763 + }, + { + "epoch": 0.9198087656908692, + "grad_norm": 0.7249931693077087, + "learning_rate": 1.159901062241106e-05, + "loss": 0.2776, + "step": 9764 + }, + { + "epoch": 0.9199029697840371, + "grad_norm": 0.8734459280967712, + "learning_rate": 1.1597520015422003e-05, + "loss": 0.3294, + "step": 9765 + }, + { + "epoch": 0.9199971738772049, + "grad_norm": 0.8458078503608704, + "learning_rate": 1.1596029372007018e-05, + "loss": 0.2858, + "step": 9766 + }, + { + "epoch": 0.9200913779703728, + "grad_norm": 0.7283058166503906, + "learning_rate": 1.1594538692200094e-05, + "loss": 0.3031, + "step": 9767 + }, + { + "epoch": 0.9201855820635406, + "grad_norm": 0.6813495755195618, + "learning_rate": 1.1593047976035226e-05, + "loss": 0.2686, + "step": 9768 + }, + { + "epoch": 0.9202797861567085, + "grad_norm": 0.7182820439338684, + "learning_rate": 1.1591557223546394e-05, + "loss": 0.3068, + "step": 9769 + }, + { + "epoch": 0.9203739902498763, + "grad_norm": 0.9071686863899231, + "learning_rate": 1.15900664347676e-05, + "loss": 0.2864, + "step": 9770 + }, + { + "epoch": 0.9204681943430442, + "grad_norm": 0.7065544724464417, + "learning_rate": 1.1588575609732833e-05, + "loss": 0.3142, + "step": 9771 + }, + { + "epoch": 0.920562398436212, + "grad_norm": 0.9485142827033997, + "learning_rate": 1.1587084748476082e-05, + "loss": 0.3159, + "step": 9772 + }, + { + "epoch": 0.9206566025293799, + "grad_norm": 0.7493008375167847, + "learning_rate": 1.1585593851031346e-05, + "loss": 0.3286, + "step": 9773 + }, + { + "epoch": 0.9207508066225477, + "grad_norm": 0.7871357202529907, + "learning_rate": 1.158410291743262e-05, + "loss": 0.2836, + "step": 9774 + }, + { + "epoch": 0.9208450107157156, + "grad_norm": 0.815697193145752, + "learning_rate": 1.1582611947713896e-05, + "loss": 0.3324, + "step": 9775 + }, + { + "epoch": 0.9209392148088834, + "grad_norm": 0.7253048419952393, + "learning_rate": 1.1581120941909172e-05, + "loss": 0.2837, + "step": 9776 + }, + { + "epoch": 0.9210334189020513, + "grad_norm": 0.7250169515609741, + "learning_rate": 1.1579629900052442e-05, + "loss": 0.3094, + "step": 9777 + }, + { + "epoch": 0.9211276229952191, + "grad_norm": 2.9205780029296875, + "learning_rate": 1.1578138822177711e-05, + "loss": 0.3058, + "step": 9778 + }, + { + "epoch": 0.921221827088387, + "grad_norm": 0.6662350296974182, + "learning_rate": 1.1576647708318975e-05, + "loss": 0.2927, + "step": 9779 + }, + { + "epoch": 0.9213160311815548, + "grad_norm": 0.8149073719978333, + "learning_rate": 1.1575156558510232e-05, + "loss": 0.3248, + "step": 9780 + }, + { + "epoch": 0.9214102352747227, + "grad_norm": 0.7538041472434998, + "learning_rate": 1.1573665372785482e-05, + "loss": 0.2923, + "step": 9781 + }, + { + "epoch": 0.9215044393678905, + "grad_norm": 0.6875706315040588, + "learning_rate": 1.157217415117873e-05, + "loss": 0.3075, + "step": 9782 + }, + { + "epoch": 0.9215986434610584, + "grad_norm": 0.733664333820343, + "learning_rate": 1.1570682893723975e-05, + "loss": 0.3223, + "step": 9783 + }, + { + "epoch": 0.9216928475542262, + "grad_norm": 0.6781983971595764, + "learning_rate": 1.1569191600455219e-05, + "loss": 0.3025, + "step": 9784 + }, + { + "epoch": 0.921787051647394, + "grad_norm": 0.915701687335968, + "learning_rate": 1.1567700271406473e-05, + "loss": 0.3676, + "step": 9785 + }, + { + "epoch": 0.9218812557405619, + "grad_norm": 0.697933554649353, + "learning_rate": 1.1566208906611728e-05, + "loss": 0.3016, + "step": 9786 + }, + { + "epoch": 0.9219754598337297, + "grad_norm": 0.734963059425354, + "learning_rate": 1.1564717506105006e-05, + "loss": 0.3465, + "step": 9787 + }, + { + "epoch": 0.9220696639268976, + "grad_norm": 0.6184594631195068, + "learning_rate": 1.15632260699203e-05, + "loss": 0.2867, + "step": 9788 + }, + { + "epoch": 0.9221638680200654, + "grad_norm": 0.756898045539856, + "learning_rate": 1.1561734598091624e-05, + "loss": 0.2658, + "step": 9789 + }, + { + "epoch": 0.9222580721132333, + "grad_norm": 0.7471708059310913, + "learning_rate": 1.1560243090652982e-05, + "loss": 0.283, + "step": 9790 + }, + { + "epoch": 0.9223522762064011, + "grad_norm": 1.0527830123901367, + "learning_rate": 1.1558751547638387e-05, + "loss": 0.3118, + "step": 9791 + }, + { + "epoch": 0.922446480299569, + "grad_norm": 0.7618290781974792, + "learning_rate": 1.1557259969081841e-05, + "loss": 0.2852, + "step": 9792 + }, + { + "epoch": 0.9225406843927368, + "grad_norm": 0.7631360292434692, + "learning_rate": 1.1555768355017368e-05, + "loss": 0.3068, + "step": 9793 + }, + { + "epoch": 0.9226348884859047, + "grad_norm": 0.7242071032524109, + "learning_rate": 1.1554276705478964e-05, + "loss": 0.292, + "step": 9794 + }, + { + "epoch": 0.9227290925790725, + "grad_norm": 0.6794859170913696, + "learning_rate": 1.155278502050065e-05, + "loss": 0.3321, + "step": 9795 + }, + { + "epoch": 0.9228232966722404, + "grad_norm": 0.7960864901542664, + "learning_rate": 1.1551293300116435e-05, + "loss": 0.3055, + "step": 9796 + }, + { + "epoch": 0.9229175007654082, + "grad_norm": 0.6520786285400391, + "learning_rate": 1.1549801544360333e-05, + "loss": 0.3028, + "step": 9797 + }, + { + "epoch": 0.9230117048585761, + "grad_norm": 0.6842049956321716, + "learning_rate": 1.154830975326636e-05, + "loss": 0.2905, + "step": 9798 + }, + { + "epoch": 0.9231059089517439, + "grad_norm": 1.0332279205322266, + "learning_rate": 1.1546817926868529e-05, + "loss": 0.325, + "step": 9799 + }, + { + "epoch": 0.9232001130449118, + "grad_norm": 0.6430279016494751, + "learning_rate": 1.154532606520086e-05, + "loss": 0.2846, + "step": 9800 + }, + { + "epoch": 0.9232943171380796, + "grad_norm": 0.7600269317626953, + "learning_rate": 1.1543834168297363e-05, + "loss": 0.3688, + "step": 9801 + }, + { + "epoch": 0.9233885212312475, + "grad_norm": 0.66355299949646, + "learning_rate": 1.154234223619206e-05, + "loss": 0.2635, + "step": 9802 + }, + { + "epoch": 0.9234827253244153, + "grad_norm": 0.7806983590126038, + "learning_rate": 1.1540850268918973e-05, + "loss": 0.3218, + "step": 9803 + }, + { + "epoch": 0.9235769294175832, + "grad_norm": 0.6844689846038818, + "learning_rate": 1.1539358266512114e-05, + "loss": 0.3022, + "step": 9804 + }, + { + "epoch": 0.923671133510751, + "grad_norm": 0.7546319961547852, + "learning_rate": 1.1537866229005505e-05, + "loss": 0.3212, + "step": 9805 + }, + { + "epoch": 0.9237653376039189, + "grad_norm": 0.7607617378234863, + "learning_rate": 1.1536374156433168e-05, + "loss": 0.3094, + "step": 9806 + }, + { + "epoch": 0.9238595416970867, + "grad_norm": 0.6887412071228027, + "learning_rate": 1.1534882048829126e-05, + "loss": 0.3002, + "step": 9807 + }, + { + "epoch": 0.9239537457902546, + "grad_norm": 0.7532636523246765, + "learning_rate": 1.1533389906227396e-05, + "loss": 0.2705, + "step": 9808 + }, + { + "epoch": 0.9240479498834224, + "grad_norm": 0.7626668810844421, + "learning_rate": 1.1531897728662008e-05, + "loss": 0.298, + "step": 9809 + }, + { + "epoch": 0.9241421539765903, + "grad_norm": 0.6450669765472412, + "learning_rate": 1.153040551616698e-05, + "loss": 0.2722, + "step": 9810 + }, + { + "epoch": 0.9242363580697581, + "grad_norm": 0.9094744324684143, + "learning_rate": 1.1528913268776342e-05, + "loss": 0.3366, + "step": 9811 + }, + { + "epoch": 0.924330562162926, + "grad_norm": 0.6546444296836853, + "learning_rate": 1.1527420986524114e-05, + "loss": 0.3202, + "step": 9812 + }, + { + "epoch": 0.9244247662560938, + "grad_norm": 0.8807068467140198, + "learning_rate": 1.152592866944433e-05, + "loss": 0.3083, + "step": 9813 + }, + { + "epoch": 0.9245189703492617, + "grad_norm": 0.7280904054641724, + "learning_rate": 1.1524436317571008e-05, + "loss": 0.2781, + "step": 9814 + }, + { + "epoch": 0.9246131744424295, + "grad_norm": 0.8132057785987854, + "learning_rate": 1.1522943930938185e-05, + "loss": 0.3002, + "step": 9815 + }, + { + "epoch": 0.9247073785355974, + "grad_norm": 0.7954848408699036, + "learning_rate": 1.152145150957988e-05, + "loss": 0.2989, + "step": 9816 + }, + { + "epoch": 0.9248015826287652, + "grad_norm": 0.7015737295150757, + "learning_rate": 1.151995905353013e-05, + "loss": 0.3033, + "step": 9817 + }, + { + "epoch": 0.9248957867219331, + "grad_norm": 0.8260770440101624, + "learning_rate": 1.1518466562822961e-05, + "loss": 0.3103, + "step": 9818 + }, + { + "epoch": 0.9249899908151009, + "grad_norm": 0.6957612633705139, + "learning_rate": 1.1516974037492408e-05, + "loss": 0.2967, + "step": 9819 + }, + { + "epoch": 0.9250841949082688, + "grad_norm": 0.7427754402160645, + "learning_rate": 1.1515481477572502e-05, + "loss": 0.3448, + "step": 9820 + }, + { + "epoch": 0.9251783990014366, + "grad_norm": 0.8178347945213318, + "learning_rate": 1.1513988883097271e-05, + "loss": 0.3638, + "step": 9821 + }, + { + "epoch": 0.9252726030946045, + "grad_norm": 0.8922828435897827, + "learning_rate": 1.1512496254100756e-05, + "loss": 0.2693, + "step": 9822 + }, + { + "epoch": 0.9253668071877723, + "grad_norm": 0.9412736892700195, + "learning_rate": 1.1511003590616984e-05, + "loss": 0.3258, + "step": 9823 + }, + { + "epoch": 0.9254610112809402, + "grad_norm": 0.6674445271492004, + "learning_rate": 1.1509510892679994e-05, + "loss": 0.2777, + "step": 9824 + }, + { + "epoch": 0.925555215374108, + "grad_norm": 0.595525860786438, + "learning_rate": 1.1508018160323825e-05, + "loss": 0.2653, + "step": 9825 + }, + { + "epoch": 0.9256494194672759, + "grad_norm": 0.7466319799423218, + "learning_rate": 1.1506525393582505e-05, + "loss": 0.3316, + "step": 9826 + }, + { + "epoch": 0.9257436235604437, + "grad_norm": 0.7886889576911926, + "learning_rate": 1.1505032592490077e-05, + "loss": 0.3134, + "step": 9827 + }, + { + "epoch": 0.9258378276536116, + "grad_norm": 0.704491913318634, + "learning_rate": 1.150353975708058e-05, + "loss": 0.3163, + "step": 9828 + }, + { + "epoch": 0.9259320317467794, + "grad_norm": 0.6642715334892273, + "learning_rate": 1.150204688738805e-05, + "loss": 0.2488, + "step": 9829 + }, + { + "epoch": 0.9260262358399473, + "grad_norm": 0.610035240650177, + "learning_rate": 1.1500553983446527e-05, + "loss": 0.2435, + "step": 9830 + }, + { + "epoch": 0.9261204399331151, + "grad_norm": 0.7585195899009705, + "learning_rate": 1.1499061045290057e-05, + "loss": 0.312, + "step": 9831 + }, + { + "epoch": 0.926214644026283, + "grad_norm": 0.6760469675064087, + "learning_rate": 1.149756807295267e-05, + "loss": 0.2902, + "step": 9832 + }, + { + "epoch": 0.9263088481194508, + "grad_norm": 0.6611597537994385, + "learning_rate": 1.1496075066468422e-05, + "loss": 0.293, + "step": 9833 + }, + { + "epoch": 0.9264030522126186, + "grad_norm": 0.7652198672294617, + "learning_rate": 1.1494582025871343e-05, + "loss": 0.3129, + "step": 9834 + }, + { + "epoch": 0.9264972563057865, + "grad_norm": 0.6663600206375122, + "learning_rate": 1.1493088951195486e-05, + "loss": 0.2845, + "step": 9835 + }, + { + "epoch": 0.9265914603989543, + "grad_norm": 0.7061387300491333, + "learning_rate": 1.1491595842474892e-05, + "loss": 0.3229, + "step": 9836 + }, + { + "epoch": 0.9266856644921222, + "grad_norm": 0.6897725462913513, + "learning_rate": 1.1490102699743602e-05, + "loss": 0.3143, + "step": 9837 + }, + { + "epoch": 0.92677986858529, + "grad_norm": 0.575584888458252, + "learning_rate": 1.1488609523035667e-05, + "loss": 0.2256, + "step": 9838 + }, + { + "epoch": 0.9268740726784579, + "grad_norm": 0.8263952136039734, + "learning_rate": 1.1487116312385135e-05, + "loss": 0.3084, + "step": 9839 + }, + { + "epoch": 0.9269682767716257, + "grad_norm": 0.6921063661575317, + "learning_rate": 1.1485623067826053e-05, + "loss": 0.3259, + "step": 9840 + }, + { + "epoch": 0.9270624808647936, + "grad_norm": 0.7334921956062317, + "learning_rate": 1.1484129789392462e-05, + "loss": 0.312, + "step": 9841 + }, + { + "epoch": 0.9271566849579614, + "grad_norm": 0.688546895980835, + "learning_rate": 1.148263647711842e-05, + "loss": 0.268, + "step": 9842 + }, + { + "epoch": 0.9272508890511293, + "grad_norm": 0.7391108274459839, + "learning_rate": 1.1481143131037976e-05, + "loss": 0.2781, + "step": 9843 + }, + { + "epoch": 0.9273450931442971, + "grad_norm": 0.7827560305595398, + "learning_rate": 1.147964975118517e-05, + "loss": 0.3151, + "step": 9844 + }, + { + "epoch": 0.927439297237465, + "grad_norm": 0.7109737396240234, + "learning_rate": 1.147815633759407e-05, + "loss": 0.299, + "step": 9845 + }, + { + "epoch": 0.9275335013306328, + "grad_norm": 0.7421813011169434, + "learning_rate": 1.1476662890298713e-05, + "loss": 0.286, + "step": 9846 + }, + { + "epoch": 0.9276277054238007, + "grad_norm": 0.7915611863136292, + "learning_rate": 1.1475169409333163e-05, + "loss": 0.2693, + "step": 9847 + }, + { + "epoch": 0.9277219095169685, + "grad_norm": 0.7203260064125061, + "learning_rate": 1.1473675894731468e-05, + "loss": 0.3195, + "step": 9848 + }, + { + "epoch": 0.9278161136101364, + "grad_norm": 0.7064208984375, + "learning_rate": 1.147218234652768e-05, + "loss": 0.3031, + "step": 9849 + }, + { + "epoch": 0.9279103177033042, + "grad_norm": 0.6969903707504272, + "learning_rate": 1.1470688764755862e-05, + "loss": 0.2761, + "step": 9850 + }, + { + "epoch": 0.9280045217964721, + "grad_norm": 0.7239713668823242, + "learning_rate": 1.1469195149450063e-05, + "loss": 0.2885, + "step": 9851 + }, + { + "epoch": 0.9280987258896399, + "grad_norm": 0.7491258978843689, + "learning_rate": 1.1467701500644344e-05, + "loss": 0.303, + "step": 9852 + }, + { + "epoch": 0.9281929299828078, + "grad_norm": 0.6255550980567932, + "learning_rate": 1.1466207818372764e-05, + "loss": 0.2871, + "step": 9853 + }, + { + "epoch": 0.9282871340759756, + "grad_norm": 0.8438575863838196, + "learning_rate": 1.146471410266937e-05, + "loss": 0.3044, + "step": 9854 + }, + { + "epoch": 0.9283813381691435, + "grad_norm": 0.7754700183868408, + "learning_rate": 1.1463220353568236e-05, + "loss": 0.2985, + "step": 9855 + }, + { + "epoch": 0.9284755422623113, + "grad_norm": 0.6884915232658386, + "learning_rate": 1.1461726571103413e-05, + "loss": 0.3185, + "step": 9856 + }, + { + "epoch": 0.9285697463554792, + "grad_norm": 0.6350425481796265, + "learning_rate": 1.1460232755308962e-05, + "loss": 0.2889, + "step": 9857 + }, + { + "epoch": 0.928663950448647, + "grad_norm": 0.7048108577728271, + "learning_rate": 1.1458738906218947e-05, + "loss": 0.3242, + "step": 9858 + }, + { + "epoch": 0.9287581545418149, + "grad_norm": 0.8387030959129333, + "learning_rate": 1.145724502386743e-05, + "loss": 0.3983, + "step": 9859 + }, + { + "epoch": 0.9288523586349827, + "grad_norm": 0.8022667765617371, + "learning_rate": 1.1455751108288474e-05, + "loss": 0.2726, + "step": 9860 + }, + { + "epoch": 0.9289465627281506, + "grad_norm": 0.6724227070808411, + "learning_rate": 1.1454257159516139e-05, + "loss": 0.2752, + "step": 9861 + }, + { + "epoch": 0.9290407668213184, + "grad_norm": 0.6579088568687439, + "learning_rate": 1.1452763177584491e-05, + "loss": 0.291, + "step": 9862 + }, + { + "epoch": 0.9291349709144863, + "grad_norm": 0.6796449422836304, + "learning_rate": 1.1451269162527598e-05, + "loss": 0.3241, + "step": 9863 + }, + { + "epoch": 0.9292291750076541, + "grad_norm": 0.6931931972503662, + "learning_rate": 1.1449775114379523e-05, + "loss": 0.3118, + "step": 9864 + }, + { + "epoch": 0.929323379100822, + "grad_norm": 0.7058824896812439, + "learning_rate": 1.1448281033174333e-05, + "loss": 0.3212, + "step": 9865 + }, + { + "epoch": 0.9294175831939898, + "grad_norm": 0.7156518697738647, + "learning_rate": 1.1446786918946094e-05, + "loss": 0.3393, + "step": 9866 + }, + { + "epoch": 0.9295117872871577, + "grad_norm": 0.7381418943405151, + "learning_rate": 1.1445292771728877e-05, + "loss": 0.3464, + "step": 9867 + }, + { + "epoch": 0.9296059913803255, + "grad_norm": 0.725773811340332, + "learning_rate": 1.1443798591556751e-05, + "loss": 0.2916, + "step": 9868 + }, + { + "epoch": 0.9297001954734934, + "grad_norm": 0.8342340588569641, + "learning_rate": 1.1442304378463782e-05, + "loss": 0.3409, + "step": 9869 + }, + { + "epoch": 0.9297943995666612, + "grad_norm": 0.6926758885383606, + "learning_rate": 1.1440810132484043e-05, + "loss": 0.2702, + "step": 9870 + }, + { + "epoch": 0.9298886036598291, + "grad_norm": 0.721227765083313, + "learning_rate": 1.1439315853651607e-05, + "loss": 0.2773, + "step": 9871 + }, + { + "epoch": 0.9299828077529969, + "grad_norm": 0.8517300486564636, + "learning_rate": 1.143782154200054e-05, + "loss": 0.292, + "step": 9872 + }, + { + "epoch": 0.9300770118461648, + "grad_norm": 0.649409830570221, + "learning_rate": 1.1436327197564926e-05, + "loss": 0.2606, + "step": 9873 + }, + { + "epoch": 0.9301712159393326, + "grad_norm": 0.7920855283737183, + "learning_rate": 1.1434832820378821e-05, + "loss": 0.2314, + "step": 9874 + }, + { + "epoch": 0.9302654200325005, + "grad_norm": 0.6702718734741211, + "learning_rate": 1.1433338410476313e-05, + "loss": 0.2517, + "step": 9875 + }, + { + "epoch": 0.9303596241256683, + "grad_norm": 0.6656098961830139, + "learning_rate": 1.1431843967891471e-05, + "loss": 0.2636, + "step": 9876 + }, + { + "epoch": 0.9304538282188362, + "grad_norm": 0.6624272465705872, + "learning_rate": 1.1430349492658372e-05, + "loss": 0.2923, + "step": 9877 + }, + { + "epoch": 0.930548032312004, + "grad_norm": 0.7577387690544128, + "learning_rate": 1.1428854984811095e-05, + "loss": 0.2981, + "step": 9878 + }, + { + "epoch": 0.9306422364051719, + "grad_norm": 0.6593563556671143, + "learning_rate": 1.1427360444383715e-05, + "loss": 0.2602, + "step": 9879 + }, + { + "epoch": 0.9307364404983397, + "grad_norm": 0.6353920102119446, + "learning_rate": 1.1425865871410306e-05, + "loss": 0.2895, + "step": 9880 + }, + { + "epoch": 0.9308306445915075, + "grad_norm": 0.9484451413154602, + "learning_rate": 1.1424371265924951e-05, + "loss": 0.3441, + "step": 9881 + }, + { + "epoch": 0.9309248486846754, + "grad_norm": 0.7541933655738831, + "learning_rate": 1.142287662796173e-05, + "loss": 0.3365, + "step": 9882 + }, + { + "epoch": 0.9310190527778432, + "grad_norm": 1.1388295888900757, + "learning_rate": 1.142138195755472e-05, + "loss": 0.3024, + "step": 9883 + }, + { + "epoch": 0.9311132568710111, + "grad_norm": 0.7818012237548828, + "learning_rate": 1.1419887254738005e-05, + "loss": 0.3064, + "step": 9884 + }, + { + "epoch": 0.931207460964179, + "grad_norm": 0.8127809762954712, + "learning_rate": 1.1418392519545665e-05, + "loss": 0.3171, + "step": 9885 + }, + { + "epoch": 0.9313016650573468, + "grad_norm": 0.8229526877403259, + "learning_rate": 1.1416897752011777e-05, + "loss": 0.3034, + "step": 9886 + }, + { + "epoch": 0.9313958691505146, + "grad_norm": 0.7284834980964661, + "learning_rate": 1.1415402952170434e-05, + "loss": 0.2828, + "step": 9887 + }, + { + "epoch": 0.9314900732436825, + "grad_norm": 0.7439441680908203, + "learning_rate": 1.1413908120055712e-05, + "loss": 0.3078, + "step": 9888 + }, + { + "epoch": 0.9315842773368503, + "grad_norm": 0.7771740555763245, + "learning_rate": 1.1412413255701698e-05, + "loss": 0.3444, + "step": 9889 + }, + { + "epoch": 0.9316784814300181, + "grad_norm": 0.8692653179168701, + "learning_rate": 1.1410918359142482e-05, + "loss": 0.2878, + "step": 9890 + }, + { + "epoch": 0.9317726855231859, + "grad_norm": 0.7473660707473755, + "learning_rate": 1.1409423430412141e-05, + "loss": 0.281, + "step": 9891 + }, + { + "epoch": 0.9318668896163538, + "grad_norm": 0.6817148327827454, + "learning_rate": 1.1407928469544765e-05, + "loss": 0.2449, + "step": 9892 + }, + { + "epoch": 0.9319610937095216, + "grad_norm": 0.7222283482551575, + "learning_rate": 1.1406433476574446e-05, + "loss": 0.309, + "step": 9893 + }, + { + "epoch": 0.9320552978026895, + "grad_norm": 0.6757322549819946, + "learning_rate": 1.1404938451535265e-05, + "loss": 0.2896, + "step": 9894 + }, + { + "epoch": 0.9321495018958573, + "grad_norm": 0.6814132332801819, + "learning_rate": 1.1403443394461318e-05, + "loss": 0.3168, + "step": 9895 + }, + { + "epoch": 0.9322437059890252, + "grad_norm": 0.7547572255134583, + "learning_rate": 1.140194830538669e-05, + "loss": 0.3021, + "step": 9896 + }, + { + "epoch": 0.932337910082193, + "grad_norm": 0.6935548186302185, + "learning_rate": 1.140045318434547e-05, + "loss": 0.2886, + "step": 9897 + }, + { + "epoch": 0.9324321141753609, + "grad_norm": 0.7061550617218018, + "learning_rate": 1.1398958031371756e-05, + "loss": 0.3078, + "step": 9898 + }, + { + "epoch": 0.9325263182685287, + "grad_norm": 0.8823407292366028, + "learning_rate": 1.1397462846499633e-05, + "loss": 0.2965, + "step": 9899 + }, + { + "epoch": 0.9326205223616966, + "grad_norm": 0.7356183528900146, + "learning_rate": 1.1395967629763196e-05, + "loss": 0.3226, + "step": 9900 + }, + { + "epoch": 0.9327147264548644, + "grad_norm": 0.6505039930343628, + "learning_rate": 1.1394472381196537e-05, + "loss": 0.2615, + "step": 9901 + }, + { + "epoch": 0.9328089305480323, + "grad_norm": 0.6908877491950989, + "learning_rate": 1.1392977100833753e-05, + "loss": 0.2697, + "step": 9902 + }, + { + "epoch": 0.9329031346412001, + "grad_norm": 0.8316591382026672, + "learning_rate": 1.1391481788708937e-05, + "loss": 0.3014, + "step": 9903 + }, + { + "epoch": 0.932997338734368, + "grad_norm": 0.6694990396499634, + "learning_rate": 1.1389986444856184e-05, + "loss": 0.2987, + "step": 9904 + }, + { + "epoch": 0.9330915428275358, + "grad_norm": 0.708440899848938, + "learning_rate": 1.138849106930959e-05, + "loss": 0.2957, + "step": 9905 + }, + { + "epoch": 0.9331857469207037, + "grad_norm": 0.745836615562439, + "learning_rate": 1.138699566210325e-05, + "loss": 0.2988, + "step": 9906 + }, + { + "epoch": 0.9332799510138715, + "grad_norm": 0.779005229473114, + "learning_rate": 1.1385500223271266e-05, + "loss": 0.2996, + "step": 9907 + }, + { + "epoch": 0.9333741551070394, + "grad_norm": 0.777803897857666, + "learning_rate": 1.1384004752847734e-05, + "loss": 0.2515, + "step": 9908 + }, + { + "epoch": 0.9334683592002072, + "grad_norm": 0.7843090295791626, + "learning_rate": 1.1382509250866754e-05, + "loss": 0.3664, + "step": 9909 + }, + { + "epoch": 0.9335625632933751, + "grad_norm": 0.6366344094276428, + "learning_rate": 1.1381013717362426e-05, + "loss": 0.2819, + "step": 9910 + }, + { + "epoch": 0.9336567673865429, + "grad_norm": 0.8661206960678101, + "learning_rate": 1.1379518152368846e-05, + "loss": 0.3445, + "step": 9911 + }, + { + "epoch": 0.9337509714797108, + "grad_norm": 0.7795618772506714, + "learning_rate": 1.1378022555920119e-05, + "loss": 0.2953, + "step": 9912 + }, + { + "epoch": 0.9338451755728786, + "grad_norm": 0.7671486139297485, + "learning_rate": 1.137652692805035e-05, + "loss": 0.3227, + "step": 9913 + }, + { + "epoch": 0.9339393796660465, + "grad_norm": 0.6926409006118774, + "learning_rate": 1.1375031268793638e-05, + "loss": 0.2894, + "step": 9914 + }, + { + "epoch": 0.9340335837592143, + "grad_norm": 0.6194040775299072, + "learning_rate": 1.1373535578184083e-05, + "loss": 0.2646, + "step": 9915 + }, + { + "epoch": 0.9341277878523822, + "grad_norm": 0.738124430179596, + "learning_rate": 1.1372039856255795e-05, + "loss": 0.3311, + "step": 9916 + }, + { + "epoch": 0.93422199194555, + "grad_norm": 0.7832310199737549, + "learning_rate": 1.1370544103042875e-05, + "loss": 0.2662, + "step": 9917 + }, + { + "epoch": 0.9343161960387178, + "grad_norm": 0.7839985489845276, + "learning_rate": 1.1369048318579429e-05, + "loss": 0.3003, + "step": 9918 + }, + { + "epoch": 0.9344104001318857, + "grad_norm": 0.6679766178131104, + "learning_rate": 1.1367552502899568e-05, + "loss": 0.2668, + "step": 9919 + }, + { + "epoch": 0.9345046042250535, + "grad_norm": 0.6642523407936096, + "learning_rate": 1.1366056656037395e-05, + "loss": 0.2585, + "step": 9920 + }, + { + "epoch": 0.9345988083182214, + "grad_norm": 0.7177343964576721, + "learning_rate": 1.1364560778027011e-05, + "loss": 0.2941, + "step": 9921 + }, + { + "epoch": 0.9346930124113892, + "grad_norm": 0.7531906366348267, + "learning_rate": 1.1363064868902536e-05, + "loss": 0.2959, + "step": 9922 + }, + { + "epoch": 0.9347872165045571, + "grad_norm": 0.7104706764221191, + "learning_rate": 1.1361568928698074e-05, + "loss": 0.3115, + "step": 9923 + }, + { + "epoch": 0.9348814205977249, + "grad_norm": 0.7429775595664978, + "learning_rate": 1.1360072957447734e-05, + "loss": 0.3076, + "step": 9924 + }, + { + "epoch": 0.9349756246908928, + "grad_norm": 0.8083294630050659, + "learning_rate": 1.135857695518563e-05, + "loss": 0.3072, + "step": 9925 + }, + { + "epoch": 0.9350698287840606, + "grad_norm": 0.8144860863685608, + "learning_rate": 1.1357080921945865e-05, + "loss": 0.2975, + "step": 9926 + }, + { + "epoch": 0.9351640328772285, + "grad_norm": 0.6939120292663574, + "learning_rate": 1.1355584857762559e-05, + "loss": 0.301, + "step": 9927 + }, + { + "epoch": 0.9352582369703963, + "grad_norm": 0.7598026394844055, + "learning_rate": 1.1354088762669822e-05, + "loss": 0.306, + "step": 9928 + }, + { + "epoch": 0.9353524410635642, + "grad_norm": 0.9581432938575745, + "learning_rate": 1.1352592636701765e-05, + "loss": 0.2699, + "step": 9929 + }, + { + "epoch": 0.935446645156732, + "grad_norm": 0.7145228385925293, + "learning_rate": 1.1351096479892508e-05, + "loss": 0.3034, + "step": 9930 + }, + { + "epoch": 0.9355408492498999, + "grad_norm": 0.7225978374481201, + "learning_rate": 1.1349600292276158e-05, + "loss": 0.3001, + "step": 9931 + }, + { + "epoch": 0.9356350533430677, + "grad_norm": 0.9868305921554565, + "learning_rate": 1.1348104073886831e-05, + "loss": 0.2675, + "step": 9932 + }, + { + "epoch": 0.9357292574362356, + "grad_norm": 0.7206736207008362, + "learning_rate": 1.1346607824758656e-05, + "loss": 0.2776, + "step": 9933 + }, + { + "epoch": 0.9358234615294034, + "grad_norm": 0.7475821375846863, + "learning_rate": 1.1345111544925734e-05, + "loss": 0.3236, + "step": 9934 + }, + { + "epoch": 0.9359176656225713, + "grad_norm": 0.8190663456916809, + "learning_rate": 1.1343615234422188e-05, + "loss": 0.2922, + "step": 9935 + }, + { + "epoch": 0.9360118697157391, + "grad_norm": 0.6947363018989563, + "learning_rate": 1.1342118893282139e-05, + "loss": 0.3138, + "step": 9936 + }, + { + "epoch": 0.936106073808907, + "grad_norm": 0.7332528233528137, + "learning_rate": 1.13406225215397e-05, + "loss": 0.3075, + "step": 9937 + }, + { + "epoch": 0.9362002779020748, + "grad_norm": 0.7393556833267212, + "learning_rate": 1.1339126119228999e-05, + "loss": 0.3154, + "step": 9938 + }, + { + "epoch": 0.9362944819952427, + "grad_norm": 0.7267153263092041, + "learning_rate": 1.1337629686384149e-05, + "loss": 0.3139, + "step": 9939 + }, + { + "epoch": 0.9363886860884105, + "grad_norm": 0.7483210563659668, + "learning_rate": 1.1336133223039274e-05, + "loss": 0.3265, + "step": 9940 + }, + { + "epoch": 0.9364828901815784, + "grad_norm": 0.6914429664611816, + "learning_rate": 1.1334636729228493e-05, + "loss": 0.2977, + "step": 9941 + }, + { + "epoch": 0.9365770942747462, + "grad_norm": 0.6975411176681519, + "learning_rate": 1.1333140204985933e-05, + "loss": 0.3084, + "step": 9942 + }, + { + "epoch": 0.9366712983679141, + "grad_norm": 0.7118569016456604, + "learning_rate": 1.1331643650345715e-05, + "loss": 0.2696, + "step": 9943 + }, + { + "epoch": 0.9367655024610819, + "grad_norm": 0.8160148859024048, + "learning_rate": 1.1330147065341962e-05, + "loss": 0.3043, + "step": 9944 + }, + { + "epoch": 0.9368597065542498, + "grad_norm": 0.7015519738197327, + "learning_rate": 1.1328650450008798e-05, + "loss": 0.3057, + "step": 9945 + }, + { + "epoch": 0.9369539106474176, + "grad_norm": 0.7510275840759277, + "learning_rate": 1.1327153804380346e-05, + "loss": 0.3295, + "step": 9946 + }, + { + "epoch": 0.9370481147405855, + "grad_norm": 0.7069833278656006, + "learning_rate": 1.1325657128490739e-05, + "loss": 0.329, + "step": 9947 + }, + { + "epoch": 0.9371423188337533, + "grad_norm": 0.7049331068992615, + "learning_rate": 1.13241604223741e-05, + "loss": 0.3128, + "step": 9948 + }, + { + "epoch": 0.9372365229269212, + "grad_norm": 0.7377355098724365, + "learning_rate": 1.132266368606455e-05, + "loss": 0.3248, + "step": 9949 + }, + { + "epoch": 0.937330727020089, + "grad_norm": 0.6691461205482483, + "learning_rate": 1.132116691959623e-05, + "loss": 0.2914, + "step": 9950 + }, + { + "epoch": 0.9374249311132569, + "grad_norm": 1.0119857788085938, + "learning_rate": 1.1319670123003254e-05, + "loss": 0.3384, + "step": 9951 + }, + { + "epoch": 0.9375191352064247, + "grad_norm": 0.7098222970962524, + "learning_rate": 1.1318173296319761e-05, + "loss": 0.2547, + "step": 9952 + }, + { + "epoch": 0.9376133392995926, + "grad_norm": 0.7042816281318665, + "learning_rate": 1.1316676439579881e-05, + "loss": 0.2686, + "step": 9953 + }, + { + "epoch": 0.9377075433927604, + "grad_norm": 0.7121096849441528, + "learning_rate": 1.131517955281774e-05, + "loss": 0.3142, + "step": 9954 + }, + { + "epoch": 0.9378017474859283, + "grad_norm": 0.7205197811126709, + "learning_rate": 1.131368263606747e-05, + "loss": 0.3285, + "step": 9955 + }, + { + "epoch": 0.9378959515790961, + "grad_norm": 0.7869858741760254, + "learning_rate": 1.1312185689363204e-05, + "loss": 0.2983, + "step": 9956 + }, + { + "epoch": 0.937990155672264, + "grad_norm": 0.6667638421058655, + "learning_rate": 1.1310688712739076e-05, + "loss": 0.2635, + "step": 9957 + }, + { + "epoch": 0.9380843597654318, + "grad_norm": 0.6950367093086243, + "learning_rate": 1.1309191706229216e-05, + "loss": 0.3376, + "step": 9958 + }, + { + "epoch": 0.9381785638585997, + "grad_norm": 0.7347050309181213, + "learning_rate": 1.1307694669867765e-05, + "loss": 0.274, + "step": 9959 + }, + { + "epoch": 0.9382727679517675, + "grad_norm": 0.7395270466804504, + "learning_rate": 1.130619760368885e-05, + "loss": 0.3042, + "step": 9960 + }, + { + "epoch": 0.9383669720449354, + "grad_norm": 0.8214524984359741, + "learning_rate": 1.130470050772661e-05, + "loss": 0.293, + "step": 9961 + }, + { + "epoch": 0.9384611761381032, + "grad_norm": 0.6774492263793945, + "learning_rate": 1.1303203382015182e-05, + "loss": 0.2689, + "step": 9962 + }, + { + "epoch": 0.938555380231271, + "grad_norm": 0.7465950846672058, + "learning_rate": 1.13017062265887e-05, + "loss": 0.2746, + "step": 9963 + }, + { + "epoch": 0.9386495843244389, + "grad_norm": 0.5934969186782837, + "learning_rate": 1.1300209041481304e-05, + "loss": 0.2624, + "step": 9964 + }, + { + "epoch": 0.9387437884176068, + "grad_norm": 0.8484015464782715, + "learning_rate": 1.129871182672713e-05, + "loss": 0.3006, + "step": 9965 + }, + { + "epoch": 0.9388379925107746, + "grad_norm": 0.7751550674438477, + "learning_rate": 1.1297214582360319e-05, + "loss": 0.3313, + "step": 9966 + }, + { + "epoch": 0.9389321966039424, + "grad_norm": 0.7646822333335876, + "learning_rate": 1.1295717308415009e-05, + "loss": 0.299, + "step": 9967 + }, + { + "epoch": 0.9390264006971103, + "grad_norm": 0.7235328555107117, + "learning_rate": 1.129422000492534e-05, + "loss": 0.296, + "step": 9968 + }, + { + "epoch": 0.9391206047902781, + "grad_norm": 0.7357309460639954, + "learning_rate": 1.1292722671925451e-05, + "loss": 0.2723, + "step": 9969 + }, + { + "epoch": 0.939214808883446, + "grad_norm": 0.747172474861145, + "learning_rate": 1.1291225309449492e-05, + "loss": 0.2914, + "step": 9970 + }, + { + "epoch": 0.9393090129766138, + "grad_norm": 0.8299241065979004, + "learning_rate": 1.1289727917531593e-05, + "loss": 0.3233, + "step": 9971 + }, + { + "epoch": 0.9394032170697817, + "grad_norm": 0.6824995279312134, + "learning_rate": 1.1288230496205904e-05, + "loss": 0.2949, + "step": 9972 + }, + { + "epoch": 0.9394974211629495, + "grad_norm": 0.817967414855957, + "learning_rate": 1.128673304550657e-05, + "loss": 0.3349, + "step": 9973 + }, + { + "epoch": 0.9395916252561174, + "grad_norm": 0.7143617272377014, + "learning_rate": 1.1285235565467731e-05, + "loss": 0.3303, + "step": 9974 + }, + { + "epoch": 0.9396858293492852, + "grad_norm": 0.6996434330940247, + "learning_rate": 1.1283738056123535e-05, + "loss": 0.2744, + "step": 9975 + }, + { + "epoch": 0.9397800334424531, + "grad_norm": 0.6489402651786804, + "learning_rate": 1.1282240517508123e-05, + "loss": 0.3029, + "step": 9976 + }, + { + "epoch": 0.9398742375356209, + "grad_norm": 0.7647433280944824, + "learning_rate": 1.1280742949655646e-05, + "loss": 0.3179, + "step": 9977 + }, + { + "epoch": 0.9399684416287888, + "grad_norm": 0.5993421673774719, + "learning_rate": 1.1279245352600248e-05, + "loss": 0.2802, + "step": 9978 + }, + { + "epoch": 0.9400626457219566, + "grad_norm": 0.7115922570228577, + "learning_rate": 1.1277747726376078e-05, + "loss": 0.2926, + "step": 9979 + }, + { + "epoch": 0.9401568498151245, + "grad_norm": 0.8292811512947083, + "learning_rate": 1.1276250071017284e-05, + "loss": 0.2971, + "step": 9980 + }, + { + "epoch": 0.9402510539082923, + "grad_norm": 0.7397706508636475, + "learning_rate": 1.1274752386558017e-05, + "loss": 0.2909, + "step": 9981 + }, + { + "epoch": 0.9403452580014602, + "grad_norm": 0.7106196880340576, + "learning_rate": 1.127325467303242e-05, + "loss": 0.3205, + "step": 9982 + }, + { + "epoch": 0.940439462094628, + "grad_norm": 0.65058434009552, + "learning_rate": 1.1271756930474651e-05, + "loss": 0.2966, + "step": 9983 + }, + { + "epoch": 0.9405336661877959, + "grad_norm": 0.6805201768875122, + "learning_rate": 1.1270259158918855e-05, + "loss": 0.2574, + "step": 9984 + }, + { + "epoch": 0.9406278702809637, + "grad_norm": 0.7617127895355225, + "learning_rate": 1.1268761358399187e-05, + "loss": 0.3205, + "step": 9985 + }, + { + "epoch": 0.9407220743741316, + "grad_norm": 0.8599696755409241, + "learning_rate": 1.1267263528949794e-05, + "loss": 0.3309, + "step": 9986 + }, + { + "epoch": 0.9408162784672994, + "grad_norm": 0.74057537317276, + "learning_rate": 1.1265765670604838e-05, + "loss": 0.2995, + "step": 9987 + }, + { + "epoch": 0.9409104825604673, + "grad_norm": 0.8807875514030457, + "learning_rate": 1.1264267783398463e-05, + "loss": 0.3159, + "step": 9988 + }, + { + "epoch": 0.9410046866536351, + "grad_norm": 0.7203329801559448, + "learning_rate": 1.1262769867364828e-05, + "loss": 0.2876, + "step": 9989 + }, + { + "epoch": 0.941098890746803, + "grad_norm": 0.5872389078140259, + "learning_rate": 1.1261271922538093e-05, + "loss": 0.279, + "step": 9990 + }, + { + "epoch": 0.9411930948399708, + "grad_norm": 0.9001421332359314, + "learning_rate": 1.12597739489524e-05, + "loss": 0.3243, + "step": 9991 + }, + { + "epoch": 0.9412872989331387, + "grad_norm": 0.6735695600509644, + "learning_rate": 1.1258275946641915e-05, + "loss": 0.287, + "step": 9992 + }, + { + "epoch": 0.9413815030263065, + "grad_norm": 0.6174083352088928, + "learning_rate": 1.1256777915640796e-05, + "loss": 0.2673, + "step": 9993 + }, + { + "epoch": 0.9414757071194744, + "grad_norm": 0.6570601463317871, + "learning_rate": 1.125527985598319e-05, + "loss": 0.267, + "step": 9994 + }, + { + "epoch": 0.9415699112126422, + "grad_norm": 0.789585292339325, + "learning_rate": 1.1253781767703267e-05, + "loss": 0.2891, + "step": 9995 + }, + { + "epoch": 0.9416641153058101, + "grad_norm": 0.8143724799156189, + "learning_rate": 1.1252283650835181e-05, + "loss": 0.3323, + "step": 9996 + }, + { + "epoch": 0.9417583193989779, + "grad_norm": 0.657049834728241, + "learning_rate": 1.1250785505413087e-05, + "loss": 0.2655, + "step": 9997 + }, + { + "epoch": 0.9418525234921458, + "grad_norm": 0.7499266266822815, + "learning_rate": 1.1249287331471152e-05, + "loss": 0.3034, + "step": 9998 + }, + { + "epoch": 0.9419467275853136, + "grad_norm": 0.6467655897140503, + "learning_rate": 1.1247789129043534e-05, + "loss": 0.2865, + "step": 9999 + }, + { + "epoch": 0.9420409316784815, + "grad_norm": 0.7246831059455872, + "learning_rate": 1.1246290898164393e-05, + "loss": 0.3224, + "step": 10000 + }, + { + "epoch": 0.9421351357716493, + "grad_norm": 0.7325423359870911, + "learning_rate": 1.1244792638867895e-05, + "loss": 0.3302, + "step": 10001 + }, + { + "epoch": 0.9422293398648172, + "grad_norm": 0.696479082107544, + "learning_rate": 1.1243294351188196e-05, + "loss": 0.3, + "step": 10002 + }, + { + "epoch": 0.942323543957985, + "grad_norm": 0.7129887342453003, + "learning_rate": 1.1241796035159464e-05, + "loss": 0.3351, + "step": 10003 + }, + { + "epoch": 0.9424177480511529, + "grad_norm": 1.0603524446487427, + "learning_rate": 1.1240297690815862e-05, + "loss": 0.338, + "step": 10004 + }, + { + "epoch": 0.9425119521443207, + "grad_norm": 0.7714053988456726, + "learning_rate": 1.1238799318191556e-05, + "loss": 0.3243, + "step": 10005 + }, + { + "epoch": 0.9426061562374886, + "grad_norm": 0.6713439226150513, + "learning_rate": 1.1237300917320708e-05, + "loss": 0.2984, + "step": 10006 + }, + { + "epoch": 0.9427003603306564, + "grad_norm": 0.6070764064788818, + "learning_rate": 1.1235802488237486e-05, + "loss": 0.2728, + "step": 10007 + }, + { + "epoch": 0.9427945644238243, + "grad_norm": 0.7760406732559204, + "learning_rate": 1.1234304030976055e-05, + "loss": 0.2977, + "step": 10008 + }, + { + "epoch": 0.9428887685169921, + "grad_norm": 0.8591163754463196, + "learning_rate": 1.1232805545570585e-05, + "loss": 0.3368, + "step": 10009 + }, + { + "epoch": 0.94298297261016, + "grad_norm": 0.691236138343811, + "learning_rate": 1.1231307032055243e-05, + "loss": 0.3039, + "step": 10010 + }, + { + "epoch": 0.9430771767033278, + "grad_norm": 0.7132549285888672, + "learning_rate": 1.1229808490464193e-05, + "loss": 0.305, + "step": 10011 + }, + { + "epoch": 0.9431713807964957, + "grad_norm": 0.682148277759552, + "learning_rate": 1.1228309920831608e-05, + "loss": 0.3053, + "step": 10012 + }, + { + "epoch": 0.9432655848896635, + "grad_norm": 0.7111742496490479, + "learning_rate": 1.122681132319166e-05, + "loss": 0.3239, + "step": 10013 + }, + { + "epoch": 0.9433597889828313, + "grad_norm": 1.16665780544281, + "learning_rate": 1.1225312697578514e-05, + "loss": 0.2779, + "step": 10014 + }, + { + "epoch": 0.9434539930759992, + "grad_norm": 0.6893547177314758, + "learning_rate": 1.1223814044026344e-05, + "loss": 0.2827, + "step": 10015 + }, + { + "epoch": 0.943548197169167, + "grad_norm": 0.7077202796936035, + "learning_rate": 1.1222315362569323e-05, + "loss": 0.3069, + "step": 10016 + }, + { + "epoch": 0.9436424012623349, + "grad_norm": 0.7060577273368835, + "learning_rate": 1.1220816653241617e-05, + "loss": 0.312, + "step": 10017 + }, + { + "epoch": 0.9437366053555027, + "grad_norm": 1.0103267431259155, + "learning_rate": 1.1219317916077407e-05, + "loss": 0.306, + "step": 10018 + }, + { + "epoch": 0.9438308094486706, + "grad_norm": 0.7344710826873779, + "learning_rate": 1.1217819151110864e-05, + "loss": 0.3348, + "step": 10019 + }, + { + "epoch": 0.9439250135418384, + "grad_norm": 0.6737074851989746, + "learning_rate": 1.1216320358376158e-05, + "loss": 0.2857, + "step": 10020 + }, + { + "epoch": 0.9440192176350063, + "grad_norm": 0.7511048913002014, + "learning_rate": 1.1214821537907469e-05, + "loss": 0.3162, + "step": 10021 + }, + { + "epoch": 0.9441134217281741, + "grad_norm": 0.674433708190918, + "learning_rate": 1.1213322689738968e-05, + "loss": 0.2954, + "step": 10022 + }, + { + "epoch": 0.944207625821342, + "grad_norm": 0.71649169921875, + "learning_rate": 1.1211823813904834e-05, + "loss": 0.2882, + "step": 10023 + }, + { + "epoch": 0.9443018299145098, + "grad_norm": 0.600676953792572, + "learning_rate": 1.1210324910439242e-05, + "loss": 0.2599, + "step": 10024 + }, + { + "epoch": 0.9443960340076777, + "grad_norm": 0.7122073173522949, + "learning_rate": 1.1208825979376374e-05, + "loss": 0.3038, + "step": 10025 + }, + { + "epoch": 0.9444902381008455, + "grad_norm": 0.9700125455856323, + "learning_rate": 1.12073270207504e-05, + "loss": 0.3057, + "step": 10026 + }, + { + "epoch": 0.9445844421940134, + "grad_norm": 0.7915657758712769, + "learning_rate": 1.1205828034595506e-05, + "loss": 0.3049, + "step": 10027 + }, + { + "epoch": 0.9446786462871811, + "grad_norm": 0.8250691890716553, + "learning_rate": 1.1204329020945866e-05, + "loss": 0.3537, + "step": 10028 + }, + { + "epoch": 0.944772850380349, + "grad_norm": 0.8041085600852966, + "learning_rate": 1.1202829979835658e-05, + "loss": 0.2847, + "step": 10029 + }, + { + "epoch": 0.9448670544735168, + "grad_norm": 0.702141523361206, + "learning_rate": 1.1201330911299076e-05, + "loss": 0.3104, + "step": 10030 + }, + { + "epoch": 0.9449612585666847, + "grad_norm": 1.1383745670318604, + "learning_rate": 1.1199831815370284e-05, + "loss": 0.338, + "step": 10031 + }, + { + "epoch": 0.9450554626598525, + "grad_norm": 0.6468948721885681, + "learning_rate": 1.1198332692083469e-05, + "loss": 0.2807, + "step": 10032 + }, + { + "epoch": 0.9451496667530204, + "grad_norm": 0.752659261226654, + "learning_rate": 1.1196833541472823e-05, + "loss": 0.298, + "step": 10033 + }, + { + "epoch": 0.9452438708461882, + "grad_norm": 0.6877515316009521, + "learning_rate": 1.1195334363572513e-05, + "loss": 0.2745, + "step": 10034 + }, + { + "epoch": 0.9453380749393561, + "grad_norm": 0.8536050915718079, + "learning_rate": 1.1193835158416737e-05, + "loss": 0.3299, + "step": 10035 + }, + { + "epoch": 0.9454322790325239, + "grad_norm": 0.6285669803619385, + "learning_rate": 1.1192335926039673e-05, + "loss": 0.2666, + "step": 10036 + }, + { + "epoch": 0.9455264831256918, + "grad_norm": 0.7156527638435364, + "learning_rate": 1.1190836666475503e-05, + "loss": 0.2997, + "step": 10037 + }, + { + "epoch": 0.9456206872188596, + "grad_norm": 0.8850581049919128, + "learning_rate": 1.1189337379758415e-05, + "loss": 0.3337, + "step": 10038 + }, + { + "epoch": 0.9457148913120275, + "grad_norm": 0.7407930493354797, + "learning_rate": 1.1187838065922598e-05, + "loss": 0.2895, + "step": 10039 + }, + { + "epoch": 0.9458090954051953, + "grad_norm": 0.791225254535675, + "learning_rate": 1.1186338725002238e-05, + "loss": 0.2869, + "step": 10040 + }, + { + "epoch": 0.9459032994983632, + "grad_norm": 0.6969056129455566, + "learning_rate": 1.1184839357031516e-05, + "loss": 0.2986, + "step": 10041 + }, + { + "epoch": 0.945997503591531, + "grad_norm": 0.6812271475791931, + "learning_rate": 1.1183339962044624e-05, + "loss": 0.3017, + "step": 10042 + }, + { + "epoch": 0.9460917076846989, + "grad_norm": 0.7941222190856934, + "learning_rate": 1.1181840540075752e-05, + "loss": 0.3112, + "step": 10043 + }, + { + "epoch": 0.9461859117778667, + "grad_norm": 0.7960657477378845, + "learning_rate": 1.1180341091159091e-05, + "loss": 0.275, + "step": 10044 + }, + { + "epoch": 0.9462801158710346, + "grad_norm": 0.954747200012207, + "learning_rate": 1.1178841615328824e-05, + "loss": 0.3356, + "step": 10045 + }, + { + "epoch": 0.9463743199642024, + "grad_norm": 0.7375147342681885, + "learning_rate": 1.1177342112619145e-05, + "loss": 0.3079, + "step": 10046 + }, + { + "epoch": 0.9464685240573703, + "grad_norm": 0.8293735384941101, + "learning_rate": 1.1175842583064247e-05, + "loss": 0.3262, + "step": 10047 + }, + { + "epoch": 0.9465627281505381, + "grad_norm": 0.7221106886863708, + "learning_rate": 1.1174343026698318e-05, + "loss": 0.3709, + "step": 10048 + }, + { + "epoch": 0.946656932243706, + "grad_norm": 0.7418912649154663, + "learning_rate": 1.1172843443555552e-05, + "loss": 0.2864, + "step": 10049 + }, + { + "epoch": 0.9467511363368738, + "grad_norm": 0.6042212843894958, + "learning_rate": 1.1171343833670146e-05, + "loss": 0.2939, + "step": 10050 + }, + { + "epoch": 0.9468453404300416, + "grad_norm": 0.6550147533416748, + "learning_rate": 1.1169844197076282e-05, + "loss": 0.2701, + "step": 10051 + }, + { + "epoch": 0.9469395445232095, + "grad_norm": 0.7457708716392517, + "learning_rate": 1.1168344533808164e-05, + "loss": 0.3481, + "step": 10052 + }, + { + "epoch": 0.9470337486163773, + "grad_norm": 0.7013445496559143, + "learning_rate": 1.1166844843899986e-05, + "loss": 0.3074, + "step": 10053 + }, + { + "epoch": 0.9471279527095452, + "grad_norm": 0.5988826751708984, + "learning_rate": 1.1165345127385938e-05, + "loss": 0.243, + "step": 10054 + }, + { + "epoch": 0.947222156802713, + "grad_norm": 0.7739607095718384, + "learning_rate": 1.116384538430022e-05, + "loss": 0.3289, + "step": 10055 + }, + { + "epoch": 0.9473163608958809, + "grad_norm": 0.7369787693023682, + "learning_rate": 1.1162345614677029e-05, + "loss": 0.3182, + "step": 10056 + }, + { + "epoch": 0.9474105649890487, + "grad_norm": 0.6781864762306213, + "learning_rate": 1.1160845818550556e-05, + "loss": 0.2643, + "step": 10057 + }, + { + "epoch": 0.9475047690822166, + "grad_norm": 0.7162042260169983, + "learning_rate": 1.1159345995955007e-05, + "loss": 0.2918, + "step": 10058 + }, + { + "epoch": 0.9475989731753844, + "grad_norm": 0.887563169002533, + "learning_rate": 1.1157846146924576e-05, + "loss": 0.3374, + "step": 10059 + }, + { + "epoch": 0.9476931772685523, + "grad_norm": 0.6928210258483887, + "learning_rate": 1.1156346271493461e-05, + "loss": 0.2709, + "step": 10060 + }, + { + "epoch": 0.9477873813617201, + "grad_norm": 0.6558769941329956, + "learning_rate": 1.1154846369695864e-05, + "loss": 0.2921, + "step": 10061 + }, + { + "epoch": 0.947881585454888, + "grad_norm": 0.7086412310600281, + "learning_rate": 1.115334644156598e-05, + "loss": 0.307, + "step": 10062 + }, + { + "epoch": 0.9479757895480558, + "grad_norm": 0.7829697728157043, + "learning_rate": 1.1151846487138016e-05, + "loss": 0.3213, + "step": 10063 + }, + { + "epoch": 0.9480699936412237, + "grad_norm": 0.8936841487884521, + "learning_rate": 1.1150346506446173e-05, + "loss": 0.2704, + "step": 10064 + }, + { + "epoch": 0.9481641977343915, + "grad_norm": 0.6925931572914124, + "learning_rate": 1.1148846499524648e-05, + "loss": 0.2528, + "step": 10065 + }, + { + "epoch": 0.9482584018275594, + "grad_norm": 0.6956936120986938, + "learning_rate": 1.1147346466407645e-05, + "loss": 0.2561, + "step": 10066 + }, + { + "epoch": 0.9483526059207272, + "grad_norm": 0.6767364740371704, + "learning_rate": 1.1145846407129371e-05, + "loss": 0.3033, + "step": 10067 + }, + { + "epoch": 0.9484468100138951, + "grad_norm": 0.7587303519248962, + "learning_rate": 1.1144346321724027e-05, + "loss": 0.318, + "step": 10068 + }, + { + "epoch": 0.9485410141070629, + "grad_norm": 1.040582299232483, + "learning_rate": 1.1142846210225812e-05, + "loss": 0.3675, + "step": 10069 + }, + { + "epoch": 0.9486352182002308, + "grad_norm": 1.0852214097976685, + "learning_rate": 1.1141346072668944e-05, + "loss": 0.3118, + "step": 10070 + }, + { + "epoch": 0.9487294222933986, + "grad_norm": 0.6972841024398804, + "learning_rate": 1.1139845909087614e-05, + "loss": 0.2794, + "step": 10071 + }, + { + "epoch": 0.9488236263865665, + "grad_norm": 0.7939976453781128, + "learning_rate": 1.1138345719516038e-05, + "loss": 0.3207, + "step": 10072 + }, + { + "epoch": 0.9489178304797343, + "grad_norm": 0.733759880065918, + "learning_rate": 1.1136845503988418e-05, + "loss": 0.3217, + "step": 10073 + }, + { + "epoch": 0.9490120345729022, + "grad_norm": 0.8559455871582031, + "learning_rate": 1.113534526253896e-05, + "loss": 0.3239, + "step": 10074 + }, + { + "epoch": 0.94910623866607, + "grad_norm": 0.8254885077476501, + "learning_rate": 1.1133844995201877e-05, + "loss": 0.3143, + "step": 10075 + }, + { + "epoch": 0.9492004427592379, + "grad_norm": 0.8626786470413208, + "learning_rate": 1.1132344702011375e-05, + "loss": 0.2763, + "step": 10076 + }, + { + "epoch": 0.9492946468524057, + "grad_norm": 0.786437451839447, + "learning_rate": 1.1130844383001658e-05, + "loss": 0.3107, + "step": 10077 + }, + { + "epoch": 0.9493888509455736, + "grad_norm": 0.7566415667533875, + "learning_rate": 1.1129344038206945e-05, + "loss": 0.26, + "step": 10078 + }, + { + "epoch": 0.9494830550387414, + "grad_norm": 0.6886942982673645, + "learning_rate": 1.112784366766144e-05, + "loss": 0.3093, + "step": 10079 + }, + { + "epoch": 0.9495772591319093, + "grad_norm": 0.7516670227050781, + "learning_rate": 1.1126343271399356e-05, + "loss": 0.2878, + "step": 10080 + }, + { + "epoch": 0.9496714632250771, + "grad_norm": 0.5929188132286072, + "learning_rate": 1.1124842849454903e-05, + "loss": 0.2698, + "step": 10081 + }, + { + "epoch": 0.949765667318245, + "grad_norm": 0.6839563846588135, + "learning_rate": 1.1123342401862292e-05, + "loss": 0.3097, + "step": 10082 + }, + { + "epoch": 0.9498598714114128, + "grad_norm": 0.7374429702758789, + "learning_rate": 1.1121841928655739e-05, + "loss": 0.2625, + "step": 10083 + }, + { + "epoch": 0.9499540755045807, + "grad_norm": 0.6900131106376648, + "learning_rate": 1.1120341429869454e-05, + "loss": 0.2946, + "step": 10084 + }, + { + "epoch": 0.9500482795977485, + "grad_norm": 0.7028419375419617, + "learning_rate": 1.111884090553765e-05, + "loss": 0.2934, + "step": 10085 + }, + { + "epoch": 0.9501424836909164, + "grad_norm": 0.745921790599823, + "learning_rate": 1.1117340355694544e-05, + "loss": 0.31, + "step": 10086 + }, + { + "epoch": 0.9502366877840842, + "grad_norm": 0.84515780210495, + "learning_rate": 1.111583978037435e-05, + "loss": 0.3079, + "step": 10087 + }, + { + "epoch": 0.9503308918772521, + "grad_norm": 0.7456156015396118, + "learning_rate": 1.1114339179611286e-05, + "loss": 0.3248, + "step": 10088 + }, + { + "epoch": 0.9504250959704199, + "grad_norm": 1.3288429975509644, + "learning_rate": 1.1112838553439563e-05, + "loss": 0.2677, + "step": 10089 + }, + { + "epoch": 0.9505193000635878, + "grad_norm": 0.6854212880134583, + "learning_rate": 1.1111337901893402e-05, + "loss": 0.265, + "step": 10090 + }, + { + "epoch": 0.9506135041567556, + "grad_norm": 0.7276402115821838, + "learning_rate": 1.1109837225007014e-05, + "loss": 0.2898, + "step": 10091 + }, + { + "epoch": 0.9507077082499235, + "grad_norm": 0.7948499917984009, + "learning_rate": 1.1108336522814624e-05, + "loss": 0.3183, + "step": 10092 + }, + { + "epoch": 0.9508019123430913, + "grad_norm": 0.6931252479553223, + "learning_rate": 1.1106835795350448e-05, + "loss": 0.3133, + "step": 10093 + }, + { + "epoch": 0.9508961164362592, + "grad_norm": 0.6482195258140564, + "learning_rate": 1.1105335042648701e-05, + "loss": 0.275, + "step": 10094 + }, + { + "epoch": 0.950990320529427, + "grad_norm": 0.7661250233650208, + "learning_rate": 1.1103834264743607e-05, + "loss": 0.3256, + "step": 10095 + }, + { + "epoch": 0.9510845246225949, + "grad_norm": 0.8063918352127075, + "learning_rate": 1.1102333461669386e-05, + "loss": 0.3035, + "step": 10096 + }, + { + "epoch": 0.9511787287157627, + "grad_norm": 0.8431673645973206, + "learning_rate": 1.1100832633460254e-05, + "loss": 0.317, + "step": 10097 + }, + { + "epoch": 0.9512729328089305, + "grad_norm": 0.8334606885910034, + "learning_rate": 1.109933178015044e-05, + "loss": 0.2756, + "step": 10098 + }, + { + "epoch": 0.9513671369020984, + "grad_norm": 0.7119786143302917, + "learning_rate": 1.1097830901774159e-05, + "loss": 0.3085, + "step": 10099 + }, + { + "epoch": 0.9514613409952662, + "grad_norm": 0.6422887444496155, + "learning_rate": 1.1096329998365636e-05, + "loss": 0.3312, + "step": 10100 + }, + { + "epoch": 0.9515555450884341, + "grad_norm": 1.3474273681640625, + "learning_rate": 1.1094829069959092e-05, + "loss": 0.3206, + "step": 10101 + }, + { + "epoch": 0.951649749181602, + "grad_norm": 0.6415268182754517, + "learning_rate": 1.1093328116588753e-05, + "loss": 0.2597, + "step": 10102 + }, + { + "epoch": 0.9517439532747698, + "grad_norm": 0.7060954570770264, + "learning_rate": 1.1091827138288842e-05, + "loss": 0.298, + "step": 10103 + }, + { + "epoch": 0.9518381573679376, + "grad_norm": 0.787318766117096, + "learning_rate": 1.1090326135093584e-05, + "loss": 0.3084, + "step": 10104 + }, + { + "epoch": 0.9519323614611055, + "grad_norm": 0.7191594839096069, + "learning_rate": 1.1088825107037204e-05, + "loss": 0.3231, + "step": 10105 + }, + { + "epoch": 0.9520265655542733, + "grad_norm": 0.7752695679664612, + "learning_rate": 1.1087324054153925e-05, + "loss": 0.3025, + "step": 10106 + }, + { + "epoch": 0.9521207696474412, + "grad_norm": 0.7337049245834351, + "learning_rate": 1.108582297647798e-05, + "loss": 0.2811, + "step": 10107 + }, + { + "epoch": 0.952214973740609, + "grad_norm": 0.6960436701774597, + "learning_rate": 1.108432187404359e-05, + "loss": 0.2939, + "step": 10108 + }, + { + "epoch": 0.9523091778337769, + "grad_norm": 0.8235780596733093, + "learning_rate": 1.1082820746884984e-05, + "loss": 0.3027, + "step": 10109 + }, + { + "epoch": 0.9524033819269447, + "grad_norm": 0.6529624462127686, + "learning_rate": 1.1081319595036392e-05, + "loss": 0.3243, + "step": 10110 + }, + { + "epoch": 0.9524975860201126, + "grad_norm": 0.7159841656684875, + "learning_rate": 1.107981841853204e-05, + "loss": 0.3139, + "step": 10111 + }, + { + "epoch": 0.9525917901132804, + "grad_norm": 0.7163367867469788, + "learning_rate": 1.1078317217406158e-05, + "loss": 0.3001, + "step": 10112 + }, + { + "epoch": 0.9526859942064483, + "grad_norm": 0.7652973532676697, + "learning_rate": 1.1076815991692978e-05, + "loss": 0.2614, + "step": 10113 + }, + { + "epoch": 0.9527801982996161, + "grad_norm": 0.692310094833374, + "learning_rate": 1.1075314741426721e-05, + "loss": 0.2799, + "step": 10114 + }, + { + "epoch": 0.952874402392784, + "grad_norm": 0.787175714969635, + "learning_rate": 1.1073813466641633e-05, + "loss": 0.3231, + "step": 10115 + }, + { + "epoch": 0.9529686064859518, + "grad_norm": 0.6837242245674133, + "learning_rate": 1.1072312167371932e-05, + "loss": 0.3428, + "step": 10116 + }, + { + "epoch": 0.9530628105791197, + "grad_norm": 1.015246033668518, + "learning_rate": 1.1070810843651856e-05, + "loss": 0.3004, + "step": 10117 + }, + { + "epoch": 0.9531570146722875, + "grad_norm": 0.7722873091697693, + "learning_rate": 1.1069309495515642e-05, + "loss": 0.2799, + "step": 10118 + }, + { + "epoch": 0.9532512187654554, + "grad_norm": 0.7015636563301086, + "learning_rate": 1.1067808122997511e-05, + "loss": 0.2809, + "step": 10119 + }, + { + "epoch": 0.9533454228586232, + "grad_norm": 0.6829482316970825, + "learning_rate": 1.1066306726131709e-05, + "loss": 0.3179, + "step": 10120 + }, + { + "epoch": 0.9534396269517911, + "grad_norm": 0.6982603073120117, + "learning_rate": 1.1064805304952459e-05, + "loss": 0.3144, + "step": 10121 + }, + { + "epoch": 0.9535338310449589, + "grad_norm": 0.7205075621604919, + "learning_rate": 1.1063303859494004e-05, + "loss": 0.3069, + "step": 10122 + }, + { + "epoch": 0.9536280351381268, + "grad_norm": 0.6632360219955444, + "learning_rate": 1.1061802389790576e-05, + "loss": 0.318, + "step": 10123 + }, + { + "epoch": 0.9537222392312946, + "grad_norm": 0.7651366591453552, + "learning_rate": 1.1060300895876412e-05, + "loss": 0.3239, + "step": 10124 + }, + { + "epoch": 0.9538164433244625, + "grad_norm": 0.6194216012954712, + "learning_rate": 1.105879937778575e-05, + "loss": 0.2508, + "step": 10125 + }, + { + "epoch": 0.9539106474176303, + "grad_norm": 0.9242895841598511, + "learning_rate": 1.105729783555282e-05, + "loss": 0.3313, + "step": 10126 + }, + { + "epoch": 0.9540048515107982, + "grad_norm": 0.7633430361747742, + "learning_rate": 1.1055796269211868e-05, + "loss": 0.2767, + "step": 10127 + }, + { + "epoch": 0.954099055603966, + "grad_norm": 0.8665676116943359, + "learning_rate": 1.1054294678797126e-05, + "loss": 0.3245, + "step": 10128 + }, + { + "epoch": 0.9541932596971339, + "grad_norm": 0.7046676278114319, + "learning_rate": 1.1052793064342835e-05, + "loss": 0.3082, + "step": 10129 + }, + { + "epoch": 0.9542874637903017, + "grad_norm": 0.6994553804397583, + "learning_rate": 1.1051291425883237e-05, + "loss": 0.2944, + "step": 10130 + }, + { + "epoch": 0.9543816678834696, + "grad_norm": 0.6774550080299377, + "learning_rate": 1.1049789763452565e-05, + "loss": 0.3189, + "step": 10131 + }, + { + "epoch": 0.9544758719766374, + "grad_norm": 0.7153673768043518, + "learning_rate": 1.1048288077085065e-05, + "loss": 0.2849, + "step": 10132 + }, + { + "epoch": 0.9545700760698053, + "grad_norm": 0.8802182674407959, + "learning_rate": 1.1046786366814974e-05, + "loss": 0.3355, + "step": 10133 + }, + { + "epoch": 0.9546642801629731, + "grad_norm": 0.7563669681549072, + "learning_rate": 1.1045284632676535e-05, + "loss": 0.3308, + "step": 10134 + }, + { + "epoch": 0.954758484256141, + "grad_norm": 0.7985135316848755, + "learning_rate": 1.1043782874703992e-05, + "loss": 0.3175, + "step": 10135 + }, + { + "epoch": 0.9548526883493088, + "grad_norm": 0.6169210076332092, + "learning_rate": 1.1042281092931584e-05, + "loss": 0.2815, + "step": 10136 + }, + { + "epoch": 0.9549468924424767, + "grad_norm": 0.6767523884773254, + "learning_rate": 1.1040779287393553e-05, + "loss": 0.3045, + "step": 10137 + }, + { + "epoch": 0.9550410965356445, + "grad_norm": 0.6888035535812378, + "learning_rate": 1.103927745812415e-05, + "loss": 0.288, + "step": 10138 + }, + { + "epoch": 0.9551353006288124, + "grad_norm": 0.7685443162918091, + "learning_rate": 1.1037775605157608e-05, + "loss": 0.3002, + "step": 10139 + }, + { + "epoch": 0.9552295047219802, + "grad_norm": 0.6887650489807129, + "learning_rate": 1.103627372852818e-05, + "loss": 0.3077, + "step": 10140 + }, + { + "epoch": 0.955323708815148, + "grad_norm": 0.8367592692375183, + "learning_rate": 1.1034771828270107e-05, + "loss": 0.315, + "step": 10141 + }, + { + "epoch": 0.9554179129083159, + "grad_norm": 0.8385595679283142, + "learning_rate": 1.1033269904417636e-05, + "loss": 0.2844, + "step": 10142 + }, + { + "epoch": 0.9555121170014838, + "grad_norm": 0.8150827884674072, + "learning_rate": 1.1031767957005015e-05, + "loss": 0.301, + "step": 10143 + }, + { + "epoch": 0.9556063210946516, + "grad_norm": 0.6458438634872437, + "learning_rate": 1.1030265986066488e-05, + "loss": 0.3448, + "step": 10144 + }, + { + "epoch": 0.9557005251878194, + "grad_norm": 0.8682982921600342, + "learning_rate": 1.1028763991636304e-05, + "loss": 0.2766, + "step": 10145 + }, + { + "epoch": 0.9557947292809873, + "grad_norm": 0.7733973264694214, + "learning_rate": 1.1027261973748709e-05, + "loss": 0.3141, + "step": 10146 + }, + { + "epoch": 0.9558889333741551, + "grad_norm": 0.7197454571723938, + "learning_rate": 1.1025759932437951e-05, + "loss": 0.3315, + "step": 10147 + }, + { + "epoch": 0.955983137467323, + "grad_norm": 0.6769589781761169, + "learning_rate": 1.1024257867738284e-05, + "loss": 0.2866, + "step": 10148 + }, + { + "epoch": 0.9560773415604908, + "grad_norm": 0.8776320219039917, + "learning_rate": 1.1022755779683949e-05, + "loss": 0.3154, + "step": 10149 + }, + { + "epoch": 0.9561715456536587, + "grad_norm": 0.6937875747680664, + "learning_rate": 1.1021253668309206e-05, + "loss": 0.2806, + "step": 10150 + }, + { + "epoch": 0.9562657497468265, + "grad_norm": 0.6783882975578308, + "learning_rate": 1.1019751533648295e-05, + "loss": 0.3129, + "step": 10151 + }, + { + "epoch": 0.9563599538399944, + "grad_norm": 0.6711874604225159, + "learning_rate": 1.1018249375735475e-05, + "loss": 0.2802, + "step": 10152 + }, + { + "epoch": 0.9564541579331622, + "grad_norm": 0.7765811085700989, + "learning_rate": 1.1016747194604994e-05, + "loss": 0.3678, + "step": 10153 + }, + { + "epoch": 0.9565483620263301, + "grad_norm": 0.8686683773994446, + "learning_rate": 1.10152449902911e-05, + "loss": 0.2879, + "step": 10154 + }, + { + "epoch": 0.9566425661194979, + "grad_norm": 0.7165391445159912, + "learning_rate": 1.1013742762828054e-05, + "loss": 0.3097, + "step": 10155 + }, + { + "epoch": 0.9567367702126658, + "grad_norm": 0.6474335789680481, + "learning_rate": 1.1012240512250107e-05, + "loss": 0.2584, + "step": 10156 + }, + { + "epoch": 0.9568309743058336, + "grad_norm": 0.6844523549079895, + "learning_rate": 1.1010738238591507e-05, + "loss": 0.3393, + "step": 10157 + }, + { + "epoch": 0.9569251783990015, + "grad_norm": 0.6997516751289368, + "learning_rate": 1.1009235941886516e-05, + "loss": 0.3498, + "step": 10158 + }, + { + "epoch": 0.9570193824921693, + "grad_norm": 0.6587337851524353, + "learning_rate": 1.1007733622169381e-05, + "loss": 0.3142, + "step": 10159 + }, + { + "epoch": 0.9571135865853372, + "grad_norm": 0.6753538846969604, + "learning_rate": 1.1006231279474365e-05, + "loss": 0.3001, + "step": 10160 + }, + { + "epoch": 0.957207790678505, + "grad_norm": 0.626148521900177, + "learning_rate": 1.1004728913835717e-05, + "loss": 0.2723, + "step": 10161 + }, + { + "epoch": 0.9573019947716729, + "grad_norm": 0.8601760864257812, + "learning_rate": 1.1003226525287693e-05, + "loss": 0.3074, + "step": 10162 + }, + { + "epoch": 0.9573961988648407, + "grad_norm": 0.7324545383453369, + "learning_rate": 1.1001724113864558e-05, + "loss": 0.2976, + "step": 10163 + }, + { + "epoch": 0.9574904029580086, + "grad_norm": 0.789679765701294, + "learning_rate": 1.1000221679600562e-05, + "loss": 0.3237, + "step": 10164 + }, + { + "epoch": 0.9575846070511764, + "grad_norm": 0.6558475494384766, + "learning_rate": 1.0998719222529966e-05, + "loss": 0.2752, + "step": 10165 + }, + { + "epoch": 0.9576788111443442, + "grad_norm": 0.8714675307273865, + "learning_rate": 1.0997216742687022e-05, + "loss": 0.2974, + "step": 10166 + }, + { + "epoch": 0.957773015237512, + "grad_norm": 0.8132731914520264, + "learning_rate": 1.0995714240105999e-05, + "loss": 0.2722, + "step": 10167 + }, + { + "epoch": 0.9578672193306799, + "grad_norm": 0.6529768705368042, + "learning_rate": 1.099421171482115e-05, + "loss": 0.2786, + "step": 10168 + }, + { + "epoch": 0.9579614234238477, + "grad_norm": 0.7080832719802856, + "learning_rate": 1.0992709166866738e-05, + "loss": 0.2823, + "step": 10169 + }, + { + "epoch": 0.9580556275170156, + "grad_norm": 0.6487346291542053, + "learning_rate": 1.0991206596277023e-05, + "loss": 0.2689, + "step": 10170 + }, + { + "epoch": 0.9581498316101834, + "grad_norm": 0.8792241215705872, + "learning_rate": 1.098970400308626e-05, + "loss": 0.3442, + "step": 10171 + }, + { + "epoch": 0.9582440357033513, + "grad_norm": 0.6926685571670532, + "learning_rate": 1.0988201387328716e-05, + "loss": 0.3028, + "step": 10172 + }, + { + "epoch": 0.9583382397965191, + "grad_norm": 0.7001669406890869, + "learning_rate": 1.0986698749038654e-05, + "loss": 0.2992, + "step": 10173 + }, + { + "epoch": 0.958432443889687, + "grad_norm": 0.6960703730583191, + "learning_rate": 1.0985196088250332e-05, + "loss": 0.3035, + "step": 10174 + }, + { + "epoch": 0.9585266479828548, + "grad_norm": 1.045765995979309, + "learning_rate": 1.098369340499802e-05, + "loss": 0.3026, + "step": 10175 + }, + { + "epoch": 0.9586208520760227, + "grad_norm": 0.7972753643989563, + "learning_rate": 1.0982190699315974e-05, + "loss": 0.3068, + "step": 10176 + }, + { + "epoch": 0.9587150561691905, + "grad_norm": 0.7426035404205322, + "learning_rate": 1.098068797123846e-05, + "loss": 0.3237, + "step": 10177 + }, + { + "epoch": 0.9588092602623584, + "grad_norm": 0.6838176846504211, + "learning_rate": 1.0979185220799747e-05, + "loss": 0.2966, + "step": 10178 + }, + { + "epoch": 0.9589034643555262, + "grad_norm": 0.7067477107048035, + "learning_rate": 1.0977682448034092e-05, + "loss": 0.3412, + "step": 10179 + }, + { + "epoch": 0.958997668448694, + "grad_norm": 0.8020039200782776, + "learning_rate": 1.0976179652975769e-05, + "loss": 0.3128, + "step": 10180 + }, + { + "epoch": 0.9590918725418619, + "grad_norm": 0.7284244894981384, + "learning_rate": 1.0974676835659039e-05, + "loss": 0.2662, + "step": 10181 + }, + { + "epoch": 0.9591860766350297, + "grad_norm": 0.7223349213600159, + "learning_rate": 1.0973173996118169e-05, + "loss": 0.3001, + "step": 10182 + }, + { + "epoch": 0.9592802807281976, + "grad_norm": 0.7770244479179382, + "learning_rate": 1.097167113438743e-05, + "loss": 0.3341, + "step": 10183 + }, + { + "epoch": 0.9593744848213654, + "grad_norm": 0.6680354475975037, + "learning_rate": 1.0970168250501083e-05, + "loss": 0.3141, + "step": 10184 + }, + { + "epoch": 0.9594686889145333, + "grad_norm": 0.7461274862289429, + "learning_rate": 1.09686653444934e-05, + "loss": 0.3054, + "step": 10185 + }, + { + "epoch": 0.9595628930077011, + "grad_norm": 0.6635683178901672, + "learning_rate": 1.0967162416398649e-05, + "loss": 0.2738, + "step": 10186 + }, + { + "epoch": 0.959657097100869, + "grad_norm": 0.5671108365058899, + "learning_rate": 1.0965659466251102e-05, + "loss": 0.2548, + "step": 10187 + }, + { + "epoch": 0.9597513011940368, + "grad_norm": 0.8083769083023071, + "learning_rate": 1.0964156494085023e-05, + "loss": 0.3534, + "step": 10188 + }, + { + "epoch": 0.9598455052872047, + "grad_norm": 0.6782791018486023, + "learning_rate": 1.0962653499934686e-05, + "loss": 0.2924, + "step": 10189 + }, + { + "epoch": 0.9599397093803725, + "grad_norm": 0.697952926158905, + "learning_rate": 1.096115048383436e-05, + "loss": 0.2853, + "step": 10190 + }, + { + "epoch": 0.9600339134735404, + "grad_norm": 0.7625333070755005, + "learning_rate": 1.0959647445818315e-05, + "loss": 0.3413, + "step": 10191 + }, + { + "epoch": 0.9601281175667082, + "grad_norm": 0.6706798076629639, + "learning_rate": 1.0958144385920826e-05, + "loss": 0.3111, + "step": 10192 + }, + { + "epoch": 0.9602223216598761, + "grad_norm": 0.7211931943893433, + "learning_rate": 1.0956641304176164e-05, + "loss": 0.2685, + "step": 10193 + }, + { + "epoch": 0.9603165257530439, + "grad_norm": 0.7056351900100708, + "learning_rate": 1.0955138200618598e-05, + "loss": 0.2943, + "step": 10194 + }, + { + "epoch": 0.9604107298462118, + "grad_norm": 0.7085780501365662, + "learning_rate": 1.0953635075282405e-05, + "loss": 0.2904, + "step": 10195 + }, + { + "epoch": 0.9605049339393796, + "grad_norm": 0.6431682705879211, + "learning_rate": 1.095213192820186e-05, + "loss": 0.2553, + "step": 10196 + }, + { + "epoch": 0.9605991380325475, + "grad_norm": 0.680701494216919, + "learning_rate": 1.095062875941123e-05, + "loss": 0.3144, + "step": 10197 + }, + { + "epoch": 0.9606933421257153, + "grad_norm": 0.7396478056907654, + "learning_rate": 1.0949125568944799e-05, + "loss": 0.2858, + "step": 10198 + }, + { + "epoch": 0.9607875462188832, + "grad_norm": 0.6372470855712891, + "learning_rate": 1.0947622356836834e-05, + "loss": 0.281, + "step": 10199 + }, + { + "epoch": 0.960881750312051, + "grad_norm": 0.7818843722343445, + "learning_rate": 1.0946119123121615e-05, + "loss": 0.3254, + "step": 10200 + }, + { + "epoch": 0.9609759544052189, + "grad_norm": 0.6971850395202637, + "learning_rate": 1.0944615867833415e-05, + "loss": 0.3234, + "step": 10201 + }, + { + "epoch": 0.9610701584983867, + "grad_norm": 0.7708349823951721, + "learning_rate": 1.0943112591006514e-05, + "loss": 0.3201, + "step": 10202 + }, + { + "epoch": 0.9611643625915546, + "grad_norm": 0.6892606616020203, + "learning_rate": 1.0941609292675186e-05, + "loss": 0.2662, + "step": 10203 + }, + { + "epoch": 0.9612585666847224, + "grad_norm": 0.7433465123176575, + "learning_rate": 1.094010597287371e-05, + "loss": 0.3191, + "step": 10204 + }, + { + "epoch": 0.9613527707778903, + "grad_norm": 0.6859291195869446, + "learning_rate": 1.0938602631636366e-05, + "loss": 0.2704, + "step": 10205 + }, + { + "epoch": 0.9614469748710581, + "grad_norm": 0.6968269348144531, + "learning_rate": 1.0937099268997428e-05, + "loss": 0.3008, + "step": 10206 + }, + { + "epoch": 0.961541178964226, + "grad_norm": 0.6571604013442993, + "learning_rate": 1.093559588499118e-05, + "loss": 0.2769, + "step": 10207 + }, + { + "epoch": 0.9616353830573938, + "grad_norm": 0.6723941564559937, + "learning_rate": 1.0934092479651897e-05, + "loss": 0.3109, + "step": 10208 + }, + { + "epoch": 0.9617295871505617, + "grad_norm": 0.6984074711799622, + "learning_rate": 1.0932589053013862e-05, + "loss": 0.3219, + "step": 10209 + }, + { + "epoch": 0.9618237912437295, + "grad_norm": 0.5909333229064941, + "learning_rate": 1.0931085605111354e-05, + "loss": 0.2634, + "step": 10210 + }, + { + "epoch": 0.9619179953368974, + "grad_norm": 0.7381008863449097, + "learning_rate": 1.0929582135978651e-05, + "loss": 0.3142, + "step": 10211 + }, + { + "epoch": 0.9620121994300652, + "grad_norm": 0.7191177606582642, + "learning_rate": 1.0928078645650042e-05, + "loss": 0.2785, + "step": 10212 + }, + { + "epoch": 0.9621064035232331, + "grad_norm": 0.7118578553199768, + "learning_rate": 1.0926575134159805e-05, + "loss": 0.3119, + "step": 10213 + }, + { + "epoch": 0.9622006076164009, + "grad_norm": 0.799423336982727, + "learning_rate": 1.0925071601542218e-05, + "loss": 0.3041, + "step": 10214 + }, + { + "epoch": 0.9622948117095688, + "grad_norm": 0.7251821756362915, + "learning_rate": 1.0923568047831572e-05, + "loss": 0.3398, + "step": 10215 + }, + { + "epoch": 0.9623890158027366, + "grad_norm": 0.856343686580658, + "learning_rate": 1.0922064473062144e-05, + "loss": 0.2934, + "step": 10216 + }, + { + "epoch": 0.9624832198959045, + "grad_norm": 0.6228210926055908, + "learning_rate": 1.0920560877268218e-05, + "loss": 0.2785, + "step": 10217 + }, + { + "epoch": 0.9625774239890723, + "grad_norm": 0.7916138172149658, + "learning_rate": 1.0919057260484084e-05, + "loss": 0.3263, + "step": 10218 + }, + { + "epoch": 0.9626716280822402, + "grad_norm": 0.678949236869812, + "learning_rate": 1.0917553622744022e-05, + "loss": 0.2778, + "step": 10219 + }, + { + "epoch": 0.962765832175408, + "grad_norm": 0.6774210929870605, + "learning_rate": 1.0916049964082319e-05, + "loss": 0.2849, + "step": 10220 + }, + { + "epoch": 0.9628600362685759, + "grad_norm": 0.8104318380355835, + "learning_rate": 1.0914546284533259e-05, + "loss": 0.3602, + "step": 10221 + }, + { + "epoch": 0.9629542403617437, + "grad_norm": 0.7064844965934753, + "learning_rate": 1.0913042584131126e-05, + "loss": 0.334, + "step": 10222 + }, + { + "epoch": 0.9630484444549116, + "grad_norm": 0.8207798600196838, + "learning_rate": 1.0911538862910213e-05, + "loss": 0.2751, + "step": 10223 + }, + { + "epoch": 0.9631426485480794, + "grad_norm": 0.7319862842559814, + "learning_rate": 1.0910035120904807e-05, + "loss": 0.276, + "step": 10224 + }, + { + "epoch": 0.9632368526412473, + "grad_norm": 0.7391321659088135, + "learning_rate": 1.090853135814919e-05, + "loss": 0.332, + "step": 10225 + }, + { + "epoch": 0.9633310567344151, + "grad_norm": 0.6955612301826477, + "learning_rate": 1.0907027574677653e-05, + "loss": 0.3061, + "step": 10226 + }, + { + "epoch": 0.963425260827583, + "grad_norm": 0.6962710618972778, + "learning_rate": 1.0905523770524485e-05, + "loss": 0.3074, + "step": 10227 + }, + { + "epoch": 0.9635194649207508, + "grad_norm": 0.8167105913162231, + "learning_rate": 1.0904019945723976e-05, + "loss": 0.273, + "step": 10228 + }, + { + "epoch": 0.9636136690139187, + "grad_norm": 0.7759659290313721, + "learning_rate": 1.0902516100310412e-05, + "loss": 0.3294, + "step": 10229 + }, + { + "epoch": 0.9637078731070865, + "grad_norm": 0.7253290414810181, + "learning_rate": 1.0901012234318088e-05, + "loss": 0.2891, + "step": 10230 + }, + { + "epoch": 0.9638020772002543, + "grad_norm": 0.7606688737869263, + "learning_rate": 1.0899508347781287e-05, + "loss": 0.3169, + "step": 10231 + }, + { + "epoch": 0.9638962812934222, + "grad_norm": 0.9359011650085449, + "learning_rate": 1.0898004440734306e-05, + "loss": 0.3641, + "step": 10232 + }, + { + "epoch": 0.96399048538659, + "grad_norm": 0.7431262135505676, + "learning_rate": 1.0896500513211438e-05, + "loss": 0.2878, + "step": 10233 + }, + { + "epoch": 0.9640846894797579, + "grad_norm": 1.0001311302185059, + "learning_rate": 1.0894996565246969e-05, + "loss": 0.3094, + "step": 10234 + }, + { + "epoch": 0.9641788935729257, + "grad_norm": 0.784214198589325, + "learning_rate": 1.0893492596875197e-05, + "loss": 0.3009, + "step": 10235 + }, + { + "epoch": 0.9642730976660936, + "grad_norm": 0.7183936834335327, + "learning_rate": 1.0891988608130409e-05, + "loss": 0.2952, + "step": 10236 + }, + { + "epoch": 0.9643673017592614, + "grad_norm": 0.7166551947593689, + "learning_rate": 1.08904845990469e-05, + "loss": 0.2988, + "step": 10237 + }, + { + "epoch": 0.9644615058524293, + "grad_norm": 0.6997542977333069, + "learning_rate": 1.0888980569658968e-05, + "loss": 0.306, + "step": 10238 + }, + { + "epoch": 0.9645557099455971, + "grad_norm": 0.787003219127655, + "learning_rate": 1.0887476520000904e-05, + "loss": 0.3121, + "step": 10239 + }, + { + "epoch": 0.964649914038765, + "grad_norm": 0.771612823009491, + "learning_rate": 1.0885972450107003e-05, + "loss": 0.2869, + "step": 10240 + }, + { + "epoch": 0.9647441181319328, + "grad_norm": 0.6689867377281189, + "learning_rate": 1.088446836001156e-05, + "loss": 0.2983, + "step": 10241 + }, + { + "epoch": 0.9648383222251007, + "grad_norm": 0.8120307922363281, + "learning_rate": 1.0882964249748868e-05, + "loss": 0.3111, + "step": 10242 + }, + { + "epoch": 0.9649325263182685, + "grad_norm": 0.6862969398498535, + "learning_rate": 1.0881460119353227e-05, + "loss": 0.3418, + "step": 10243 + }, + { + "epoch": 0.9650267304114364, + "grad_norm": 0.7631906270980835, + "learning_rate": 1.0879955968858932e-05, + "loss": 0.2827, + "step": 10244 + }, + { + "epoch": 0.9651209345046042, + "grad_norm": 0.8895875811576843, + "learning_rate": 1.0878451798300282e-05, + "loss": 0.3556, + "step": 10245 + }, + { + "epoch": 0.9652151385977721, + "grad_norm": 0.8678480386734009, + "learning_rate": 1.087694760771157e-05, + "loss": 0.2989, + "step": 10246 + }, + { + "epoch": 0.9653093426909399, + "grad_norm": 0.8030105233192444, + "learning_rate": 1.0875443397127096e-05, + "loss": 0.3347, + "step": 10247 + }, + { + "epoch": 0.9654035467841078, + "grad_norm": 0.6832453012466431, + "learning_rate": 1.0873939166581163e-05, + "loss": 0.3018, + "step": 10248 + }, + { + "epoch": 0.9654977508772756, + "grad_norm": 0.7587977647781372, + "learning_rate": 1.0872434916108061e-05, + "loss": 0.2979, + "step": 10249 + }, + { + "epoch": 0.9655919549704435, + "grad_norm": 0.6465497612953186, + "learning_rate": 1.0870930645742098e-05, + "loss": 0.2519, + "step": 10250 + }, + { + "epoch": 0.9656861590636113, + "grad_norm": 0.7264884114265442, + "learning_rate": 1.0869426355517562e-05, + "loss": 0.2611, + "step": 10251 + }, + { + "epoch": 0.9657803631567792, + "grad_norm": 0.6426544189453125, + "learning_rate": 1.0867922045468766e-05, + "loss": 0.2739, + "step": 10252 + }, + { + "epoch": 0.965874567249947, + "grad_norm": 0.8288610577583313, + "learning_rate": 1.0866417715630005e-05, + "loss": 0.342, + "step": 10253 + }, + { + "epoch": 0.9659687713431149, + "grad_norm": 0.7182257175445557, + "learning_rate": 1.0864913366035577e-05, + "loss": 0.3289, + "step": 10254 + }, + { + "epoch": 0.9660629754362827, + "grad_norm": 0.8025250434875488, + "learning_rate": 1.0863408996719792e-05, + "loss": 0.3216, + "step": 10255 + }, + { + "epoch": 0.9661571795294506, + "grad_norm": 0.7690209150314331, + "learning_rate": 1.0861904607716942e-05, + "loss": 0.2928, + "step": 10256 + }, + { + "epoch": 0.9662513836226184, + "grad_norm": 0.7235234975814819, + "learning_rate": 1.0860400199061332e-05, + "loss": 0.3086, + "step": 10257 + }, + { + "epoch": 0.9663455877157863, + "grad_norm": 0.7197400331497192, + "learning_rate": 1.0858895770787275e-05, + "loss": 0.283, + "step": 10258 + }, + { + "epoch": 0.9664397918089541, + "grad_norm": 0.6490856409072876, + "learning_rate": 1.0857391322929059e-05, + "loss": 0.2786, + "step": 10259 + }, + { + "epoch": 0.966533995902122, + "grad_norm": 0.6831339001655579, + "learning_rate": 1.0855886855520996e-05, + "loss": 0.261, + "step": 10260 + }, + { + "epoch": 0.9666281999952898, + "grad_norm": 0.6745830774307251, + "learning_rate": 1.0854382368597391e-05, + "loss": 0.3096, + "step": 10261 + }, + { + "epoch": 0.9667224040884577, + "grad_norm": 0.6728605031967163, + "learning_rate": 1.0852877862192543e-05, + "loss": 0.2733, + "step": 10262 + }, + { + "epoch": 0.9668166081816255, + "grad_norm": 0.7601017951965332, + "learning_rate": 1.0851373336340765e-05, + "loss": 0.3254, + "step": 10263 + }, + { + "epoch": 0.9669108122747934, + "grad_norm": 0.783606231212616, + "learning_rate": 1.0849868791076358e-05, + "loss": 0.3069, + "step": 10264 + }, + { + "epoch": 0.9670050163679612, + "grad_norm": 0.7248871326446533, + "learning_rate": 1.0848364226433627e-05, + "loss": 0.2864, + "step": 10265 + }, + { + "epoch": 0.9670992204611291, + "grad_norm": 0.8170263767242432, + "learning_rate": 1.0846859642446878e-05, + "loss": 0.3202, + "step": 10266 + }, + { + "epoch": 0.9671934245542969, + "grad_norm": 0.8528434038162231, + "learning_rate": 1.0845355039150423e-05, + "loss": 0.2834, + "step": 10267 + }, + { + "epoch": 0.9672876286474648, + "grad_norm": 0.8308974504470825, + "learning_rate": 1.0843850416578563e-05, + "loss": 0.3271, + "step": 10268 + }, + { + "epoch": 0.9673818327406326, + "grad_norm": 0.6711557507514954, + "learning_rate": 1.084234577476561e-05, + "loss": 0.2953, + "step": 10269 + }, + { + "epoch": 0.9674760368338005, + "grad_norm": 0.7348352074623108, + "learning_rate": 1.0840841113745871e-05, + "loss": 0.2895, + "step": 10270 + }, + { + "epoch": 0.9675702409269683, + "grad_norm": 1.1439179182052612, + "learning_rate": 1.0839336433553651e-05, + "loss": 0.3044, + "step": 10271 + }, + { + "epoch": 0.9676644450201362, + "grad_norm": 0.7765445113182068, + "learning_rate": 1.0837831734223266e-05, + "loss": 0.3103, + "step": 10272 + }, + { + "epoch": 0.967758649113304, + "grad_norm": 0.7693551778793335, + "learning_rate": 1.0836327015789018e-05, + "loss": 0.3007, + "step": 10273 + }, + { + "epoch": 0.9678528532064719, + "grad_norm": 0.7013092041015625, + "learning_rate": 1.0834822278285221e-05, + "loss": 0.2802, + "step": 10274 + }, + { + "epoch": 0.9679470572996397, + "grad_norm": 0.6682015657424927, + "learning_rate": 1.0833317521746192e-05, + "loss": 0.2832, + "step": 10275 + }, + { + "epoch": 0.9680412613928076, + "grad_norm": 0.6417153477668762, + "learning_rate": 1.0831812746206228e-05, + "loss": 0.2715, + "step": 10276 + }, + { + "epoch": 0.9681354654859754, + "grad_norm": 0.7720031142234802, + "learning_rate": 1.0830307951699646e-05, + "loss": 0.3253, + "step": 10277 + }, + { + "epoch": 0.9682296695791432, + "grad_norm": 0.8440465331077576, + "learning_rate": 1.0828803138260765e-05, + "loss": 0.3455, + "step": 10278 + }, + { + "epoch": 0.9683238736723111, + "grad_norm": 0.8167608380317688, + "learning_rate": 1.0827298305923884e-05, + "loss": 0.2866, + "step": 10279 + }, + { + "epoch": 0.968418077765479, + "grad_norm": 0.7632136344909668, + "learning_rate": 1.0825793454723325e-05, + "loss": 0.308, + "step": 10280 + }, + { + "epoch": 0.9685122818586468, + "grad_norm": 2.2348220348358154, + "learning_rate": 1.0824288584693399e-05, + "loss": 0.3379, + "step": 10281 + }, + { + "epoch": 0.9686064859518146, + "grad_norm": 0.6997781991958618, + "learning_rate": 1.0822783695868414e-05, + "loss": 0.3148, + "step": 10282 + }, + { + "epoch": 0.9687006900449825, + "grad_norm": 0.7183207273483276, + "learning_rate": 1.0821278788282694e-05, + "loss": 0.3103, + "step": 10283 + }, + { + "epoch": 0.9687948941381503, + "grad_norm": 0.7390533089637756, + "learning_rate": 1.0819773861970547e-05, + "loss": 0.3037, + "step": 10284 + }, + { + "epoch": 0.9688890982313182, + "grad_norm": 0.9706563353538513, + "learning_rate": 1.0818268916966286e-05, + "loss": 0.3684, + "step": 10285 + }, + { + "epoch": 0.968983302324486, + "grad_norm": 0.7521471977233887, + "learning_rate": 1.0816763953304228e-05, + "loss": 0.2829, + "step": 10286 + }, + { + "epoch": 0.9690775064176539, + "grad_norm": 0.7831737399101257, + "learning_rate": 1.0815258971018687e-05, + "loss": 0.3107, + "step": 10287 + }, + { + "epoch": 0.9691717105108217, + "grad_norm": 0.8647821545600891, + "learning_rate": 1.0813753970143985e-05, + "loss": 0.2824, + "step": 10288 + }, + { + "epoch": 0.9692659146039896, + "grad_norm": 0.68235182762146, + "learning_rate": 1.0812248950714433e-05, + "loss": 0.2735, + "step": 10289 + }, + { + "epoch": 0.9693601186971574, + "grad_norm": 0.8001142144203186, + "learning_rate": 1.0810743912764348e-05, + "loss": 0.3009, + "step": 10290 + }, + { + "epoch": 0.9694543227903253, + "grad_norm": 0.8159835934638977, + "learning_rate": 1.0809238856328047e-05, + "loss": 0.3137, + "step": 10291 + }, + { + "epoch": 0.9695485268834931, + "grad_norm": 0.7067996263504028, + "learning_rate": 1.080773378143985e-05, + "loss": 0.2873, + "step": 10292 + }, + { + "epoch": 0.969642730976661, + "grad_norm": 0.6628618240356445, + "learning_rate": 1.080622868813407e-05, + "loss": 0.307, + "step": 10293 + }, + { + "epoch": 0.9697369350698288, + "grad_norm": 0.7208366990089417, + "learning_rate": 1.0804723576445031e-05, + "loss": 0.3128, + "step": 10294 + }, + { + "epoch": 0.9698311391629967, + "grad_norm": 0.625045657157898, + "learning_rate": 1.0803218446407054e-05, + "loss": 0.2761, + "step": 10295 + }, + { + "epoch": 0.9699253432561645, + "grad_norm": 0.7062149047851562, + "learning_rate": 1.080171329805445e-05, + "loss": 0.2652, + "step": 10296 + }, + { + "epoch": 0.9700195473493324, + "grad_norm": 0.7849803566932678, + "learning_rate": 1.0800208131421542e-05, + "loss": 0.29, + "step": 10297 + }, + { + "epoch": 0.9701137514425002, + "grad_norm": 0.6654258966445923, + "learning_rate": 1.0798702946542657e-05, + "loss": 0.3138, + "step": 10298 + }, + { + "epoch": 0.9702079555356681, + "grad_norm": 0.8465496301651001, + "learning_rate": 1.0797197743452104e-05, + "loss": 0.3054, + "step": 10299 + }, + { + "epoch": 0.9703021596288359, + "grad_norm": 0.702889084815979, + "learning_rate": 1.0795692522184211e-05, + "loss": 0.3081, + "step": 10300 + }, + { + "epoch": 0.9703963637220038, + "grad_norm": 0.681449830532074, + "learning_rate": 1.0794187282773298e-05, + "loss": 0.2785, + "step": 10301 + }, + { + "epoch": 0.9704905678151716, + "grad_norm": 0.7109034657478333, + "learning_rate": 1.0792682025253684e-05, + "loss": 0.2984, + "step": 10302 + }, + { + "epoch": 0.9705847719083395, + "grad_norm": 0.6962335109710693, + "learning_rate": 1.0791176749659697e-05, + "loss": 0.2884, + "step": 10303 + }, + { + "epoch": 0.9706789760015073, + "grad_norm": 0.7099591493606567, + "learning_rate": 1.078967145602566e-05, + "loss": 0.2751, + "step": 10304 + }, + { + "epoch": 0.9707731800946751, + "grad_norm": 0.7957950830459595, + "learning_rate": 1.0788166144385888e-05, + "loss": 0.2776, + "step": 10305 + }, + { + "epoch": 0.9708673841878429, + "grad_norm": 0.6819398403167725, + "learning_rate": 1.078666081477471e-05, + "loss": 0.2879, + "step": 10306 + }, + { + "epoch": 0.9709615882810108, + "grad_norm": 0.7258957028388977, + "learning_rate": 1.0785155467226447e-05, + "loss": 0.3034, + "step": 10307 + }, + { + "epoch": 0.9710557923741786, + "grad_norm": 1.0399584770202637, + "learning_rate": 1.0783650101775426e-05, + "loss": 0.3274, + "step": 10308 + }, + { + "epoch": 0.9711499964673465, + "grad_norm": 0.6950438618659973, + "learning_rate": 1.0782144718455973e-05, + "loss": 0.2864, + "step": 10309 + }, + { + "epoch": 0.9712442005605143, + "grad_norm": 0.7077235579490662, + "learning_rate": 1.0780639317302411e-05, + "loss": 0.2906, + "step": 10310 + }, + { + "epoch": 0.9713384046536822, + "grad_norm": 0.6788902878761292, + "learning_rate": 1.0779133898349061e-05, + "loss": 0.291, + "step": 10311 + }, + { + "epoch": 0.97143260874685, + "grad_norm": 0.7881458401679993, + "learning_rate": 1.0777628461630256e-05, + "loss": 0.315, + "step": 10312 + }, + { + "epoch": 0.9715268128400179, + "grad_norm": 0.6583048105239868, + "learning_rate": 1.077612300718032e-05, + "loss": 0.2472, + "step": 10313 + }, + { + "epoch": 0.9716210169331857, + "grad_norm": 0.7339376211166382, + "learning_rate": 1.0774617535033575e-05, + "loss": 0.3042, + "step": 10314 + }, + { + "epoch": 0.9717152210263535, + "grad_norm": 0.794305145740509, + "learning_rate": 1.0773112045224361e-05, + "loss": 0.326, + "step": 10315 + }, + { + "epoch": 0.9718094251195214, + "grad_norm": 0.8386337757110596, + "learning_rate": 1.077160653778699e-05, + "loss": 0.302, + "step": 10316 + }, + { + "epoch": 0.9719036292126892, + "grad_norm": 0.6783521771430969, + "learning_rate": 1.0770101012755796e-05, + "loss": 0.2899, + "step": 10317 + }, + { + "epoch": 0.9719978333058571, + "grad_norm": 1.1961525678634644, + "learning_rate": 1.0768595470165111e-05, + "loss": 0.3049, + "step": 10318 + }, + { + "epoch": 0.9720920373990249, + "grad_norm": 0.790729820728302, + "learning_rate": 1.0767089910049258e-05, + "loss": 0.2792, + "step": 10319 + }, + { + "epoch": 0.9721862414921928, + "grad_norm": 0.7801897525787354, + "learning_rate": 1.0765584332442572e-05, + "loss": 0.296, + "step": 10320 + }, + { + "epoch": 0.9722804455853606, + "grad_norm": 0.6824880838394165, + "learning_rate": 1.0764078737379378e-05, + "loss": 0.2898, + "step": 10321 + }, + { + "epoch": 0.9723746496785285, + "grad_norm": 0.6680640578269958, + "learning_rate": 1.0762573124894004e-05, + "loss": 0.3144, + "step": 10322 + }, + { + "epoch": 0.9724688537716963, + "grad_norm": 0.8827727437019348, + "learning_rate": 1.0761067495020787e-05, + "loss": 0.3371, + "step": 10323 + }, + { + "epoch": 0.9725630578648642, + "grad_norm": 0.8456539511680603, + "learning_rate": 1.0759561847794053e-05, + "loss": 0.3434, + "step": 10324 + }, + { + "epoch": 0.972657261958032, + "grad_norm": 0.7169541716575623, + "learning_rate": 1.0758056183248135e-05, + "loss": 0.2596, + "step": 10325 + }, + { + "epoch": 0.9727514660511999, + "grad_norm": 0.7187843322753906, + "learning_rate": 1.0756550501417361e-05, + "loss": 0.262, + "step": 10326 + }, + { + "epoch": 0.9728456701443677, + "grad_norm": 0.7109638452529907, + "learning_rate": 1.0755044802336067e-05, + "loss": 0.3325, + "step": 10327 + }, + { + "epoch": 0.9729398742375356, + "grad_norm": 0.6972149610519409, + "learning_rate": 1.0753539086038584e-05, + "loss": 0.2626, + "step": 10328 + }, + { + "epoch": 0.9730340783307034, + "grad_norm": 0.6843149065971375, + "learning_rate": 1.0752033352559246e-05, + "loss": 0.259, + "step": 10329 + }, + { + "epoch": 0.9731282824238713, + "grad_norm": 0.6997199058532715, + "learning_rate": 1.0750527601932384e-05, + "loss": 0.2713, + "step": 10330 + }, + { + "epoch": 0.9732224865170391, + "grad_norm": 0.750153660774231, + "learning_rate": 1.074902183419233e-05, + "loss": 0.3021, + "step": 10331 + }, + { + "epoch": 0.973316690610207, + "grad_norm": 0.7035261392593384, + "learning_rate": 1.074751604937342e-05, + "loss": 0.3244, + "step": 10332 + }, + { + "epoch": 0.9734108947033748, + "grad_norm": 0.7157569527626038, + "learning_rate": 1.074601024750999e-05, + "loss": 0.2886, + "step": 10333 + }, + { + "epoch": 0.9735050987965427, + "grad_norm": 0.7078803181648254, + "learning_rate": 1.074450442863637e-05, + "loss": 0.3366, + "step": 10334 + }, + { + "epoch": 0.9735993028897105, + "grad_norm": 0.7504318952560425, + "learning_rate": 1.0742998592786902e-05, + "loss": 0.3068, + "step": 10335 + }, + { + "epoch": 0.9736935069828784, + "grad_norm": 0.7075466513633728, + "learning_rate": 1.0741492739995913e-05, + "loss": 0.3001, + "step": 10336 + }, + { + "epoch": 0.9737877110760462, + "grad_norm": 0.7589266896247864, + "learning_rate": 1.0739986870297743e-05, + "loss": 0.3228, + "step": 10337 + }, + { + "epoch": 0.9738819151692141, + "grad_norm": 0.7659551501274109, + "learning_rate": 1.073848098372673e-05, + "loss": 0.3021, + "step": 10338 + }, + { + "epoch": 0.9739761192623819, + "grad_norm": 0.7225384712219238, + "learning_rate": 1.0736975080317206e-05, + "loss": 0.2984, + "step": 10339 + }, + { + "epoch": 0.9740703233555498, + "grad_norm": 0.6561002731323242, + "learning_rate": 1.0735469160103514e-05, + "loss": 0.2875, + "step": 10340 + }, + { + "epoch": 0.9741645274487176, + "grad_norm": 0.7799216508865356, + "learning_rate": 1.0733963223119986e-05, + "loss": 0.288, + "step": 10341 + }, + { + "epoch": 0.9742587315418855, + "grad_norm": 0.7443419098854065, + "learning_rate": 1.073245726940096e-05, + "loss": 0.2611, + "step": 10342 + }, + { + "epoch": 0.9743529356350533, + "grad_norm": 0.8268982172012329, + "learning_rate": 1.0730951298980776e-05, + "loss": 0.3118, + "step": 10343 + }, + { + "epoch": 0.9744471397282212, + "grad_norm": 0.7606011629104614, + "learning_rate": 1.0729445311893773e-05, + "loss": 0.2986, + "step": 10344 + }, + { + "epoch": 0.974541343821389, + "grad_norm": 0.7274667620658875, + "learning_rate": 1.0727939308174289e-05, + "loss": 0.2986, + "step": 10345 + }, + { + "epoch": 0.9746355479145569, + "grad_norm": 0.6468337774276733, + "learning_rate": 1.0726433287856664e-05, + "loss": 0.2774, + "step": 10346 + }, + { + "epoch": 0.9747297520077247, + "grad_norm": 0.9713823199272156, + "learning_rate": 1.0724927250975232e-05, + "loss": 0.2726, + "step": 10347 + }, + { + "epoch": 0.9748239561008926, + "grad_norm": 0.6387431025505066, + "learning_rate": 1.072342119756434e-05, + "loss": 0.2803, + "step": 10348 + }, + { + "epoch": 0.9749181601940604, + "grad_norm": 0.7059564590454102, + "learning_rate": 1.0721915127658329e-05, + "loss": 0.2579, + "step": 10349 + }, + { + "epoch": 0.9750123642872283, + "grad_norm": 0.7223420739173889, + "learning_rate": 1.0720409041291533e-05, + "loss": 0.3094, + "step": 10350 + }, + { + "epoch": 0.9751065683803961, + "grad_norm": 0.6966363191604614, + "learning_rate": 1.0718902938498296e-05, + "loss": 0.2888, + "step": 10351 + }, + { + "epoch": 0.975200772473564, + "grad_norm": 0.7751204371452332, + "learning_rate": 1.0717396819312961e-05, + "loss": 0.3039, + "step": 10352 + }, + { + "epoch": 0.9752949765667318, + "grad_norm": 0.7578683495521545, + "learning_rate": 1.0715890683769872e-05, + "loss": 0.2725, + "step": 10353 + }, + { + "epoch": 0.9753891806598997, + "grad_norm": 0.6831819415092468, + "learning_rate": 1.0714384531903365e-05, + "loss": 0.2739, + "step": 10354 + }, + { + "epoch": 0.9754833847530675, + "grad_norm": 0.7249935269355774, + "learning_rate": 1.071287836374779e-05, + "loss": 0.3009, + "step": 10355 + }, + { + "epoch": 0.9755775888462354, + "grad_norm": 0.6584034562110901, + "learning_rate": 1.071137217933748e-05, + "loss": 0.2563, + "step": 10356 + }, + { + "epoch": 0.9756717929394032, + "grad_norm": 0.730527400970459, + "learning_rate": 1.0709865978706789e-05, + "loss": 0.2708, + "step": 10357 + }, + { + "epoch": 0.975765997032571, + "grad_norm": 0.8241204023361206, + "learning_rate": 1.0708359761890053e-05, + "loss": 0.3543, + "step": 10358 + }, + { + "epoch": 0.9758602011257389, + "grad_norm": 0.8479092121124268, + "learning_rate": 1.0706853528921618e-05, + "loss": 0.3116, + "step": 10359 + }, + { + "epoch": 0.9759544052189068, + "grad_norm": 0.7382973432540894, + "learning_rate": 1.070534727983583e-05, + "loss": 0.302, + "step": 10360 + }, + { + "epoch": 0.9760486093120746, + "grad_norm": 0.7855536341667175, + "learning_rate": 1.0703841014667037e-05, + "loss": 0.3572, + "step": 10361 + }, + { + "epoch": 0.9761428134052424, + "grad_norm": 0.7271184325218201, + "learning_rate": 1.0702334733449575e-05, + "loss": 0.3006, + "step": 10362 + }, + { + "epoch": 0.9762370174984103, + "grad_norm": 0.6775922775268555, + "learning_rate": 1.0700828436217798e-05, + "loss": 0.3102, + "step": 10363 + }, + { + "epoch": 0.9763312215915781, + "grad_norm": 0.7173593640327454, + "learning_rate": 1.0699322123006051e-05, + "loss": 0.2864, + "step": 10364 + }, + { + "epoch": 0.976425425684746, + "grad_norm": 0.7771925330162048, + "learning_rate": 1.0697815793848676e-05, + "loss": 0.2918, + "step": 10365 + }, + { + "epoch": 0.9765196297779138, + "grad_norm": 0.7389848232269287, + "learning_rate": 1.069630944878002e-05, + "loss": 0.2738, + "step": 10366 + }, + { + "epoch": 0.9766138338710817, + "grad_norm": 0.6529532074928284, + "learning_rate": 1.0694803087834431e-05, + "loss": 0.3264, + "step": 10367 + }, + { + "epoch": 0.9767080379642495, + "grad_norm": 0.8133367300033569, + "learning_rate": 1.069329671104626e-05, + "loss": 0.2879, + "step": 10368 + }, + { + "epoch": 0.9768022420574174, + "grad_norm": 0.8488715887069702, + "learning_rate": 1.069179031844985e-05, + "loss": 0.3017, + "step": 10369 + }, + { + "epoch": 0.9768964461505852, + "grad_norm": 0.6893444061279297, + "learning_rate": 1.0690283910079553e-05, + "loss": 0.3052, + "step": 10370 + }, + { + "epoch": 0.9769906502437531, + "grad_norm": 0.7704743146896362, + "learning_rate": 1.0688777485969713e-05, + "loss": 0.2782, + "step": 10371 + }, + { + "epoch": 0.9770848543369209, + "grad_norm": 0.9016629457473755, + "learning_rate": 1.0687271046154684e-05, + "loss": 0.2984, + "step": 10372 + }, + { + "epoch": 0.9771790584300888, + "grad_norm": 0.7004095315933228, + "learning_rate": 1.068576459066881e-05, + "loss": 0.3166, + "step": 10373 + }, + { + "epoch": 0.9772732625232566, + "grad_norm": 0.6561674475669861, + "learning_rate": 1.068425811954644e-05, + "loss": 0.2771, + "step": 10374 + }, + { + "epoch": 0.9773674666164245, + "grad_norm": 0.7221732139587402, + "learning_rate": 1.0682751632821933e-05, + "loss": 0.3031, + "step": 10375 + }, + { + "epoch": 0.9774616707095923, + "grad_norm": 0.6564479470252991, + "learning_rate": 1.0681245130529627e-05, + "loss": 0.2721, + "step": 10376 + }, + { + "epoch": 0.9775558748027602, + "grad_norm": 0.7864837050437927, + "learning_rate": 1.0679738612703882e-05, + "loss": 0.3361, + "step": 10377 + }, + { + "epoch": 0.977650078895928, + "grad_norm": 0.7487345337867737, + "learning_rate": 1.0678232079379045e-05, + "loss": 0.3285, + "step": 10378 + }, + { + "epoch": 0.9777442829890959, + "grad_norm": 0.6579439043998718, + "learning_rate": 1.0676725530589467e-05, + "loss": 0.2895, + "step": 10379 + }, + { + "epoch": 0.9778384870822637, + "grad_norm": 0.7135502099990845, + "learning_rate": 1.06752189663695e-05, + "loss": 0.3288, + "step": 10380 + }, + { + "epoch": 0.9779326911754316, + "grad_norm": 0.919353187084198, + "learning_rate": 1.0673712386753496e-05, + "loss": 0.3116, + "step": 10381 + }, + { + "epoch": 0.9780268952685994, + "grad_norm": 0.6300274133682251, + "learning_rate": 1.0672205791775807e-05, + "loss": 0.261, + "step": 10382 + }, + { + "epoch": 0.9781210993617673, + "grad_norm": 0.6668910384178162, + "learning_rate": 1.067069918147079e-05, + "loss": 0.2945, + "step": 10383 + }, + { + "epoch": 0.9782153034549351, + "grad_norm": 0.8164402842521667, + "learning_rate": 1.0669192555872791e-05, + "loss": 0.2734, + "step": 10384 + }, + { + "epoch": 0.978309507548103, + "grad_norm": 0.6610966324806213, + "learning_rate": 1.0667685915016168e-05, + "loss": 0.343, + "step": 10385 + }, + { + "epoch": 0.9784037116412708, + "grad_norm": 0.7320244908332825, + "learning_rate": 1.0666179258935274e-05, + "loss": 0.2982, + "step": 10386 + }, + { + "epoch": 0.9784979157344387, + "grad_norm": 0.7519034743309021, + "learning_rate": 1.0664672587664462e-05, + "loss": 0.3295, + "step": 10387 + }, + { + "epoch": 0.9785921198276065, + "grad_norm": 0.7669429779052734, + "learning_rate": 1.0663165901238088e-05, + "loss": 0.3212, + "step": 10388 + }, + { + "epoch": 0.9786863239207744, + "grad_norm": 0.6347747445106506, + "learning_rate": 1.0661659199690505e-05, + "loss": 0.29, + "step": 10389 + }, + { + "epoch": 0.9787805280139422, + "grad_norm": 0.7060967683792114, + "learning_rate": 1.066015248305607e-05, + "loss": 0.3017, + "step": 10390 + }, + { + "epoch": 0.9788747321071101, + "grad_norm": 0.7511218786239624, + "learning_rate": 1.0658645751369134e-05, + "loss": 0.2958, + "step": 10391 + }, + { + "epoch": 0.9789689362002779, + "grad_norm": 0.7689938545227051, + "learning_rate": 1.0657139004664058e-05, + "loss": 0.3165, + "step": 10392 + }, + { + "epoch": 0.9790631402934458, + "grad_norm": 1.196301817893982, + "learning_rate": 1.06556322429752e-05, + "loss": 0.2614, + "step": 10393 + }, + { + "epoch": 0.9791573443866136, + "grad_norm": 0.733065128326416, + "learning_rate": 1.0654125466336907e-05, + "loss": 0.3121, + "step": 10394 + }, + { + "epoch": 0.9792515484797815, + "grad_norm": 0.7399395108222961, + "learning_rate": 1.0652618674783549e-05, + "loss": 0.2614, + "step": 10395 + }, + { + "epoch": 0.9793457525729493, + "grad_norm": 0.719694972038269, + "learning_rate": 1.0651111868349469e-05, + "loss": 0.3156, + "step": 10396 + }, + { + "epoch": 0.9794399566661172, + "grad_norm": 1.1388739347457886, + "learning_rate": 1.0649605047069034e-05, + "loss": 0.2794, + "step": 10397 + }, + { + "epoch": 0.979534160759285, + "grad_norm": 0.7065989971160889, + "learning_rate": 1.06480982109766e-05, + "loss": 0.2849, + "step": 10398 + }, + { + "epoch": 0.9796283648524529, + "grad_norm": 0.8086827993392944, + "learning_rate": 1.0646591360106524e-05, + "loss": 0.3242, + "step": 10399 + }, + { + "epoch": 0.9797225689456207, + "grad_norm": 0.6797522306442261, + "learning_rate": 1.0645084494493166e-05, + "loss": 0.3007, + "step": 10400 + }, + { + "epoch": 0.9798167730387886, + "grad_norm": 0.8553151488304138, + "learning_rate": 1.064357761417088e-05, + "loss": 0.3233, + "step": 10401 + }, + { + "epoch": 0.9799109771319564, + "grad_norm": 0.6668685674667358, + "learning_rate": 1.0642070719174031e-05, + "loss": 0.2742, + "step": 10402 + }, + { + "epoch": 0.9800051812251243, + "grad_norm": 0.7501025795936584, + "learning_rate": 1.064056380953698e-05, + "loss": 0.339, + "step": 10403 + }, + { + "epoch": 0.9800993853182921, + "grad_norm": 0.743687629699707, + "learning_rate": 1.0639056885294082e-05, + "loss": 0.3128, + "step": 10404 + }, + { + "epoch": 0.98019358941146, + "grad_norm": 0.5909627676010132, + "learning_rate": 1.0637549946479698e-05, + "loss": 0.2524, + "step": 10405 + }, + { + "epoch": 0.9802877935046278, + "grad_norm": 0.6835361123085022, + "learning_rate": 1.0636042993128188e-05, + "loss": 0.2923, + "step": 10406 + }, + { + "epoch": 0.9803819975977957, + "grad_norm": 0.7401196956634521, + "learning_rate": 1.0634536025273914e-05, + "loss": 0.3109, + "step": 10407 + }, + { + "epoch": 0.9804762016909635, + "grad_norm": 0.7609918713569641, + "learning_rate": 1.0633029042951239e-05, + "loss": 0.3087, + "step": 10408 + }, + { + "epoch": 0.9805704057841313, + "grad_norm": 0.6468635201454163, + "learning_rate": 1.0631522046194522e-05, + "loss": 0.2722, + "step": 10409 + }, + { + "epoch": 0.9806646098772992, + "grad_norm": 0.7094852328300476, + "learning_rate": 1.0630015035038125e-05, + "loss": 0.2677, + "step": 10410 + }, + { + "epoch": 0.980758813970467, + "grad_norm": 0.7232741117477417, + "learning_rate": 1.0628508009516412e-05, + "loss": 0.2533, + "step": 10411 + }, + { + "epoch": 0.9808530180636349, + "grad_norm": 0.7106176018714905, + "learning_rate": 1.0627000969663743e-05, + "loss": 0.3123, + "step": 10412 + }, + { + "epoch": 0.9809472221568027, + "grad_norm": 0.6309818625450134, + "learning_rate": 1.0625493915514485e-05, + "loss": 0.2648, + "step": 10413 + }, + { + "epoch": 0.9810414262499706, + "grad_norm": 0.7269724011421204, + "learning_rate": 1.0623986847102994e-05, + "loss": 0.3464, + "step": 10414 + }, + { + "epoch": 0.9811356303431384, + "grad_norm": 0.6727473735809326, + "learning_rate": 1.0622479764463645e-05, + "loss": 0.3051, + "step": 10415 + }, + { + "epoch": 0.9812298344363063, + "grad_norm": 0.6313793063163757, + "learning_rate": 1.0620972667630787e-05, + "loss": 0.2706, + "step": 10416 + }, + { + "epoch": 0.9813240385294741, + "grad_norm": 0.7293052673339844, + "learning_rate": 1.0619465556638797e-05, + "loss": 0.2566, + "step": 10417 + }, + { + "epoch": 0.981418242622642, + "grad_norm": 0.7017768025398254, + "learning_rate": 1.0617958431522034e-05, + "loss": 0.3137, + "step": 10418 + }, + { + "epoch": 0.9815124467158098, + "grad_norm": 0.7957296371459961, + "learning_rate": 1.0616451292314858e-05, + "loss": 0.3159, + "step": 10419 + }, + { + "epoch": 0.9816066508089777, + "grad_norm": 0.7720123529434204, + "learning_rate": 1.0614944139051644e-05, + "loss": 0.3147, + "step": 10420 + }, + { + "epoch": 0.9817008549021455, + "grad_norm": 0.9559768438339233, + "learning_rate": 1.061343697176675e-05, + "loss": 0.2942, + "step": 10421 + }, + { + "epoch": 0.9817950589953134, + "grad_norm": 0.7781189680099487, + "learning_rate": 1.0611929790494543e-05, + "loss": 0.3103, + "step": 10422 + }, + { + "epoch": 0.9818892630884812, + "grad_norm": 0.7041938900947571, + "learning_rate": 1.0610422595269396e-05, + "loss": 0.3141, + "step": 10423 + }, + { + "epoch": 0.9819834671816491, + "grad_norm": 0.6958533525466919, + "learning_rate": 1.0608915386125667e-05, + "loss": 0.2783, + "step": 10424 + }, + { + "epoch": 0.9820776712748169, + "grad_norm": 0.6525768041610718, + "learning_rate": 1.0607408163097725e-05, + "loss": 0.3082, + "step": 10425 + }, + { + "epoch": 0.9821718753679848, + "grad_norm": 0.6321045160293579, + "learning_rate": 1.060590092621994e-05, + "loss": 0.2484, + "step": 10426 + }, + { + "epoch": 0.9822660794611526, + "grad_norm": 0.8602375984191895, + "learning_rate": 1.0604393675526672e-05, + "loss": 0.3413, + "step": 10427 + }, + { + "epoch": 0.9823602835543205, + "grad_norm": 0.7665597796440125, + "learning_rate": 1.0602886411052295e-05, + "loss": 0.3293, + "step": 10428 + }, + { + "epoch": 0.9824544876474883, + "grad_norm": 0.7466394305229187, + "learning_rate": 1.0601379132831177e-05, + "loss": 0.3092, + "step": 10429 + }, + { + "epoch": 0.9825486917406562, + "grad_norm": 0.7269977331161499, + "learning_rate": 1.0599871840897687e-05, + "loss": 0.3493, + "step": 10430 + }, + { + "epoch": 0.982642895833824, + "grad_norm": 0.6265439987182617, + "learning_rate": 1.0598364535286186e-05, + "loss": 0.2718, + "step": 10431 + }, + { + "epoch": 0.9827370999269919, + "grad_norm": 0.749789297580719, + "learning_rate": 1.0596857216031051e-05, + "loss": 0.3097, + "step": 10432 + }, + { + "epoch": 0.9828313040201597, + "grad_norm": 0.7870447039604187, + "learning_rate": 1.0595349883166648e-05, + "loss": 0.3272, + "step": 10433 + }, + { + "epoch": 0.9829255081133276, + "grad_norm": 0.8104708194732666, + "learning_rate": 1.0593842536727345e-05, + "loss": 0.3392, + "step": 10434 + }, + { + "epoch": 0.9830197122064954, + "grad_norm": 0.7863287925720215, + "learning_rate": 1.0592335176747518e-05, + "loss": 0.3183, + "step": 10435 + }, + { + "epoch": 0.9831139162996633, + "grad_norm": 0.628355860710144, + "learning_rate": 1.059082780326153e-05, + "loss": 0.3132, + "step": 10436 + }, + { + "epoch": 0.9832081203928311, + "grad_norm": 0.6768056750297546, + "learning_rate": 1.0589320416303756e-05, + "loss": 0.2725, + "step": 10437 + }, + { + "epoch": 0.983302324485999, + "grad_norm": 0.7083317041397095, + "learning_rate": 1.0587813015908566e-05, + "loss": 0.305, + "step": 10438 + }, + { + "epoch": 0.9833965285791668, + "grad_norm": 0.6532204747200012, + "learning_rate": 1.0586305602110326e-05, + "loss": 0.3015, + "step": 10439 + }, + { + "epoch": 0.9834907326723347, + "grad_norm": 0.7867683172225952, + "learning_rate": 1.0584798174943414e-05, + "loss": 0.3143, + "step": 10440 + }, + { + "epoch": 0.9835849367655025, + "grad_norm": 0.8491891622543335, + "learning_rate": 1.0583290734442199e-05, + "loss": 0.3207, + "step": 10441 + }, + { + "epoch": 0.9836791408586704, + "grad_norm": 0.6541094779968262, + "learning_rate": 1.0581783280641051e-05, + "loss": 0.26, + "step": 10442 + }, + { + "epoch": 0.9837733449518381, + "grad_norm": 0.7106014490127563, + "learning_rate": 1.058027581357435e-05, + "loss": 0.2963, + "step": 10443 + }, + { + "epoch": 0.983867549045006, + "grad_norm": 0.7384240031242371, + "learning_rate": 1.0578768333276458e-05, + "loss": 0.2582, + "step": 10444 + }, + { + "epoch": 0.9839617531381738, + "grad_norm": 0.7477413415908813, + "learning_rate": 1.0577260839781756e-05, + "loss": 0.3046, + "step": 10445 + }, + { + "epoch": 0.9840559572313416, + "grad_norm": 0.7462862133979797, + "learning_rate": 1.0575753333124615e-05, + "loss": 0.2727, + "step": 10446 + }, + { + "epoch": 0.9841501613245095, + "grad_norm": 0.8706429600715637, + "learning_rate": 1.0574245813339404e-05, + "loss": 0.3115, + "step": 10447 + }, + { + "epoch": 0.9842443654176773, + "grad_norm": 0.66429603099823, + "learning_rate": 1.05727382804605e-05, + "loss": 0.2772, + "step": 10448 + }, + { + "epoch": 0.9843385695108452, + "grad_norm": 0.6963960528373718, + "learning_rate": 1.057123073452228e-05, + "loss": 0.2912, + "step": 10449 + }, + { + "epoch": 0.984432773604013, + "grad_norm": 0.7144415974617004, + "learning_rate": 1.0569723175559116e-05, + "loss": 0.3204, + "step": 10450 + }, + { + "epoch": 0.9845269776971809, + "grad_norm": 0.822078287601471, + "learning_rate": 1.0568215603605378e-05, + "loss": 0.2814, + "step": 10451 + }, + { + "epoch": 0.9846211817903487, + "grad_norm": 0.8428067564964294, + "learning_rate": 1.056670801869545e-05, + "loss": 0.3445, + "step": 10452 + }, + { + "epoch": 0.9847153858835166, + "grad_norm": 0.6889626979827881, + "learning_rate": 1.0565200420863704e-05, + "loss": 0.2855, + "step": 10453 + }, + { + "epoch": 0.9848095899766844, + "grad_norm": 0.6799871921539307, + "learning_rate": 1.056369281014451e-05, + "loss": 0.3132, + "step": 10454 + }, + { + "epoch": 0.9849037940698523, + "grad_norm": 0.7179400324821472, + "learning_rate": 1.0562185186572253e-05, + "loss": 0.2966, + "step": 10455 + }, + { + "epoch": 0.9849979981630201, + "grad_norm": 0.6540926694869995, + "learning_rate": 1.05606775501813e-05, + "loss": 0.2944, + "step": 10456 + }, + { + "epoch": 0.985092202256188, + "grad_norm": 0.6726449131965637, + "learning_rate": 1.0559169901006035e-05, + "loss": 0.2665, + "step": 10457 + }, + { + "epoch": 0.9851864063493558, + "grad_norm": 0.7637098431587219, + "learning_rate": 1.0557662239080828e-05, + "loss": 0.3253, + "step": 10458 + }, + { + "epoch": 0.9852806104425237, + "grad_norm": 0.7527045011520386, + "learning_rate": 1.055615456444006e-05, + "loss": 0.315, + "step": 10459 + }, + { + "epoch": 0.9853748145356915, + "grad_norm": 0.702427327632904, + "learning_rate": 1.055464687711811e-05, + "loss": 0.309, + "step": 10460 + }, + { + "epoch": 0.9854690186288594, + "grad_norm": 0.7186090350151062, + "learning_rate": 1.0553139177149354e-05, + "loss": 0.2892, + "step": 10461 + }, + { + "epoch": 0.9855632227220272, + "grad_norm": 0.6679766178131104, + "learning_rate": 1.0551631464568167e-05, + "loss": 0.3033, + "step": 10462 + }, + { + "epoch": 0.9856574268151951, + "grad_norm": 0.7070468664169312, + "learning_rate": 1.0550123739408931e-05, + "loss": 0.3331, + "step": 10463 + }, + { + "epoch": 0.9857516309083629, + "grad_norm": 0.7657076716423035, + "learning_rate": 1.054861600170602e-05, + "loss": 0.3279, + "step": 10464 + }, + { + "epoch": 0.9858458350015308, + "grad_norm": 0.7013779878616333, + "learning_rate": 1.054710825149382e-05, + "loss": 0.3321, + "step": 10465 + }, + { + "epoch": 0.9859400390946986, + "grad_norm": 0.6874800324440002, + "learning_rate": 1.0545600488806704e-05, + "loss": 0.2896, + "step": 10466 + }, + { + "epoch": 0.9860342431878665, + "grad_norm": 0.7229523062705994, + "learning_rate": 1.054409271367905e-05, + "loss": 0.3216, + "step": 10467 + }, + { + "epoch": 0.9861284472810343, + "grad_norm": 0.7149501442909241, + "learning_rate": 1.0542584926145244e-05, + "loss": 0.3407, + "step": 10468 + }, + { + "epoch": 0.9862226513742022, + "grad_norm": 0.6803480982780457, + "learning_rate": 1.0541077126239663e-05, + "loss": 0.2873, + "step": 10469 + }, + { + "epoch": 0.98631685546737, + "grad_norm": 0.7159126400947571, + "learning_rate": 1.0539569313996687e-05, + "loss": 0.2917, + "step": 10470 + }, + { + "epoch": 0.9864110595605379, + "grad_norm": 0.8723127841949463, + "learning_rate": 1.0538061489450692e-05, + "loss": 0.3273, + "step": 10471 + }, + { + "epoch": 0.9865052636537057, + "grad_norm": 0.6796349883079529, + "learning_rate": 1.0536553652636068e-05, + "loss": 0.2872, + "step": 10472 + }, + { + "epoch": 0.9865994677468736, + "grad_norm": 1.1715443134307861, + "learning_rate": 1.0535045803587189e-05, + "loss": 0.3398, + "step": 10473 + }, + { + "epoch": 0.9866936718400414, + "grad_norm": 0.9951081871986389, + "learning_rate": 1.053353794233844e-05, + "loss": 0.2785, + "step": 10474 + }, + { + "epoch": 0.9867878759332093, + "grad_norm": 0.8327958583831787, + "learning_rate": 1.0532030068924198e-05, + "loss": 0.3256, + "step": 10475 + }, + { + "epoch": 0.9868820800263771, + "grad_norm": 0.7180770039558411, + "learning_rate": 1.0530522183378846e-05, + "loss": 0.3308, + "step": 10476 + }, + { + "epoch": 0.986976284119545, + "grad_norm": 0.7871127724647522, + "learning_rate": 1.0529014285736772e-05, + "loss": 0.3154, + "step": 10477 + }, + { + "epoch": 0.9870704882127128, + "grad_norm": 0.7883843779563904, + "learning_rate": 1.0527506376032352e-05, + "loss": 0.3313, + "step": 10478 + }, + { + "epoch": 0.9871646923058807, + "grad_norm": 0.7578481435775757, + "learning_rate": 1.052599845429997e-05, + "loss": 0.3048, + "step": 10479 + }, + { + "epoch": 0.9872588963990485, + "grad_norm": 37.6483039855957, + "learning_rate": 1.0524490520574011e-05, + "loss": 0.3027, + "step": 10480 + }, + { + "epoch": 0.9873531004922164, + "grad_norm": 0.7600743174552917, + "learning_rate": 1.0522982574888857e-05, + "loss": 0.3268, + "step": 10481 + }, + { + "epoch": 0.9874473045853842, + "grad_norm": 0.7806258797645569, + "learning_rate": 1.052147461727889e-05, + "loss": 0.3235, + "step": 10482 + }, + { + "epoch": 0.9875415086785521, + "grad_norm": 0.7319111824035645, + "learning_rate": 1.0519966647778499e-05, + "loss": 0.3504, + "step": 10483 + }, + { + "epoch": 0.9876357127717199, + "grad_norm": 0.7317287921905518, + "learning_rate": 1.051845866642206e-05, + "loss": 0.307, + "step": 10484 + }, + { + "epoch": 0.9877299168648878, + "grad_norm": 0.7134479284286499, + "learning_rate": 1.0516950673243965e-05, + "loss": 0.276, + "step": 10485 + }, + { + "epoch": 0.9878241209580556, + "grad_norm": 0.7581774592399597, + "learning_rate": 1.0515442668278595e-05, + "loss": 0.2814, + "step": 10486 + }, + { + "epoch": 0.9879183250512235, + "grad_norm": 0.7357668280601501, + "learning_rate": 1.051393465156033e-05, + "loss": 0.327, + "step": 10487 + }, + { + "epoch": 0.9880125291443913, + "grad_norm": 0.6782602071762085, + "learning_rate": 1.0512426623123566e-05, + "loss": 0.2757, + "step": 10488 + }, + { + "epoch": 0.9881067332375592, + "grad_norm": 0.7066102623939514, + "learning_rate": 1.051091858300268e-05, + "loss": 0.2976, + "step": 10489 + }, + { + "epoch": 0.988200937330727, + "grad_norm": 0.674705445766449, + "learning_rate": 1.050941053123206e-05, + "loss": 0.3039, + "step": 10490 + }, + { + "epoch": 0.9882951414238949, + "grad_norm": 0.7260640263557434, + "learning_rate": 1.0507902467846092e-05, + "loss": 0.2665, + "step": 10491 + }, + { + "epoch": 0.9883893455170627, + "grad_norm": 0.7960880994796753, + "learning_rate": 1.0506394392879165e-05, + "loss": 0.3039, + "step": 10492 + }, + { + "epoch": 0.9884835496102305, + "grad_norm": 0.7668411731719971, + "learning_rate": 1.050488630636566e-05, + "loss": 0.2966, + "step": 10493 + }, + { + "epoch": 0.9885777537033984, + "grad_norm": 0.6721380949020386, + "learning_rate": 1.0503378208339968e-05, + "loss": 0.2783, + "step": 10494 + }, + { + "epoch": 0.9886719577965662, + "grad_norm": 0.8086415529251099, + "learning_rate": 1.0501870098836473e-05, + "loss": 0.3181, + "step": 10495 + }, + { + "epoch": 0.9887661618897341, + "grad_norm": 0.8411343693733215, + "learning_rate": 1.0500361977889562e-05, + "loss": 0.3261, + "step": 10496 + }, + { + "epoch": 0.988860365982902, + "grad_norm": 0.6973224878311157, + "learning_rate": 1.0498853845533628e-05, + "loss": 0.2869, + "step": 10497 + }, + { + "epoch": 0.9889545700760698, + "grad_norm": 0.8233364224433899, + "learning_rate": 1.0497345701803052e-05, + "loss": 0.2975, + "step": 10498 + }, + { + "epoch": 0.9890487741692376, + "grad_norm": 0.7598685622215271, + "learning_rate": 1.0495837546732224e-05, + "loss": 0.3326, + "step": 10499 + }, + { + "epoch": 0.9891429782624055, + "grad_norm": 0.7478805780410767, + "learning_rate": 1.0494329380355535e-05, + "loss": 0.2836, + "step": 10500 + }, + { + "epoch": 0.9892371823555733, + "grad_norm": 0.743462324142456, + "learning_rate": 1.0492821202707373e-05, + "loss": 0.2974, + "step": 10501 + }, + { + "epoch": 0.9893313864487412, + "grad_norm": 0.693759560585022, + "learning_rate": 1.0491313013822122e-05, + "loss": 0.2839, + "step": 10502 + }, + { + "epoch": 0.989425590541909, + "grad_norm": 0.727215051651001, + "learning_rate": 1.0489804813734176e-05, + "loss": 0.3064, + "step": 10503 + }, + { + "epoch": 0.9895197946350769, + "grad_norm": 0.8441318869590759, + "learning_rate": 1.0488296602477923e-05, + "loss": 0.3066, + "step": 10504 + }, + { + "epoch": 0.9896139987282447, + "grad_norm": 0.7129061818122864, + "learning_rate": 1.0486788380087754e-05, + "loss": 0.2813, + "step": 10505 + }, + { + "epoch": 0.9897082028214126, + "grad_norm": 0.6906604170799255, + "learning_rate": 1.0485280146598055e-05, + "loss": 0.3135, + "step": 10506 + }, + { + "epoch": 0.9898024069145804, + "grad_norm": 0.8256998062133789, + "learning_rate": 1.0483771902043216e-05, + "loss": 0.327, + "step": 10507 + }, + { + "epoch": 0.9898966110077483, + "grad_norm": 0.775102972984314, + "learning_rate": 1.0482263646457632e-05, + "loss": 0.2884, + "step": 10508 + }, + { + "epoch": 0.9899908151009161, + "grad_norm": 0.6462504267692566, + "learning_rate": 1.0480755379875693e-05, + "loss": 0.2908, + "step": 10509 + }, + { + "epoch": 0.990085019194084, + "grad_norm": 0.7563470602035522, + "learning_rate": 1.0479247102331787e-05, + "loss": 0.2923, + "step": 10510 + }, + { + "epoch": 0.9901792232872518, + "grad_norm": 0.7145566940307617, + "learning_rate": 1.0477738813860303e-05, + "loss": 0.2962, + "step": 10511 + }, + { + "epoch": 0.9902734273804197, + "grad_norm": 0.7924150228500366, + "learning_rate": 1.0476230514495636e-05, + "loss": 0.2747, + "step": 10512 + }, + { + "epoch": 0.9903676314735875, + "grad_norm": 0.7354665994644165, + "learning_rate": 1.0474722204272178e-05, + "loss": 0.2917, + "step": 10513 + }, + { + "epoch": 0.9904618355667554, + "grad_norm": 0.7262409329414368, + "learning_rate": 1.0473213883224321e-05, + "loss": 0.3027, + "step": 10514 + }, + { + "epoch": 0.9905560396599232, + "grad_norm": 0.8383549451828003, + "learning_rate": 1.0471705551386453e-05, + "loss": 0.3241, + "step": 10515 + }, + { + "epoch": 0.9906502437530911, + "grad_norm": 0.7284969091415405, + "learning_rate": 1.047019720879297e-05, + "loss": 0.2917, + "step": 10516 + }, + { + "epoch": 0.9907444478462589, + "grad_norm": 0.7100862860679626, + "learning_rate": 1.0468688855478265e-05, + "loss": 0.2953, + "step": 10517 + }, + { + "epoch": 0.9908386519394268, + "grad_norm": 0.7132425308227539, + "learning_rate": 1.0467180491476725e-05, + "loss": 0.3224, + "step": 10518 + }, + { + "epoch": 0.9909328560325946, + "grad_norm": 0.7249391078948975, + "learning_rate": 1.0465672116822749e-05, + "loss": 0.3134, + "step": 10519 + }, + { + "epoch": 0.9910270601257625, + "grad_norm": 0.7816787958145142, + "learning_rate": 1.0464163731550731e-05, + "loss": 0.3201, + "step": 10520 + }, + { + "epoch": 0.9911212642189303, + "grad_norm": 0.8399257659912109, + "learning_rate": 1.046265533569506e-05, + "loss": 0.306, + "step": 10521 + }, + { + "epoch": 0.9912154683120982, + "grad_norm": 0.8124042749404907, + "learning_rate": 1.046114692929013e-05, + "loss": 0.2871, + "step": 10522 + }, + { + "epoch": 0.991309672405266, + "grad_norm": 0.754949152469635, + "learning_rate": 1.0459638512370343e-05, + "loss": 0.2695, + "step": 10523 + }, + { + "epoch": 0.9914038764984339, + "grad_norm": 0.6890938878059387, + "learning_rate": 1.0458130084970082e-05, + "loss": 0.3235, + "step": 10524 + }, + { + "epoch": 0.9914980805916017, + "grad_norm": 0.7518118619918823, + "learning_rate": 1.0456621647123748e-05, + "loss": 0.305, + "step": 10525 + }, + { + "epoch": 0.9915922846847696, + "grad_norm": 0.7360654473304749, + "learning_rate": 1.0455113198865734e-05, + "loss": 0.3069, + "step": 10526 + }, + { + "epoch": 0.9916864887779374, + "grad_norm": 0.9580203294754028, + "learning_rate": 1.0453604740230434e-05, + "loss": 0.3491, + "step": 10527 + }, + { + "epoch": 0.9917806928711053, + "grad_norm": 0.8052199482917786, + "learning_rate": 1.0452096271252246e-05, + "loss": 0.3402, + "step": 10528 + }, + { + "epoch": 0.9918748969642731, + "grad_norm": 0.6957103610038757, + "learning_rate": 1.0450587791965562e-05, + "loss": 0.2986, + "step": 10529 + }, + { + "epoch": 0.991969101057441, + "grad_norm": 0.7063989639282227, + "learning_rate": 1.0449079302404783e-05, + "loss": 0.2928, + "step": 10530 + }, + { + "epoch": 0.9920633051506088, + "grad_norm": 0.8932196497917175, + "learning_rate": 1.0447570802604298e-05, + "loss": 0.3508, + "step": 10531 + }, + { + "epoch": 0.9921575092437767, + "grad_norm": 0.600779116153717, + "learning_rate": 1.0446062292598508e-05, + "loss": 0.2767, + "step": 10532 + }, + { + "epoch": 0.9922517133369445, + "grad_norm": 0.7640441060066223, + "learning_rate": 1.0444553772421808e-05, + "loss": 0.345, + "step": 10533 + }, + { + "epoch": 0.9923459174301124, + "grad_norm": 0.8529372811317444, + "learning_rate": 1.0443045242108596e-05, + "loss": 0.2843, + "step": 10534 + }, + { + "epoch": 0.9924401215232802, + "grad_norm": 0.9221624135971069, + "learning_rate": 1.0441536701693266e-05, + "loss": 0.2849, + "step": 10535 + }, + { + "epoch": 0.992534325616448, + "grad_norm": 0.8267768025398254, + "learning_rate": 1.0440028151210213e-05, + "loss": 0.3047, + "step": 10536 + }, + { + "epoch": 0.9926285297096159, + "grad_norm": 0.7317991852760315, + "learning_rate": 1.0438519590693842e-05, + "loss": 0.3379, + "step": 10537 + }, + { + "epoch": 0.9927227338027838, + "grad_norm": 0.7200465798377991, + "learning_rate": 1.0437011020178544e-05, + "loss": 0.3151, + "step": 10538 + }, + { + "epoch": 0.9928169378959516, + "grad_norm": 0.8047158122062683, + "learning_rate": 1.0435502439698719e-05, + "loss": 0.2999, + "step": 10539 + }, + { + "epoch": 0.9929111419891195, + "grad_norm": 0.7040883898735046, + "learning_rate": 1.0433993849288768e-05, + "loss": 0.2768, + "step": 10540 + }, + { + "epoch": 0.9930053460822873, + "grad_norm": 0.6733881831169128, + "learning_rate": 1.0432485248983081e-05, + "loss": 0.2618, + "step": 10541 + }, + { + "epoch": 0.9930995501754551, + "grad_norm": 1.1577588319778442, + "learning_rate": 1.0430976638816064e-05, + "loss": 0.2659, + "step": 10542 + }, + { + "epoch": 0.993193754268623, + "grad_norm": 0.8506754040718079, + "learning_rate": 1.0429468018822115e-05, + "loss": 0.3336, + "step": 10543 + }, + { + "epoch": 0.9932879583617908, + "grad_norm": 0.6909233331680298, + "learning_rate": 1.0427959389035626e-05, + "loss": 0.271, + "step": 10544 + }, + { + "epoch": 0.9933821624549587, + "grad_norm": 0.7297505140304565, + "learning_rate": 1.0426450749491006e-05, + "loss": 0.3096, + "step": 10545 + }, + { + "epoch": 0.9934763665481265, + "grad_norm": 0.7545581459999084, + "learning_rate": 1.0424942100222648e-05, + "loss": 0.3125, + "step": 10546 + }, + { + "epoch": 0.9935705706412944, + "grad_norm": 0.6902565956115723, + "learning_rate": 1.0423433441264952e-05, + "loss": 0.3131, + "step": 10547 + }, + { + "epoch": 0.9936647747344622, + "grad_norm": 0.7904943227767944, + "learning_rate": 1.042192477265232e-05, + "loss": 0.2775, + "step": 10548 + }, + { + "epoch": 0.9937589788276301, + "grad_norm": 0.7091799974441528, + "learning_rate": 1.042041609441915e-05, + "loss": 0.2871, + "step": 10549 + }, + { + "epoch": 0.9938531829207979, + "grad_norm": 0.7303634285926819, + "learning_rate": 1.0418907406599844e-05, + "loss": 0.3114, + "step": 10550 + }, + { + "epoch": 0.9939473870139658, + "grad_norm": 0.6448248624801636, + "learning_rate": 1.0417398709228797e-05, + "loss": 0.2664, + "step": 10551 + }, + { + "epoch": 0.9940415911071336, + "grad_norm": 0.9918820858001709, + "learning_rate": 1.0415890002340417e-05, + "loss": 0.2608, + "step": 10552 + }, + { + "epoch": 0.9941357952003015, + "grad_norm": 0.6706486344337463, + "learning_rate": 1.0414381285969102e-05, + "loss": 0.2819, + "step": 10553 + }, + { + "epoch": 0.9942299992934693, + "grad_norm": 0.7919266223907471, + "learning_rate": 1.0412872560149254e-05, + "loss": 0.2864, + "step": 10554 + }, + { + "epoch": 0.9943242033866372, + "grad_norm": 0.9205607175827026, + "learning_rate": 1.041136382491527e-05, + "loss": 0.3748, + "step": 10555 + }, + { + "epoch": 0.994418407479805, + "grad_norm": 0.7769069075584412, + "learning_rate": 1.0409855080301556e-05, + "loss": 0.3131, + "step": 10556 + }, + { + "epoch": 0.9945126115729729, + "grad_norm": 0.847683846950531, + "learning_rate": 1.0408346326342514e-05, + "loss": 0.3019, + "step": 10557 + }, + { + "epoch": 0.9946068156661407, + "grad_norm": 0.7511101365089417, + "learning_rate": 1.0406837563072542e-05, + "loss": 0.2966, + "step": 10558 + }, + { + "epoch": 0.9947010197593086, + "grad_norm": 0.7299827933311462, + "learning_rate": 1.0405328790526043e-05, + "loss": 0.3143, + "step": 10559 + }, + { + "epoch": 0.9947952238524764, + "grad_norm": 0.7897573709487915, + "learning_rate": 1.0403820008737426e-05, + "loss": 0.2425, + "step": 10560 + }, + { + "epoch": 0.9948894279456443, + "grad_norm": 0.6592634916305542, + "learning_rate": 1.0402311217741083e-05, + "loss": 0.2787, + "step": 10561 + }, + { + "epoch": 0.9949836320388121, + "grad_norm": 0.6745985150337219, + "learning_rate": 1.0400802417571423e-05, + "loss": 0.283, + "step": 10562 + }, + { + "epoch": 0.99507783613198, + "grad_norm": 0.7424817681312561, + "learning_rate": 1.0399293608262853e-05, + "loss": 0.3466, + "step": 10563 + }, + { + "epoch": 0.9951720402251478, + "grad_norm": 0.6457298398017883, + "learning_rate": 1.0397784789849765e-05, + "loss": 0.2557, + "step": 10564 + }, + { + "epoch": 0.9952662443183157, + "grad_norm": 0.7130741477012634, + "learning_rate": 1.039627596236657e-05, + "loss": 0.2846, + "step": 10565 + }, + { + "epoch": 0.9953604484114835, + "grad_norm": 0.7130588293075562, + "learning_rate": 1.0394767125847673e-05, + "loss": 0.2589, + "step": 10566 + }, + { + "epoch": 0.9954546525046514, + "grad_norm": 0.7500724196434021, + "learning_rate": 1.0393258280327471e-05, + "loss": 0.3023, + "step": 10567 + }, + { + "epoch": 0.9955488565978192, + "grad_norm": 0.6984314322471619, + "learning_rate": 1.0391749425840376e-05, + "loss": 0.2962, + "step": 10568 + }, + { + "epoch": 0.9956430606909871, + "grad_norm": 0.7248314619064331, + "learning_rate": 1.0390240562420785e-05, + "loss": 0.3181, + "step": 10569 + }, + { + "epoch": 0.9957372647841549, + "grad_norm": 0.7185055017471313, + "learning_rate": 1.0388731690103108e-05, + "loss": 0.256, + "step": 10570 + }, + { + "epoch": 0.9958314688773228, + "grad_norm": 0.7030216455459595, + "learning_rate": 1.0387222808921746e-05, + "loss": 0.2612, + "step": 10571 + }, + { + "epoch": 0.9959256729704906, + "grad_norm": 0.792762041091919, + "learning_rate": 1.0385713918911104e-05, + "loss": 0.3206, + "step": 10572 + }, + { + "epoch": 0.9960198770636585, + "grad_norm": 0.7445237040519714, + "learning_rate": 1.038420502010559e-05, + "loss": 0.2767, + "step": 10573 + }, + { + "epoch": 0.9961140811568263, + "grad_norm": 0.9818106293678284, + "learning_rate": 1.038269611253961e-05, + "loss": 0.3443, + "step": 10574 + }, + { + "epoch": 0.9962082852499942, + "grad_norm": 0.705763578414917, + "learning_rate": 1.0381187196247564e-05, + "loss": 0.2888, + "step": 10575 + }, + { + "epoch": 0.996302489343162, + "grad_norm": 0.9176934957504272, + "learning_rate": 1.0379678271263858e-05, + "loss": 0.2826, + "step": 10576 + }, + { + "epoch": 0.9963966934363299, + "grad_norm": 0.7013608813285828, + "learning_rate": 1.0378169337622903e-05, + "loss": 0.2778, + "step": 10577 + }, + { + "epoch": 0.9964908975294977, + "grad_norm": 0.7365220785140991, + "learning_rate": 1.0376660395359102e-05, + "loss": 0.3191, + "step": 10578 + }, + { + "epoch": 0.9965851016226656, + "grad_norm": 0.732226550579071, + "learning_rate": 1.037515144450686e-05, + "loss": 0.2999, + "step": 10579 + }, + { + "epoch": 0.9966793057158334, + "grad_norm": 0.7567113637924194, + "learning_rate": 1.0373642485100588e-05, + "loss": 0.2923, + "step": 10580 + }, + { + "epoch": 0.9967735098090011, + "grad_norm": 0.7371570467948914, + "learning_rate": 1.0372133517174688e-05, + "loss": 0.2887, + "step": 10581 + }, + { + "epoch": 0.996867713902169, + "grad_norm": 0.7575872540473938, + "learning_rate": 1.0370624540763565e-05, + "loss": 0.3108, + "step": 10582 + }, + { + "epoch": 0.9969619179953368, + "grad_norm": 0.6133670210838318, + "learning_rate": 1.0369115555901635e-05, + "loss": 0.2714, + "step": 10583 + }, + { + "epoch": 0.9970561220885047, + "grad_norm": 0.7405552864074707, + "learning_rate": 1.0367606562623294e-05, + "loss": 0.3138, + "step": 10584 + }, + { + "epoch": 0.9971503261816725, + "grad_norm": 0.7196900248527527, + "learning_rate": 1.0366097560962957e-05, + "loss": 0.3005, + "step": 10585 + }, + { + "epoch": 0.9972445302748404, + "grad_norm": 0.7500787377357483, + "learning_rate": 1.036458855095503e-05, + "loss": 0.2924, + "step": 10586 + }, + { + "epoch": 0.9973387343680082, + "grad_norm": 0.7023894190788269, + "learning_rate": 1.036307953263392e-05, + "loss": 0.2997, + "step": 10587 + }, + { + "epoch": 0.9974329384611761, + "grad_norm": 0.7843096256256104, + "learning_rate": 1.0361570506034036e-05, + "loss": 0.3239, + "step": 10588 + }, + { + "epoch": 0.9975271425543439, + "grad_norm": 0.7718815207481384, + "learning_rate": 1.0360061471189784e-05, + "loss": 0.3153, + "step": 10589 + }, + { + "epoch": 0.9976213466475118, + "grad_norm": 0.7284173369407654, + "learning_rate": 1.0358552428135576e-05, + "loss": 0.2914, + "step": 10590 + }, + { + "epoch": 0.9977155507406796, + "grad_norm": 0.6925811767578125, + "learning_rate": 1.0357043376905816e-05, + "loss": 0.2854, + "step": 10591 + }, + { + "epoch": 0.9978097548338475, + "grad_norm": 0.7613288760185242, + "learning_rate": 1.0355534317534914e-05, + "loss": 0.3135, + "step": 10592 + }, + { + "epoch": 0.9979039589270153, + "grad_norm": 0.756247341632843, + "learning_rate": 1.035402525005728e-05, + "loss": 0.2934, + "step": 10593 + }, + { + "epoch": 0.9979981630201832, + "grad_norm": 0.7116743326187134, + "learning_rate": 1.0352516174507325e-05, + "loss": 0.2988, + "step": 10594 + }, + { + "epoch": 0.998092367113351, + "grad_norm": 0.7530072331428528, + "learning_rate": 1.0351007090919457e-05, + "loss": 0.3, + "step": 10595 + }, + { + "epoch": 0.9981865712065189, + "grad_norm": 0.7185389995574951, + "learning_rate": 1.0349497999328077e-05, + "loss": 0.2883, + "step": 10596 + }, + { + "epoch": 0.9982807752996867, + "grad_norm": 0.6984468102455139, + "learning_rate": 1.034798889976761e-05, + "loss": 0.268, + "step": 10597 + }, + { + "epoch": 0.9983749793928546, + "grad_norm": 0.7011573910713196, + "learning_rate": 1.0346479792272454e-05, + "loss": 0.2977, + "step": 10598 + }, + { + "epoch": 0.9984691834860224, + "grad_norm": 0.7571693658828735, + "learning_rate": 1.0344970676877021e-05, + "loss": 0.3152, + "step": 10599 + }, + { + "epoch": 0.9985633875791903, + "grad_norm": 0.9057065844535828, + "learning_rate": 1.034346155361573e-05, + "loss": 0.3215, + "step": 10600 + }, + { + "epoch": 0.9986575916723581, + "grad_norm": 0.7775444388389587, + "learning_rate": 1.0341952422522979e-05, + "loss": 0.337, + "step": 10601 + }, + { + "epoch": 0.998751795765526, + "grad_norm": 0.6971397995948792, + "learning_rate": 1.0340443283633183e-05, + "loss": 0.309, + "step": 10602 + }, + { + "epoch": 0.9988459998586938, + "grad_norm": 0.7716683745384216, + "learning_rate": 1.033893413698076e-05, + "loss": 0.3643, + "step": 10603 + }, + { + "epoch": 0.9989402039518617, + "grad_norm": 0.6274122595787048, + "learning_rate": 1.0337424982600109e-05, + "loss": 0.2364, + "step": 10604 + }, + { + "epoch": 0.9990344080450295, + "grad_norm": 0.8103241324424744, + "learning_rate": 1.0335915820525649e-05, + "loss": 0.3426, + "step": 10605 + }, + { + "epoch": 0.9991286121381974, + "grad_norm": 0.700080931186676, + "learning_rate": 1.0334406650791788e-05, + "loss": 0.2933, + "step": 10606 + }, + { + "epoch": 0.9992228162313652, + "grad_norm": 0.7084948420524597, + "learning_rate": 1.0332897473432937e-05, + "loss": 0.2848, + "step": 10607 + }, + { + "epoch": 0.9993170203245331, + "grad_norm": 0.7010513544082642, + "learning_rate": 1.033138828848351e-05, + "loss": 0.2712, + "step": 10608 + }, + { + "epoch": 0.9994112244177009, + "grad_norm": 0.7678072452545166, + "learning_rate": 1.0329879095977917e-05, + "loss": 0.2786, + "step": 10609 + }, + { + "epoch": 0.9995054285108688, + "grad_norm": 0.7514888644218445, + "learning_rate": 1.0328369895950572e-05, + "loss": 0.2837, + "step": 10610 + }, + { + "epoch": 0.9995996326040366, + "grad_norm": 0.6897455453872681, + "learning_rate": 1.0326860688435884e-05, + "loss": 0.3043, + "step": 10611 + }, + { + "epoch": 0.9996938366972045, + "grad_norm": 0.823022186756134, + "learning_rate": 1.0325351473468265e-05, + "loss": 0.3451, + "step": 10612 + }, + { + "epoch": 0.9997880407903723, + "grad_norm": 0.7792565822601318, + "learning_rate": 1.0323842251082132e-05, + "loss": 0.296, + "step": 10613 + }, + { + "epoch": 0.9998822448835402, + "grad_norm": 0.9984522461891174, + "learning_rate": 1.0322333021311896e-05, + "loss": 0.3364, + "step": 10614 + }, + { + "epoch": 0.999976448976708, + "grad_norm": 0.7459888458251953, + "learning_rate": 1.0320823784191967e-05, + "loss": 0.2825, + "step": 10615 + }, + { + "epoch": 1.000070653069876, + "grad_norm": 0.5776287317276001, + "learning_rate": 1.0319314539756758e-05, + "loss": 0.2124, + "step": 10616 + }, + { + "epoch": 1.0001648571630437, + "grad_norm": 0.611218273639679, + "learning_rate": 1.0317805288040686e-05, + "loss": 0.2471, + "step": 10617 + }, + { + "epoch": 1.0002590612562117, + "grad_norm": 0.6110851168632507, + "learning_rate": 1.031629602907816e-05, + "loss": 0.1887, + "step": 10618 + }, + { + "epoch": 1.0003532653493794, + "grad_norm": 0.556437611579895, + "learning_rate": 1.0314786762903595e-05, + "loss": 0.2225, + "step": 10619 + }, + { + "epoch": 1.0004474694425474, + "grad_norm": 0.5871204137802124, + "learning_rate": 1.031327748955141e-05, + "loss": 0.2372, + "step": 10620 + }, + { + "epoch": 1.000541673535715, + "grad_norm": 0.6837234497070312, + "learning_rate": 1.0311768209056008e-05, + "loss": 0.2108, + "step": 10621 + }, + { + "epoch": 1.000635877628883, + "grad_norm": 0.5930719971656799, + "learning_rate": 1.031025892145181e-05, + "loss": 0.2146, + "step": 10622 + }, + { + "epoch": 1.0007300817220508, + "grad_norm": 0.6106625199317932, + "learning_rate": 1.0308749626773231e-05, + "loss": 0.2136, + "step": 10623 + }, + { + "epoch": 1.0008242858152188, + "grad_norm": 0.7455571889877319, + "learning_rate": 1.030724032505468e-05, + "loss": 0.2166, + "step": 10624 + }, + { + "epoch": 1.0009184899083865, + "grad_norm": 0.6277820467948914, + "learning_rate": 1.0305731016330575e-05, + "loss": 0.2535, + "step": 10625 + }, + { + "epoch": 1.0010126940015545, + "grad_norm": 0.6769752502441406, + "learning_rate": 1.0304221700635333e-05, + "loss": 0.216, + "step": 10626 + }, + { + "epoch": 1.0011068980947222, + "grad_norm": 0.728179395198822, + "learning_rate": 1.0302712378003364e-05, + "loss": 0.2411, + "step": 10627 + }, + { + "epoch": 1.0012011021878902, + "grad_norm": 0.6951244473457336, + "learning_rate": 1.0301203048469084e-05, + "loss": 0.208, + "step": 10628 + }, + { + "epoch": 1.001295306281058, + "grad_norm": 0.6990310549736023, + "learning_rate": 1.0299693712066909e-05, + "loss": 0.2267, + "step": 10629 + }, + { + "epoch": 1.0013895103742259, + "grad_norm": 0.6202535629272461, + "learning_rate": 1.0298184368831254e-05, + "loss": 0.2249, + "step": 10630 + }, + { + "epoch": 1.0014837144673936, + "grad_norm": 0.5896943211555481, + "learning_rate": 1.0296675018796536e-05, + "loss": 0.2036, + "step": 10631 + }, + { + "epoch": 1.0015779185605616, + "grad_norm": 0.6231122016906738, + "learning_rate": 1.0295165661997164e-05, + "loss": 0.2477, + "step": 10632 + }, + { + "epoch": 1.0016721226537293, + "grad_norm": 0.6731133460998535, + "learning_rate": 1.0293656298467562e-05, + "loss": 0.2097, + "step": 10633 + }, + { + "epoch": 1.0017663267468973, + "grad_norm": 0.645645797252655, + "learning_rate": 1.0292146928242141e-05, + "loss": 0.2147, + "step": 10634 + }, + { + "epoch": 1.001860530840065, + "grad_norm": 0.7247464656829834, + "learning_rate": 1.029063755135532e-05, + "loss": 0.1859, + "step": 10635 + }, + { + "epoch": 1.001954734933233, + "grad_norm": 0.811730146408081, + "learning_rate": 1.028912816784151e-05, + "loss": 0.2021, + "step": 10636 + }, + { + "epoch": 1.0020489390264007, + "grad_norm": 0.7885605692863464, + "learning_rate": 1.0287618777735132e-05, + "loss": 0.2245, + "step": 10637 + }, + { + "epoch": 1.0021431431195686, + "grad_norm": 0.652739405632019, + "learning_rate": 1.0286109381070603e-05, + "loss": 0.2233, + "step": 10638 + }, + { + "epoch": 1.0022373472127364, + "grad_norm": 0.7091842889785767, + "learning_rate": 1.0284599977882336e-05, + "loss": 0.1906, + "step": 10639 + }, + { + "epoch": 1.0023315513059043, + "grad_norm": 0.6556228399276733, + "learning_rate": 1.0283090568204752e-05, + "loss": 0.2275, + "step": 10640 + }, + { + "epoch": 1.002425755399072, + "grad_norm": 0.7560815811157227, + "learning_rate": 1.0281581152072261e-05, + "loss": 0.1991, + "step": 10641 + }, + { + "epoch": 1.00251995949224, + "grad_norm": 0.6686923503875732, + "learning_rate": 1.0280071729519285e-05, + "loss": 0.2113, + "step": 10642 + }, + { + "epoch": 1.0026141635854078, + "grad_norm": 0.6659157872200012, + "learning_rate": 1.0278562300580246e-05, + "loss": 0.2184, + "step": 10643 + }, + { + "epoch": 1.0027083676785757, + "grad_norm": 0.7567133903503418, + "learning_rate": 1.027705286528955e-05, + "loss": 0.2211, + "step": 10644 + }, + { + "epoch": 1.0028025717717435, + "grad_norm": 0.654885470867157, + "learning_rate": 1.0275543423681622e-05, + "loss": 0.2028, + "step": 10645 + }, + { + "epoch": 1.0028967758649114, + "grad_norm": 0.7210602164268494, + "learning_rate": 1.0274033975790878e-05, + "loss": 0.2532, + "step": 10646 + }, + { + "epoch": 1.0029909799580792, + "grad_norm": 0.8513142466545105, + "learning_rate": 1.0272524521651732e-05, + "loss": 0.202, + "step": 10647 + }, + { + "epoch": 1.0030851840512471, + "grad_norm": 0.8838748335838318, + "learning_rate": 1.027101506129861e-05, + "loss": 0.2338, + "step": 10648 + }, + { + "epoch": 1.0031793881444149, + "grad_norm": 0.9266117215156555, + "learning_rate": 1.0269505594765925e-05, + "loss": 0.2567, + "step": 10649 + }, + { + "epoch": 1.0032735922375826, + "grad_norm": 0.6988517045974731, + "learning_rate": 1.0267996122088095e-05, + "loss": 0.1968, + "step": 10650 + }, + { + "epoch": 1.0033677963307506, + "grad_norm": 0.6875061988830566, + "learning_rate": 1.0266486643299539e-05, + "loss": 0.2122, + "step": 10651 + }, + { + "epoch": 1.0034620004239183, + "grad_norm": 0.7033585906028748, + "learning_rate": 1.0264977158434673e-05, + "loss": 0.2196, + "step": 10652 + }, + { + "epoch": 1.0035562045170863, + "grad_norm": 0.7428792715072632, + "learning_rate": 1.026346766752792e-05, + "loss": 0.2183, + "step": 10653 + }, + { + "epoch": 1.003650408610254, + "grad_norm": 0.6367530822753906, + "learning_rate": 1.0261958170613697e-05, + "loss": 0.2193, + "step": 10654 + }, + { + "epoch": 1.003744612703422, + "grad_norm": 0.6572229862213135, + "learning_rate": 1.0260448667726424e-05, + "loss": 0.2164, + "step": 10655 + }, + { + "epoch": 1.0038388167965897, + "grad_norm": 0.5711113810539246, + "learning_rate": 1.0258939158900514e-05, + "loss": 0.1956, + "step": 10656 + }, + { + "epoch": 1.0039330208897577, + "grad_norm": 0.6450173258781433, + "learning_rate": 1.0257429644170393e-05, + "loss": 0.1928, + "step": 10657 + }, + { + "epoch": 1.0040272249829254, + "grad_norm": 0.6765127182006836, + "learning_rate": 1.025592012357048e-05, + "loss": 0.2384, + "step": 10658 + }, + { + "epoch": 1.0041214290760934, + "grad_norm": 0.6757077574729919, + "learning_rate": 1.0254410597135189e-05, + "loss": 0.2337, + "step": 10659 + }, + { + "epoch": 1.004215633169261, + "grad_norm": 0.611599862575531, + "learning_rate": 1.0252901064898949e-05, + "loss": 0.19, + "step": 10660 + }, + { + "epoch": 1.004309837262429, + "grad_norm": 0.5806671380996704, + "learning_rate": 1.0251391526896169e-05, + "loss": 0.1947, + "step": 10661 + }, + { + "epoch": 1.0044040413555968, + "grad_norm": 0.67947918176651, + "learning_rate": 1.0249881983161272e-05, + "loss": 0.2318, + "step": 10662 + }, + { + "epoch": 1.0044982454487648, + "grad_norm": 0.725274384021759, + "learning_rate": 1.0248372433728682e-05, + "loss": 0.2054, + "step": 10663 + }, + { + "epoch": 1.0045924495419325, + "grad_norm": 0.6442673802375793, + "learning_rate": 1.0246862878632815e-05, + "loss": 0.1971, + "step": 10664 + }, + { + "epoch": 1.0046866536351005, + "grad_norm": 0.6970698833465576, + "learning_rate": 1.0245353317908094e-05, + "loss": 0.2096, + "step": 10665 + }, + { + "epoch": 1.0047808577282682, + "grad_norm": 0.8498097658157349, + "learning_rate": 1.0243843751588937e-05, + "loss": 0.2019, + "step": 10666 + }, + { + "epoch": 1.0048750618214362, + "grad_norm": 0.60224848985672, + "learning_rate": 1.0242334179709763e-05, + "loss": 0.1702, + "step": 10667 + }, + { + "epoch": 1.004969265914604, + "grad_norm": 0.6674405932426453, + "learning_rate": 1.0240824602304997e-05, + "loss": 0.2119, + "step": 10668 + }, + { + "epoch": 1.0050634700077719, + "grad_norm": 0.59090256690979, + "learning_rate": 1.023931501940906e-05, + "loss": 0.1856, + "step": 10669 + }, + { + "epoch": 1.0051576741009396, + "grad_norm": 0.5844379663467407, + "learning_rate": 1.0237805431056369e-05, + "loss": 0.1678, + "step": 10670 + }, + { + "epoch": 1.0052518781941076, + "grad_norm": 0.7779214978218079, + "learning_rate": 1.0236295837281347e-05, + "loss": 0.2177, + "step": 10671 + }, + { + "epoch": 1.0053460822872753, + "grad_norm": 0.7210147380828857, + "learning_rate": 1.0234786238118411e-05, + "loss": 0.1968, + "step": 10672 + }, + { + "epoch": 1.0054402863804432, + "grad_norm": 0.6426734924316406, + "learning_rate": 1.0233276633601986e-05, + "loss": 0.204, + "step": 10673 + }, + { + "epoch": 1.005534490473611, + "grad_norm": 0.786760687828064, + "learning_rate": 1.0231767023766497e-05, + "loss": 0.2029, + "step": 10674 + }, + { + "epoch": 1.005628694566779, + "grad_norm": 0.6654820442199707, + "learning_rate": 1.023025740864636e-05, + "loss": 0.2221, + "step": 10675 + }, + { + "epoch": 1.0057228986599467, + "grad_norm": 0.6283759474754333, + "learning_rate": 1.0228747788275997e-05, + "loss": 0.2045, + "step": 10676 + }, + { + "epoch": 1.0058171027531146, + "grad_norm": 0.6686487197875977, + "learning_rate": 1.0227238162689832e-05, + "loss": 0.1847, + "step": 10677 + }, + { + "epoch": 1.0059113068462824, + "grad_norm": 0.7455902695655823, + "learning_rate": 1.0225728531922284e-05, + "loss": 0.252, + "step": 10678 + }, + { + "epoch": 1.0060055109394503, + "grad_norm": 0.6051475405693054, + "learning_rate": 1.0224218896007776e-05, + "loss": 0.2022, + "step": 10679 + }, + { + "epoch": 1.006099715032618, + "grad_norm": 0.6798374652862549, + "learning_rate": 1.0222709254980733e-05, + "loss": 0.1825, + "step": 10680 + }, + { + "epoch": 1.006193919125786, + "grad_norm": 0.8446162939071655, + "learning_rate": 1.0221199608875572e-05, + "loss": 0.2211, + "step": 10681 + }, + { + "epoch": 1.0062881232189538, + "grad_norm": 0.6640855073928833, + "learning_rate": 1.021968995772672e-05, + "loss": 0.2039, + "step": 10682 + }, + { + "epoch": 1.0063823273121217, + "grad_norm": 0.7586714029312134, + "learning_rate": 1.0218180301568595e-05, + "loss": 0.2162, + "step": 10683 + }, + { + "epoch": 1.0064765314052895, + "grad_norm": 0.6467330455780029, + "learning_rate": 1.0216670640435622e-05, + "loss": 0.1713, + "step": 10684 + }, + { + "epoch": 1.0065707354984574, + "grad_norm": 0.5871751308441162, + "learning_rate": 1.0215160974362224e-05, + "loss": 0.1872, + "step": 10685 + }, + { + "epoch": 1.0066649395916252, + "grad_norm": 0.6697161197662354, + "learning_rate": 1.0213651303382824e-05, + "loss": 0.2269, + "step": 10686 + }, + { + "epoch": 1.0067591436847931, + "grad_norm": 0.6335304975509644, + "learning_rate": 1.021214162753184e-05, + "loss": 0.2163, + "step": 10687 + }, + { + "epoch": 1.0068533477779609, + "grad_norm": 0.5786669254302979, + "learning_rate": 1.0210631946843703e-05, + "loss": 0.2004, + "step": 10688 + }, + { + "epoch": 1.0069475518711288, + "grad_norm": 0.6199154257774353, + "learning_rate": 1.0209122261352831e-05, + "loss": 0.2083, + "step": 10689 + }, + { + "epoch": 1.0070417559642966, + "grad_norm": 0.7040600180625916, + "learning_rate": 1.0207612571093648e-05, + "loss": 0.2471, + "step": 10690 + }, + { + "epoch": 1.0071359600574645, + "grad_norm": 0.7345625162124634, + "learning_rate": 1.0206102876100576e-05, + "loss": 0.228, + "step": 10691 + }, + { + "epoch": 1.0072301641506323, + "grad_norm": 0.6340999007225037, + "learning_rate": 1.020459317640804e-05, + "loss": 0.2024, + "step": 10692 + }, + { + "epoch": 1.0073243682438002, + "grad_norm": 0.7458804249763489, + "learning_rate": 1.0203083472050463e-05, + "loss": 0.2293, + "step": 10693 + }, + { + "epoch": 1.007418572336968, + "grad_norm": 0.5789769291877747, + "learning_rate": 1.0201573763062272e-05, + "loss": 0.1882, + "step": 10694 + }, + { + "epoch": 1.007512776430136, + "grad_norm": 0.624512255191803, + "learning_rate": 1.0200064049477885e-05, + "loss": 0.2011, + "step": 10695 + }, + { + "epoch": 1.0076069805233037, + "grad_norm": 0.9252504706382751, + "learning_rate": 1.0198554331331725e-05, + "loss": 0.2212, + "step": 10696 + }, + { + "epoch": 1.0077011846164716, + "grad_norm": 0.7407596111297607, + "learning_rate": 1.0197044608658223e-05, + "loss": 0.221, + "step": 10697 + }, + { + "epoch": 1.0077953887096394, + "grad_norm": 0.6435703039169312, + "learning_rate": 1.01955348814918e-05, + "loss": 0.2135, + "step": 10698 + }, + { + "epoch": 1.0078895928028073, + "grad_norm": 0.5939406156539917, + "learning_rate": 1.0194025149866875e-05, + "loss": 0.1927, + "step": 10699 + }, + { + "epoch": 1.007983796895975, + "grad_norm": 0.6581075191497803, + "learning_rate": 1.0192515413817882e-05, + "loss": 0.2022, + "step": 10700 + }, + { + "epoch": 1.008078000989143, + "grad_norm": 0.7556645274162292, + "learning_rate": 1.0191005673379235e-05, + "loss": 0.2247, + "step": 10701 + }, + { + "epoch": 1.0081722050823108, + "grad_norm": 0.671176016330719, + "learning_rate": 1.0189495928585367e-05, + "loss": 0.234, + "step": 10702 + }, + { + "epoch": 1.0082664091754787, + "grad_norm": 0.6827737092971802, + "learning_rate": 1.0187986179470698e-05, + "loss": 0.2139, + "step": 10703 + }, + { + "epoch": 1.0083606132686465, + "grad_norm": 0.9258418679237366, + "learning_rate": 1.0186476426069649e-05, + "loss": 0.2034, + "step": 10704 + }, + { + "epoch": 1.0084548173618144, + "grad_norm": 0.6644185781478882, + "learning_rate": 1.0184966668416653e-05, + "loss": 0.2176, + "step": 10705 + }, + { + "epoch": 1.0085490214549822, + "grad_norm": 0.5872127413749695, + "learning_rate": 1.0183456906546132e-05, + "loss": 0.178, + "step": 10706 + }, + { + "epoch": 1.0086432255481501, + "grad_norm": 0.6497471928596497, + "learning_rate": 1.0181947140492507e-05, + "loss": 0.2437, + "step": 10707 + }, + { + "epoch": 1.0087374296413179, + "grad_norm": 3.966637372970581, + "learning_rate": 1.018043737029021e-05, + "loss": 0.2113, + "step": 10708 + }, + { + "epoch": 1.0088316337344858, + "grad_norm": 0.6871297359466553, + "learning_rate": 1.0178927595973658e-05, + "loss": 0.2056, + "step": 10709 + }, + { + "epoch": 1.0089258378276535, + "grad_norm": 0.6563407182693481, + "learning_rate": 1.0177417817577282e-05, + "loss": 0.2176, + "step": 10710 + }, + { + "epoch": 1.0090200419208215, + "grad_norm": 0.7464789748191833, + "learning_rate": 1.0175908035135505e-05, + "loss": 0.2365, + "step": 10711 + }, + { + "epoch": 1.0091142460139892, + "grad_norm": 0.6258571743965149, + "learning_rate": 1.017439824868275e-05, + "loss": 0.1954, + "step": 10712 + }, + { + "epoch": 1.0092084501071572, + "grad_norm": 0.7025963664054871, + "learning_rate": 1.0172888458253447e-05, + "loss": 0.2411, + "step": 10713 + }, + { + "epoch": 1.009302654200325, + "grad_norm": 0.7283008098602295, + "learning_rate": 1.017137866388202e-05, + "loss": 0.1888, + "step": 10714 + }, + { + "epoch": 1.009396858293493, + "grad_norm": 0.6318821907043457, + "learning_rate": 1.0169868865602896e-05, + "loss": 0.1918, + "step": 10715 + }, + { + "epoch": 1.0094910623866606, + "grad_norm": 0.6122881770133972, + "learning_rate": 1.0168359063450496e-05, + "loss": 0.1975, + "step": 10716 + }, + { + "epoch": 1.0095852664798286, + "grad_norm": 0.6684684157371521, + "learning_rate": 1.0166849257459252e-05, + "loss": 0.2264, + "step": 10717 + }, + { + "epoch": 1.0096794705729963, + "grad_norm": 0.6784617304801941, + "learning_rate": 1.0165339447663586e-05, + "loss": 0.2152, + "step": 10718 + }, + { + "epoch": 1.0097736746661643, + "grad_norm": 0.5989708304405212, + "learning_rate": 1.0163829634097924e-05, + "loss": 0.1978, + "step": 10719 + }, + { + "epoch": 1.009867878759332, + "grad_norm": 0.6263830661773682, + "learning_rate": 1.0162319816796695e-05, + "loss": 0.1702, + "step": 10720 + }, + { + "epoch": 1.0099620828525, + "grad_norm": 0.7663062214851379, + "learning_rate": 1.016080999579432e-05, + "loss": 0.2473, + "step": 10721 + }, + { + "epoch": 1.0100562869456677, + "grad_norm": 0.7144594192504883, + "learning_rate": 1.0159300171125232e-05, + "loss": 0.2081, + "step": 10722 + }, + { + "epoch": 1.0101504910388357, + "grad_norm": 0.6704587936401367, + "learning_rate": 1.0157790342823852e-05, + "loss": 0.2102, + "step": 10723 + }, + { + "epoch": 1.0102446951320034, + "grad_norm": 0.6900226473808289, + "learning_rate": 1.0156280510924605e-05, + "loss": 0.2229, + "step": 10724 + }, + { + "epoch": 1.0103388992251714, + "grad_norm": 0.630725085735321, + "learning_rate": 1.0154770675461925e-05, + "loss": 0.2113, + "step": 10725 + }, + { + "epoch": 1.0104331033183391, + "grad_norm": 0.6396355032920837, + "learning_rate": 1.0153260836470233e-05, + "loss": 0.2056, + "step": 10726 + }, + { + "epoch": 1.010527307411507, + "grad_norm": 0.6431342959403992, + "learning_rate": 1.0151750993983956e-05, + "loss": 0.2118, + "step": 10727 + }, + { + "epoch": 1.0106215115046748, + "grad_norm": 0.6093985438346863, + "learning_rate": 1.0150241148037526e-05, + "loss": 0.1958, + "step": 10728 + }, + { + "epoch": 1.0107157155978428, + "grad_norm": 0.7072876691818237, + "learning_rate": 1.014873129866536e-05, + "loss": 0.2537, + "step": 10729 + }, + { + "epoch": 1.0108099196910105, + "grad_norm": 0.6593238711357117, + "learning_rate": 1.0147221445901893e-05, + "loss": 0.2219, + "step": 10730 + }, + { + "epoch": 1.0109041237841785, + "grad_norm": 0.6366419792175293, + "learning_rate": 1.0145711589781549e-05, + "loss": 0.1772, + "step": 10731 + }, + { + "epoch": 1.0109983278773462, + "grad_norm": 0.781413197517395, + "learning_rate": 1.0144201730338754e-05, + "loss": 0.2034, + "step": 10732 + }, + { + "epoch": 1.0110925319705142, + "grad_norm": 0.6375175714492798, + "learning_rate": 1.0142691867607937e-05, + "loss": 0.2266, + "step": 10733 + }, + { + "epoch": 1.011186736063682, + "grad_norm": 0.6226152181625366, + "learning_rate": 1.0141182001623526e-05, + "loss": 0.2056, + "step": 10734 + }, + { + "epoch": 1.0112809401568499, + "grad_norm": 0.6545280814170837, + "learning_rate": 1.0139672132419946e-05, + "loss": 0.2032, + "step": 10735 + }, + { + "epoch": 1.0113751442500176, + "grad_norm": 0.6611790657043457, + "learning_rate": 1.0138162260031625e-05, + "loss": 0.2471, + "step": 10736 + }, + { + "epoch": 1.0114693483431856, + "grad_norm": 0.6579394340515137, + "learning_rate": 1.0136652384492993e-05, + "loss": 0.2119, + "step": 10737 + }, + { + "epoch": 1.0115635524363533, + "grad_norm": 0.6166621446609497, + "learning_rate": 1.0135142505838473e-05, + "loss": 0.1873, + "step": 10738 + }, + { + "epoch": 1.0116577565295213, + "grad_norm": 0.6546235680580139, + "learning_rate": 1.0133632624102495e-05, + "loss": 0.2196, + "step": 10739 + }, + { + "epoch": 1.011751960622689, + "grad_norm": 0.728863000869751, + "learning_rate": 1.013212273931949e-05, + "loss": 0.1993, + "step": 10740 + }, + { + "epoch": 1.011846164715857, + "grad_norm": 0.9198817014694214, + "learning_rate": 1.0130612851523877e-05, + "loss": 0.2055, + "step": 10741 + }, + { + "epoch": 1.0119403688090247, + "grad_norm": 0.7382006049156189, + "learning_rate": 1.0129102960750092e-05, + "loss": 0.2055, + "step": 10742 + }, + { + "epoch": 1.0120345729021927, + "grad_norm": 0.6523581147193909, + "learning_rate": 1.012759306703256e-05, + "loss": 0.2293, + "step": 10743 + }, + { + "epoch": 1.0121287769953604, + "grad_norm": 0.6328745484352112, + "learning_rate": 1.0126083170405707e-05, + "loss": 0.1897, + "step": 10744 + }, + { + "epoch": 1.0122229810885284, + "grad_norm": 0.700139045715332, + "learning_rate": 1.0124573270903963e-05, + "loss": 0.2323, + "step": 10745 + }, + { + "epoch": 1.012317185181696, + "grad_norm": 0.7212764620780945, + "learning_rate": 1.0123063368561759e-05, + "loss": 0.2214, + "step": 10746 + }, + { + "epoch": 1.012411389274864, + "grad_norm": 0.6312685608863831, + "learning_rate": 1.0121553463413514e-05, + "loss": 0.2082, + "step": 10747 + }, + { + "epoch": 1.0125055933680318, + "grad_norm": 0.6925767064094543, + "learning_rate": 1.0120043555493669e-05, + "loss": 0.1998, + "step": 10748 + }, + { + "epoch": 1.0125997974611998, + "grad_norm": 0.6870275139808655, + "learning_rate": 1.0118533644836638e-05, + "loss": 0.2121, + "step": 10749 + }, + { + "epoch": 1.0126940015543675, + "grad_norm": 0.7039703726768494, + "learning_rate": 1.0117023731476863e-05, + "loss": 0.2, + "step": 10750 + }, + { + "epoch": 1.0127882056475355, + "grad_norm": 0.6422266364097595, + "learning_rate": 1.0115513815448763e-05, + "loss": 0.2028, + "step": 10751 + }, + { + "epoch": 1.0128824097407032, + "grad_norm": 0.6739658117294312, + "learning_rate": 1.0114003896786768e-05, + "loss": 0.2156, + "step": 10752 + }, + { + "epoch": 1.0129766138338712, + "grad_norm": 0.7240651249885559, + "learning_rate": 1.011249397552531e-05, + "loss": 0.2283, + "step": 10753 + }, + { + "epoch": 1.013070817927039, + "grad_norm": 0.6318476796150208, + "learning_rate": 1.0110984051698815e-05, + "loss": 0.1977, + "step": 10754 + }, + { + "epoch": 1.0131650220202069, + "grad_norm": 0.6608891487121582, + "learning_rate": 1.0109474125341714e-05, + "loss": 0.2149, + "step": 10755 + }, + { + "epoch": 1.0132592261133746, + "grad_norm": 0.6064279675483704, + "learning_rate": 1.0107964196488429e-05, + "loss": 0.2058, + "step": 10756 + }, + { + "epoch": 1.0133534302065426, + "grad_norm": 0.6015726327896118, + "learning_rate": 1.0106454265173396e-05, + "loss": 0.1744, + "step": 10757 + }, + { + "epoch": 1.0134476342997103, + "grad_norm": 0.6723529100418091, + "learning_rate": 1.0104944331431042e-05, + "loss": 0.2059, + "step": 10758 + }, + { + "epoch": 1.0135418383928783, + "grad_norm": 0.6537312865257263, + "learning_rate": 1.0103434395295792e-05, + "loss": 0.1999, + "step": 10759 + }, + { + "epoch": 1.013636042486046, + "grad_norm": 0.5958169102668762, + "learning_rate": 1.0101924456802081e-05, + "loss": 0.186, + "step": 10760 + }, + { + "epoch": 1.013730246579214, + "grad_norm": 0.6968947052955627, + "learning_rate": 1.0100414515984334e-05, + "loss": 0.2188, + "step": 10761 + }, + { + "epoch": 1.0138244506723817, + "grad_norm": 0.6198421120643616, + "learning_rate": 1.009890457287698e-05, + "loss": 0.2056, + "step": 10762 + }, + { + "epoch": 1.0139186547655497, + "grad_norm": 0.667673647403717, + "learning_rate": 1.009739462751445e-05, + "loss": 0.2032, + "step": 10763 + }, + { + "epoch": 1.0140128588587174, + "grad_norm": 0.6936910152435303, + "learning_rate": 1.0095884679931167e-05, + "loss": 0.2003, + "step": 10764 + }, + { + "epoch": 1.0141070629518854, + "grad_norm": 0.7887927293777466, + "learning_rate": 1.009437473016157e-05, + "loss": 0.1833, + "step": 10765 + }, + { + "epoch": 1.014201267045053, + "grad_norm": 0.7284408211708069, + "learning_rate": 1.0092864778240083e-05, + "loss": 0.2124, + "step": 10766 + }, + { + "epoch": 1.014295471138221, + "grad_norm": 0.6176424622535706, + "learning_rate": 1.0091354824201132e-05, + "loss": 0.2068, + "step": 10767 + }, + { + "epoch": 1.0143896752313888, + "grad_norm": 0.6302241086959839, + "learning_rate": 1.0089844868079154e-05, + "loss": 0.2106, + "step": 10768 + }, + { + "epoch": 1.0144838793245567, + "grad_norm": 0.58731609582901, + "learning_rate": 1.0088334909908573e-05, + "loss": 0.2072, + "step": 10769 + }, + { + "epoch": 1.0145780834177245, + "grad_norm": 0.6222063302993774, + "learning_rate": 1.0086824949723819e-05, + "loss": 0.1731, + "step": 10770 + }, + { + "epoch": 1.0146722875108924, + "grad_norm": 0.7802337408065796, + "learning_rate": 1.0085314987559323e-05, + "loss": 0.1853, + "step": 10771 + }, + { + "epoch": 1.0147664916040602, + "grad_norm": 0.7324212789535522, + "learning_rate": 1.008380502344951e-05, + "loss": 0.2386, + "step": 10772 + }, + { + "epoch": 1.0148606956972281, + "grad_norm": 0.6973498463630676, + "learning_rate": 1.0082295057428815e-05, + "loss": 0.2009, + "step": 10773 + }, + { + "epoch": 1.0149548997903959, + "grad_norm": 0.6584519743919373, + "learning_rate": 1.0080785089531664e-05, + "loss": 0.213, + "step": 10774 + }, + { + "epoch": 1.0150491038835638, + "grad_norm": 0.6616883277893066, + "learning_rate": 1.007927511979249e-05, + "loss": 0.2092, + "step": 10775 + }, + { + "epoch": 1.0151433079767316, + "grad_norm": 0.6478118300437927, + "learning_rate": 1.0077765148245719e-05, + "loss": 0.2311, + "step": 10776 + }, + { + "epoch": 1.0152375120698995, + "grad_norm": 0.6347939372062683, + "learning_rate": 1.0076255174925784e-05, + "loss": 0.1993, + "step": 10777 + }, + { + "epoch": 1.0153317161630673, + "grad_norm": 0.6749362945556641, + "learning_rate": 1.0074745199867112e-05, + "loss": 0.2187, + "step": 10778 + }, + { + "epoch": 1.0154259202562352, + "grad_norm": 0.6531940698623657, + "learning_rate": 1.0073235223104134e-05, + "loss": 0.2265, + "step": 10779 + }, + { + "epoch": 1.015520124349403, + "grad_norm": 0.6377983689308167, + "learning_rate": 1.0071725244671281e-05, + "loss": 0.2221, + "step": 10780 + }, + { + "epoch": 1.015614328442571, + "grad_norm": 0.6461182236671448, + "learning_rate": 1.0070215264602979e-05, + "loss": 0.2089, + "step": 10781 + }, + { + "epoch": 1.0157085325357387, + "grad_norm": 0.6446294188499451, + "learning_rate": 1.0068705282933663e-05, + "loss": 0.1976, + "step": 10782 + }, + { + "epoch": 1.0158027366289066, + "grad_norm": 0.6441165208816528, + "learning_rate": 1.0067195299697759e-05, + "loss": 0.2274, + "step": 10783 + }, + { + "epoch": 1.0158969407220744, + "grad_norm": 0.6639395952224731, + "learning_rate": 1.0065685314929696e-05, + "loss": 0.2471, + "step": 10784 + }, + { + "epoch": 1.0159911448152423, + "grad_norm": 0.6566084027290344, + "learning_rate": 1.0064175328663909e-05, + "loss": 0.2144, + "step": 10785 + }, + { + "epoch": 1.01608534890841, + "grad_norm": 0.667574942111969, + "learning_rate": 1.0062665340934826e-05, + "loss": 0.2055, + "step": 10786 + }, + { + "epoch": 1.016179553001578, + "grad_norm": 0.6373234987258911, + "learning_rate": 1.0061155351776872e-05, + "loss": 0.1934, + "step": 10787 + }, + { + "epoch": 1.0162737570947458, + "grad_norm": 0.6480892300605774, + "learning_rate": 1.0059645361224489e-05, + "loss": 0.1978, + "step": 10788 + }, + { + "epoch": 1.0163679611879135, + "grad_norm": 0.6134656071662903, + "learning_rate": 1.0058135369312091e-05, + "loss": 0.2006, + "step": 10789 + }, + { + "epoch": 1.0164621652810815, + "grad_norm": 0.6004596948623657, + "learning_rate": 1.0056625376074122e-05, + "loss": 0.1936, + "step": 10790 + }, + { + "epoch": 1.0165563693742492, + "grad_norm": 0.6127801537513733, + "learning_rate": 1.0055115381545006e-05, + "loss": 0.2298, + "step": 10791 + }, + { + "epoch": 1.0166505734674172, + "grad_norm": 0.6786653995513916, + "learning_rate": 1.0053605385759174e-05, + "loss": 0.2204, + "step": 10792 + }, + { + "epoch": 1.016744777560585, + "grad_norm": 0.7291155457496643, + "learning_rate": 1.0052095388751054e-05, + "loss": 0.1885, + "step": 10793 + }, + { + "epoch": 1.0168389816537529, + "grad_norm": 0.9409392476081848, + "learning_rate": 1.0050585390555082e-05, + "loss": 0.2066, + "step": 10794 + }, + { + "epoch": 1.0169331857469206, + "grad_norm": 0.6275922060012817, + "learning_rate": 1.0049075391205682e-05, + "loss": 0.2104, + "step": 10795 + }, + { + "epoch": 1.0170273898400886, + "grad_norm": 0.6719104647636414, + "learning_rate": 1.0047565390737289e-05, + "loss": 0.2161, + "step": 10796 + }, + { + "epoch": 1.0171215939332563, + "grad_norm": 0.679837703704834, + "learning_rate": 1.004605538918433e-05, + "loss": 0.2249, + "step": 10797 + }, + { + "epoch": 1.0172157980264243, + "grad_norm": 0.6401480436325073, + "learning_rate": 1.004454538658124e-05, + "loss": 0.2195, + "step": 10798 + }, + { + "epoch": 1.017310002119592, + "grad_norm": 0.6298019886016846, + "learning_rate": 1.0043035382962443e-05, + "loss": 0.205, + "step": 10799 + }, + { + "epoch": 1.01740420621276, + "grad_norm": 0.6132528781890869, + "learning_rate": 1.0041525378362376e-05, + "loss": 0.1985, + "step": 10800 + }, + { + "epoch": 1.0174984103059277, + "grad_norm": 0.7292924523353577, + "learning_rate": 1.0040015372815461e-05, + "loss": 0.1995, + "step": 10801 + }, + { + "epoch": 1.0175926143990957, + "grad_norm": 1.1963964700698853, + "learning_rate": 1.0038505366356137e-05, + "loss": 0.2185, + "step": 10802 + }, + { + "epoch": 1.0176868184922634, + "grad_norm": 0.5991237163543701, + "learning_rate": 1.0036995359018833e-05, + "loss": 0.2053, + "step": 10803 + }, + { + "epoch": 1.0177810225854313, + "grad_norm": 0.6105757355690002, + "learning_rate": 1.0035485350837972e-05, + "loss": 0.2409, + "step": 10804 + }, + { + "epoch": 1.017875226678599, + "grad_norm": 0.7335970997810364, + "learning_rate": 1.0033975341847995e-05, + "loss": 0.2392, + "step": 10805 + }, + { + "epoch": 1.017969430771767, + "grad_norm": 0.6251050233840942, + "learning_rate": 1.0032465332083326e-05, + "loss": 0.1974, + "step": 10806 + }, + { + "epoch": 1.0180636348649348, + "grad_norm": 0.7072638273239136, + "learning_rate": 1.0030955321578396e-05, + "loss": 0.228, + "step": 10807 + }, + { + "epoch": 1.0181578389581027, + "grad_norm": 0.6788835525512695, + "learning_rate": 1.002944531036764e-05, + "loss": 0.1957, + "step": 10808 + }, + { + "epoch": 1.0182520430512705, + "grad_norm": 0.6908219456672668, + "learning_rate": 1.0027935298485483e-05, + "loss": 0.2295, + "step": 10809 + }, + { + "epoch": 1.0183462471444384, + "grad_norm": 0.6195359230041504, + "learning_rate": 1.0026425285966359e-05, + "loss": 0.2202, + "step": 10810 + }, + { + "epoch": 1.0184404512376062, + "grad_norm": 0.6958038210868835, + "learning_rate": 1.0024915272844697e-05, + "loss": 0.1981, + "step": 10811 + }, + { + "epoch": 1.0185346553307741, + "grad_norm": 0.7101927995681763, + "learning_rate": 1.0023405259154928e-05, + "loss": 0.1961, + "step": 10812 + }, + { + "epoch": 1.0186288594239419, + "grad_norm": 0.6944587230682373, + "learning_rate": 1.0021895244931484e-05, + "loss": 0.2171, + "step": 10813 + }, + { + "epoch": 1.0187230635171098, + "grad_norm": 0.7558724880218506, + "learning_rate": 1.0020385230208793e-05, + "loss": 0.2125, + "step": 10814 + }, + { + "epoch": 1.0188172676102776, + "grad_norm": 0.6603408455848694, + "learning_rate": 1.0018875215021289e-05, + "loss": 0.201, + "step": 10815 + }, + { + "epoch": 1.0189114717034455, + "grad_norm": 0.671822726726532, + "learning_rate": 1.00173651994034e-05, + "loss": 0.2273, + "step": 10816 + }, + { + "epoch": 1.0190056757966133, + "grad_norm": 0.5749891400337219, + "learning_rate": 1.0015855183389555e-05, + "loss": 0.1738, + "step": 10817 + }, + { + "epoch": 1.0190998798897812, + "grad_norm": 0.6361928582191467, + "learning_rate": 1.0014345167014192e-05, + "loss": 0.221, + "step": 10818 + }, + { + "epoch": 1.019194083982949, + "grad_norm": 0.6200592517852783, + "learning_rate": 1.0012835150311735e-05, + "loss": 0.1937, + "step": 10819 + }, + { + "epoch": 1.019288288076117, + "grad_norm": 0.6894503831863403, + "learning_rate": 1.0011325133316618e-05, + "loss": 0.2092, + "step": 10820 + }, + { + "epoch": 1.0193824921692847, + "grad_norm": 0.6642686128616333, + "learning_rate": 1.0009815116063266e-05, + "loss": 0.2247, + "step": 10821 + }, + { + "epoch": 1.0194766962624526, + "grad_norm": 0.6428064703941345, + "learning_rate": 1.0008305098586118e-05, + "loss": 0.2111, + "step": 10822 + }, + { + "epoch": 1.0195709003556204, + "grad_norm": 0.7898526787757874, + "learning_rate": 1.00067950809196e-05, + "loss": 0.2023, + "step": 10823 + }, + { + "epoch": 1.0196651044487883, + "grad_norm": 0.7990137934684753, + "learning_rate": 1.0005285063098142e-05, + "loss": 0.2154, + "step": 10824 + }, + { + "epoch": 1.019759308541956, + "grad_norm": 0.6745801568031311, + "learning_rate": 1.0003775045156181e-05, + "loss": 0.1916, + "step": 10825 + }, + { + "epoch": 1.019853512635124, + "grad_norm": 0.5938249826431274, + "learning_rate": 1.000226502712814e-05, + "loss": 0.184, + "step": 10826 + }, + { + "epoch": 1.0199477167282918, + "grad_norm": 0.7442857623100281, + "learning_rate": 1.0000755009048451e-05, + "loss": 0.2054, + "step": 10827 + }, + { + "epoch": 1.0200419208214597, + "grad_norm": 0.7236988544464111, + "learning_rate": 9.999244990951552e-06, + "loss": 0.2061, + "step": 10828 + }, + { + "epoch": 1.0201361249146275, + "grad_norm": 0.6749070286750793, + "learning_rate": 9.997734972871866e-06, + "loss": 0.2091, + "step": 10829 + }, + { + "epoch": 1.0202303290077954, + "grad_norm": 0.7106935977935791, + "learning_rate": 9.99622495484382e-06, + "loss": 0.2112, + "step": 10830 + }, + { + "epoch": 1.0203245331009632, + "grad_norm": 0.6278476715087891, + "learning_rate": 9.99471493690186e-06, + "loss": 0.1964, + "step": 10831 + }, + { + "epoch": 1.0204187371941311, + "grad_norm": 0.6725781559944153, + "learning_rate": 9.993204919080403e-06, + "loss": 0.2281, + "step": 10832 + }, + { + "epoch": 1.0205129412872989, + "grad_norm": 0.8848792910575867, + "learning_rate": 9.991694901413884e-06, + "loss": 0.2322, + "step": 10833 + }, + { + "epoch": 1.0206071453804668, + "grad_norm": 0.6739000678062439, + "learning_rate": 9.990184883936737e-06, + "loss": 0.2271, + "step": 10834 + }, + { + "epoch": 1.0207013494736346, + "grad_norm": 0.6107320785522461, + "learning_rate": 9.988674866683387e-06, + "loss": 0.2006, + "step": 10835 + }, + { + "epoch": 1.0207955535668025, + "grad_norm": 0.690765917301178, + "learning_rate": 9.987164849688268e-06, + "loss": 0.2221, + "step": 10836 + }, + { + "epoch": 1.0208897576599703, + "grad_norm": 1.850305199623108, + "learning_rate": 9.985654832985811e-06, + "loss": 0.2109, + "step": 10837 + }, + { + "epoch": 1.0209839617531382, + "grad_norm": 0.6318346858024597, + "learning_rate": 9.984144816610446e-06, + "loss": 0.2066, + "step": 10838 + }, + { + "epoch": 1.021078165846306, + "grad_norm": 0.6936797499656677, + "learning_rate": 9.982634800596605e-06, + "loss": 0.2236, + "step": 10839 + }, + { + "epoch": 1.021172369939474, + "grad_norm": 0.6386310458183289, + "learning_rate": 9.981124784978715e-06, + "loss": 0.2171, + "step": 10840 + }, + { + "epoch": 1.0212665740326416, + "grad_norm": 0.6437522768974304, + "learning_rate": 9.97961476979121e-06, + "loss": 0.2275, + "step": 10841 + }, + { + "epoch": 1.0213607781258096, + "grad_norm": 0.7574900388717651, + "learning_rate": 9.97810475506852e-06, + "loss": 0.2567, + "step": 10842 + }, + { + "epoch": 1.0214549822189773, + "grad_norm": 0.7311277389526367, + "learning_rate": 9.976594740845074e-06, + "loss": 0.2187, + "step": 10843 + }, + { + "epoch": 1.0215491863121453, + "grad_norm": 0.6731746196746826, + "learning_rate": 9.975084727155305e-06, + "loss": 0.2042, + "step": 10844 + }, + { + "epoch": 1.021643390405313, + "grad_norm": 0.6768555641174316, + "learning_rate": 9.973574714033646e-06, + "loss": 0.2212, + "step": 10845 + }, + { + "epoch": 1.021737594498481, + "grad_norm": 0.6536171436309814, + "learning_rate": 9.972064701514517e-06, + "loss": 0.2525, + "step": 10846 + }, + { + "epoch": 1.0218317985916487, + "grad_norm": 0.6061612963676453, + "learning_rate": 9.970554689632362e-06, + "loss": 0.1982, + "step": 10847 + }, + { + "epoch": 1.0219260026848167, + "grad_norm": 0.7942737936973572, + "learning_rate": 9.969044678421606e-06, + "loss": 0.2146, + "step": 10848 + }, + { + "epoch": 1.0220202067779844, + "grad_norm": 0.6257474422454834, + "learning_rate": 9.967534667916679e-06, + "loss": 0.2119, + "step": 10849 + }, + { + "epoch": 1.0221144108711524, + "grad_norm": 0.6741740703582764, + "learning_rate": 9.966024658152008e-06, + "loss": 0.2179, + "step": 10850 + }, + { + "epoch": 1.0222086149643201, + "grad_norm": 0.6527076363563538, + "learning_rate": 9.96451464916203e-06, + "loss": 0.2225, + "step": 10851 + }, + { + "epoch": 1.022302819057488, + "grad_norm": 0.8687999248504639, + "learning_rate": 9.963004640981173e-06, + "loss": 0.2367, + "step": 10852 + }, + { + "epoch": 1.0223970231506558, + "grad_norm": 0.626018762588501, + "learning_rate": 9.961494633643865e-06, + "loss": 0.2015, + "step": 10853 + }, + { + "epoch": 1.0224912272438238, + "grad_norm": 0.6199095249176025, + "learning_rate": 9.959984627184542e-06, + "loss": 0.199, + "step": 10854 + }, + { + "epoch": 1.0225854313369915, + "grad_norm": 0.6614289283752441, + "learning_rate": 9.958474621637631e-06, + "loss": 0.2275, + "step": 10855 + }, + { + "epoch": 1.0226796354301595, + "grad_norm": 0.5417171120643616, + "learning_rate": 9.956964617037559e-06, + "loss": 0.1752, + "step": 10856 + }, + { + "epoch": 1.0227738395233272, + "grad_norm": 0.6422715783119202, + "learning_rate": 9.955454613418764e-06, + "loss": 0.2182, + "step": 10857 + }, + { + "epoch": 1.0228680436164952, + "grad_norm": 0.690487802028656, + "learning_rate": 9.953944610815672e-06, + "loss": 0.163, + "step": 10858 + }, + { + "epoch": 1.022962247709663, + "grad_norm": 0.6234443187713623, + "learning_rate": 9.952434609262714e-06, + "loss": 0.1922, + "step": 10859 + }, + { + "epoch": 1.023056451802831, + "grad_norm": 0.7341983914375305, + "learning_rate": 9.95092460879432e-06, + "loss": 0.1863, + "step": 10860 + }, + { + "epoch": 1.0231506558959986, + "grad_norm": 0.6341428160667419, + "learning_rate": 9.949414609444922e-06, + "loss": 0.2195, + "step": 10861 + }, + { + "epoch": 1.0232448599891666, + "grad_norm": 0.6796596646308899, + "learning_rate": 9.947904611248949e-06, + "loss": 0.1961, + "step": 10862 + }, + { + "epoch": 1.0233390640823343, + "grad_norm": 0.6194525361061096, + "learning_rate": 9.946394614240828e-06, + "loss": 0.2029, + "step": 10863 + }, + { + "epoch": 1.0234332681755023, + "grad_norm": 0.7756684422492981, + "learning_rate": 9.944884618454996e-06, + "loss": 0.1822, + "step": 10864 + }, + { + "epoch": 1.02352747226867, + "grad_norm": 0.6143449544906616, + "learning_rate": 9.943374623925883e-06, + "loss": 0.1737, + "step": 10865 + }, + { + "epoch": 1.023621676361838, + "grad_norm": 0.6507072448730469, + "learning_rate": 9.941864630687909e-06, + "loss": 0.2248, + "step": 10866 + }, + { + "epoch": 1.0237158804550057, + "grad_norm": 0.6425033807754517, + "learning_rate": 9.940354638775514e-06, + "loss": 0.2249, + "step": 10867 + }, + { + "epoch": 1.0238100845481737, + "grad_norm": 0.6521602272987366, + "learning_rate": 9.938844648223131e-06, + "loss": 0.2257, + "step": 10868 + }, + { + "epoch": 1.0239042886413414, + "grad_norm": 0.6503294706344604, + "learning_rate": 9.93733465906518e-06, + "loss": 0.2155, + "step": 10869 + }, + { + "epoch": 1.0239984927345094, + "grad_norm": 0.6127226948738098, + "learning_rate": 9.935824671336094e-06, + "loss": 0.1817, + "step": 10870 + }, + { + "epoch": 1.0240926968276771, + "grad_norm": 0.6373822093009949, + "learning_rate": 9.934314685070306e-06, + "loss": 0.1933, + "step": 10871 + }, + { + "epoch": 1.024186900920845, + "grad_norm": 0.653621256351471, + "learning_rate": 9.932804700302246e-06, + "loss": 0.2156, + "step": 10872 + }, + { + "epoch": 1.0242811050140128, + "grad_norm": 0.6706910133361816, + "learning_rate": 9.93129471706634e-06, + "loss": 0.1948, + "step": 10873 + }, + { + "epoch": 1.0243753091071808, + "grad_norm": 0.6643623113632202, + "learning_rate": 9.929784735397023e-06, + "loss": 0.2363, + "step": 10874 + }, + { + "epoch": 1.0244695132003485, + "grad_norm": 0.6217973232269287, + "learning_rate": 9.928274755328724e-06, + "loss": 0.1918, + "step": 10875 + }, + { + "epoch": 1.0245637172935165, + "grad_norm": 0.664268970489502, + "learning_rate": 9.926764776895867e-06, + "loss": 0.2057, + "step": 10876 + }, + { + "epoch": 1.0246579213866842, + "grad_norm": 0.7142734527587891, + "learning_rate": 9.925254800132891e-06, + "loss": 0.2035, + "step": 10877 + }, + { + "epoch": 1.0247521254798522, + "grad_norm": 0.6665133833885193, + "learning_rate": 9.92374482507422e-06, + "loss": 0.225, + "step": 10878 + }, + { + "epoch": 1.02484632957302, + "grad_norm": 0.6146891117095947, + "learning_rate": 9.922234851754284e-06, + "loss": 0.2156, + "step": 10879 + }, + { + "epoch": 1.0249405336661879, + "grad_norm": 0.5748670101165771, + "learning_rate": 9.920724880207511e-06, + "loss": 0.1864, + "step": 10880 + }, + { + "epoch": 1.0250347377593556, + "grad_norm": 0.6536259055137634, + "learning_rate": 9.919214910468337e-06, + "loss": 0.1927, + "step": 10881 + }, + { + "epoch": 1.0251289418525236, + "grad_norm": 0.6859068870544434, + "learning_rate": 9.917704942571188e-06, + "loss": 0.1908, + "step": 10882 + }, + { + "epoch": 1.0252231459456913, + "grad_norm": 0.6591455340385437, + "learning_rate": 9.916194976550492e-06, + "loss": 0.2106, + "step": 10883 + }, + { + "epoch": 1.0253173500388593, + "grad_norm": 0.746976375579834, + "learning_rate": 9.914685012440682e-06, + "loss": 0.2315, + "step": 10884 + }, + { + "epoch": 1.025411554132027, + "grad_norm": 0.6313568353652954, + "learning_rate": 9.913175050276186e-06, + "loss": 0.213, + "step": 10885 + }, + { + "epoch": 1.025505758225195, + "grad_norm": 0.6824766993522644, + "learning_rate": 9.911665090091428e-06, + "loss": 0.2032, + "step": 10886 + }, + { + "epoch": 1.0255999623183627, + "grad_norm": 0.6259440183639526, + "learning_rate": 9.910155131920847e-06, + "loss": 0.2026, + "step": 10887 + }, + { + "epoch": 1.0256941664115307, + "grad_norm": 0.6822636723518372, + "learning_rate": 9.90864517579887e-06, + "loss": 0.2129, + "step": 10888 + }, + { + "epoch": 1.0257883705046984, + "grad_norm": 0.6597427129745483, + "learning_rate": 9.907135221759923e-06, + "loss": 0.224, + "step": 10889 + }, + { + "epoch": 1.0258825745978664, + "grad_norm": 0.6362802386283875, + "learning_rate": 9.905625269838433e-06, + "loss": 0.2391, + "step": 10890 + }, + { + "epoch": 1.025976778691034, + "grad_norm": 0.5726861357688904, + "learning_rate": 9.904115320068834e-06, + "loss": 0.2121, + "step": 10891 + }, + { + "epoch": 1.026070982784202, + "grad_norm": 0.6730748414993286, + "learning_rate": 9.902605372485557e-06, + "loss": 0.2164, + "step": 10892 + }, + { + "epoch": 1.0261651868773698, + "grad_norm": 0.6526716351509094, + "learning_rate": 9.901095427123023e-06, + "loss": 0.2212, + "step": 10893 + }, + { + "epoch": 1.0262593909705378, + "grad_norm": 0.6757125854492188, + "learning_rate": 9.89958548401567e-06, + "loss": 0.2075, + "step": 10894 + }, + { + "epoch": 1.0263535950637055, + "grad_norm": 0.6140555143356323, + "learning_rate": 9.898075543197922e-06, + "loss": 0.2261, + "step": 10895 + }, + { + "epoch": 1.0264477991568735, + "grad_norm": 0.5689371228218079, + "learning_rate": 9.89656560470421e-06, + "loss": 0.183, + "step": 10896 + }, + { + "epoch": 1.0265420032500412, + "grad_norm": 0.6465091705322266, + "learning_rate": 9.895055668568961e-06, + "loss": 0.2144, + "step": 10897 + }, + { + "epoch": 1.0266362073432092, + "grad_norm": 0.5773385167121887, + "learning_rate": 9.893545734826607e-06, + "loss": 0.1752, + "step": 10898 + }, + { + "epoch": 1.0267304114363769, + "grad_norm": 0.6092222332954407, + "learning_rate": 9.892035803511573e-06, + "loss": 0.2131, + "step": 10899 + }, + { + "epoch": 1.0268246155295448, + "grad_norm": 0.6916167140007019, + "learning_rate": 9.89052587465829e-06, + "loss": 0.2065, + "step": 10900 + }, + { + "epoch": 1.0269188196227126, + "grad_norm": 0.6038517355918884, + "learning_rate": 9.889015948301187e-06, + "loss": 0.1852, + "step": 10901 + }, + { + "epoch": 1.0270130237158805, + "grad_norm": 0.6681758165359497, + "learning_rate": 9.887506024474693e-06, + "loss": 0.2398, + "step": 10902 + }, + { + "epoch": 1.0271072278090483, + "grad_norm": 0.6332974433898926, + "learning_rate": 9.885996103213232e-06, + "loss": 0.2119, + "step": 10903 + }, + { + "epoch": 1.0272014319022162, + "grad_norm": 0.6054478287696838, + "learning_rate": 9.88448618455124e-06, + "loss": 0.1904, + "step": 10904 + }, + { + "epoch": 1.027295635995384, + "grad_norm": 0.6860628128051758, + "learning_rate": 9.882976268523142e-06, + "loss": 0.2044, + "step": 10905 + }, + { + "epoch": 1.027389840088552, + "grad_norm": 0.7579703330993652, + "learning_rate": 9.88146635516336e-06, + "loss": 0.2086, + "step": 10906 + }, + { + "epoch": 1.0274840441817197, + "grad_norm": 0.6385505199432373, + "learning_rate": 9.879956444506335e-06, + "loss": 0.222, + "step": 10907 + }, + { + "epoch": 1.0275782482748876, + "grad_norm": 0.653588056564331, + "learning_rate": 9.878446536586488e-06, + "loss": 0.2068, + "step": 10908 + }, + { + "epoch": 1.0276724523680554, + "grad_norm": 0.6662572622299194, + "learning_rate": 9.876936631438248e-06, + "loss": 0.2241, + "step": 10909 + }, + { + "epoch": 1.0277666564612233, + "grad_norm": 0.8349880576133728, + "learning_rate": 9.875426729096039e-06, + "loss": 0.2215, + "step": 10910 + }, + { + "epoch": 1.027860860554391, + "grad_norm": 0.6116170287132263, + "learning_rate": 9.873916829594297e-06, + "loss": 0.1833, + "step": 10911 + }, + { + "epoch": 1.027955064647559, + "grad_norm": 0.6977446675300598, + "learning_rate": 9.872406932967444e-06, + "loss": 0.2297, + "step": 10912 + }, + { + "epoch": 1.0280492687407268, + "grad_norm": 0.7331418395042419, + "learning_rate": 9.87089703924991e-06, + "loss": 0.2323, + "step": 10913 + }, + { + "epoch": 1.0281434728338947, + "grad_norm": 0.7110864520072937, + "learning_rate": 9.869387148476124e-06, + "loss": 0.2375, + "step": 10914 + }, + { + "epoch": 1.0282376769270625, + "grad_norm": 0.630600094795227, + "learning_rate": 9.867877260680515e-06, + "loss": 0.183, + "step": 10915 + }, + { + "epoch": 1.0283318810202304, + "grad_norm": 0.6984477639198303, + "learning_rate": 9.866367375897505e-06, + "loss": 0.2169, + "step": 10916 + }, + { + "epoch": 1.0284260851133982, + "grad_norm": 0.6631993055343628, + "learning_rate": 9.864857494161529e-06, + "loss": 0.2074, + "step": 10917 + }, + { + "epoch": 1.0285202892065661, + "grad_norm": 0.7021864652633667, + "learning_rate": 9.86334761550701e-06, + "loss": 0.2312, + "step": 10918 + }, + { + "epoch": 1.0286144932997339, + "grad_norm": 0.7496550679206848, + "learning_rate": 9.861837739968378e-06, + "loss": 0.197, + "step": 10919 + }, + { + "epoch": 1.0287086973929018, + "grad_norm": 0.6108748316764832, + "learning_rate": 9.860327867580056e-06, + "loss": 0.1962, + "step": 10920 + }, + { + "epoch": 1.0288029014860696, + "grad_norm": 0.8925476670265198, + "learning_rate": 9.858817998376477e-06, + "loss": 0.1996, + "step": 10921 + }, + { + "epoch": 1.0288971055792375, + "grad_norm": 0.7436450123786926, + "learning_rate": 9.857308132392068e-06, + "loss": 0.2193, + "step": 10922 + }, + { + "epoch": 1.0289913096724053, + "grad_norm": 0.6071893572807312, + "learning_rate": 9.855798269661247e-06, + "loss": 0.1978, + "step": 10923 + }, + { + "epoch": 1.0290855137655732, + "grad_norm": 0.6819487810134888, + "learning_rate": 9.854288410218455e-06, + "loss": 0.2203, + "step": 10924 + }, + { + "epoch": 1.029179717858741, + "grad_norm": 0.6285249590873718, + "learning_rate": 9.852778554098112e-06, + "loss": 0.2207, + "step": 10925 + }, + { + "epoch": 1.029273921951909, + "grad_norm": 0.6124079823493958, + "learning_rate": 9.851268701334641e-06, + "loss": 0.1822, + "step": 10926 + }, + { + "epoch": 1.0293681260450767, + "grad_norm": 0.7141607999801636, + "learning_rate": 9.849758851962478e-06, + "loss": 0.2219, + "step": 10927 + }, + { + "epoch": 1.0294623301382444, + "grad_norm": 0.6542896032333374, + "learning_rate": 9.848249006016045e-06, + "loss": 0.2122, + "step": 10928 + }, + { + "epoch": 1.0295565342314124, + "grad_norm": 0.6309689283370972, + "learning_rate": 9.846739163529772e-06, + "loss": 0.2033, + "step": 10929 + }, + { + "epoch": 1.02965073832458, + "grad_norm": 0.6441240906715393, + "learning_rate": 9.845229324538076e-06, + "loss": 0.2164, + "step": 10930 + }, + { + "epoch": 1.029744942417748, + "grad_norm": 0.6353326439857483, + "learning_rate": 9.843719489075396e-06, + "loss": 0.1908, + "step": 10931 + }, + { + "epoch": 1.0298391465109158, + "grad_norm": 0.7253024578094482, + "learning_rate": 9.842209657176153e-06, + "loss": 0.2615, + "step": 10932 + }, + { + "epoch": 1.0299333506040838, + "grad_norm": 0.669043779373169, + "learning_rate": 9.840699828874771e-06, + "loss": 0.2226, + "step": 10933 + }, + { + "epoch": 1.0300275546972515, + "grad_norm": 0.5972657799720764, + "learning_rate": 9.839190004205683e-06, + "loss": 0.2161, + "step": 10934 + }, + { + "epoch": 1.0301217587904195, + "grad_norm": 0.6432631015777588, + "learning_rate": 9.83768018320331e-06, + "loss": 0.2063, + "step": 10935 + }, + { + "epoch": 1.0302159628835872, + "grad_norm": 0.7154762744903564, + "learning_rate": 9.836170365902077e-06, + "loss": 0.2356, + "step": 10936 + }, + { + "epoch": 1.0303101669767551, + "grad_norm": 0.7650432586669922, + "learning_rate": 9.834660552336415e-06, + "loss": 0.2184, + "step": 10937 + }, + { + "epoch": 1.0304043710699229, + "grad_norm": 0.6580452919006348, + "learning_rate": 9.83315074254075e-06, + "loss": 0.2188, + "step": 10938 + }, + { + "epoch": 1.0304985751630908, + "grad_norm": 0.6253951787948608, + "learning_rate": 9.831640936549505e-06, + "loss": 0.2178, + "step": 10939 + }, + { + "epoch": 1.0305927792562586, + "grad_norm": 0.6630215048789978, + "learning_rate": 9.830131134397106e-06, + "loss": 0.2232, + "step": 10940 + }, + { + "epoch": 1.0306869833494265, + "grad_norm": 0.7034470438957214, + "learning_rate": 9.828621336117981e-06, + "loss": 0.2141, + "step": 10941 + }, + { + "epoch": 1.0307811874425943, + "grad_norm": 0.6886473894119263, + "learning_rate": 9.827111541746558e-06, + "loss": 0.2268, + "step": 10942 + }, + { + "epoch": 1.0308753915357622, + "grad_norm": 0.6353757977485657, + "learning_rate": 9.82560175131725e-06, + "loss": 0.2147, + "step": 10943 + }, + { + "epoch": 1.03096959562893, + "grad_norm": 0.6014813184738159, + "learning_rate": 9.824091964864499e-06, + "loss": 0.218, + "step": 10944 + }, + { + "epoch": 1.031063799722098, + "grad_norm": 0.6371111273765564, + "learning_rate": 9.822582182422723e-06, + "loss": 0.2031, + "step": 10945 + }, + { + "epoch": 1.0311580038152657, + "grad_norm": 0.8969886302947998, + "learning_rate": 9.821072404026344e-06, + "loss": 0.2276, + "step": 10946 + }, + { + "epoch": 1.0312522079084336, + "grad_norm": 0.6839308142662048, + "learning_rate": 9.819562629709793e-06, + "loss": 0.2224, + "step": 10947 + }, + { + "epoch": 1.0313464120016014, + "grad_norm": 0.6386514902114868, + "learning_rate": 9.818052859507497e-06, + "loss": 0.2296, + "step": 10948 + }, + { + "epoch": 1.0314406160947693, + "grad_norm": 0.6815319657325745, + "learning_rate": 9.816543093453873e-06, + "loss": 0.2071, + "step": 10949 + }, + { + "epoch": 1.031534820187937, + "grad_norm": 0.6995004415512085, + "learning_rate": 9.81503333158335e-06, + "loss": 0.2135, + "step": 10950 + }, + { + "epoch": 1.031629024281105, + "grad_norm": 0.6554959416389465, + "learning_rate": 9.813523573930353e-06, + "loss": 0.2238, + "step": 10951 + }, + { + "epoch": 1.0317232283742728, + "grad_norm": 0.6632347106933594, + "learning_rate": 9.812013820529307e-06, + "loss": 0.2303, + "step": 10952 + }, + { + "epoch": 1.0318174324674407, + "grad_norm": 0.652472734451294, + "learning_rate": 9.810504071414637e-06, + "loss": 0.1878, + "step": 10953 + }, + { + "epoch": 1.0319116365606085, + "grad_norm": 0.6303935647010803, + "learning_rate": 9.808994326620767e-06, + "loss": 0.2169, + "step": 10954 + }, + { + "epoch": 1.0320058406537764, + "grad_norm": 0.6247922778129578, + "learning_rate": 9.807484586182123e-06, + "loss": 0.2248, + "step": 10955 + }, + { + "epoch": 1.0321000447469442, + "grad_norm": 0.6658794283866882, + "learning_rate": 9.805974850133125e-06, + "loss": 0.2139, + "step": 10956 + }, + { + "epoch": 1.0321942488401121, + "grad_norm": 0.6316248774528503, + "learning_rate": 9.804465118508203e-06, + "loss": 0.1928, + "step": 10957 + }, + { + "epoch": 1.0322884529332799, + "grad_norm": 0.6552563309669495, + "learning_rate": 9.802955391341779e-06, + "loss": 0.216, + "step": 10958 + }, + { + "epoch": 1.0323826570264478, + "grad_norm": 0.6724657416343689, + "learning_rate": 9.801445668668278e-06, + "loss": 0.2377, + "step": 10959 + }, + { + "epoch": 1.0324768611196156, + "grad_norm": 0.8886076807975769, + "learning_rate": 9.79993595052212e-06, + "loss": 0.21, + "step": 10960 + }, + { + "epoch": 1.0325710652127835, + "grad_norm": 0.6898882389068604, + "learning_rate": 9.798426236937733e-06, + "loss": 0.2221, + "step": 10961 + }, + { + "epoch": 1.0326652693059513, + "grad_norm": 0.6199621558189392, + "learning_rate": 9.796916527949542e-06, + "loss": 0.1927, + "step": 10962 + }, + { + "epoch": 1.0327594733991192, + "grad_norm": 0.7419622540473938, + "learning_rate": 9.795406823591962e-06, + "loss": 0.2262, + "step": 10963 + }, + { + "epoch": 1.032853677492287, + "grad_norm": 0.6263675689697266, + "learning_rate": 9.793897123899426e-06, + "loss": 0.1947, + "step": 10964 + }, + { + "epoch": 1.032947881585455, + "grad_norm": 0.6655123233795166, + "learning_rate": 9.792387428906358e-06, + "loss": 0.2115, + "step": 10965 + }, + { + "epoch": 1.0330420856786227, + "grad_norm": 0.6305708289146423, + "learning_rate": 9.79087773864717e-06, + "loss": 0.2126, + "step": 10966 + }, + { + "epoch": 1.0331362897717906, + "grad_norm": 0.6121314167976379, + "learning_rate": 9.7893680531563e-06, + "loss": 0.2015, + "step": 10967 + }, + { + "epoch": 1.0332304938649584, + "grad_norm": 0.6803262829780579, + "learning_rate": 9.787858372468163e-06, + "loss": 0.2536, + "step": 10968 + }, + { + "epoch": 1.0333246979581263, + "grad_norm": 0.6640075445175171, + "learning_rate": 9.786348696617183e-06, + "loss": 0.1957, + "step": 10969 + }, + { + "epoch": 1.033418902051294, + "grad_norm": 0.6151512861251831, + "learning_rate": 9.78483902563778e-06, + "loss": 0.1923, + "step": 10970 + }, + { + "epoch": 1.033513106144462, + "grad_norm": 0.5444778203964233, + "learning_rate": 9.783329359564383e-06, + "loss": 0.1618, + "step": 10971 + }, + { + "epoch": 1.0336073102376298, + "grad_norm": 0.6734565496444702, + "learning_rate": 9.78181969843141e-06, + "loss": 0.2121, + "step": 10972 + }, + { + "epoch": 1.0337015143307977, + "grad_norm": 0.5637355446815491, + "learning_rate": 9.780310042273284e-06, + "loss": 0.2118, + "step": 10973 + }, + { + "epoch": 1.0337957184239654, + "grad_norm": 0.6288042664527893, + "learning_rate": 9.778800391124431e-06, + "loss": 0.2194, + "step": 10974 + }, + { + "epoch": 1.0338899225171334, + "grad_norm": 0.6832186579704285, + "learning_rate": 9.777290745019272e-06, + "loss": 0.2185, + "step": 10975 + }, + { + "epoch": 1.0339841266103011, + "grad_norm": 0.6883850693702698, + "learning_rate": 9.775781103992226e-06, + "loss": 0.2508, + "step": 10976 + }, + { + "epoch": 1.034078330703469, + "grad_norm": 0.7300069332122803, + "learning_rate": 9.774271468077718e-06, + "loss": 0.2227, + "step": 10977 + }, + { + "epoch": 1.0341725347966368, + "grad_norm": 0.6199600696563721, + "learning_rate": 9.772761837310172e-06, + "loss": 0.2047, + "step": 10978 + }, + { + "epoch": 1.0342667388898048, + "grad_norm": 0.6576501727104187, + "learning_rate": 9.771252211724006e-06, + "loss": 0.1924, + "step": 10979 + }, + { + "epoch": 1.0343609429829725, + "grad_norm": 0.6371155977249146, + "learning_rate": 9.769742591353642e-06, + "loss": 0.1941, + "step": 10980 + }, + { + "epoch": 1.0344551470761405, + "grad_norm": 0.6634193062782288, + "learning_rate": 9.768232976233505e-06, + "loss": 0.2037, + "step": 10981 + }, + { + "epoch": 1.0345493511693082, + "grad_norm": 0.6280701756477356, + "learning_rate": 9.766723366398017e-06, + "loss": 0.1811, + "step": 10982 + }, + { + "epoch": 1.0346435552624762, + "grad_norm": 0.708373486995697, + "learning_rate": 9.76521376188159e-06, + "loss": 0.1923, + "step": 10983 + }, + { + "epoch": 1.034737759355644, + "grad_norm": 0.6773320436477661, + "learning_rate": 9.763704162718656e-06, + "loss": 0.2172, + "step": 10984 + }, + { + "epoch": 1.034831963448812, + "grad_norm": 0.6665389537811279, + "learning_rate": 9.762194568943636e-06, + "loss": 0.2116, + "step": 10985 + }, + { + "epoch": 1.0349261675419796, + "grad_norm": 0.6778730750083923, + "learning_rate": 9.760684980590942e-06, + "loss": 0.2022, + "step": 10986 + }, + { + "epoch": 1.0350203716351476, + "grad_norm": 0.7299953699111938, + "learning_rate": 9.759175397695004e-06, + "loss": 0.2095, + "step": 10987 + }, + { + "epoch": 1.0351145757283153, + "grad_norm": 0.6160703301429749, + "learning_rate": 9.757665820290239e-06, + "loss": 0.2156, + "step": 10988 + }, + { + "epoch": 1.0352087798214833, + "grad_norm": 0.6470587849617004, + "learning_rate": 9.756156248411068e-06, + "loss": 0.1955, + "step": 10989 + }, + { + "epoch": 1.035302983914651, + "grad_norm": 0.6191893219947815, + "learning_rate": 9.75464668209191e-06, + "loss": 0.2266, + "step": 10990 + }, + { + "epoch": 1.035397188007819, + "grad_norm": 0.668607234954834, + "learning_rate": 9.753137121367188e-06, + "loss": 0.1918, + "step": 10991 + }, + { + "epoch": 1.0354913921009867, + "grad_norm": 0.604935348033905, + "learning_rate": 9.751627566271323e-06, + "loss": 0.2174, + "step": 10992 + }, + { + "epoch": 1.0355855961941547, + "grad_norm": 0.6688987612724304, + "learning_rate": 9.75011801683873e-06, + "loss": 0.2052, + "step": 10993 + }, + { + "epoch": 1.0356798002873224, + "grad_norm": 0.6503415703773499, + "learning_rate": 9.748608473103836e-06, + "loss": 0.1955, + "step": 10994 + }, + { + "epoch": 1.0357740043804904, + "grad_norm": 0.7032379508018494, + "learning_rate": 9.747098935101056e-06, + "loss": 0.2387, + "step": 10995 + }, + { + "epoch": 1.0358682084736581, + "grad_norm": 0.6519894599914551, + "learning_rate": 9.745589402864811e-06, + "loss": 0.2263, + "step": 10996 + }, + { + "epoch": 1.035962412566826, + "grad_norm": 0.6277257204055786, + "learning_rate": 9.744079876429522e-06, + "loss": 0.2053, + "step": 10997 + }, + { + "epoch": 1.0360566166599938, + "grad_norm": 0.5464719533920288, + "learning_rate": 9.742570355829608e-06, + "loss": 0.1853, + "step": 10998 + }, + { + "epoch": 1.0361508207531618, + "grad_norm": 0.635711669921875, + "learning_rate": 9.74106084109949e-06, + "loss": 0.1883, + "step": 10999 + }, + { + "epoch": 1.0362450248463295, + "grad_norm": 0.6020753979682922, + "learning_rate": 9.73955133227358e-06, + "loss": 0.2011, + "step": 11000 + }, + { + "epoch": 1.0363392289394975, + "grad_norm": 0.6382079720497131, + "learning_rate": 9.738041829386306e-06, + "loss": 0.2134, + "step": 11001 + }, + { + "epoch": 1.0364334330326652, + "grad_norm": 0.6182296276092529, + "learning_rate": 9.736532332472085e-06, + "loss": 0.2061, + "step": 11002 + }, + { + "epoch": 1.0365276371258332, + "grad_norm": 0.6536548733711243, + "learning_rate": 9.735022841565329e-06, + "loss": 0.2012, + "step": 11003 + }, + { + "epoch": 1.036621841219001, + "grad_norm": 0.7733579277992249, + "learning_rate": 9.733513356700465e-06, + "loss": 0.241, + "step": 11004 + }, + { + "epoch": 1.0367160453121689, + "grad_norm": 0.6516435146331787, + "learning_rate": 9.73200387791191e-06, + "loss": 0.1928, + "step": 11005 + }, + { + "epoch": 1.0368102494053366, + "grad_norm": 0.6627474427223206, + "learning_rate": 9.730494405234077e-06, + "loss": 0.2008, + "step": 11006 + }, + { + "epoch": 1.0369044534985046, + "grad_norm": 0.6160629987716675, + "learning_rate": 9.728984938701393e-06, + "loss": 0.2325, + "step": 11007 + }, + { + "epoch": 1.0369986575916723, + "grad_norm": 0.6392260193824768, + "learning_rate": 9.72747547834827e-06, + "loss": 0.224, + "step": 11008 + }, + { + "epoch": 1.0370928616848403, + "grad_norm": 0.6877565979957581, + "learning_rate": 9.725966024209128e-06, + "loss": 0.2315, + "step": 11009 + }, + { + "epoch": 1.037187065778008, + "grad_norm": 0.6176539063453674, + "learning_rate": 9.724456576318383e-06, + "loss": 0.2242, + "step": 11010 + }, + { + "epoch": 1.037281269871176, + "grad_norm": 0.6558178067207336, + "learning_rate": 9.722947134710453e-06, + "loss": 0.2257, + "step": 11011 + }, + { + "epoch": 1.0373754739643437, + "grad_norm": 0.5705680847167969, + "learning_rate": 9.72143769941976e-06, + "loss": 0.1796, + "step": 11012 + }, + { + "epoch": 1.0374696780575117, + "grad_norm": 0.6435481309890747, + "learning_rate": 9.719928270480715e-06, + "loss": 0.1914, + "step": 11013 + }, + { + "epoch": 1.0375638821506794, + "grad_norm": 0.6671053171157837, + "learning_rate": 9.71841884792774e-06, + "loss": 0.2176, + "step": 11014 + }, + { + "epoch": 1.0376580862438474, + "grad_norm": 0.6727092862129211, + "learning_rate": 9.716909431795251e-06, + "loss": 0.2306, + "step": 11015 + }, + { + "epoch": 1.037752290337015, + "grad_norm": 0.6252629160881042, + "learning_rate": 9.715400022117665e-06, + "loss": 0.2012, + "step": 11016 + }, + { + "epoch": 1.037846494430183, + "grad_norm": 0.7260280847549438, + "learning_rate": 9.713890618929398e-06, + "loss": 0.2412, + "step": 11017 + }, + { + "epoch": 1.0379406985233508, + "grad_norm": 0.6636776924133301, + "learning_rate": 9.712381222264869e-06, + "loss": 0.2062, + "step": 11018 + }, + { + "epoch": 1.0380349026165188, + "grad_norm": 0.7553560733795166, + "learning_rate": 9.71087183215849e-06, + "loss": 0.1941, + "step": 11019 + }, + { + "epoch": 1.0381291067096865, + "grad_norm": 0.5909751057624817, + "learning_rate": 9.709362448644682e-06, + "loss": 0.2143, + "step": 11020 + }, + { + "epoch": 1.0382233108028545, + "grad_norm": 0.7922289371490479, + "learning_rate": 9.707853071757862e-06, + "loss": 0.2381, + "step": 11021 + }, + { + "epoch": 1.0383175148960222, + "grad_norm": 0.6449324488639832, + "learning_rate": 9.706343701532443e-06, + "loss": 0.2166, + "step": 11022 + }, + { + "epoch": 1.0384117189891902, + "grad_norm": 0.6129333972930908, + "learning_rate": 9.704834338002836e-06, + "loss": 0.1894, + "step": 11023 + }, + { + "epoch": 1.038505923082358, + "grad_norm": 0.6376360058784485, + "learning_rate": 9.703324981203467e-06, + "loss": 0.2108, + "step": 11024 + }, + { + "epoch": 1.0386001271755259, + "grad_norm": 0.9939081072807312, + "learning_rate": 9.70181563116875e-06, + "loss": 0.2334, + "step": 11025 + }, + { + "epoch": 1.0386943312686936, + "grad_norm": 0.6956937909126282, + "learning_rate": 9.700306287933093e-06, + "loss": 0.1997, + "step": 11026 + }, + { + "epoch": 1.0387885353618616, + "grad_norm": 0.6432368159294128, + "learning_rate": 9.69879695153092e-06, + "loss": 0.2151, + "step": 11027 + }, + { + "epoch": 1.0388827394550293, + "grad_norm": 0.6428807377815247, + "learning_rate": 9.697287621996641e-06, + "loss": 0.2081, + "step": 11028 + }, + { + "epoch": 1.0389769435481973, + "grad_norm": 0.6315459609031677, + "learning_rate": 9.695778299364672e-06, + "loss": 0.2133, + "step": 11029 + }, + { + "epoch": 1.039071147641365, + "grad_norm": 0.5934397578239441, + "learning_rate": 9.694268983669427e-06, + "loss": 0.2324, + "step": 11030 + }, + { + "epoch": 1.039165351734533, + "grad_norm": 0.612694501876831, + "learning_rate": 9.692759674945322e-06, + "loss": 0.2326, + "step": 11031 + }, + { + "epoch": 1.0392595558277007, + "grad_norm": 0.6528658866882324, + "learning_rate": 9.691250373226774e-06, + "loss": 0.1921, + "step": 11032 + }, + { + "epoch": 1.0393537599208686, + "grad_norm": 0.6557685136795044, + "learning_rate": 9.689741078548191e-06, + "loss": 0.2006, + "step": 11033 + }, + { + "epoch": 1.0394479640140364, + "grad_norm": 0.7540048360824585, + "learning_rate": 9.688231790943996e-06, + "loss": 0.2121, + "step": 11034 + }, + { + "epoch": 1.0395421681072043, + "grad_norm": 0.7017741203308105, + "learning_rate": 9.686722510448595e-06, + "loss": 0.2058, + "step": 11035 + }, + { + "epoch": 1.039636372200372, + "grad_norm": 0.617885947227478, + "learning_rate": 9.685213237096405e-06, + "loss": 0.1962, + "step": 11036 + }, + { + "epoch": 1.03973057629354, + "grad_norm": 0.640734851360321, + "learning_rate": 9.683703970921841e-06, + "loss": 0.2028, + "step": 11037 + }, + { + "epoch": 1.0398247803867078, + "grad_norm": 0.6963291764259338, + "learning_rate": 9.682194711959318e-06, + "loss": 0.2363, + "step": 11038 + }, + { + "epoch": 1.0399189844798757, + "grad_norm": 0.657353401184082, + "learning_rate": 9.680685460243247e-06, + "loss": 0.1941, + "step": 11039 + }, + { + "epoch": 1.0400131885730435, + "grad_norm": 0.664580762386322, + "learning_rate": 9.679176215808037e-06, + "loss": 0.2117, + "step": 11040 + }, + { + "epoch": 1.0401073926662114, + "grad_norm": 0.6473305225372314, + "learning_rate": 9.677666978688108e-06, + "loss": 0.2196, + "step": 11041 + }, + { + "epoch": 1.0402015967593792, + "grad_norm": 0.6388459205627441, + "learning_rate": 9.676157748917873e-06, + "loss": 0.2136, + "step": 11042 + }, + { + "epoch": 1.0402958008525471, + "grad_norm": 0.6057829856872559, + "learning_rate": 9.674648526531735e-06, + "loss": 0.201, + "step": 11043 + }, + { + "epoch": 1.0403900049457149, + "grad_norm": 0.6156550049781799, + "learning_rate": 9.673139311564118e-06, + "loss": 0.1977, + "step": 11044 + }, + { + "epoch": 1.0404842090388828, + "grad_norm": 0.6064395904541016, + "learning_rate": 9.671630104049433e-06, + "loss": 0.191, + "step": 11045 + }, + { + "epoch": 1.0405784131320506, + "grad_norm": 0.6397035717964172, + "learning_rate": 9.670120904022083e-06, + "loss": 0.1881, + "step": 11046 + }, + { + "epoch": 1.0406726172252185, + "grad_norm": 0.6849590539932251, + "learning_rate": 9.668611711516494e-06, + "loss": 0.2182, + "step": 11047 + }, + { + "epoch": 1.0407668213183863, + "grad_norm": 0.6409408450126648, + "learning_rate": 9.667102526567068e-06, + "loss": 0.1893, + "step": 11048 + }, + { + "epoch": 1.0408610254115542, + "grad_norm": 0.6189360618591309, + "learning_rate": 9.665593349208218e-06, + "loss": 0.195, + "step": 11049 + }, + { + "epoch": 1.040955229504722, + "grad_norm": 0.6303878426551819, + "learning_rate": 9.664084179474354e-06, + "loss": 0.2559, + "step": 11050 + }, + { + "epoch": 1.04104943359789, + "grad_norm": 0.6208665370941162, + "learning_rate": 9.662575017399894e-06, + "loss": 0.2201, + "step": 11051 + }, + { + "epoch": 1.0411436376910577, + "grad_norm": 0.7549915313720703, + "learning_rate": 9.661065863019246e-06, + "loss": 0.2151, + "step": 11052 + }, + { + "epoch": 1.0412378417842256, + "grad_norm": 0.7950330376625061, + "learning_rate": 9.659556716366817e-06, + "loss": 0.2223, + "step": 11053 + }, + { + "epoch": 1.0413320458773934, + "grad_norm": 0.5968732833862305, + "learning_rate": 9.658047577477022e-06, + "loss": 0.2203, + "step": 11054 + }, + { + "epoch": 1.0414262499705613, + "grad_norm": 0.7243004441261292, + "learning_rate": 9.656538446384275e-06, + "loss": 0.2219, + "step": 11055 + }, + { + "epoch": 1.041520454063729, + "grad_norm": 0.6180680394172668, + "learning_rate": 9.655029323122977e-06, + "loss": 0.203, + "step": 11056 + }, + { + "epoch": 1.041614658156897, + "grad_norm": 0.688209593296051, + "learning_rate": 9.653520207727547e-06, + "loss": 0.1887, + "step": 11057 + }, + { + "epoch": 1.0417088622500648, + "grad_norm": 0.6917232275009155, + "learning_rate": 9.652011100232394e-06, + "loss": 0.2015, + "step": 11058 + }, + { + "epoch": 1.0418030663432327, + "grad_norm": 0.7278743982315063, + "learning_rate": 9.650502000671926e-06, + "loss": 0.2152, + "step": 11059 + }, + { + "epoch": 1.0418972704364005, + "grad_norm": 0.6593081951141357, + "learning_rate": 9.648992909080548e-06, + "loss": 0.2227, + "step": 11060 + }, + { + "epoch": 1.0419914745295684, + "grad_norm": 0.691805899143219, + "learning_rate": 9.647483825492678e-06, + "loss": 0.2047, + "step": 11061 + }, + { + "epoch": 1.0420856786227362, + "grad_norm": 0.6252630352973938, + "learning_rate": 9.645974749942725e-06, + "loss": 0.1945, + "step": 11062 + }, + { + "epoch": 1.042179882715904, + "grad_norm": 0.6684446334838867, + "learning_rate": 9.644465682465088e-06, + "loss": 0.2126, + "step": 11063 + }, + { + "epoch": 1.0422740868090719, + "grad_norm": 0.6513932943344116, + "learning_rate": 9.642956623094187e-06, + "loss": 0.2104, + "step": 11064 + }, + { + "epoch": 1.0423682909022398, + "grad_norm": 0.6777861714363098, + "learning_rate": 9.641447571864429e-06, + "loss": 0.2193, + "step": 11065 + }, + { + "epoch": 1.0424624949954076, + "grad_norm": 0.643218457698822, + "learning_rate": 9.639938528810217e-06, + "loss": 0.2164, + "step": 11066 + }, + { + "epoch": 1.0425566990885753, + "grad_norm": 0.6765201687812805, + "learning_rate": 9.638429493965967e-06, + "loss": 0.2277, + "step": 11067 + }, + { + "epoch": 1.0426509031817432, + "grad_norm": 0.6563106775283813, + "learning_rate": 9.636920467366082e-06, + "loss": 0.2017, + "step": 11068 + }, + { + "epoch": 1.0427451072749112, + "grad_norm": 0.6716633439064026, + "learning_rate": 9.635411449044974e-06, + "loss": 0.205, + "step": 11069 + }, + { + "epoch": 1.042839311368079, + "grad_norm": 0.6161030530929565, + "learning_rate": 9.633902439037044e-06, + "loss": 0.2199, + "step": 11070 + }, + { + "epoch": 1.0429335154612467, + "grad_norm": 0.6739822030067444, + "learning_rate": 9.63239343737671e-06, + "loss": 0.2062, + "step": 11071 + }, + { + "epoch": 1.0430277195544146, + "grad_norm": 0.6891419291496277, + "learning_rate": 9.63088444409837e-06, + "loss": 0.197, + "step": 11072 + }, + { + "epoch": 1.0431219236475824, + "grad_norm": 0.7476945519447327, + "learning_rate": 9.629375459236437e-06, + "loss": 0.2007, + "step": 11073 + }, + { + "epoch": 1.0432161277407503, + "grad_norm": 0.6027140617370605, + "learning_rate": 9.627866482825316e-06, + "loss": 0.209, + "step": 11074 + }, + { + "epoch": 1.043310331833918, + "grad_norm": 0.6068741083145142, + "learning_rate": 9.626357514899417e-06, + "loss": 0.226, + "step": 11075 + }, + { + "epoch": 1.043404535927086, + "grad_norm": 0.6412231922149658, + "learning_rate": 9.62484855549314e-06, + "loss": 0.2042, + "step": 11076 + }, + { + "epoch": 1.0434987400202538, + "grad_norm": 0.6265538930892944, + "learning_rate": 9.623339604640901e-06, + "loss": 0.2413, + "step": 11077 + }, + { + "epoch": 1.0435929441134217, + "grad_norm": 0.6812928318977356, + "learning_rate": 9.6218306623771e-06, + "loss": 0.1969, + "step": 11078 + }, + { + "epoch": 1.0436871482065895, + "grad_norm": 0.6189242601394653, + "learning_rate": 9.620321728736147e-06, + "loss": 0.2004, + "step": 11079 + }, + { + "epoch": 1.0437813522997574, + "grad_norm": 0.7051004767417908, + "learning_rate": 9.61881280375244e-06, + "loss": 0.2268, + "step": 11080 + }, + { + "epoch": 1.0438755563929252, + "grad_norm": 0.5873299837112427, + "learning_rate": 9.617303887460393e-06, + "loss": 0.1984, + "step": 11081 + }, + { + "epoch": 1.0439697604860931, + "grad_norm": 0.636796236038208, + "learning_rate": 9.615794979894414e-06, + "loss": 0.2032, + "step": 11082 + }, + { + "epoch": 1.0440639645792609, + "grad_norm": 0.7137956023216248, + "learning_rate": 9.614286081088895e-06, + "loss": 0.2417, + "step": 11083 + }, + { + "epoch": 1.0441581686724288, + "grad_norm": 0.6883149743080139, + "learning_rate": 9.612777191078257e-06, + "loss": 0.2662, + "step": 11084 + }, + { + "epoch": 1.0442523727655966, + "grad_norm": 0.6227160692214966, + "learning_rate": 9.611268309896897e-06, + "loss": 0.216, + "step": 11085 + }, + { + "epoch": 1.0443465768587645, + "grad_norm": 0.6602711081504822, + "learning_rate": 9.609759437579215e-06, + "loss": 0.1916, + "step": 11086 + }, + { + "epoch": 1.0444407809519323, + "grad_norm": 0.5795473456382751, + "learning_rate": 9.608250574159627e-06, + "loss": 0.195, + "step": 11087 + }, + { + "epoch": 1.0445349850451002, + "grad_norm": 0.6337112784385681, + "learning_rate": 9.606741719672532e-06, + "loss": 0.2007, + "step": 11088 + }, + { + "epoch": 1.044629189138268, + "grad_norm": 0.6955795884132385, + "learning_rate": 9.605232874152333e-06, + "loss": 0.2133, + "step": 11089 + }, + { + "epoch": 1.044723393231436, + "grad_norm": 0.6677690744400024, + "learning_rate": 9.603724037633431e-06, + "loss": 0.2169, + "step": 11090 + }, + { + "epoch": 1.0448175973246037, + "grad_norm": 0.7634860873222351, + "learning_rate": 9.602215210150238e-06, + "loss": 0.2089, + "step": 11091 + }, + { + "epoch": 1.0449118014177716, + "grad_norm": 0.6360622048377991, + "learning_rate": 9.600706391737154e-06, + "loss": 0.1844, + "step": 11092 + }, + { + "epoch": 1.0450060055109394, + "grad_norm": 0.5976507067680359, + "learning_rate": 9.599197582428577e-06, + "loss": 0.1806, + "step": 11093 + }, + { + "epoch": 1.0451002096041073, + "grad_norm": 0.7010049223899841, + "learning_rate": 9.59768878225892e-06, + "loss": 0.2238, + "step": 11094 + }, + { + "epoch": 1.045194413697275, + "grad_norm": 0.6693528890609741, + "learning_rate": 9.596179991262579e-06, + "loss": 0.2199, + "step": 11095 + }, + { + "epoch": 1.045288617790443, + "grad_norm": 0.5872026085853577, + "learning_rate": 9.594671209473957e-06, + "loss": 0.206, + "step": 11096 + }, + { + "epoch": 1.0453828218836108, + "grad_norm": 0.615420401096344, + "learning_rate": 9.593162436927461e-06, + "loss": 0.2318, + "step": 11097 + }, + { + "epoch": 1.0454770259767787, + "grad_norm": 0.6226284503936768, + "learning_rate": 9.59165367365749e-06, + "loss": 0.2395, + "step": 11098 + }, + { + "epoch": 1.0455712300699465, + "grad_norm": 0.6633398532867432, + "learning_rate": 9.59014491969845e-06, + "loss": 0.2263, + "step": 11099 + }, + { + "epoch": 1.0456654341631144, + "grad_norm": 0.6114773750305176, + "learning_rate": 9.58863617508473e-06, + "loss": 0.1905, + "step": 11100 + }, + { + "epoch": 1.0457596382562822, + "grad_norm": 0.7100580930709839, + "learning_rate": 9.587127439850749e-06, + "loss": 0.2085, + "step": 11101 + }, + { + "epoch": 1.0458538423494501, + "grad_norm": 0.6336383819580078, + "learning_rate": 9.585618714030903e-06, + "loss": 0.2293, + "step": 11102 + }, + { + "epoch": 1.0459480464426179, + "grad_norm": 0.691080629825592, + "learning_rate": 9.584109997659583e-06, + "loss": 0.2264, + "step": 11103 + }, + { + "epoch": 1.0460422505357858, + "grad_norm": 0.7221998572349548, + "learning_rate": 9.582601290771206e-06, + "loss": 0.2094, + "step": 11104 + }, + { + "epoch": 1.0461364546289535, + "grad_norm": 0.7102656960487366, + "learning_rate": 9.581092593400163e-06, + "loss": 0.22, + "step": 11105 + }, + { + "epoch": 1.0462306587221215, + "grad_norm": 0.6529754996299744, + "learning_rate": 9.579583905580851e-06, + "loss": 0.2149, + "step": 11106 + }, + { + "epoch": 1.0463248628152892, + "grad_norm": 0.5638948082923889, + "learning_rate": 9.578075227347684e-06, + "loss": 0.1858, + "step": 11107 + }, + { + "epoch": 1.0464190669084572, + "grad_norm": 0.6523354053497314, + "learning_rate": 9.576566558735053e-06, + "loss": 0.2017, + "step": 11108 + }, + { + "epoch": 1.046513271001625, + "grad_norm": 0.6081815361976624, + "learning_rate": 9.575057899777357e-06, + "loss": 0.1962, + "step": 11109 + }, + { + "epoch": 1.046607475094793, + "grad_norm": 0.6833523511886597, + "learning_rate": 9.573549250508996e-06, + "loss": 0.2032, + "step": 11110 + }, + { + "epoch": 1.0467016791879606, + "grad_norm": 0.5638574957847595, + "learning_rate": 9.572040610964376e-06, + "loss": 0.18, + "step": 11111 + }, + { + "epoch": 1.0467958832811286, + "grad_norm": 0.6635776162147522, + "learning_rate": 9.57053198117789e-06, + "loss": 0.2159, + "step": 11112 + }, + { + "epoch": 1.0468900873742963, + "grad_norm": 0.6376252770423889, + "learning_rate": 9.569023361183938e-06, + "loss": 0.238, + "step": 11113 + }, + { + "epoch": 1.0469842914674643, + "grad_norm": 0.6444991827011108, + "learning_rate": 9.56751475101692e-06, + "loss": 0.2136, + "step": 11114 + }, + { + "epoch": 1.047078495560632, + "grad_norm": 0.6417834162712097, + "learning_rate": 9.566006150711237e-06, + "loss": 0.2129, + "step": 11115 + }, + { + "epoch": 1.0471726996538, + "grad_norm": 0.6237179040908813, + "learning_rate": 9.564497560301281e-06, + "loss": 0.2141, + "step": 11116 + }, + { + "epoch": 1.0472669037469677, + "grad_norm": 0.6647252440452576, + "learning_rate": 9.562988979821457e-06, + "loss": 0.2051, + "step": 11117 + }, + { + "epoch": 1.0473611078401357, + "grad_norm": 0.6281054019927979, + "learning_rate": 9.561480409306161e-06, + "loss": 0.2059, + "step": 11118 + }, + { + "epoch": 1.0474553119333034, + "grad_norm": 0.8527956008911133, + "learning_rate": 9.55997184878979e-06, + "loss": 0.2117, + "step": 11119 + }, + { + "epoch": 1.0475495160264714, + "grad_norm": 0.6849957704544067, + "learning_rate": 9.558463298306737e-06, + "loss": 0.2004, + "step": 11120 + }, + { + "epoch": 1.0476437201196391, + "grad_norm": 0.5989345908164978, + "learning_rate": 9.556954757891408e-06, + "loss": 0.1716, + "step": 11121 + }, + { + "epoch": 1.047737924212807, + "grad_norm": 0.6292548179626465, + "learning_rate": 9.555446227578198e-06, + "loss": 0.226, + "step": 11122 + }, + { + "epoch": 1.0478321283059748, + "grad_norm": 0.6838903427124023, + "learning_rate": 9.553937707401492e-06, + "loss": 0.2272, + "step": 11123 + }, + { + "epoch": 1.0479263323991428, + "grad_norm": 0.6710837483406067, + "learning_rate": 9.552429197395705e-06, + "loss": 0.2254, + "step": 11124 + }, + { + "epoch": 1.0480205364923105, + "grad_norm": 0.6228378415107727, + "learning_rate": 9.550920697595222e-06, + "loss": 0.2361, + "step": 11125 + }, + { + "epoch": 1.0481147405854785, + "grad_norm": 0.6198393106460571, + "learning_rate": 9.549412208034436e-06, + "loss": 0.2086, + "step": 11126 + }, + { + "epoch": 1.0482089446786462, + "grad_norm": 0.6320314407348633, + "learning_rate": 9.547903728747758e-06, + "loss": 0.1968, + "step": 11127 + }, + { + "epoch": 1.0483031487718142, + "grad_norm": 0.7348677515983582, + "learning_rate": 9.546395259769569e-06, + "loss": 0.223, + "step": 11128 + }, + { + "epoch": 1.048397352864982, + "grad_norm": 0.7054775953292847, + "learning_rate": 9.54488680113427e-06, + "loss": 0.208, + "step": 11129 + }, + { + "epoch": 1.0484915569581499, + "grad_norm": 0.613116443157196, + "learning_rate": 9.543378352876256e-06, + "loss": 0.2086, + "step": 11130 + }, + { + "epoch": 1.0485857610513176, + "grad_norm": 0.625372588634491, + "learning_rate": 9.541869915029923e-06, + "loss": 0.2189, + "step": 11131 + }, + { + "epoch": 1.0486799651444856, + "grad_norm": 0.6397774815559387, + "learning_rate": 9.540361487629662e-06, + "loss": 0.1869, + "step": 11132 + }, + { + "epoch": 1.0487741692376533, + "grad_norm": 0.6696832776069641, + "learning_rate": 9.538853070709871e-06, + "loss": 0.1887, + "step": 11133 + }, + { + "epoch": 1.0488683733308213, + "grad_norm": 0.6500979661941528, + "learning_rate": 9.537344664304943e-06, + "loss": 0.2168, + "step": 11134 + }, + { + "epoch": 1.048962577423989, + "grad_norm": 0.5896238088607788, + "learning_rate": 9.535836268449272e-06, + "loss": 0.1997, + "step": 11135 + }, + { + "epoch": 1.049056781517157, + "grad_norm": 0.657002329826355, + "learning_rate": 9.534327883177251e-06, + "loss": 0.2252, + "step": 11136 + }, + { + "epoch": 1.0491509856103247, + "grad_norm": 0.7159499526023865, + "learning_rate": 9.532819508523277e-06, + "loss": 0.1993, + "step": 11137 + }, + { + "epoch": 1.0492451897034927, + "grad_norm": 0.6413044929504395, + "learning_rate": 9.53131114452174e-06, + "loss": 0.2157, + "step": 11138 + }, + { + "epoch": 1.0493393937966604, + "grad_norm": 0.6382307410240173, + "learning_rate": 9.529802791207035e-06, + "loss": 0.2204, + "step": 11139 + }, + { + "epoch": 1.0494335978898284, + "grad_norm": 0.7286831140518188, + "learning_rate": 9.528294448613548e-06, + "loss": 0.2294, + "step": 11140 + }, + { + "epoch": 1.0495278019829961, + "grad_norm": 0.6109779477119446, + "learning_rate": 9.526786116775682e-06, + "loss": 0.1874, + "step": 11141 + }, + { + "epoch": 1.049622006076164, + "grad_norm": 0.662490963935852, + "learning_rate": 9.525277795727827e-06, + "loss": 0.232, + "step": 11142 + }, + { + "epoch": 1.0497162101693318, + "grad_norm": 0.6784884333610535, + "learning_rate": 9.523769485504364e-06, + "loss": 0.2213, + "step": 11143 + }, + { + "epoch": 1.0498104142624998, + "grad_norm": 0.6889349222183228, + "learning_rate": 9.5222611861397e-06, + "loss": 0.212, + "step": 11144 + }, + { + "epoch": 1.0499046183556675, + "grad_norm": 0.6845928430557251, + "learning_rate": 9.52075289766822e-06, + "loss": 0.198, + "step": 11145 + }, + { + "epoch": 1.0499988224488355, + "grad_norm": 0.6736336350440979, + "learning_rate": 9.519244620124309e-06, + "loss": 0.2574, + "step": 11146 + }, + { + "epoch": 1.0500930265420032, + "grad_norm": 0.5495937466621399, + "learning_rate": 9.51773635354237e-06, + "loss": 0.1832, + "step": 11147 + }, + { + "epoch": 1.0501872306351712, + "grad_norm": 0.6508906483650208, + "learning_rate": 9.516228097956787e-06, + "loss": 0.2156, + "step": 11148 + }, + { + "epoch": 1.050281434728339, + "grad_norm": 0.6272395253181458, + "learning_rate": 9.51471985340195e-06, + "loss": 0.2307, + "step": 11149 + }, + { + "epoch": 1.0503756388215069, + "grad_norm": 0.8044071793556213, + "learning_rate": 9.51321161991225e-06, + "loss": 0.2278, + "step": 11150 + }, + { + "epoch": 1.0504698429146746, + "grad_norm": 0.7866191864013672, + "learning_rate": 9.51170339752208e-06, + "loss": 0.2312, + "step": 11151 + }, + { + "epoch": 1.0505640470078426, + "grad_norm": 0.5838554501533508, + "learning_rate": 9.510195186265827e-06, + "loss": 0.208, + "step": 11152 + }, + { + "epoch": 1.0506582511010103, + "grad_norm": 0.6547302007675171, + "learning_rate": 9.50868698617788e-06, + "loss": 0.2101, + "step": 11153 + }, + { + "epoch": 1.0507524551941783, + "grad_norm": 0.6752921938896179, + "learning_rate": 9.50717879729263e-06, + "loss": 0.1913, + "step": 11154 + }, + { + "epoch": 1.050846659287346, + "grad_norm": 0.7714093327522278, + "learning_rate": 9.505670619644468e-06, + "loss": 0.2245, + "step": 11155 + }, + { + "epoch": 1.050940863380514, + "grad_norm": 0.6669865846633911, + "learning_rate": 9.504162453267776e-06, + "loss": 0.1967, + "step": 11156 + }, + { + "epoch": 1.0510350674736817, + "grad_norm": 0.6319842338562012, + "learning_rate": 9.502654298196952e-06, + "loss": 0.2148, + "step": 11157 + }, + { + "epoch": 1.0511292715668497, + "grad_norm": 0.7395252585411072, + "learning_rate": 9.501146154466377e-06, + "loss": 0.2034, + "step": 11158 + }, + { + "epoch": 1.0512234756600174, + "grad_norm": 0.6132408976554871, + "learning_rate": 9.499638022110443e-06, + "loss": 0.2215, + "step": 11159 + }, + { + "epoch": 1.0513176797531854, + "grad_norm": 0.6248003244400024, + "learning_rate": 9.49812990116353e-06, + "loss": 0.2071, + "step": 11160 + }, + { + "epoch": 1.051411883846353, + "grad_norm": 0.6967898607254028, + "learning_rate": 9.496621791660036e-06, + "loss": 0.2356, + "step": 11161 + }, + { + "epoch": 1.051506087939521, + "grad_norm": 0.6121363043785095, + "learning_rate": 9.495113693634346e-06, + "loss": 0.192, + "step": 11162 + }, + { + "epoch": 1.0516002920326888, + "grad_norm": 0.6453368067741394, + "learning_rate": 9.493605607120837e-06, + "loss": 0.214, + "step": 11163 + }, + { + "epoch": 1.0516944961258567, + "grad_norm": 0.968544065952301, + "learning_rate": 9.492097532153911e-06, + "loss": 0.2009, + "step": 11164 + }, + { + "epoch": 1.0517887002190245, + "grad_norm": 0.6154932379722595, + "learning_rate": 9.490589468767944e-06, + "loss": 0.1957, + "step": 11165 + }, + { + "epoch": 1.0518829043121924, + "grad_norm": 0.6451613306999207, + "learning_rate": 9.48908141699732e-06, + "loss": 0.227, + "step": 11166 + }, + { + "epoch": 1.0519771084053602, + "grad_norm": 0.6756893396377563, + "learning_rate": 9.487573376876437e-06, + "loss": 0.2165, + "step": 11167 + }, + { + "epoch": 1.0520713124985281, + "grad_norm": 0.6194762587547302, + "learning_rate": 9.486065348439671e-06, + "loss": 0.2002, + "step": 11168 + }, + { + "epoch": 1.0521655165916959, + "grad_norm": 0.7091140151023865, + "learning_rate": 9.48455733172141e-06, + "loss": 0.2079, + "step": 11169 + }, + { + "epoch": 1.0522597206848638, + "grad_norm": 0.6055117845535278, + "learning_rate": 9.483049326756037e-06, + "loss": 0.1973, + "step": 11170 + }, + { + "epoch": 1.0523539247780316, + "grad_norm": 0.6349391937255859, + "learning_rate": 9.481541333577942e-06, + "loss": 0.1835, + "step": 11171 + }, + { + "epoch": 1.0524481288711995, + "grad_norm": 0.6326323747634888, + "learning_rate": 9.480033352221506e-06, + "loss": 0.2033, + "step": 11172 + }, + { + "epoch": 1.0525423329643673, + "grad_norm": 0.679470419883728, + "learning_rate": 9.478525382721111e-06, + "loss": 0.2208, + "step": 11173 + }, + { + "epoch": 1.0526365370575352, + "grad_norm": 0.6802979111671448, + "learning_rate": 9.477017425111146e-06, + "loss": 0.2196, + "step": 11174 + }, + { + "epoch": 1.052730741150703, + "grad_norm": 0.6467894315719604, + "learning_rate": 9.475509479425992e-06, + "loss": 0.2216, + "step": 11175 + }, + { + "epoch": 1.052824945243871, + "grad_norm": 0.7351828813552856, + "learning_rate": 9.474001545700031e-06, + "loss": 0.2204, + "step": 11176 + }, + { + "epoch": 1.0529191493370387, + "grad_norm": 0.6442490220069885, + "learning_rate": 9.472493623967651e-06, + "loss": 0.1981, + "step": 11177 + }, + { + "epoch": 1.0530133534302066, + "grad_norm": 0.6150367259979248, + "learning_rate": 9.470985714263232e-06, + "loss": 0.2111, + "step": 11178 + }, + { + "epoch": 1.0531075575233744, + "grad_norm": 0.635668158531189, + "learning_rate": 9.46947781662116e-06, + "loss": 0.2153, + "step": 11179 + }, + { + "epoch": 1.0532017616165423, + "grad_norm": 0.7739942669868469, + "learning_rate": 9.467969931075805e-06, + "loss": 0.2399, + "step": 11180 + }, + { + "epoch": 1.05329596570971, + "grad_norm": 0.6315284967422485, + "learning_rate": 9.466462057661564e-06, + "loss": 0.1995, + "step": 11181 + }, + { + "epoch": 1.053390169802878, + "grad_norm": 0.6025282740592957, + "learning_rate": 9.464954196412816e-06, + "loss": 0.2135, + "step": 11182 + }, + { + "epoch": 1.0534843738960458, + "grad_norm": 0.6514477729797363, + "learning_rate": 9.463446347363933e-06, + "loss": 0.2194, + "step": 11183 + }, + { + "epoch": 1.0535785779892137, + "grad_norm": 0.6931238770484924, + "learning_rate": 9.46193851054931e-06, + "loss": 0.2023, + "step": 11184 + }, + { + "epoch": 1.0536727820823815, + "grad_norm": 0.7367113828659058, + "learning_rate": 9.460430686003318e-06, + "loss": 0.2308, + "step": 11185 + }, + { + "epoch": 1.0537669861755494, + "grad_norm": 0.5855939984321594, + "learning_rate": 9.458922873760337e-06, + "loss": 0.189, + "step": 11186 + }, + { + "epoch": 1.0538611902687172, + "grad_norm": 0.6554651260375977, + "learning_rate": 9.457415073854757e-06, + "loss": 0.228, + "step": 11187 + }, + { + "epoch": 1.0539553943618851, + "grad_norm": 0.6301096677780151, + "learning_rate": 9.455907286320953e-06, + "loss": 0.2024, + "step": 11188 + }, + { + "epoch": 1.0540495984550529, + "grad_norm": 0.6394056081771851, + "learning_rate": 9.454399511193302e-06, + "loss": 0.2447, + "step": 11189 + }, + { + "epoch": 1.0541438025482208, + "grad_norm": 0.6422435641288757, + "learning_rate": 9.452891748506183e-06, + "loss": 0.2062, + "step": 11190 + }, + { + "epoch": 1.0542380066413886, + "grad_norm": 1.5789260864257812, + "learning_rate": 9.451383998293981e-06, + "loss": 0.1762, + "step": 11191 + }, + { + "epoch": 1.0543322107345565, + "grad_norm": 0.6653083562850952, + "learning_rate": 9.449876260591074e-06, + "loss": 0.2014, + "step": 11192 + }, + { + "epoch": 1.0544264148277243, + "grad_norm": 0.6630948781967163, + "learning_rate": 9.448368535431835e-06, + "loss": 0.1933, + "step": 11193 + }, + { + "epoch": 1.0545206189208922, + "grad_norm": 0.6700137853622437, + "learning_rate": 9.44686082285065e-06, + "loss": 0.2047, + "step": 11194 + }, + { + "epoch": 1.05461482301406, + "grad_norm": 0.6125686168670654, + "learning_rate": 9.445353122881893e-06, + "loss": 0.2084, + "step": 11195 + }, + { + "epoch": 1.054709027107228, + "grad_norm": 0.636107325553894, + "learning_rate": 9.443845435559941e-06, + "loss": 0.2062, + "step": 11196 + }, + { + "epoch": 1.0548032312003957, + "grad_norm": 0.6367817521095276, + "learning_rate": 9.442337760919174e-06, + "loss": 0.2403, + "step": 11197 + }, + { + "epoch": 1.0548974352935636, + "grad_norm": 0.6435432434082031, + "learning_rate": 9.440830098993969e-06, + "loss": 0.2289, + "step": 11198 + }, + { + "epoch": 1.0549916393867314, + "grad_norm": 0.5958701372146606, + "learning_rate": 9.439322449818705e-06, + "loss": 0.1877, + "step": 11199 + }, + { + "epoch": 1.0550858434798993, + "grad_norm": 0.6421607136726379, + "learning_rate": 9.43781481342775e-06, + "loss": 0.1893, + "step": 11200 + }, + { + "epoch": 1.055180047573067, + "grad_norm": 0.6000587344169617, + "learning_rate": 9.436307189855492e-06, + "loss": 0.1936, + "step": 11201 + }, + { + "epoch": 1.0552742516662348, + "grad_norm": 0.61556077003479, + "learning_rate": 9.434799579136301e-06, + "loss": 0.1812, + "step": 11202 + }, + { + "epoch": 1.0553684557594027, + "grad_norm": 0.5927000045776367, + "learning_rate": 9.43329198130455e-06, + "loss": 0.1851, + "step": 11203 + }, + { + "epoch": 1.0554626598525707, + "grad_norm": 0.6748753190040588, + "learning_rate": 9.431784396394624e-06, + "loss": 0.2094, + "step": 11204 + }, + { + "epoch": 1.0555568639457384, + "grad_norm": 0.800971269607544, + "learning_rate": 9.430276824440889e-06, + "loss": 0.2309, + "step": 11205 + }, + { + "epoch": 1.0556510680389062, + "grad_norm": 0.6645275950431824, + "learning_rate": 9.42876926547772e-06, + "loss": 0.2186, + "step": 11206 + }, + { + "epoch": 1.0557452721320741, + "grad_norm": 0.7603010535240173, + "learning_rate": 9.427261719539502e-06, + "loss": 0.2148, + "step": 11207 + }, + { + "epoch": 1.0558394762252419, + "grad_norm": 0.6783728003501892, + "learning_rate": 9.425754186660601e-06, + "loss": 0.2174, + "step": 11208 + }, + { + "epoch": 1.0559336803184098, + "grad_norm": 0.7164636254310608, + "learning_rate": 9.424246666875392e-06, + "loss": 0.2398, + "step": 11209 + }, + { + "epoch": 1.0560278844115776, + "grad_norm": 0.6651371121406555, + "learning_rate": 9.422739160218248e-06, + "loss": 0.2181, + "step": 11210 + }, + { + "epoch": 1.0561220885047455, + "grad_norm": 0.6751751899719238, + "learning_rate": 9.421231666723543e-06, + "loss": 0.2154, + "step": 11211 + }, + { + "epoch": 1.0562162925979133, + "grad_norm": 0.6399869918823242, + "learning_rate": 9.419724186425654e-06, + "loss": 0.1956, + "step": 11212 + }, + { + "epoch": 1.0563104966910812, + "grad_norm": 0.7180332541465759, + "learning_rate": 9.418216719358947e-06, + "loss": 0.2125, + "step": 11213 + }, + { + "epoch": 1.056404700784249, + "grad_norm": 0.654536247253418, + "learning_rate": 9.416709265557803e-06, + "loss": 0.2027, + "step": 11214 + }, + { + "epoch": 1.056498904877417, + "grad_norm": 0.5757598280906677, + "learning_rate": 9.41520182505659e-06, + "loss": 0.1865, + "step": 11215 + }, + { + "epoch": 1.0565931089705847, + "grad_norm": 0.6821197271347046, + "learning_rate": 9.413694397889676e-06, + "loss": 0.2014, + "step": 11216 + }, + { + "epoch": 1.0566873130637526, + "grad_norm": 0.8251752853393555, + "learning_rate": 9.412186984091438e-06, + "loss": 0.2462, + "step": 11217 + }, + { + "epoch": 1.0567815171569204, + "grad_norm": 0.6823638081550598, + "learning_rate": 9.410679583696247e-06, + "loss": 0.1921, + "step": 11218 + }, + { + "epoch": 1.0568757212500883, + "grad_norm": 0.6674739122390747, + "learning_rate": 9.409172196738474e-06, + "loss": 0.2096, + "step": 11219 + }, + { + "epoch": 1.056969925343256, + "grad_norm": 0.5945629477500916, + "learning_rate": 9.407664823252483e-06, + "loss": 0.1812, + "step": 11220 + }, + { + "epoch": 1.057064129436424, + "grad_norm": 0.5962544083595276, + "learning_rate": 9.406157463272657e-06, + "loss": 0.1884, + "step": 11221 + }, + { + "epoch": 1.0571583335295918, + "grad_norm": 0.6588229537010193, + "learning_rate": 9.404650116833357e-06, + "loss": 0.2061, + "step": 11222 + }, + { + "epoch": 1.0572525376227597, + "grad_norm": 0.6700302362442017, + "learning_rate": 9.40314278396895e-06, + "loss": 0.2181, + "step": 11223 + }, + { + "epoch": 1.0573467417159275, + "grad_norm": 0.6322035789489746, + "learning_rate": 9.401635464713817e-06, + "loss": 0.1991, + "step": 11224 + }, + { + "epoch": 1.0574409458090954, + "grad_norm": 0.7199676036834717, + "learning_rate": 9.40012815910232e-06, + "loss": 0.2043, + "step": 11225 + }, + { + "epoch": 1.0575351499022632, + "grad_norm": 0.6011017560958862, + "learning_rate": 9.398620867168823e-06, + "loss": 0.1958, + "step": 11226 + }, + { + "epoch": 1.0576293539954311, + "grad_norm": 0.5843146443367004, + "learning_rate": 9.397113588947708e-06, + "loss": 0.1841, + "step": 11227 + }, + { + "epoch": 1.0577235580885989, + "grad_norm": 0.5755860805511475, + "learning_rate": 9.395606324473331e-06, + "loss": 0.1828, + "step": 11228 + }, + { + "epoch": 1.0578177621817668, + "grad_norm": 0.7034083604812622, + "learning_rate": 9.394099073780066e-06, + "loss": 0.2021, + "step": 11229 + }, + { + "epoch": 1.0579119662749346, + "grad_norm": 0.6693611741065979, + "learning_rate": 9.392591836902278e-06, + "loss": 0.2441, + "step": 11230 + }, + { + "epoch": 1.0580061703681025, + "grad_norm": 0.5777305364608765, + "learning_rate": 9.391084613874337e-06, + "loss": 0.2235, + "step": 11231 + }, + { + "epoch": 1.0581003744612703, + "grad_norm": 0.5965675711631775, + "learning_rate": 9.389577404730607e-06, + "loss": 0.1691, + "step": 11232 + }, + { + "epoch": 1.0581945785544382, + "grad_norm": 0.6609938740730286, + "learning_rate": 9.388070209505457e-06, + "loss": 0.2126, + "step": 11233 + }, + { + "epoch": 1.058288782647606, + "grad_norm": 0.655796468257904, + "learning_rate": 9.386563028233253e-06, + "loss": 0.2003, + "step": 11234 + }, + { + "epoch": 1.058382986740774, + "grad_norm": 0.6271539926528931, + "learning_rate": 9.38505586094836e-06, + "loss": 0.2071, + "step": 11235 + }, + { + "epoch": 1.0584771908339417, + "grad_norm": 0.6357953548431396, + "learning_rate": 9.383548707685144e-06, + "loss": 0.1967, + "step": 11236 + }, + { + "epoch": 1.0585713949271096, + "grad_norm": 0.6697683334350586, + "learning_rate": 9.382041568477972e-06, + "loss": 0.2285, + "step": 11237 + }, + { + "epoch": 1.0586655990202773, + "grad_norm": 0.6043030023574829, + "learning_rate": 9.380534443361206e-06, + "loss": 0.1843, + "step": 11238 + }, + { + "epoch": 1.0587598031134453, + "grad_norm": 0.6747494339942932, + "learning_rate": 9.379027332369217e-06, + "loss": 0.2013, + "step": 11239 + }, + { + "epoch": 1.058854007206613, + "grad_norm": 0.6584952473640442, + "learning_rate": 9.377520235536358e-06, + "loss": 0.2164, + "step": 11240 + }, + { + "epoch": 1.058948211299781, + "grad_norm": 0.6754341125488281, + "learning_rate": 9.376013152897008e-06, + "loss": 0.2374, + "step": 11241 + }, + { + "epoch": 1.0590424153929487, + "grad_norm": 0.743152916431427, + "learning_rate": 9.37450608448552e-06, + "loss": 0.2149, + "step": 11242 + }, + { + "epoch": 1.0591366194861167, + "grad_norm": 0.6113778352737427, + "learning_rate": 9.372999030336257e-06, + "loss": 0.221, + "step": 11243 + }, + { + "epoch": 1.0592308235792844, + "grad_norm": 0.7457681894302368, + "learning_rate": 9.371491990483591e-06, + "loss": 0.1781, + "step": 11244 + }, + { + "epoch": 1.0593250276724524, + "grad_norm": 0.6920148134231567, + "learning_rate": 9.36998496496188e-06, + "loss": 0.2149, + "step": 11245 + }, + { + "epoch": 1.0594192317656201, + "grad_norm": 0.6757007241249084, + "learning_rate": 9.368477953805481e-06, + "loss": 0.2273, + "step": 11246 + }, + { + "epoch": 1.059513435858788, + "grad_norm": 0.6448094248771667, + "learning_rate": 9.366970957048764e-06, + "loss": 0.2001, + "step": 11247 + }, + { + "epoch": 1.0596076399519558, + "grad_norm": 0.687659740447998, + "learning_rate": 9.365463974726089e-06, + "loss": 0.2095, + "step": 11248 + }, + { + "epoch": 1.0597018440451238, + "grad_norm": 0.6897541880607605, + "learning_rate": 9.363957006871817e-06, + "loss": 0.196, + "step": 11249 + }, + { + "epoch": 1.0597960481382915, + "grad_norm": 0.7084280252456665, + "learning_rate": 9.362450053520307e-06, + "loss": 0.2137, + "step": 11250 + }, + { + "epoch": 1.0598902522314595, + "grad_norm": 0.6229495406150818, + "learning_rate": 9.360943114705923e-06, + "loss": 0.2077, + "step": 11251 + }, + { + "epoch": 1.0599844563246272, + "grad_norm": 0.7185355424880981, + "learning_rate": 9.359436190463025e-06, + "loss": 0.203, + "step": 11252 + }, + { + "epoch": 1.0600786604177952, + "grad_norm": 0.6672647595405579, + "learning_rate": 9.357929280825967e-06, + "loss": 0.2254, + "step": 11253 + }, + { + "epoch": 1.060172864510963, + "grad_norm": 0.5557436943054199, + "learning_rate": 9.35642238582912e-06, + "loss": 0.1844, + "step": 11254 + }, + { + "epoch": 1.060267068604131, + "grad_norm": 0.6353731751441956, + "learning_rate": 9.354915505506839e-06, + "loss": 0.2042, + "step": 11255 + }, + { + "epoch": 1.0603612726972986, + "grad_norm": 0.6660376787185669, + "learning_rate": 9.353408639893477e-06, + "loss": 0.2472, + "step": 11256 + }, + { + "epoch": 1.0604554767904666, + "grad_norm": 0.7244245409965515, + "learning_rate": 9.351901789023402e-06, + "loss": 0.2262, + "step": 11257 + }, + { + "epoch": 1.0605496808836343, + "grad_norm": 0.7049255967140198, + "learning_rate": 9.350394952930968e-06, + "loss": 0.2233, + "step": 11258 + }, + { + "epoch": 1.0606438849768023, + "grad_norm": 0.5961927175521851, + "learning_rate": 9.348888131650536e-06, + "loss": 0.2, + "step": 11259 + }, + { + "epoch": 1.06073808906997, + "grad_norm": 0.6763758063316345, + "learning_rate": 9.347381325216455e-06, + "loss": 0.2219, + "step": 11260 + }, + { + "epoch": 1.060832293163138, + "grad_norm": 0.6583464741706848, + "learning_rate": 9.345874533663095e-06, + "loss": 0.1839, + "step": 11261 + }, + { + "epoch": 1.0609264972563057, + "grad_norm": 0.6635313034057617, + "learning_rate": 9.344367757024807e-06, + "loss": 0.2039, + "step": 11262 + }, + { + "epoch": 1.0610207013494737, + "grad_norm": 0.6379163265228271, + "learning_rate": 9.34286099533594e-06, + "loss": 0.1773, + "step": 11263 + }, + { + "epoch": 1.0611149054426414, + "grad_norm": 0.6061589121818542, + "learning_rate": 9.341354248630868e-06, + "loss": 0.1876, + "step": 11264 + }, + { + "epoch": 1.0612091095358094, + "grad_norm": 0.702111005783081, + "learning_rate": 9.339847516943935e-06, + "loss": 0.2026, + "step": 11265 + }, + { + "epoch": 1.0613033136289771, + "grad_norm": 0.6586620211601257, + "learning_rate": 9.338340800309498e-06, + "loss": 0.2251, + "step": 11266 + }, + { + "epoch": 1.061397517722145, + "grad_norm": 0.6755000948905945, + "learning_rate": 9.336834098761915e-06, + "loss": 0.2407, + "step": 11267 + }, + { + "epoch": 1.0614917218153128, + "grad_norm": 0.5839751958847046, + "learning_rate": 9.335327412335541e-06, + "loss": 0.1932, + "step": 11268 + }, + { + "epoch": 1.0615859259084808, + "grad_norm": 0.6575443148612976, + "learning_rate": 9.33382074106473e-06, + "loss": 0.1992, + "step": 11269 + }, + { + "epoch": 1.0616801300016485, + "grad_norm": 0.65425705909729, + "learning_rate": 9.332314084983834e-06, + "loss": 0.2018, + "step": 11270 + }, + { + "epoch": 1.0617743340948165, + "grad_norm": 0.7227454781532288, + "learning_rate": 9.33080744412721e-06, + "loss": 0.2277, + "step": 11271 + }, + { + "epoch": 1.0618685381879842, + "grad_norm": 0.6328867673873901, + "learning_rate": 9.329300818529215e-06, + "loss": 0.229, + "step": 11272 + }, + { + "epoch": 1.0619627422811522, + "grad_norm": 0.6115502119064331, + "learning_rate": 9.327794208224193e-06, + "loss": 0.1956, + "step": 11273 + }, + { + "epoch": 1.06205694637432, + "grad_norm": 1.076008677482605, + "learning_rate": 9.326287613246506e-06, + "loss": 0.2003, + "step": 11274 + }, + { + "epoch": 1.0621511504674879, + "grad_norm": 0.6074058413505554, + "learning_rate": 9.324781033630504e-06, + "loss": 0.1988, + "step": 11275 + }, + { + "epoch": 1.0622453545606556, + "grad_norm": 0.5642720460891724, + "learning_rate": 9.323274469410535e-06, + "loss": 0.1781, + "step": 11276 + }, + { + "epoch": 1.0623395586538236, + "grad_norm": 0.6526278257369995, + "learning_rate": 9.321767920620958e-06, + "loss": 0.216, + "step": 11277 + }, + { + "epoch": 1.0624337627469913, + "grad_norm": 0.5836901664733887, + "learning_rate": 9.32026138729612e-06, + "loss": 0.1925, + "step": 11278 + }, + { + "epoch": 1.0625279668401593, + "grad_norm": 0.6546528935432434, + "learning_rate": 9.318754869470376e-06, + "loss": 0.1974, + "step": 11279 + }, + { + "epoch": 1.062622170933327, + "grad_norm": 0.6255246996879578, + "learning_rate": 9.31724836717807e-06, + "loss": 0.2095, + "step": 11280 + }, + { + "epoch": 1.062716375026495, + "grad_norm": 0.5917826890945435, + "learning_rate": 9.315741880453562e-06, + "loss": 0.217, + "step": 11281 + }, + { + "epoch": 1.0628105791196627, + "grad_norm": 0.6791194677352905, + "learning_rate": 9.314235409331196e-06, + "loss": 0.2142, + "step": 11282 + }, + { + "epoch": 1.0629047832128307, + "grad_norm": 0.6301477551460266, + "learning_rate": 9.312728953845318e-06, + "loss": 0.2018, + "step": 11283 + }, + { + "epoch": 1.0629989873059984, + "grad_norm": 0.6922203898429871, + "learning_rate": 9.31122251403029e-06, + "loss": 0.2069, + "step": 11284 + }, + { + "epoch": 1.0630931913991664, + "grad_norm": 0.6575744152069092, + "learning_rate": 9.30971608992045e-06, + "loss": 0.1947, + "step": 11285 + }, + { + "epoch": 1.063187395492334, + "grad_norm": 0.7077332735061646, + "learning_rate": 9.308209681550151e-06, + "loss": 0.2213, + "step": 11286 + }, + { + "epoch": 1.063281599585502, + "grad_norm": 0.6976880431175232, + "learning_rate": 9.306703288953742e-06, + "loss": 0.199, + "step": 11287 + }, + { + "epoch": 1.0633758036786698, + "grad_norm": 0.6417021751403809, + "learning_rate": 9.30519691216557e-06, + "loss": 0.2011, + "step": 11288 + }, + { + "epoch": 1.0634700077718378, + "grad_norm": 0.6133342981338501, + "learning_rate": 9.303690551219983e-06, + "loss": 0.1947, + "step": 11289 + }, + { + "epoch": 1.0635642118650055, + "grad_norm": 0.5926858186721802, + "learning_rate": 9.302184206151328e-06, + "loss": 0.1868, + "step": 11290 + }, + { + "epoch": 1.0636584159581735, + "grad_norm": 0.7458446025848389, + "learning_rate": 9.300677876993954e-06, + "loss": 0.2519, + "step": 11291 + }, + { + "epoch": 1.0637526200513412, + "grad_norm": 0.6571434140205383, + "learning_rate": 9.299171563782204e-06, + "loss": 0.2131, + "step": 11292 + }, + { + "epoch": 1.0638468241445092, + "grad_norm": 0.6311631798744202, + "learning_rate": 9.297665266550425e-06, + "loss": 0.2142, + "step": 11293 + }, + { + "epoch": 1.063941028237677, + "grad_norm": 0.6098727583885193, + "learning_rate": 9.296158985332966e-06, + "loss": 0.2132, + "step": 11294 + }, + { + "epoch": 1.0640352323308448, + "grad_norm": 0.6526821851730347, + "learning_rate": 9.29465272016417e-06, + "loss": 0.1984, + "step": 11295 + }, + { + "epoch": 1.0641294364240126, + "grad_norm": 0.6677765846252441, + "learning_rate": 9.293146471078383e-06, + "loss": 0.2243, + "step": 11296 + }, + { + "epoch": 1.0642236405171805, + "grad_norm": 0.6665986776351929, + "learning_rate": 9.291640238109949e-06, + "loss": 0.2035, + "step": 11297 + }, + { + "epoch": 1.0643178446103483, + "grad_norm": 0.7022556662559509, + "learning_rate": 9.290134021293215e-06, + "loss": 0.2245, + "step": 11298 + }, + { + "epoch": 1.0644120487035162, + "grad_norm": 0.6495939493179321, + "learning_rate": 9.288627820662525e-06, + "loss": 0.1929, + "step": 11299 + }, + { + "epoch": 1.064506252796684, + "grad_norm": 0.6828340291976929, + "learning_rate": 9.287121636252214e-06, + "loss": 0.2223, + "step": 11300 + }, + { + "epoch": 1.064600456889852, + "grad_norm": 0.7064345479011536, + "learning_rate": 9.285615468096638e-06, + "loss": 0.2131, + "step": 11301 + }, + { + "epoch": 1.0646946609830197, + "grad_norm": 0.5541938543319702, + "learning_rate": 9.284109316230133e-06, + "loss": 0.1714, + "step": 11302 + }, + { + "epoch": 1.0647888650761876, + "grad_norm": 0.714510440826416, + "learning_rate": 9.282603180687037e-06, + "loss": 0.2158, + "step": 11303 + }, + { + "epoch": 1.0648830691693554, + "grad_norm": 0.7199380993843079, + "learning_rate": 9.281097061501707e-06, + "loss": 0.2386, + "step": 11304 + }, + { + "epoch": 1.0649772732625233, + "grad_norm": 0.6494677066802979, + "learning_rate": 9.279590958708472e-06, + "loss": 0.2296, + "step": 11305 + }, + { + "epoch": 1.065071477355691, + "grad_norm": 0.6022455096244812, + "learning_rate": 9.278084872341675e-06, + "loss": 0.2045, + "step": 11306 + }, + { + "epoch": 1.065165681448859, + "grad_norm": 0.6770690083503723, + "learning_rate": 9.276578802435661e-06, + "loss": 0.2057, + "step": 11307 + }, + { + "epoch": 1.0652598855420268, + "grad_norm": 0.765457034111023, + "learning_rate": 9.275072749024771e-06, + "loss": 0.2148, + "step": 11308 + }, + { + "epoch": 1.0653540896351947, + "grad_norm": 0.6799235343933105, + "learning_rate": 9.273566712143343e-06, + "loss": 0.2046, + "step": 11309 + }, + { + "epoch": 1.0654482937283625, + "grad_norm": 0.6458833813667297, + "learning_rate": 9.272060691825714e-06, + "loss": 0.1892, + "step": 11310 + }, + { + "epoch": 1.0655424978215304, + "grad_norm": 0.6030128598213196, + "learning_rate": 9.270554688106229e-06, + "loss": 0.1804, + "step": 11311 + }, + { + "epoch": 1.0656367019146982, + "grad_norm": 0.6258808374404907, + "learning_rate": 9.269048701019226e-06, + "loss": 0.1843, + "step": 11312 + }, + { + "epoch": 1.0657309060078661, + "grad_norm": 0.6587503552436829, + "learning_rate": 9.267542730599042e-06, + "loss": 0.1917, + "step": 11313 + }, + { + "epoch": 1.0658251101010339, + "grad_norm": 0.7243843078613281, + "learning_rate": 9.266036776880016e-06, + "loss": 0.2126, + "step": 11314 + }, + { + "epoch": 1.0659193141942018, + "grad_norm": 0.6364871859550476, + "learning_rate": 9.26453083989649e-06, + "loss": 0.2465, + "step": 11315 + }, + { + "epoch": 1.0660135182873696, + "grad_norm": 0.6186479926109314, + "learning_rate": 9.263024919682794e-06, + "loss": 0.2243, + "step": 11316 + }, + { + "epoch": 1.0661077223805375, + "grad_norm": 0.7175317406654358, + "learning_rate": 9.261519016273271e-06, + "loss": 0.2146, + "step": 11317 + }, + { + "epoch": 1.0662019264737053, + "grad_norm": 0.6257708072662354, + "learning_rate": 9.26001312970226e-06, + "loss": 0.1892, + "step": 11318 + }, + { + "epoch": 1.0662961305668732, + "grad_norm": 0.6223662495613098, + "learning_rate": 9.258507260004092e-06, + "loss": 0.1993, + "step": 11319 + }, + { + "epoch": 1.066390334660041, + "grad_norm": 0.6748189926147461, + "learning_rate": 9.2570014072131e-06, + "loss": 0.1911, + "step": 11320 + }, + { + "epoch": 1.066484538753209, + "grad_norm": 0.6674708724021912, + "learning_rate": 9.255495571363631e-06, + "loss": 0.1946, + "step": 11321 + }, + { + "epoch": 1.0665787428463767, + "grad_norm": 0.7132311463356018, + "learning_rate": 9.253989752490014e-06, + "loss": 0.192, + "step": 11322 + }, + { + "epoch": 1.0666729469395446, + "grad_norm": 0.6067463159561157, + "learning_rate": 9.25248395062658e-06, + "loss": 0.2067, + "step": 11323 + }, + { + "epoch": 1.0667671510327124, + "grad_norm": 0.6910226941108704, + "learning_rate": 9.250978165807672e-06, + "loss": 0.2155, + "step": 11324 + }, + { + "epoch": 1.0668613551258803, + "grad_norm": 0.6239519119262695, + "learning_rate": 9.24947239806762e-06, + "loss": 0.232, + "step": 11325 + }, + { + "epoch": 1.066955559219048, + "grad_norm": 0.5738794803619385, + "learning_rate": 9.247966647440755e-06, + "loss": 0.1773, + "step": 11326 + }, + { + "epoch": 1.067049763312216, + "grad_norm": 0.6636790633201599, + "learning_rate": 9.246460913961417e-06, + "loss": 0.2303, + "step": 11327 + }, + { + "epoch": 1.0671439674053838, + "grad_norm": 0.604157030582428, + "learning_rate": 9.244955197663934e-06, + "loss": 0.1934, + "step": 11328 + }, + { + "epoch": 1.0672381714985517, + "grad_norm": 0.6229135990142822, + "learning_rate": 9.243449498582642e-06, + "loss": 0.2159, + "step": 11329 + }, + { + "epoch": 1.0673323755917195, + "grad_norm": 0.6797246932983398, + "learning_rate": 9.241943816751868e-06, + "loss": 0.2194, + "step": 11330 + }, + { + "epoch": 1.0674265796848874, + "grad_norm": 0.5860040783882141, + "learning_rate": 9.24043815220595e-06, + "loss": 0.1858, + "step": 11331 + }, + { + "epoch": 1.0675207837780551, + "grad_norm": 0.6930223703384399, + "learning_rate": 9.238932504979217e-06, + "loss": 0.2176, + "step": 11332 + }, + { + "epoch": 1.0676149878712229, + "grad_norm": 0.6356987357139587, + "learning_rate": 9.237426875105998e-06, + "loss": 0.2453, + "step": 11333 + }, + { + "epoch": 1.0677091919643908, + "grad_norm": 0.6183640956878662, + "learning_rate": 9.235921262620625e-06, + "loss": 0.2038, + "step": 11334 + }, + { + "epoch": 1.0678033960575588, + "grad_norm": 0.6134758591651917, + "learning_rate": 9.234415667557432e-06, + "loss": 0.1974, + "step": 11335 + }, + { + "epoch": 1.0678976001507265, + "grad_norm": 0.650878369808197, + "learning_rate": 9.232910089950743e-06, + "loss": 0.1967, + "step": 11336 + }, + { + "epoch": 1.0679918042438943, + "grad_norm": 0.6640805006027222, + "learning_rate": 9.23140452983489e-06, + "loss": 0.2245, + "step": 11337 + }, + { + "epoch": 1.0680860083370622, + "grad_norm": 0.6172104477882385, + "learning_rate": 9.229898987244207e-06, + "loss": 0.1994, + "step": 11338 + }, + { + "epoch": 1.0681802124302302, + "grad_norm": 0.6730762720108032, + "learning_rate": 9.228393462213017e-06, + "loss": 0.2197, + "step": 11339 + }, + { + "epoch": 1.068274416523398, + "grad_norm": 0.6202109456062317, + "learning_rate": 9.226887954775642e-06, + "loss": 0.1859, + "step": 11340 + }, + { + "epoch": 1.0683686206165657, + "grad_norm": 0.587462842464447, + "learning_rate": 9.225382464966426e-06, + "loss": 0.1813, + "step": 11341 + }, + { + "epoch": 1.0684628247097336, + "grad_norm": 0.7010436058044434, + "learning_rate": 9.223876992819685e-06, + "loss": 0.2041, + "step": 11342 + }, + { + "epoch": 1.0685570288029016, + "grad_norm": 0.6356572508811951, + "learning_rate": 9.222371538369744e-06, + "loss": 0.2076, + "step": 11343 + }, + { + "epoch": 1.0686512328960693, + "grad_norm": 0.7032904624938965, + "learning_rate": 9.220866101650942e-06, + "loss": 0.2438, + "step": 11344 + }, + { + "epoch": 1.068745436989237, + "grad_norm": 0.5845944881439209, + "learning_rate": 9.219360682697594e-06, + "loss": 0.1816, + "step": 11345 + }, + { + "epoch": 1.068839641082405, + "grad_norm": 0.6367470622062683, + "learning_rate": 9.21785528154403e-06, + "loss": 0.2149, + "step": 11346 + }, + { + "epoch": 1.068933845175573, + "grad_norm": 0.6324123740196228, + "learning_rate": 9.216349898224575e-06, + "loss": 0.2021, + "step": 11347 + }, + { + "epoch": 1.0690280492687407, + "grad_norm": 0.6973478198051453, + "learning_rate": 9.214844532773557e-06, + "loss": 0.2401, + "step": 11348 + }, + { + "epoch": 1.0691222533619085, + "grad_norm": 0.6931522488594055, + "learning_rate": 9.213339185225294e-06, + "loss": 0.2096, + "step": 11349 + }, + { + "epoch": 1.0692164574550764, + "grad_norm": 0.6650674939155579, + "learning_rate": 9.211833855614115e-06, + "loss": 0.2045, + "step": 11350 + }, + { + "epoch": 1.0693106615482442, + "grad_norm": 0.8251972198486328, + "learning_rate": 9.210328543974346e-06, + "loss": 0.2136, + "step": 11351 + }, + { + "epoch": 1.0694048656414121, + "grad_norm": 0.6624812483787537, + "learning_rate": 9.208823250340305e-06, + "loss": 0.2299, + "step": 11352 + }, + { + "epoch": 1.0694990697345799, + "grad_norm": 0.6316957473754883, + "learning_rate": 9.207317974746314e-06, + "loss": 0.1824, + "step": 11353 + }, + { + "epoch": 1.0695932738277478, + "grad_norm": 0.8904557228088379, + "learning_rate": 9.205812717226705e-06, + "loss": 0.2263, + "step": 11354 + }, + { + "epoch": 1.0696874779209156, + "grad_norm": 0.6415508389472961, + "learning_rate": 9.204307477815792e-06, + "loss": 0.22, + "step": 11355 + }, + { + "epoch": 1.0697816820140835, + "grad_norm": 0.5946205258369446, + "learning_rate": 9.202802256547897e-06, + "loss": 0.1842, + "step": 11356 + }, + { + "epoch": 1.0698758861072513, + "grad_norm": 0.6503834128379822, + "learning_rate": 9.201297053457348e-06, + "loss": 0.2193, + "step": 11357 + }, + { + "epoch": 1.0699700902004192, + "grad_norm": 0.722682774066925, + "learning_rate": 9.19979186857846e-06, + "loss": 0.2533, + "step": 11358 + }, + { + "epoch": 1.070064294293587, + "grad_norm": 0.6491664052009583, + "learning_rate": 9.198286701945556e-06, + "loss": 0.1835, + "step": 11359 + }, + { + "epoch": 1.070158498386755, + "grad_norm": 0.623892605304718, + "learning_rate": 9.196781553592948e-06, + "loss": 0.2086, + "step": 11360 + }, + { + "epoch": 1.0702527024799227, + "grad_norm": 0.6431731581687927, + "learning_rate": 9.19527642355497e-06, + "loss": 0.1842, + "step": 11361 + }, + { + "epoch": 1.0703469065730906, + "grad_norm": 0.7300872206687927, + "learning_rate": 9.193771311865933e-06, + "loss": 0.2475, + "step": 11362 + }, + { + "epoch": 1.0704411106662584, + "grad_norm": 0.6609445214271545, + "learning_rate": 9.192266218560156e-06, + "loss": 0.2091, + "step": 11363 + }, + { + "epoch": 1.0705353147594263, + "grad_norm": 0.6846212148666382, + "learning_rate": 9.190761143671958e-06, + "loss": 0.2429, + "step": 11364 + }, + { + "epoch": 1.070629518852594, + "grad_norm": 0.7073555588722229, + "learning_rate": 9.189256087235657e-06, + "loss": 0.2215, + "step": 11365 + }, + { + "epoch": 1.070723722945762, + "grad_norm": 0.7998649477958679, + "learning_rate": 9.18775104928557e-06, + "loss": 0.1956, + "step": 11366 + }, + { + "epoch": 1.0708179270389298, + "grad_norm": 0.6570374965667725, + "learning_rate": 9.186246029856019e-06, + "loss": 0.2422, + "step": 11367 + }, + { + "epoch": 1.0709121311320977, + "grad_norm": 0.6321113109588623, + "learning_rate": 9.184741028981314e-06, + "loss": 0.2223, + "step": 11368 + }, + { + "epoch": 1.0710063352252654, + "grad_norm": 0.7635213732719421, + "learning_rate": 9.183236046695777e-06, + "loss": 0.2479, + "step": 11369 + }, + { + "epoch": 1.0711005393184334, + "grad_norm": 0.6423308253288269, + "learning_rate": 9.181731083033719e-06, + "loss": 0.1775, + "step": 11370 + }, + { + "epoch": 1.0711947434116011, + "grad_norm": 0.7300950288772583, + "learning_rate": 9.180226138029458e-06, + "loss": 0.1821, + "step": 11371 + }, + { + "epoch": 1.071288947504769, + "grad_norm": 0.670215368270874, + "learning_rate": 9.17872121171731e-06, + "loss": 0.1956, + "step": 11372 + }, + { + "epoch": 1.0713831515979368, + "grad_norm": 0.6570760011672974, + "learning_rate": 9.177216304131586e-06, + "loss": 0.2121, + "step": 11373 + }, + { + "epoch": 1.0714773556911048, + "grad_norm": 0.6348326206207275, + "learning_rate": 9.175711415306604e-06, + "loss": 0.2124, + "step": 11374 + }, + { + "epoch": 1.0715715597842725, + "grad_norm": 0.6312444806098938, + "learning_rate": 9.174206545276678e-06, + "loss": 0.2139, + "step": 11375 + }, + { + "epoch": 1.0716657638774405, + "grad_norm": 0.6658427715301514, + "learning_rate": 9.172701694076118e-06, + "loss": 0.2017, + "step": 11376 + }, + { + "epoch": 1.0717599679706082, + "grad_norm": 0.5940864682197571, + "learning_rate": 9.17119686173924e-06, + "loss": 0.2113, + "step": 11377 + }, + { + "epoch": 1.0718541720637762, + "grad_norm": 0.6423086524009705, + "learning_rate": 9.169692048300357e-06, + "loss": 0.2081, + "step": 11378 + }, + { + "epoch": 1.071948376156944, + "grad_norm": 0.5828261971473694, + "learning_rate": 9.168187253793779e-06, + "loss": 0.196, + "step": 11379 + }, + { + "epoch": 1.072042580250112, + "grad_norm": 0.691644012928009, + "learning_rate": 9.166682478253812e-06, + "loss": 0.2051, + "step": 11380 + }, + { + "epoch": 1.0721367843432796, + "grad_norm": 0.6989017128944397, + "learning_rate": 9.16517772171478e-06, + "loss": 0.2251, + "step": 11381 + }, + { + "epoch": 1.0722309884364476, + "grad_norm": 0.6330791711807251, + "learning_rate": 9.163672984210985e-06, + "loss": 0.1756, + "step": 11382 + }, + { + "epoch": 1.0723251925296153, + "grad_norm": 0.659974992275238, + "learning_rate": 9.162168265776739e-06, + "loss": 0.2001, + "step": 11383 + }, + { + "epoch": 1.0724193966227833, + "grad_norm": 0.5928322672843933, + "learning_rate": 9.160663566446352e-06, + "loss": 0.2088, + "step": 11384 + }, + { + "epoch": 1.072513600715951, + "grad_norm": 0.7007295489311218, + "learning_rate": 9.159158886254134e-06, + "loss": 0.1932, + "step": 11385 + }, + { + "epoch": 1.072607804809119, + "grad_norm": 0.7131476998329163, + "learning_rate": 9.157654225234392e-06, + "loss": 0.2272, + "step": 11386 + }, + { + "epoch": 1.0727020089022867, + "grad_norm": 0.6879523396492004, + "learning_rate": 9.15614958342144e-06, + "loss": 0.2103, + "step": 11387 + }, + { + "epoch": 1.0727962129954547, + "grad_norm": 0.6187193989753723, + "learning_rate": 9.154644960849582e-06, + "loss": 0.2147, + "step": 11388 + }, + { + "epoch": 1.0728904170886224, + "grad_norm": 0.6488523483276367, + "learning_rate": 9.153140357553124e-06, + "loss": 0.2265, + "step": 11389 + }, + { + "epoch": 1.0729846211817904, + "grad_norm": 0.6526038646697998, + "learning_rate": 9.151635773566376e-06, + "loss": 0.2202, + "step": 11390 + }, + { + "epoch": 1.0730788252749581, + "grad_norm": 0.7171820402145386, + "learning_rate": 9.150131208923645e-06, + "loss": 0.2127, + "step": 11391 + }, + { + "epoch": 1.073173029368126, + "grad_norm": 0.6093924641609192, + "learning_rate": 9.148626663659237e-06, + "loss": 0.1775, + "step": 11392 + }, + { + "epoch": 1.0732672334612938, + "grad_norm": 0.6443316340446472, + "learning_rate": 9.147122137807456e-06, + "loss": 0.2021, + "step": 11393 + }, + { + "epoch": 1.0733614375544618, + "grad_norm": 0.6613069772720337, + "learning_rate": 9.145617631402612e-06, + "loss": 0.216, + "step": 11394 + }, + { + "epoch": 1.0734556416476295, + "grad_norm": 0.6950787305831909, + "learning_rate": 9.144113144479006e-06, + "loss": 0.223, + "step": 11395 + }, + { + "epoch": 1.0735498457407975, + "grad_norm": 0.6569063663482666, + "learning_rate": 9.142608677070943e-06, + "loss": 0.2013, + "step": 11396 + }, + { + "epoch": 1.0736440498339652, + "grad_norm": 0.6433458924293518, + "learning_rate": 9.14110422921273e-06, + "loss": 0.2061, + "step": 11397 + }, + { + "epoch": 1.0737382539271332, + "grad_norm": 0.6032097935676575, + "learning_rate": 9.13959980093867e-06, + "loss": 0.2157, + "step": 11398 + }, + { + "epoch": 1.073832458020301, + "grad_norm": 0.6845543384552002, + "learning_rate": 9.138095392283063e-06, + "loss": 0.2149, + "step": 11399 + }, + { + "epoch": 1.0739266621134689, + "grad_norm": 0.7228466868400574, + "learning_rate": 9.13659100328021e-06, + "loss": 0.2223, + "step": 11400 + }, + { + "epoch": 1.0740208662066366, + "grad_norm": 0.6130920648574829, + "learning_rate": 9.135086633964427e-06, + "loss": 0.198, + "step": 11401 + }, + { + "epoch": 1.0741150702998046, + "grad_norm": 0.6409655809402466, + "learning_rate": 9.13358228437e-06, + "loss": 0.1913, + "step": 11402 + }, + { + "epoch": 1.0742092743929723, + "grad_norm": 0.6924681663513184, + "learning_rate": 9.132077954531236e-06, + "loss": 0.1838, + "step": 11403 + }, + { + "epoch": 1.0743034784861403, + "grad_norm": 0.6371403336524963, + "learning_rate": 9.13057364448244e-06, + "loss": 0.1925, + "step": 11404 + }, + { + "epoch": 1.074397682579308, + "grad_norm": 0.6719380021095276, + "learning_rate": 9.129069354257909e-06, + "loss": 0.2164, + "step": 11405 + }, + { + "epoch": 1.074491886672476, + "grad_norm": 0.7450310587882996, + "learning_rate": 9.127565083891942e-06, + "loss": 0.2039, + "step": 11406 + }, + { + "epoch": 1.0745860907656437, + "grad_norm": 0.6084794402122498, + "learning_rate": 9.12606083341884e-06, + "loss": 0.2046, + "step": 11407 + }, + { + "epoch": 1.0746802948588117, + "grad_norm": 0.6674574613571167, + "learning_rate": 9.124556602872905e-06, + "loss": 0.2092, + "step": 11408 + }, + { + "epoch": 1.0747744989519794, + "grad_norm": 0.6063975691795349, + "learning_rate": 9.123052392288433e-06, + "loss": 0.1961, + "step": 11409 + }, + { + "epoch": 1.0748687030451474, + "grad_norm": 0.6519051194190979, + "learning_rate": 9.121548201699721e-06, + "loss": 0.2143, + "step": 11410 + }, + { + "epoch": 1.074962907138315, + "grad_norm": 0.6986181735992432, + "learning_rate": 9.12004403114107e-06, + "loss": 0.2124, + "step": 11411 + }, + { + "epoch": 1.075057111231483, + "grad_norm": 0.7183970808982849, + "learning_rate": 9.118539880646775e-06, + "loss": 0.2766, + "step": 11412 + }, + { + "epoch": 1.0751513153246508, + "grad_norm": 0.6105647087097168, + "learning_rate": 9.117035750251134e-06, + "loss": 0.2162, + "step": 11413 + }, + { + "epoch": 1.0752455194178188, + "grad_norm": 0.6653372645378113, + "learning_rate": 9.115531639988443e-06, + "loss": 0.185, + "step": 11414 + }, + { + "epoch": 1.0753397235109865, + "grad_norm": 0.650631844997406, + "learning_rate": 9.114027549893e-06, + "loss": 0.2188, + "step": 11415 + }, + { + "epoch": 1.0754339276041545, + "grad_norm": 0.6427672505378723, + "learning_rate": 9.112523479999096e-06, + "loss": 0.1923, + "step": 11416 + }, + { + "epoch": 1.0755281316973222, + "grad_norm": 0.7197262048721313, + "learning_rate": 9.111019430341033e-06, + "loss": 0.2176, + "step": 11417 + }, + { + "epoch": 1.0756223357904902, + "grad_norm": 0.746242344379425, + "learning_rate": 9.109515400953102e-06, + "loss": 0.2164, + "step": 11418 + }, + { + "epoch": 1.075716539883658, + "grad_norm": 0.6563817262649536, + "learning_rate": 9.108011391869596e-06, + "loss": 0.1778, + "step": 11419 + }, + { + "epoch": 1.0758107439768259, + "grad_norm": 0.6394875049591064, + "learning_rate": 9.106507403124805e-06, + "loss": 0.2326, + "step": 11420 + }, + { + "epoch": 1.0759049480699936, + "grad_norm": 0.633727490901947, + "learning_rate": 9.105003434753035e-06, + "loss": 0.1941, + "step": 11421 + }, + { + "epoch": 1.0759991521631616, + "grad_norm": 0.6746935844421387, + "learning_rate": 9.103499486788567e-06, + "loss": 0.2017, + "step": 11422 + }, + { + "epoch": 1.0760933562563293, + "grad_norm": 0.7106413841247559, + "learning_rate": 9.101995559265696e-06, + "loss": 0.2331, + "step": 11423 + }, + { + "epoch": 1.0761875603494973, + "grad_norm": 0.6105821132659912, + "learning_rate": 9.100491652218716e-06, + "loss": 0.1998, + "step": 11424 + }, + { + "epoch": 1.076281764442665, + "grad_norm": 0.6974877715110779, + "learning_rate": 9.098987765681917e-06, + "loss": 0.2107, + "step": 11425 + }, + { + "epoch": 1.076375968535833, + "grad_norm": 0.6682244539260864, + "learning_rate": 9.09748389968959e-06, + "loss": 0.1996, + "step": 11426 + }, + { + "epoch": 1.0764701726290007, + "grad_norm": 0.6583214998245239, + "learning_rate": 9.095980054276027e-06, + "loss": 0.2036, + "step": 11427 + }, + { + "epoch": 1.0765643767221686, + "grad_norm": 0.6335369348526001, + "learning_rate": 9.094476229475517e-06, + "loss": 0.2029, + "step": 11428 + }, + { + "epoch": 1.0766585808153364, + "grad_norm": 0.6718069314956665, + "learning_rate": 9.09297242532235e-06, + "loss": 0.2091, + "step": 11429 + }, + { + "epoch": 1.0767527849085043, + "grad_norm": 0.5730756521224976, + "learning_rate": 9.091468641850812e-06, + "loss": 0.1944, + "step": 11430 + }, + { + "epoch": 1.076846989001672, + "grad_norm": 0.6275970339775085, + "learning_rate": 9.089964879095197e-06, + "loss": 0.217, + "step": 11431 + }, + { + "epoch": 1.07694119309484, + "grad_norm": 0.6263253092765808, + "learning_rate": 9.088461137089788e-06, + "loss": 0.2075, + "step": 11432 + }, + { + "epoch": 1.0770353971880078, + "grad_norm": 0.5872949361801147, + "learning_rate": 9.086957415868874e-06, + "loss": 0.2032, + "step": 11433 + }, + { + "epoch": 1.0771296012811757, + "grad_norm": 0.6495922803878784, + "learning_rate": 9.085453715466746e-06, + "loss": 0.2209, + "step": 11434 + }, + { + "epoch": 1.0772238053743435, + "grad_norm": 0.5955852270126343, + "learning_rate": 9.083950035917688e-06, + "loss": 0.1929, + "step": 11435 + }, + { + "epoch": 1.0773180094675114, + "grad_norm": 0.7622807025909424, + "learning_rate": 9.08244637725598e-06, + "loss": 0.1952, + "step": 11436 + }, + { + "epoch": 1.0774122135606792, + "grad_norm": 0.6685764789581299, + "learning_rate": 9.080942739515917e-06, + "loss": 0.1969, + "step": 11437 + }, + { + "epoch": 1.0775064176538471, + "grad_norm": 0.6288415193557739, + "learning_rate": 9.079439122731786e-06, + "loss": 0.2077, + "step": 11438 + }, + { + "epoch": 1.0776006217470149, + "grad_norm": 0.6691920757293701, + "learning_rate": 9.077935526937862e-06, + "loss": 0.207, + "step": 11439 + }, + { + "epoch": 1.0776948258401828, + "grad_norm": 0.6530357003211975, + "learning_rate": 9.076431952168432e-06, + "loss": 0.1769, + "step": 11440 + }, + { + "epoch": 1.0777890299333506, + "grad_norm": 0.6887516975402832, + "learning_rate": 9.074928398457785e-06, + "loss": 0.1851, + "step": 11441 + }, + { + "epoch": 1.0778832340265185, + "grad_norm": 0.6515269875526428, + "learning_rate": 9.073424865840202e-06, + "loss": 0.2113, + "step": 11442 + }, + { + "epoch": 1.0779774381196863, + "grad_norm": 0.6228629350662231, + "learning_rate": 9.071921354349961e-06, + "loss": 0.2162, + "step": 11443 + }, + { + "epoch": 1.0780716422128542, + "grad_norm": 0.5888656973838806, + "learning_rate": 9.07041786402135e-06, + "loss": 0.1957, + "step": 11444 + }, + { + "epoch": 1.078165846306022, + "grad_norm": 0.67299485206604, + "learning_rate": 9.068914394888651e-06, + "loss": 0.1915, + "step": 11445 + }, + { + "epoch": 1.07826005039919, + "grad_norm": 0.6621066927909851, + "learning_rate": 9.06741094698614e-06, + "loss": 0.202, + "step": 11446 + }, + { + "epoch": 1.0783542544923577, + "grad_norm": 0.6297824382781982, + "learning_rate": 9.065907520348104e-06, + "loss": 0.1877, + "step": 11447 + }, + { + "epoch": 1.0784484585855256, + "grad_norm": 0.6334999203681946, + "learning_rate": 9.064404115008824e-06, + "loss": 0.2458, + "step": 11448 + }, + { + "epoch": 1.0785426626786934, + "grad_norm": 0.5947049856185913, + "learning_rate": 9.062900731002575e-06, + "loss": 0.1841, + "step": 11449 + }, + { + "epoch": 1.0786368667718613, + "grad_norm": 0.6204099059104919, + "learning_rate": 9.061397368363635e-06, + "loss": 0.2255, + "step": 11450 + }, + { + "epoch": 1.078731070865029, + "grad_norm": 0.6177732348442078, + "learning_rate": 9.05989402712629e-06, + "loss": 0.2146, + "step": 11451 + }, + { + "epoch": 1.078825274958197, + "grad_norm": 0.7066769599914551, + "learning_rate": 9.058390707324817e-06, + "loss": 0.2223, + "step": 11452 + }, + { + "epoch": 1.0789194790513648, + "grad_norm": 0.6135818958282471, + "learning_rate": 9.056887408993488e-06, + "loss": 0.2093, + "step": 11453 + }, + { + "epoch": 1.0790136831445327, + "grad_norm": 0.5583428144454956, + "learning_rate": 9.055384132166587e-06, + "loss": 0.2067, + "step": 11454 + }, + { + "epoch": 1.0791078872377005, + "grad_norm": 0.7905045747756958, + "learning_rate": 9.053880876878392e-06, + "loss": 0.1852, + "step": 11455 + }, + { + "epoch": 1.0792020913308684, + "grad_norm": 0.7285098433494568, + "learning_rate": 9.052377643163168e-06, + "loss": 0.2386, + "step": 11456 + }, + { + "epoch": 1.0792962954240362, + "grad_norm": 0.6515780091285706, + "learning_rate": 9.050874431055205e-06, + "loss": 0.2123, + "step": 11457 + }, + { + "epoch": 1.0793904995172041, + "grad_norm": 0.6538022756576538, + "learning_rate": 9.049371240588774e-06, + "loss": 0.1863, + "step": 11458 + }, + { + "epoch": 1.0794847036103719, + "grad_norm": 0.6690369844436646, + "learning_rate": 9.047868071798146e-06, + "loss": 0.2184, + "step": 11459 + }, + { + "epoch": 1.0795789077035398, + "grad_norm": 0.6839024424552917, + "learning_rate": 9.046364924717598e-06, + "loss": 0.2219, + "step": 11460 + }, + { + "epoch": 1.0796731117967076, + "grad_norm": 0.6652877330780029, + "learning_rate": 9.044861799381407e-06, + "loss": 0.2138, + "step": 11461 + }, + { + "epoch": 1.0797673158898755, + "grad_norm": 0.6577285528182983, + "learning_rate": 9.043358695823841e-06, + "loss": 0.2233, + "step": 11462 + }, + { + "epoch": 1.0798615199830432, + "grad_norm": 0.724801242351532, + "learning_rate": 9.041855614079177e-06, + "loss": 0.2355, + "step": 11463 + }, + { + "epoch": 1.0799557240762112, + "grad_norm": 0.5934066772460938, + "learning_rate": 9.040352554181688e-06, + "loss": 0.174, + "step": 11464 + }, + { + "epoch": 1.080049928169379, + "grad_norm": 0.7215653657913208, + "learning_rate": 9.038849516165643e-06, + "loss": 0.2055, + "step": 11465 + }, + { + "epoch": 1.080144132262547, + "grad_norm": 0.6846879720687866, + "learning_rate": 9.037346500065318e-06, + "loss": 0.2189, + "step": 11466 + }, + { + "epoch": 1.0802383363557146, + "grad_norm": 0.6117352247238159, + "learning_rate": 9.03584350591498e-06, + "loss": 0.2015, + "step": 11467 + }, + { + "epoch": 1.0803325404488826, + "grad_norm": 0.6313402652740479, + "learning_rate": 9.034340533748901e-06, + "loss": 0.198, + "step": 11468 + }, + { + "epoch": 1.0804267445420503, + "grad_norm": 0.6113153100013733, + "learning_rate": 9.032837583601354e-06, + "loss": 0.2173, + "step": 11469 + }, + { + "epoch": 1.0805209486352183, + "grad_norm": 0.6613195538520813, + "learning_rate": 9.031334655506601e-06, + "loss": 0.2144, + "step": 11470 + }, + { + "epoch": 1.080615152728386, + "grad_norm": 0.661891520023346, + "learning_rate": 9.02983174949892e-06, + "loss": 0.1997, + "step": 11471 + }, + { + "epoch": 1.0807093568215538, + "grad_norm": 0.6959601044654846, + "learning_rate": 9.028328865612574e-06, + "loss": 0.2257, + "step": 11472 + }, + { + "epoch": 1.0808035609147217, + "grad_norm": 0.701431930065155, + "learning_rate": 9.026826003881831e-06, + "loss": 0.2089, + "step": 11473 + }, + { + "epoch": 1.0808977650078897, + "grad_norm": 0.6624088287353516, + "learning_rate": 9.025323164340962e-06, + "loss": 0.1993, + "step": 11474 + }, + { + "epoch": 1.0809919691010574, + "grad_norm": 0.684364378452301, + "learning_rate": 9.023820347024236e-06, + "loss": 0.1855, + "step": 11475 + }, + { + "epoch": 1.0810861731942252, + "grad_norm": 0.6876487135887146, + "learning_rate": 9.022317551965908e-06, + "loss": 0.2242, + "step": 11476 + }, + { + "epoch": 1.0811803772873931, + "grad_norm": 0.5800148248672485, + "learning_rate": 9.020814779200255e-06, + "loss": 0.1951, + "step": 11477 + }, + { + "epoch": 1.081274581380561, + "grad_norm": 0.8090377449989319, + "learning_rate": 9.019312028761544e-06, + "loss": 0.2272, + "step": 11478 + }, + { + "epoch": 1.0813687854737288, + "grad_norm": 0.6785503625869751, + "learning_rate": 9.017809300684031e-06, + "loss": 0.2314, + "step": 11479 + }, + { + "epoch": 1.0814629895668966, + "grad_norm": 0.7226163744926453, + "learning_rate": 9.016306595001985e-06, + "loss": 0.2475, + "step": 11480 + }, + { + "epoch": 1.0815571936600645, + "grad_norm": 0.6443579196929932, + "learning_rate": 9.01480391174967e-06, + "loss": 0.1798, + "step": 11481 + }, + { + "epoch": 1.0816513977532325, + "grad_norm": 0.6113689541816711, + "learning_rate": 9.013301250961351e-06, + "loss": 0.185, + "step": 11482 + }, + { + "epoch": 1.0817456018464002, + "grad_norm": 0.6521504521369934, + "learning_rate": 9.011798612671286e-06, + "loss": 0.1943, + "step": 11483 + }, + { + "epoch": 1.081839805939568, + "grad_norm": 0.6238689422607422, + "learning_rate": 9.010295996913744e-06, + "loss": 0.2107, + "step": 11484 + }, + { + "epoch": 1.081934010032736, + "grad_norm": 0.6215823888778687, + "learning_rate": 9.008793403722984e-06, + "loss": 0.2387, + "step": 11485 + }, + { + "epoch": 1.0820282141259039, + "grad_norm": 0.5636431574821472, + "learning_rate": 9.007290833133264e-06, + "loss": 0.1883, + "step": 11486 + }, + { + "epoch": 1.0821224182190716, + "grad_norm": 0.6293860077857971, + "learning_rate": 9.005788285178851e-06, + "loss": 0.2032, + "step": 11487 + }, + { + "epoch": 1.0822166223122394, + "grad_norm": 0.6379905343055725, + "learning_rate": 9.004285759894004e-06, + "loss": 0.211, + "step": 11488 + }, + { + "epoch": 1.0823108264054073, + "grad_norm": 0.6105638742446899, + "learning_rate": 9.00278325731298e-06, + "loss": 0.1981, + "step": 11489 + }, + { + "epoch": 1.082405030498575, + "grad_norm": 0.6221534609794617, + "learning_rate": 9.001280777470038e-06, + "loss": 0.1923, + "step": 11490 + }, + { + "epoch": 1.082499234591743, + "grad_norm": 0.6618968844413757, + "learning_rate": 8.999778320399441e-06, + "loss": 0.1764, + "step": 11491 + }, + { + "epoch": 1.0825934386849108, + "grad_norm": 0.6597989201545715, + "learning_rate": 8.998275886135446e-06, + "loss": 0.2261, + "step": 11492 + }, + { + "epoch": 1.0826876427780787, + "grad_norm": 0.6791762709617615, + "learning_rate": 8.996773474712307e-06, + "loss": 0.234, + "step": 11493 + }, + { + "epoch": 1.0827818468712465, + "grad_norm": 0.5946647524833679, + "learning_rate": 8.995271086164287e-06, + "loss": 0.1826, + "step": 11494 + }, + { + "epoch": 1.0828760509644144, + "grad_norm": 0.5931686162948608, + "learning_rate": 8.993768720525642e-06, + "loss": 0.202, + "step": 11495 + }, + { + "epoch": 1.0829702550575822, + "grad_norm": 0.6495991945266724, + "learning_rate": 8.992266377830619e-06, + "loss": 0.2053, + "step": 11496 + }, + { + "epoch": 1.0830644591507501, + "grad_norm": 0.6207659840583801, + "learning_rate": 8.990764058113486e-06, + "loss": 0.1994, + "step": 11497 + }, + { + "epoch": 1.0831586632439179, + "grad_norm": 0.6050601601600647, + "learning_rate": 8.989261761408496e-06, + "loss": 0.196, + "step": 11498 + }, + { + "epoch": 1.0832528673370858, + "grad_norm": 0.6067149639129639, + "learning_rate": 8.9877594877499e-06, + "loss": 0.2223, + "step": 11499 + }, + { + "epoch": 1.0833470714302535, + "grad_norm": 0.6345522403717041, + "learning_rate": 8.986257237171947e-06, + "loss": 0.2142, + "step": 11500 + }, + { + "epoch": 1.0834412755234215, + "grad_norm": 0.682036817073822, + "learning_rate": 8.984755009708903e-06, + "loss": 0.2266, + "step": 11501 + }, + { + "epoch": 1.0835354796165892, + "grad_norm": 0.6006115674972534, + "learning_rate": 8.983252805395011e-06, + "loss": 0.1968, + "step": 11502 + }, + { + "epoch": 1.0836296837097572, + "grad_norm": 0.6100013852119446, + "learning_rate": 8.98175062426453e-06, + "loss": 0.223, + "step": 11503 + }, + { + "epoch": 1.083723887802925, + "grad_norm": 0.6218665838241577, + "learning_rate": 8.980248466351708e-06, + "loss": 0.201, + "step": 11504 + }, + { + "epoch": 1.083818091896093, + "grad_norm": 0.7070738077163696, + "learning_rate": 8.978746331690799e-06, + "loss": 0.2226, + "step": 11505 + }, + { + "epoch": 1.0839122959892606, + "grad_norm": 0.5810721516609192, + "learning_rate": 8.977244220316051e-06, + "loss": 0.2086, + "step": 11506 + }, + { + "epoch": 1.0840065000824286, + "grad_norm": 0.6965169310569763, + "learning_rate": 8.975742132261719e-06, + "loss": 0.23, + "step": 11507 + }, + { + "epoch": 1.0841007041755963, + "grad_norm": 0.7126672863960266, + "learning_rate": 8.97424006756205e-06, + "loss": 0.2139, + "step": 11508 + }, + { + "epoch": 1.0841949082687643, + "grad_norm": 0.7475795745849609, + "learning_rate": 8.972738026251295e-06, + "loss": 0.1893, + "step": 11509 + }, + { + "epoch": 1.084289112361932, + "grad_norm": 0.6632144451141357, + "learning_rate": 8.971236008363698e-06, + "loss": 0.2225, + "step": 11510 + }, + { + "epoch": 1.0843833164551, + "grad_norm": 0.6255399584770203, + "learning_rate": 8.969734013933514e-06, + "loss": 0.2064, + "step": 11511 + }, + { + "epoch": 1.0844775205482677, + "grad_norm": 0.7544078230857849, + "learning_rate": 8.968232042994988e-06, + "loss": 0.2123, + "step": 11512 + }, + { + "epoch": 1.0845717246414357, + "grad_norm": 0.7256659269332886, + "learning_rate": 8.966730095582363e-06, + "loss": 0.1864, + "step": 11513 + }, + { + "epoch": 1.0846659287346034, + "grad_norm": 0.6728140711784363, + "learning_rate": 8.965228171729894e-06, + "loss": 0.2011, + "step": 11514 + }, + { + "epoch": 1.0847601328277714, + "grad_norm": 0.688495397567749, + "learning_rate": 8.963726271471825e-06, + "loss": 0.2061, + "step": 11515 + }, + { + "epoch": 1.0848543369209391, + "grad_norm": 0.6378021240234375, + "learning_rate": 8.962224394842393e-06, + "loss": 0.2243, + "step": 11516 + }, + { + "epoch": 1.084948541014107, + "grad_norm": 0.6383763551712036, + "learning_rate": 8.960722541875854e-06, + "loss": 0.193, + "step": 11517 + }, + { + "epoch": 1.0850427451072748, + "grad_norm": 0.6939751505851746, + "learning_rate": 8.95922071260645e-06, + "loss": 0.2142, + "step": 11518 + }, + { + "epoch": 1.0851369492004428, + "grad_norm": 0.7284465432167053, + "learning_rate": 8.957718907068422e-06, + "loss": 0.2245, + "step": 11519 + }, + { + "epoch": 1.0852311532936105, + "grad_norm": 0.618031919002533, + "learning_rate": 8.956217125296011e-06, + "loss": 0.2071, + "step": 11520 + }, + { + "epoch": 1.0853253573867785, + "grad_norm": 0.5998331308364868, + "learning_rate": 8.954715367323468e-06, + "loss": 0.1908, + "step": 11521 + }, + { + "epoch": 1.0854195614799462, + "grad_norm": 0.6736962795257568, + "learning_rate": 8.95321363318503e-06, + "loss": 0.221, + "step": 11522 + }, + { + "epoch": 1.0855137655731142, + "grad_norm": 0.6591629385948181, + "learning_rate": 8.951711922914937e-06, + "loss": 0.1875, + "step": 11523 + }, + { + "epoch": 1.085607969666282, + "grad_norm": 0.5983251929283142, + "learning_rate": 8.950210236547437e-06, + "loss": 0.1992, + "step": 11524 + }, + { + "epoch": 1.0857021737594499, + "grad_norm": 0.6645702719688416, + "learning_rate": 8.948708574116768e-06, + "loss": 0.2453, + "step": 11525 + }, + { + "epoch": 1.0857963778526176, + "grad_norm": 0.6778318285942078, + "learning_rate": 8.947206935657165e-06, + "loss": 0.1903, + "step": 11526 + }, + { + "epoch": 1.0858905819457856, + "grad_norm": 0.6595224142074585, + "learning_rate": 8.945705321202876e-06, + "loss": 0.1976, + "step": 11527 + }, + { + "epoch": 1.0859847860389533, + "grad_norm": 0.6204866766929626, + "learning_rate": 8.944203730788134e-06, + "loss": 0.2079, + "step": 11528 + }, + { + "epoch": 1.0860789901321213, + "grad_norm": 0.6215004920959473, + "learning_rate": 8.942702164447181e-06, + "loss": 0.1862, + "step": 11529 + }, + { + "epoch": 1.086173194225289, + "grad_norm": 0.6346989274024963, + "learning_rate": 8.941200622214254e-06, + "loss": 0.2117, + "step": 11530 + }, + { + "epoch": 1.086267398318457, + "grad_norm": 0.6370419263839722, + "learning_rate": 8.93969910412359e-06, + "loss": 0.2039, + "step": 11531 + }, + { + "epoch": 1.0863616024116247, + "grad_norm": 0.5927845239639282, + "learning_rate": 8.938197610209429e-06, + "loss": 0.197, + "step": 11532 + }, + { + "epoch": 1.0864558065047927, + "grad_norm": 0.6324030160903931, + "learning_rate": 8.936696140505997e-06, + "loss": 0.2092, + "step": 11533 + }, + { + "epoch": 1.0865500105979604, + "grad_norm": 0.6234959959983826, + "learning_rate": 8.935194695047543e-06, + "loss": 0.2032, + "step": 11534 + }, + { + "epoch": 1.0866442146911284, + "grad_norm": 0.6283422112464905, + "learning_rate": 8.933693273868298e-06, + "loss": 0.1923, + "step": 11535 + }, + { + "epoch": 1.0867384187842961, + "grad_norm": 0.6398184895515442, + "learning_rate": 8.93219187700249e-06, + "loss": 0.2109, + "step": 11536 + }, + { + "epoch": 1.086832622877464, + "grad_norm": 0.7055505514144897, + "learning_rate": 8.930690504484362e-06, + "loss": 0.2371, + "step": 11537 + }, + { + "epoch": 1.0869268269706318, + "grad_norm": 0.6347362399101257, + "learning_rate": 8.929189156348146e-06, + "loss": 0.2087, + "step": 11538 + }, + { + "epoch": 1.0870210310637998, + "grad_norm": 0.6415491700172424, + "learning_rate": 8.927687832628072e-06, + "loss": 0.227, + "step": 11539 + }, + { + "epoch": 1.0871152351569675, + "grad_norm": 0.6096305847167969, + "learning_rate": 8.92618653335837e-06, + "loss": 0.1748, + "step": 11540 + }, + { + "epoch": 1.0872094392501355, + "grad_norm": 0.6415346264839172, + "learning_rate": 8.92468525857328e-06, + "loss": 0.199, + "step": 11541 + }, + { + "epoch": 1.0873036433433032, + "grad_norm": 0.6052883267402649, + "learning_rate": 8.923184008307029e-06, + "loss": 0.1776, + "step": 11542 + }, + { + "epoch": 1.0873978474364712, + "grad_norm": 0.6329769492149353, + "learning_rate": 8.921682782593844e-06, + "loss": 0.1992, + "step": 11543 + }, + { + "epoch": 1.087492051529639, + "grad_norm": 0.6819673180580139, + "learning_rate": 8.920181581467963e-06, + "loss": 0.2024, + "step": 11544 + }, + { + "epoch": 1.0875862556228069, + "grad_norm": 0.6423011422157288, + "learning_rate": 8.918680404963613e-06, + "loss": 0.2095, + "step": 11545 + }, + { + "epoch": 1.0876804597159746, + "grad_norm": 0.5891927480697632, + "learning_rate": 8.917179253115018e-06, + "loss": 0.2009, + "step": 11546 + }, + { + "epoch": 1.0877746638091426, + "grad_norm": 0.6175965666770935, + "learning_rate": 8.915678125956411e-06, + "loss": 0.2024, + "step": 11547 + }, + { + "epoch": 1.0878688679023103, + "grad_norm": 0.623192548751831, + "learning_rate": 8.914177023522022e-06, + "loss": 0.2193, + "step": 11548 + }, + { + "epoch": 1.0879630719954783, + "grad_norm": 0.6816789507865906, + "learning_rate": 8.912675945846076e-06, + "loss": 0.1949, + "step": 11549 + }, + { + "epoch": 1.088057276088646, + "grad_norm": 0.5938009023666382, + "learning_rate": 8.911174892962798e-06, + "loss": 0.1997, + "step": 11550 + }, + { + "epoch": 1.088151480181814, + "grad_norm": 0.6361944079399109, + "learning_rate": 8.909673864906418e-06, + "loss": 0.1786, + "step": 11551 + }, + { + "epoch": 1.0882456842749817, + "grad_norm": 0.6905481219291687, + "learning_rate": 8.908172861711165e-06, + "loss": 0.197, + "step": 11552 + }, + { + "epoch": 1.0883398883681497, + "grad_norm": 0.6696801781654358, + "learning_rate": 8.906671883411248e-06, + "loss": 0.2219, + "step": 11553 + }, + { + "epoch": 1.0884340924613174, + "grad_norm": 0.6810381412506104, + "learning_rate": 8.905170930040911e-06, + "loss": 0.2015, + "step": 11554 + }, + { + "epoch": 1.0885282965544854, + "grad_norm": 0.6643819808959961, + "learning_rate": 8.90367000163437e-06, + "loss": 0.2155, + "step": 11555 + }, + { + "epoch": 1.088622500647653, + "grad_norm": 0.800377607345581, + "learning_rate": 8.902169098225843e-06, + "loss": 0.2062, + "step": 11556 + }, + { + "epoch": 1.088716704740821, + "grad_norm": 0.6669199466705322, + "learning_rate": 8.900668219849562e-06, + "loss": 0.2056, + "step": 11557 + }, + { + "epoch": 1.0888109088339888, + "grad_norm": 0.6585336327552795, + "learning_rate": 8.899167366539748e-06, + "loss": 0.2203, + "step": 11558 + }, + { + "epoch": 1.0889051129271567, + "grad_norm": 0.6978430151939392, + "learning_rate": 8.897666538330619e-06, + "loss": 0.1806, + "step": 11559 + }, + { + "epoch": 1.0889993170203245, + "grad_norm": 0.6860094666481018, + "learning_rate": 8.896165735256396e-06, + "loss": 0.2212, + "step": 11560 + }, + { + "epoch": 1.0890935211134924, + "grad_norm": 0.6172075867652893, + "learning_rate": 8.894664957351302e-06, + "loss": 0.2004, + "step": 11561 + }, + { + "epoch": 1.0891877252066602, + "grad_norm": 0.6051881909370422, + "learning_rate": 8.893164204649557e-06, + "loss": 0.222, + "step": 11562 + }, + { + "epoch": 1.0892819292998281, + "grad_norm": 0.6277562975883484, + "learning_rate": 8.891663477185378e-06, + "loss": 0.1965, + "step": 11563 + }, + { + "epoch": 1.0893761333929959, + "grad_norm": 0.6940679550170898, + "learning_rate": 8.890162774992988e-06, + "loss": 0.2307, + "step": 11564 + }, + { + "epoch": 1.0894703374861638, + "grad_norm": 0.75794517993927, + "learning_rate": 8.888662098106603e-06, + "loss": 0.2627, + "step": 11565 + }, + { + "epoch": 1.0895645415793316, + "grad_norm": 0.6346994638442993, + "learning_rate": 8.887161446560439e-06, + "loss": 0.1979, + "step": 11566 + }, + { + "epoch": 1.0896587456724995, + "grad_norm": 0.6490880250930786, + "learning_rate": 8.885660820388717e-06, + "loss": 0.2133, + "step": 11567 + }, + { + "epoch": 1.0897529497656673, + "grad_norm": 0.6442879438400269, + "learning_rate": 8.884160219625651e-06, + "loss": 0.2047, + "step": 11568 + }, + { + "epoch": 1.0898471538588352, + "grad_norm": 0.644882321357727, + "learning_rate": 8.882659644305457e-06, + "loss": 0.2047, + "step": 11569 + }, + { + "epoch": 1.089941357952003, + "grad_norm": 0.6053950786590576, + "learning_rate": 8.881159094462351e-06, + "loss": 0.2019, + "step": 11570 + }, + { + "epoch": 1.090035562045171, + "grad_norm": 0.5669155120849609, + "learning_rate": 8.879658570130549e-06, + "loss": 0.2011, + "step": 11571 + }, + { + "epoch": 1.0901297661383387, + "grad_norm": 0.6672794222831726, + "learning_rate": 8.878158071344266e-06, + "loss": 0.2326, + "step": 11572 + }, + { + "epoch": 1.0902239702315066, + "grad_norm": 0.6535966396331787, + "learning_rate": 8.87665759813771e-06, + "loss": 0.2188, + "step": 11573 + }, + { + "epoch": 1.0903181743246744, + "grad_norm": 0.7254478335380554, + "learning_rate": 8.875157150545099e-06, + "loss": 0.2112, + "step": 11574 + }, + { + "epoch": 1.0904123784178423, + "grad_norm": 0.6615368127822876, + "learning_rate": 8.873656728600649e-06, + "loss": 0.2244, + "step": 11575 + }, + { + "epoch": 1.09050658251101, + "grad_norm": 0.675292432308197, + "learning_rate": 8.87215633233856e-06, + "loss": 0.2232, + "step": 11576 + }, + { + "epoch": 1.090600786604178, + "grad_norm": 0.5594111084938049, + "learning_rate": 8.870655961793057e-06, + "loss": 0.1753, + "step": 11577 + }, + { + "epoch": 1.0906949906973458, + "grad_norm": 0.5725761651992798, + "learning_rate": 8.869155616998343e-06, + "loss": 0.1941, + "step": 11578 + }, + { + "epoch": 1.0907891947905137, + "grad_norm": 0.6048107147216797, + "learning_rate": 8.86765529798863e-06, + "loss": 0.1874, + "step": 11579 + }, + { + "epoch": 1.0908833988836815, + "grad_norm": 0.6969088315963745, + "learning_rate": 8.866155004798127e-06, + "loss": 0.1968, + "step": 11580 + }, + { + "epoch": 1.0909776029768494, + "grad_norm": 0.6184073090553284, + "learning_rate": 8.864654737461042e-06, + "loss": 0.2067, + "step": 11581 + }, + { + "epoch": 1.0910718070700172, + "grad_norm": 0.6290350556373596, + "learning_rate": 8.863154496011588e-06, + "loss": 0.2249, + "step": 11582 + }, + { + "epoch": 1.0911660111631851, + "grad_norm": 0.6085337996482849, + "learning_rate": 8.861654280483965e-06, + "loss": 0.1897, + "step": 11583 + }, + { + "epoch": 1.0912602152563529, + "grad_norm": 0.5991551280021667, + "learning_rate": 8.860154090912388e-06, + "loss": 0.1789, + "step": 11584 + }, + { + "epoch": 1.0913544193495208, + "grad_norm": 0.6495198607444763, + "learning_rate": 8.858653927331061e-06, + "loss": 0.1981, + "step": 11585 + }, + { + "epoch": 1.0914486234426886, + "grad_norm": 0.5962744355201721, + "learning_rate": 8.857153789774188e-06, + "loss": 0.1957, + "step": 11586 + }, + { + "epoch": 1.0915428275358565, + "grad_norm": 0.6588103175163269, + "learning_rate": 8.855653678275977e-06, + "loss": 0.2188, + "step": 11587 + }, + { + "epoch": 1.0916370316290243, + "grad_norm": 0.5866307020187378, + "learning_rate": 8.85415359287063e-06, + "loss": 0.196, + "step": 11588 + }, + { + "epoch": 1.0917312357221922, + "grad_norm": 0.5782949328422546, + "learning_rate": 8.852653533592356e-06, + "loss": 0.1938, + "step": 11589 + }, + { + "epoch": 1.09182543981536, + "grad_norm": 0.5999405980110168, + "learning_rate": 8.851153500475354e-06, + "loss": 0.1614, + "step": 11590 + }, + { + "epoch": 1.091919643908528, + "grad_norm": 0.652942955493927, + "learning_rate": 8.84965349355383e-06, + "loss": 0.2082, + "step": 11591 + }, + { + "epoch": 1.0920138480016957, + "grad_norm": 0.6853984594345093, + "learning_rate": 8.848153512861987e-06, + "loss": 0.2133, + "step": 11592 + }, + { + "epoch": 1.0921080520948636, + "grad_norm": 0.7114432454109192, + "learning_rate": 8.84665355843402e-06, + "loss": 0.21, + "step": 11593 + }, + { + "epoch": 1.0922022561880314, + "grad_norm": 0.6580626964569092, + "learning_rate": 8.84515363030414e-06, + "loss": 0.2203, + "step": 11594 + }, + { + "epoch": 1.0922964602811993, + "grad_norm": 0.6535913348197937, + "learning_rate": 8.843653728506544e-06, + "loss": 0.2227, + "step": 11595 + }, + { + "epoch": 1.092390664374367, + "grad_norm": 0.6289659142494202, + "learning_rate": 8.842153853075426e-06, + "loss": 0.1904, + "step": 11596 + }, + { + "epoch": 1.092484868467535, + "grad_norm": 0.7105277180671692, + "learning_rate": 8.840654004044996e-06, + "loss": 0.1893, + "step": 11597 + }, + { + "epoch": 1.0925790725607027, + "grad_norm": 0.6600250601768494, + "learning_rate": 8.839154181449447e-06, + "loss": 0.1956, + "step": 11598 + }, + { + "epoch": 1.0926732766538707, + "grad_norm": 0.6647025942802429, + "learning_rate": 8.837654385322976e-06, + "loss": 0.1906, + "step": 11599 + }, + { + "epoch": 1.0927674807470384, + "grad_norm": 0.6685451865196228, + "learning_rate": 8.836154615699782e-06, + "loss": 0.2304, + "step": 11600 + }, + { + "epoch": 1.0928616848402064, + "grad_norm": 0.590814471244812, + "learning_rate": 8.834654872614065e-06, + "loss": 0.1957, + "step": 11601 + }, + { + "epoch": 1.0929558889333741, + "grad_norm": 0.637969434261322, + "learning_rate": 8.83315515610002e-06, + "loss": 0.2056, + "step": 11602 + }, + { + "epoch": 1.093050093026542, + "grad_norm": 0.652006983757019, + "learning_rate": 8.831655466191837e-06, + "loss": 0.1958, + "step": 11603 + }, + { + "epoch": 1.0931442971197098, + "grad_norm": 0.6946149468421936, + "learning_rate": 8.830155802923721e-06, + "loss": 0.2242, + "step": 11604 + }, + { + "epoch": 1.0932385012128778, + "grad_norm": 0.6098273992538452, + "learning_rate": 8.828656166329861e-06, + "loss": 0.2052, + "step": 11605 + }, + { + "epoch": 1.0933327053060455, + "grad_norm": 0.622878909111023, + "learning_rate": 8.82715655644445e-06, + "loss": 0.2085, + "step": 11606 + }, + { + "epoch": 1.0934269093992135, + "grad_norm": 0.6437327861785889, + "learning_rate": 8.825656973301684e-06, + "loss": 0.2518, + "step": 11607 + }, + { + "epoch": 1.0935211134923812, + "grad_norm": 0.5694655179977417, + "learning_rate": 8.824157416935756e-06, + "loss": 0.2081, + "step": 11608 + }, + { + "epoch": 1.0936153175855492, + "grad_norm": 0.5934538841247559, + "learning_rate": 8.822657887380857e-06, + "loss": 0.206, + "step": 11609 + }, + { + "epoch": 1.093709521678717, + "grad_norm": 0.6617633104324341, + "learning_rate": 8.82115838467118e-06, + "loss": 0.2397, + "step": 11610 + }, + { + "epoch": 1.0938037257718847, + "grad_norm": 0.6666903495788574, + "learning_rate": 8.819658908840914e-06, + "loss": 0.2311, + "step": 11611 + }, + { + "epoch": 1.0938979298650526, + "grad_norm": 0.6600042581558228, + "learning_rate": 8.818159459924253e-06, + "loss": 0.2169, + "step": 11612 + }, + { + "epoch": 1.0939921339582206, + "grad_norm": 0.6306442022323608, + "learning_rate": 8.816660037955377e-06, + "loss": 0.2284, + "step": 11613 + }, + { + "epoch": 1.0940863380513883, + "grad_norm": 0.6237883567810059, + "learning_rate": 8.815160642968487e-06, + "loss": 0.2261, + "step": 11614 + }, + { + "epoch": 1.094180542144556, + "grad_norm": 0.7384260892868042, + "learning_rate": 8.813661274997769e-06, + "loss": 0.2222, + "step": 11615 + }, + { + "epoch": 1.094274746237724, + "grad_norm": 0.6339855194091797, + "learning_rate": 8.812161934077402e-06, + "loss": 0.2098, + "step": 11616 + }, + { + "epoch": 1.094368950330892, + "grad_norm": 0.6157028675079346, + "learning_rate": 8.810662620241586e-06, + "loss": 0.1952, + "step": 11617 + }, + { + "epoch": 1.0944631544240597, + "grad_norm": 0.5984799861907959, + "learning_rate": 8.8091633335245e-06, + "loss": 0.1948, + "step": 11618 + }, + { + "epoch": 1.0945573585172275, + "grad_norm": 0.6805873513221741, + "learning_rate": 8.807664073960332e-06, + "loss": 0.2006, + "step": 11619 + }, + { + "epoch": 1.0946515626103954, + "grad_norm": 0.6823112964630127, + "learning_rate": 8.806164841583266e-06, + "loss": 0.2344, + "step": 11620 + }, + { + "epoch": 1.0947457667035634, + "grad_norm": 0.632782518863678, + "learning_rate": 8.804665636427488e-06, + "loss": 0.1886, + "step": 11621 + }, + { + "epoch": 1.0948399707967311, + "grad_norm": 0.6591693758964539, + "learning_rate": 8.803166458527182e-06, + "loss": 0.2164, + "step": 11622 + }, + { + "epoch": 1.0949341748898989, + "grad_norm": 0.5850099325180054, + "learning_rate": 8.801667307916531e-06, + "loss": 0.1872, + "step": 11623 + }, + { + "epoch": 1.0950283789830668, + "grad_norm": 0.7214787006378174, + "learning_rate": 8.80016818462972e-06, + "loss": 0.2192, + "step": 11624 + }, + { + "epoch": 1.0951225830762348, + "grad_norm": 0.6723693609237671, + "learning_rate": 8.79866908870093e-06, + "loss": 0.2203, + "step": 11625 + }, + { + "epoch": 1.0952167871694025, + "grad_norm": 0.6110305190086365, + "learning_rate": 8.79717002016434e-06, + "loss": 0.1854, + "step": 11626 + }, + { + "epoch": 1.0953109912625703, + "grad_norm": 0.5688706636428833, + "learning_rate": 8.795670979054137e-06, + "loss": 0.1741, + "step": 11627 + }, + { + "epoch": 1.0954051953557382, + "grad_norm": 0.6427212953567505, + "learning_rate": 8.7941719654045e-06, + "loss": 0.2174, + "step": 11628 + }, + { + "epoch": 1.095499399448906, + "grad_norm": 0.636704683303833, + "learning_rate": 8.792672979249603e-06, + "loss": 0.1924, + "step": 11629 + }, + { + "epoch": 1.095593603542074, + "grad_norm": 0.6884451508522034, + "learning_rate": 8.79117402062363e-06, + "loss": 0.2185, + "step": 11630 + }, + { + "epoch": 1.0956878076352417, + "grad_norm": 0.711936891078949, + "learning_rate": 8.78967508956076e-06, + "loss": 0.2183, + "step": 11631 + }, + { + "epoch": 1.0957820117284096, + "grad_norm": 0.6035027503967285, + "learning_rate": 8.78817618609517e-06, + "loss": 0.2006, + "step": 11632 + }, + { + "epoch": 1.0958762158215773, + "grad_norm": 0.6631470322608948, + "learning_rate": 8.786677310261032e-06, + "loss": 0.2043, + "step": 11633 + }, + { + "epoch": 1.0959704199147453, + "grad_norm": 0.6563029885292053, + "learning_rate": 8.785178462092533e-06, + "loss": 0.2068, + "step": 11634 + }, + { + "epoch": 1.096064624007913, + "grad_norm": 0.6526097059249878, + "learning_rate": 8.783679641623845e-06, + "loss": 0.2003, + "step": 11635 + }, + { + "epoch": 1.096158828101081, + "grad_norm": 0.6027750372886658, + "learning_rate": 8.782180848889138e-06, + "loss": 0.1854, + "step": 11636 + }, + { + "epoch": 1.0962530321942487, + "grad_norm": 0.6719428300857544, + "learning_rate": 8.780682083922594e-06, + "loss": 0.193, + "step": 11637 + }, + { + "epoch": 1.0963472362874167, + "grad_norm": 0.6075634360313416, + "learning_rate": 8.779183346758384e-06, + "loss": 0.2078, + "step": 11638 + }, + { + "epoch": 1.0964414403805844, + "grad_norm": 0.6529176831245422, + "learning_rate": 8.777684637430682e-06, + "loss": 0.1811, + "step": 11639 + }, + { + "epoch": 1.0965356444737524, + "grad_norm": 0.5914357900619507, + "learning_rate": 8.776185955973658e-06, + "loss": 0.184, + "step": 11640 + }, + { + "epoch": 1.0966298485669201, + "grad_norm": 0.63169264793396, + "learning_rate": 8.774687302421488e-06, + "loss": 0.2126, + "step": 11641 + }, + { + "epoch": 1.096724052660088, + "grad_norm": 0.7162529826164246, + "learning_rate": 8.773188676808344e-06, + "loss": 0.2254, + "step": 11642 + }, + { + "epoch": 1.0968182567532558, + "grad_norm": 0.6307613849639893, + "learning_rate": 8.771690079168394e-06, + "loss": 0.1867, + "step": 11643 + }, + { + "epoch": 1.0969124608464238, + "grad_norm": 0.625088632106781, + "learning_rate": 8.77019150953581e-06, + "loss": 0.2097, + "step": 11644 + }, + { + "epoch": 1.0970066649395915, + "grad_norm": 0.6546489000320435, + "learning_rate": 8.768692967944762e-06, + "loss": 0.2039, + "step": 11645 + }, + { + "epoch": 1.0971008690327595, + "grad_norm": 0.6251490712165833, + "learning_rate": 8.767194454429417e-06, + "loss": 0.1926, + "step": 11646 + }, + { + "epoch": 1.0971950731259272, + "grad_norm": 0.7071504592895508, + "learning_rate": 8.765695969023946e-06, + "loss": 0.2273, + "step": 11647 + }, + { + "epoch": 1.0972892772190952, + "grad_norm": 0.669477641582489, + "learning_rate": 8.764197511762518e-06, + "loss": 0.2302, + "step": 11648 + }, + { + "epoch": 1.097383481312263, + "grad_norm": 0.6609790325164795, + "learning_rate": 8.762699082679298e-06, + "loss": 0.1951, + "step": 11649 + }, + { + "epoch": 1.097477685405431, + "grad_norm": 0.7205175757408142, + "learning_rate": 8.761200681808446e-06, + "loss": 0.2342, + "step": 11650 + }, + { + "epoch": 1.0975718894985986, + "grad_norm": 0.6903570294380188, + "learning_rate": 8.75970230918414e-06, + "loss": 0.2139, + "step": 11651 + }, + { + "epoch": 1.0976660935917666, + "grad_norm": 0.7559558153152466, + "learning_rate": 8.758203964840541e-06, + "loss": 0.2194, + "step": 11652 + }, + { + "epoch": 1.0977602976849343, + "grad_norm": 0.6063870787620544, + "learning_rate": 8.756705648811805e-06, + "loss": 0.202, + "step": 11653 + }, + { + "epoch": 1.0978545017781023, + "grad_norm": 0.6134804487228394, + "learning_rate": 8.755207361132109e-06, + "loss": 0.2014, + "step": 11654 + }, + { + "epoch": 1.09794870587127, + "grad_norm": 0.5935834050178528, + "learning_rate": 8.75370910183561e-06, + "loss": 0.2243, + "step": 11655 + }, + { + "epoch": 1.098042909964438, + "grad_norm": 0.7034485340118408, + "learning_rate": 8.752210870956466e-06, + "loss": 0.2343, + "step": 11656 + }, + { + "epoch": 1.0981371140576057, + "grad_norm": 0.6824043393135071, + "learning_rate": 8.75071266852885e-06, + "loss": 0.2055, + "step": 11657 + }, + { + "epoch": 1.0982313181507737, + "grad_norm": 0.6324033141136169, + "learning_rate": 8.749214494586915e-06, + "loss": 0.1887, + "step": 11658 + }, + { + "epoch": 1.0983255222439414, + "grad_norm": 0.7541441917419434, + "learning_rate": 8.747716349164826e-06, + "loss": 0.2306, + "step": 11659 + }, + { + "epoch": 1.0984197263371094, + "grad_norm": 0.6856192946434021, + "learning_rate": 8.746218232296735e-06, + "loss": 0.2238, + "step": 11660 + }, + { + "epoch": 1.0985139304302771, + "grad_norm": 0.7444167733192444, + "learning_rate": 8.744720144016812e-06, + "loss": 0.2246, + "step": 11661 + }, + { + "epoch": 1.098608134523445, + "grad_norm": 0.7645224332809448, + "learning_rate": 8.743222084359211e-06, + "loss": 0.2135, + "step": 11662 + }, + { + "epoch": 1.0987023386166128, + "grad_norm": 0.6658270955085754, + "learning_rate": 8.741724053358087e-06, + "loss": 0.2117, + "step": 11663 + }, + { + "epoch": 1.0987965427097808, + "grad_norm": 0.6010409593582153, + "learning_rate": 8.740226051047602e-06, + "loss": 0.1861, + "step": 11664 + }, + { + "epoch": 1.0988907468029485, + "grad_norm": 0.6358391046524048, + "learning_rate": 8.738728077461913e-06, + "loss": 0.2139, + "step": 11665 + }, + { + "epoch": 1.0989849508961165, + "grad_norm": 0.6551379561424255, + "learning_rate": 8.737230132635172e-06, + "loss": 0.2146, + "step": 11666 + }, + { + "epoch": 1.0990791549892842, + "grad_norm": 0.6351631879806519, + "learning_rate": 8.735732216601538e-06, + "loss": 0.1985, + "step": 11667 + }, + { + "epoch": 1.0991733590824522, + "grad_norm": 0.6814453601837158, + "learning_rate": 8.734234329395165e-06, + "loss": 0.2227, + "step": 11668 + }, + { + "epoch": 1.09926756317562, + "grad_norm": 0.6973292827606201, + "learning_rate": 8.73273647105021e-06, + "loss": 0.2359, + "step": 11669 + }, + { + "epoch": 1.0993617672687879, + "grad_norm": 0.5711583495140076, + "learning_rate": 8.731238641600816e-06, + "loss": 0.1736, + "step": 11670 + }, + { + "epoch": 1.0994559713619556, + "grad_norm": 0.6745663285255432, + "learning_rate": 8.729740841081148e-06, + "loss": 0.2029, + "step": 11671 + }, + { + "epoch": 1.0995501754551236, + "grad_norm": 0.6072559356689453, + "learning_rate": 8.728243069525355e-06, + "loss": 0.1888, + "step": 11672 + }, + { + "epoch": 1.0996443795482913, + "grad_norm": 0.6745792627334595, + "learning_rate": 8.726745326967581e-06, + "loss": 0.202, + "step": 11673 + }, + { + "epoch": 1.0997385836414593, + "grad_norm": 0.6435964107513428, + "learning_rate": 8.725247613441985e-06, + "loss": 0.2054, + "step": 11674 + }, + { + "epoch": 1.099832787734627, + "grad_norm": 0.7328150272369385, + "learning_rate": 8.723749928982719e-06, + "loss": 0.2138, + "step": 11675 + }, + { + "epoch": 1.099926991827795, + "grad_norm": 0.6047179102897644, + "learning_rate": 8.72225227362392e-06, + "loss": 0.1979, + "step": 11676 + }, + { + "epoch": 1.1000211959209627, + "grad_norm": 0.6192255616188049, + "learning_rate": 8.720754647399754e-06, + "loss": 0.1978, + "step": 11677 + }, + { + "epoch": 1.1001154000141307, + "grad_norm": 0.6152708530426025, + "learning_rate": 8.719257050344359e-06, + "loss": 0.1798, + "step": 11678 + }, + { + "epoch": 1.1002096041072984, + "grad_norm": 0.6412601470947266, + "learning_rate": 8.71775948249188e-06, + "loss": 0.1605, + "step": 11679 + }, + { + "epoch": 1.1003038082004664, + "grad_norm": 0.6825234889984131, + "learning_rate": 8.71626194387647e-06, + "loss": 0.2359, + "step": 11680 + }, + { + "epoch": 1.100398012293634, + "grad_norm": 0.6711715459823608, + "learning_rate": 8.714764434532272e-06, + "loss": 0.2102, + "step": 11681 + }, + { + "epoch": 1.100492216386802, + "grad_norm": 0.6632339954376221, + "learning_rate": 8.713266954493434e-06, + "loss": 0.2191, + "step": 11682 + }, + { + "epoch": 1.1005864204799698, + "grad_norm": 0.6640236973762512, + "learning_rate": 8.711769503794096e-06, + "loss": 0.199, + "step": 11683 + }, + { + "epoch": 1.1006806245731378, + "grad_norm": 0.6721137762069702, + "learning_rate": 8.710272082468409e-06, + "loss": 0.2014, + "step": 11684 + }, + { + "epoch": 1.1007748286663055, + "grad_norm": 0.7337496876716614, + "learning_rate": 8.708774690550513e-06, + "loss": 0.2101, + "step": 11685 + }, + { + "epoch": 1.1008690327594735, + "grad_norm": 0.8009522557258606, + "learning_rate": 8.707277328074549e-06, + "loss": 0.2162, + "step": 11686 + }, + { + "epoch": 1.1009632368526412, + "grad_norm": 0.6302037835121155, + "learning_rate": 8.705779995074664e-06, + "loss": 0.1856, + "step": 11687 + }, + { + "epoch": 1.1010574409458092, + "grad_norm": 0.6537947654724121, + "learning_rate": 8.704282691584995e-06, + "loss": 0.195, + "step": 11688 + }, + { + "epoch": 1.101151645038977, + "grad_norm": 0.6836397647857666, + "learning_rate": 8.702785417639688e-06, + "loss": 0.1874, + "step": 11689 + }, + { + "epoch": 1.1012458491321448, + "grad_norm": 0.6943057775497437, + "learning_rate": 8.701288173272871e-06, + "loss": 0.2013, + "step": 11690 + }, + { + "epoch": 1.1013400532253126, + "grad_norm": 0.6722994446754456, + "learning_rate": 8.699790958518699e-06, + "loss": 0.1942, + "step": 11691 + }, + { + "epoch": 1.1014342573184805, + "grad_norm": 0.6170687675476074, + "learning_rate": 8.698293773411305e-06, + "loss": 0.1767, + "step": 11692 + }, + { + "epoch": 1.1015284614116483, + "grad_norm": 0.7113544940948486, + "learning_rate": 8.69679661798482e-06, + "loss": 0.1988, + "step": 11693 + }, + { + "epoch": 1.1016226655048162, + "grad_norm": 0.7036173939704895, + "learning_rate": 8.695299492273392e-06, + "loss": 0.2662, + "step": 11694 + }, + { + "epoch": 1.101716869597984, + "grad_norm": 0.7076472640037537, + "learning_rate": 8.693802396311154e-06, + "loss": 0.2451, + "step": 11695 + }, + { + "epoch": 1.101811073691152, + "grad_norm": 0.6691346168518066, + "learning_rate": 8.692305330132236e-06, + "loss": 0.203, + "step": 11696 + }, + { + "epoch": 1.1019052777843197, + "grad_norm": 0.7017386555671692, + "learning_rate": 8.690808293770786e-06, + "loss": 0.1932, + "step": 11697 + }, + { + "epoch": 1.1019994818774876, + "grad_norm": 0.6661994457244873, + "learning_rate": 8.689311287260928e-06, + "loss": 0.2003, + "step": 11698 + }, + { + "epoch": 1.1020936859706554, + "grad_norm": 0.5827733874320984, + "learning_rate": 8.6878143106368e-06, + "loss": 0.2073, + "step": 11699 + }, + { + "epoch": 1.1021878900638233, + "grad_norm": 0.7229766845703125, + "learning_rate": 8.686317363932534e-06, + "loss": 0.2429, + "step": 11700 + }, + { + "epoch": 1.102282094156991, + "grad_norm": 0.6007014513015747, + "learning_rate": 8.684820447182266e-06, + "loss": 0.1814, + "step": 11701 + }, + { + "epoch": 1.102376298250159, + "grad_norm": 0.6572347283363342, + "learning_rate": 8.683323560420124e-06, + "loss": 0.1853, + "step": 11702 + }, + { + "epoch": 1.1024705023433268, + "grad_norm": 0.635355532169342, + "learning_rate": 8.681826703680239e-06, + "loss": 0.2203, + "step": 11703 + }, + { + "epoch": 1.1025647064364947, + "grad_norm": 0.5935506820678711, + "learning_rate": 8.680329876996748e-06, + "loss": 0.1947, + "step": 11704 + }, + { + "epoch": 1.1026589105296625, + "grad_norm": 0.6532561182975769, + "learning_rate": 8.678833080403775e-06, + "loss": 0.1956, + "step": 11705 + }, + { + "epoch": 1.1027531146228304, + "grad_norm": 0.7315452694892883, + "learning_rate": 8.67733631393545e-06, + "loss": 0.2611, + "step": 11706 + }, + { + "epoch": 1.1028473187159982, + "grad_norm": 0.6310397386550903, + "learning_rate": 8.675839577625905e-06, + "loss": 0.226, + "step": 11707 + }, + { + "epoch": 1.1029415228091661, + "grad_norm": 0.8106204271316528, + "learning_rate": 8.674342871509264e-06, + "loss": 0.1859, + "step": 11708 + }, + { + "epoch": 1.1030357269023339, + "grad_norm": 0.6311600804328918, + "learning_rate": 8.672846195619657e-06, + "loss": 0.1975, + "step": 11709 + }, + { + "epoch": 1.1031299309955018, + "grad_norm": 0.6800163388252258, + "learning_rate": 8.671349549991205e-06, + "loss": 0.2085, + "step": 11710 + }, + { + "epoch": 1.1032241350886696, + "grad_norm": 0.6995189785957336, + "learning_rate": 8.669852934658042e-06, + "loss": 0.2297, + "step": 11711 + }, + { + "epoch": 1.1033183391818375, + "grad_norm": 0.7022854089736938, + "learning_rate": 8.66835634965429e-06, + "loss": 0.2305, + "step": 11712 + }, + { + "epoch": 1.1034125432750053, + "grad_norm": 0.6383044123649597, + "learning_rate": 8.666859795014068e-06, + "loss": 0.2095, + "step": 11713 + }, + { + "epoch": 1.1035067473681732, + "grad_norm": 0.6150676012039185, + "learning_rate": 8.665363270771509e-06, + "loss": 0.2127, + "step": 11714 + }, + { + "epoch": 1.103600951461341, + "grad_norm": 0.759537935256958, + "learning_rate": 8.663866776960731e-06, + "loss": 0.2217, + "step": 11715 + }, + { + "epoch": 1.103695155554509, + "grad_norm": 0.7568871974945068, + "learning_rate": 8.662370313615853e-06, + "loss": 0.2085, + "step": 11716 + }, + { + "epoch": 1.1037893596476767, + "grad_norm": 0.697677731513977, + "learning_rate": 8.660873880771006e-06, + "loss": 0.203, + "step": 11717 + }, + { + "epoch": 1.1038835637408446, + "grad_norm": 0.8748740553855896, + "learning_rate": 8.659377478460302e-06, + "loss": 0.2642, + "step": 11718 + }, + { + "epoch": 1.1039777678340124, + "grad_norm": 0.7091565132141113, + "learning_rate": 8.657881106717868e-06, + "loss": 0.2184, + "step": 11719 + }, + { + "epoch": 1.1040719719271803, + "grad_norm": 0.9920811653137207, + "learning_rate": 8.656384765577814e-06, + "loss": 0.2077, + "step": 11720 + }, + { + "epoch": 1.104166176020348, + "grad_norm": 0.6734057664871216, + "learning_rate": 8.654888455074271e-06, + "loss": 0.2222, + "step": 11721 + }, + { + "epoch": 1.104260380113516, + "grad_norm": 0.6326597332954407, + "learning_rate": 8.65339217524135e-06, + "loss": 0.2028, + "step": 11722 + }, + { + "epoch": 1.1043545842066838, + "grad_norm": 0.6460282206535339, + "learning_rate": 8.651895926113167e-06, + "loss": 0.2109, + "step": 11723 + }, + { + "epoch": 1.1044487882998517, + "grad_norm": 0.6907562017440796, + "learning_rate": 8.650399707723846e-06, + "loss": 0.2058, + "step": 11724 + }, + { + "epoch": 1.1045429923930195, + "grad_norm": 2.4076602458953857, + "learning_rate": 8.648903520107497e-06, + "loss": 0.2338, + "step": 11725 + }, + { + "epoch": 1.1046371964861874, + "grad_norm": 0.6401410698890686, + "learning_rate": 8.647407363298237e-06, + "loss": 0.1814, + "step": 11726 + }, + { + "epoch": 1.1047314005793551, + "grad_norm": 0.6281068921089172, + "learning_rate": 8.645911237330182e-06, + "loss": 0.2007, + "step": 11727 + }, + { + "epoch": 1.104825604672523, + "grad_norm": 0.6514576077461243, + "learning_rate": 8.644415142237444e-06, + "loss": 0.1875, + "step": 11728 + }, + { + "epoch": 1.1049198087656908, + "grad_norm": 0.5694442987442017, + "learning_rate": 8.64291907805414e-06, + "loss": 0.1759, + "step": 11729 + }, + { + "epoch": 1.1050140128588588, + "grad_norm": 0.7495072484016418, + "learning_rate": 8.641423044814375e-06, + "loss": 0.2444, + "step": 11730 + }, + { + "epoch": 1.1051082169520265, + "grad_norm": 0.7319170236587524, + "learning_rate": 8.639927042552268e-06, + "loss": 0.186, + "step": 11731 + }, + { + "epoch": 1.1052024210451945, + "grad_norm": 1.0071709156036377, + "learning_rate": 8.63843107130193e-06, + "loss": 0.2278, + "step": 11732 + }, + { + "epoch": 1.1052966251383622, + "grad_norm": 0.7035053372383118, + "learning_rate": 8.636935131097464e-06, + "loss": 0.2202, + "step": 11733 + }, + { + "epoch": 1.1053908292315302, + "grad_norm": 0.6001750230789185, + "learning_rate": 8.63543922197299e-06, + "loss": 0.1793, + "step": 11734 + }, + { + "epoch": 1.105485033324698, + "grad_norm": 0.6339883804321289, + "learning_rate": 8.633943343962612e-06, + "loss": 0.2, + "step": 11735 + }, + { + "epoch": 1.105579237417866, + "grad_norm": 0.7535054683685303, + "learning_rate": 8.632447497100432e-06, + "loss": 0.2263, + "step": 11736 + }, + { + "epoch": 1.1056734415110336, + "grad_norm": 0.6110571026802063, + "learning_rate": 8.630951681420573e-06, + "loss": 0.2038, + "step": 11737 + }, + { + "epoch": 1.1057676456042016, + "grad_norm": 0.7600181102752686, + "learning_rate": 8.629455896957128e-06, + "loss": 0.2065, + "step": 11738 + }, + { + "epoch": 1.1058618496973693, + "grad_norm": 0.6588578224182129, + "learning_rate": 8.62796014374421e-06, + "loss": 0.2164, + "step": 11739 + }, + { + "epoch": 1.1059560537905373, + "grad_norm": 0.6615300178527832, + "learning_rate": 8.626464421815919e-06, + "loss": 0.213, + "step": 11740 + }, + { + "epoch": 1.106050257883705, + "grad_norm": 0.6047887206077576, + "learning_rate": 8.624968731206367e-06, + "loss": 0.1913, + "step": 11741 + }, + { + "epoch": 1.106144461976873, + "grad_norm": 0.5762773752212524, + "learning_rate": 8.623473071949653e-06, + "loss": 0.1804, + "step": 11742 + }, + { + "epoch": 1.1062386660700407, + "grad_norm": 0.643365204334259, + "learning_rate": 8.62197744407988e-06, + "loss": 0.1802, + "step": 11743 + }, + { + "epoch": 1.1063328701632087, + "grad_norm": 0.7280917167663574, + "learning_rate": 8.620481847631155e-06, + "loss": 0.191, + "step": 11744 + }, + { + "epoch": 1.1064270742563764, + "grad_norm": 0.6445494294166565, + "learning_rate": 8.618986282637578e-06, + "loss": 0.1941, + "step": 11745 + }, + { + "epoch": 1.1065212783495444, + "grad_norm": 0.7732815742492676, + "learning_rate": 8.617490749133248e-06, + "loss": 0.194, + "step": 11746 + }, + { + "epoch": 1.1066154824427121, + "grad_norm": 0.6079898476600647, + "learning_rate": 8.615995247152267e-06, + "loss": 0.2002, + "step": 11747 + }, + { + "epoch": 1.10670968653588, + "grad_norm": 0.6569620370864868, + "learning_rate": 8.614499776728736e-06, + "loss": 0.2097, + "step": 11748 + }, + { + "epoch": 1.1068038906290478, + "grad_norm": 0.6424642205238342, + "learning_rate": 8.613004337896755e-06, + "loss": 0.2209, + "step": 11749 + }, + { + "epoch": 1.1068980947222156, + "grad_norm": 0.7390209436416626, + "learning_rate": 8.611508930690413e-06, + "loss": 0.2421, + "step": 11750 + }, + { + "epoch": 1.1069922988153835, + "grad_norm": 0.6789264678955078, + "learning_rate": 8.61001355514382e-06, + "loss": 0.2249, + "step": 11751 + }, + { + "epoch": 1.1070865029085515, + "grad_norm": 0.6255180239677429, + "learning_rate": 8.608518211291068e-06, + "loss": 0.2085, + "step": 11752 + }, + { + "epoch": 1.1071807070017192, + "grad_norm": 0.7084512114524841, + "learning_rate": 8.607022899166247e-06, + "loss": 0.228, + "step": 11753 + }, + { + "epoch": 1.107274911094887, + "grad_norm": 0.5759256482124329, + "learning_rate": 8.605527618803465e-06, + "loss": 0.1984, + "step": 11754 + }, + { + "epoch": 1.107369115188055, + "grad_norm": 0.6552428007125854, + "learning_rate": 8.604032370236807e-06, + "loss": 0.2124, + "step": 11755 + }, + { + "epoch": 1.1074633192812229, + "grad_norm": 0.6375650763511658, + "learning_rate": 8.602537153500368e-06, + "loss": 0.2087, + "step": 11756 + }, + { + "epoch": 1.1075575233743906, + "grad_norm": 0.6352382302284241, + "learning_rate": 8.601041968628247e-06, + "loss": 0.2042, + "step": 11757 + }, + { + "epoch": 1.1076517274675584, + "grad_norm": 0.6800411343574524, + "learning_rate": 8.59954681565453e-06, + "loss": 0.1964, + "step": 11758 + }, + { + "epoch": 1.1077459315607263, + "grad_norm": 0.687708854675293, + "learning_rate": 8.598051694613315e-06, + "loss": 0.2422, + "step": 11759 + }, + { + "epoch": 1.1078401356538943, + "grad_norm": 0.6685158610343933, + "learning_rate": 8.596556605538685e-06, + "loss": 0.2075, + "step": 11760 + }, + { + "epoch": 1.107934339747062, + "grad_norm": 0.6764061450958252, + "learning_rate": 8.595061548464736e-06, + "loss": 0.2299, + "step": 11761 + }, + { + "epoch": 1.1080285438402298, + "grad_norm": 0.6402743458747864, + "learning_rate": 8.593566523425559e-06, + "loss": 0.2019, + "step": 11762 + }, + { + "epoch": 1.1081227479333977, + "grad_norm": 0.7387195229530334, + "learning_rate": 8.592071530455236e-06, + "loss": 0.2048, + "step": 11763 + }, + { + "epoch": 1.1082169520265657, + "grad_norm": 0.6278302073478699, + "learning_rate": 8.590576569587862e-06, + "loss": 0.2075, + "step": 11764 + }, + { + "epoch": 1.1083111561197334, + "grad_norm": 0.6486092209815979, + "learning_rate": 8.589081640857523e-06, + "loss": 0.2137, + "step": 11765 + }, + { + "epoch": 1.1084053602129011, + "grad_norm": 0.6360762119293213, + "learning_rate": 8.587586744298302e-06, + "loss": 0.2009, + "step": 11766 + }, + { + "epoch": 1.108499564306069, + "grad_norm": 0.6645756959915161, + "learning_rate": 8.58609187994429e-06, + "loss": 0.2031, + "step": 11767 + }, + { + "epoch": 1.1085937683992368, + "grad_norm": 0.6493579745292664, + "learning_rate": 8.58459704782957e-06, + "loss": 0.2101, + "step": 11768 + }, + { + "epoch": 1.1086879724924048, + "grad_norm": 0.7045307159423828, + "learning_rate": 8.583102247988228e-06, + "loss": 0.2002, + "step": 11769 + }, + { + "epoch": 1.1087821765855725, + "grad_norm": 0.6780544519424438, + "learning_rate": 8.581607480454338e-06, + "loss": 0.1974, + "step": 11770 + }, + { + "epoch": 1.1088763806787405, + "grad_norm": 0.6566255688667297, + "learning_rate": 8.580112745261997e-06, + "loss": 0.2079, + "step": 11771 + }, + { + "epoch": 1.1089705847719082, + "grad_norm": 0.8988806009292603, + "learning_rate": 8.578618042445284e-06, + "loss": 0.1921, + "step": 11772 + }, + { + "epoch": 1.1090647888650762, + "grad_norm": 0.6179317235946655, + "learning_rate": 8.57712337203827e-06, + "loss": 0.2279, + "step": 11773 + }, + { + "epoch": 1.109158992958244, + "grad_norm": 0.5989957451820374, + "learning_rate": 8.57562873407505e-06, + "loss": 0.1734, + "step": 11774 + }, + { + "epoch": 1.109253197051412, + "grad_norm": 0.6424252986907959, + "learning_rate": 8.574134128589697e-06, + "loss": 0.2012, + "step": 11775 + }, + { + "epoch": 1.1093474011445796, + "grad_norm": 0.6136184334754944, + "learning_rate": 8.572639555616286e-06, + "loss": 0.1978, + "step": 11776 + }, + { + "epoch": 1.1094416052377476, + "grad_norm": 0.6892116069793701, + "learning_rate": 8.571145015188908e-06, + "loss": 0.2275, + "step": 11777 + }, + { + "epoch": 1.1095358093309153, + "grad_norm": 0.6547119617462158, + "learning_rate": 8.56965050734163e-06, + "loss": 0.212, + "step": 11778 + }, + { + "epoch": 1.1096300134240833, + "grad_norm": 0.6131852865219116, + "learning_rate": 8.568156032108532e-06, + "loss": 0.1803, + "step": 11779 + }, + { + "epoch": 1.109724217517251, + "grad_norm": 0.6796516180038452, + "learning_rate": 8.56666158952369e-06, + "loss": 0.2181, + "step": 11780 + }, + { + "epoch": 1.109818421610419, + "grad_norm": 0.6487653851509094, + "learning_rate": 8.565167179621182e-06, + "loss": 0.2054, + "step": 11781 + }, + { + "epoch": 1.1099126257035867, + "grad_norm": 0.5514594912528992, + "learning_rate": 8.563672802435081e-06, + "loss": 0.1671, + "step": 11782 + }, + { + "epoch": 1.1100068297967547, + "grad_norm": 0.6476712226867676, + "learning_rate": 8.56217845799946e-06, + "loss": 0.2163, + "step": 11783 + }, + { + "epoch": 1.1101010338899224, + "grad_norm": 0.642052173614502, + "learning_rate": 8.560684146348396e-06, + "loss": 0.1897, + "step": 11784 + }, + { + "epoch": 1.1101952379830904, + "grad_norm": 0.6319531202316284, + "learning_rate": 8.55918986751596e-06, + "loss": 0.2055, + "step": 11785 + }, + { + "epoch": 1.1102894420762581, + "grad_norm": 0.7351044416427612, + "learning_rate": 8.55769562153622e-06, + "loss": 0.2262, + "step": 11786 + }, + { + "epoch": 1.110383646169426, + "grad_norm": 0.678261399269104, + "learning_rate": 8.556201408443252e-06, + "loss": 0.2186, + "step": 11787 + }, + { + "epoch": 1.1104778502625938, + "grad_norm": 0.6289682388305664, + "learning_rate": 8.554707228271126e-06, + "loss": 0.2083, + "step": 11788 + }, + { + "epoch": 1.1105720543557618, + "grad_norm": 0.6691647171974182, + "learning_rate": 8.553213081053911e-06, + "loss": 0.1934, + "step": 11789 + }, + { + "epoch": 1.1106662584489295, + "grad_norm": 0.6942034363746643, + "learning_rate": 8.55171896682567e-06, + "loss": 0.249, + "step": 11790 + }, + { + "epoch": 1.1107604625420975, + "grad_norm": 0.6387681365013123, + "learning_rate": 8.55022488562048e-06, + "loss": 0.203, + "step": 11791 + }, + { + "epoch": 1.1108546666352652, + "grad_norm": 0.700570285320282, + "learning_rate": 8.548730837472407e-06, + "loss": 0.2319, + "step": 11792 + }, + { + "epoch": 1.1109488707284332, + "grad_norm": 0.5858840942382812, + "learning_rate": 8.54723682241551e-06, + "loss": 0.1965, + "step": 11793 + }, + { + "epoch": 1.111043074821601, + "grad_norm": 0.6187609434127808, + "learning_rate": 8.545742840483866e-06, + "loss": 0.2105, + "step": 11794 + }, + { + "epoch": 1.1111372789147689, + "grad_norm": 0.6499419212341309, + "learning_rate": 8.544248891711531e-06, + "loss": 0.2445, + "step": 11795 + }, + { + "epoch": 1.1112314830079366, + "grad_norm": 0.7379053235054016, + "learning_rate": 8.54275497613257e-06, + "loss": 0.2381, + "step": 11796 + }, + { + "epoch": 1.1113256871011046, + "grad_norm": 0.6513186097145081, + "learning_rate": 8.541261093781054e-06, + "loss": 0.1893, + "step": 11797 + }, + { + "epoch": 1.1114198911942723, + "grad_norm": 0.6960347294807434, + "learning_rate": 8.539767244691041e-06, + "loss": 0.2097, + "step": 11798 + }, + { + "epoch": 1.1115140952874403, + "grad_norm": 0.6200478076934814, + "learning_rate": 8.538273428896592e-06, + "loss": 0.1919, + "step": 11799 + }, + { + "epoch": 1.111608299380608, + "grad_norm": 0.6413607001304626, + "learning_rate": 8.536779646431768e-06, + "loss": 0.2083, + "step": 11800 + }, + { + "epoch": 1.111702503473776, + "grad_norm": 0.6362359523773193, + "learning_rate": 8.535285897330631e-06, + "loss": 0.2185, + "step": 11801 + }, + { + "epoch": 1.1117967075669437, + "grad_norm": 0.6481806039810181, + "learning_rate": 8.533792181627243e-06, + "loss": 0.2027, + "step": 11802 + }, + { + "epoch": 1.1118909116601117, + "grad_norm": 0.6828119158744812, + "learning_rate": 8.532298499355657e-06, + "loss": 0.1832, + "step": 11803 + }, + { + "epoch": 1.1119851157532794, + "grad_norm": 0.5892437696456909, + "learning_rate": 8.530804850549939e-06, + "loss": 0.1835, + "step": 11804 + }, + { + "epoch": 1.1120793198464474, + "grad_norm": 0.6388912796974182, + "learning_rate": 8.529311235244143e-06, + "loss": 0.1828, + "step": 11805 + }, + { + "epoch": 1.112173523939615, + "grad_norm": 0.6662203073501587, + "learning_rate": 8.52781765347232e-06, + "loss": 0.2192, + "step": 11806 + }, + { + "epoch": 1.112267728032783, + "grad_norm": 0.712853193283081, + "learning_rate": 8.526324105268535e-06, + "loss": 0.2299, + "step": 11807 + }, + { + "epoch": 1.1123619321259508, + "grad_norm": 0.6238781809806824, + "learning_rate": 8.524830590666839e-06, + "loss": 0.1947, + "step": 11808 + }, + { + "epoch": 1.1124561362191188, + "grad_norm": 0.6149243116378784, + "learning_rate": 8.52333710970129e-06, + "loss": 0.1874, + "step": 11809 + }, + { + "epoch": 1.1125503403122865, + "grad_norm": 0.6006473898887634, + "learning_rate": 8.521843662405933e-06, + "loss": 0.2126, + "step": 11810 + }, + { + "epoch": 1.1126445444054545, + "grad_norm": 0.6180854439735413, + "learning_rate": 8.520350248814831e-06, + "loss": 0.2052, + "step": 11811 + }, + { + "epoch": 1.1127387484986222, + "grad_norm": 0.6338762640953064, + "learning_rate": 8.518856868962031e-06, + "loss": 0.2085, + "step": 11812 + }, + { + "epoch": 1.1128329525917902, + "grad_norm": 0.6446571946144104, + "learning_rate": 8.51736352288158e-06, + "loss": 0.2099, + "step": 11813 + }, + { + "epoch": 1.112927156684958, + "grad_norm": 0.6481737494468689, + "learning_rate": 8.51587021060754e-06, + "loss": 0.1995, + "step": 11814 + }, + { + "epoch": 1.1130213607781259, + "grad_norm": 0.629331648349762, + "learning_rate": 8.514376932173952e-06, + "loss": 0.1833, + "step": 11815 + }, + { + "epoch": 1.1131155648712936, + "grad_norm": 0.6144024729728699, + "learning_rate": 8.512883687614863e-06, + "loss": 0.2161, + "step": 11816 + }, + { + "epoch": 1.1132097689644616, + "grad_norm": 0.6038481593132019, + "learning_rate": 8.511390476964334e-06, + "loss": 0.1959, + "step": 11817 + }, + { + "epoch": 1.1133039730576293, + "grad_norm": 0.6613956689834595, + "learning_rate": 8.509897300256402e-06, + "loss": 0.1957, + "step": 11818 + }, + { + "epoch": 1.1133981771507973, + "grad_norm": 0.6780512928962708, + "learning_rate": 8.508404157525115e-06, + "loss": 0.228, + "step": 11819 + }, + { + "epoch": 1.113492381243965, + "grad_norm": 0.607157826423645, + "learning_rate": 8.506911048804517e-06, + "loss": 0.2018, + "step": 11820 + }, + { + "epoch": 1.113586585337133, + "grad_norm": 0.7037463784217834, + "learning_rate": 8.505417974128658e-06, + "loss": 0.1987, + "step": 11821 + }, + { + "epoch": 1.1136807894303007, + "grad_norm": 0.6566210985183716, + "learning_rate": 8.503924933531583e-06, + "loss": 0.2101, + "step": 11822 + }, + { + "epoch": 1.1137749935234686, + "grad_norm": 0.6401715278625488, + "learning_rate": 8.50243192704733e-06, + "loss": 0.2123, + "step": 11823 + }, + { + "epoch": 1.1138691976166364, + "grad_norm": 0.5940774083137512, + "learning_rate": 8.500938954709948e-06, + "loss": 0.1861, + "step": 11824 + }, + { + "epoch": 1.1139634017098043, + "grad_norm": 0.5777052044868469, + "learning_rate": 8.499446016553475e-06, + "loss": 0.1898, + "step": 11825 + }, + { + "epoch": 1.114057605802972, + "grad_norm": 0.6409168243408203, + "learning_rate": 8.497953112611952e-06, + "loss": 0.2081, + "step": 11826 + }, + { + "epoch": 1.11415180989614, + "grad_norm": 0.604097843170166, + "learning_rate": 8.496460242919422e-06, + "loss": 0.2036, + "step": 11827 + }, + { + "epoch": 1.1142460139893078, + "grad_norm": 0.6331413984298706, + "learning_rate": 8.494967407509925e-06, + "loss": 0.2257, + "step": 11828 + }, + { + "epoch": 1.1143402180824757, + "grad_norm": 0.6245782375335693, + "learning_rate": 8.4934746064175e-06, + "loss": 0.2282, + "step": 11829 + }, + { + "epoch": 1.1144344221756435, + "grad_norm": 0.6485175490379333, + "learning_rate": 8.491981839676177e-06, + "loss": 0.2185, + "step": 11830 + }, + { + "epoch": 1.1145286262688114, + "grad_norm": 0.5743849277496338, + "learning_rate": 8.490489107320009e-06, + "loss": 0.1938, + "step": 11831 + }, + { + "epoch": 1.1146228303619792, + "grad_norm": 0.6654359698295593, + "learning_rate": 8.48899640938302e-06, + "loss": 0.1899, + "step": 11832 + }, + { + "epoch": 1.1147170344551471, + "grad_norm": 0.6442403793334961, + "learning_rate": 8.487503745899246e-06, + "loss": 0.1992, + "step": 11833 + }, + { + "epoch": 1.1148112385483149, + "grad_norm": 0.639529287815094, + "learning_rate": 8.486011116902732e-06, + "loss": 0.2036, + "step": 11834 + }, + { + "epoch": 1.1149054426414828, + "grad_norm": 0.6333575248718262, + "learning_rate": 8.484518522427503e-06, + "loss": 0.2089, + "step": 11835 + }, + { + "epoch": 1.1149996467346506, + "grad_norm": 0.6548263430595398, + "learning_rate": 8.483025962507592e-06, + "loss": 0.2195, + "step": 11836 + }, + { + "epoch": 1.1150938508278185, + "grad_norm": 0.60414057970047, + "learning_rate": 8.48153343717704e-06, + "loss": 0.1989, + "step": 11837 + }, + { + "epoch": 1.1151880549209863, + "grad_norm": 0.6815771460533142, + "learning_rate": 8.480040946469875e-06, + "loss": 0.2001, + "step": 11838 + }, + { + "epoch": 1.1152822590141542, + "grad_norm": 0.609862744808197, + "learning_rate": 8.478548490420125e-06, + "loss": 0.196, + "step": 11839 + }, + { + "epoch": 1.115376463107322, + "grad_norm": 0.6562811136245728, + "learning_rate": 8.47705606906182e-06, + "loss": 0.2332, + "step": 11840 + }, + { + "epoch": 1.11547066720049, + "grad_norm": 0.6821162104606628, + "learning_rate": 8.475563682428995e-06, + "loss": 0.2138, + "step": 11841 + }, + { + "epoch": 1.1155648712936577, + "grad_norm": 0.6668700575828552, + "learning_rate": 8.474071330555675e-06, + "loss": 0.2076, + "step": 11842 + }, + { + "epoch": 1.1156590753868256, + "grad_norm": 0.643703818321228, + "learning_rate": 8.472579013475886e-06, + "loss": 0.2125, + "step": 11843 + }, + { + "epoch": 1.1157532794799934, + "grad_norm": 0.686159610748291, + "learning_rate": 8.47108673122366e-06, + "loss": 0.214, + "step": 11844 + }, + { + "epoch": 1.1158474835731613, + "grad_norm": 0.6090878248214722, + "learning_rate": 8.469594483833022e-06, + "loss": 0.2332, + "step": 11845 + }, + { + "epoch": 1.115941687666329, + "grad_norm": 0.6408948302268982, + "learning_rate": 8.468102271337994e-06, + "loss": 0.2, + "step": 11846 + }, + { + "epoch": 1.116035891759497, + "grad_norm": 0.6762485504150391, + "learning_rate": 8.466610093772605e-06, + "loss": 0.2055, + "step": 11847 + }, + { + "epoch": 1.1161300958526648, + "grad_norm": 0.6224777102470398, + "learning_rate": 8.465117951170879e-06, + "loss": 0.2134, + "step": 11848 + }, + { + "epoch": 1.1162242999458327, + "grad_norm": 0.6820507049560547, + "learning_rate": 8.463625843566836e-06, + "loss": 0.2147, + "step": 11849 + }, + { + "epoch": 1.1163185040390005, + "grad_norm": 0.6299790143966675, + "learning_rate": 8.462133770994496e-06, + "loss": 0.2125, + "step": 11850 + }, + { + "epoch": 1.1164127081321684, + "grad_norm": 0.7445378303527832, + "learning_rate": 8.460641733487891e-06, + "loss": 0.2122, + "step": 11851 + }, + { + "epoch": 1.1165069122253362, + "grad_norm": 0.5993331074714661, + "learning_rate": 8.459149731081032e-06, + "loss": 0.2066, + "step": 11852 + }, + { + "epoch": 1.1166011163185041, + "grad_norm": 0.6086986660957336, + "learning_rate": 8.457657763807938e-06, + "loss": 0.1964, + "step": 11853 + }, + { + "epoch": 1.1166953204116719, + "grad_norm": 0.618959903717041, + "learning_rate": 8.45616583170264e-06, + "loss": 0.1907, + "step": 11854 + }, + { + "epoch": 1.1167895245048398, + "grad_norm": 0.6163588166236877, + "learning_rate": 8.454673934799145e-06, + "loss": 0.2023, + "step": 11855 + }, + { + "epoch": 1.1168837285980076, + "grad_norm": 0.702180027961731, + "learning_rate": 8.453182073131473e-06, + "loss": 0.2241, + "step": 11856 + }, + { + "epoch": 1.1169779326911755, + "grad_norm": 0.6430617570877075, + "learning_rate": 8.451690246733642e-06, + "loss": 0.2058, + "step": 11857 + }, + { + "epoch": 1.1170721367843433, + "grad_norm": 0.599604070186615, + "learning_rate": 8.45019845563967e-06, + "loss": 0.2253, + "step": 11858 + }, + { + "epoch": 1.1171663408775112, + "grad_norm": 0.6289446949958801, + "learning_rate": 8.44870669988357e-06, + "loss": 0.2062, + "step": 11859 + }, + { + "epoch": 1.117260544970679, + "grad_norm": 0.7407355904579163, + "learning_rate": 8.447214979499353e-06, + "loss": 0.2297, + "step": 11860 + }, + { + "epoch": 1.117354749063847, + "grad_norm": 0.6510844230651855, + "learning_rate": 8.445723294521039e-06, + "loss": 0.2355, + "step": 11861 + }, + { + "epoch": 1.1174489531570146, + "grad_norm": 0.6228619813919067, + "learning_rate": 8.444231644982637e-06, + "loss": 0.2265, + "step": 11862 + }, + { + "epoch": 1.1175431572501826, + "grad_norm": 0.6194037199020386, + "learning_rate": 8.442740030918157e-06, + "loss": 0.2022, + "step": 11863 + }, + { + "epoch": 1.1176373613433503, + "grad_norm": 0.6037282943725586, + "learning_rate": 8.441248452361616e-06, + "loss": 0.2143, + "step": 11864 + }, + { + "epoch": 1.1177315654365183, + "grad_norm": 0.6462535858154297, + "learning_rate": 8.439756909347021e-06, + "loss": 0.1977, + "step": 11865 + }, + { + "epoch": 1.117825769529686, + "grad_norm": 0.6289598941802979, + "learning_rate": 8.438265401908378e-06, + "loss": 0.2045, + "step": 11866 + }, + { + "epoch": 1.117919973622854, + "grad_norm": 0.6735430359840393, + "learning_rate": 8.436773930079703e-06, + "loss": 0.2309, + "step": 11867 + }, + { + "epoch": 1.1180141777160217, + "grad_norm": 0.6193204522132874, + "learning_rate": 8.435282493894999e-06, + "loss": 0.2137, + "step": 11868 + }, + { + "epoch": 1.1181083818091897, + "grad_norm": 0.7990676760673523, + "learning_rate": 8.433791093388275e-06, + "loss": 0.1938, + "step": 11869 + }, + { + "epoch": 1.1182025859023574, + "grad_norm": 0.6005562543869019, + "learning_rate": 8.43229972859353e-06, + "loss": 0.1974, + "step": 11870 + }, + { + "epoch": 1.1182967899955254, + "grad_norm": 0.6522964835166931, + "learning_rate": 8.430808399544785e-06, + "loss": 0.1952, + "step": 11871 + }, + { + "epoch": 1.1183909940886931, + "grad_norm": 0.6723132729530334, + "learning_rate": 8.42931710627603e-06, + "loss": 0.2225, + "step": 11872 + }, + { + "epoch": 1.118485198181861, + "grad_norm": 0.6188422441482544, + "learning_rate": 8.427825848821272e-06, + "loss": 0.1956, + "step": 11873 + }, + { + "epoch": 1.1185794022750288, + "grad_norm": 0.6260895729064941, + "learning_rate": 8.42633462721452e-06, + "loss": 0.217, + "step": 11874 + }, + { + "epoch": 1.1186736063681968, + "grad_norm": 0.6683898568153381, + "learning_rate": 8.424843441489773e-06, + "loss": 0.2198, + "step": 11875 + }, + { + "epoch": 1.1187678104613645, + "grad_norm": 0.6727320551872253, + "learning_rate": 8.423352291681027e-06, + "loss": 0.206, + "step": 11876 + }, + { + "epoch": 1.1188620145545325, + "grad_norm": 0.6028059124946594, + "learning_rate": 8.42186117782229e-06, + "loss": 0.1869, + "step": 11877 + }, + { + "epoch": 1.1189562186477002, + "grad_norm": 0.6548950672149658, + "learning_rate": 8.42037009994756e-06, + "loss": 0.2057, + "step": 11878 + }, + { + "epoch": 1.1190504227408682, + "grad_norm": 0.6151114702224731, + "learning_rate": 8.418879058090833e-06, + "loss": 0.2108, + "step": 11879 + }, + { + "epoch": 1.119144626834036, + "grad_norm": 0.6699797511100769, + "learning_rate": 8.417388052286108e-06, + "loss": 0.2083, + "step": 11880 + }, + { + "epoch": 1.1192388309272039, + "grad_norm": 0.6789939403533936, + "learning_rate": 8.415897082567383e-06, + "loss": 0.2274, + "step": 11881 + }, + { + "epoch": 1.1193330350203716, + "grad_norm": 0.687531054019928, + "learning_rate": 8.414406148968657e-06, + "loss": 0.24, + "step": 11882 + }, + { + "epoch": 1.1194272391135396, + "grad_norm": 0.7133904099464417, + "learning_rate": 8.41291525152392e-06, + "loss": 0.1995, + "step": 11883 + }, + { + "epoch": 1.1195214432067073, + "grad_norm": 0.6760929822921753, + "learning_rate": 8.41142439026717e-06, + "loss": 0.2037, + "step": 11884 + }, + { + "epoch": 1.1196156472998753, + "grad_norm": 0.575566828250885, + "learning_rate": 8.409933565232402e-06, + "loss": 0.1958, + "step": 11885 + }, + { + "epoch": 1.119709851393043, + "grad_norm": 0.6504658460617065, + "learning_rate": 8.408442776453606e-06, + "loss": 0.2191, + "step": 11886 + }, + { + "epoch": 1.119804055486211, + "grad_norm": 0.643571138381958, + "learning_rate": 8.406952023964778e-06, + "loss": 0.1965, + "step": 11887 + }, + { + "epoch": 1.1198982595793787, + "grad_norm": 0.6231205463409424, + "learning_rate": 8.405461307799909e-06, + "loss": 0.193, + "step": 11888 + }, + { + "epoch": 1.1199924636725465, + "grad_norm": 0.5547332763671875, + "learning_rate": 8.403970627992988e-06, + "loss": 0.174, + "step": 11889 + }, + { + "epoch": 1.1200866677657144, + "grad_norm": 0.6632810235023499, + "learning_rate": 8.402479984578e-06, + "loss": 0.2057, + "step": 11890 + }, + { + "epoch": 1.1201808718588824, + "grad_norm": 0.688934862613678, + "learning_rate": 8.400989377588944e-06, + "loss": 0.2127, + "step": 11891 + }, + { + "epoch": 1.1202750759520501, + "grad_norm": 0.6259660124778748, + "learning_rate": 8.399498807059802e-06, + "loss": 0.1898, + "step": 11892 + }, + { + "epoch": 1.1203692800452179, + "grad_norm": 0.6416435241699219, + "learning_rate": 8.398008273024557e-06, + "loss": 0.2004, + "step": 11893 + }, + { + "epoch": 1.1204634841383858, + "grad_norm": 0.6270011067390442, + "learning_rate": 8.396517775517208e-06, + "loss": 0.2, + "step": 11894 + }, + { + "epoch": 1.1205576882315538, + "grad_norm": 0.628013014793396, + "learning_rate": 8.395027314571734e-06, + "loss": 0.2063, + "step": 11895 + }, + { + "epoch": 1.1206518923247215, + "grad_norm": 0.6344326138496399, + "learning_rate": 8.393536890222114e-06, + "loss": 0.206, + "step": 11896 + }, + { + "epoch": 1.1207460964178892, + "grad_norm": 0.7040308713912964, + "learning_rate": 8.39204650250234e-06, + "loss": 0.2091, + "step": 11897 + }, + { + "epoch": 1.1208403005110572, + "grad_norm": 0.6208798289299011, + "learning_rate": 8.390556151446393e-06, + "loss": 0.1957, + "step": 11898 + }, + { + "epoch": 1.1209345046042252, + "grad_norm": 0.7067717909812927, + "learning_rate": 8.389065837088254e-06, + "loss": 0.219, + "step": 11899 + }, + { + "epoch": 1.121028708697393, + "grad_norm": 0.6420107483863831, + "learning_rate": 8.387575559461905e-06, + "loss": 0.205, + "step": 11900 + }, + { + "epoch": 1.1211229127905606, + "grad_norm": 0.651452362537384, + "learning_rate": 8.386085318601328e-06, + "loss": 0.211, + "step": 11901 + }, + { + "epoch": 1.1212171168837286, + "grad_norm": 0.6510005593299866, + "learning_rate": 8.3845951145405e-06, + "loss": 0.2113, + "step": 11902 + }, + { + "epoch": 1.1213113209768966, + "grad_norm": 0.632051408290863, + "learning_rate": 8.3831049473134e-06, + "loss": 0.1981, + "step": 11903 + }, + { + "epoch": 1.1214055250700643, + "grad_norm": 0.7081713080406189, + "learning_rate": 8.381614816954012e-06, + "loss": 0.2134, + "step": 11904 + }, + { + "epoch": 1.121499729163232, + "grad_norm": 0.6058084964752197, + "learning_rate": 8.380124723496308e-06, + "loss": 0.175, + "step": 11905 + }, + { + "epoch": 1.1215939332564, + "grad_norm": 0.6154360771179199, + "learning_rate": 8.378634666974264e-06, + "loss": 0.2037, + "step": 11906 + }, + { + "epoch": 1.1216881373495677, + "grad_norm": 0.6388738751411438, + "learning_rate": 8.37714464742186e-06, + "loss": 0.1925, + "step": 11907 + }, + { + "epoch": 1.1217823414427357, + "grad_norm": 0.6988288760185242, + "learning_rate": 8.375654664873065e-06, + "loss": 0.211, + "step": 11908 + }, + { + "epoch": 1.1218765455359034, + "grad_norm": 0.7009318470954895, + "learning_rate": 8.37416471936186e-06, + "loss": 0.2147, + "step": 11909 + }, + { + "epoch": 1.1219707496290714, + "grad_norm": 0.7610530853271484, + "learning_rate": 8.372674810922206e-06, + "loss": 0.2121, + "step": 11910 + }, + { + "epoch": 1.1220649537222391, + "grad_norm": 0.6230190396308899, + "learning_rate": 8.37118493958809e-06, + "loss": 0.2024, + "step": 11911 + }, + { + "epoch": 1.122159157815407, + "grad_norm": 0.6252520084381104, + "learning_rate": 8.369695105393474e-06, + "loss": 0.1882, + "step": 11912 + }, + { + "epoch": 1.1222533619085748, + "grad_norm": 0.6650586724281311, + "learning_rate": 8.368205308372327e-06, + "loss": 0.2058, + "step": 11913 + }, + { + "epoch": 1.1223475660017428, + "grad_norm": 0.7434647679328918, + "learning_rate": 8.36671554855863e-06, + "loss": 0.1781, + "step": 11914 + }, + { + "epoch": 1.1224417700949105, + "grad_norm": 0.6390784382820129, + "learning_rate": 8.365225825986341e-06, + "loss": 0.1926, + "step": 11915 + }, + { + "epoch": 1.1225359741880785, + "grad_norm": 0.6207288503646851, + "learning_rate": 8.36373614068943e-06, + "loss": 0.1966, + "step": 11916 + }, + { + "epoch": 1.1226301782812462, + "grad_norm": 0.6809665560722351, + "learning_rate": 8.362246492701866e-06, + "loss": 0.2353, + "step": 11917 + }, + { + "epoch": 1.1227243823744142, + "grad_norm": 0.7236821055412292, + "learning_rate": 8.360756882057617e-06, + "loss": 0.2126, + "step": 11918 + }, + { + "epoch": 1.122818586467582, + "grad_norm": 0.6286883354187012, + "learning_rate": 8.359267308790644e-06, + "loss": 0.19, + "step": 11919 + }, + { + "epoch": 1.1229127905607499, + "grad_norm": 0.647922158241272, + "learning_rate": 8.357777772934914e-06, + "loss": 0.1853, + "step": 11920 + }, + { + "epoch": 1.1230069946539176, + "grad_norm": 0.7578187584877014, + "learning_rate": 8.356288274524392e-06, + "loss": 0.2371, + "step": 11921 + }, + { + "epoch": 1.1231011987470856, + "grad_norm": 0.6257461905479431, + "learning_rate": 8.354798813593038e-06, + "loss": 0.2035, + "step": 11922 + }, + { + "epoch": 1.1231954028402533, + "grad_norm": 0.711329996585846, + "learning_rate": 8.353309390174814e-06, + "loss": 0.196, + "step": 11923 + }, + { + "epoch": 1.1232896069334213, + "grad_norm": 0.6793182492256165, + "learning_rate": 8.351820004303686e-06, + "loss": 0.2159, + "step": 11924 + }, + { + "epoch": 1.123383811026589, + "grad_norm": 0.6541587114334106, + "learning_rate": 8.350330656013608e-06, + "loss": 0.1909, + "step": 11925 + }, + { + "epoch": 1.123478015119757, + "grad_norm": 0.6262333393096924, + "learning_rate": 8.348841345338544e-06, + "loss": 0.2198, + "step": 11926 + }, + { + "epoch": 1.1235722192129247, + "grad_norm": 0.6772410273551941, + "learning_rate": 8.34735207231245e-06, + "loss": 0.24, + "step": 11927 + }, + { + "epoch": 1.1236664233060927, + "grad_norm": 0.5929433703422546, + "learning_rate": 8.345862836969288e-06, + "loss": 0.2013, + "step": 11928 + }, + { + "epoch": 1.1237606273992604, + "grad_norm": 0.6401875615119934, + "learning_rate": 8.344373639343008e-06, + "loss": 0.1817, + "step": 11929 + }, + { + "epoch": 1.1238548314924284, + "grad_norm": 0.670573890209198, + "learning_rate": 8.342884479467566e-06, + "loss": 0.2177, + "step": 11930 + }, + { + "epoch": 1.1239490355855961, + "grad_norm": 0.6134161353111267, + "learning_rate": 8.341395357376928e-06, + "loss": 0.1717, + "step": 11931 + }, + { + "epoch": 1.124043239678764, + "grad_norm": 0.6540322303771973, + "learning_rate": 8.339906273105038e-06, + "loss": 0.2413, + "step": 11932 + }, + { + "epoch": 1.1241374437719318, + "grad_norm": 0.6331095099449158, + "learning_rate": 8.338417226685849e-06, + "loss": 0.2031, + "step": 11933 + }, + { + "epoch": 1.1242316478650998, + "grad_norm": 0.6158044338226318, + "learning_rate": 8.336928218153322e-06, + "loss": 0.1873, + "step": 11934 + }, + { + "epoch": 1.1243258519582675, + "grad_norm": 0.6778693199157715, + "learning_rate": 8.335439247541403e-06, + "loss": 0.207, + "step": 11935 + }, + { + "epoch": 1.1244200560514355, + "grad_norm": 0.6944397687911987, + "learning_rate": 8.333950314884039e-06, + "loss": 0.212, + "step": 11936 + }, + { + "epoch": 1.1245142601446032, + "grad_norm": 0.6102316975593567, + "learning_rate": 8.332461420215188e-06, + "loss": 0.1995, + "step": 11937 + }, + { + "epoch": 1.1246084642377712, + "grad_norm": 0.6247611045837402, + "learning_rate": 8.330972563568796e-06, + "loss": 0.2118, + "step": 11938 + }, + { + "epoch": 1.124702668330939, + "grad_norm": 0.6267822980880737, + "learning_rate": 8.32948374497881e-06, + "loss": 0.2012, + "step": 11939 + }, + { + "epoch": 1.1247968724241069, + "grad_norm": 0.6360173225402832, + "learning_rate": 8.327994964479177e-06, + "loss": 0.2037, + "step": 11940 + }, + { + "epoch": 1.1248910765172746, + "grad_norm": 0.5730154514312744, + "learning_rate": 8.326506222103845e-06, + "loss": 0.1711, + "step": 11941 + }, + { + "epoch": 1.1249852806104426, + "grad_norm": 0.6362870335578918, + "learning_rate": 8.325017517886761e-06, + "loss": 0.2035, + "step": 11942 + }, + { + "epoch": 1.1250794847036103, + "grad_norm": 1.0078284740447998, + "learning_rate": 8.323528851861864e-06, + "loss": 0.2354, + "step": 11943 + }, + { + "epoch": 1.1251736887967783, + "grad_norm": 0.6259084343910217, + "learning_rate": 8.322040224063105e-06, + "loss": 0.1933, + "step": 11944 + }, + { + "epoch": 1.125267892889946, + "grad_norm": 0.6518905162811279, + "learning_rate": 8.320551634524424e-06, + "loss": 0.2141, + "step": 11945 + }, + { + "epoch": 1.125362096983114, + "grad_norm": 0.6681150794029236, + "learning_rate": 8.319063083279761e-06, + "loss": 0.2225, + "step": 11946 + }, + { + "epoch": 1.1254563010762817, + "grad_norm": 0.6188362240791321, + "learning_rate": 8.317574570363063e-06, + "loss": 0.1716, + "step": 11947 + }, + { + "epoch": 1.1255505051694497, + "grad_norm": 0.6135340332984924, + "learning_rate": 8.316086095808268e-06, + "loss": 0.1972, + "step": 11948 + }, + { + "epoch": 1.1256447092626174, + "grad_norm": 0.7542541027069092, + "learning_rate": 8.31459765964931e-06, + "loss": 0.2325, + "step": 11949 + }, + { + "epoch": 1.1257389133557854, + "grad_norm": 0.6483733057975769, + "learning_rate": 8.313109261920127e-06, + "loss": 0.1796, + "step": 11950 + }, + { + "epoch": 1.125833117448953, + "grad_norm": 0.7310289144515991, + "learning_rate": 8.311620902654672e-06, + "loss": 0.2056, + "step": 11951 + }, + { + "epoch": 1.125927321542121, + "grad_norm": 0.6474401950836182, + "learning_rate": 8.310132581886867e-06, + "loss": 0.1949, + "step": 11952 + }, + { + "epoch": 1.1260215256352888, + "grad_norm": 0.6151375770568848, + "learning_rate": 8.308644299650649e-06, + "loss": 0.21, + "step": 11953 + }, + { + "epoch": 1.1261157297284567, + "grad_norm": 0.6289949417114258, + "learning_rate": 8.307156055979962e-06, + "loss": 0.1941, + "step": 11954 + }, + { + "epoch": 1.1262099338216245, + "grad_norm": 0.7216406464576721, + "learning_rate": 8.305667850908733e-06, + "loss": 0.2065, + "step": 11955 + }, + { + "epoch": 1.1263041379147924, + "grad_norm": 0.6386764645576477, + "learning_rate": 8.304179684470897e-06, + "loss": 0.2036, + "step": 11956 + }, + { + "epoch": 1.1263983420079602, + "grad_norm": 0.6260892748832703, + "learning_rate": 8.302691556700387e-06, + "loss": 0.2045, + "step": 11957 + }, + { + "epoch": 1.1264925461011281, + "grad_norm": 0.6668972969055176, + "learning_rate": 8.301203467631136e-06, + "loss": 0.238, + "step": 11958 + }, + { + "epoch": 1.1265867501942959, + "grad_norm": 0.7042335867881775, + "learning_rate": 8.299715417297072e-06, + "loss": 0.2159, + "step": 11959 + }, + { + "epoch": 1.1266809542874638, + "grad_norm": 0.6877856850624084, + "learning_rate": 8.298227405732124e-06, + "loss": 0.2075, + "step": 11960 + }, + { + "epoch": 1.1267751583806316, + "grad_norm": 0.6739022135734558, + "learning_rate": 8.296739432970225e-06, + "loss": 0.2085, + "step": 11961 + }, + { + "epoch": 1.1268693624737995, + "grad_norm": 0.6096248626708984, + "learning_rate": 8.295251499045303e-06, + "loss": 0.189, + "step": 11962 + }, + { + "epoch": 1.1269635665669673, + "grad_norm": 0.599459707736969, + "learning_rate": 8.293763603991279e-06, + "loss": 0.1856, + "step": 11963 + }, + { + "epoch": 1.1270577706601352, + "grad_norm": 0.6436136960983276, + "learning_rate": 8.292275747842086e-06, + "loss": 0.1912, + "step": 11964 + }, + { + "epoch": 1.127151974753303, + "grad_norm": 0.6300147771835327, + "learning_rate": 8.290787930631648e-06, + "loss": 0.1908, + "step": 11965 + }, + { + "epoch": 1.127246178846471, + "grad_norm": 0.5731162428855896, + "learning_rate": 8.289300152393884e-06, + "loss": 0.1888, + "step": 11966 + }, + { + "epoch": 1.1273403829396387, + "grad_norm": 0.6125643253326416, + "learning_rate": 8.287812413162727e-06, + "loss": 0.1915, + "step": 11967 + }, + { + "epoch": 1.1274345870328066, + "grad_norm": 0.6469168066978455, + "learning_rate": 8.286324712972095e-06, + "loss": 0.2075, + "step": 11968 + }, + { + "epoch": 1.1275287911259744, + "grad_norm": 0.6300848126411438, + "learning_rate": 8.28483705185591e-06, + "loss": 0.2087, + "step": 11969 + }, + { + "epoch": 1.1276229952191423, + "grad_norm": 0.6567912697792053, + "learning_rate": 8.283349429848087e-06, + "loss": 0.1985, + "step": 11970 + }, + { + "epoch": 1.12771719931231, + "grad_norm": 0.6428489089012146, + "learning_rate": 8.281861846982558e-06, + "loss": 0.2212, + "step": 11971 + }, + { + "epoch": 1.127811403405478, + "grad_norm": 0.7107322812080383, + "learning_rate": 8.280374303293235e-06, + "loss": 0.2077, + "step": 11972 + }, + { + "epoch": 1.1279056074986458, + "grad_norm": 0.680479884147644, + "learning_rate": 8.278886798814034e-06, + "loss": 0.2132, + "step": 11973 + }, + { + "epoch": 1.1279998115918137, + "grad_norm": 0.610516369342804, + "learning_rate": 8.27739933357888e-06, + "loss": 0.1945, + "step": 11974 + }, + { + "epoch": 1.1280940156849815, + "grad_norm": 0.6916232705116272, + "learning_rate": 8.275911907621683e-06, + "loss": 0.2075, + "step": 11975 + }, + { + "epoch": 1.1281882197781494, + "grad_norm": 0.6947224140167236, + "learning_rate": 8.274424520976358e-06, + "loss": 0.2099, + "step": 11976 + }, + { + "epoch": 1.1282824238713172, + "grad_norm": 0.6378065347671509, + "learning_rate": 8.272937173676828e-06, + "loss": 0.2184, + "step": 11977 + }, + { + "epoch": 1.1283766279644851, + "grad_norm": 0.7788832187652588, + "learning_rate": 8.271449865757e-06, + "loss": 0.2184, + "step": 11978 + }, + { + "epoch": 1.1284708320576529, + "grad_norm": 0.6401863694190979, + "learning_rate": 8.269962597250786e-06, + "loss": 0.2462, + "step": 11979 + }, + { + "epoch": 1.1285650361508208, + "grad_norm": 0.6124123930931091, + "learning_rate": 8.2684753681921e-06, + "loss": 0.2183, + "step": 11980 + }, + { + "epoch": 1.1286592402439886, + "grad_norm": 0.7102654576301575, + "learning_rate": 8.266988178614857e-06, + "loss": 0.2293, + "step": 11981 + }, + { + "epoch": 1.1287534443371565, + "grad_norm": 0.6006560921669006, + "learning_rate": 8.26550102855296e-06, + "loss": 0.1955, + "step": 11982 + }, + { + "epoch": 1.1288476484303243, + "grad_norm": 0.6482653617858887, + "learning_rate": 8.264013918040324e-06, + "loss": 0.2279, + "step": 11983 + }, + { + "epoch": 1.1289418525234922, + "grad_norm": 0.6651721000671387, + "learning_rate": 8.262526847110856e-06, + "loss": 0.178, + "step": 11984 + }, + { + "epoch": 1.12903605661666, + "grad_norm": 0.6731470823287964, + "learning_rate": 8.261039815798461e-06, + "loss": 0.2186, + "step": 11985 + }, + { + "epoch": 1.129130260709828, + "grad_norm": 0.7015734314918518, + "learning_rate": 8.259552824137049e-06, + "loss": 0.2227, + "step": 11986 + }, + { + "epoch": 1.1292244648029957, + "grad_norm": 0.6767539978027344, + "learning_rate": 8.258065872160523e-06, + "loss": 0.191, + "step": 11987 + }, + { + "epoch": 1.1293186688961636, + "grad_norm": 0.6386114358901978, + "learning_rate": 8.256578959902794e-06, + "loss": 0.2152, + "step": 11988 + }, + { + "epoch": 1.1294128729893314, + "grad_norm": 0.6761164665222168, + "learning_rate": 8.255092087397757e-06, + "loss": 0.1971, + "step": 11989 + }, + { + "epoch": 1.1295070770824993, + "grad_norm": 0.6379042863845825, + "learning_rate": 8.253605254679316e-06, + "loss": 0.2519, + "step": 11990 + }, + { + "epoch": 1.129601281175667, + "grad_norm": 0.7005086541175842, + "learning_rate": 8.252118461781381e-06, + "loss": 0.2181, + "step": 11991 + }, + { + "epoch": 1.129695485268835, + "grad_norm": 0.7167004942893982, + "learning_rate": 8.250631708737848e-06, + "loss": 0.2465, + "step": 11992 + }, + { + "epoch": 1.1297896893620027, + "grad_norm": 0.6855339407920837, + "learning_rate": 8.249144995582613e-06, + "loss": 0.2097, + "step": 11993 + }, + { + "epoch": 1.1298838934551707, + "grad_norm": 0.6948375105857849, + "learning_rate": 8.247658322349583e-06, + "loss": 0.2115, + "step": 11994 + }, + { + "epoch": 1.1299780975483384, + "grad_norm": 0.6016854643821716, + "learning_rate": 8.246171689072654e-06, + "loss": 0.1975, + "step": 11995 + }, + { + "epoch": 1.1300723016415064, + "grad_norm": 0.701055109500885, + "learning_rate": 8.24468509578572e-06, + "loss": 0.2097, + "step": 11996 + }, + { + "epoch": 1.1301665057346741, + "grad_norm": 0.5774961709976196, + "learning_rate": 8.243198542522682e-06, + "loss": 0.1831, + "step": 11997 + }, + { + "epoch": 1.130260709827842, + "grad_norm": 0.6355651617050171, + "learning_rate": 8.241712029317435e-06, + "loss": 0.1829, + "step": 11998 + }, + { + "epoch": 1.1303549139210098, + "grad_norm": 0.6253888607025146, + "learning_rate": 8.240225556203874e-06, + "loss": 0.2275, + "step": 11999 + }, + { + "epoch": 1.1304491180141778, + "grad_norm": 0.6373803019523621, + "learning_rate": 8.238739123215887e-06, + "loss": 0.1924, + "step": 12000 + }, + { + "epoch": 1.1305433221073455, + "grad_norm": 0.6929498314857483, + "learning_rate": 8.237252730387376e-06, + "loss": 0.2081, + "step": 12001 + }, + { + "epoch": 1.1306375262005135, + "grad_norm": 0.7598393559455872, + "learning_rate": 8.235766377752229e-06, + "loss": 0.2178, + "step": 12002 + }, + { + "epoch": 1.1307317302936812, + "grad_norm": 0.649556577205658, + "learning_rate": 8.234280065344335e-06, + "loss": 0.1958, + "step": 12003 + }, + { + "epoch": 1.1308259343868492, + "grad_norm": 0.6266392469406128, + "learning_rate": 8.232793793197586e-06, + "loss": 0.1946, + "step": 12004 + }, + { + "epoch": 1.130920138480017, + "grad_norm": 0.7205418348312378, + "learning_rate": 8.231307561345874e-06, + "loss": 0.2089, + "step": 12005 + }, + { + "epoch": 1.131014342573185, + "grad_norm": 0.6992611289024353, + "learning_rate": 8.229821369823082e-06, + "loss": 0.221, + "step": 12006 + }, + { + "epoch": 1.1311085466663526, + "grad_norm": 0.6830902099609375, + "learning_rate": 8.228335218663103e-06, + "loss": 0.2184, + "step": 12007 + }, + { + "epoch": 1.1312027507595206, + "grad_norm": 1.000243067741394, + "learning_rate": 8.226849107899822e-06, + "loss": 0.1867, + "step": 12008 + }, + { + "epoch": 1.1312969548526883, + "grad_norm": 0.5924058556556702, + "learning_rate": 8.225363037567122e-06, + "loss": 0.2159, + "step": 12009 + }, + { + "epoch": 1.1313911589458563, + "grad_norm": 0.6228819489479065, + "learning_rate": 8.223877007698885e-06, + "loss": 0.1992, + "step": 12010 + }, + { + "epoch": 1.131485363039024, + "grad_norm": 0.6410822868347168, + "learning_rate": 8.222391018329007e-06, + "loss": 0.2064, + "step": 12011 + }, + { + "epoch": 1.131579567132192, + "grad_norm": 0.6461016535758972, + "learning_rate": 8.220905069491359e-06, + "loss": 0.208, + "step": 12012 + }, + { + "epoch": 1.1316737712253597, + "grad_norm": 0.7207659482955933, + "learning_rate": 8.219419161219825e-06, + "loss": 0.1996, + "step": 12013 + }, + { + "epoch": 1.1317679753185277, + "grad_norm": 0.6678380966186523, + "learning_rate": 8.21793329354829e-06, + "loss": 0.211, + "step": 12014 + }, + { + "epoch": 1.1318621794116954, + "grad_norm": 0.6664488911628723, + "learning_rate": 8.216447466510633e-06, + "loss": 0.1997, + "step": 12015 + }, + { + "epoch": 1.1319563835048632, + "grad_norm": 0.649957001209259, + "learning_rate": 8.214961680140728e-06, + "loss": 0.2047, + "step": 12016 + }, + { + "epoch": 1.1320505875980311, + "grad_norm": 0.6333931088447571, + "learning_rate": 8.21347593447246e-06, + "loss": 0.2266, + "step": 12017 + }, + { + "epoch": 1.132144791691199, + "grad_norm": 0.6505257487297058, + "learning_rate": 8.211990229539704e-06, + "loss": 0.2159, + "step": 12018 + }, + { + "epoch": 1.1322389957843668, + "grad_norm": 0.5816699862480164, + "learning_rate": 8.210504565376337e-06, + "loss": 0.1764, + "step": 12019 + }, + { + "epoch": 1.1323331998775346, + "grad_norm": 0.8375851511955261, + "learning_rate": 8.20901894201623e-06, + "loss": 0.2656, + "step": 12020 + }, + { + "epoch": 1.1324274039707025, + "grad_norm": 0.7156752943992615, + "learning_rate": 8.207533359493262e-06, + "loss": 0.2226, + "step": 12021 + }, + { + "epoch": 1.1325216080638705, + "grad_norm": 0.7169298529624939, + "learning_rate": 8.206047817841308e-06, + "loss": 0.2352, + "step": 12022 + }, + { + "epoch": 1.1326158121570382, + "grad_norm": 0.6565631628036499, + "learning_rate": 8.204562317094235e-06, + "loss": 0.2273, + "step": 12023 + }, + { + "epoch": 1.132710016250206, + "grad_norm": 0.6851234436035156, + "learning_rate": 8.203076857285919e-06, + "loss": 0.2034, + "step": 12024 + }, + { + "epoch": 1.132804220343374, + "grad_norm": 0.5998619198799133, + "learning_rate": 8.20159143845023e-06, + "loss": 0.1939, + "step": 12025 + }, + { + "epoch": 1.1328984244365419, + "grad_norm": 0.6665204763412476, + "learning_rate": 8.200106060621036e-06, + "loss": 0.1994, + "step": 12026 + }, + { + "epoch": 1.1329926285297096, + "grad_norm": 0.6673848628997803, + "learning_rate": 8.19862072383221e-06, + "loss": 0.2355, + "step": 12027 + }, + { + "epoch": 1.1330868326228773, + "grad_norm": 0.622778594493866, + "learning_rate": 8.197135428117618e-06, + "loss": 0.1933, + "step": 12028 + }, + { + "epoch": 1.1331810367160453, + "grad_norm": 0.6481777429580688, + "learning_rate": 8.195650173511127e-06, + "loss": 0.1957, + "step": 12029 + }, + { + "epoch": 1.1332752408092133, + "grad_norm": 0.6352075338363647, + "learning_rate": 8.194164960046595e-06, + "loss": 0.1932, + "step": 12030 + }, + { + "epoch": 1.133369444902381, + "grad_norm": 0.6170874834060669, + "learning_rate": 8.192679787757903e-06, + "loss": 0.208, + "step": 12031 + }, + { + "epoch": 1.1334636489955487, + "grad_norm": 0.6173102855682373, + "learning_rate": 8.191194656678905e-06, + "loss": 0.1949, + "step": 12032 + }, + { + "epoch": 1.1335578530887167, + "grad_norm": 0.7190761566162109, + "learning_rate": 8.189709566843463e-06, + "loss": 0.2199, + "step": 12033 + }, + { + "epoch": 1.1336520571818847, + "grad_norm": 0.6681669354438782, + "learning_rate": 8.188224518285445e-06, + "loss": 0.1875, + "step": 12034 + }, + { + "epoch": 1.1337462612750524, + "grad_norm": 0.6125658750534058, + "learning_rate": 8.18673951103871e-06, + "loss": 0.194, + "step": 12035 + }, + { + "epoch": 1.1338404653682201, + "grad_norm": 0.5775545239448547, + "learning_rate": 8.185254545137115e-06, + "loss": 0.1859, + "step": 12036 + }, + { + "epoch": 1.133934669461388, + "grad_norm": 0.5781084299087524, + "learning_rate": 8.183769620614528e-06, + "loss": 0.1816, + "step": 12037 + }, + { + "epoch": 1.134028873554556, + "grad_norm": 0.6491709351539612, + "learning_rate": 8.1822847375048e-06, + "loss": 0.1779, + "step": 12038 + }, + { + "epoch": 1.1341230776477238, + "grad_norm": 0.6196013689041138, + "learning_rate": 8.180799895841793e-06, + "loss": 0.1854, + "step": 12039 + }, + { + "epoch": 1.1342172817408915, + "grad_norm": 0.6702143549919128, + "learning_rate": 8.179315095659358e-06, + "loss": 0.2014, + "step": 12040 + }, + { + "epoch": 1.1343114858340595, + "grad_norm": 0.639610230922699, + "learning_rate": 8.177830336991357e-06, + "loss": 0.1889, + "step": 12041 + }, + { + "epoch": 1.1344056899272275, + "grad_norm": 0.59451824426651, + "learning_rate": 8.176345619871643e-06, + "loss": 0.2034, + "step": 12042 + }, + { + "epoch": 1.1344998940203952, + "grad_norm": 0.6340506076812744, + "learning_rate": 8.174860944334067e-06, + "loss": 0.1984, + "step": 12043 + }, + { + "epoch": 1.134594098113563, + "grad_norm": 0.5687589049339294, + "learning_rate": 8.173376310412486e-06, + "loss": 0.1574, + "step": 12044 + }, + { + "epoch": 1.134688302206731, + "grad_norm": 0.5838512182235718, + "learning_rate": 8.171891718140753e-06, + "loss": 0.1831, + "step": 12045 + }, + { + "epoch": 1.1347825062998989, + "grad_norm": 0.6016603708267212, + "learning_rate": 8.170407167552707e-06, + "loss": 0.211, + "step": 12046 + }, + { + "epoch": 1.1348767103930666, + "grad_norm": 0.6274499893188477, + "learning_rate": 8.168922658682214e-06, + "loss": 0.2022, + "step": 12047 + }, + { + "epoch": 1.1349709144862343, + "grad_norm": 0.6957648396492004, + "learning_rate": 8.167438191563118e-06, + "loss": 0.2305, + "step": 12048 + }, + { + "epoch": 1.1350651185794023, + "grad_norm": 0.742519736289978, + "learning_rate": 8.165953766229263e-06, + "loss": 0.2084, + "step": 12049 + }, + { + "epoch": 1.1351593226725702, + "grad_norm": 0.630852222442627, + "learning_rate": 8.164469382714493e-06, + "loss": 0.1887, + "step": 12050 + }, + { + "epoch": 1.135253526765738, + "grad_norm": 0.612669825553894, + "learning_rate": 8.162985041052668e-06, + "loss": 0.2063, + "step": 12051 + }, + { + "epoch": 1.1353477308589057, + "grad_norm": 0.6570709943771362, + "learning_rate": 8.16150074127762e-06, + "loss": 0.2442, + "step": 12052 + }, + { + "epoch": 1.1354419349520737, + "grad_norm": 0.6268219947814941, + "learning_rate": 8.1600164834232e-06, + "loss": 0.208, + "step": 12053 + }, + { + "epoch": 1.1355361390452414, + "grad_norm": 0.6095151901245117, + "learning_rate": 8.15853226752325e-06, + "loss": 0.2111, + "step": 12054 + }, + { + "epoch": 1.1356303431384094, + "grad_norm": 0.6499700546264648, + "learning_rate": 8.157048093611613e-06, + "loss": 0.1994, + "step": 12055 + }, + { + "epoch": 1.1357245472315771, + "grad_norm": 0.7538737654685974, + "learning_rate": 8.155563961722127e-06, + "loss": 0.1941, + "step": 12056 + }, + { + "epoch": 1.135818751324745, + "grad_norm": 0.7377210259437561, + "learning_rate": 8.154079871888637e-06, + "loss": 0.2218, + "step": 12057 + }, + { + "epoch": 1.1359129554179128, + "grad_norm": 0.7277575135231018, + "learning_rate": 8.15259582414498e-06, + "loss": 0.2024, + "step": 12058 + }, + { + "epoch": 1.1360071595110808, + "grad_norm": 0.5854521989822388, + "learning_rate": 8.151111818524997e-06, + "loss": 0.1743, + "step": 12059 + }, + { + "epoch": 1.1361013636042485, + "grad_norm": 0.625900149345398, + "learning_rate": 8.149627855062521e-06, + "loss": 0.223, + "step": 12060 + }, + { + "epoch": 1.1361955676974165, + "grad_norm": 0.6748207807540894, + "learning_rate": 8.148143933791393e-06, + "loss": 0.2024, + "step": 12061 + }, + { + "epoch": 1.1362897717905842, + "grad_norm": 0.6645220518112183, + "learning_rate": 8.146660054745449e-06, + "loss": 0.2194, + "step": 12062 + }, + { + "epoch": 1.1363839758837522, + "grad_norm": 0.6353769302368164, + "learning_rate": 8.145176217958519e-06, + "loss": 0.2102, + "step": 12063 + }, + { + "epoch": 1.13647817997692, + "grad_norm": 0.6586669683456421, + "learning_rate": 8.143692423464442e-06, + "loss": 0.2161, + "step": 12064 + }, + { + "epoch": 1.1365723840700879, + "grad_norm": 0.6031149625778198, + "learning_rate": 8.14220867129705e-06, + "loss": 0.1881, + "step": 12065 + }, + { + "epoch": 1.1366665881632556, + "grad_norm": 0.7279676795005798, + "learning_rate": 8.140724961490167e-06, + "loss": 0.2362, + "step": 12066 + }, + { + "epoch": 1.1367607922564236, + "grad_norm": 0.6587242484092712, + "learning_rate": 8.139241294077636e-06, + "loss": 0.2243, + "step": 12067 + }, + { + "epoch": 1.1368549963495913, + "grad_norm": 0.6596072912216187, + "learning_rate": 8.137757669093283e-06, + "loss": 0.2028, + "step": 12068 + }, + { + "epoch": 1.1369492004427593, + "grad_norm": 0.6711840629577637, + "learning_rate": 8.13627408657093e-06, + "loss": 0.2048, + "step": 12069 + }, + { + "epoch": 1.137043404535927, + "grad_norm": 0.6759728789329529, + "learning_rate": 8.134790546544409e-06, + "loss": 0.2269, + "step": 12070 + }, + { + "epoch": 1.137137608629095, + "grad_norm": 0.7026951313018799, + "learning_rate": 8.133307049047554e-06, + "loss": 0.2362, + "step": 12071 + }, + { + "epoch": 1.1372318127222627, + "grad_norm": 0.7006614804267883, + "learning_rate": 8.131823594114183e-06, + "loss": 0.2183, + "step": 12072 + }, + { + "epoch": 1.1373260168154307, + "grad_norm": 0.6411715149879456, + "learning_rate": 8.13034018177812e-06, + "loss": 0.2158, + "step": 12073 + }, + { + "epoch": 1.1374202209085984, + "grad_norm": 0.6297891139984131, + "learning_rate": 8.128856812073196e-06, + "loss": 0.1788, + "step": 12074 + }, + { + "epoch": 1.1375144250017664, + "grad_norm": 0.5406700968742371, + "learning_rate": 8.127373485033231e-06, + "loss": 0.2036, + "step": 12075 + }, + { + "epoch": 1.137608629094934, + "grad_norm": 0.669718861579895, + "learning_rate": 8.125890200692043e-06, + "loss": 0.196, + "step": 12076 + }, + { + "epoch": 1.137702833188102, + "grad_norm": 0.6460605263710022, + "learning_rate": 8.124406959083459e-06, + "loss": 0.184, + "step": 12077 + }, + { + "epoch": 1.1377970372812698, + "grad_norm": 0.6431288719177246, + "learning_rate": 8.122923760241297e-06, + "loss": 0.1834, + "step": 12078 + }, + { + "epoch": 1.1378912413744378, + "grad_norm": 0.6451385617256165, + "learning_rate": 8.121440604199378e-06, + "loss": 0.1941, + "step": 12079 + }, + { + "epoch": 1.1379854454676055, + "grad_norm": 0.5862488746643066, + "learning_rate": 8.119957490991514e-06, + "loss": 0.2062, + "step": 12080 + }, + { + "epoch": 1.1380796495607735, + "grad_norm": 0.7165570259094238, + "learning_rate": 8.11847442065153e-06, + "loss": 0.2116, + "step": 12081 + }, + { + "epoch": 1.1381738536539412, + "grad_norm": 0.5758606791496277, + "learning_rate": 8.116991393213239e-06, + "loss": 0.194, + "step": 12082 + }, + { + "epoch": 1.1382680577471092, + "grad_norm": 0.6506445407867432, + "learning_rate": 8.115508408710454e-06, + "loss": 0.1872, + "step": 12083 + }, + { + "epoch": 1.138362261840277, + "grad_norm": 0.5742412805557251, + "learning_rate": 8.114025467176994e-06, + "loss": 0.1747, + "step": 12084 + }, + { + "epoch": 1.1384564659334448, + "grad_norm": 0.6786984801292419, + "learning_rate": 8.112542568646672e-06, + "loss": 0.2083, + "step": 12085 + }, + { + "epoch": 1.1385506700266126, + "grad_norm": 0.6596458554267883, + "learning_rate": 8.11105971315329e-06, + "loss": 0.1861, + "step": 12086 + }, + { + "epoch": 1.1386448741197805, + "grad_norm": 0.7559734582901001, + "learning_rate": 8.109576900730676e-06, + "loss": 0.2352, + "step": 12087 + }, + { + "epoch": 1.1387390782129483, + "grad_norm": 0.6362413167953491, + "learning_rate": 8.108094131412633e-06, + "loss": 0.198, + "step": 12088 + }, + { + "epoch": 1.1388332823061162, + "grad_norm": 0.6498036980628967, + "learning_rate": 8.106611405232967e-06, + "loss": 0.2126, + "step": 12089 + }, + { + "epoch": 1.138927486399284, + "grad_norm": 0.6648655533790588, + "learning_rate": 8.105128722225486e-06, + "loss": 0.2004, + "step": 12090 + }, + { + "epoch": 1.139021690492452, + "grad_norm": 0.657875120639801, + "learning_rate": 8.103646082424004e-06, + "loss": 0.2038, + "step": 12091 + }, + { + "epoch": 1.1391158945856197, + "grad_norm": 0.744015097618103, + "learning_rate": 8.102163485862324e-06, + "loss": 0.2375, + "step": 12092 + }, + { + "epoch": 1.1392100986787876, + "grad_norm": 0.6036882996559143, + "learning_rate": 8.10068093257425e-06, + "loss": 0.2114, + "step": 12093 + }, + { + "epoch": 1.1393043027719554, + "grad_norm": 0.6374387145042419, + "learning_rate": 8.099198422593589e-06, + "loss": 0.2025, + "step": 12094 + }, + { + "epoch": 1.1393985068651233, + "grad_norm": 0.6632814407348633, + "learning_rate": 8.097715955954145e-06, + "loss": 0.2065, + "step": 12095 + }, + { + "epoch": 1.139492710958291, + "grad_norm": 0.6246944665908813, + "learning_rate": 8.096233532689718e-06, + "loss": 0.2078, + "step": 12096 + }, + { + "epoch": 1.139586915051459, + "grad_norm": 0.6394679546356201, + "learning_rate": 8.094751152834109e-06, + "loss": 0.1938, + "step": 12097 + }, + { + "epoch": 1.1396811191446268, + "grad_norm": 0.5818282961845398, + "learning_rate": 8.093268816421122e-06, + "loss": 0.1783, + "step": 12098 + }, + { + "epoch": 1.1397753232377947, + "grad_norm": 0.649113118648529, + "learning_rate": 8.091786523484557e-06, + "loss": 0.2074, + "step": 12099 + }, + { + "epoch": 1.1398695273309625, + "grad_norm": 0.6660482287406921, + "learning_rate": 8.090304274058205e-06, + "loss": 0.2113, + "step": 12100 + }, + { + "epoch": 1.1399637314241304, + "grad_norm": 0.7327325344085693, + "learning_rate": 8.088822068175872e-06, + "loss": 0.2187, + "step": 12101 + }, + { + "epoch": 1.1400579355172982, + "grad_norm": 0.699038028717041, + "learning_rate": 8.087339905871354e-06, + "loss": 0.2182, + "step": 12102 + }, + { + "epoch": 1.1401521396104661, + "grad_norm": 0.6511684060096741, + "learning_rate": 8.085857787178439e-06, + "loss": 0.1888, + "step": 12103 + }, + { + "epoch": 1.1402463437036339, + "grad_norm": 0.6134809851646423, + "learning_rate": 8.08437571213093e-06, + "loss": 0.2038, + "step": 12104 + }, + { + "epoch": 1.1403405477968018, + "grad_norm": 0.6674476265907288, + "learning_rate": 8.082893680762619e-06, + "loss": 0.2282, + "step": 12105 + }, + { + "epoch": 1.1404347518899696, + "grad_norm": 0.5779377222061157, + "learning_rate": 8.081411693107291e-06, + "loss": 0.2, + "step": 12106 + }, + { + "epoch": 1.1405289559831375, + "grad_norm": 0.6250529885292053, + "learning_rate": 8.079929749198748e-06, + "loss": 0.2253, + "step": 12107 + }, + { + "epoch": 1.1406231600763053, + "grad_norm": 0.6807493567466736, + "learning_rate": 8.078447849070777e-06, + "loss": 0.2181, + "step": 12108 + }, + { + "epoch": 1.1407173641694732, + "grad_norm": 0.715938150882721, + "learning_rate": 8.076965992757166e-06, + "loss": 0.221, + "step": 12109 + }, + { + "epoch": 1.140811568262641, + "grad_norm": 0.6325286030769348, + "learning_rate": 8.075484180291702e-06, + "loss": 0.1948, + "step": 12110 + }, + { + "epoch": 1.140905772355809, + "grad_norm": 0.6969076991081238, + "learning_rate": 8.074002411708177e-06, + "loss": 0.2131, + "step": 12111 + }, + { + "epoch": 1.1409999764489767, + "grad_norm": 0.7182613611221313, + "learning_rate": 8.072520687040376e-06, + "loss": 0.1993, + "step": 12112 + }, + { + "epoch": 1.1410941805421446, + "grad_norm": 0.6815456748008728, + "learning_rate": 8.07103900632208e-06, + "loss": 0.1952, + "step": 12113 + }, + { + "epoch": 1.1411883846353124, + "grad_norm": 0.6510999798774719, + "learning_rate": 8.069557369587084e-06, + "loss": 0.1902, + "step": 12114 + }, + { + "epoch": 1.1412825887284803, + "grad_norm": 0.6855291128158569, + "learning_rate": 8.068075776869163e-06, + "loss": 0.2093, + "step": 12115 + }, + { + "epoch": 1.141376792821648, + "grad_norm": 0.6667503118515015, + "learning_rate": 8.066594228202101e-06, + "loss": 0.2271, + "step": 12116 + }, + { + "epoch": 1.141470996914816, + "grad_norm": 0.6633936762809753, + "learning_rate": 8.06511272361968e-06, + "loss": 0.1893, + "step": 12117 + }, + { + "epoch": 1.1415652010079838, + "grad_norm": 0.6098134517669678, + "learning_rate": 8.063631263155684e-06, + "loss": 0.1943, + "step": 12118 + }, + { + "epoch": 1.1416594051011517, + "grad_norm": 0.6373329758644104, + "learning_rate": 8.06214984684389e-06, + "loss": 0.2103, + "step": 12119 + }, + { + "epoch": 1.1417536091943195, + "grad_norm": 0.6690663695335388, + "learning_rate": 8.060668474718072e-06, + "loss": 0.2412, + "step": 12120 + }, + { + "epoch": 1.1418478132874874, + "grad_norm": 0.6494943499565125, + "learning_rate": 8.059187146812015e-06, + "loss": 0.1868, + "step": 12121 + }, + { + "epoch": 1.1419420173806551, + "grad_norm": 0.5967381596565247, + "learning_rate": 8.057705863159493e-06, + "loss": 0.199, + "step": 12122 + }, + { + "epoch": 1.142036221473823, + "grad_norm": 0.7099337577819824, + "learning_rate": 8.05622462379428e-06, + "loss": 0.2196, + "step": 12123 + }, + { + "epoch": 1.1421304255669908, + "grad_norm": 0.6998152732849121, + "learning_rate": 8.054743428750153e-06, + "loss": 0.2266, + "step": 12124 + }, + { + "epoch": 1.1422246296601588, + "grad_norm": 0.6493479609489441, + "learning_rate": 8.053262278060887e-06, + "loss": 0.1901, + "step": 12125 + }, + { + "epoch": 1.1423188337533265, + "grad_norm": 0.6034460067749023, + "learning_rate": 8.051781171760244e-06, + "loss": 0.1894, + "step": 12126 + }, + { + "epoch": 1.1424130378464945, + "grad_norm": 0.6795820593833923, + "learning_rate": 8.050300109882008e-06, + "loss": 0.2199, + "step": 12127 + }, + { + "epoch": 1.1425072419396622, + "grad_norm": 0.6849047541618347, + "learning_rate": 8.048819092459947e-06, + "loss": 0.2142, + "step": 12128 + }, + { + "epoch": 1.1426014460328302, + "grad_norm": 0.6387856006622314, + "learning_rate": 8.047338119527827e-06, + "loss": 0.1775, + "step": 12129 + }, + { + "epoch": 1.142695650125998, + "grad_norm": 0.6189826726913452, + "learning_rate": 8.045857191119414e-06, + "loss": 0.2052, + "step": 12130 + }, + { + "epoch": 1.142789854219166, + "grad_norm": 0.6556556224822998, + "learning_rate": 8.044376307268482e-06, + "loss": 0.2027, + "step": 12131 + }, + { + "epoch": 1.1428840583123336, + "grad_norm": 0.6616007685661316, + "learning_rate": 8.042895468008794e-06, + "loss": 0.2151, + "step": 12132 + }, + { + "epoch": 1.1429782624055016, + "grad_norm": 0.6787793040275574, + "learning_rate": 8.041414673374116e-06, + "loss": 0.2132, + "step": 12133 + }, + { + "epoch": 1.1430724664986693, + "grad_norm": 0.6502068042755127, + "learning_rate": 8.039933923398213e-06, + "loss": 0.1681, + "step": 12134 + }, + { + "epoch": 1.1431666705918373, + "grad_norm": 0.6827070713043213, + "learning_rate": 8.038453218114847e-06, + "loss": 0.2444, + "step": 12135 + }, + { + "epoch": 1.143260874685005, + "grad_norm": 0.7281052470207214, + "learning_rate": 8.03697255755778e-06, + "loss": 0.2186, + "step": 12136 + }, + { + "epoch": 1.143355078778173, + "grad_norm": 0.5791061520576477, + "learning_rate": 8.035491941760779e-06, + "loss": 0.2034, + "step": 12137 + }, + { + "epoch": 1.1434492828713407, + "grad_norm": 0.5991633534431458, + "learning_rate": 8.034011370757596e-06, + "loss": 0.2068, + "step": 12138 + }, + { + "epoch": 1.1435434869645087, + "grad_norm": 0.7107877731323242, + "learning_rate": 8.032530844581997e-06, + "loss": 0.2051, + "step": 12139 + }, + { + "epoch": 1.1436376910576764, + "grad_norm": 0.5817581415176392, + "learning_rate": 8.031050363267733e-06, + "loss": 0.2096, + "step": 12140 + }, + { + "epoch": 1.1437318951508444, + "grad_norm": 0.6221569776535034, + "learning_rate": 8.029569926848571e-06, + "loss": 0.2036, + "step": 12141 + }, + { + "epoch": 1.1438260992440121, + "grad_norm": 0.6726136207580566, + "learning_rate": 8.02808953535826e-06, + "loss": 0.2004, + "step": 12142 + }, + { + "epoch": 1.14392030333718, + "grad_norm": 0.6434982419013977, + "learning_rate": 8.026609188830554e-06, + "loss": 0.2298, + "step": 12143 + }, + { + "epoch": 1.1440145074303478, + "grad_norm": 0.6025883555412292, + "learning_rate": 8.025128887299213e-06, + "loss": 0.201, + "step": 12144 + }, + { + "epoch": 1.1441087115235158, + "grad_norm": 0.6930113434791565, + "learning_rate": 8.02364863079799e-06, + "loss": 0.2143, + "step": 12145 + }, + { + "epoch": 1.1442029156166835, + "grad_norm": 0.6260280013084412, + "learning_rate": 8.02216841936063e-06, + "loss": 0.1886, + "step": 12146 + }, + { + "epoch": 1.1442971197098515, + "grad_norm": 0.6329609155654907, + "learning_rate": 8.020688253020891e-06, + "loss": 0.2103, + "step": 12147 + }, + { + "epoch": 1.1443913238030192, + "grad_norm": 0.6741276979446411, + "learning_rate": 8.019208131812524e-06, + "loss": 0.2093, + "step": 12148 + }, + { + "epoch": 1.1444855278961872, + "grad_norm": 0.6147240400314331, + "learning_rate": 8.017728055769274e-06, + "loss": 0.1918, + "step": 12149 + }, + { + "epoch": 1.144579731989355, + "grad_norm": 0.7587670683860779, + "learning_rate": 8.016248024924886e-06, + "loss": 0.2144, + "step": 12150 + }, + { + "epoch": 1.1446739360825229, + "grad_norm": 0.666145920753479, + "learning_rate": 8.014768039313114e-06, + "loss": 0.1831, + "step": 12151 + }, + { + "epoch": 1.1447681401756906, + "grad_norm": 0.6701474189758301, + "learning_rate": 8.013288098967701e-06, + "loss": 0.2252, + "step": 12152 + }, + { + "epoch": 1.1448623442688586, + "grad_norm": 0.6293619871139526, + "learning_rate": 8.01180820392239e-06, + "loss": 0.1958, + "step": 12153 + }, + { + "epoch": 1.1449565483620263, + "grad_norm": 0.7255494594573975, + "learning_rate": 8.010328354210928e-06, + "loss": 0.2127, + "step": 12154 + }, + { + "epoch": 1.145050752455194, + "grad_norm": 0.6386851072311401, + "learning_rate": 8.008848549867057e-06, + "loss": 0.2206, + "step": 12155 + }, + { + "epoch": 1.145144956548362, + "grad_norm": 0.6834396123886108, + "learning_rate": 8.007368790924514e-06, + "loss": 0.1989, + "step": 12156 + }, + { + "epoch": 1.14523916064153, + "grad_norm": 0.614005446434021, + "learning_rate": 8.00588907741705e-06, + "loss": 0.2109, + "step": 12157 + }, + { + "epoch": 1.1453333647346977, + "grad_norm": 0.6057419776916504, + "learning_rate": 8.004409409378398e-06, + "loss": 0.2006, + "step": 12158 + }, + { + "epoch": 1.1454275688278654, + "grad_norm": 0.63899165391922, + "learning_rate": 8.002929786842297e-06, + "loss": 0.1942, + "step": 12159 + }, + { + "epoch": 1.1455217729210334, + "grad_norm": 0.713495135307312, + "learning_rate": 8.001450209842483e-06, + "loss": 0.2145, + "step": 12160 + }, + { + "epoch": 1.1456159770142014, + "grad_norm": 0.7448211312294006, + "learning_rate": 7.999970678412697e-06, + "loss": 0.2467, + "step": 12161 + }, + { + "epoch": 1.145710181107369, + "grad_norm": 0.6611918210983276, + "learning_rate": 7.998491192586676e-06, + "loss": 0.211, + "step": 12162 + }, + { + "epoch": 1.1458043852005368, + "grad_norm": 0.691112220287323, + "learning_rate": 7.997011752398144e-06, + "loss": 0.2324, + "step": 12163 + }, + { + "epoch": 1.1458985892937048, + "grad_norm": 0.7479667663574219, + "learning_rate": 7.995532357880847e-06, + "loss": 0.238, + "step": 12164 + }, + { + "epoch": 1.1459927933868728, + "grad_norm": 0.6619879603385925, + "learning_rate": 7.994053009068513e-06, + "loss": 0.1808, + "step": 12165 + }, + { + "epoch": 1.1460869974800405, + "grad_norm": 0.687891960144043, + "learning_rate": 7.992573705994867e-06, + "loss": 0.2338, + "step": 12166 + }, + { + "epoch": 1.1461812015732082, + "grad_norm": 0.5902931690216064, + "learning_rate": 7.991094448693648e-06, + "loss": 0.1774, + "step": 12167 + }, + { + "epoch": 1.1462754056663762, + "grad_norm": 0.6471607089042664, + "learning_rate": 7.989615237198585e-06, + "loss": 0.2051, + "step": 12168 + }, + { + "epoch": 1.1463696097595442, + "grad_norm": 0.6792701482772827, + "learning_rate": 7.988136071543404e-06, + "loss": 0.236, + "step": 12169 + }, + { + "epoch": 1.146463813852712, + "grad_norm": 0.6185738444328308, + "learning_rate": 7.986656951761826e-06, + "loss": 0.2208, + "step": 12170 + }, + { + "epoch": 1.1465580179458796, + "grad_norm": 0.648489236831665, + "learning_rate": 7.98517787788759e-06, + "loss": 0.2062, + "step": 12171 + }, + { + "epoch": 1.1466522220390476, + "grad_norm": 0.6400814652442932, + "learning_rate": 7.98369884995441e-06, + "loss": 0.2254, + "step": 12172 + }, + { + "epoch": 1.1467464261322156, + "grad_norm": 0.629786491394043, + "learning_rate": 7.982219867996013e-06, + "loss": 0.2239, + "step": 12173 + }, + { + "epoch": 1.1468406302253833, + "grad_norm": 0.8736804127693176, + "learning_rate": 7.980740932046126e-06, + "loss": 0.169, + "step": 12174 + }, + { + "epoch": 1.146934834318551, + "grad_norm": 0.762713611125946, + "learning_rate": 7.979262042138472e-06, + "loss": 0.2463, + "step": 12175 + }, + { + "epoch": 1.147029038411719, + "grad_norm": 0.6414589881896973, + "learning_rate": 7.977783198306763e-06, + "loss": 0.2084, + "step": 12176 + }, + { + "epoch": 1.147123242504887, + "grad_norm": 0.6481419801712036, + "learning_rate": 7.976304400584726e-06, + "loss": 0.1933, + "step": 12177 + }, + { + "epoch": 1.1472174465980547, + "grad_norm": 0.8233293890953064, + "learning_rate": 7.974825649006082e-06, + "loss": 0.1863, + "step": 12178 + }, + { + "epoch": 1.1473116506912224, + "grad_norm": 0.7363768219947815, + "learning_rate": 7.973346943604542e-06, + "loss": 0.2164, + "step": 12179 + }, + { + "epoch": 1.1474058547843904, + "grad_norm": 0.6874502897262573, + "learning_rate": 7.971868284413824e-06, + "loss": 0.2156, + "step": 12180 + }, + { + "epoch": 1.1475000588775583, + "grad_norm": 0.6711171269416809, + "learning_rate": 7.970389671467648e-06, + "loss": 0.2253, + "step": 12181 + }, + { + "epoch": 1.147594262970726, + "grad_norm": 0.6555377244949341, + "learning_rate": 7.96891110479973e-06, + "loss": 0.2056, + "step": 12182 + }, + { + "epoch": 1.1476884670638938, + "grad_norm": 0.6161142587661743, + "learning_rate": 7.967432584443772e-06, + "loss": 0.2113, + "step": 12183 + }, + { + "epoch": 1.1477826711570618, + "grad_norm": 0.631897509098053, + "learning_rate": 7.9659541104335e-06, + "loss": 0.1881, + "step": 12184 + }, + { + "epoch": 1.1478768752502297, + "grad_norm": 0.590204656124115, + "learning_rate": 7.964475682802623e-06, + "loss": 0.1989, + "step": 12185 + }, + { + "epoch": 1.1479710793433975, + "grad_norm": 0.6091127991676331, + "learning_rate": 7.962997301584839e-06, + "loss": 0.1945, + "step": 12186 + }, + { + "epoch": 1.1480652834365652, + "grad_norm": 0.6342639327049255, + "learning_rate": 7.961518966813876e-06, + "loss": 0.2219, + "step": 12187 + }, + { + "epoch": 1.1481594875297332, + "grad_norm": 0.6492775678634644, + "learning_rate": 7.96004067852343e-06, + "loss": 0.2176, + "step": 12188 + }, + { + "epoch": 1.148253691622901, + "grad_norm": 0.5854193568229675, + "learning_rate": 7.958562436747211e-06, + "loss": 0.1888, + "step": 12189 + }, + { + "epoch": 1.1483478957160689, + "grad_norm": 0.7202615737915039, + "learning_rate": 7.957084241518922e-06, + "loss": 0.1923, + "step": 12190 + }, + { + "epoch": 1.1484420998092366, + "grad_norm": 0.6243315935134888, + "learning_rate": 7.955606092872277e-06, + "loss": 0.1824, + "step": 12191 + }, + { + "epoch": 1.1485363039024046, + "grad_norm": 0.6550903916358948, + "learning_rate": 7.954127990840972e-06, + "loss": 0.212, + "step": 12192 + }, + { + "epoch": 1.1486305079955723, + "grad_norm": 0.6427294611930847, + "learning_rate": 7.952649935458713e-06, + "loss": 0.2087, + "step": 12193 + }, + { + "epoch": 1.1487247120887403, + "grad_norm": 0.6429688930511475, + "learning_rate": 7.951171926759202e-06, + "loss": 0.2163, + "step": 12194 + }, + { + "epoch": 1.148818916181908, + "grad_norm": 0.6400426030158997, + "learning_rate": 7.949693964776141e-06, + "loss": 0.1919, + "step": 12195 + }, + { + "epoch": 1.148913120275076, + "grad_norm": 0.6039251685142517, + "learning_rate": 7.948216049543226e-06, + "loss": 0.203, + "step": 12196 + }, + { + "epoch": 1.1490073243682437, + "grad_norm": 0.632346510887146, + "learning_rate": 7.94673818109416e-06, + "loss": 0.1949, + "step": 12197 + }, + { + "epoch": 1.1491015284614117, + "grad_norm": 0.6787921786308289, + "learning_rate": 7.945260359462638e-06, + "loss": 0.2552, + "step": 12198 + }, + { + "epoch": 1.1491957325545794, + "grad_norm": 0.7587026357650757, + "learning_rate": 7.94378258468236e-06, + "loss": 0.2462, + "step": 12199 + }, + { + "epoch": 1.1492899366477474, + "grad_norm": 0.6545671820640564, + "learning_rate": 7.942304856787016e-06, + "loss": 0.2238, + "step": 12200 + }, + { + "epoch": 1.149384140740915, + "grad_norm": 0.685653567314148, + "learning_rate": 7.940827175810305e-06, + "loss": 0.1981, + "step": 12201 + }, + { + "epoch": 1.149478344834083, + "grad_norm": 0.6183797121047974, + "learning_rate": 7.939349541785922e-06, + "loss": 0.2105, + "step": 12202 + }, + { + "epoch": 1.1495725489272508, + "grad_norm": 0.7067667245864868, + "learning_rate": 7.937871954747548e-06, + "loss": 0.2069, + "step": 12203 + }, + { + "epoch": 1.1496667530204188, + "grad_norm": 0.6052165627479553, + "learning_rate": 7.936394414728889e-06, + "loss": 0.1944, + "step": 12204 + }, + { + "epoch": 1.1497609571135865, + "grad_norm": 0.710982620716095, + "learning_rate": 7.93491692176363e-06, + "loss": 0.196, + "step": 12205 + }, + { + "epoch": 1.1498551612067545, + "grad_norm": 0.6403450965881348, + "learning_rate": 7.933439475885452e-06, + "loss": 0.2085, + "step": 12206 + }, + { + "epoch": 1.1499493652999222, + "grad_norm": 0.6260942220687866, + "learning_rate": 7.931962077128058e-06, + "loss": 0.2026, + "step": 12207 + }, + { + "epoch": 1.1500435693930902, + "grad_norm": 0.6076931953430176, + "learning_rate": 7.930484725525123e-06, + "loss": 0.208, + "step": 12208 + }, + { + "epoch": 1.150137773486258, + "grad_norm": 0.640314519405365, + "learning_rate": 7.929007421110337e-06, + "loss": 0.207, + "step": 12209 + }, + { + "epoch": 1.1502319775794259, + "grad_norm": 0.7645244002342224, + "learning_rate": 7.927530163917383e-06, + "loss": 0.2023, + "step": 12210 + }, + { + "epoch": 1.1503261816725936, + "grad_norm": 0.632294774055481, + "learning_rate": 7.926052953979948e-06, + "loss": 0.1963, + "step": 12211 + }, + { + "epoch": 1.1504203857657616, + "grad_norm": 0.6345540285110474, + "learning_rate": 7.924575791331714e-06, + "loss": 0.1985, + "step": 12212 + }, + { + "epoch": 1.1505145898589293, + "grad_norm": 0.7057055234909058, + "learning_rate": 7.923098676006358e-06, + "loss": 0.2406, + "step": 12213 + }, + { + "epoch": 1.1506087939520973, + "grad_norm": 0.6412834525108337, + "learning_rate": 7.921621608037568e-06, + "loss": 0.2069, + "step": 12214 + }, + { + "epoch": 1.150702998045265, + "grad_norm": 0.5932299494743347, + "learning_rate": 7.920144587459019e-06, + "loss": 0.1827, + "step": 12215 + }, + { + "epoch": 1.150797202138433, + "grad_norm": 0.6430896520614624, + "learning_rate": 7.918667614304388e-06, + "loss": 0.2161, + "step": 12216 + }, + { + "epoch": 1.1508914062316007, + "grad_norm": 0.7487487196922302, + "learning_rate": 7.917190688607356e-06, + "loss": 0.224, + "step": 12217 + }, + { + "epoch": 1.1509856103247686, + "grad_norm": 0.6349984407424927, + "learning_rate": 7.915713810401598e-06, + "loss": 0.2256, + "step": 12218 + }, + { + "epoch": 1.1510798144179364, + "grad_norm": 0.7175769805908203, + "learning_rate": 7.914236979720787e-06, + "loss": 0.2441, + "step": 12219 + }, + { + "epoch": 1.1511740185111043, + "grad_norm": 0.6573382616043091, + "learning_rate": 7.912760196598599e-06, + "loss": 0.2042, + "step": 12220 + }, + { + "epoch": 1.151268222604272, + "grad_norm": 0.6913100481033325, + "learning_rate": 7.911283461068705e-06, + "loss": 0.1968, + "step": 12221 + }, + { + "epoch": 1.15136242669744, + "grad_norm": 0.6502411961555481, + "learning_rate": 7.909806773164784e-06, + "loss": 0.2123, + "step": 12222 + }, + { + "epoch": 1.1514566307906078, + "grad_norm": 0.6149377226829529, + "learning_rate": 7.908330132920495e-06, + "loss": 0.1897, + "step": 12223 + }, + { + "epoch": 1.1515508348837757, + "grad_norm": 0.6500223278999329, + "learning_rate": 7.906853540369514e-06, + "loss": 0.2148, + "step": 12224 + }, + { + "epoch": 1.1516450389769435, + "grad_norm": 0.6620779633522034, + "learning_rate": 7.905376995545516e-06, + "loss": 0.2227, + "step": 12225 + }, + { + "epoch": 1.1517392430701114, + "grad_norm": 0.6362714767456055, + "learning_rate": 7.903900498482153e-06, + "loss": 0.2076, + "step": 12226 + }, + { + "epoch": 1.1518334471632792, + "grad_norm": 0.657136082649231, + "learning_rate": 7.902424049213107e-06, + "loss": 0.2269, + "step": 12227 + }, + { + "epoch": 1.1519276512564471, + "grad_norm": 0.705045223236084, + "learning_rate": 7.900947647772036e-06, + "loss": 0.2058, + "step": 12228 + }, + { + "epoch": 1.1520218553496149, + "grad_norm": 0.6532936692237854, + "learning_rate": 7.899471294192602e-06, + "loss": 0.1965, + "step": 12229 + }, + { + "epoch": 1.1521160594427828, + "grad_norm": 0.5644189119338989, + "learning_rate": 7.897994988508471e-06, + "loss": 0.1666, + "step": 12230 + }, + { + "epoch": 1.1522102635359506, + "grad_norm": 0.6563928127288818, + "learning_rate": 7.896518730753307e-06, + "loss": 0.1903, + "step": 12231 + }, + { + "epoch": 1.1523044676291185, + "grad_norm": 0.6150180697441101, + "learning_rate": 7.895042520960768e-06, + "loss": 0.2156, + "step": 12232 + }, + { + "epoch": 1.1523986717222863, + "grad_norm": 0.7883591651916504, + "learning_rate": 7.893566359164513e-06, + "loss": 0.2516, + "step": 12233 + }, + { + "epoch": 1.1524928758154542, + "grad_norm": 0.5889561772346497, + "learning_rate": 7.892090245398206e-06, + "loss": 0.1712, + "step": 12234 + }, + { + "epoch": 1.152587079908622, + "grad_norm": 0.6123316287994385, + "learning_rate": 7.8906141796955e-06, + "loss": 0.2013, + "step": 12235 + }, + { + "epoch": 1.15268128400179, + "grad_norm": 0.6226065158843994, + "learning_rate": 7.889138162090052e-06, + "loss": 0.2058, + "step": 12236 + }, + { + "epoch": 1.1527754880949577, + "grad_norm": 0.6096005439758301, + "learning_rate": 7.88766219261552e-06, + "loss": 0.2002, + "step": 12237 + }, + { + "epoch": 1.1528696921881256, + "grad_norm": 0.571349561214447, + "learning_rate": 7.886186271305557e-06, + "loss": 0.1857, + "step": 12238 + }, + { + "epoch": 1.1529638962812934, + "grad_norm": 0.7446098327636719, + "learning_rate": 7.884710398193815e-06, + "loss": 0.2326, + "step": 12239 + }, + { + "epoch": 1.1530581003744613, + "grad_norm": 0.6592982411384583, + "learning_rate": 7.883234573313948e-06, + "loss": 0.2201, + "step": 12240 + }, + { + "epoch": 1.153152304467629, + "grad_norm": 0.6531952619552612, + "learning_rate": 7.881758796699605e-06, + "loss": 0.1954, + "step": 12241 + }, + { + "epoch": 1.153246508560797, + "grad_norm": 0.5881925821304321, + "learning_rate": 7.880283068384441e-06, + "loss": 0.1895, + "step": 12242 + }, + { + "epoch": 1.1533407126539648, + "grad_norm": 0.6539220213890076, + "learning_rate": 7.878807388402095e-06, + "loss": 0.2017, + "step": 12243 + }, + { + "epoch": 1.1534349167471327, + "grad_norm": 0.7027574777603149, + "learning_rate": 7.877331756786225e-06, + "loss": 0.1787, + "step": 12244 + }, + { + "epoch": 1.1535291208403005, + "grad_norm": 0.6493908166885376, + "learning_rate": 7.875856173570476e-06, + "loss": 0.2037, + "step": 12245 + }, + { + "epoch": 1.1536233249334684, + "grad_norm": 0.5787128806114197, + "learning_rate": 7.874380638788485e-06, + "loss": 0.1722, + "step": 12246 + }, + { + "epoch": 1.1537175290266362, + "grad_norm": 0.6447485685348511, + "learning_rate": 7.87290515247391e-06, + "loss": 0.2114, + "step": 12247 + }, + { + "epoch": 1.1538117331198041, + "grad_norm": 0.6590207815170288, + "learning_rate": 7.871429714660383e-06, + "loss": 0.2085, + "step": 12248 + }, + { + "epoch": 1.1539059372129719, + "grad_norm": 0.7718537449836731, + "learning_rate": 7.869954325381552e-06, + "loss": 0.1774, + "step": 12249 + }, + { + "epoch": 1.1540001413061398, + "grad_norm": 0.6921826004981995, + "learning_rate": 7.868478984671054e-06, + "loss": 0.2331, + "step": 12250 + }, + { + "epoch": 1.1540943453993076, + "grad_norm": 0.6519464254379272, + "learning_rate": 7.867003692562533e-06, + "loss": 0.2406, + "step": 12251 + }, + { + "epoch": 1.1541885494924755, + "grad_norm": 0.6791825890541077, + "learning_rate": 7.865528449089628e-06, + "loss": 0.2313, + "step": 12252 + }, + { + "epoch": 1.1542827535856433, + "grad_norm": 0.7045342922210693, + "learning_rate": 7.864053254285973e-06, + "loss": 0.2058, + "step": 12253 + }, + { + "epoch": 1.1543769576788112, + "grad_norm": 0.6821885704994202, + "learning_rate": 7.862578108185208e-06, + "loss": 0.2188, + "step": 12254 + }, + { + "epoch": 1.154471161771979, + "grad_norm": 0.7017951011657715, + "learning_rate": 7.861103010820968e-06, + "loss": 0.224, + "step": 12255 + }, + { + "epoch": 1.154565365865147, + "grad_norm": 0.6462750434875488, + "learning_rate": 7.859627962226886e-06, + "loss": 0.2187, + "step": 12256 + }, + { + "epoch": 1.1546595699583146, + "grad_norm": 0.6207707524299622, + "learning_rate": 7.858152962436598e-06, + "loss": 0.2381, + "step": 12257 + }, + { + "epoch": 1.1547537740514826, + "grad_norm": 0.6876768469810486, + "learning_rate": 7.856678011483734e-06, + "loss": 0.204, + "step": 12258 + }, + { + "epoch": 1.1548479781446503, + "grad_norm": 0.6457028985023499, + "learning_rate": 7.85520310940193e-06, + "loss": 0.2276, + "step": 12259 + }, + { + "epoch": 1.1549421822378183, + "grad_norm": 0.6530483365058899, + "learning_rate": 7.853728256224806e-06, + "loss": 0.2135, + "step": 12260 + }, + { + "epoch": 1.155036386330986, + "grad_norm": 0.6017844080924988, + "learning_rate": 7.852253451986e-06, + "loss": 0.186, + "step": 12261 + }, + { + "epoch": 1.155130590424154, + "grad_norm": 0.658333420753479, + "learning_rate": 7.850778696719139e-06, + "loss": 0.2166, + "step": 12262 + }, + { + "epoch": 1.1552247945173217, + "grad_norm": 0.6860843300819397, + "learning_rate": 7.849303990457842e-06, + "loss": 0.2184, + "step": 12263 + }, + { + "epoch": 1.1553189986104897, + "grad_norm": 0.5907440185546875, + "learning_rate": 7.847829333235744e-06, + "loss": 0.1875, + "step": 12264 + }, + { + "epoch": 1.1554132027036574, + "grad_norm": 0.6708579659461975, + "learning_rate": 7.846354725086467e-06, + "loss": 0.2166, + "step": 12265 + }, + { + "epoch": 1.1555074067968254, + "grad_norm": 0.7163122892379761, + "learning_rate": 7.844880166043627e-06, + "loss": 0.2338, + "step": 12266 + }, + { + "epoch": 1.1556016108899931, + "grad_norm": 0.6653092503547668, + "learning_rate": 7.84340565614086e-06, + "loss": 0.2095, + "step": 12267 + }, + { + "epoch": 1.155695814983161, + "grad_norm": 0.7395054697990417, + "learning_rate": 7.841931195411775e-06, + "loss": 0.1974, + "step": 12268 + }, + { + "epoch": 1.1557900190763288, + "grad_norm": 0.6676589250564575, + "learning_rate": 7.840456783889997e-06, + "loss": 0.2337, + "step": 12269 + }, + { + "epoch": 1.1558842231694968, + "grad_norm": 0.5892910957336426, + "learning_rate": 7.838982421609143e-06, + "loss": 0.2136, + "step": 12270 + }, + { + "epoch": 1.1559784272626645, + "grad_norm": 0.6500517129898071, + "learning_rate": 7.837508108602833e-06, + "loss": 0.2204, + "step": 12271 + }, + { + "epoch": 1.1560726313558325, + "grad_norm": 0.5860692858695984, + "learning_rate": 7.836033844904683e-06, + "loss": 0.1869, + "step": 12272 + }, + { + "epoch": 1.1561668354490002, + "grad_norm": 0.5967850685119629, + "learning_rate": 7.834559630548305e-06, + "loss": 0.1981, + "step": 12273 + }, + { + "epoch": 1.1562610395421682, + "grad_norm": 0.6711938381195068, + "learning_rate": 7.833085465567318e-06, + "loss": 0.2193, + "step": 12274 + }, + { + "epoch": 1.156355243635336, + "grad_norm": 0.6680728793144226, + "learning_rate": 7.831611349995335e-06, + "loss": 0.2319, + "step": 12275 + }, + { + "epoch": 1.1564494477285039, + "grad_norm": 0.5889575481414795, + "learning_rate": 7.830137283865965e-06, + "loss": 0.1822, + "step": 12276 + }, + { + "epoch": 1.1565436518216716, + "grad_norm": 0.6993216276168823, + "learning_rate": 7.82866326721282e-06, + "loss": 0.2094, + "step": 12277 + }, + { + "epoch": 1.1566378559148396, + "grad_norm": 0.6535523533821106, + "learning_rate": 7.827189300069513e-06, + "loss": 0.2183, + "step": 12278 + }, + { + "epoch": 1.1567320600080073, + "grad_norm": 0.6322247385978699, + "learning_rate": 7.825715382469651e-06, + "loss": 0.2107, + "step": 12279 + }, + { + "epoch": 1.1568262641011753, + "grad_norm": 0.6894719004631042, + "learning_rate": 7.824241514446835e-06, + "loss": 0.2256, + "step": 12280 + }, + { + "epoch": 1.156920468194343, + "grad_norm": 0.6726341247558594, + "learning_rate": 7.822767696034683e-06, + "loss": 0.2103, + "step": 12281 + }, + { + "epoch": 1.157014672287511, + "grad_norm": 0.629125714302063, + "learning_rate": 7.821293927266795e-06, + "loss": 0.1987, + "step": 12282 + }, + { + "epoch": 1.1571088763806787, + "grad_norm": 0.5876413583755493, + "learning_rate": 7.819820208176769e-06, + "loss": 0.1881, + "step": 12283 + }, + { + "epoch": 1.1572030804738467, + "grad_norm": 0.8241168260574341, + "learning_rate": 7.818346538798219e-06, + "loss": 0.2325, + "step": 12284 + }, + { + "epoch": 1.1572972845670144, + "grad_norm": 0.6371445059776306, + "learning_rate": 7.816872919164745e-06, + "loss": 0.2025, + "step": 12285 + }, + { + "epoch": 1.1573914886601824, + "grad_norm": 0.723773181438446, + "learning_rate": 7.815399349309935e-06, + "loss": 0.1986, + "step": 12286 + }, + { + "epoch": 1.1574856927533501, + "grad_norm": 0.599209725856781, + "learning_rate": 7.813925829267407e-06, + "loss": 0.1793, + "step": 12287 + }, + { + "epoch": 1.157579896846518, + "grad_norm": 0.608439028263092, + "learning_rate": 7.812452359070748e-06, + "loss": 0.1624, + "step": 12288 + }, + { + "epoch": 1.1576741009396858, + "grad_norm": 0.58980393409729, + "learning_rate": 7.81097893875356e-06, + "loss": 0.1982, + "step": 12289 + }, + { + "epoch": 1.1577683050328538, + "grad_norm": 0.6739835739135742, + "learning_rate": 7.809505568349434e-06, + "loss": 0.2403, + "step": 12290 + }, + { + "epoch": 1.1578625091260215, + "grad_norm": 0.6799923777580261, + "learning_rate": 7.80803224789197e-06, + "loss": 0.2229, + "step": 12291 + }, + { + "epoch": 1.1579567132191895, + "grad_norm": 0.745564341545105, + "learning_rate": 7.806558977414763e-06, + "loss": 0.1875, + "step": 12292 + }, + { + "epoch": 1.1580509173123572, + "grad_norm": 0.6821733117103577, + "learning_rate": 7.8050857569514e-06, + "loss": 0.2278, + "step": 12293 + }, + { + "epoch": 1.158145121405525, + "grad_norm": 0.7225754857063293, + "learning_rate": 7.803612586535478e-06, + "loss": 0.2225, + "step": 12294 + }, + { + "epoch": 1.158239325498693, + "grad_norm": 0.6468100547790527, + "learning_rate": 7.802139466200586e-06, + "loss": 0.2064, + "step": 12295 + }, + { + "epoch": 1.1583335295918609, + "grad_norm": 0.6715960502624512, + "learning_rate": 7.80066639598031e-06, + "loss": 0.2039, + "step": 12296 + }, + { + "epoch": 1.1584277336850286, + "grad_norm": 0.5877363681793213, + "learning_rate": 7.799193375908245e-06, + "loss": 0.1999, + "step": 12297 + }, + { + "epoch": 1.1585219377781963, + "grad_norm": 0.6958502531051636, + "learning_rate": 7.797720406017975e-06, + "loss": 0.212, + "step": 12298 + }, + { + "epoch": 1.1586161418713643, + "grad_norm": 0.6634026169776917, + "learning_rate": 7.796247486343088e-06, + "loss": 0.2296, + "step": 12299 + }, + { + "epoch": 1.1587103459645323, + "grad_norm": 0.660721480846405, + "learning_rate": 7.79477461691716e-06, + "loss": 0.1861, + "step": 12300 + }, + { + "epoch": 1.1588045500577, + "grad_norm": 0.6169300079345703, + "learning_rate": 7.793301797773785e-06, + "loss": 0.1884, + "step": 12301 + }, + { + "epoch": 1.1588987541508677, + "grad_norm": 0.616231381893158, + "learning_rate": 7.791829028946544e-06, + "loss": 0.1909, + "step": 12302 + }, + { + "epoch": 1.1589929582440357, + "grad_norm": 0.6795166730880737, + "learning_rate": 7.790356310469009e-06, + "loss": 0.2078, + "step": 12303 + }, + { + "epoch": 1.1590871623372037, + "grad_norm": 0.6517577171325684, + "learning_rate": 7.788883642374774e-06, + "loss": 0.2032, + "step": 12304 + }, + { + "epoch": 1.1591813664303714, + "grad_norm": 0.6272116899490356, + "learning_rate": 7.78741102469741e-06, + "loss": 0.1733, + "step": 12305 + }, + { + "epoch": 1.1592755705235391, + "grad_norm": 0.6892327070236206, + "learning_rate": 7.785938457470492e-06, + "loss": 0.2244, + "step": 12306 + }, + { + "epoch": 1.159369774616707, + "grad_norm": 0.6756359338760376, + "learning_rate": 7.784465940727608e-06, + "loss": 0.2017, + "step": 12307 + }, + { + "epoch": 1.159463978709875, + "grad_norm": 0.6559037566184998, + "learning_rate": 7.782993474502323e-06, + "loss": 0.2332, + "step": 12308 + }, + { + "epoch": 1.1595581828030428, + "grad_norm": 0.6312324404716492, + "learning_rate": 7.781521058828218e-06, + "loss": 0.2001, + "step": 12309 + }, + { + "epoch": 1.1596523868962105, + "grad_norm": 0.6661595702171326, + "learning_rate": 7.78004869373886e-06, + "loss": 0.2121, + "step": 12310 + }, + { + "epoch": 1.1597465909893785, + "grad_norm": 0.752112865447998, + "learning_rate": 7.778576379267828e-06, + "loss": 0.2355, + "step": 12311 + }, + { + "epoch": 1.1598407950825464, + "grad_norm": 0.674813985824585, + "learning_rate": 7.77710411544869e-06, + "loss": 0.2039, + "step": 12312 + }, + { + "epoch": 1.1599349991757142, + "grad_norm": 0.653131365776062, + "learning_rate": 7.775631902315012e-06, + "loss": 0.2443, + "step": 12313 + }, + { + "epoch": 1.160029203268882, + "grad_norm": 0.6176143288612366, + "learning_rate": 7.774159739900371e-06, + "loss": 0.21, + "step": 12314 + }, + { + "epoch": 1.1601234073620499, + "grad_norm": 0.6649085879325867, + "learning_rate": 7.77268762823833e-06, + "loss": 0.1977, + "step": 12315 + }, + { + "epoch": 1.1602176114552178, + "grad_norm": 0.6967582106590271, + "learning_rate": 7.771215567362454e-06, + "loss": 0.1981, + "step": 12316 + }, + { + "epoch": 1.1603118155483856, + "grad_norm": 0.6099845170974731, + "learning_rate": 7.76974355730631e-06, + "loss": 0.192, + "step": 12317 + }, + { + "epoch": 1.1604060196415533, + "grad_norm": 0.7052638530731201, + "learning_rate": 7.768271598103465e-06, + "loss": 0.2115, + "step": 12318 + }, + { + "epoch": 1.1605002237347213, + "grad_norm": 0.6722414493560791, + "learning_rate": 7.766799689787478e-06, + "loss": 0.24, + "step": 12319 + }, + { + "epoch": 1.1605944278278892, + "grad_norm": 0.674592912197113, + "learning_rate": 7.765327832391907e-06, + "loss": 0.2174, + "step": 12320 + }, + { + "epoch": 1.160688631921057, + "grad_norm": 0.6404471397399902, + "learning_rate": 7.763856025950321e-06, + "loss": 0.2006, + "step": 12321 + }, + { + "epoch": 1.1607828360142247, + "grad_norm": 0.6440375447273254, + "learning_rate": 7.762384270496279e-06, + "loss": 0.2116, + "step": 12322 + }, + { + "epoch": 1.1608770401073927, + "grad_norm": 0.6806245446205139, + "learning_rate": 7.760912566063328e-06, + "loss": 0.216, + "step": 12323 + }, + { + "epoch": 1.1609712442005606, + "grad_norm": 0.6037777066230774, + "learning_rate": 7.759440912685043e-06, + "loss": 0.2031, + "step": 12324 + }, + { + "epoch": 1.1610654482937284, + "grad_norm": 0.6117852330207825, + "learning_rate": 7.757969310394965e-06, + "loss": 0.195, + "step": 12325 + }, + { + "epoch": 1.1611596523868961, + "grad_norm": 0.6487354040145874, + "learning_rate": 7.756497759226652e-06, + "loss": 0.1964, + "step": 12326 + }, + { + "epoch": 1.161253856480064, + "grad_norm": 0.5659785866737366, + "learning_rate": 7.755026259213665e-06, + "loss": 0.1934, + "step": 12327 + }, + { + "epoch": 1.1613480605732318, + "grad_norm": 0.5937870740890503, + "learning_rate": 7.753554810389549e-06, + "loss": 0.1827, + "step": 12328 + }, + { + "epoch": 1.1614422646663998, + "grad_norm": 0.6416780352592468, + "learning_rate": 7.752083412787858e-06, + "loss": 0.2114, + "step": 12329 + }, + { + "epoch": 1.1615364687595675, + "grad_norm": 0.6302727460861206, + "learning_rate": 7.750612066442138e-06, + "loss": 0.2041, + "step": 12330 + }, + { + "epoch": 1.1616306728527355, + "grad_norm": 0.6919407248497009, + "learning_rate": 7.749140771385945e-06, + "loss": 0.2045, + "step": 12331 + }, + { + "epoch": 1.1617248769459032, + "grad_norm": 0.6700026988983154, + "learning_rate": 7.747669527652824e-06, + "loss": 0.2063, + "step": 12332 + }, + { + "epoch": 1.1618190810390712, + "grad_norm": 0.5566019415855408, + "learning_rate": 7.746198335276318e-06, + "loss": 0.2084, + "step": 12333 + }, + { + "epoch": 1.161913285132239, + "grad_norm": 0.7124121189117432, + "learning_rate": 7.744727194289977e-06, + "loss": 0.2255, + "step": 12334 + }, + { + "epoch": 1.1620074892254069, + "grad_norm": 0.6468402147293091, + "learning_rate": 7.743256104727346e-06, + "loss": 0.2163, + "step": 12335 + }, + { + "epoch": 1.1621016933185746, + "grad_norm": 0.7208757400512695, + "learning_rate": 7.741785066621962e-06, + "loss": 0.199, + "step": 12336 + }, + { + "epoch": 1.1621958974117426, + "grad_norm": 0.6091558933258057, + "learning_rate": 7.740314080007374e-06, + "loss": 0.1943, + "step": 12337 + }, + { + "epoch": 1.1622901015049103, + "grad_norm": 0.646746039390564, + "learning_rate": 7.738843144917119e-06, + "loss": 0.2223, + "step": 12338 + }, + { + "epoch": 1.1623843055980783, + "grad_norm": 0.661420464515686, + "learning_rate": 7.737372261384738e-06, + "loss": 0.2055, + "step": 12339 + }, + { + "epoch": 1.162478509691246, + "grad_norm": 0.6283174753189087, + "learning_rate": 7.735901429443766e-06, + "loss": 0.1886, + "step": 12340 + }, + { + "epoch": 1.162572713784414, + "grad_norm": 0.6365818977355957, + "learning_rate": 7.734430649127745e-06, + "loss": 0.1838, + "step": 12341 + }, + { + "epoch": 1.1626669178775817, + "grad_norm": 0.6031051874160767, + "learning_rate": 7.73295992047021e-06, + "loss": 0.2094, + "step": 12342 + }, + { + "epoch": 1.1627611219707497, + "grad_norm": 0.6952539086341858, + "learning_rate": 7.73148924350469e-06, + "loss": 0.2276, + "step": 12343 + }, + { + "epoch": 1.1628553260639174, + "grad_norm": 0.6067687273025513, + "learning_rate": 7.73001861826473e-06, + "loss": 0.192, + "step": 12344 + }, + { + "epoch": 1.1629495301570854, + "grad_norm": 0.6541218161582947, + "learning_rate": 7.728548044783854e-06, + "loss": 0.2019, + "step": 12345 + }, + { + "epoch": 1.163043734250253, + "grad_norm": 0.7503097057342529, + "learning_rate": 7.72707752309559e-06, + "loss": 0.1968, + "step": 12346 + }, + { + "epoch": 1.163137938343421, + "grad_norm": 0.7333747148513794, + "learning_rate": 7.725607053233482e-06, + "loss": 0.2061, + "step": 12347 + }, + { + "epoch": 1.1632321424365888, + "grad_norm": 0.6139345765113831, + "learning_rate": 7.724136635231047e-06, + "loss": 0.2112, + "step": 12348 + }, + { + "epoch": 1.1633263465297567, + "grad_norm": 0.6344771981239319, + "learning_rate": 7.722666269121815e-06, + "loss": 0.2271, + "step": 12349 + }, + { + "epoch": 1.1634205506229245, + "grad_norm": 0.6152917146682739, + "learning_rate": 7.721195954939315e-06, + "loss": 0.2004, + "step": 12350 + }, + { + "epoch": 1.1635147547160924, + "grad_norm": 0.5891529321670532, + "learning_rate": 7.719725692717071e-06, + "loss": 0.2099, + "step": 12351 + }, + { + "epoch": 1.1636089588092602, + "grad_norm": 0.6408778429031372, + "learning_rate": 7.718255482488609e-06, + "loss": 0.2316, + "step": 12352 + }, + { + "epoch": 1.1637031629024281, + "grad_norm": 0.6090258359909058, + "learning_rate": 7.716785324287447e-06, + "loss": 0.1987, + "step": 12353 + }, + { + "epoch": 1.1637973669955959, + "grad_norm": 0.6803856492042542, + "learning_rate": 7.715315218147116e-06, + "loss": 0.2157, + "step": 12354 + }, + { + "epoch": 1.1638915710887638, + "grad_norm": 0.7470532655715942, + "learning_rate": 7.71384516410113e-06, + "loss": 0.2351, + "step": 12355 + }, + { + "epoch": 1.1639857751819316, + "grad_norm": 0.6469044089317322, + "learning_rate": 7.712375162183007e-06, + "loss": 0.2039, + "step": 12356 + }, + { + "epoch": 1.1640799792750995, + "grad_norm": 0.6264247894287109, + "learning_rate": 7.710905212426271e-06, + "loss": 0.2036, + "step": 12357 + }, + { + "epoch": 1.1641741833682673, + "grad_norm": 0.6687905788421631, + "learning_rate": 7.709435314864435e-06, + "loss": 0.1995, + "step": 12358 + }, + { + "epoch": 1.1642683874614352, + "grad_norm": 0.5562198162078857, + "learning_rate": 7.70796546953102e-06, + "loss": 0.1949, + "step": 12359 + }, + { + "epoch": 1.164362591554603, + "grad_norm": 0.6398401260375977, + "learning_rate": 7.70649567645953e-06, + "loss": 0.2098, + "step": 12360 + }, + { + "epoch": 1.164456795647771, + "grad_norm": 0.6486343741416931, + "learning_rate": 7.70502593568349e-06, + "loss": 0.2288, + "step": 12361 + }, + { + "epoch": 1.1645509997409387, + "grad_norm": 0.6357438564300537, + "learning_rate": 7.70355624723641e-06, + "loss": 0.2155, + "step": 12362 + }, + { + "epoch": 1.1646452038341066, + "grad_norm": 0.8581362962722778, + "learning_rate": 7.702086611151792e-06, + "loss": 0.1986, + "step": 12363 + }, + { + "epoch": 1.1647394079272744, + "grad_norm": 0.6492730975151062, + "learning_rate": 7.70061702746316e-06, + "loss": 0.228, + "step": 12364 + }, + { + "epoch": 1.1648336120204423, + "grad_norm": 0.6923218369483948, + "learning_rate": 7.699147496204014e-06, + "loss": 0.2222, + "step": 12365 + }, + { + "epoch": 1.16492781611361, + "grad_norm": 0.6648165583610535, + "learning_rate": 7.697678017407858e-06, + "loss": 0.2008, + "step": 12366 + }, + { + "epoch": 1.165022020206778, + "grad_norm": 0.6043445467948914, + "learning_rate": 7.696208591108211e-06, + "loss": 0.1895, + "step": 12367 + }, + { + "epoch": 1.1651162242999458, + "grad_norm": 0.597754955291748, + "learning_rate": 7.694739217338569e-06, + "loss": 0.1972, + "step": 12368 + }, + { + "epoch": 1.1652104283931137, + "grad_norm": 0.6218483448028564, + "learning_rate": 7.693269896132438e-06, + "loss": 0.1866, + "step": 12369 + }, + { + "epoch": 1.1653046324862815, + "grad_norm": 0.6251735091209412, + "learning_rate": 7.691800627523319e-06, + "loss": 0.1971, + "step": 12370 + }, + { + "epoch": 1.1653988365794494, + "grad_norm": 0.5660892724990845, + "learning_rate": 7.690331411544716e-06, + "loss": 0.1685, + "step": 12371 + }, + { + "epoch": 1.1654930406726172, + "grad_norm": 0.646435022354126, + "learning_rate": 7.688862248230132e-06, + "loss": 0.2005, + "step": 12372 + }, + { + "epoch": 1.1655872447657851, + "grad_norm": 0.6149870753288269, + "learning_rate": 7.687393137613056e-06, + "loss": 0.1722, + "step": 12373 + }, + { + "epoch": 1.1656814488589529, + "grad_norm": 0.7423371076583862, + "learning_rate": 7.685924079727e-06, + "loss": 0.2273, + "step": 12374 + }, + { + "epoch": 1.1657756529521208, + "grad_norm": 0.8053909540176392, + "learning_rate": 7.684455074605452e-06, + "loss": 0.2573, + "step": 12375 + }, + { + "epoch": 1.1658698570452886, + "grad_norm": 0.7121588587760925, + "learning_rate": 7.682986122281906e-06, + "loss": 0.2391, + "step": 12376 + }, + { + "epoch": 1.1659640611384565, + "grad_norm": 0.6963463425636292, + "learning_rate": 7.681517222789863e-06, + "loss": 0.2259, + "step": 12377 + }, + { + "epoch": 1.1660582652316243, + "grad_norm": 0.6686302423477173, + "learning_rate": 7.680048376162813e-06, + "loss": 0.2208, + "step": 12378 + }, + { + "epoch": 1.1661524693247922, + "grad_norm": 0.5913288593292236, + "learning_rate": 7.67857958243425e-06, + "loss": 0.2128, + "step": 12379 + }, + { + "epoch": 1.16624667341796, + "grad_norm": 0.7313753366470337, + "learning_rate": 7.677110841637654e-06, + "loss": 0.2324, + "step": 12380 + }, + { + "epoch": 1.166340877511128, + "grad_norm": 0.7491220831871033, + "learning_rate": 7.675642153806531e-06, + "loss": 0.2179, + "step": 12381 + }, + { + "epoch": 1.1664350816042957, + "grad_norm": 0.6844172477722168, + "learning_rate": 7.674173518974362e-06, + "loss": 0.1929, + "step": 12382 + }, + { + "epoch": 1.1665292856974636, + "grad_norm": 0.5895171165466309, + "learning_rate": 7.672704937174627e-06, + "loss": 0.2041, + "step": 12383 + }, + { + "epoch": 1.1666234897906314, + "grad_norm": 0.6461688876152039, + "learning_rate": 7.671236408440826e-06, + "loss": 0.1769, + "step": 12384 + }, + { + "epoch": 1.1667176938837993, + "grad_norm": 0.6043210625648499, + "learning_rate": 7.669767932806433e-06, + "loss": 0.1939, + "step": 12385 + }, + { + "epoch": 1.166811897976967, + "grad_norm": 0.6969326734542847, + "learning_rate": 7.668299510304931e-06, + "loss": 0.2281, + "step": 12386 + }, + { + "epoch": 1.166906102070135, + "grad_norm": 0.5954148173332214, + "learning_rate": 7.666831140969814e-06, + "loss": 0.201, + "step": 12387 + }, + { + "epoch": 1.1670003061633027, + "grad_norm": 0.6823163032531738, + "learning_rate": 7.665362824834552e-06, + "loss": 0.2005, + "step": 12388 + }, + { + "epoch": 1.1670945102564707, + "grad_norm": 0.6804181337356567, + "learning_rate": 7.663894561932628e-06, + "loss": 0.211, + "step": 12389 + }, + { + "epoch": 1.1671887143496384, + "grad_norm": 0.6363169550895691, + "learning_rate": 7.662426352297519e-06, + "loss": 0.2165, + "step": 12390 + }, + { + "epoch": 1.1672829184428064, + "grad_norm": 0.6093109846115112, + "learning_rate": 7.660958195962707e-06, + "loss": 0.198, + "step": 12391 + }, + { + "epoch": 1.1673771225359741, + "grad_norm": 0.6550191044807434, + "learning_rate": 7.659490092961665e-06, + "loss": 0.183, + "step": 12392 + }, + { + "epoch": 1.167471326629142, + "grad_norm": 0.9614073634147644, + "learning_rate": 7.658022043327867e-06, + "loss": 0.2112, + "step": 12393 + }, + { + "epoch": 1.1675655307223098, + "grad_norm": 0.6851538419723511, + "learning_rate": 7.65655404709479e-06, + "loss": 0.2063, + "step": 12394 + }, + { + "epoch": 1.1676597348154778, + "grad_norm": 0.6280253529548645, + "learning_rate": 7.655086104295904e-06, + "loss": 0.2169, + "step": 12395 + }, + { + "epoch": 1.1677539389086455, + "grad_norm": 0.6450567841529846, + "learning_rate": 7.65361821496468e-06, + "loss": 0.2165, + "step": 12396 + }, + { + "epoch": 1.1678481430018135, + "grad_norm": 0.6430543661117554, + "learning_rate": 7.652150379134593e-06, + "loss": 0.2103, + "step": 12397 + }, + { + "epoch": 1.1679423470949812, + "grad_norm": 0.6612107753753662, + "learning_rate": 7.650682596839107e-06, + "loss": 0.2159, + "step": 12398 + }, + { + "epoch": 1.1680365511881492, + "grad_norm": 0.6067182421684265, + "learning_rate": 7.649214868111692e-06, + "loss": 0.2196, + "step": 12399 + }, + { + "epoch": 1.168130755281317, + "grad_norm": 0.6457757353782654, + "learning_rate": 7.647747192985808e-06, + "loss": 0.209, + "step": 12400 + }, + { + "epoch": 1.168224959374485, + "grad_norm": 0.6386038064956665, + "learning_rate": 7.646279571494931e-06, + "loss": 0.1793, + "step": 12401 + }, + { + "epoch": 1.1683191634676526, + "grad_norm": 0.6418023705482483, + "learning_rate": 7.644812003672521e-06, + "loss": 0.2076, + "step": 12402 + }, + { + "epoch": 1.1684133675608206, + "grad_norm": 0.6170333027839661, + "learning_rate": 7.643344489552033e-06, + "loss": 0.1892, + "step": 12403 + }, + { + "epoch": 1.1685075716539883, + "grad_norm": 0.5894877910614014, + "learning_rate": 7.641877029166943e-06, + "loss": 0.1613, + "step": 12404 + }, + { + "epoch": 1.1686017757471563, + "grad_norm": 0.5999869108200073, + "learning_rate": 7.640409622550702e-06, + "loss": 0.2207, + "step": 12405 + }, + { + "epoch": 1.168695979840324, + "grad_norm": 0.641161322593689, + "learning_rate": 7.638942269736765e-06, + "loss": 0.2084, + "step": 12406 + }, + { + "epoch": 1.168790183933492, + "grad_norm": 0.616152286529541, + "learning_rate": 7.637474970758602e-06, + "loss": 0.2041, + "step": 12407 + }, + { + "epoch": 1.1688843880266597, + "grad_norm": 0.6475275754928589, + "learning_rate": 7.636007725649662e-06, + "loss": 0.2315, + "step": 12408 + }, + { + "epoch": 1.1689785921198277, + "grad_norm": 0.698611855506897, + "learning_rate": 7.634540534443402e-06, + "loss": 0.1858, + "step": 12409 + }, + { + "epoch": 1.1690727962129954, + "grad_norm": 0.6852428913116455, + "learning_rate": 7.633073397173274e-06, + "loss": 0.2193, + "step": 12410 + }, + { + "epoch": 1.1691670003061634, + "grad_norm": 0.6028615832328796, + "learning_rate": 7.631606313872736e-06, + "loss": 0.2073, + "step": 12411 + }, + { + "epoch": 1.1692612043993311, + "grad_norm": 0.5900949835777283, + "learning_rate": 7.630139284575233e-06, + "loss": 0.1696, + "step": 12412 + }, + { + "epoch": 1.169355408492499, + "grad_norm": 0.64311683177948, + "learning_rate": 7.628672309314221e-06, + "loss": 0.2132, + "step": 12413 + }, + { + "epoch": 1.1694496125856668, + "grad_norm": 0.6563279032707214, + "learning_rate": 7.627205388123149e-06, + "loss": 0.2041, + "step": 12414 + }, + { + "epoch": 1.1695438166788348, + "grad_norm": 0.6046981811523438, + "learning_rate": 7.625738521035463e-06, + "loss": 0.1825, + "step": 12415 + }, + { + "epoch": 1.1696380207720025, + "grad_norm": 0.6331227421760559, + "learning_rate": 7.6242717080846096e-06, + "loss": 0.1966, + "step": 12416 + }, + { + "epoch": 1.1697322248651705, + "grad_norm": 0.6417523622512817, + "learning_rate": 7.622804949304037e-06, + "loss": 0.2222, + "step": 12417 + }, + { + "epoch": 1.1698264289583382, + "grad_norm": 0.610335111618042, + "learning_rate": 7.6213382447271875e-06, + "loss": 0.1899, + "step": 12418 + }, + { + "epoch": 1.1699206330515062, + "grad_norm": 0.5904601216316223, + "learning_rate": 7.619871594387507e-06, + "loss": 0.1816, + "step": 12419 + }, + { + "epoch": 1.170014837144674, + "grad_norm": 0.6495140790939331, + "learning_rate": 7.618404998318428e-06, + "loss": 0.2124, + "step": 12420 + }, + { + "epoch": 1.1701090412378419, + "grad_norm": 0.630237877368927, + "learning_rate": 7.616938456553405e-06, + "loss": 0.2125, + "step": 12421 + }, + { + "epoch": 1.1702032453310096, + "grad_norm": 0.6966006755828857, + "learning_rate": 7.6154719691258696e-06, + "loss": 0.1984, + "step": 12422 + }, + { + "epoch": 1.1702974494241776, + "grad_norm": 0.6157569289207458, + "learning_rate": 7.614005536069257e-06, + "loss": 0.1825, + "step": 12423 + }, + { + "epoch": 1.1703916535173453, + "grad_norm": 0.6618136763572693, + "learning_rate": 7.612539157417013e-06, + "loss": 0.2246, + "step": 12424 + }, + { + "epoch": 1.1704858576105133, + "grad_norm": 0.6306236982345581, + "learning_rate": 7.611072833202568e-06, + "loss": 0.2135, + "step": 12425 + }, + { + "epoch": 1.170580061703681, + "grad_norm": 0.6721271276473999, + "learning_rate": 7.609606563459351e-06, + "loss": 0.1914, + "step": 12426 + }, + { + "epoch": 1.170674265796849, + "grad_norm": 0.6963659524917603, + "learning_rate": 7.608140348220808e-06, + "loss": 0.2175, + "step": 12427 + }, + { + "epoch": 1.1707684698900167, + "grad_norm": 0.5983977317810059, + "learning_rate": 7.606674187520362e-06, + "loss": 0.199, + "step": 12428 + }, + { + "epoch": 1.1708626739831847, + "grad_norm": 0.6682076454162598, + "learning_rate": 7.6052080813914466e-06, + "loss": 0.2196, + "step": 12429 + }, + { + "epoch": 1.1709568780763524, + "grad_norm": 0.7540499567985535, + "learning_rate": 7.603742029867488e-06, + "loss": 0.2503, + "step": 12430 + }, + { + "epoch": 1.1710510821695204, + "grad_norm": 0.7160568833351135, + "learning_rate": 7.602276032981919e-06, + "loss": 0.2099, + "step": 12431 + }, + { + "epoch": 1.171145286262688, + "grad_norm": 0.6036234498023987, + "learning_rate": 7.600810090768165e-06, + "loss": 0.2025, + "step": 12432 + }, + { + "epoch": 1.1712394903558558, + "grad_norm": 0.5967129468917847, + "learning_rate": 7.599344203259648e-06, + "loss": 0.1864, + "step": 12433 + }, + { + "epoch": 1.1713336944490238, + "grad_norm": 0.6145238280296326, + "learning_rate": 7.5978783704898e-06, + "loss": 0.198, + "step": 12434 + }, + { + "epoch": 1.1714278985421918, + "grad_norm": 0.6215044260025024, + "learning_rate": 7.5964125924920395e-06, + "loss": 0.1923, + "step": 12435 + }, + { + "epoch": 1.1715221026353595, + "grad_norm": 0.6010145545005798, + "learning_rate": 7.5949468692997865e-06, + "loss": 0.1897, + "step": 12436 + }, + { + "epoch": 1.1716163067285272, + "grad_norm": 0.6129345893859863, + "learning_rate": 7.593481200946467e-06, + "loss": 0.2052, + "step": 12437 + }, + { + "epoch": 1.1717105108216952, + "grad_norm": 0.6434243321418762, + "learning_rate": 7.5920155874654965e-06, + "loss": 0.196, + "step": 12438 + }, + { + "epoch": 1.1718047149148632, + "grad_norm": 0.5903400778770447, + "learning_rate": 7.590550028890298e-06, + "loss": 0.2001, + "step": 12439 + }, + { + "epoch": 1.171898919008031, + "grad_norm": 0.7123304009437561, + "learning_rate": 7.589084525254278e-06, + "loss": 0.2554, + "step": 12440 + }, + { + "epoch": 1.1719931231011986, + "grad_norm": 0.63892662525177, + "learning_rate": 7.587619076590867e-06, + "loss": 0.2232, + "step": 12441 + }, + { + "epoch": 1.1720873271943666, + "grad_norm": 0.7637938857078552, + "learning_rate": 7.586153682933468e-06, + "loss": 0.1986, + "step": 12442 + }, + { + "epoch": 1.1721815312875346, + "grad_norm": 0.6968367695808411, + "learning_rate": 7.584688344315495e-06, + "loss": 0.1966, + "step": 12443 + }, + { + "epoch": 1.1722757353807023, + "grad_norm": 0.6212029457092285, + "learning_rate": 7.5832230607703696e-06, + "loss": 0.2226, + "step": 12444 + }, + { + "epoch": 1.17236993947387, + "grad_norm": 0.6339184045791626, + "learning_rate": 7.5817578323314935e-06, + "loss": 0.2022, + "step": 12445 + }, + { + "epoch": 1.172464143567038, + "grad_norm": 0.6979104280471802, + "learning_rate": 7.580292659032274e-06, + "loss": 0.202, + "step": 12446 + }, + { + "epoch": 1.172558347660206, + "grad_norm": 0.6808230876922607, + "learning_rate": 7.578827540906132e-06, + "loss": 0.2121, + "step": 12447 + }, + { + "epoch": 1.1726525517533737, + "grad_norm": 0.6211710572242737, + "learning_rate": 7.577362477986463e-06, + "loss": 0.22, + "step": 12448 + }, + { + "epoch": 1.1727467558465414, + "grad_norm": 0.5926870703697205, + "learning_rate": 7.575897470306677e-06, + "loss": 0.185, + "step": 12449 + }, + { + "epoch": 1.1728409599397094, + "grad_norm": 0.6486456394195557, + "learning_rate": 7.574432517900174e-06, + "loss": 0.1874, + "step": 12450 + }, + { + "epoch": 1.1729351640328773, + "grad_norm": 0.5857219696044922, + "learning_rate": 7.572967620800364e-06, + "loss": 0.1827, + "step": 12451 + }, + { + "epoch": 1.173029368126045, + "grad_norm": 0.6361328363418579, + "learning_rate": 7.571502779040646e-06, + "loss": 0.1992, + "step": 12452 + }, + { + "epoch": 1.1731235722192128, + "grad_norm": 0.6601576209068298, + "learning_rate": 7.570037992654418e-06, + "loss": 0.2079, + "step": 12453 + }, + { + "epoch": 1.1732177763123808, + "grad_norm": 0.6744426488876343, + "learning_rate": 7.568573261675083e-06, + "loss": 0.2184, + "step": 12454 + }, + { + "epoch": 1.1733119804055487, + "grad_norm": 0.6544932723045349, + "learning_rate": 7.56710858613604e-06, + "loss": 0.2035, + "step": 12455 + }, + { + "epoch": 1.1734061844987165, + "grad_norm": 0.6785704493522644, + "learning_rate": 7.5656439660706795e-06, + "loss": 0.1711, + "step": 12456 + }, + { + "epoch": 1.1735003885918842, + "grad_norm": 0.6942324638366699, + "learning_rate": 7.564179401512404e-06, + "loss": 0.2108, + "step": 12457 + }, + { + "epoch": 1.1735945926850522, + "grad_norm": 0.7869208455085754, + "learning_rate": 7.562714892494606e-06, + "loss": 0.2006, + "step": 12458 + }, + { + "epoch": 1.1736887967782201, + "grad_norm": 0.6158934831619263, + "learning_rate": 7.561250439050679e-06, + "loss": 0.1902, + "step": 12459 + }, + { + "epoch": 1.1737830008713879, + "grad_norm": 0.6476064324378967, + "learning_rate": 7.559786041214008e-06, + "loss": 0.2208, + "step": 12460 + }, + { + "epoch": 1.1738772049645556, + "grad_norm": 0.6589324474334717, + "learning_rate": 7.558321699017995e-06, + "loss": 0.2154, + "step": 12461 + }, + { + "epoch": 1.1739714090577236, + "grad_norm": 0.6076422333717346, + "learning_rate": 7.556857412496021e-06, + "loss": 0.1895, + "step": 12462 + }, + { + "epoch": 1.1740656131508915, + "grad_norm": 0.6216500401496887, + "learning_rate": 7.555393181681473e-06, + "loss": 0.2136, + "step": 12463 + }, + { + "epoch": 1.1741598172440593, + "grad_norm": 0.7523494958877563, + "learning_rate": 7.553929006607747e-06, + "loss": 0.2385, + "step": 12464 + }, + { + "epoch": 1.174254021337227, + "grad_norm": 0.6163733005523682, + "learning_rate": 7.55246488730822e-06, + "loss": 0.1742, + "step": 12465 + }, + { + "epoch": 1.174348225430395, + "grad_norm": 0.6610745191574097, + "learning_rate": 7.551000823816278e-06, + "loss": 0.2228, + "step": 12466 + }, + { + "epoch": 1.1744424295235627, + "grad_norm": 0.7351732850074768, + "learning_rate": 7.549536816165306e-06, + "loss": 0.2204, + "step": 12467 + }, + { + "epoch": 1.1745366336167307, + "grad_norm": 0.708152711391449, + "learning_rate": 7.548072864388684e-06, + "loss": 0.2432, + "step": 12468 + }, + { + "epoch": 1.1746308377098984, + "grad_norm": 0.5890923142433167, + "learning_rate": 7.546608968519793e-06, + "loss": 0.1723, + "step": 12469 + }, + { + "epoch": 1.1747250418030664, + "grad_norm": 0.689551055431366, + "learning_rate": 7.545145128592009e-06, + "loss": 0.243, + "step": 12470 + }, + { + "epoch": 1.174819245896234, + "grad_norm": 0.6270465850830078, + "learning_rate": 7.543681344638716e-06, + "loss": 0.1835, + "step": 12471 + }, + { + "epoch": 1.174913449989402, + "grad_norm": 0.6303161978721619, + "learning_rate": 7.542217616693286e-06, + "loss": 0.2158, + "step": 12472 + }, + { + "epoch": 1.1750076540825698, + "grad_norm": 0.6889188289642334, + "learning_rate": 7.540753944789094e-06, + "loss": 0.1872, + "step": 12473 + }, + { + "epoch": 1.1751018581757378, + "grad_norm": 0.6560834646224976, + "learning_rate": 7.539290328959517e-06, + "loss": 0.1973, + "step": 12474 + }, + { + "epoch": 1.1751960622689055, + "grad_norm": 0.668216347694397, + "learning_rate": 7.537826769237926e-06, + "loss": 0.1869, + "step": 12475 + }, + { + "epoch": 1.1752902663620735, + "grad_norm": 0.7283650040626526, + "learning_rate": 7.53636326565769e-06, + "loss": 0.2107, + "step": 12476 + }, + { + "epoch": 1.1753844704552412, + "grad_norm": 0.6102532744407654, + "learning_rate": 7.534899818252185e-06, + "loss": 0.2, + "step": 12477 + }, + { + "epoch": 1.1754786745484092, + "grad_norm": 0.6530992984771729, + "learning_rate": 7.533436427054776e-06, + "loss": 0.1785, + "step": 12478 + }, + { + "epoch": 1.175572878641577, + "grad_norm": 0.6950181722640991, + "learning_rate": 7.531973092098832e-06, + "loss": 0.1912, + "step": 12479 + }, + { + "epoch": 1.1756670827347449, + "grad_norm": 0.7558364272117615, + "learning_rate": 7.5305098134177135e-06, + "loss": 0.2076, + "step": 12480 + }, + { + "epoch": 1.1757612868279126, + "grad_norm": 0.6683874130249023, + "learning_rate": 7.5290465910447966e-06, + "loss": 0.2078, + "step": 12481 + }, + { + "epoch": 1.1758554909210805, + "grad_norm": 0.6612247228622437, + "learning_rate": 7.527583425013436e-06, + "loss": 0.2154, + "step": 12482 + }, + { + "epoch": 1.1759496950142483, + "grad_norm": 0.6922287940979004, + "learning_rate": 7.526120315356993e-06, + "loss": 0.2315, + "step": 12483 + }, + { + "epoch": 1.1760438991074162, + "grad_norm": 0.6004050374031067, + "learning_rate": 7.524657262108839e-06, + "loss": 0.1611, + "step": 12484 + }, + { + "epoch": 1.176138103200584, + "grad_norm": 0.7073184847831726, + "learning_rate": 7.523194265302326e-06, + "loss": 0.2229, + "step": 12485 + }, + { + "epoch": 1.176232307293752, + "grad_norm": 0.7124186754226685, + "learning_rate": 7.521731324970812e-06, + "loss": 0.2222, + "step": 12486 + }, + { + "epoch": 1.1763265113869197, + "grad_norm": 0.6939953565597534, + "learning_rate": 7.520268441147658e-06, + "loss": 0.2161, + "step": 12487 + }, + { + "epoch": 1.1764207154800876, + "grad_norm": 0.6451032161712646, + "learning_rate": 7.518805613866219e-06, + "loss": 0.194, + "step": 12488 + }, + { + "epoch": 1.1765149195732554, + "grad_norm": 0.6584877371788025, + "learning_rate": 7.517342843159849e-06, + "loss": 0.2101, + "step": 12489 + }, + { + "epoch": 1.1766091236664233, + "grad_norm": 0.6335259675979614, + "learning_rate": 7.5158801290619e-06, + "loss": 0.2135, + "step": 12490 + }, + { + "epoch": 1.176703327759591, + "grad_norm": 0.6652558445930481, + "learning_rate": 7.514417471605728e-06, + "loss": 0.2138, + "step": 12491 + }, + { + "epoch": 1.176797531852759, + "grad_norm": 0.6807827949523926, + "learning_rate": 7.5129548708246805e-06, + "loss": 0.2554, + "step": 12492 + }, + { + "epoch": 1.1768917359459268, + "grad_norm": 0.589968204498291, + "learning_rate": 7.511492326752107e-06, + "loss": 0.1976, + "step": 12493 + }, + { + "epoch": 1.1769859400390947, + "grad_norm": 0.6328553557395935, + "learning_rate": 7.510029839421359e-06, + "loss": 0.1914, + "step": 12494 + }, + { + "epoch": 1.1770801441322625, + "grad_norm": 0.6645973920822144, + "learning_rate": 7.508567408865781e-06, + "loss": 0.2058, + "step": 12495 + }, + { + "epoch": 1.1771743482254304, + "grad_norm": 0.6332281827926636, + "learning_rate": 7.507105035118718e-06, + "loss": 0.1884, + "step": 12496 + }, + { + "epoch": 1.1772685523185982, + "grad_norm": 0.5919604897499084, + "learning_rate": 7.5056427182135175e-06, + "loss": 0.2119, + "step": 12497 + }, + { + "epoch": 1.1773627564117661, + "grad_norm": 0.6027520298957825, + "learning_rate": 7.50418045818352e-06, + "loss": 0.1999, + "step": 12498 + }, + { + "epoch": 1.1774569605049339, + "grad_norm": 0.7183255553245544, + "learning_rate": 7.502718255062071e-06, + "loss": 0.2141, + "step": 12499 + }, + { + "epoch": 1.1775511645981018, + "grad_norm": 0.6482458114624023, + "learning_rate": 7.5012561088825e-06, + "loss": 0.2052, + "step": 12500 + }, + { + "epoch": 1.1776453686912696, + "grad_norm": 0.6224600672721863, + "learning_rate": 7.499794019678162e-06, + "loss": 0.207, + "step": 12501 + }, + { + "epoch": 1.1777395727844375, + "grad_norm": 0.6822807192802429, + "learning_rate": 7.4983319874823835e-06, + "loss": 0.2177, + "step": 12502 + }, + { + "epoch": 1.1778337768776053, + "grad_norm": 0.6278730034828186, + "learning_rate": 7.496870012328501e-06, + "loss": 0.2042, + "step": 12503 + }, + { + "epoch": 1.1779279809707732, + "grad_norm": 0.6707857847213745, + "learning_rate": 7.4954080942498605e-06, + "loss": 0.2379, + "step": 12504 + }, + { + "epoch": 1.178022185063941, + "grad_norm": 0.6079289317131042, + "learning_rate": 7.493946233279787e-06, + "loss": 0.1798, + "step": 12505 + }, + { + "epoch": 1.178116389157109, + "grad_norm": 0.6748082041740417, + "learning_rate": 7.492484429451611e-06, + "loss": 0.1939, + "step": 12506 + }, + { + "epoch": 1.1782105932502767, + "grad_norm": 0.6287468075752258, + "learning_rate": 7.491022682798671e-06, + "loss": 0.1925, + "step": 12507 + }, + { + "epoch": 1.1783047973434446, + "grad_norm": 0.7353914380073547, + "learning_rate": 7.489560993354295e-06, + "loss": 0.2013, + "step": 12508 + }, + { + "epoch": 1.1783990014366124, + "grad_norm": 0.6673984527587891, + "learning_rate": 7.4880993611518095e-06, + "loss": 0.2013, + "step": 12509 + }, + { + "epoch": 1.1784932055297803, + "grad_norm": 0.6351611614227295, + "learning_rate": 7.486637786224542e-06, + "loss": 0.1962, + "step": 12510 + }, + { + "epoch": 1.178587409622948, + "grad_norm": 0.596457302570343, + "learning_rate": 7.485176268605821e-06, + "loss": 0.1843, + "step": 12511 + }, + { + "epoch": 1.178681613716116, + "grad_norm": 0.7209934592247009, + "learning_rate": 7.483714808328971e-06, + "loss": 0.2134, + "step": 12512 + }, + { + "epoch": 1.1787758178092838, + "grad_norm": 0.6333665251731873, + "learning_rate": 7.4822534054273135e-06, + "loss": 0.2021, + "step": 12513 + }, + { + "epoch": 1.1788700219024517, + "grad_norm": 0.57803875207901, + "learning_rate": 7.480792059934173e-06, + "loss": 0.2022, + "step": 12514 + }, + { + "epoch": 1.1789642259956195, + "grad_norm": 0.7156559824943542, + "learning_rate": 7.47933077188287e-06, + "loss": 0.2289, + "step": 12515 + }, + { + "epoch": 1.1790584300887874, + "grad_norm": 0.6474944949150085, + "learning_rate": 7.477869541306721e-06, + "loss": 0.2016, + "step": 12516 + }, + { + "epoch": 1.1791526341819552, + "grad_norm": 0.623877763748169, + "learning_rate": 7.476408368239051e-06, + "loss": 0.191, + "step": 12517 + }, + { + "epoch": 1.179246838275123, + "grad_norm": 0.7743015289306641, + "learning_rate": 7.474947252713171e-06, + "loss": 0.2005, + "step": 12518 + }, + { + "epoch": 1.1793410423682908, + "grad_norm": 0.6596753597259521, + "learning_rate": 7.473486194762403e-06, + "loss": 0.2034, + "step": 12519 + }, + { + "epoch": 1.1794352464614588, + "grad_norm": 0.6988006830215454, + "learning_rate": 7.47202519442005e-06, + "loss": 0.2149, + "step": 12520 + }, + { + "epoch": 1.1795294505546265, + "grad_norm": 0.6761751174926758, + "learning_rate": 7.470564251719437e-06, + "loss": 0.2149, + "step": 12521 + }, + { + "epoch": 1.1796236546477945, + "grad_norm": 0.7213813662528992, + "learning_rate": 7.46910336669387e-06, + "loss": 0.2026, + "step": 12522 + }, + { + "epoch": 1.1797178587409622, + "grad_norm": 0.6943807601928711, + "learning_rate": 7.467642539376655e-06, + "loss": 0.2007, + "step": 12523 + }, + { + "epoch": 1.1798120628341302, + "grad_norm": 0.6666421294212341, + "learning_rate": 7.4661817698011145e-06, + "loss": 0.2192, + "step": 12524 + }, + { + "epoch": 1.179906266927298, + "grad_norm": 0.6767361760139465, + "learning_rate": 7.4647210580005445e-06, + "loss": 0.2038, + "step": 12525 + }, + { + "epoch": 1.180000471020466, + "grad_norm": 0.6544678807258606, + "learning_rate": 7.4632604040082545e-06, + "loss": 0.1918, + "step": 12526 + }, + { + "epoch": 1.1800946751136336, + "grad_norm": 0.6642581224441528, + "learning_rate": 7.4617998078575515e-06, + "loss": 0.2051, + "step": 12527 + }, + { + "epoch": 1.1801888792068016, + "grad_norm": 0.6531805396080017, + "learning_rate": 7.460339269581739e-06, + "loss": 0.2206, + "step": 12528 + }, + { + "epoch": 1.1802830832999693, + "grad_norm": 0.7012746334075928, + "learning_rate": 7.458878789214119e-06, + "loss": 0.1871, + "step": 12529 + }, + { + "epoch": 1.1803772873931373, + "grad_norm": 0.6027641892433167, + "learning_rate": 7.4574183667879895e-06, + "loss": 0.2015, + "step": 12530 + }, + { + "epoch": 1.180471491486305, + "grad_norm": 0.6755235195159912, + "learning_rate": 7.455958002336656e-06, + "loss": 0.2006, + "step": 12531 + }, + { + "epoch": 1.180565695579473, + "grad_norm": 0.696238100528717, + "learning_rate": 7.454497695893415e-06, + "loss": 0.2063, + "step": 12532 + }, + { + "epoch": 1.1806598996726407, + "grad_norm": 0.6899204850196838, + "learning_rate": 7.45303744749156e-06, + "loss": 0.1945, + "step": 12533 + }, + { + "epoch": 1.1807541037658087, + "grad_norm": 0.6807505488395691, + "learning_rate": 7.451577257164393e-06, + "loss": 0.2066, + "step": 12534 + }, + { + "epoch": 1.1808483078589764, + "grad_norm": 0.7076438069343567, + "learning_rate": 7.450117124945206e-06, + "loss": 0.2182, + "step": 12535 + }, + { + "epoch": 1.1809425119521444, + "grad_norm": 0.5715807676315308, + "learning_rate": 7.44865705086729e-06, + "loss": 0.1783, + "step": 12536 + }, + { + "epoch": 1.1810367160453121, + "grad_norm": 0.6518290638923645, + "learning_rate": 7.44719703496394e-06, + "loss": 0.202, + "step": 12537 + }, + { + "epoch": 1.18113092013848, + "grad_norm": 0.6332467794418335, + "learning_rate": 7.445737077268448e-06, + "loss": 0.1998, + "step": 12538 + }, + { + "epoch": 1.1812251242316478, + "grad_norm": 0.6892487406730652, + "learning_rate": 7.444277177814099e-06, + "loss": 0.2227, + "step": 12539 + }, + { + "epoch": 1.1813193283248158, + "grad_norm": 0.6697837710380554, + "learning_rate": 7.442817336634178e-06, + "loss": 0.2212, + "step": 12540 + }, + { + "epoch": 1.1814135324179835, + "grad_norm": 0.6435509324073792, + "learning_rate": 7.441357553761984e-06, + "loss": 0.2123, + "step": 12541 + }, + { + "epoch": 1.1815077365111515, + "grad_norm": 0.6908721327781677, + "learning_rate": 7.439897829230793e-06, + "loss": 0.216, + "step": 12542 + }, + { + "epoch": 1.1816019406043192, + "grad_norm": 0.6827598214149475, + "learning_rate": 7.438438163073884e-06, + "loss": 0.2277, + "step": 12543 + }, + { + "epoch": 1.1816961446974872, + "grad_norm": 0.6248451471328735, + "learning_rate": 7.436978555324556e-06, + "loss": 0.1872, + "step": 12544 + }, + { + "epoch": 1.181790348790655, + "grad_norm": 0.6349536776542664, + "learning_rate": 7.435519006016077e-06, + "loss": 0.2059, + "step": 12545 + }, + { + "epoch": 1.1818845528838229, + "grad_norm": 0.6186675429344177, + "learning_rate": 7.434059515181729e-06, + "loss": 0.1891, + "step": 12546 + }, + { + "epoch": 1.1819787569769906, + "grad_norm": 0.64864581823349, + "learning_rate": 7.432600082854794e-06, + "loss": 0.1901, + "step": 12547 + }, + { + "epoch": 1.1820729610701586, + "grad_norm": 0.5495665669441223, + "learning_rate": 7.431140709068547e-06, + "loss": 0.1865, + "step": 12548 + }, + { + "epoch": 1.1821671651633263, + "grad_norm": 0.7360191941261292, + "learning_rate": 7.429681393856266e-06, + "loss": 0.1868, + "step": 12549 + }, + { + "epoch": 1.1822613692564943, + "grad_norm": 0.6350335478782654, + "learning_rate": 7.428222137251222e-06, + "loss": 0.2165, + "step": 12550 + }, + { + "epoch": 1.182355573349662, + "grad_norm": 0.6836426854133606, + "learning_rate": 7.426762939286693e-06, + "loss": 0.1853, + "step": 12551 + }, + { + "epoch": 1.18244977744283, + "grad_norm": 0.5950830578804016, + "learning_rate": 7.425303799995946e-06, + "loss": 0.1647, + "step": 12552 + }, + { + "epoch": 1.1825439815359977, + "grad_norm": 0.6418923139572144, + "learning_rate": 7.423844719412255e-06, + "loss": 0.2196, + "step": 12553 + }, + { + "epoch": 1.1826381856291657, + "grad_norm": 0.5919622182846069, + "learning_rate": 7.42238569756889e-06, + "loss": 0.1896, + "step": 12554 + }, + { + "epoch": 1.1827323897223334, + "grad_norm": 0.6559080481529236, + "learning_rate": 7.420926734499117e-06, + "loss": 0.2069, + "step": 12555 + }, + { + "epoch": 1.1828265938155014, + "grad_norm": 0.6933526396751404, + "learning_rate": 7.419467830236201e-06, + "loss": 0.1995, + "step": 12556 + }, + { + "epoch": 1.182920797908669, + "grad_norm": 0.6160508394241333, + "learning_rate": 7.418008984813412e-06, + "loss": 0.2059, + "step": 12557 + }, + { + "epoch": 1.183015002001837, + "grad_norm": 0.5865676403045654, + "learning_rate": 7.416550198264012e-06, + "loss": 0.1793, + "step": 12558 + }, + { + "epoch": 1.1831092060950048, + "grad_norm": 0.5630083680152893, + "learning_rate": 7.415091470621263e-06, + "loss": 0.2157, + "step": 12559 + }, + { + "epoch": 1.1832034101881728, + "grad_norm": 0.6468684673309326, + "learning_rate": 7.41363280191842e-06, + "loss": 0.225, + "step": 12560 + }, + { + "epoch": 1.1832976142813405, + "grad_norm": 0.6333187222480774, + "learning_rate": 7.412174192188756e-06, + "loss": 0.2012, + "step": 12561 + }, + { + "epoch": 1.1833918183745085, + "grad_norm": 0.6520709991455078, + "learning_rate": 7.41071564146552e-06, + "loss": 0.2156, + "step": 12562 + }, + { + "epoch": 1.1834860224676762, + "grad_norm": 0.679338276386261, + "learning_rate": 7.409257149781968e-06, + "loss": 0.2315, + "step": 12563 + }, + { + "epoch": 1.1835802265608442, + "grad_norm": 0.6333425045013428, + "learning_rate": 7.407798717171366e-06, + "loss": 0.2022, + "step": 12564 + }, + { + "epoch": 1.183674430654012, + "grad_norm": 0.6089506149291992, + "learning_rate": 7.40634034366696e-06, + "loss": 0.2, + "step": 12565 + }, + { + "epoch": 1.1837686347471799, + "grad_norm": 0.6142938137054443, + "learning_rate": 7.404882029302003e-06, + "loss": 0.1756, + "step": 12566 + }, + { + "epoch": 1.1838628388403476, + "grad_norm": 0.708831787109375, + "learning_rate": 7.403423774109751e-06, + "loss": 0.2034, + "step": 12567 + }, + { + "epoch": 1.1839570429335156, + "grad_norm": 0.664275050163269, + "learning_rate": 7.401965578123453e-06, + "loss": 0.1882, + "step": 12568 + }, + { + "epoch": 1.1840512470266833, + "grad_norm": 0.6040635704994202, + "learning_rate": 7.400507441376359e-06, + "loss": 0.1848, + "step": 12569 + }, + { + "epoch": 1.1841454511198513, + "grad_norm": 0.6533285975456238, + "learning_rate": 7.399049363901712e-06, + "loss": 0.2238, + "step": 12570 + }, + { + "epoch": 1.184239655213019, + "grad_norm": 0.823172390460968, + "learning_rate": 7.397591345732764e-06, + "loss": 0.2184, + "step": 12571 + }, + { + "epoch": 1.1843338593061867, + "grad_norm": 0.7458483576774597, + "learning_rate": 7.396133386902758e-06, + "loss": 0.2068, + "step": 12572 + }, + { + "epoch": 1.1844280633993547, + "grad_norm": 0.674351155757904, + "learning_rate": 7.394675487444936e-06, + "loss": 0.2016, + "step": 12573 + }, + { + "epoch": 1.1845222674925227, + "grad_norm": 0.7157514691352844, + "learning_rate": 7.393217647392545e-06, + "loss": 0.2092, + "step": 12574 + }, + { + "epoch": 1.1846164715856904, + "grad_norm": 0.605705738067627, + "learning_rate": 7.391759866778821e-06, + "loss": 0.2076, + "step": 12575 + }, + { + "epoch": 1.1847106756788581, + "grad_norm": 0.5888360738754272, + "learning_rate": 7.390302145637005e-06, + "loss": 0.1777, + "step": 12576 + }, + { + "epoch": 1.184804879772026, + "grad_norm": 0.6310580372810364, + "learning_rate": 7.388844484000339e-06, + "loss": 0.2215, + "step": 12577 + }, + { + "epoch": 1.184899083865194, + "grad_norm": 0.690157413482666, + "learning_rate": 7.387386881902058e-06, + "loss": 0.2221, + "step": 12578 + }, + { + "epoch": 1.1849932879583618, + "grad_norm": 0.8984596133232117, + "learning_rate": 7.385929339375395e-06, + "loss": 0.1956, + "step": 12579 + }, + { + "epoch": 1.1850874920515295, + "grad_norm": 0.6356021761894226, + "learning_rate": 7.384471856453581e-06, + "loss": 0.2152, + "step": 12580 + }, + { + "epoch": 1.1851816961446975, + "grad_norm": 0.6510480642318726, + "learning_rate": 7.383014433169859e-06, + "loss": 0.2015, + "step": 12581 + }, + { + "epoch": 1.1852759002378654, + "grad_norm": 0.6540406942367554, + "learning_rate": 7.381557069557454e-06, + "loss": 0.2114, + "step": 12582 + }, + { + "epoch": 1.1853701043310332, + "grad_norm": 0.6514879465103149, + "learning_rate": 7.380099765649598e-06, + "loss": 0.2137, + "step": 12583 + }, + { + "epoch": 1.185464308424201, + "grad_norm": 0.7055293321609497, + "learning_rate": 7.3786425214795176e-06, + "loss": 0.2143, + "step": 12584 + }, + { + "epoch": 1.1855585125173689, + "grad_norm": 0.6761283874511719, + "learning_rate": 7.377185337080443e-06, + "loss": 0.218, + "step": 12585 + }, + { + "epoch": 1.1856527166105368, + "grad_norm": 0.6523527503013611, + "learning_rate": 7.375728212485597e-06, + "loss": 0.1639, + "step": 12586 + }, + { + "epoch": 1.1857469207037046, + "grad_norm": 0.6509504914283752, + "learning_rate": 7.374271147728207e-06, + "loss": 0.2084, + "step": 12587 + }, + { + "epoch": 1.1858411247968723, + "grad_norm": 0.6438972353935242, + "learning_rate": 7.372814142841498e-06, + "loss": 0.1865, + "step": 12588 + }, + { + "epoch": 1.1859353288900403, + "grad_norm": 0.644361674785614, + "learning_rate": 7.371357197858687e-06, + "loss": 0.2121, + "step": 12589 + }, + { + "epoch": 1.1860295329832082, + "grad_norm": 0.6154738068580627, + "learning_rate": 7.3699003128129964e-06, + "loss": 0.1989, + "step": 12590 + }, + { + "epoch": 1.186123737076376, + "grad_norm": 0.6335304975509644, + "learning_rate": 7.368443487737648e-06, + "loss": 0.1922, + "step": 12591 + }, + { + "epoch": 1.1862179411695437, + "grad_norm": 0.7136722207069397, + "learning_rate": 7.366986722665858e-06, + "loss": 0.2122, + "step": 12592 + }, + { + "epoch": 1.1863121452627117, + "grad_norm": 0.6671651601791382, + "learning_rate": 7.365530017630842e-06, + "loss": 0.2194, + "step": 12593 + }, + { + "epoch": 1.1864063493558796, + "grad_norm": 0.6870070099830627, + "learning_rate": 7.364073372665816e-06, + "loss": 0.1895, + "step": 12594 + }, + { + "epoch": 1.1865005534490474, + "grad_norm": 0.6628912687301636, + "learning_rate": 7.362616787803993e-06, + "loss": 0.2084, + "step": 12595 + }, + { + "epoch": 1.186594757542215, + "grad_norm": 0.6799843311309814, + "learning_rate": 7.361160263078586e-06, + "loss": 0.2107, + "step": 12596 + }, + { + "epoch": 1.186688961635383, + "grad_norm": 0.6107242107391357, + "learning_rate": 7.359703798522808e-06, + "loss": 0.1985, + "step": 12597 + }, + { + "epoch": 1.186783165728551, + "grad_norm": 0.6183947324752808, + "learning_rate": 7.358247394169868e-06, + "loss": 0.177, + "step": 12598 + }, + { + "epoch": 1.1868773698217188, + "grad_norm": 0.6542305946350098, + "learning_rate": 7.356791050052972e-06, + "loss": 0.1907, + "step": 12599 + }, + { + "epoch": 1.1869715739148865, + "grad_norm": 0.677675187587738, + "learning_rate": 7.355334766205322e-06, + "loss": 0.2207, + "step": 12600 + }, + { + "epoch": 1.1870657780080545, + "grad_norm": 0.667528510093689, + "learning_rate": 7.3538785426601354e-06, + "loss": 0.2138, + "step": 12601 + }, + { + "epoch": 1.1871599821012224, + "grad_norm": 0.6824607849121094, + "learning_rate": 7.35242237945061e-06, + "loss": 0.2155, + "step": 12602 + }, + { + "epoch": 1.1872541861943902, + "grad_norm": 0.6306003332138062, + "learning_rate": 7.3509662766099455e-06, + "loss": 0.1909, + "step": 12603 + }, + { + "epoch": 1.187348390287558, + "grad_norm": 0.7506347298622131, + "learning_rate": 7.34951023417135e-06, + "loss": 0.1972, + "step": 12604 + }, + { + "epoch": 1.1874425943807259, + "grad_norm": 0.7019657492637634, + "learning_rate": 7.34805425216802e-06, + "loss": 0.2044, + "step": 12605 + }, + { + "epoch": 1.1875367984738936, + "grad_norm": 0.6580947041511536, + "learning_rate": 7.346598330633151e-06, + "loss": 0.2005, + "step": 12606 + }, + { + "epoch": 1.1876310025670616, + "grad_norm": 0.6721909642219543, + "learning_rate": 7.345142469599947e-06, + "loss": 0.1894, + "step": 12607 + }, + { + "epoch": 1.1877252066602293, + "grad_norm": 0.6698289513587952, + "learning_rate": 7.343686669101599e-06, + "loss": 0.2035, + "step": 12608 + }, + { + "epoch": 1.1878194107533973, + "grad_norm": 0.6518452167510986, + "learning_rate": 7.342230929171305e-06, + "loss": 0.2063, + "step": 12609 + }, + { + "epoch": 1.187913614846565, + "grad_norm": 0.6841627955436707, + "learning_rate": 7.3407752498422535e-06, + "loss": 0.182, + "step": 12610 + }, + { + "epoch": 1.188007818939733, + "grad_norm": 0.6407322287559509, + "learning_rate": 7.33931963114764e-06, + "loss": 0.2162, + "step": 12611 + }, + { + "epoch": 1.1881020230329007, + "grad_norm": 0.6667520403862, + "learning_rate": 7.337864073120655e-06, + "loss": 0.2067, + "step": 12612 + }, + { + "epoch": 1.1881962271260686, + "grad_norm": 0.6596465110778809, + "learning_rate": 7.336408575794482e-06, + "loss": 0.2074, + "step": 12613 + }, + { + "epoch": 1.1882904312192364, + "grad_norm": 0.6905499696731567, + "learning_rate": 7.334953139202317e-06, + "loss": 0.2295, + "step": 12614 + }, + { + "epoch": 1.1883846353124043, + "grad_norm": 0.7734467387199402, + "learning_rate": 7.333497763377342e-06, + "loss": 0.2074, + "step": 12615 + }, + { + "epoch": 1.188478839405572, + "grad_norm": 0.6414253115653992, + "learning_rate": 7.3320424483527385e-06, + "loss": 0.1943, + "step": 12616 + }, + { + "epoch": 1.18857304349874, + "grad_norm": 0.6604819893836975, + "learning_rate": 7.330587194161696e-06, + "loss": 0.2206, + "step": 12617 + }, + { + "epoch": 1.1886672475919078, + "grad_norm": 0.6642550826072693, + "learning_rate": 7.329132000837395e-06, + "loss": 0.2202, + "step": 12618 + }, + { + "epoch": 1.1887614516850757, + "grad_norm": 0.6313382983207703, + "learning_rate": 7.327676868413014e-06, + "loss": 0.1991, + "step": 12619 + }, + { + "epoch": 1.1888556557782435, + "grad_norm": 0.615211546421051, + "learning_rate": 7.326221796921729e-06, + "loss": 0.2157, + "step": 12620 + }, + { + "epoch": 1.1889498598714114, + "grad_norm": 0.6930890083312988, + "learning_rate": 7.324766786396728e-06, + "loss": 0.1996, + "step": 12621 + }, + { + "epoch": 1.1890440639645792, + "grad_norm": 0.7138121724128723, + "learning_rate": 7.32331183687118e-06, + "loss": 0.2139, + "step": 12622 + }, + { + "epoch": 1.1891382680577471, + "grad_norm": 0.7115221619606018, + "learning_rate": 7.321856948378259e-06, + "loss": 0.2063, + "step": 12623 + }, + { + "epoch": 1.1892324721509149, + "grad_norm": 0.6268814206123352, + "learning_rate": 7.320402120951143e-06, + "loss": 0.1943, + "step": 12624 + }, + { + "epoch": 1.1893266762440828, + "grad_norm": 0.6795153617858887, + "learning_rate": 7.318947354623004e-06, + "loss": 0.2382, + "step": 12625 + }, + { + "epoch": 1.1894208803372506, + "grad_norm": 0.6876603364944458, + "learning_rate": 7.317492649427009e-06, + "loss": 0.2209, + "step": 12626 + }, + { + "epoch": 1.1895150844304185, + "grad_norm": 0.643279492855072, + "learning_rate": 7.316038005396332e-06, + "loss": 0.1934, + "step": 12627 + }, + { + "epoch": 1.1896092885235863, + "grad_norm": 0.6221898794174194, + "learning_rate": 7.314583422564139e-06, + "loss": 0.2046, + "step": 12628 + }, + { + "epoch": 1.1897034926167542, + "grad_norm": 0.668470561504364, + "learning_rate": 7.313128900963597e-06, + "loss": 0.1938, + "step": 12629 + }, + { + "epoch": 1.189797696709922, + "grad_norm": 0.7933385372161865, + "learning_rate": 7.311674440627872e-06, + "loss": 0.2082, + "step": 12630 + }, + { + "epoch": 1.18989190080309, + "grad_norm": 0.5909566283226013, + "learning_rate": 7.310220041590126e-06, + "loss": 0.1987, + "step": 12631 + }, + { + "epoch": 1.1899861048962577, + "grad_norm": 0.6252971291542053, + "learning_rate": 7.308765703883525e-06, + "loss": 0.2297, + "step": 12632 + }, + { + "epoch": 1.1900803089894256, + "grad_norm": 0.6908274292945862, + "learning_rate": 7.307311427541224e-06, + "loss": 0.2108, + "step": 12633 + }, + { + "epoch": 1.1901745130825934, + "grad_norm": 0.661973774433136, + "learning_rate": 7.30585721259639e-06, + "loss": 0.2129, + "step": 12634 + }, + { + "epoch": 1.1902687171757613, + "grad_norm": 0.6261277794837952, + "learning_rate": 7.304403059082179e-06, + "loss": 0.2081, + "step": 12635 + }, + { + "epoch": 1.190362921268929, + "grad_norm": 0.6275283098220825, + "learning_rate": 7.302948967031744e-06, + "loss": 0.1968, + "step": 12636 + }, + { + "epoch": 1.190457125362097, + "grad_norm": 0.6762524843215942, + "learning_rate": 7.301494936478245e-06, + "loss": 0.211, + "step": 12637 + }, + { + "epoch": 1.1905513294552648, + "grad_norm": 0.6919872164726257, + "learning_rate": 7.300040967454838e-06, + "loss": 0.2074, + "step": 12638 + }, + { + "epoch": 1.1906455335484327, + "grad_norm": 0.7369116544723511, + "learning_rate": 7.29858705999467e-06, + "loss": 0.2365, + "step": 12639 + }, + { + "epoch": 1.1907397376416005, + "grad_norm": 0.6140055060386658, + "learning_rate": 7.297133214130891e-06, + "loss": 0.1955, + "step": 12640 + }, + { + "epoch": 1.1908339417347684, + "grad_norm": 0.6694202423095703, + "learning_rate": 7.295679429896661e-06, + "loss": 0.1833, + "step": 12641 + }, + { + "epoch": 1.1909281458279362, + "grad_norm": 0.633593738079071, + "learning_rate": 7.29422570732512e-06, + "loss": 0.1973, + "step": 12642 + }, + { + "epoch": 1.1910223499211041, + "grad_norm": 0.6085483431816101, + "learning_rate": 7.292772046449415e-06, + "loss": 0.2112, + "step": 12643 + }, + { + "epoch": 1.1911165540142719, + "grad_norm": 0.6511340141296387, + "learning_rate": 7.291318447302695e-06, + "loss": 0.2361, + "step": 12644 + }, + { + "epoch": 1.1912107581074398, + "grad_norm": 0.6604608297348022, + "learning_rate": 7.289864909918107e-06, + "loss": 0.2217, + "step": 12645 + }, + { + "epoch": 1.1913049622006076, + "grad_norm": 0.7020126581192017, + "learning_rate": 7.288411434328786e-06, + "loss": 0.2117, + "step": 12646 + }, + { + "epoch": 1.1913991662937755, + "grad_norm": 0.7185226082801819, + "learning_rate": 7.28695802056788e-06, + "loss": 0.2367, + "step": 12647 + }, + { + "epoch": 1.1914933703869433, + "grad_norm": 0.6534253358840942, + "learning_rate": 7.285504668668526e-06, + "loss": 0.2128, + "step": 12648 + }, + { + "epoch": 1.1915875744801112, + "grad_norm": 0.6461052298545837, + "learning_rate": 7.284051378663865e-06, + "loss": 0.2209, + "step": 12649 + }, + { + "epoch": 1.191681778573279, + "grad_norm": 0.5969422459602356, + "learning_rate": 7.282598150587032e-06, + "loss": 0.2062, + "step": 12650 + }, + { + "epoch": 1.191775982666447, + "grad_norm": 0.5316562056541443, + "learning_rate": 7.281144984471163e-06, + "loss": 0.1623, + "step": 12651 + }, + { + "epoch": 1.1918701867596146, + "grad_norm": 0.61534583568573, + "learning_rate": 7.279691880349395e-06, + "loss": 0.2, + "step": 12652 + }, + { + "epoch": 1.1919643908527826, + "grad_norm": 0.6426992416381836, + "learning_rate": 7.278238838254857e-06, + "loss": 0.1846, + "step": 12653 + }, + { + "epoch": 1.1920585949459503, + "grad_norm": 0.630556583404541, + "learning_rate": 7.276785858220684e-06, + "loss": 0.2134, + "step": 12654 + }, + { + "epoch": 1.1921527990391183, + "grad_norm": 0.588656485080719, + "learning_rate": 7.275332940280006e-06, + "loss": 0.1721, + "step": 12655 + }, + { + "epoch": 1.192247003132286, + "grad_norm": 0.6570435166358948, + "learning_rate": 7.273880084465947e-06, + "loss": 0.1993, + "step": 12656 + }, + { + "epoch": 1.192341207225454, + "grad_norm": 0.6573622226715088, + "learning_rate": 7.272427290811641e-06, + "loss": 0.2247, + "step": 12657 + }, + { + "epoch": 1.1924354113186217, + "grad_norm": 0.6184468865394592, + "learning_rate": 7.270974559350214e-06, + "loss": 0.1763, + "step": 12658 + }, + { + "epoch": 1.1925296154117897, + "grad_norm": 0.6759619116783142, + "learning_rate": 7.269521890114785e-06, + "loss": 0.1932, + "step": 12659 + }, + { + "epoch": 1.1926238195049574, + "grad_norm": 0.6438455581665039, + "learning_rate": 7.268069283138475e-06, + "loss": 0.1992, + "step": 12660 + }, + { + "epoch": 1.1927180235981254, + "grad_norm": 0.5674619078636169, + "learning_rate": 7.2666167384544175e-06, + "loss": 0.1669, + "step": 12661 + }, + { + "epoch": 1.1928122276912931, + "grad_norm": 0.6349161863327026, + "learning_rate": 7.265164256095723e-06, + "loss": 0.1925, + "step": 12662 + }, + { + "epoch": 1.192906431784461, + "grad_norm": 0.6780090928077698, + "learning_rate": 7.26371183609551e-06, + "loss": 0.2053, + "step": 12663 + }, + { + "epoch": 1.1930006358776288, + "grad_norm": 0.6443299651145935, + "learning_rate": 7.262259478486901e-06, + "loss": 0.2151, + "step": 12664 + }, + { + "epoch": 1.1930948399707968, + "grad_norm": 0.604374885559082, + "learning_rate": 7.260807183303011e-06, + "loss": 0.1814, + "step": 12665 + }, + { + "epoch": 1.1931890440639645, + "grad_norm": 0.6148853302001953, + "learning_rate": 7.259354950576951e-06, + "loss": 0.1961, + "step": 12666 + }, + { + "epoch": 1.1932832481571325, + "grad_norm": 0.6508539319038391, + "learning_rate": 7.257902780341839e-06, + "loss": 0.1921, + "step": 12667 + }, + { + "epoch": 1.1933774522503002, + "grad_norm": 0.7368025779724121, + "learning_rate": 7.256450672630785e-06, + "loss": 0.2105, + "step": 12668 + }, + { + "epoch": 1.1934716563434682, + "grad_norm": 0.6457901000976562, + "learning_rate": 7.254998627476897e-06, + "loss": 0.1963, + "step": 12669 + }, + { + "epoch": 1.193565860436636, + "grad_norm": 0.6276993751525879, + "learning_rate": 7.253546644913285e-06, + "loss": 0.1979, + "step": 12670 + }, + { + "epoch": 1.1936600645298039, + "grad_norm": 0.6053666472434998, + "learning_rate": 7.252094724973057e-06, + "loss": 0.1601, + "step": 12671 + }, + { + "epoch": 1.1937542686229716, + "grad_norm": 0.7851318717002869, + "learning_rate": 7.250642867689322e-06, + "loss": 0.2185, + "step": 12672 + }, + { + "epoch": 1.1938484727161396, + "grad_norm": 0.6257517337799072, + "learning_rate": 7.249191073095176e-06, + "loss": 0.1791, + "step": 12673 + }, + { + "epoch": 1.1939426768093073, + "grad_norm": 0.8658941388130188, + "learning_rate": 7.2477393412237314e-06, + "loss": 0.2269, + "step": 12674 + }, + { + "epoch": 1.1940368809024753, + "grad_norm": 0.6948623657226562, + "learning_rate": 7.24628767210809e-06, + "loss": 0.2226, + "step": 12675 + }, + { + "epoch": 1.194131084995643, + "grad_norm": 0.6017744541168213, + "learning_rate": 7.24483606578134e-06, + "loss": 0.203, + "step": 12676 + }, + { + "epoch": 1.194225289088811, + "grad_norm": 0.7382233738899231, + "learning_rate": 7.243384522276593e-06, + "loss": 0.2513, + "step": 12677 + }, + { + "epoch": 1.1943194931819787, + "grad_norm": 0.6766628623008728, + "learning_rate": 7.241933041626945e-06, + "loss": 0.1978, + "step": 12678 + }, + { + "epoch": 1.1944136972751467, + "grad_norm": 0.6479530334472656, + "learning_rate": 7.240481623865488e-06, + "loss": 0.1964, + "step": 12679 + }, + { + "epoch": 1.1945079013683144, + "grad_norm": 0.6650206446647644, + "learning_rate": 7.239030269025311e-06, + "loss": 0.1894, + "step": 12680 + }, + { + "epoch": 1.1946021054614824, + "grad_norm": 0.634466826915741, + "learning_rate": 7.237578977139521e-06, + "loss": 0.2038, + "step": 12681 + }, + { + "epoch": 1.1946963095546501, + "grad_norm": 0.6568295359611511, + "learning_rate": 7.236127748241201e-06, + "loss": 0.2111, + "step": 12682 + }, + { + "epoch": 1.194790513647818, + "grad_norm": 0.6371886730194092, + "learning_rate": 7.23467658236344e-06, + "loss": 0.2103, + "step": 12683 + }, + { + "epoch": 1.1948847177409858, + "grad_norm": 0.5894503593444824, + "learning_rate": 7.2332254795393315e-06, + "loss": 0.182, + "step": 12684 + }, + { + "epoch": 1.1949789218341538, + "grad_norm": 0.6459717750549316, + "learning_rate": 7.2317744398019616e-06, + "loss": 0.1934, + "step": 12685 + }, + { + "epoch": 1.1950731259273215, + "grad_norm": 0.6525779962539673, + "learning_rate": 7.230323463184414e-06, + "loss": 0.2003, + "step": 12686 + }, + { + "epoch": 1.1951673300204895, + "grad_norm": 0.6206629872322083, + "learning_rate": 7.228872549719776e-06, + "loss": 0.22, + "step": 12687 + }, + { + "epoch": 1.1952615341136572, + "grad_norm": 0.8373090028762817, + "learning_rate": 7.227421699441129e-06, + "loss": 0.2092, + "step": 12688 + }, + { + "epoch": 1.1953557382068252, + "grad_norm": 0.6752511858940125, + "learning_rate": 7.225970912381557e-06, + "loss": 0.1905, + "step": 12689 + }, + { + "epoch": 1.195449942299993, + "grad_norm": 0.7148832082748413, + "learning_rate": 7.224520188574134e-06, + "loss": 0.1988, + "step": 12690 + }, + { + "epoch": 1.1955441463931609, + "grad_norm": 0.6180127263069153, + "learning_rate": 7.223069528051947e-06, + "loss": 0.195, + "step": 12691 + }, + { + "epoch": 1.1956383504863286, + "grad_norm": 0.6713067889213562, + "learning_rate": 7.2216189308480675e-06, + "loss": 0.2345, + "step": 12692 + }, + { + "epoch": 1.1957325545794966, + "grad_norm": 0.5889583826065063, + "learning_rate": 7.220168396995573e-06, + "loss": 0.1793, + "step": 12693 + }, + { + "epoch": 1.1958267586726643, + "grad_norm": 0.654589831829071, + "learning_rate": 7.218717926527539e-06, + "loss": 0.2051, + "step": 12694 + }, + { + "epoch": 1.1959209627658323, + "grad_norm": 0.6394796371459961, + "learning_rate": 7.21726751947704e-06, + "loss": 0.1782, + "step": 12695 + }, + { + "epoch": 1.196015166859, + "grad_norm": 0.6581370830535889, + "learning_rate": 7.21581717587714e-06, + "loss": 0.2304, + "step": 12696 + }, + { + "epoch": 1.196109370952168, + "grad_norm": 0.7615405917167664, + "learning_rate": 7.214366895760916e-06, + "loss": 0.2245, + "step": 12697 + }, + { + "epoch": 1.1962035750453357, + "grad_norm": 0.5310841798782349, + "learning_rate": 7.2129166791614395e-06, + "loss": 0.1746, + "step": 12698 + }, + { + "epoch": 1.1962977791385037, + "grad_norm": 0.6486091017723083, + "learning_rate": 7.21146652611177e-06, + "loss": 0.1939, + "step": 12699 + }, + { + "epoch": 1.1963919832316714, + "grad_norm": 0.6611186861991882, + "learning_rate": 7.2100164366449736e-06, + "loss": 0.21, + "step": 12700 + }, + { + "epoch": 1.1964861873248394, + "grad_norm": 0.7448510527610779, + "learning_rate": 7.208566410794119e-06, + "loss": 0.2355, + "step": 12701 + }, + { + "epoch": 1.196580391418007, + "grad_norm": 0.6172866225242615, + "learning_rate": 7.207116448592269e-06, + "loss": 0.1887, + "step": 12702 + }, + { + "epoch": 1.196674595511175, + "grad_norm": 0.6458644866943359, + "learning_rate": 7.205666550072478e-06, + "loss": 0.1809, + "step": 12703 + }, + { + "epoch": 1.1967687996043428, + "grad_norm": 0.7035966515541077, + "learning_rate": 7.204216715267817e-06, + "loss": 0.2233, + "step": 12704 + }, + { + "epoch": 1.1968630036975108, + "grad_norm": 0.6677559614181519, + "learning_rate": 7.202766944211337e-06, + "loss": 0.2208, + "step": 12705 + }, + { + "epoch": 1.1969572077906785, + "grad_norm": 0.6460862755775452, + "learning_rate": 7.201317236936094e-06, + "loss": 0.206, + "step": 12706 + }, + { + "epoch": 1.1970514118838465, + "grad_norm": 0.706049382686615, + "learning_rate": 7.199867593475149e-06, + "loss": 0.1983, + "step": 12707 + }, + { + "epoch": 1.1971456159770142, + "grad_norm": 0.6077711582183838, + "learning_rate": 7.198418013861553e-06, + "loss": 0.1878, + "step": 12708 + }, + { + "epoch": 1.1972398200701821, + "grad_norm": 0.7669473886489868, + "learning_rate": 7.196968498128359e-06, + "loss": 0.2075, + "step": 12709 + }, + { + "epoch": 1.1973340241633499, + "grad_norm": 0.6662476062774658, + "learning_rate": 7.195519046308616e-06, + "loss": 0.2073, + "step": 12710 + }, + { + "epoch": 1.1974282282565176, + "grad_norm": 0.6097497940063477, + "learning_rate": 7.1940696584353784e-06, + "loss": 0.1908, + "step": 12711 + }, + { + "epoch": 1.1975224323496856, + "grad_norm": 0.6982675194740295, + "learning_rate": 7.1926203345416935e-06, + "loss": 0.2524, + "step": 12712 + }, + { + "epoch": 1.1976166364428535, + "grad_norm": 0.6172206997871399, + "learning_rate": 7.191171074660603e-06, + "loss": 0.199, + "step": 12713 + }, + { + "epoch": 1.1977108405360213, + "grad_norm": 0.6649495959281921, + "learning_rate": 7.189721878825157e-06, + "loss": 0.2139, + "step": 12714 + }, + { + "epoch": 1.197805044629189, + "grad_norm": 0.6137988567352295, + "learning_rate": 7.188272747068404e-06, + "loss": 0.2042, + "step": 12715 + }, + { + "epoch": 1.197899248722357, + "grad_norm": 0.6602396965026855, + "learning_rate": 7.186823679423371e-06, + "loss": 0.2083, + "step": 12716 + }, + { + "epoch": 1.197993452815525, + "grad_norm": 0.6809267997741699, + "learning_rate": 7.185374675923114e-06, + "loss": 0.2304, + "step": 12717 + }, + { + "epoch": 1.1980876569086927, + "grad_norm": 0.6102619171142578, + "learning_rate": 7.18392573660067e-06, + "loss": 0.1923, + "step": 12718 + }, + { + "epoch": 1.1981818610018604, + "grad_norm": 0.6653514504432678, + "learning_rate": 7.182476861489072e-06, + "loss": 0.2469, + "step": 12719 + }, + { + "epoch": 1.1982760650950284, + "grad_norm": 0.5470622181892395, + "learning_rate": 7.181028050621355e-06, + "loss": 0.1868, + "step": 12720 + }, + { + "epoch": 1.1983702691881963, + "grad_norm": 0.6350684762001038, + "learning_rate": 7.179579304030562e-06, + "loss": 0.2037, + "step": 12721 + }, + { + "epoch": 1.198464473281364, + "grad_norm": 0.6286614537239075, + "learning_rate": 7.178130621749722e-06, + "loss": 0.1995, + "step": 12722 + }, + { + "epoch": 1.1985586773745318, + "grad_norm": 0.6691330075263977, + "learning_rate": 7.176682003811868e-06, + "loss": 0.225, + "step": 12723 + }, + { + "epoch": 1.1986528814676998, + "grad_norm": 0.7239378690719604, + "learning_rate": 7.17523345025003e-06, + "loss": 0.2046, + "step": 12724 + }, + { + "epoch": 1.1987470855608677, + "grad_norm": 0.6537780165672302, + "learning_rate": 7.173784961097239e-06, + "loss": 0.236, + "step": 12725 + }, + { + "epoch": 1.1988412896540355, + "grad_norm": 0.7614167928695679, + "learning_rate": 7.172336536386519e-06, + "loss": 0.2231, + "step": 12726 + }, + { + "epoch": 1.1989354937472032, + "grad_norm": 0.5911201238632202, + "learning_rate": 7.170888176150903e-06, + "loss": 0.1741, + "step": 12727 + }, + { + "epoch": 1.1990296978403712, + "grad_norm": 0.6751967668533325, + "learning_rate": 7.16943988042341e-06, + "loss": 0.2134, + "step": 12728 + }, + { + "epoch": 1.1991239019335391, + "grad_norm": 0.7988777160644531, + "learning_rate": 7.167991649237066e-06, + "loss": 0.1947, + "step": 12729 + }, + { + "epoch": 1.1992181060267069, + "grad_norm": 0.7002336382865906, + "learning_rate": 7.16654348262489e-06, + "loss": 0.2236, + "step": 12730 + }, + { + "epoch": 1.1993123101198746, + "grad_norm": 0.5490429997444153, + "learning_rate": 7.165095380619906e-06, + "loss": 0.1928, + "step": 12731 + }, + { + "epoch": 1.1994065142130426, + "grad_norm": 0.6369600892066956, + "learning_rate": 7.163647343255134e-06, + "loss": 0.195, + "step": 12732 + }, + { + "epoch": 1.1995007183062105, + "grad_norm": 0.6094090342521667, + "learning_rate": 7.162199370563585e-06, + "loss": 0.1936, + "step": 12733 + }, + { + "epoch": 1.1995949223993783, + "grad_norm": 0.6757724285125732, + "learning_rate": 7.160751462578282e-06, + "loss": 0.1858, + "step": 12734 + }, + { + "epoch": 1.199689126492546, + "grad_norm": 0.5965824127197266, + "learning_rate": 7.159303619332236e-06, + "loss": 0.1963, + "step": 12735 + }, + { + "epoch": 1.199783330585714, + "grad_norm": 0.6731762886047363, + "learning_rate": 7.157855840858457e-06, + "loss": 0.2255, + "step": 12736 + }, + { + "epoch": 1.199877534678882, + "grad_norm": 0.6442179679870605, + "learning_rate": 7.156408127189964e-06, + "loss": 0.2162, + "step": 12737 + }, + { + "epoch": 1.1999717387720497, + "grad_norm": 0.6489914059638977, + "learning_rate": 7.154960478359766e-06, + "loss": 0.2052, + "step": 12738 + }, + { + "epoch": 1.2000659428652174, + "grad_norm": 0.6417892575263977, + "learning_rate": 7.1535128944008666e-06, + "loss": 0.1991, + "step": 12739 + }, + { + "epoch": 1.2001601469583854, + "grad_norm": 0.6218274235725403, + "learning_rate": 7.152065375346273e-06, + "loss": 0.2104, + "step": 12740 + }, + { + "epoch": 1.2002543510515533, + "grad_norm": 0.6693999171257019, + "learning_rate": 7.150617921228995e-06, + "loss": 0.2329, + "step": 12741 + }, + { + "epoch": 1.200348555144721, + "grad_norm": 0.7353677153587341, + "learning_rate": 7.149170532082037e-06, + "loss": 0.2242, + "step": 12742 + }, + { + "epoch": 1.2004427592378888, + "grad_norm": 0.6996892094612122, + "learning_rate": 7.147723207938395e-06, + "loss": 0.1921, + "step": 12743 + }, + { + "epoch": 1.2005369633310568, + "grad_norm": 0.6604074239730835, + "learning_rate": 7.146275948831078e-06, + "loss": 0.2125, + "step": 12744 + }, + { + "epoch": 1.2006311674242245, + "grad_norm": 0.686569333076477, + "learning_rate": 7.144828754793084e-06, + "loss": 0.171, + "step": 12745 + }, + { + "epoch": 1.2007253715173924, + "grad_norm": 0.6470799446105957, + "learning_rate": 7.143381625857407e-06, + "loss": 0.214, + "step": 12746 + }, + { + "epoch": 1.2008195756105602, + "grad_norm": 0.6509439945220947, + "learning_rate": 7.141934562057049e-06, + "loss": 0.2175, + "step": 12747 + }, + { + "epoch": 1.2009137797037281, + "grad_norm": 0.6378494501113892, + "learning_rate": 7.1404875634250026e-06, + "loss": 0.1989, + "step": 12748 + }, + { + "epoch": 1.2010079837968959, + "grad_norm": 0.6776915192604065, + "learning_rate": 7.139040629994263e-06, + "loss": 0.2172, + "step": 12749 + }, + { + "epoch": 1.2011021878900638, + "grad_norm": 0.6282821297645569, + "learning_rate": 7.137593761797818e-06, + "loss": 0.2073, + "step": 12750 + }, + { + "epoch": 1.2011963919832316, + "grad_norm": 0.5688050389289856, + "learning_rate": 7.136146958868666e-06, + "loss": 0.1773, + "step": 12751 + }, + { + "epoch": 1.2012905960763995, + "grad_norm": 0.7232409715652466, + "learning_rate": 7.134700221239793e-06, + "loss": 0.2, + "step": 12752 + }, + { + "epoch": 1.2013848001695673, + "grad_norm": 0.6493895649909973, + "learning_rate": 7.133253548944181e-06, + "loss": 0.2054, + "step": 12753 + }, + { + "epoch": 1.2014790042627352, + "grad_norm": 0.6467964053153992, + "learning_rate": 7.131806942014825e-06, + "loss": 0.1916, + "step": 12754 + }, + { + "epoch": 1.201573208355903, + "grad_norm": 0.6178547143936157, + "learning_rate": 7.1303604004847085e-06, + "loss": 0.2295, + "step": 12755 + }, + { + "epoch": 1.201667412449071, + "grad_norm": 0.7066540122032166, + "learning_rate": 7.128913924386807e-06, + "loss": 0.2297, + "step": 12756 + }, + { + "epoch": 1.2017616165422387, + "grad_norm": 0.6456404328346252, + "learning_rate": 7.127467513754112e-06, + "loss": 0.2108, + "step": 12757 + }, + { + "epoch": 1.2018558206354066, + "grad_norm": 0.6754060387611389, + "learning_rate": 7.1260211686196035e-06, + "loss": 0.2083, + "step": 12758 + }, + { + "epoch": 1.2019500247285744, + "grad_norm": 0.6716510653495789, + "learning_rate": 7.124574889016254e-06, + "loss": 0.2238, + "step": 12759 + }, + { + "epoch": 1.2020442288217423, + "grad_norm": 0.6841642260551453, + "learning_rate": 7.1231286749770416e-06, + "loss": 0.1973, + "step": 12760 + }, + { + "epoch": 1.20213843291491, + "grad_norm": 0.5917529463768005, + "learning_rate": 7.1216825265349465e-06, + "loss": 0.2067, + "step": 12761 + }, + { + "epoch": 1.202232637008078, + "grad_norm": 0.6807586550712585, + "learning_rate": 7.120236443722941e-06, + "loss": 0.2049, + "step": 12762 + }, + { + "epoch": 1.2023268411012458, + "grad_norm": 0.6509501934051514, + "learning_rate": 7.118790426573997e-06, + "loss": 0.2004, + "step": 12763 + }, + { + "epoch": 1.2024210451944137, + "grad_norm": 0.675783097743988, + "learning_rate": 7.1173444751210885e-06, + "loss": 0.2322, + "step": 12764 + }, + { + "epoch": 1.2025152492875815, + "grad_norm": 0.6668880581855774, + "learning_rate": 7.115898589397185e-06, + "loss": 0.2457, + "step": 12765 + }, + { + "epoch": 1.2026094533807494, + "grad_norm": 0.6287373900413513, + "learning_rate": 7.114452769435252e-06, + "loss": 0.2001, + "step": 12766 + }, + { + "epoch": 1.2027036574739172, + "grad_norm": 0.6526023745536804, + "learning_rate": 7.1130070152682605e-06, + "loss": 0.1813, + "step": 12767 + }, + { + "epoch": 1.2027978615670851, + "grad_norm": 0.6208986043930054, + "learning_rate": 7.111561326929173e-06, + "loss": 0.1963, + "step": 12768 + }, + { + "epoch": 1.2028920656602529, + "grad_norm": 0.6641842126846313, + "learning_rate": 7.110115704450955e-06, + "loss": 0.1975, + "step": 12769 + }, + { + "epoch": 1.2029862697534208, + "grad_norm": 0.6185021996498108, + "learning_rate": 7.108670147866565e-06, + "loss": 0.1696, + "step": 12770 + }, + { + "epoch": 1.2030804738465886, + "grad_norm": 0.6088213920593262, + "learning_rate": 7.107224657208971e-06, + "loss": 0.2202, + "step": 12771 + }, + { + "epoch": 1.2031746779397565, + "grad_norm": 0.6018186211585999, + "learning_rate": 7.10577923251113e-06, + "loss": 0.2033, + "step": 12772 + }, + { + "epoch": 1.2032688820329243, + "grad_norm": 0.6003065705299377, + "learning_rate": 7.104333873805991e-06, + "loss": 0.1905, + "step": 12773 + }, + { + "epoch": 1.2033630861260922, + "grad_norm": 0.6481710076332092, + "learning_rate": 7.102888581126523e-06, + "loss": 0.2095, + "step": 12774 + }, + { + "epoch": 1.20345729021926, + "grad_norm": 0.7129554748535156, + "learning_rate": 7.1014433545056785e-06, + "loss": 0.2175, + "step": 12775 + }, + { + "epoch": 1.203551494312428, + "grad_norm": 0.5934905409812927, + "learning_rate": 7.099998193976401e-06, + "loss": 0.1825, + "step": 12776 + }, + { + "epoch": 1.2036456984055957, + "grad_norm": 0.6513676047325134, + "learning_rate": 7.098553099571654e-06, + "loss": 0.2241, + "step": 12777 + }, + { + "epoch": 1.2037399024987636, + "grad_norm": 0.7061363458633423, + "learning_rate": 7.097108071324386e-06, + "loss": 0.2314, + "step": 12778 + }, + { + "epoch": 1.2038341065919314, + "grad_norm": 0.708442747592926, + "learning_rate": 7.095663109267541e-06, + "loss": 0.2111, + "step": 12779 + }, + { + "epoch": 1.2039283106850993, + "grad_norm": 0.6820908188819885, + "learning_rate": 7.094218213434068e-06, + "loss": 0.2339, + "step": 12780 + }, + { + "epoch": 1.204022514778267, + "grad_norm": 0.6267956495285034, + "learning_rate": 7.092773383856913e-06, + "loss": 0.1869, + "step": 12781 + }, + { + "epoch": 1.204116718871435, + "grad_norm": 0.7776097655296326, + "learning_rate": 7.091328620569023e-06, + "loss": 0.1977, + "step": 12782 + }, + { + "epoch": 1.2042109229646027, + "grad_norm": 0.6159158945083618, + "learning_rate": 7.0898839236033355e-06, + "loss": 0.2093, + "step": 12783 + }, + { + "epoch": 1.2043051270577707, + "grad_norm": 0.7958076000213623, + "learning_rate": 7.088439292992798e-06, + "loss": 0.1876, + "step": 12784 + }, + { + "epoch": 1.2043993311509384, + "grad_norm": 0.732624351978302, + "learning_rate": 7.086994728770348e-06, + "loss": 0.2307, + "step": 12785 + }, + { + "epoch": 1.2044935352441064, + "grad_norm": 0.9008014798164368, + "learning_rate": 7.085550230968921e-06, + "loss": 0.2152, + "step": 12786 + }, + { + "epoch": 1.2045877393372741, + "grad_norm": 0.6954534649848938, + "learning_rate": 7.084105799621457e-06, + "loss": 0.2121, + "step": 12787 + }, + { + "epoch": 1.204681943430442, + "grad_norm": 0.5858094096183777, + "learning_rate": 7.08266143476089e-06, + "loss": 0.1702, + "step": 12788 + }, + { + "epoch": 1.2047761475236098, + "grad_norm": 0.6322008371353149, + "learning_rate": 7.081217136420155e-06, + "loss": 0.2233, + "step": 12789 + }, + { + "epoch": 1.2048703516167778, + "grad_norm": 0.7262994050979614, + "learning_rate": 7.079772904632181e-06, + "loss": 0.2287, + "step": 12790 + }, + { + "epoch": 1.2049645557099455, + "grad_norm": 0.7194874286651611, + "learning_rate": 7.078328739429903e-06, + "loss": 0.2145, + "step": 12791 + }, + { + "epoch": 1.2050587598031135, + "grad_norm": 0.6091543436050415, + "learning_rate": 7.076884640846251e-06, + "loss": 0.1949, + "step": 12792 + }, + { + "epoch": 1.2051529638962812, + "grad_norm": 0.5965726971626282, + "learning_rate": 7.075440608914143e-06, + "loss": 0.1778, + "step": 12793 + }, + { + "epoch": 1.2052471679894492, + "grad_norm": 0.768684446811676, + "learning_rate": 7.073996643666516e-06, + "loss": 0.1985, + "step": 12794 + }, + { + "epoch": 1.205341372082617, + "grad_norm": 0.6657359004020691, + "learning_rate": 7.072552745136293e-06, + "loss": 0.2113, + "step": 12795 + }, + { + "epoch": 1.205435576175785, + "grad_norm": 0.6386681199073792, + "learning_rate": 7.071108913356388e-06, + "loss": 0.1982, + "step": 12796 + }, + { + "epoch": 1.2055297802689526, + "grad_norm": 0.6536575555801392, + "learning_rate": 7.069665148359737e-06, + "loss": 0.2181, + "step": 12797 + }, + { + "epoch": 1.2056239843621206, + "grad_norm": 0.6982464790344238, + "learning_rate": 7.068221450179249e-06, + "loss": 0.2056, + "step": 12798 + }, + { + "epoch": 1.2057181884552883, + "grad_norm": 0.7056099772453308, + "learning_rate": 7.066777818847847e-06, + "loss": 0.2309, + "step": 12799 + }, + { + "epoch": 1.2058123925484563, + "grad_norm": 0.6622529029846191, + "learning_rate": 7.065334254398444e-06, + "loss": 0.1976, + "step": 12800 + }, + { + "epoch": 1.205906596641624, + "grad_norm": 0.7267027497291565, + "learning_rate": 7.063890756863961e-06, + "loss": 0.224, + "step": 12801 + }, + { + "epoch": 1.206000800734792, + "grad_norm": 0.6865708231925964, + "learning_rate": 7.06244732627731e-06, + "loss": 0.1954, + "step": 12802 + }, + { + "epoch": 1.2060950048279597, + "grad_norm": 0.7472881078720093, + "learning_rate": 7.061003962671401e-06, + "loss": 0.2098, + "step": 12803 + }, + { + "epoch": 1.2061892089211277, + "grad_norm": 0.58888179063797, + "learning_rate": 7.059560666079148e-06, + "loss": 0.1774, + "step": 12804 + }, + { + "epoch": 1.2062834130142954, + "grad_norm": 0.6867105960845947, + "learning_rate": 7.05811743653346e-06, + "loss": 0.2048, + "step": 12805 + }, + { + "epoch": 1.2063776171074634, + "grad_norm": 0.7627989053726196, + "learning_rate": 7.056674274067242e-06, + "loss": 0.2333, + "step": 12806 + }, + { + "epoch": 1.2064718212006311, + "grad_norm": 0.6554869413375854, + "learning_rate": 7.055231178713404e-06, + "loss": 0.2053, + "step": 12807 + }, + { + "epoch": 1.206566025293799, + "grad_norm": 0.6362926959991455, + "learning_rate": 7.05378815050485e-06, + "loss": 0.2228, + "step": 12808 + }, + { + "epoch": 1.2066602293869668, + "grad_norm": 0.62162184715271, + "learning_rate": 7.052345189474483e-06, + "loss": 0.2073, + "step": 12809 + }, + { + "epoch": 1.2067544334801348, + "grad_norm": 0.6365129351615906, + "learning_rate": 7.050902295655202e-06, + "loss": 0.1994, + "step": 12810 + }, + { + "epoch": 1.2068486375733025, + "grad_norm": 0.6029601693153381, + "learning_rate": 7.049459469079911e-06, + "loss": 0.1914, + "step": 12811 + }, + { + "epoch": 1.2069428416664705, + "grad_norm": 0.6337880492210388, + "learning_rate": 7.048016709781509e-06, + "loss": 0.2087, + "step": 12812 + }, + { + "epoch": 1.2070370457596382, + "grad_norm": 0.6876749396324158, + "learning_rate": 7.046574017792887e-06, + "loss": 0.1919, + "step": 12813 + }, + { + "epoch": 1.2071312498528062, + "grad_norm": 0.607933521270752, + "learning_rate": 7.045131393146947e-06, + "loss": 0.1916, + "step": 12814 + }, + { + "epoch": 1.207225453945974, + "grad_norm": 0.725727379322052, + "learning_rate": 7.043688835876583e-06, + "loss": 0.2104, + "step": 12815 + }, + { + "epoch": 1.2073196580391419, + "grad_norm": 0.626183032989502, + "learning_rate": 7.04224634601468e-06, + "loss": 0.2267, + "step": 12816 + }, + { + "epoch": 1.2074138621323096, + "grad_norm": 0.6115753650665283, + "learning_rate": 7.0408039235941415e-06, + "loss": 0.2166, + "step": 12817 + }, + { + "epoch": 1.2075080662254776, + "grad_norm": 0.6772529482841492, + "learning_rate": 7.039361568647847e-06, + "loss": 0.2137, + "step": 12818 + }, + { + "epoch": 1.2076022703186453, + "grad_norm": 0.5911723971366882, + "learning_rate": 7.0379192812086885e-06, + "loss": 0.2021, + "step": 12819 + }, + { + "epoch": 1.2076964744118133, + "grad_norm": 0.6694115400314331, + "learning_rate": 7.036477061309548e-06, + "loss": 0.2092, + "step": 12820 + }, + { + "epoch": 1.207790678504981, + "grad_norm": 0.6884358525276184, + "learning_rate": 7.035034908983317e-06, + "loss": 0.223, + "step": 12821 + }, + { + "epoch": 1.207884882598149, + "grad_norm": 0.6839742064476013, + "learning_rate": 7.033592824262875e-06, + "loss": 0.2128, + "step": 12822 + }, + { + "epoch": 1.2079790866913167, + "grad_norm": 0.671143651008606, + "learning_rate": 7.032150807181102e-06, + "loss": 0.2202, + "step": 12823 + }, + { + "epoch": 1.2080732907844847, + "grad_norm": 0.621971845626831, + "learning_rate": 7.030708857770883e-06, + "loss": 0.1729, + "step": 12824 + }, + { + "epoch": 1.2081674948776524, + "grad_norm": 0.7871326804161072, + "learning_rate": 7.029266976065092e-06, + "loss": 0.1986, + "step": 12825 + }, + { + "epoch": 1.2082616989708204, + "grad_norm": 0.6898002028465271, + "learning_rate": 7.027825162096609e-06, + "loss": 0.1771, + "step": 12826 + }, + { + "epoch": 1.208355903063988, + "grad_norm": 0.658204197883606, + "learning_rate": 7.0263834158983105e-06, + "loss": 0.2062, + "step": 12827 + }, + { + "epoch": 1.208450107157156, + "grad_norm": 0.6371948719024658, + "learning_rate": 7.024941737503067e-06, + "loss": 0.2236, + "step": 12828 + }, + { + "epoch": 1.2085443112503238, + "grad_norm": 0.6982995867729187, + "learning_rate": 7.023500126943754e-06, + "loss": 0.1935, + "step": 12829 + }, + { + "epoch": 1.2086385153434918, + "grad_norm": 0.6304657459259033, + "learning_rate": 7.02205858425324e-06, + "loss": 0.1816, + "step": 12830 + }, + { + "epoch": 1.2087327194366595, + "grad_norm": 0.7354641556739807, + "learning_rate": 7.020617109464397e-06, + "loss": 0.2334, + "step": 12831 + }, + { + "epoch": 1.2088269235298275, + "grad_norm": 0.6110231280326843, + "learning_rate": 7.019175702610095e-06, + "loss": 0.1985, + "step": 12832 + }, + { + "epoch": 1.2089211276229952, + "grad_norm": 0.5807055234909058, + "learning_rate": 7.017734363723189e-06, + "loss": 0.1814, + "step": 12833 + }, + { + "epoch": 1.2090153317161632, + "grad_norm": 0.5862298011779785, + "learning_rate": 7.016293092836556e-06, + "loss": 0.1934, + "step": 12834 + }, + { + "epoch": 1.209109535809331, + "grad_norm": 0.6219956874847412, + "learning_rate": 7.014851889983058e-06, + "loss": 0.1884, + "step": 12835 + }, + { + "epoch": 1.2092037399024989, + "grad_norm": 0.700542688369751, + "learning_rate": 7.013410755195547e-06, + "loss": 0.2139, + "step": 12836 + }, + { + "epoch": 1.2092979439956666, + "grad_norm": 0.6179965138435364, + "learning_rate": 7.011969688506894e-06, + "loss": 0.2129, + "step": 12837 + }, + { + "epoch": 1.2093921480888346, + "grad_norm": 0.7157810926437378, + "learning_rate": 7.010528689949954e-06, + "loss": 0.2378, + "step": 12838 + }, + { + "epoch": 1.2094863521820023, + "grad_norm": 0.6352458596229553, + "learning_rate": 7.009087759557581e-06, + "loss": 0.2309, + "step": 12839 + }, + { + "epoch": 1.2095805562751702, + "grad_norm": 0.6100900769233704, + "learning_rate": 7.007646897362632e-06, + "loss": 0.1816, + "step": 12840 + }, + { + "epoch": 1.209674760368338, + "grad_norm": 0.6023179888725281, + "learning_rate": 7.006206103397962e-06, + "loss": 0.2074, + "step": 12841 + }, + { + "epoch": 1.209768964461506, + "grad_norm": 0.7667863368988037, + "learning_rate": 7.004765377696424e-06, + "loss": 0.2076, + "step": 12842 + }, + { + "epoch": 1.2098631685546737, + "grad_norm": 0.6673808693885803, + "learning_rate": 7.003324720290865e-06, + "loss": 0.2148, + "step": 12843 + }, + { + "epoch": 1.2099573726478416, + "grad_norm": 0.690650224685669, + "learning_rate": 7.001884131214141e-06, + "loss": 0.2546, + "step": 12844 + }, + { + "epoch": 1.2100515767410094, + "grad_norm": 0.5930016040802002, + "learning_rate": 7.0004436104990925e-06, + "loss": 0.2089, + "step": 12845 + }, + { + "epoch": 1.2101457808341771, + "grad_norm": 0.6467576622962952, + "learning_rate": 6.999003158178568e-06, + "loss": 0.2155, + "step": 12846 + }, + { + "epoch": 1.210239984927345, + "grad_norm": 0.655545711517334, + "learning_rate": 6.997562774285413e-06, + "loss": 0.1981, + "step": 12847 + }, + { + "epoch": 1.210334189020513, + "grad_norm": 0.6148264408111572, + "learning_rate": 6.996122458852472e-06, + "loss": 0.1938, + "step": 12848 + }, + { + "epoch": 1.2104283931136808, + "grad_norm": 0.6048044562339783, + "learning_rate": 6.994682211912585e-06, + "loss": 0.2, + "step": 12849 + }, + { + "epoch": 1.2105225972068485, + "grad_norm": 0.6152675747871399, + "learning_rate": 6.993242033498589e-06, + "loss": 0.2074, + "step": 12850 + }, + { + "epoch": 1.2106168013000165, + "grad_norm": 0.9023056626319885, + "learning_rate": 6.991801923643324e-06, + "loss": 0.2084, + "step": 12851 + }, + { + "epoch": 1.2107110053931844, + "grad_norm": 0.6928418278694153, + "learning_rate": 6.990361882379633e-06, + "loss": 0.2043, + "step": 12852 + }, + { + "epoch": 1.2108052094863522, + "grad_norm": 0.6762405037879944, + "learning_rate": 6.988921909740338e-06, + "loss": 0.2149, + "step": 12853 + }, + { + "epoch": 1.21089941357952, + "grad_norm": 0.7207145094871521, + "learning_rate": 6.987482005758284e-06, + "loss": 0.2114, + "step": 12854 + }, + { + "epoch": 1.2109936176726879, + "grad_norm": 0.6876195073127747, + "learning_rate": 6.986042170466301e-06, + "loss": 0.1991, + "step": 12855 + }, + { + "epoch": 1.2110878217658558, + "grad_norm": 0.7374923229217529, + "learning_rate": 6.9846024038972115e-06, + "loss": 0.212, + "step": 12856 + }, + { + "epoch": 1.2111820258590236, + "grad_norm": 0.6431059837341309, + "learning_rate": 6.983162706083858e-06, + "loss": 0.1802, + "step": 12857 + }, + { + "epoch": 1.2112762299521913, + "grad_norm": 0.6324191093444824, + "learning_rate": 6.981723077059057e-06, + "loss": 0.1873, + "step": 12858 + }, + { + "epoch": 1.2113704340453593, + "grad_norm": 0.6531680226325989, + "learning_rate": 6.9802835168556395e-06, + "loss": 0.188, + "step": 12859 + }, + { + "epoch": 1.2114646381385272, + "grad_norm": 0.6321462392807007, + "learning_rate": 6.978844025506424e-06, + "loss": 0.1879, + "step": 12860 + }, + { + "epoch": 1.211558842231695, + "grad_norm": 0.6635614633560181, + "learning_rate": 6.97740460304424e-06, + "loss": 0.1584, + "step": 12861 + }, + { + "epoch": 1.2116530463248627, + "grad_norm": 1.053592562675476, + "learning_rate": 6.975965249501906e-06, + "loss": 0.2125, + "step": 12862 + }, + { + "epoch": 1.2117472504180307, + "grad_norm": 0.6097428798675537, + "learning_rate": 6.974525964912238e-06, + "loss": 0.185, + "step": 12863 + }, + { + "epoch": 1.2118414545111986, + "grad_norm": 0.6370858550071716, + "learning_rate": 6.97308674930806e-06, + "loss": 0.1891, + "step": 12864 + }, + { + "epoch": 1.2119356586043664, + "grad_norm": 0.8104294538497925, + "learning_rate": 6.9716476027221845e-06, + "loss": 0.227, + "step": 12865 + }, + { + "epoch": 1.212029862697534, + "grad_norm": 0.6168524622917175, + "learning_rate": 6.970208525187425e-06, + "loss": 0.2064, + "step": 12866 + }, + { + "epoch": 1.212124066790702, + "grad_norm": 0.6625261902809143, + "learning_rate": 6.9687695167366e-06, + "loss": 0.2277, + "step": 12867 + }, + { + "epoch": 1.21221827088387, + "grad_norm": 0.7567079663276672, + "learning_rate": 6.967330577402516e-06, + "loss": 0.2451, + "step": 12868 + }, + { + "epoch": 1.2123124749770378, + "grad_norm": 1.0120052099227905, + "learning_rate": 6.965891707217989e-06, + "loss": 0.2097, + "step": 12869 + }, + { + "epoch": 1.2124066790702055, + "grad_norm": 0.6066929697990417, + "learning_rate": 6.964452906215815e-06, + "loss": 0.1947, + "step": 12870 + }, + { + "epoch": 1.2125008831633735, + "grad_norm": 0.6516382694244385, + "learning_rate": 6.963014174428815e-06, + "loss": 0.2025, + "step": 12871 + }, + { + "epoch": 1.2125950872565414, + "grad_norm": 0.605207085609436, + "learning_rate": 6.961575511889791e-06, + "loss": 0.1864, + "step": 12872 + }, + { + "epoch": 1.2126892913497092, + "grad_norm": 0.644787073135376, + "learning_rate": 6.960136918631537e-06, + "loss": 0.1974, + "step": 12873 + }, + { + "epoch": 1.212783495442877, + "grad_norm": 0.7836694121360779, + "learning_rate": 6.9586983946868665e-06, + "loss": 0.2035, + "step": 12874 + }, + { + "epoch": 1.2128776995360449, + "grad_norm": 0.6779301762580872, + "learning_rate": 6.9572599400885796e-06, + "loss": 0.2137, + "step": 12875 + }, + { + "epoch": 1.2129719036292128, + "grad_norm": 0.6337977051734924, + "learning_rate": 6.9558215548694645e-06, + "loss": 0.2075, + "step": 12876 + }, + { + "epoch": 1.2130661077223805, + "grad_norm": 0.6754133701324463, + "learning_rate": 6.954383239062332e-06, + "loss": 0.187, + "step": 12877 + }, + { + "epoch": 1.2131603118155483, + "grad_norm": 0.665571391582489, + "learning_rate": 6.952944992699971e-06, + "loss": 0.2194, + "step": 12878 + }, + { + "epoch": 1.2132545159087162, + "grad_norm": 0.6134076118469238, + "learning_rate": 6.9515068158151745e-06, + "loss": 0.2023, + "step": 12879 + }, + { + "epoch": 1.2133487200018842, + "grad_norm": 0.6333722472190857, + "learning_rate": 6.950068708440737e-06, + "loss": 0.1778, + "step": 12880 + }, + { + "epoch": 1.213442924095052, + "grad_norm": 0.6231043338775635, + "learning_rate": 6.948630670609451e-06, + "loss": 0.2145, + "step": 12881 + }, + { + "epoch": 1.2135371281882197, + "grad_norm": 0.6057214140892029, + "learning_rate": 6.947192702354104e-06, + "loss": 0.1738, + "step": 12882 + }, + { + "epoch": 1.2136313322813876, + "grad_norm": 0.6275814175605774, + "learning_rate": 6.945754803707484e-06, + "loss": 0.2013, + "step": 12883 + }, + { + "epoch": 1.2137255363745554, + "grad_norm": 0.6310911774635315, + "learning_rate": 6.944316974702379e-06, + "loss": 0.1786, + "step": 12884 + }, + { + "epoch": 1.2138197404677233, + "grad_norm": 0.6078299283981323, + "learning_rate": 6.9428792153715744e-06, + "loss": 0.1735, + "step": 12885 + }, + { + "epoch": 1.213913944560891, + "grad_norm": 0.6392536163330078, + "learning_rate": 6.941441525747847e-06, + "loss": 0.204, + "step": 12886 + }, + { + "epoch": 1.214008148654059, + "grad_norm": 0.7176835536956787, + "learning_rate": 6.940003905863986e-06, + "loss": 0.2473, + "step": 12887 + }, + { + "epoch": 1.2141023527472268, + "grad_norm": 0.7947714924812317, + "learning_rate": 6.938566355752769e-06, + "loss": 0.2024, + "step": 12888 + }, + { + "epoch": 1.2141965568403947, + "grad_norm": 0.633015513420105, + "learning_rate": 6.937128875446975e-06, + "loss": 0.1943, + "step": 12889 + }, + { + "epoch": 1.2142907609335625, + "grad_norm": 0.6805613040924072, + "learning_rate": 6.935691464979374e-06, + "loss": 0.2059, + "step": 12890 + }, + { + "epoch": 1.2143849650267304, + "grad_norm": 0.7668185830116272, + "learning_rate": 6.93425412438275e-06, + "loss": 0.2454, + "step": 12891 + }, + { + "epoch": 1.2144791691198982, + "grad_norm": 0.6264731884002686, + "learning_rate": 6.932816853689875e-06, + "loss": 0.1765, + "step": 12892 + }, + { + "epoch": 1.2145733732130661, + "grad_norm": 0.618474006652832, + "learning_rate": 6.931379652933514e-06, + "loss": 0.1966, + "step": 12893 + }, + { + "epoch": 1.2146675773062339, + "grad_norm": 0.7000207901000977, + "learning_rate": 6.929942522146446e-06, + "loss": 0.2274, + "step": 12894 + }, + { + "epoch": 1.2147617813994018, + "grad_norm": 0.6906728148460388, + "learning_rate": 6.928505461361439e-06, + "loss": 0.197, + "step": 12895 + }, + { + "epoch": 1.2148559854925696, + "grad_norm": 0.6946936249732971, + "learning_rate": 6.9270684706112515e-06, + "loss": 0.1914, + "step": 12896 + }, + { + "epoch": 1.2149501895857375, + "grad_norm": 0.6604578495025635, + "learning_rate": 6.925631549928662e-06, + "loss": 0.201, + "step": 12897 + }, + { + "epoch": 1.2150443936789053, + "grad_norm": 0.5688307285308838, + "learning_rate": 6.924194699346425e-06, + "loss": 0.1949, + "step": 12898 + }, + { + "epoch": 1.2151385977720732, + "grad_norm": 0.5732439160346985, + "learning_rate": 6.922757918897305e-06, + "loss": 0.1829, + "step": 12899 + }, + { + "epoch": 1.215232801865241, + "grad_norm": 0.6947095394134521, + "learning_rate": 6.9213212086140624e-06, + "loss": 0.2043, + "step": 12900 + }, + { + "epoch": 1.215327005958409, + "grad_norm": 0.6432653665542603, + "learning_rate": 6.9198845685294595e-06, + "loss": 0.2041, + "step": 12901 + }, + { + "epoch": 1.2154212100515767, + "grad_norm": 0.6288147568702698, + "learning_rate": 6.918447998676252e-06, + "loss": 0.1804, + "step": 12902 + }, + { + "epoch": 1.2155154141447446, + "grad_norm": 0.6071980595588684, + "learning_rate": 6.917011499087193e-06, + "loss": 0.1827, + "step": 12903 + }, + { + "epoch": 1.2156096182379124, + "grad_norm": 0.610538899898529, + "learning_rate": 6.915575069795042e-06, + "loss": 0.2186, + "step": 12904 + }, + { + "epoch": 1.2157038223310803, + "grad_norm": 0.681826114654541, + "learning_rate": 6.91413871083255e-06, + "loss": 0.2009, + "step": 12905 + }, + { + "epoch": 1.215798026424248, + "grad_norm": 0.6242051720619202, + "learning_rate": 6.912702422232466e-06, + "loss": 0.1945, + "step": 12906 + }, + { + "epoch": 1.215892230517416, + "grad_norm": 0.6371919512748718, + "learning_rate": 6.911266204027542e-06, + "loss": 0.1884, + "step": 12907 + }, + { + "epoch": 1.2159864346105838, + "grad_norm": 0.6108184456825256, + "learning_rate": 6.909830056250527e-06, + "loss": 0.1836, + "step": 12908 + }, + { + "epoch": 1.2160806387037517, + "grad_norm": 0.6858425140380859, + "learning_rate": 6.908393978934163e-06, + "loss": 0.202, + "step": 12909 + }, + { + "epoch": 1.2161748427969195, + "grad_norm": 0.6188270449638367, + "learning_rate": 6.906957972111199e-06, + "loss": 0.2185, + "step": 12910 + }, + { + "epoch": 1.2162690468900874, + "grad_norm": 0.7016656994819641, + "learning_rate": 6.905522035814378e-06, + "loss": 0.2181, + "step": 12911 + }, + { + "epoch": 1.2163632509832552, + "grad_norm": 0.6962555050849915, + "learning_rate": 6.9040861700764415e-06, + "loss": 0.2095, + "step": 12912 + }, + { + "epoch": 1.216457455076423, + "grad_norm": 0.6605420112609863, + "learning_rate": 6.902650374930122e-06, + "loss": 0.2255, + "step": 12913 + }, + { + "epoch": 1.2165516591695908, + "grad_norm": 0.6797463893890381, + "learning_rate": 6.901214650408171e-06, + "loss": 0.1823, + "step": 12914 + }, + { + "epoch": 1.2166458632627588, + "grad_norm": 0.6401837468147278, + "learning_rate": 6.899778996543316e-06, + "loss": 0.2159, + "step": 12915 + }, + { + "epoch": 1.2167400673559265, + "grad_norm": 0.6821116209030151, + "learning_rate": 6.89834341336829e-06, + "loss": 0.2272, + "step": 12916 + }, + { + "epoch": 1.2168342714490945, + "grad_norm": 0.6074873208999634, + "learning_rate": 6.896907900915837e-06, + "loss": 0.2048, + "step": 12917 + }, + { + "epoch": 1.2169284755422622, + "grad_norm": 0.5764397978782654, + "learning_rate": 6.8954724592186815e-06, + "loss": 0.2006, + "step": 12918 + }, + { + "epoch": 1.2170226796354302, + "grad_norm": 0.6415478587150574, + "learning_rate": 6.894037088309551e-06, + "loss": 0.2061, + "step": 12919 + }, + { + "epoch": 1.217116883728598, + "grad_norm": 0.6077975630760193, + "learning_rate": 6.892601788221185e-06, + "loss": 0.2006, + "step": 12920 + }, + { + "epoch": 1.217211087821766, + "grad_norm": 0.6001858115196228, + "learning_rate": 6.8911665589863e-06, + "loss": 0.1894, + "step": 12921 + }, + { + "epoch": 1.2173052919149336, + "grad_norm": 0.6244654059410095, + "learning_rate": 6.889731400637627e-06, + "loss": 0.1916, + "step": 12922 + }, + { + "epoch": 1.2173994960081016, + "grad_norm": 0.629303514957428, + "learning_rate": 6.888296313207885e-06, + "loss": 0.2089, + "step": 12923 + }, + { + "epoch": 1.2174937001012693, + "grad_norm": 0.5962958931922913, + "learning_rate": 6.886861296729803e-06, + "loss": 0.2066, + "step": 12924 + }, + { + "epoch": 1.2175879041944373, + "grad_norm": 0.7022969126701355, + "learning_rate": 6.885426351236097e-06, + "loss": 0.2109, + "step": 12925 + }, + { + "epoch": 1.217682108287605, + "grad_norm": 0.6517074704170227, + "learning_rate": 6.883991476759484e-06, + "loss": 0.2242, + "step": 12926 + }, + { + "epoch": 1.217776312380773, + "grad_norm": 0.6240322589874268, + "learning_rate": 6.882556673332687e-06, + "loss": 0.2162, + "step": 12927 + }, + { + "epoch": 1.2178705164739407, + "grad_norm": 0.7480376958847046, + "learning_rate": 6.88112194098842e-06, + "loss": 0.2004, + "step": 12928 + }, + { + "epoch": 1.2179647205671087, + "grad_norm": 0.6312969923019409, + "learning_rate": 6.8796872797593935e-06, + "loss": 0.224, + "step": 12929 + }, + { + "epoch": 1.2180589246602764, + "grad_norm": 0.6434739828109741, + "learning_rate": 6.878252689678326e-06, + "loss": 0.2132, + "step": 12930 + }, + { + "epoch": 1.2181531287534444, + "grad_norm": 0.6692473292350769, + "learning_rate": 6.876818170777924e-06, + "loss": 0.2147, + "step": 12931 + }, + { + "epoch": 1.2182473328466121, + "grad_norm": 0.6293768882751465, + "learning_rate": 6.875383723090898e-06, + "loss": 0.1832, + "step": 12932 + }, + { + "epoch": 1.21834153693978, + "grad_norm": 0.6315598487854004, + "learning_rate": 6.873949346649951e-06, + "loss": 0.1979, + "step": 12933 + }, + { + "epoch": 1.2184357410329478, + "grad_norm": 0.6284828186035156, + "learning_rate": 6.872515041487799e-06, + "loss": 0.2065, + "step": 12934 + }, + { + "epoch": 1.2185299451261158, + "grad_norm": 0.5833081603050232, + "learning_rate": 6.87108080763714e-06, + "loss": 0.1788, + "step": 12935 + }, + { + "epoch": 1.2186241492192835, + "grad_norm": 0.7753840684890747, + "learning_rate": 6.869646645130673e-06, + "loss": 0.2163, + "step": 12936 + }, + { + "epoch": 1.2187183533124515, + "grad_norm": 0.7362518310546875, + "learning_rate": 6.86821255400111e-06, + "loss": 0.2166, + "step": 12937 + }, + { + "epoch": 1.2188125574056192, + "grad_norm": 0.6860774159431458, + "learning_rate": 6.866778534281141e-06, + "loss": 0.2301, + "step": 12938 + }, + { + "epoch": 1.2189067614987872, + "grad_norm": 0.6610662937164307, + "learning_rate": 6.865344586003464e-06, + "loss": 0.1869, + "step": 12939 + }, + { + "epoch": 1.219000965591955, + "grad_norm": 0.6065652370452881, + "learning_rate": 6.863910709200784e-06, + "loss": 0.1961, + "step": 12940 + }, + { + "epoch": 1.2190951696851229, + "grad_norm": 0.6814299821853638, + "learning_rate": 6.862476903905788e-06, + "loss": 0.2068, + "step": 12941 + }, + { + "epoch": 1.2191893737782906, + "grad_norm": 0.7525548934936523, + "learning_rate": 6.8610431701511705e-06, + "loss": 0.24, + "step": 12942 + }, + { + "epoch": 1.2192835778714586, + "grad_norm": 0.535211980342865, + "learning_rate": 6.859609507969621e-06, + "loss": 0.1864, + "step": 12943 + }, + { + "epoch": 1.2193777819646263, + "grad_norm": 0.8989139199256897, + "learning_rate": 6.858175917393834e-06, + "loss": 0.2182, + "step": 12944 + }, + { + "epoch": 1.2194719860577943, + "grad_norm": 0.6886752843856812, + "learning_rate": 6.8567423984564955e-06, + "loss": 0.2273, + "step": 12945 + }, + { + "epoch": 1.219566190150962, + "grad_norm": 0.6629517674446106, + "learning_rate": 6.8553089511902896e-06, + "loss": 0.2219, + "step": 12946 + }, + { + "epoch": 1.21966039424413, + "grad_norm": 0.6721383333206177, + "learning_rate": 6.853875575627903e-06, + "loss": 0.1996, + "step": 12947 + }, + { + "epoch": 1.2197545983372977, + "grad_norm": 0.7022584676742554, + "learning_rate": 6.85244227180202e-06, + "loss": 0.2102, + "step": 12948 + }, + { + "epoch": 1.2198488024304657, + "grad_norm": 0.7004144787788391, + "learning_rate": 6.85100903974532e-06, + "loss": 0.2159, + "step": 12949 + }, + { + "epoch": 1.2199430065236334, + "grad_norm": 0.7221656441688538, + "learning_rate": 6.8495758794904845e-06, + "loss": 0.2281, + "step": 12950 + }, + { + "epoch": 1.2200372106168014, + "grad_norm": 0.5384569764137268, + "learning_rate": 6.8481427910701915e-06, + "loss": 0.1882, + "step": 12951 + }, + { + "epoch": 1.220131414709969, + "grad_norm": 0.738702654838562, + "learning_rate": 6.84670977451712e-06, + "loss": 0.2285, + "step": 12952 + }, + { + "epoch": 1.220225618803137, + "grad_norm": 0.6407803893089294, + "learning_rate": 6.845276829863935e-06, + "loss": 0.2044, + "step": 12953 + }, + { + "epoch": 1.2203198228963048, + "grad_norm": 0.6032186150550842, + "learning_rate": 6.843843957143324e-06, + "loss": 0.2278, + "step": 12954 + }, + { + "epoch": 1.2204140269894728, + "grad_norm": 0.6539852619171143, + "learning_rate": 6.842411156387949e-06, + "loss": 0.198, + "step": 12955 + }, + { + "epoch": 1.2205082310826405, + "grad_norm": 0.6179254651069641, + "learning_rate": 6.8409784276304805e-06, + "loss": 0.1949, + "step": 12956 + }, + { + "epoch": 1.2206024351758085, + "grad_norm": 0.7868675589561462, + "learning_rate": 6.839545770903595e-06, + "loss": 0.2299, + "step": 12957 + }, + { + "epoch": 1.2206966392689762, + "grad_norm": 0.5924388766288757, + "learning_rate": 6.838113186239951e-06, + "loss": 0.1879, + "step": 12958 + }, + { + "epoch": 1.2207908433621442, + "grad_norm": 0.6939521431922913, + "learning_rate": 6.836680673672214e-06, + "loss": 0.2126, + "step": 12959 + }, + { + "epoch": 1.220885047455312, + "grad_norm": 0.5712323784828186, + "learning_rate": 6.835248233233052e-06, + "loss": 0.1882, + "step": 12960 + }, + { + "epoch": 1.2209792515484799, + "grad_norm": 0.6114972829818726, + "learning_rate": 6.833815864955126e-06, + "loss": 0.1519, + "step": 12961 + }, + { + "epoch": 1.2210734556416476, + "grad_norm": 0.6172329187393188, + "learning_rate": 6.832383568871093e-06, + "loss": 0.2085, + "step": 12962 + }, + { + "epoch": 1.2211676597348156, + "grad_norm": 0.6652126908302307, + "learning_rate": 6.830951345013612e-06, + "loss": 0.1959, + "step": 12963 + }, + { + "epoch": 1.2212618638279833, + "grad_norm": 0.6360315680503845, + "learning_rate": 6.8295191934153435e-06, + "loss": 0.1829, + "step": 12964 + }, + { + "epoch": 1.2213560679211513, + "grad_norm": 0.6886136531829834, + "learning_rate": 6.8280871141089415e-06, + "loss": 0.2186, + "step": 12965 + }, + { + "epoch": 1.221450272014319, + "grad_norm": 0.6091665625572205, + "learning_rate": 6.826655107127056e-06, + "loss": 0.1843, + "step": 12966 + }, + { + "epoch": 1.221544476107487, + "grad_norm": 0.7013200521469116, + "learning_rate": 6.825223172502344e-06, + "loss": 0.2184, + "step": 12967 + }, + { + "epoch": 1.2216386802006547, + "grad_norm": 0.6541852355003357, + "learning_rate": 6.823791310267454e-06, + "loss": 0.2055, + "step": 12968 + }, + { + "epoch": 1.2217328842938227, + "grad_norm": 0.6683152914047241, + "learning_rate": 6.822359520455031e-06, + "loss": 0.2137, + "step": 12969 + }, + { + "epoch": 1.2218270883869904, + "grad_norm": 0.683961033821106, + "learning_rate": 6.820927803097728e-06, + "loss": 0.2301, + "step": 12970 + }, + { + "epoch": 1.2219212924801583, + "grad_norm": 0.6960816383361816, + "learning_rate": 6.819496158228187e-06, + "loss": 0.241, + "step": 12971 + }, + { + "epoch": 1.222015496573326, + "grad_norm": 0.6431972980499268, + "learning_rate": 6.818064585879055e-06, + "loss": 0.19, + "step": 12972 + }, + { + "epoch": 1.222109700666494, + "grad_norm": 0.7428820729255676, + "learning_rate": 6.816633086082964e-06, + "loss": 0.2434, + "step": 12973 + }, + { + "epoch": 1.2222039047596618, + "grad_norm": 0.6971383690834045, + "learning_rate": 6.8152016588725704e-06, + "loss": 0.2148, + "step": 12974 + }, + { + "epoch": 1.2222981088528297, + "grad_norm": 0.6649903059005737, + "learning_rate": 6.813770304280501e-06, + "loss": 0.1937, + "step": 12975 + }, + { + "epoch": 1.2223923129459975, + "grad_norm": 0.546851396560669, + "learning_rate": 6.812339022339391e-06, + "loss": 0.2076, + "step": 12976 + }, + { + "epoch": 1.2224865170391654, + "grad_norm": 0.7050590515136719, + "learning_rate": 6.810907813081888e-06, + "loss": 0.211, + "step": 12977 + }, + { + "epoch": 1.2225807211323332, + "grad_norm": 0.6709058880805969, + "learning_rate": 6.809476676540618e-06, + "loss": 0.2297, + "step": 12978 + }, + { + "epoch": 1.2226749252255011, + "grad_norm": 0.5935243964195251, + "learning_rate": 6.808045612748211e-06, + "loss": 0.1976, + "step": 12979 + }, + { + "epoch": 1.2227691293186689, + "grad_norm": 0.610144317150116, + "learning_rate": 6.806614621737303e-06, + "loss": 0.1971, + "step": 12980 + }, + { + "epoch": 1.2228633334118368, + "grad_norm": 0.6993465423583984, + "learning_rate": 6.80518370354052e-06, + "loss": 0.2394, + "step": 12981 + }, + { + "epoch": 1.2229575375050046, + "grad_norm": 0.6231632232666016, + "learning_rate": 6.803752858190489e-06, + "loss": 0.1884, + "step": 12982 + }, + { + "epoch": 1.2230517415981725, + "grad_norm": 0.8581657409667969, + "learning_rate": 6.8023220857198345e-06, + "loss": 0.236, + "step": 12983 + }, + { + "epoch": 1.2231459456913403, + "grad_norm": 0.6415079236030579, + "learning_rate": 6.800891386161184e-06, + "loss": 0.197, + "step": 12984 + }, + { + "epoch": 1.223240149784508, + "grad_norm": 0.7081721425056458, + "learning_rate": 6.7994607595471565e-06, + "loss": 0.2041, + "step": 12985 + }, + { + "epoch": 1.223334353877676, + "grad_norm": 0.6858395934104919, + "learning_rate": 6.798030205910373e-06, + "loss": 0.2151, + "step": 12986 + }, + { + "epoch": 1.223428557970844, + "grad_norm": 0.6121916770935059, + "learning_rate": 6.796599725283453e-06, + "loss": 0.1799, + "step": 12987 + }, + { + "epoch": 1.2235227620640117, + "grad_norm": 0.6130219101905823, + "learning_rate": 6.795169317699014e-06, + "loss": 0.1897, + "step": 12988 + }, + { + "epoch": 1.2236169661571794, + "grad_norm": 0.7238254547119141, + "learning_rate": 6.793738983189668e-06, + "loss": 0.2447, + "step": 12989 + }, + { + "epoch": 1.2237111702503474, + "grad_norm": 0.664619505405426, + "learning_rate": 6.792308721788035e-06, + "loss": 0.195, + "step": 12990 + }, + { + "epoch": 1.2238053743435153, + "grad_norm": 0.6615363359451294, + "learning_rate": 6.7908785335267245e-06, + "loss": 0.2067, + "step": 12991 + }, + { + "epoch": 1.223899578436683, + "grad_norm": 0.6361871957778931, + "learning_rate": 6.789448418438348e-06, + "loss": 0.2296, + "step": 12992 + }, + { + "epoch": 1.2239937825298508, + "grad_norm": 0.6958268880844116, + "learning_rate": 6.788018376555506e-06, + "loss": 0.2273, + "step": 12993 + }, + { + "epoch": 1.2240879866230188, + "grad_norm": 0.6189342141151428, + "learning_rate": 6.786588407910819e-06, + "loss": 0.2197, + "step": 12994 + }, + { + "epoch": 1.2241821907161867, + "grad_norm": 0.7021915316581726, + "learning_rate": 6.785158512536884e-06, + "loss": 0.203, + "step": 12995 + }, + { + "epoch": 1.2242763948093545, + "grad_norm": 0.7075203061103821, + "learning_rate": 6.783728690466302e-06, + "loss": 0.229, + "step": 12996 + }, + { + "epoch": 1.2243705989025222, + "grad_norm": 1.0234602689743042, + "learning_rate": 6.782298941731686e-06, + "loss": 0.2225, + "step": 12997 + }, + { + "epoch": 1.2244648029956902, + "grad_norm": 0.6870042681694031, + "learning_rate": 6.780869266365629e-06, + "loss": 0.2115, + "step": 12998 + }, + { + "epoch": 1.2245590070888581, + "grad_norm": 0.6313785910606384, + "learning_rate": 6.77943966440073e-06, + "loss": 0.1857, + "step": 12999 + }, + { + "epoch": 1.2246532111820259, + "grad_norm": 0.6202945709228516, + "learning_rate": 6.778010135869588e-06, + "loss": 0.2032, + "step": 13000 + }, + { + "epoch": 1.2247474152751936, + "grad_norm": 0.6480250358581543, + "learning_rate": 6.776580680804799e-06, + "loss": 0.2405, + "step": 13001 + }, + { + "epoch": 1.2248416193683616, + "grad_norm": 0.6614483594894409, + "learning_rate": 6.7751512992389535e-06, + "loss": 0.1711, + "step": 13002 + }, + { + "epoch": 1.2249358234615295, + "grad_norm": 0.6586751341819763, + "learning_rate": 6.773721991204646e-06, + "loss": 0.2226, + "step": 13003 + }, + { + "epoch": 1.2250300275546973, + "grad_norm": 0.6510355472564697, + "learning_rate": 6.7722927567344664e-06, + "loss": 0.2223, + "step": 13004 + }, + { + "epoch": 1.225124231647865, + "grad_norm": 0.6354719996452332, + "learning_rate": 6.770863595861006e-06, + "loss": 0.2112, + "step": 13005 + }, + { + "epoch": 1.225218435741033, + "grad_norm": 0.6637195348739624, + "learning_rate": 6.769434508616846e-06, + "loss": 0.1985, + "step": 13006 + }, + { + "epoch": 1.225312639834201, + "grad_norm": 0.6359204649925232, + "learning_rate": 6.768005495034577e-06, + "loss": 0.2042, + "step": 13007 + }, + { + "epoch": 1.2254068439273686, + "grad_norm": 0.659364640712738, + "learning_rate": 6.7665765551467835e-06, + "loss": 0.2073, + "step": 13008 + }, + { + "epoch": 1.2255010480205364, + "grad_norm": 0.6522590517997742, + "learning_rate": 6.765147688986041e-06, + "loss": 0.1839, + "step": 13009 + }, + { + "epoch": 1.2255952521137043, + "grad_norm": 0.6877027153968811, + "learning_rate": 6.7637188965849365e-06, + "loss": 0.2395, + "step": 13010 + }, + { + "epoch": 1.2256894562068723, + "grad_norm": 0.6307399272918701, + "learning_rate": 6.762290177976046e-06, + "loss": 0.1898, + "step": 13011 + }, + { + "epoch": 1.22578366030004, + "grad_norm": 0.645378589630127, + "learning_rate": 6.7608615331919496e-06, + "loss": 0.2109, + "step": 13012 + }, + { + "epoch": 1.2258778643932078, + "grad_norm": 0.6725184917449951, + "learning_rate": 6.759432962265214e-06, + "loss": 0.2086, + "step": 13013 + }, + { + "epoch": 1.2259720684863757, + "grad_norm": 0.7793787121772766, + "learning_rate": 6.758004465228423e-06, + "loss": 0.1951, + "step": 13014 + }, + { + "epoch": 1.2260662725795437, + "grad_norm": 0.6769189238548279, + "learning_rate": 6.756576042114143e-06, + "loss": 0.1976, + "step": 13015 + }, + { + "epoch": 1.2261604766727114, + "grad_norm": 0.6077859401702881, + "learning_rate": 6.7551476929549396e-06, + "loss": 0.2024, + "step": 13016 + }, + { + "epoch": 1.2262546807658792, + "grad_norm": 0.5559185743331909, + "learning_rate": 6.753719417783394e-06, + "loss": 0.1905, + "step": 13017 + }, + { + "epoch": 1.2263488848590471, + "grad_norm": 0.7667734026908875, + "learning_rate": 6.752291216632065e-06, + "loss": 0.2025, + "step": 13018 + }, + { + "epoch": 1.2264430889522149, + "grad_norm": 0.6821576952934265, + "learning_rate": 6.750863089533516e-06, + "loss": 0.1975, + "step": 13019 + }, + { + "epoch": 1.2265372930453828, + "grad_norm": 0.6921886205673218, + "learning_rate": 6.749435036520315e-06, + "loss": 0.1745, + "step": 13020 + }, + { + "epoch": 1.2266314971385506, + "grad_norm": 0.658531904220581, + "learning_rate": 6.748007057625023e-06, + "loss": 0.2207, + "step": 13021 + }, + { + "epoch": 1.2267257012317185, + "grad_norm": 0.6247740387916565, + "learning_rate": 6.746579152880201e-06, + "loss": 0.2025, + "step": 13022 + }, + { + "epoch": 1.2268199053248863, + "grad_norm": 0.6983477473258972, + "learning_rate": 6.745151322318402e-06, + "loss": 0.2195, + "step": 13023 + }, + { + "epoch": 1.2269141094180542, + "grad_norm": 0.70487380027771, + "learning_rate": 6.743723565972189e-06, + "loss": 0.2205, + "step": 13024 + }, + { + "epoch": 1.227008313511222, + "grad_norm": 0.6091448068618774, + "learning_rate": 6.742295883874114e-06, + "loss": 0.1991, + "step": 13025 + }, + { + "epoch": 1.22710251760439, + "grad_norm": 0.6577531099319458, + "learning_rate": 6.740868276056729e-06, + "loss": 0.1923, + "step": 13026 + }, + { + "epoch": 1.2271967216975577, + "grad_norm": 0.5893779397010803, + "learning_rate": 6.73944074255259e-06, + "loss": 0.1886, + "step": 13027 + }, + { + "epoch": 1.2272909257907256, + "grad_norm": 0.708914577960968, + "learning_rate": 6.738013283394244e-06, + "loss": 0.2096, + "step": 13028 + }, + { + "epoch": 1.2273851298838934, + "grad_norm": 0.7507368922233582, + "learning_rate": 6.73658589861424e-06, + "loss": 0.2191, + "step": 13029 + }, + { + "epoch": 1.2274793339770613, + "grad_norm": 0.6944422125816345, + "learning_rate": 6.735158588245125e-06, + "loss": 0.2026, + "step": 13030 + }, + { + "epoch": 1.227573538070229, + "grad_norm": 0.6723152995109558, + "learning_rate": 6.733731352319446e-06, + "loss": 0.226, + "step": 13031 + }, + { + "epoch": 1.227667742163397, + "grad_norm": 0.5919938087463379, + "learning_rate": 6.73230419086974e-06, + "loss": 0.1834, + "step": 13032 + }, + { + "epoch": 1.2277619462565648, + "grad_norm": 0.6095983386039734, + "learning_rate": 6.7308771039285496e-06, + "loss": 0.1924, + "step": 13033 + }, + { + "epoch": 1.2278561503497327, + "grad_norm": 0.6341801881790161, + "learning_rate": 6.729450091528422e-06, + "loss": 0.2097, + "step": 13034 + }, + { + "epoch": 1.2279503544429005, + "grad_norm": 0.6179428100585938, + "learning_rate": 6.728023153701889e-06, + "loss": 0.1781, + "step": 13035 + }, + { + "epoch": 1.2280445585360684, + "grad_norm": 0.6255068182945251, + "learning_rate": 6.726596290481484e-06, + "loss": 0.2111, + "step": 13036 + }, + { + "epoch": 1.2281387626292362, + "grad_norm": 0.5949823260307312, + "learning_rate": 6.725169501899752e-06, + "loss": 0.2022, + "step": 13037 + }, + { + "epoch": 1.2282329667224041, + "grad_norm": 0.6200029850006104, + "learning_rate": 6.7237427879892184e-06, + "loss": 0.1996, + "step": 13038 + }, + { + "epoch": 1.2283271708155719, + "grad_norm": 0.7356598973274231, + "learning_rate": 6.7223161487824125e-06, + "loss": 0.2112, + "step": 13039 + }, + { + "epoch": 1.2284213749087398, + "grad_norm": 0.5932050347328186, + "learning_rate": 6.720889584311871e-06, + "loss": 0.2067, + "step": 13040 + }, + { + "epoch": 1.2285155790019076, + "grad_norm": 0.565879762172699, + "learning_rate": 6.7194630946101195e-06, + "loss": 0.18, + "step": 13041 + }, + { + "epoch": 1.2286097830950755, + "grad_norm": 0.6121799349784851, + "learning_rate": 6.718036679709681e-06, + "loss": 0.2064, + "step": 13042 + }, + { + "epoch": 1.2287039871882433, + "grad_norm": 1.1347873210906982, + "learning_rate": 6.716610339643079e-06, + "loss": 0.2086, + "step": 13043 + }, + { + "epoch": 1.2287981912814112, + "grad_norm": 0.6285647749900818, + "learning_rate": 6.715184074442842e-06, + "loss": 0.1982, + "step": 13044 + }, + { + "epoch": 1.228892395374579, + "grad_norm": 0.6440559029579163, + "learning_rate": 6.713757884141489e-06, + "loss": 0.21, + "step": 13045 + }, + { + "epoch": 1.228986599467747, + "grad_norm": 0.5830798149108887, + "learning_rate": 6.712331768771536e-06, + "loss": 0.1839, + "step": 13046 + }, + { + "epoch": 1.2290808035609146, + "grad_norm": 0.699811577796936, + "learning_rate": 6.710905728365504e-06, + "loss": 0.2108, + "step": 13047 + }, + { + "epoch": 1.2291750076540826, + "grad_norm": 0.9867917895317078, + "learning_rate": 6.7094797629559105e-06, + "loss": 0.2024, + "step": 13048 + }, + { + "epoch": 1.2292692117472503, + "grad_norm": 0.7100919485092163, + "learning_rate": 6.708053872575264e-06, + "loss": 0.1807, + "step": 13049 + }, + { + "epoch": 1.2293634158404183, + "grad_norm": 0.6222515106201172, + "learning_rate": 6.706628057256082e-06, + "loss": 0.1939, + "step": 13050 + }, + { + "epoch": 1.229457619933586, + "grad_norm": 0.6623128056526184, + "learning_rate": 6.705202317030876e-06, + "loss": 0.2179, + "step": 13051 + }, + { + "epoch": 1.229551824026754, + "grad_norm": 0.6505099534988403, + "learning_rate": 6.70377665193215e-06, + "loss": 0.2026, + "step": 13052 + }, + { + "epoch": 1.2296460281199217, + "grad_norm": 0.6078628301620483, + "learning_rate": 6.702351061992411e-06, + "loss": 0.1868, + "step": 13053 + }, + { + "epoch": 1.2297402322130897, + "grad_norm": 0.6993672251701355, + "learning_rate": 6.700925547244173e-06, + "loss": 0.2292, + "step": 13054 + }, + { + "epoch": 1.2298344363062574, + "grad_norm": 0.6512143611907959, + "learning_rate": 6.699500107719933e-06, + "loss": 0.183, + "step": 13055 + }, + { + "epoch": 1.2299286403994254, + "grad_norm": 0.6094197034835815, + "learning_rate": 6.69807474345219e-06, + "loss": 0.188, + "step": 13056 + }, + { + "epoch": 1.2300228444925931, + "grad_norm": 0.6811914443969727, + "learning_rate": 6.696649454473456e-06, + "loss": 0.2339, + "step": 13057 + }, + { + "epoch": 1.230117048585761, + "grad_norm": 0.5943988561630249, + "learning_rate": 6.695224240816223e-06, + "loss": 0.1789, + "step": 13058 + }, + { + "epoch": 1.2302112526789288, + "grad_norm": 0.6743707060813904, + "learning_rate": 6.693799102512983e-06, + "loss": 0.2097, + "step": 13059 + }, + { + "epoch": 1.2303054567720968, + "grad_norm": 0.6571220755577087, + "learning_rate": 6.692374039596241e-06, + "loss": 0.2069, + "step": 13060 + }, + { + "epoch": 1.2303996608652645, + "grad_norm": 0.6759471297264099, + "learning_rate": 6.690949052098486e-06, + "loss": 0.223, + "step": 13061 + }, + { + "epoch": 1.2304938649584325, + "grad_norm": 0.6499541401863098, + "learning_rate": 6.6895241400522085e-06, + "loss": 0.1854, + "step": 13062 + }, + { + "epoch": 1.2305880690516002, + "grad_norm": 0.644608736038208, + "learning_rate": 6.6880993034898985e-06, + "loss": 0.1943, + "step": 13063 + }, + { + "epoch": 1.2306822731447682, + "grad_norm": 0.6547313332557678, + "learning_rate": 6.68667454244405e-06, + "loss": 0.2098, + "step": 13064 + }, + { + "epoch": 1.230776477237936, + "grad_norm": 0.6327990293502808, + "learning_rate": 6.685249856947146e-06, + "loss": 0.2028, + "step": 13065 + }, + { + "epoch": 1.2308706813311039, + "grad_norm": 0.7770459651947021, + "learning_rate": 6.683825247031668e-06, + "loss": 0.2084, + "step": 13066 + }, + { + "epoch": 1.2309648854242716, + "grad_norm": 0.6875342726707458, + "learning_rate": 6.682400712730106e-06, + "loss": 0.1967, + "step": 13067 + }, + { + "epoch": 1.2310590895174396, + "grad_norm": 0.6348468065261841, + "learning_rate": 6.6809762540749375e-06, + "loss": 0.1977, + "step": 13068 + }, + { + "epoch": 1.2311532936106073, + "grad_norm": 0.5387022495269775, + "learning_rate": 6.679551871098644e-06, + "loss": 0.1653, + "step": 13069 + }, + { + "epoch": 1.2312474977037753, + "grad_norm": 0.6535850167274475, + "learning_rate": 6.678127563833703e-06, + "loss": 0.2006, + "step": 13070 + }, + { + "epoch": 1.231341701796943, + "grad_norm": 0.632869303226471, + "learning_rate": 6.676703332312593e-06, + "loss": 0.2389, + "step": 13071 + }, + { + "epoch": 1.231435905890111, + "grad_norm": 0.7358940839767456, + "learning_rate": 6.675279176567785e-06, + "loss": 0.2206, + "step": 13072 + }, + { + "epoch": 1.2315301099832787, + "grad_norm": 0.6896353363990784, + "learning_rate": 6.67385509663175e-06, + "loss": 0.2249, + "step": 13073 + }, + { + "epoch": 1.2316243140764467, + "grad_norm": 0.6563900709152222, + "learning_rate": 6.672431092536968e-06, + "loss": 0.2229, + "step": 13074 + }, + { + "epoch": 1.2317185181696144, + "grad_norm": 0.6559754014015198, + "learning_rate": 6.671007164315901e-06, + "loss": 0.2029, + "step": 13075 + }, + { + "epoch": 1.2318127222627824, + "grad_norm": 0.5931310653686523, + "learning_rate": 6.6695833120010165e-06, + "loss": 0.2023, + "step": 13076 + }, + { + "epoch": 1.2319069263559501, + "grad_norm": 0.6085942387580872, + "learning_rate": 6.668159535624786e-06, + "loss": 0.2113, + "step": 13077 + }, + { + "epoch": 1.232001130449118, + "grad_norm": 0.6777974367141724, + "learning_rate": 6.666735835219671e-06, + "loss": 0.2165, + "step": 13078 + }, + { + "epoch": 1.2320953345422858, + "grad_norm": 0.6456210613250732, + "learning_rate": 6.665312210818133e-06, + "loss": 0.2127, + "step": 13079 + }, + { + "epoch": 1.2321895386354538, + "grad_norm": 0.5929754972457886, + "learning_rate": 6.663888662452634e-06, + "loss": 0.1873, + "step": 13080 + }, + { + "epoch": 1.2322837427286215, + "grad_norm": 0.7054176330566406, + "learning_rate": 6.662465190155633e-06, + "loss": 0.2113, + "step": 13081 + }, + { + "epoch": 1.2323779468217895, + "grad_norm": 0.6892966032028198, + "learning_rate": 6.661041793959588e-06, + "loss": 0.2558, + "step": 13082 + }, + { + "epoch": 1.2324721509149572, + "grad_norm": 0.7085790634155273, + "learning_rate": 6.659618473896951e-06, + "loss": 0.2193, + "step": 13083 + }, + { + "epoch": 1.2325663550081252, + "grad_norm": 0.6542141437530518, + "learning_rate": 6.658195230000182e-06, + "loss": 0.1873, + "step": 13084 + }, + { + "epoch": 1.232660559101293, + "grad_norm": 0.7204039692878723, + "learning_rate": 6.656772062301729e-06, + "loss": 0.2189, + "step": 13085 + }, + { + "epoch": 1.2327547631944609, + "grad_norm": 0.6094175577163696, + "learning_rate": 6.655348970834042e-06, + "loss": 0.1922, + "step": 13086 + }, + { + "epoch": 1.2328489672876286, + "grad_norm": 0.6026403903961182, + "learning_rate": 6.6539259556295735e-06, + "loss": 0.2012, + "step": 13087 + }, + { + "epoch": 1.2329431713807966, + "grad_norm": 0.6669145822525024, + "learning_rate": 6.652503016720767e-06, + "loss": 0.2139, + "step": 13088 + }, + { + "epoch": 1.2330373754739643, + "grad_norm": 0.6415684223175049, + "learning_rate": 6.6510801541400674e-06, + "loss": 0.1934, + "step": 13089 + }, + { + "epoch": 1.2331315795671323, + "grad_norm": 0.5711910724639893, + "learning_rate": 6.649657367919922e-06, + "loss": 0.1879, + "step": 13090 + }, + { + "epoch": 1.2332257836603, + "grad_norm": 0.6705195903778076, + "learning_rate": 6.648234658092771e-06, + "loss": 0.202, + "step": 13091 + }, + { + "epoch": 1.233319987753468, + "grad_norm": 0.6863791942596436, + "learning_rate": 6.646812024691052e-06, + "loss": 0.2085, + "step": 13092 + }, + { + "epoch": 1.2334141918466357, + "grad_norm": 0.6593004465103149, + "learning_rate": 6.645389467747198e-06, + "loss": 0.1631, + "step": 13093 + }, + { + "epoch": 1.2335083959398037, + "grad_norm": 0.6083840131759644, + "learning_rate": 6.643966987293662e-06, + "loss": 0.1942, + "step": 13094 + }, + { + "epoch": 1.2336026000329714, + "grad_norm": 0.7365472912788391, + "learning_rate": 6.642544583362865e-06, + "loss": 0.2064, + "step": 13095 + }, + { + "epoch": 1.2336968041261394, + "grad_norm": 0.5965930223464966, + "learning_rate": 6.641122255987242e-06, + "loss": 0.2155, + "step": 13096 + }, + { + "epoch": 1.233791008219307, + "grad_norm": 0.6303234696388245, + "learning_rate": 6.639700005199228e-06, + "loss": 0.1824, + "step": 13097 + }, + { + "epoch": 1.233885212312475, + "grad_norm": 0.6314939260482788, + "learning_rate": 6.6382778310312515e-06, + "loss": 0.2012, + "step": 13098 + }, + { + "epoch": 1.2339794164056428, + "grad_norm": 0.6179655194282532, + "learning_rate": 6.6368557335157365e-06, + "loss": 0.2054, + "step": 13099 + }, + { + "epoch": 1.2340736204988108, + "grad_norm": 0.6266142129898071, + "learning_rate": 6.635433712685115e-06, + "loss": 0.1828, + "step": 13100 + }, + { + "epoch": 1.2341678245919785, + "grad_norm": 0.6063579320907593, + "learning_rate": 6.634011768571807e-06, + "loss": 0.175, + "step": 13101 + }, + { + "epoch": 1.2342620286851465, + "grad_norm": 0.6964307427406311, + "learning_rate": 6.6325899012082375e-06, + "loss": 0.1942, + "step": 13102 + }, + { + "epoch": 1.2343562327783142, + "grad_norm": 0.6033285856246948, + "learning_rate": 6.631168110626825e-06, + "loss": 0.193, + "step": 13103 + }, + { + "epoch": 1.2344504368714821, + "grad_norm": 0.729873538017273, + "learning_rate": 6.629746396859989e-06, + "loss": 0.2272, + "step": 13104 + }, + { + "epoch": 1.2345446409646499, + "grad_norm": 0.7651042342185974, + "learning_rate": 6.6283247599401475e-06, + "loss": 0.2374, + "step": 13105 + }, + { + "epoch": 1.2346388450578178, + "grad_norm": 0.6986647844314575, + "learning_rate": 6.626903199899716e-06, + "loss": 0.2072, + "step": 13106 + }, + { + "epoch": 1.2347330491509856, + "grad_norm": 0.6632575392723083, + "learning_rate": 6.62548171677111e-06, + "loss": 0.1976, + "step": 13107 + }, + { + "epoch": 1.2348272532441535, + "grad_norm": 0.582164466381073, + "learning_rate": 6.624060310586737e-06, + "loss": 0.1782, + "step": 13108 + }, + { + "epoch": 1.2349214573373213, + "grad_norm": 0.7106314897537231, + "learning_rate": 6.622638981379011e-06, + "loss": 0.184, + "step": 13109 + }, + { + "epoch": 1.2350156614304892, + "grad_norm": 0.6218223571777344, + "learning_rate": 6.621217729180338e-06, + "loss": 0.2154, + "step": 13110 + }, + { + "epoch": 1.235109865523657, + "grad_norm": 0.6390554308891296, + "learning_rate": 6.619796554023131e-06, + "loss": 0.2115, + "step": 13111 + }, + { + "epoch": 1.235204069616825, + "grad_norm": 0.6510376930236816, + "learning_rate": 6.618375455939787e-06, + "loss": 0.2305, + "step": 13112 + }, + { + "epoch": 1.2352982737099927, + "grad_norm": 0.6226624250411987, + "learning_rate": 6.616954434962709e-06, + "loss": 0.1773, + "step": 13113 + }, + { + "epoch": 1.2353924778031606, + "grad_norm": 0.615502655506134, + "learning_rate": 6.615533491124307e-06, + "loss": 0.1965, + "step": 13114 + }, + { + "epoch": 1.2354866818963284, + "grad_norm": 0.6183832883834839, + "learning_rate": 6.614112624456974e-06, + "loss": 0.192, + "step": 13115 + }, + { + "epoch": 1.2355808859894963, + "grad_norm": 0.6593360304832458, + "learning_rate": 6.612691834993108e-06, + "loss": 0.221, + "step": 13116 + }, + { + "epoch": 1.235675090082664, + "grad_norm": 0.6409652233123779, + "learning_rate": 6.6112711227651085e-06, + "loss": 0.2043, + "step": 13117 + }, + { + "epoch": 1.235769294175832, + "grad_norm": 0.5980530977249146, + "learning_rate": 6.609850487805368e-06, + "loss": 0.1988, + "step": 13118 + }, + { + "epoch": 1.2358634982689998, + "grad_norm": 0.6702038049697876, + "learning_rate": 6.6084299301462776e-06, + "loss": 0.1993, + "step": 13119 + }, + { + "epoch": 1.2359577023621677, + "grad_norm": 0.7119449377059937, + "learning_rate": 6.607009449820232e-06, + "loss": 0.2048, + "step": 13120 + }, + { + "epoch": 1.2360519064553355, + "grad_norm": 0.6192128658294678, + "learning_rate": 6.6055890468596175e-06, + "loss": 0.1799, + "step": 13121 + }, + { + "epoch": 1.2361461105485034, + "grad_norm": 0.6519134640693665, + "learning_rate": 6.604168721296823e-06, + "loss": 0.2091, + "step": 13122 + }, + { + "epoch": 1.2362403146416712, + "grad_norm": 0.5972312688827515, + "learning_rate": 6.602748473164231e-06, + "loss": 0.1808, + "step": 13123 + }, + { + "epoch": 1.236334518734839, + "grad_norm": 0.5865551233291626, + "learning_rate": 6.6013283024942295e-06, + "loss": 0.1718, + "step": 13124 + }, + { + "epoch": 1.2364287228280069, + "grad_norm": 0.6026672720909119, + "learning_rate": 6.5999082093192e-06, + "loss": 0.2056, + "step": 13125 + }, + { + "epoch": 1.2365229269211748, + "grad_norm": 0.6767292022705078, + "learning_rate": 6.5984881936715195e-06, + "loss": 0.2118, + "step": 13126 + }, + { + "epoch": 1.2366171310143426, + "grad_norm": 0.6915835738182068, + "learning_rate": 6.59706825558357e-06, + "loss": 0.2153, + "step": 13127 + }, + { + "epoch": 1.2367113351075103, + "grad_norm": 0.7222739458084106, + "learning_rate": 6.595648395087728e-06, + "loss": 0.1925, + "step": 13128 + }, + { + "epoch": 1.2368055392006783, + "grad_norm": 0.7652611136436462, + "learning_rate": 6.594228612216365e-06, + "loss": 0.2099, + "step": 13129 + }, + { + "epoch": 1.2368997432938462, + "grad_norm": 0.6785929203033447, + "learning_rate": 6.5928089070018576e-06, + "loss": 0.2151, + "step": 13130 + }, + { + "epoch": 1.236993947387014, + "grad_norm": 0.6909286379814148, + "learning_rate": 6.591389279476579e-06, + "loss": 0.2083, + "step": 13131 + }, + { + "epoch": 1.2370881514801817, + "grad_norm": 0.7029983997344971, + "learning_rate": 6.589969729672896e-06, + "loss": 0.2132, + "step": 13132 + }, + { + "epoch": 1.2371823555733497, + "grad_norm": 0.6916367411613464, + "learning_rate": 6.588550257623171e-06, + "loss": 0.2134, + "step": 13133 + }, + { + "epoch": 1.2372765596665176, + "grad_norm": 0.6737584471702576, + "learning_rate": 6.587130863359783e-06, + "loss": 0.2042, + "step": 13134 + }, + { + "epoch": 1.2373707637596854, + "grad_norm": 0.7271801233291626, + "learning_rate": 6.585711546915087e-06, + "loss": 0.2137, + "step": 13135 + }, + { + "epoch": 1.237464967852853, + "grad_norm": 0.6153900623321533, + "learning_rate": 6.584292308321445e-06, + "loss": 0.1869, + "step": 13136 + }, + { + "epoch": 1.237559171946021, + "grad_norm": 0.7527621388435364, + "learning_rate": 6.582873147611224e-06, + "loss": 0.2296, + "step": 13137 + }, + { + "epoch": 1.237653376039189, + "grad_norm": 0.5928955078125, + "learning_rate": 6.581454064816781e-06, + "loss": 0.1997, + "step": 13138 + }, + { + "epoch": 1.2377475801323568, + "grad_norm": 0.6899813413619995, + "learning_rate": 6.5800350599704684e-06, + "loss": 0.1963, + "step": 13139 + }, + { + "epoch": 1.2378417842255245, + "grad_norm": 0.6595364212989807, + "learning_rate": 6.578616133104648e-06, + "loss": 0.2243, + "step": 13140 + }, + { + "epoch": 1.2379359883186924, + "grad_norm": 0.5497678518295288, + "learning_rate": 6.5771972842516715e-06, + "loss": 0.1798, + "step": 13141 + }, + { + "epoch": 1.2380301924118604, + "grad_norm": 0.6328285932540894, + "learning_rate": 6.575778513443891e-06, + "loss": 0.1983, + "step": 13142 + }, + { + "epoch": 1.2381243965050281, + "grad_norm": 0.6856815218925476, + "learning_rate": 6.574359820713653e-06, + "loss": 0.2272, + "step": 13143 + }, + { + "epoch": 1.2382186005981959, + "grad_norm": 0.6073868274688721, + "learning_rate": 6.572941206093311e-06, + "loss": 0.1959, + "step": 13144 + }, + { + "epoch": 1.2383128046913638, + "grad_norm": 0.736967921257019, + "learning_rate": 6.571522669615209e-06, + "loss": 0.2434, + "step": 13145 + }, + { + "epoch": 1.2384070087845318, + "grad_norm": 0.6768428683280945, + "learning_rate": 6.570104211311692e-06, + "loss": 0.1868, + "step": 13146 + }, + { + "epoch": 1.2385012128776995, + "grad_norm": 0.591189444065094, + "learning_rate": 6.568685831215105e-06, + "loss": 0.1737, + "step": 13147 + }, + { + "epoch": 1.2385954169708673, + "grad_norm": 0.7081210017204285, + "learning_rate": 6.5672675293577885e-06, + "loss": 0.2168, + "step": 13148 + }, + { + "epoch": 1.2386896210640352, + "grad_norm": 0.6531855463981628, + "learning_rate": 6.565849305772075e-06, + "loss": 0.2216, + "step": 13149 + }, + { + "epoch": 1.2387838251572032, + "grad_norm": 0.6034048199653625, + "learning_rate": 6.564431160490313e-06, + "loss": 0.1951, + "step": 13150 + }, + { + "epoch": 1.238878029250371, + "grad_norm": 0.6722594499588013, + "learning_rate": 6.563013093544837e-06, + "loss": 0.2166, + "step": 13151 + }, + { + "epoch": 1.2389722333435387, + "grad_norm": 0.6729581952095032, + "learning_rate": 6.561595104967975e-06, + "loss": 0.2155, + "step": 13152 + }, + { + "epoch": 1.2390664374367066, + "grad_norm": 0.627149224281311, + "learning_rate": 6.560177194792057e-06, + "loss": 0.1976, + "step": 13153 + }, + { + "epoch": 1.2391606415298746, + "grad_norm": 0.673635721206665, + "learning_rate": 6.558759363049426e-06, + "loss": 0.1841, + "step": 13154 + }, + { + "epoch": 1.2392548456230423, + "grad_norm": 0.6006341576576233, + "learning_rate": 6.5573416097724e-06, + "loss": 0.1847, + "step": 13155 + }, + { + "epoch": 1.23934904971621, + "grad_norm": 0.8298212289810181, + "learning_rate": 6.555923934993309e-06, + "loss": 0.2264, + "step": 13156 + }, + { + "epoch": 1.239443253809378, + "grad_norm": 0.6421175599098206, + "learning_rate": 6.554506338744482e-06, + "loss": 0.1883, + "step": 13157 + }, + { + "epoch": 1.2395374579025458, + "grad_norm": 0.6622430086135864, + "learning_rate": 6.553088821058237e-06, + "loss": 0.2039, + "step": 13158 + }, + { + "epoch": 1.2396316619957137, + "grad_norm": 0.6573102474212646, + "learning_rate": 6.5516713819668955e-06, + "loss": 0.1962, + "step": 13159 + }, + { + "epoch": 1.2397258660888815, + "grad_norm": 0.8151679635047913, + "learning_rate": 6.550254021502782e-06, + "loss": 0.2009, + "step": 13160 + }, + { + "epoch": 1.2398200701820494, + "grad_norm": 0.6960872411727905, + "learning_rate": 6.548836739698212e-06, + "loss": 0.2179, + "step": 13161 + }, + { + "epoch": 1.2399142742752172, + "grad_norm": 0.659559965133667, + "learning_rate": 6.547419536585502e-06, + "loss": 0.2065, + "step": 13162 + }, + { + "epoch": 1.2400084783683851, + "grad_norm": 0.6673386693000793, + "learning_rate": 6.5460024121969635e-06, + "loss": 0.2142, + "step": 13163 + }, + { + "epoch": 1.2401026824615529, + "grad_norm": 0.6193254590034485, + "learning_rate": 6.544585366564913e-06, + "loss": 0.2057, + "step": 13164 + }, + { + "epoch": 1.2401968865547208, + "grad_norm": 0.6263452768325806, + "learning_rate": 6.543168399721661e-06, + "loss": 0.1869, + "step": 13165 + }, + { + "epoch": 1.2402910906478886, + "grad_norm": 0.6214220523834229, + "learning_rate": 6.541751511699514e-06, + "loss": 0.199, + "step": 13166 + }, + { + "epoch": 1.2403852947410565, + "grad_norm": 0.6071861982345581, + "learning_rate": 6.540334702530782e-06, + "loss": 0.199, + "step": 13167 + }, + { + "epoch": 1.2404794988342243, + "grad_norm": 0.6378260254859924, + "learning_rate": 6.538917972247771e-06, + "loss": 0.2381, + "step": 13168 + }, + { + "epoch": 1.2405737029273922, + "grad_norm": 0.661163330078125, + "learning_rate": 6.537501320882778e-06, + "loss": 0.1988, + "step": 13169 + }, + { + "epoch": 1.24066790702056, + "grad_norm": 0.7192729115486145, + "learning_rate": 6.536084748468114e-06, + "loss": 0.1879, + "step": 13170 + }, + { + "epoch": 1.240762111113728, + "grad_norm": 0.662153959274292, + "learning_rate": 6.534668255036075e-06, + "loss": 0.1885, + "step": 13171 + }, + { + "epoch": 1.2408563152068957, + "grad_norm": 0.5854468941688538, + "learning_rate": 6.533251840618958e-06, + "loss": 0.185, + "step": 13172 + }, + { + "epoch": 1.2409505193000636, + "grad_norm": 0.5961412191390991, + "learning_rate": 6.531835505249057e-06, + "loss": 0.2056, + "step": 13173 + }, + { + "epoch": 1.2410447233932314, + "grad_norm": 0.6531262397766113, + "learning_rate": 6.530419248958675e-06, + "loss": 0.201, + "step": 13174 + }, + { + "epoch": 1.2411389274863993, + "grad_norm": 0.683880627155304, + "learning_rate": 6.529003071780098e-06, + "loss": 0.2197, + "step": 13175 + }, + { + "epoch": 1.241233131579567, + "grad_norm": 0.6978038549423218, + "learning_rate": 6.527586973745619e-06, + "loss": 0.2409, + "step": 13176 + }, + { + "epoch": 1.241327335672735, + "grad_norm": 0.632899284362793, + "learning_rate": 6.526170954887528e-06, + "loss": 0.189, + "step": 13177 + }, + { + "epoch": 1.2414215397659027, + "grad_norm": 0.5654722452163696, + "learning_rate": 6.52475501523811e-06, + "loss": 0.1709, + "step": 13178 + }, + { + "epoch": 1.2415157438590707, + "grad_norm": 0.6315605640411377, + "learning_rate": 6.523339154829651e-06, + "loss": 0.2222, + "step": 13179 + }, + { + "epoch": 1.2416099479522384, + "grad_norm": 0.6636592745780945, + "learning_rate": 6.5219233736944384e-06, + "loss": 0.2049, + "step": 13180 + }, + { + "epoch": 1.2417041520454064, + "grad_norm": 0.6793553233146667, + "learning_rate": 6.520507671864753e-06, + "loss": 0.2112, + "step": 13181 + }, + { + "epoch": 1.2417983561385741, + "grad_norm": 0.6310015916824341, + "learning_rate": 6.519092049372873e-06, + "loss": 0.2142, + "step": 13182 + }, + { + "epoch": 1.241892560231742, + "grad_norm": 0.6464283466339111, + "learning_rate": 6.517676506251074e-06, + "loss": 0.1928, + "step": 13183 + }, + { + "epoch": 1.2419867643249098, + "grad_norm": 0.5940302610397339, + "learning_rate": 6.51626104253164e-06, + "loss": 0.1871, + "step": 13184 + }, + { + "epoch": 1.2420809684180778, + "grad_norm": 0.7057653069496155, + "learning_rate": 6.5148456582468424e-06, + "loss": 0.1981, + "step": 13185 + }, + { + "epoch": 1.2421751725112455, + "grad_norm": 0.6060236692428589, + "learning_rate": 6.51343035342895e-06, + "loss": 0.2021, + "step": 13186 + }, + { + "epoch": 1.2422693766044135, + "grad_norm": 0.6369514465332031, + "learning_rate": 6.512015128110241e-06, + "loss": 0.1832, + "step": 13187 + }, + { + "epoch": 1.2423635806975812, + "grad_norm": 0.6441729068756104, + "learning_rate": 6.510599982322982e-06, + "loss": 0.2138, + "step": 13188 + }, + { + "epoch": 1.2424577847907492, + "grad_norm": 0.5923165082931519, + "learning_rate": 6.509184916099433e-06, + "loss": 0.1976, + "step": 13189 + }, + { + "epoch": 1.242551988883917, + "grad_norm": 0.5233913064002991, + "learning_rate": 6.507769929471875e-06, + "loss": 0.1697, + "step": 13190 + }, + { + "epoch": 1.242646192977085, + "grad_norm": 0.6921975612640381, + "learning_rate": 6.506355022472561e-06, + "loss": 0.2013, + "step": 13191 + }, + { + "epoch": 1.2427403970702526, + "grad_norm": 0.6269617080688477, + "learning_rate": 6.504940195133755e-06, + "loss": 0.1975, + "step": 13192 + }, + { + "epoch": 1.2428346011634206, + "grad_norm": 0.7483435273170471, + "learning_rate": 6.503525447487717e-06, + "loss": 0.2226, + "step": 13193 + }, + { + "epoch": 1.2429288052565883, + "grad_norm": 0.6120871305465698, + "learning_rate": 6.502110779566706e-06, + "loss": 0.2191, + "step": 13194 + }, + { + "epoch": 1.2430230093497563, + "grad_norm": 0.5768830180168152, + "learning_rate": 6.50069619140298e-06, + "loss": 0.1729, + "step": 13195 + }, + { + "epoch": 1.243117213442924, + "grad_norm": 0.7296568155288696, + "learning_rate": 6.499281683028791e-06, + "loss": 0.1988, + "step": 13196 + }, + { + "epoch": 1.243211417536092, + "grad_norm": 0.6651104092597961, + "learning_rate": 6.497867254476395e-06, + "loss": 0.1935, + "step": 13197 + }, + { + "epoch": 1.2433056216292597, + "grad_norm": 0.6365832686424255, + "learning_rate": 6.496452905778041e-06, + "loss": 0.1862, + "step": 13198 + }, + { + "epoch": 1.2433998257224277, + "grad_norm": 0.7073401212692261, + "learning_rate": 6.495038636965978e-06, + "loss": 0.2095, + "step": 13199 + }, + { + "epoch": 1.2434940298155954, + "grad_norm": 0.627632200717926, + "learning_rate": 6.4936244480724575e-06, + "loss": 0.1928, + "step": 13200 + }, + { + "epoch": 1.2435882339087634, + "grad_norm": 0.7091456651687622, + "learning_rate": 6.492210339129721e-06, + "loss": 0.2011, + "step": 13201 + }, + { + "epoch": 1.2436824380019311, + "grad_norm": 0.6287287473678589, + "learning_rate": 6.490796310170013e-06, + "loss": 0.2135, + "step": 13202 + }, + { + "epoch": 1.243776642095099, + "grad_norm": 0.6407252550125122, + "learning_rate": 6.489382361225576e-06, + "loss": 0.2096, + "step": 13203 + }, + { + "epoch": 1.2438708461882668, + "grad_norm": 0.6490053534507751, + "learning_rate": 6.487968492328651e-06, + "loss": 0.2196, + "step": 13204 + }, + { + "epoch": 1.2439650502814348, + "grad_norm": 0.6701541543006897, + "learning_rate": 6.486554703511477e-06, + "loss": 0.2147, + "step": 13205 + }, + { + "epoch": 1.2440592543746025, + "grad_norm": 0.6274252533912659, + "learning_rate": 6.4851409948062875e-06, + "loss": 0.2021, + "step": 13206 + }, + { + "epoch": 1.2441534584677705, + "grad_norm": 0.7174579501152039, + "learning_rate": 6.48372736624532e-06, + "loss": 0.2408, + "step": 13207 + }, + { + "epoch": 1.2442476625609382, + "grad_norm": 0.6236835718154907, + "learning_rate": 6.482313817860809e-06, + "loss": 0.2154, + "step": 13208 + }, + { + "epoch": 1.2443418666541062, + "grad_norm": 0.6155478358268738, + "learning_rate": 6.480900349684977e-06, + "loss": 0.1959, + "step": 13209 + }, + { + "epoch": 1.244436070747274, + "grad_norm": 0.7192291617393494, + "learning_rate": 6.479486961750065e-06, + "loss": 0.2438, + "step": 13210 + }, + { + "epoch": 1.2445302748404419, + "grad_norm": 0.6841741800308228, + "learning_rate": 6.478073654088295e-06, + "loss": 0.1929, + "step": 13211 + }, + { + "epoch": 1.2446244789336096, + "grad_norm": 0.6367154121398926, + "learning_rate": 6.476660426731891e-06, + "loss": 0.201, + "step": 13212 + }, + { + "epoch": 1.2447186830267776, + "grad_norm": 0.6178945899009705, + "learning_rate": 6.475247279713076e-06, + "loss": 0.2105, + "step": 13213 + }, + { + "epoch": 1.2448128871199453, + "grad_norm": 0.6798058152198792, + "learning_rate": 6.4738342130640764e-06, + "loss": 0.2114, + "step": 13214 + }, + { + "epoch": 1.2449070912131133, + "grad_norm": 0.6734654903411865, + "learning_rate": 6.47242122681711e-06, + "loss": 0.2306, + "step": 13215 + }, + { + "epoch": 1.245001295306281, + "grad_norm": 0.764380931854248, + "learning_rate": 6.471008321004393e-06, + "loss": 0.2278, + "step": 13216 + }, + { + "epoch": 1.245095499399449, + "grad_norm": 0.6479675769805908, + "learning_rate": 6.4695954956581464e-06, + "loss": 0.2103, + "step": 13217 + }, + { + "epoch": 1.2451897034926167, + "grad_norm": 0.628743052482605, + "learning_rate": 6.468182750810582e-06, + "loss": 0.1941, + "step": 13218 + }, + { + "epoch": 1.2452839075857847, + "grad_norm": 0.6989083290100098, + "learning_rate": 6.466770086493911e-06, + "loss": 0.213, + "step": 13219 + }, + { + "epoch": 1.2453781116789524, + "grad_norm": 0.6205032467842102, + "learning_rate": 6.4653575027403485e-06, + "loss": 0.2083, + "step": 13220 + }, + { + "epoch": 1.2454723157721204, + "grad_norm": 0.6287810206413269, + "learning_rate": 6.463944999582102e-06, + "loss": 0.2182, + "step": 13221 + }, + { + "epoch": 1.245566519865288, + "grad_norm": 0.6180534958839417, + "learning_rate": 6.462532577051377e-06, + "loss": 0.1902, + "step": 13222 + }, + { + "epoch": 1.245660723958456, + "grad_norm": 0.6427865624427795, + "learning_rate": 6.461120235180378e-06, + "loss": 0.218, + "step": 13223 + }, + { + "epoch": 1.2457549280516238, + "grad_norm": 0.624610424041748, + "learning_rate": 6.4597079740013126e-06, + "loss": 0.1779, + "step": 13224 + }, + { + "epoch": 1.2458491321447918, + "grad_norm": 0.6102461218833923, + "learning_rate": 6.458295793546381e-06, + "loss": 0.2159, + "step": 13225 + }, + { + "epoch": 1.2459433362379595, + "grad_norm": 0.7143837809562683, + "learning_rate": 6.456883693847781e-06, + "loss": 0.2104, + "step": 13226 + }, + { + "epoch": 1.2460375403311275, + "grad_norm": 0.7000229954719543, + "learning_rate": 6.455471674937714e-06, + "loss": 0.2043, + "step": 13227 + }, + { + "epoch": 1.2461317444242952, + "grad_norm": 0.6932652592658997, + "learning_rate": 6.454059736848376e-06, + "loss": 0.2209, + "step": 13228 + }, + { + "epoch": 1.2462259485174632, + "grad_norm": 0.6495449542999268, + "learning_rate": 6.4526478796119555e-06, + "loss": 0.2079, + "step": 13229 + }, + { + "epoch": 1.246320152610631, + "grad_norm": 0.5908781886100769, + "learning_rate": 6.451236103260652e-06, + "loss": 0.184, + "step": 13230 + }, + { + "epoch": 1.2464143567037989, + "grad_norm": 0.6017096638679504, + "learning_rate": 6.449824407826655e-06, + "loss": 0.1765, + "step": 13231 + }, + { + "epoch": 1.2465085607969666, + "grad_norm": 0.6833210587501526, + "learning_rate": 6.4484127933421514e-06, + "loss": 0.2077, + "step": 13232 + }, + { + "epoch": 1.2466027648901346, + "grad_norm": 0.6991371512413025, + "learning_rate": 6.447001259839325e-06, + "loss": 0.2494, + "step": 13233 + }, + { + "epoch": 1.2466969689833023, + "grad_norm": 0.5941142439842224, + "learning_rate": 6.445589807350369e-06, + "loss": 0.2145, + "step": 13234 + }, + { + "epoch": 1.2467911730764702, + "grad_norm": 0.6017299890518188, + "learning_rate": 6.444178435907461e-06, + "loss": 0.1924, + "step": 13235 + }, + { + "epoch": 1.246885377169638, + "grad_norm": 0.6834784150123596, + "learning_rate": 6.442767145542782e-06, + "loss": 0.2098, + "step": 13236 + }, + { + "epoch": 1.246979581262806, + "grad_norm": 0.6336193680763245, + "learning_rate": 6.441355936288516e-06, + "loss": 0.1907, + "step": 13237 + }, + { + "epoch": 1.2470737853559737, + "grad_norm": 0.6442963480949402, + "learning_rate": 6.439944808176837e-06, + "loss": 0.1934, + "step": 13238 + }, + { + "epoch": 1.2471679894491416, + "grad_norm": 0.6089239716529846, + "learning_rate": 6.4385337612399215e-06, + "loss": 0.2017, + "step": 13239 + }, + { + "epoch": 1.2472621935423094, + "grad_norm": 0.5581820607185364, + "learning_rate": 6.437122795509945e-06, + "loss": 0.1831, + "step": 13240 + }, + { + "epoch": 1.2473563976354773, + "grad_norm": 0.7005428671836853, + "learning_rate": 6.43571191101908e-06, + "loss": 0.1856, + "step": 13241 + }, + { + "epoch": 1.247450601728645, + "grad_norm": 0.631710946559906, + "learning_rate": 6.434301107799494e-06, + "loss": 0.2002, + "step": 13242 + }, + { + "epoch": 1.247544805821813, + "grad_norm": 0.7596853375434875, + "learning_rate": 6.432890385883357e-06, + "loss": 0.2144, + "step": 13243 + }, + { + "epoch": 1.2476390099149808, + "grad_norm": 0.6215640306472778, + "learning_rate": 6.431479745302838e-06, + "loss": 0.2075, + "step": 13244 + }, + { + "epoch": 1.2477332140081487, + "grad_norm": 0.6445016264915466, + "learning_rate": 6.4300691860900975e-06, + "loss": 0.2068, + "step": 13245 + }, + { + "epoch": 1.2478274181013165, + "grad_norm": 0.6773838400840759, + "learning_rate": 6.4286587082773e-06, + "loss": 0.2452, + "step": 13246 + }, + { + "epoch": 1.2479216221944844, + "grad_norm": 0.5760672688484192, + "learning_rate": 6.42724831189661e-06, + "loss": 0.2073, + "step": 13247 + }, + { + "epoch": 1.2480158262876522, + "grad_norm": 0.614971935749054, + "learning_rate": 6.4258379969801846e-06, + "loss": 0.1849, + "step": 13248 + }, + { + "epoch": 1.2481100303808201, + "grad_norm": 0.6382244825363159, + "learning_rate": 6.424427763560175e-06, + "loss": 0.1835, + "step": 13249 + }, + { + "epoch": 1.2482042344739879, + "grad_norm": 0.6506362557411194, + "learning_rate": 6.423017611668745e-06, + "loss": 0.2272, + "step": 13250 + }, + { + "epoch": 1.2482984385671558, + "grad_norm": 0.6593846678733826, + "learning_rate": 6.421607541338049e-06, + "loss": 0.2088, + "step": 13251 + }, + { + "epoch": 1.2483926426603236, + "grad_norm": 0.6460914015769958, + "learning_rate": 6.420197552600232e-06, + "loss": 0.1972, + "step": 13252 + }, + { + "epoch": 1.2484868467534915, + "grad_norm": 0.6168376207351685, + "learning_rate": 6.418787645487446e-06, + "loss": 0.2213, + "step": 13253 + }, + { + "epoch": 1.2485810508466593, + "grad_norm": 0.7077895998954773, + "learning_rate": 6.41737782003184e-06, + "loss": 0.2051, + "step": 13254 + }, + { + "epoch": 1.2486752549398272, + "grad_norm": 0.6611896753311157, + "learning_rate": 6.415968076265562e-06, + "loss": 0.2222, + "step": 13255 + }, + { + "epoch": 1.248769459032995, + "grad_norm": 0.6410412788391113, + "learning_rate": 6.4145584142207525e-06, + "loss": 0.2132, + "step": 13256 + }, + { + "epoch": 1.248863663126163, + "grad_norm": 0.6798258423805237, + "learning_rate": 6.413148833929559e-06, + "loss": 0.205, + "step": 13257 + }, + { + "epoch": 1.2489578672193307, + "grad_norm": 0.6173869967460632, + "learning_rate": 6.411739335424118e-06, + "loss": 0.2124, + "step": 13258 + }, + { + "epoch": 1.2490520713124986, + "grad_norm": 0.5916337966918945, + "learning_rate": 6.410329918736568e-06, + "loss": 0.1915, + "step": 13259 + }, + { + "epoch": 1.2491462754056664, + "grad_norm": 0.6927295923233032, + "learning_rate": 6.408920583899049e-06, + "loss": 0.2081, + "step": 13260 + }, + { + "epoch": 1.2492404794988343, + "grad_norm": 0.6642054915428162, + "learning_rate": 6.407511330943694e-06, + "loss": 0.1902, + "step": 13261 + }, + { + "epoch": 1.249334683592002, + "grad_norm": 0.6179870963096619, + "learning_rate": 6.406102159902638e-06, + "loss": 0.2046, + "step": 13262 + }, + { + "epoch": 1.2494288876851698, + "grad_norm": 0.6349109411239624, + "learning_rate": 6.404693070808008e-06, + "loss": 0.2286, + "step": 13263 + }, + { + "epoch": 1.2495230917783378, + "grad_norm": 0.5897269248962402, + "learning_rate": 6.403284063691938e-06, + "loss": 0.1878, + "step": 13264 + }, + { + "epoch": 1.2496172958715057, + "grad_norm": 0.6208135485649109, + "learning_rate": 6.401875138586557e-06, + "loss": 0.1983, + "step": 13265 + }, + { + "epoch": 1.2497114999646735, + "grad_norm": 0.7107505798339844, + "learning_rate": 6.400466295523979e-06, + "loss": 0.2373, + "step": 13266 + }, + { + "epoch": 1.2498057040578412, + "grad_norm": 0.6204399466514587, + "learning_rate": 6.399057534536342e-06, + "loss": 0.1887, + "step": 13267 + }, + { + "epoch": 1.2498999081510092, + "grad_norm": 0.7137466669082642, + "learning_rate": 6.397648855655765e-06, + "loss": 0.2237, + "step": 13268 + }, + { + "epoch": 1.2499941122441771, + "grad_norm": 0.6079708933830261, + "learning_rate": 6.396240258914357e-06, + "loss": 0.2368, + "step": 13269 + }, + { + "epoch": 1.2500883163373449, + "grad_norm": 0.6251336336135864, + "learning_rate": 6.3948317443442496e-06, + "loss": 0.1881, + "step": 13270 + }, + { + "epoch": 1.2501825204305126, + "grad_norm": 0.6043210625648499, + "learning_rate": 6.393423311977556e-06, + "loss": 0.1919, + "step": 13271 + }, + { + "epoch": 1.2502767245236805, + "grad_norm": 0.5962233543395996, + "learning_rate": 6.392014961846387e-06, + "loss": 0.1889, + "step": 13272 + }, + { + "epoch": 1.2503709286168485, + "grad_norm": 0.5834617614746094, + "learning_rate": 6.3906066939828546e-06, + "loss": 0.1879, + "step": 13273 + }, + { + "epoch": 1.2504651327100162, + "grad_norm": 0.597581148147583, + "learning_rate": 6.389198508419072e-06, + "loss": 0.179, + "step": 13274 + }, + { + "epoch": 1.250559336803184, + "grad_norm": 0.6518176794052124, + "learning_rate": 6.38779040518715e-06, + "loss": 0.2046, + "step": 13275 + }, + { + "epoch": 1.250653540896352, + "grad_norm": 0.6201210618019104, + "learning_rate": 6.38638238431919e-06, + "loss": 0.2061, + "step": 13276 + }, + { + "epoch": 1.25074774498952, + "grad_norm": 0.6848750114440918, + "learning_rate": 6.384974445847302e-06, + "loss": 0.1925, + "step": 13277 + }, + { + "epoch": 1.2508419490826876, + "grad_norm": 0.6894055008888245, + "learning_rate": 6.383566589803587e-06, + "loss": 0.2056, + "step": 13278 + }, + { + "epoch": 1.2509361531758554, + "grad_norm": 0.6124905943870544, + "learning_rate": 6.382158816220146e-06, + "loss": 0.1922, + "step": 13279 + }, + { + "epoch": 1.2510303572690233, + "grad_norm": 0.644424319267273, + "learning_rate": 6.38075112512908e-06, + "loss": 0.2004, + "step": 13280 + }, + { + "epoch": 1.2511245613621913, + "grad_norm": 0.5365655422210693, + "learning_rate": 6.3793435165624866e-06, + "loss": 0.1776, + "step": 13281 + }, + { + "epoch": 1.251218765455359, + "grad_norm": 0.6420202255249023, + "learning_rate": 6.377935990552459e-06, + "loss": 0.1911, + "step": 13282 + }, + { + "epoch": 1.2513129695485268, + "grad_norm": 0.6588460803031921, + "learning_rate": 6.376528547131091e-06, + "loss": 0.2131, + "step": 13283 + }, + { + "epoch": 1.2514071736416947, + "grad_norm": 0.6637923717498779, + "learning_rate": 6.375121186330478e-06, + "loss": 0.1893, + "step": 13284 + }, + { + "epoch": 1.2515013777348627, + "grad_norm": 0.6343114376068115, + "learning_rate": 6.373713908182711e-06, + "loss": 0.1873, + "step": 13285 + }, + { + "epoch": 1.2515955818280304, + "grad_norm": 0.60640549659729, + "learning_rate": 6.372306712719868e-06, + "loss": 0.2026, + "step": 13286 + }, + { + "epoch": 1.2516897859211982, + "grad_norm": 0.6385098099708557, + "learning_rate": 6.370899599974047e-06, + "loss": 0.1905, + "step": 13287 + }, + { + "epoch": 1.2517839900143661, + "grad_norm": 0.7508695125579834, + "learning_rate": 6.369492569977329e-06, + "loss": 0.2229, + "step": 13288 + }, + { + "epoch": 1.251878194107534, + "grad_norm": 0.5951728820800781, + "learning_rate": 6.368085622761788e-06, + "loss": 0.1854, + "step": 13289 + }, + { + "epoch": 1.2519723982007018, + "grad_norm": 0.579662024974823, + "learning_rate": 6.366678758359517e-06, + "loss": 0.1888, + "step": 13290 + }, + { + "epoch": 1.2520666022938696, + "grad_norm": 0.6184574365615845, + "learning_rate": 6.3652719768025915e-06, + "loss": 0.204, + "step": 13291 + }, + { + "epoch": 1.2521608063870375, + "grad_norm": 0.7084358334541321, + "learning_rate": 6.363865278123085e-06, + "loss": 0.2039, + "step": 13292 + }, + { + "epoch": 1.2522550104802055, + "grad_norm": 0.6930648684501648, + "learning_rate": 6.362458662353069e-06, + "loss": 0.2441, + "step": 13293 + }, + { + "epoch": 1.2523492145733732, + "grad_norm": 0.6676106452941895, + "learning_rate": 6.361052129524625e-06, + "loss": 0.2223, + "step": 13294 + }, + { + "epoch": 1.252443418666541, + "grad_norm": 0.6763599514961243, + "learning_rate": 6.3596456796698195e-06, + "loss": 0.222, + "step": 13295 + }, + { + "epoch": 1.252537622759709, + "grad_norm": 0.7015537619590759, + "learning_rate": 6.3582393128207206e-06, + "loss": 0.1883, + "step": 13296 + }, + { + "epoch": 1.2526318268528769, + "grad_norm": 0.6660821437835693, + "learning_rate": 6.3568330290094e-06, + "loss": 0.1755, + "step": 13297 + }, + { + "epoch": 1.2527260309460446, + "grad_norm": 0.6238716244697571, + "learning_rate": 6.3554268282679196e-06, + "loss": 0.2131, + "step": 13298 + }, + { + "epoch": 1.2528202350392124, + "grad_norm": 0.6290259957313538, + "learning_rate": 6.354020710628342e-06, + "loss": 0.2021, + "step": 13299 + }, + { + "epoch": 1.2529144391323803, + "grad_norm": 0.6270294785499573, + "learning_rate": 6.352614676122734e-06, + "loss": 0.211, + "step": 13300 + }, + { + "epoch": 1.2530086432255483, + "grad_norm": 0.6475179195404053, + "learning_rate": 6.351208724783151e-06, + "loss": 0.2159, + "step": 13301 + }, + { + "epoch": 1.253102847318716, + "grad_norm": 0.6810320019721985, + "learning_rate": 6.349802856641653e-06, + "loss": 0.1928, + "step": 13302 + }, + { + "epoch": 1.2531970514118838, + "grad_norm": 0.7074736952781677, + "learning_rate": 6.3483970717302925e-06, + "loss": 0.2214, + "step": 13303 + }, + { + "epoch": 1.2532912555050517, + "grad_norm": 0.7083766460418701, + "learning_rate": 6.346991370081128e-06, + "loss": 0.2, + "step": 13304 + }, + { + "epoch": 1.2533854595982197, + "grad_norm": 0.7196433544158936, + "learning_rate": 6.345585751726211e-06, + "loss": 0.2319, + "step": 13305 + }, + { + "epoch": 1.2534796636913874, + "grad_norm": 0.7915971279144287, + "learning_rate": 6.344180216697585e-06, + "loss": 0.2138, + "step": 13306 + }, + { + "epoch": 1.2535738677845552, + "grad_norm": 0.6494907140731812, + "learning_rate": 6.342774765027309e-06, + "loss": 0.1991, + "step": 13307 + }, + { + "epoch": 1.2536680718777231, + "grad_norm": 0.6098228096961975, + "learning_rate": 6.341369396747426e-06, + "loss": 0.1907, + "step": 13308 + }, + { + "epoch": 1.253762275970891, + "grad_norm": 0.6455572843551636, + "learning_rate": 6.339964111889971e-06, + "loss": 0.205, + "step": 13309 + }, + { + "epoch": 1.2538564800640588, + "grad_norm": 0.626918375492096, + "learning_rate": 6.3385589104870024e-06, + "loss": 0.2093, + "step": 13310 + }, + { + "epoch": 1.2539506841572265, + "grad_norm": 0.7116323709487915, + "learning_rate": 6.337153792570551e-06, + "loss": 0.2154, + "step": 13311 + }, + { + "epoch": 1.2540448882503945, + "grad_norm": 0.6362070441246033, + "learning_rate": 6.335748758172658e-06, + "loss": 0.2016, + "step": 13312 + }, + { + "epoch": 1.2541390923435622, + "grad_norm": 0.6597411036491394, + "learning_rate": 6.334343807325358e-06, + "loss": 0.1964, + "step": 13313 + }, + { + "epoch": 1.2542332964367302, + "grad_norm": 0.5408357381820679, + "learning_rate": 6.33293894006069e-06, + "loss": 0.1695, + "step": 13314 + }, + { + "epoch": 1.254327500529898, + "grad_norm": 0.6397354006767273, + "learning_rate": 6.331534156410686e-06, + "loss": 0.2136, + "step": 13315 + }, + { + "epoch": 1.254421704623066, + "grad_norm": 0.6547505855560303, + "learning_rate": 6.330129456407374e-06, + "loss": 0.2044, + "step": 13316 + }, + { + "epoch": 1.2545159087162336, + "grad_norm": 0.5775501132011414, + "learning_rate": 6.328724840082787e-06, + "loss": 0.1859, + "step": 13317 + }, + { + "epoch": 1.2546101128094016, + "grad_norm": 0.6217550039291382, + "learning_rate": 6.3273203074689535e-06, + "loss": 0.2066, + "step": 13318 + }, + { + "epoch": 1.2547043169025693, + "grad_norm": 0.6779605150222778, + "learning_rate": 6.325915858597893e-06, + "loss": 0.2246, + "step": 13319 + }, + { + "epoch": 1.2547985209957373, + "grad_norm": 0.639426589012146, + "learning_rate": 6.324511493501636e-06, + "loss": 0.2087, + "step": 13320 + }, + { + "epoch": 1.254892725088905, + "grad_norm": 0.6547714471817017, + "learning_rate": 6.3231072122122e-06, + "loss": 0.1762, + "step": 13321 + }, + { + "epoch": 1.254986929182073, + "grad_norm": 0.694246768951416, + "learning_rate": 6.321703014761609e-06, + "loss": 0.2019, + "step": 13322 + }, + { + "epoch": 1.2550811332752407, + "grad_norm": 0.6800451874732971, + "learning_rate": 6.320298901181874e-06, + "loss": 0.1962, + "step": 13323 + }, + { + "epoch": 1.2551753373684087, + "grad_norm": 0.7721226811408997, + "learning_rate": 6.318894871505016e-06, + "loss": 0.2348, + "step": 13324 + }, + { + "epoch": 1.2552695414615764, + "grad_norm": 0.6430573463439941, + "learning_rate": 6.31749092576305e-06, + "loss": 0.1966, + "step": 13325 + }, + { + "epoch": 1.2553637455547444, + "grad_norm": 0.6110111474990845, + "learning_rate": 6.3160870639879815e-06, + "loss": 0.1898, + "step": 13326 + }, + { + "epoch": 1.2554579496479121, + "grad_norm": 0.6774904727935791, + "learning_rate": 6.314683286211828e-06, + "loss": 0.2139, + "step": 13327 + }, + { + "epoch": 1.25555215374108, + "grad_norm": 0.7119840383529663, + "learning_rate": 6.313279592466596e-06, + "loss": 0.2097, + "step": 13328 + }, + { + "epoch": 1.2556463578342478, + "grad_norm": 0.6127558350563049, + "learning_rate": 6.311875982784288e-06, + "loss": 0.1954, + "step": 13329 + }, + { + "epoch": 1.2557405619274158, + "grad_norm": 0.5999763607978821, + "learning_rate": 6.310472457196914e-06, + "loss": 0.1867, + "step": 13330 + }, + { + "epoch": 1.2558347660205835, + "grad_norm": 0.667149543762207, + "learning_rate": 6.309069015736475e-06, + "loss": 0.2136, + "step": 13331 + }, + { + "epoch": 1.2559289701137515, + "grad_norm": 0.77800452709198, + "learning_rate": 6.3076656584349695e-06, + "loss": 0.1929, + "step": 13332 + }, + { + "epoch": 1.2560231742069192, + "grad_norm": 0.6693803668022156, + "learning_rate": 6.3062623853243945e-06, + "loss": 0.1877, + "step": 13333 + }, + { + "epoch": 1.2561173783000872, + "grad_norm": 0.6125097274780273, + "learning_rate": 6.304859196436752e-06, + "loss": 0.1985, + "step": 13334 + }, + { + "epoch": 1.256211582393255, + "grad_norm": 0.7701629400253296, + "learning_rate": 6.303456091804034e-06, + "loss": 0.1776, + "step": 13335 + }, + { + "epoch": 1.2563057864864229, + "grad_norm": 0.6409590244293213, + "learning_rate": 6.302053071458233e-06, + "loss": 0.2042, + "step": 13336 + }, + { + "epoch": 1.2563999905795906, + "grad_norm": 0.5813066363334656, + "learning_rate": 6.300650135431342e-06, + "loss": 0.185, + "step": 13337 + }, + { + "epoch": 1.2564941946727586, + "grad_norm": 0.6590962409973145, + "learning_rate": 6.2992472837553495e-06, + "loss": 0.1678, + "step": 13338 + }, + { + "epoch": 1.2565883987659263, + "grad_norm": 0.5903682708740234, + "learning_rate": 6.2978445164622405e-06, + "loss": 0.2212, + "step": 13339 + }, + { + "epoch": 1.2566826028590943, + "grad_norm": 0.7023627758026123, + "learning_rate": 6.296441833584004e-06, + "loss": 0.2191, + "step": 13340 + }, + { + "epoch": 1.256776806952262, + "grad_norm": 0.5824342370033264, + "learning_rate": 6.29503923515262e-06, + "loss": 0.1962, + "step": 13341 + }, + { + "epoch": 1.25687101104543, + "grad_norm": 0.7049176692962646, + "learning_rate": 6.293636721200074e-06, + "loss": 0.212, + "step": 13342 + }, + { + "epoch": 1.2569652151385977, + "grad_norm": 0.6386911273002625, + "learning_rate": 6.292234291758339e-06, + "loss": 0.1746, + "step": 13343 + }, + { + "epoch": 1.2570594192317657, + "grad_norm": 0.7633404731750488, + "learning_rate": 6.290831946859397e-06, + "loss": 0.2032, + "step": 13344 + }, + { + "epoch": 1.2571536233249334, + "grad_norm": 0.7216797471046448, + "learning_rate": 6.289429686535226e-06, + "loss": 0.2268, + "step": 13345 + }, + { + "epoch": 1.2572478274181014, + "grad_norm": 0.673564076423645, + "learning_rate": 6.2880275108177915e-06, + "loss": 0.2085, + "step": 13346 + }, + { + "epoch": 1.257342031511269, + "grad_norm": 0.6010403037071228, + "learning_rate": 6.2866254197390744e-06, + "loss": 0.1894, + "step": 13347 + }, + { + "epoch": 1.257436235604437, + "grad_norm": 0.6412175297737122, + "learning_rate": 6.285223413331043e-06, + "loss": 0.1874, + "step": 13348 + }, + { + "epoch": 1.2575304396976048, + "grad_norm": 0.6892516016960144, + "learning_rate": 6.283821491625655e-06, + "loss": 0.2266, + "step": 13349 + }, + { + "epoch": 1.2576246437907728, + "grad_norm": 0.5850574374198914, + "learning_rate": 6.2824196546548925e-06, + "loss": 0.1903, + "step": 13350 + }, + { + "epoch": 1.2577188478839405, + "grad_norm": 0.6382246613502502, + "learning_rate": 6.281017902450707e-06, + "loss": 0.2172, + "step": 13351 + }, + { + "epoch": 1.2578130519771085, + "grad_norm": 0.5774824619293213, + "learning_rate": 6.279616235045065e-06, + "loss": 0.1953, + "step": 13352 + }, + { + "epoch": 1.2579072560702762, + "grad_norm": 0.6158215403556824, + "learning_rate": 6.278214652469925e-06, + "loss": 0.2159, + "step": 13353 + }, + { + "epoch": 1.2580014601634442, + "grad_norm": 0.6581766605377197, + "learning_rate": 6.27681315475725e-06, + "loss": 0.2442, + "step": 13354 + }, + { + "epoch": 1.258095664256612, + "grad_norm": 0.5932873487472534, + "learning_rate": 6.275411741938991e-06, + "loss": 0.2015, + "step": 13355 + }, + { + "epoch": 1.2581898683497799, + "grad_norm": 0.6409937739372253, + "learning_rate": 6.274010414047105e-06, + "loss": 0.2025, + "step": 13356 + }, + { + "epoch": 1.2582840724429476, + "grad_norm": 0.678022027015686, + "learning_rate": 6.272609171113544e-06, + "loss": 0.2082, + "step": 13357 + }, + { + "epoch": 1.2583782765361156, + "grad_norm": 0.6111551523208618, + "learning_rate": 6.271208013170258e-06, + "loss": 0.181, + "step": 13358 + }, + { + "epoch": 1.2584724806292833, + "grad_norm": 0.7439393997192383, + "learning_rate": 6.269806940249194e-06, + "loss": 0.2055, + "step": 13359 + }, + { + "epoch": 1.2585666847224513, + "grad_norm": 0.6639959216117859, + "learning_rate": 6.268405952382304e-06, + "loss": 0.1895, + "step": 13360 + }, + { + "epoch": 1.258660888815619, + "grad_norm": 0.5995284914970398, + "learning_rate": 6.267005049601529e-06, + "loss": 0.2069, + "step": 13361 + }, + { + "epoch": 1.258755092908787, + "grad_norm": 0.6568467617034912, + "learning_rate": 6.2656042319388145e-06, + "loss": 0.1799, + "step": 13362 + }, + { + "epoch": 1.2588492970019547, + "grad_norm": 0.6851602792739868, + "learning_rate": 6.264203499426092e-06, + "loss": 0.231, + "step": 13363 + }, + { + "epoch": 1.2589435010951227, + "grad_norm": 0.6350014805793762, + "learning_rate": 6.262802852095311e-06, + "loss": 0.1828, + "step": 13364 + }, + { + "epoch": 1.2590377051882904, + "grad_norm": 0.6139953136444092, + "learning_rate": 6.261402289978407e-06, + "loss": 0.1922, + "step": 13365 + }, + { + "epoch": 1.2591319092814584, + "grad_norm": 0.6572669148445129, + "learning_rate": 6.260001813107307e-06, + "loss": 0.2082, + "step": 13366 + }, + { + "epoch": 1.259226113374626, + "grad_norm": 0.6333560943603516, + "learning_rate": 6.258601421513954e-06, + "loss": 0.2076, + "step": 13367 + }, + { + "epoch": 1.259320317467794, + "grad_norm": 0.6473051905632019, + "learning_rate": 6.257201115230276e-06, + "loss": 0.1914, + "step": 13368 + }, + { + "epoch": 1.2594145215609618, + "grad_norm": 0.6263172030448914, + "learning_rate": 6.255800894288196e-06, + "loss": 0.1881, + "step": 13369 + }, + { + "epoch": 1.2595087256541297, + "grad_norm": 0.6993263363838196, + "learning_rate": 6.2544007587196496e-06, + "loss": 0.2003, + "step": 13370 + }, + { + "epoch": 1.2596029297472975, + "grad_norm": 0.748926043510437, + "learning_rate": 6.253000708556558e-06, + "loss": 0.2363, + "step": 13371 + }, + { + "epoch": 1.2596971338404654, + "grad_norm": 0.7410542368888855, + "learning_rate": 6.2516007438308456e-06, + "loss": 0.2338, + "step": 13372 + }, + { + "epoch": 1.2597913379336332, + "grad_norm": 0.6676450967788696, + "learning_rate": 6.250200864574432e-06, + "loss": 0.2038, + "step": 13373 + }, + { + "epoch": 1.2598855420268011, + "grad_norm": 0.593265950679779, + "learning_rate": 6.2488010708192385e-06, + "loss": 0.2039, + "step": 13374 + }, + { + "epoch": 1.2599797461199689, + "grad_norm": 0.6724021434783936, + "learning_rate": 6.247401362597182e-06, + "loss": 0.2267, + "step": 13375 + }, + { + "epoch": 1.2600739502131368, + "grad_norm": 0.5940662622451782, + "learning_rate": 6.246001739940175e-06, + "loss": 0.1859, + "step": 13376 + }, + { + "epoch": 1.2601681543063046, + "grad_norm": 0.6112838387489319, + "learning_rate": 6.244602202880138e-06, + "loss": 0.1858, + "step": 13377 + }, + { + "epoch": 1.2602623583994725, + "grad_norm": 0.6411683559417725, + "learning_rate": 6.2432027514489776e-06, + "loss": 0.1986, + "step": 13378 + }, + { + "epoch": 1.2603565624926403, + "grad_norm": 0.7092647552490234, + "learning_rate": 6.241803385678603e-06, + "loss": 0.2276, + "step": 13379 + }, + { + "epoch": 1.2604507665858082, + "grad_norm": 0.6743448972702026, + "learning_rate": 6.240404105600925e-06, + "loss": 0.2167, + "step": 13380 + }, + { + "epoch": 1.260544970678976, + "grad_norm": 0.6665029525756836, + "learning_rate": 6.239004911247848e-06, + "loss": 0.186, + "step": 13381 + }, + { + "epoch": 1.2606391747721437, + "grad_norm": 0.6669281721115112, + "learning_rate": 6.237605802651277e-06, + "loss": 0.1897, + "step": 13382 + }, + { + "epoch": 1.2607333788653117, + "grad_norm": 0.617443323135376, + "learning_rate": 6.236206779843106e-06, + "loss": 0.1941, + "step": 13383 + }, + { + "epoch": 1.2608275829584796, + "grad_norm": 0.6149744391441345, + "learning_rate": 6.234807842855246e-06, + "loss": 0.2075, + "step": 13384 + }, + { + "epoch": 1.2609217870516474, + "grad_norm": 0.6505521535873413, + "learning_rate": 6.233408991719591e-06, + "loss": 0.1847, + "step": 13385 + }, + { + "epoch": 1.261015991144815, + "grad_norm": 0.6811007261276245, + "learning_rate": 6.2320102264680325e-06, + "loss": 0.2021, + "step": 13386 + }, + { + "epoch": 1.261110195237983, + "grad_norm": 0.7151916027069092, + "learning_rate": 6.23061154713247e-06, + "loss": 0.2231, + "step": 13387 + }, + { + "epoch": 1.261204399331151, + "grad_norm": 0.6212832927703857, + "learning_rate": 6.229212953744796e-06, + "loss": 0.1759, + "step": 13388 + }, + { + "epoch": 1.2612986034243188, + "grad_norm": 0.7134900093078613, + "learning_rate": 6.227814446336894e-06, + "loss": 0.212, + "step": 13389 + }, + { + "epoch": 1.2613928075174865, + "grad_norm": 0.6666131615638733, + "learning_rate": 6.226416024940661e-06, + "loss": 0.1975, + "step": 13390 + }, + { + "epoch": 1.2614870116106545, + "grad_norm": 0.6203562021255493, + "learning_rate": 6.225017689587978e-06, + "loss": 0.1948, + "step": 13391 + }, + { + "epoch": 1.2615812157038224, + "grad_norm": 0.6093359589576721, + "learning_rate": 6.2236194403107275e-06, + "loss": 0.2369, + "step": 13392 + }, + { + "epoch": 1.2616754197969902, + "grad_norm": 0.7116411924362183, + "learning_rate": 6.222221277140793e-06, + "loss": 0.2183, + "step": 13393 + }, + { + "epoch": 1.261769623890158, + "grad_norm": 0.6080445051193237, + "learning_rate": 6.220823200110058e-06, + "loss": 0.2132, + "step": 13394 + }, + { + "epoch": 1.2618638279833259, + "grad_norm": 0.6164963841438293, + "learning_rate": 6.219425209250399e-06, + "loss": 0.2267, + "step": 13395 + }, + { + "epoch": 1.2619580320764938, + "grad_norm": 0.6323091983795166, + "learning_rate": 6.21802730459369e-06, + "loss": 0.2003, + "step": 13396 + }, + { + "epoch": 1.2620522361696616, + "grad_norm": 0.6259250640869141, + "learning_rate": 6.216629486171808e-06, + "loss": 0.2062, + "step": 13397 + }, + { + "epoch": 1.2621464402628293, + "grad_norm": 0.6342028379440308, + "learning_rate": 6.215231754016626e-06, + "loss": 0.1967, + "step": 13398 + }, + { + "epoch": 1.2622406443559973, + "grad_norm": 0.7478952407836914, + "learning_rate": 6.213834108160011e-06, + "loss": 0.2082, + "step": 13399 + }, + { + "epoch": 1.2623348484491652, + "grad_norm": 0.6865331530570984, + "learning_rate": 6.212436548633836e-06, + "loss": 0.2007, + "step": 13400 + }, + { + "epoch": 1.262429052542333, + "grad_norm": 0.6410707235336304, + "learning_rate": 6.211039075469964e-06, + "loss": 0.2128, + "step": 13401 + }, + { + "epoch": 1.2625232566355007, + "grad_norm": 0.7006101608276367, + "learning_rate": 6.209641688700265e-06, + "loss": 0.2193, + "step": 13402 + }, + { + "epoch": 1.2626174607286687, + "grad_norm": 0.5957962274551392, + "learning_rate": 6.2082443883565905e-06, + "loss": 0.1995, + "step": 13403 + }, + { + "epoch": 1.2627116648218366, + "grad_norm": 0.6570765376091003, + "learning_rate": 6.206847174470811e-06, + "loss": 0.1983, + "step": 13404 + }, + { + "epoch": 1.2628058689150043, + "grad_norm": 0.7158840298652649, + "learning_rate": 6.205450047074786e-06, + "loss": 0.2459, + "step": 13405 + }, + { + "epoch": 1.262900073008172, + "grad_norm": 0.6220098733901978, + "learning_rate": 6.204053006200361e-06, + "loss": 0.2014, + "step": 13406 + }, + { + "epoch": 1.26299427710134, + "grad_norm": 0.7102717757225037, + "learning_rate": 6.202656051879405e-06, + "loss": 0.1923, + "step": 13407 + }, + { + "epoch": 1.263088481194508, + "grad_norm": 0.6392085552215576, + "learning_rate": 6.20125918414376e-06, + "loss": 0.1891, + "step": 13408 + }, + { + "epoch": 1.2631826852876757, + "grad_norm": 0.5897543430328369, + "learning_rate": 6.199862403025278e-06, + "loss": 0.2113, + "step": 13409 + }, + { + "epoch": 1.2632768893808435, + "grad_norm": 0.5956061482429504, + "learning_rate": 6.198465708555815e-06, + "loss": 0.1926, + "step": 13410 + }, + { + "epoch": 1.2633710934740114, + "grad_norm": 0.6666398048400879, + "learning_rate": 6.1970691007672124e-06, + "loss": 0.227, + "step": 13411 + }, + { + "epoch": 1.2634652975671794, + "grad_norm": 0.7923960089683533, + "learning_rate": 6.195672579691314e-06, + "loss": 0.186, + "step": 13412 + }, + { + "epoch": 1.2635595016603471, + "grad_norm": 0.6867488622665405, + "learning_rate": 6.194276145359963e-06, + "loss": 0.2213, + "step": 13413 + }, + { + "epoch": 1.2636537057535149, + "grad_norm": 0.6683698892593384, + "learning_rate": 6.192879797805005e-06, + "loss": 0.2064, + "step": 13414 + }, + { + "epoch": 1.2637479098466828, + "grad_norm": 0.6710454821586609, + "learning_rate": 6.191483537058274e-06, + "loss": 0.197, + "step": 13415 + }, + { + "epoch": 1.2638421139398508, + "grad_norm": 0.6741254329681396, + "learning_rate": 6.1900873631516064e-06, + "loss": 0.2063, + "step": 13416 + }, + { + "epoch": 1.2639363180330185, + "grad_norm": 0.703450083732605, + "learning_rate": 6.188691276116841e-06, + "loss": 0.2154, + "step": 13417 + }, + { + "epoch": 1.2640305221261863, + "grad_norm": 0.6495195627212524, + "learning_rate": 6.18729527598581e-06, + "loss": 0.2049, + "step": 13418 + }, + { + "epoch": 1.2641247262193542, + "grad_norm": 0.6587666869163513, + "learning_rate": 6.18589936279034e-06, + "loss": 0.2528, + "step": 13419 + }, + { + "epoch": 1.2642189303125222, + "grad_norm": 0.6717892289161682, + "learning_rate": 6.1845035365622655e-06, + "loss": 0.2048, + "step": 13420 + }, + { + "epoch": 1.26431313440569, + "grad_norm": 0.6473823189735413, + "learning_rate": 6.183107797333411e-06, + "loss": 0.1894, + "step": 13421 + }, + { + "epoch": 1.2644073384988577, + "grad_norm": 0.6968402862548828, + "learning_rate": 6.181712145135603e-06, + "loss": 0.2301, + "step": 13422 + }, + { + "epoch": 1.2645015425920256, + "grad_norm": 0.6945221424102783, + "learning_rate": 6.1803165800006585e-06, + "loss": 0.2142, + "step": 13423 + }, + { + "epoch": 1.2645957466851936, + "grad_norm": 0.6499255895614624, + "learning_rate": 6.178921101960407e-06, + "loss": 0.2055, + "step": 13424 + }, + { + "epoch": 1.2646899507783613, + "grad_norm": 0.674542248249054, + "learning_rate": 6.177525711046664e-06, + "loss": 0.2271, + "step": 13425 + }, + { + "epoch": 1.264784154871529, + "grad_norm": 0.6025372743606567, + "learning_rate": 6.176130407291243e-06, + "loss": 0.1968, + "step": 13426 + }, + { + "epoch": 1.264878358964697, + "grad_norm": 0.6595885753631592, + "learning_rate": 6.174735190725967e-06, + "loss": 0.2335, + "step": 13427 + }, + { + "epoch": 1.264972563057865, + "grad_norm": 0.6103795170783997, + "learning_rate": 6.173340061382642e-06, + "loss": 0.2081, + "step": 13428 + }, + { + "epoch": 1.2650667671510327, + "grad_norm": 0.596925675868988, + "learning_rate": 6.1719450192930786e-06, + "loss": 0.1965, + "step": 13429 + }, + { + "epoch": 1.2651609712442005, + "grad_norm": 0.6182538270950317, + "learning_rate": 6.1705500644890946e-06, + "loss": 0.198, + "step": 13430 + }, + { + "epoch": 1.2652551753373684, + "grad_norm": 0.6547569632530212, + "learning_rate": 6.16915519700249e-06, + "loss": 0.1984, + "step": 13431 + }, + { + "epoch": 1.2653493794305364, + "grad_norm": 0.6183830499649048, + "learning_rate": 6.1677604168650705e-06, + "loss": 0.1706, + "step": 13432 + }, + { + "epoch": 1.2654435835237041, + "grad_norm": 0.6266723275184631, + "learning_rate": 6.1663657241086385e-06, + "loss": 0.1862, + "step": 13433 + }, + { + "epoch": 1.2655377876168719, + "grad_norm": 0.6035991907119751, + "learning_rate": 6.164971118764999e-06, + "loss": 0.1855, + "step": 13434 + }, + { + "epoch": 1.2656319917100398, + "grad_norm": 0.7422029376029968, + "learning_rate": 6.16357660086595e-06, + "loss": 0.2166, + "step": 13435 + }, + { + "epoch": 1.2657261958032078, + "grad_norm": 0.6359850168228149, + "learning_rate": 6.162182170443285e-06, + "loss": 0.2069, + "step": 13436 + }, + { + "epoch": 1.2658203998963755, + "grad_norm": 0.5942542552947998, + "learning_rate": 6.160787827528802e-06, + "loss": 0.1769, + "step": 13437 + }, + { + "epoch": 1.2659146039895433, + "grad_norm": 0.6017497777938843, + "learning_rate": 6.159393572154296e-06, + "loss": 0.1845, + "step": 13438 + }, + { + "epoch": 1.2660088080827112, + "grad_norm": 0.5920788049697876, + "learning_rate": 6.1579994043515535e-06, + "loss": 0.2143, + "step": 13439 + }, + { + "epoch": 1.2661030121758792, + "grad_norm": 0.6443082690238953, + "learning_rate": 6.156605324152369e-06, + "loss": 0.1766, + "step": 13440 + }, + { + "epoch": 1.266197216269047, + "grad_norm": 0.7014357447624207, + "learning_rate": 6.155211331588527e-06, + "loss": 0.1884, + "step": 13441 + }, + { + "epoch": 1.2662914203622146, + "grad_norm": 0.6396356225013733, + "learning_rate": 6.153817426691813e-06, + "loss": 0.1896, + "step": 13442 + }, + { + "epoch": 1.2663856244553826, + "grad_norm": 0.6797584891319275, + "learning_rate": 6.152423609494005e-06, + "loss": 0.2122, + "step": 13443 + }, + { + "epoch": 1.2664798285485506, + "grad_norm": 0.60710209608078, + "learning_rate": 6.151029880026893e-06, + "loss": 0.1645, + "step": 13444 + }, + { + "epoch": 1.2665740326417183, + "grad_norm": 0.6762518286705017, + "learning_rate": 6.149636238322255e-06, + "loss": 0.2045, + "step": 13445 + }, + { + "epoch": 1.266668236734886, + "grad_norm": 0.6514323949813843, + "learning_rate": 6.148242684411859e-06, + "loss": 0.1799, + "step": 13446 + }, + { + "epoch": 1.266762440828054, + "grad_norm": 0.5996586084365845, + "learning_rate": 6.146849218327493e-06, + "loss": 0.2042, + "step": 13447 + }, + { + "epoch": 1.266856644921222, + "grad_norm": 0.6697123050689697, + "learning_rate": 6.145455840100921e-06, + "loss": 0.2215, + "step": 13448 + }, + { + "epoch": 1.2669508490143897, + "grad_norm": 0.6317233443260193, + "learning_rate": 6.144062549763914e-06, + "loss": 0.2042, + "step": 13449 + }, + { + "epoch": 1.2670450531075574, + "grad_norm": 0.6346283555030823, + "learning_rate": 6.142669347348249e-06, + "loss": 0.2037, + "step": 13450 + }, + { + "epoch": 1.2671392572007254, + "grad_norm": 0.6377195119857788, + "learning_rate": 6.141276232885687e-06, + "loss": 0.1988, + "step": 13451 + }, + { + "epoch": 1.2672334612938931, + "grad_norm": 0.6562709808349609, + "learning_rate": 6.139883206407995e-06, + "loss": 0.2204, + "step": 13452 + }, + { + "epoch": 1.267327665387061, + "grad_norm": 0.6999305486679077, + "learning_rate": 6.138490267946933e-06, + "loss": 0.2134, + "step": 13453 + }, + { + "epoch": 1.2674218694802288, + "grad_norm": 0.6550266146659851, + "learning_rate": 6.137097417534268e-06, + "loss": 0.1927, + "step": 13454 + }, + { + "epoch": 1.2675160735733968, + "grad_norm": 0.6951824426651001, + "learning_rate": 6.135704655201755e-06, + "loss": 0.2155, + "step": 13455 + }, + { + "epoch": 1.2676102776665645, + "grad_norm": 0.666174590587616, + "learning_rate": 6.134311980981149e-06, + "loss": 0.2001, + "step": 13456 + }, + { + "epoch": 1.2677044817597325, + "grad_norm": 0.6004818677902222, + "learning_rate": 6.132919394904212e-06, + "loss": 0.184, + "step": 13457 + }, + { + "epoch": 1.2677986858529002, + "grad_norm": 0.612799882888794, + "learning_rate": 6.131526897002693e-06, + "loss": 0.1976, + "step": 13458 + }, + { + "epoch": 1.2678928899460682, + "grad_norm": 0.6072046756744385, + "learning_rate": 6.130134487308341e-06, + "loss": 0.1762, + "step": 13459 + }, + { + "epoch": 1.267987094039236, + "grad_norm": 0.648533821105957, + "learning_rate": 6.12874216585291e-06, + "loss": 0.2029, + "step": 13460 + }, + { + "epoch": 1.268081298132404, + "grad_norm": 0.6512607336044312, + "learning_rate": 6.127349932668144e-06, + "loss": 0.2426, + "step": 13461 + }, + { + "epoch": 1.2681755022255716, + "grad_norm": 0.6310539841651917, + "learning_rate": 6.12595778778579e-06, + "loss": 0.1868, + "step": 13462 + }, + { + "epoch": 1.2682697063187396, + "grad_norm": 0.6344117522239685, + "learning_rate": 6.124565731237586e-06, + "loss": 0.1886, + "step": 13463 + }, + { + "epoch": 1.2683639104119073, + "grad_norm": 0.6713270545005798, + "learning_rate": 6.123173763055279e-06, + "loss": 0.1888, + "step": 13464 + }, + { + "epoch": 1.2684581145050753, + "grad_norm": 0.6484018564224243, + "learning_rate": 6.121781883270609e-06, + "loss": 0.1955, + "step": 13465 + }, + { + "epoch": 1.268552318598243, + "grad_norm": 0.6407499313354492, + "learning_rate": 6.1203900919153026e-06, + "loss": 0.2027, + "step": 13466 + }, + { + "epoch": 1.268646522691411, + "grad_norm": 0.6809627413749695, + "learning_rate": 6.118998389021109e-06, + "loss": 0.2135, + "step": 13467 + }, + { + "epoch": 1.2687407267845787, + "grad_norm": 0.6053674221038818, + "learning_rate": 6.117606774619752e-06, + "loss": 0.1995, + "step": 13468 + }, + { + "epoch": 1.2688349308777467, + "grad_norm": 0.6646608710289001, + "learning_rate": 6.116215248742961e-06, + "loss": 0.2257, + "step": 13469 + }, + { + "epoch": 1.2689291349709144, + "grad_norm": 0.6139371991157532, + "learning_rate": 6.114823811422474e-06, + "loss": 0.2173, + "step": 13470 + }, + { + "epoch": 1.2690233390640824, + "grad_norm": 0.6624774932861328, + "learning_rate": 6.1134324626900125e-06, + "loss": 0.1883, + "step": 13471 + }, + { + "epoch": 1.2691175431572501, + "grad_norm": 0.746110200881958, + "learning_rate": 6.112041202577299e-06, + "loss": 0.2286, + "step": 13472 + }, + { + "epoch": 1.269211747250418, + "grad_norm": 0.6624348759651184, + "learning_rate": 6.110650031116059e-06, + "loss": 0.2179, + "step": 13473 + }, + { + "epoch": 1.2693059513435858, + "grad_norm": 0.7462780475616455, + "learning_rate": 6.1092589483380126e-06, + "loss": 0.2055, + "step": 13474 + }, + { + "epoch": 1.2694001554367538, + "grad_norm": 0.6260996460914612, + "learning_rate": 6.107867954274882e-06, + "loss": 0.2193, + "step": 13475 + }, + { + "epoch": 1.2694943595299215, + "grad_norm": 0.6321893334388733, + "learning_rate": 6.106477048958378e-06, + "loss": 0.2063, + "step": 13476 + }, + { + "epoch": 1.2695885636230895, + "grad_norm": 0.6717593669891357, + "learning_rate": 6.105086232420221e-06, + "loss": 0.1941, + "step": 13477 + }, + { + "epoch": 1.2696827677162572, + "grad_norm": 0.6348603367805481, + "learning_rate": 6.103695504692122e-06, + "loss": 0.1913, + "step": 13478 + }, + { + "epoch": 1.2697769718094252, + "grad_norm": 0.7140622735023499, + "learning_rate": 6.1023048658057886e-06, + "loss": 0.2087, + "step": 13479 + }, + { + "epoch": 1.269871175902593, + "grad_norm": 0.6726442575454712, + "learning_rate": 6.100914315792934e-06, + "loss": 0.228, + "step": 13480 + }, + { + "epoch": 1.2699653799957609, + "grad_norm": 0.6332194805145264, + "learning_rate": 6.099523854685264e-06, + "loss": 0.2115, + "step": 13481 + }, + { + "epoch": 1.2700595840889286, + "grad_norm": 0.6968733668327332, + "learning_rate": 6.098133482514483e-06, + "loss": 0.1848, + "step": 13482 + }, + { + "epoch": 1.2701537881820966, + "grad_norm": 0.7023800015449524, + "learning_rate": 6.096743199312289e-06, + "loss": 0.225, + "step": 13483 + }, + { + "epoch": 1.2702479922752643, + "grad_norm": 0.7038373947143555, + "learning_rate": 6.095353005110389e-06, + "loss": 0.2299, + "step": 13484 + }, + { + "epoch": 1.2703421963684323, + "grad_norm": 0.7013296484947205, + "learning_rate": 6.093962899940482e-06, + "loss": 0.2276, + "step": 13485 + }, + { + "epoch": 1.2704364004616, + "grad_norm": 0.6218545436859131, + "learning_rate": 6.0925728838342545e-06, + "loss": 0.1978, + "step": 13486 + }, + { + "epoch": 1.270530604554768, + "grad_norm": 0.5693235993385315, + "learning_rate": 6.091182956823415e-06, + "loss": 0.1663, + "step": 13487 + }, + { + "epoch": 1.2706248086479357, + "grad_norm": 0.6664445400238037, + "learning_rate": 6.089793118939646e-06, + "loss": 0.2001, + "step": 13488 + }, + { + "epoch": 1.2707190127411037, + "grad_norm": 0.6295485496520996, + "learning_rate": 6.088403370214639e-06, + "loss": 0.2095, + "step": 13489 + }, + { + "epoch": 1.2708132168342714, + "grad_norm": 0.7190636992454529, + "learning_rate": 6.08701371068009e-06, + "loss": 0.1925, + "step": 13490 + }, + { + "epoch": 1.2709074209274394, + "grad_norm": 0.7192946672439575, + "learning_rate": 6.085624140367677e-06, + "loss": 0.252, + "step": 13491 + }, + { + "epoch": 1.271001625020607, + "grad_norm": 0.5965147018432617, + "learning_rate": 6.084234659309088e-06, + "loss": 0.1955, + "step": 13492 + }, + { + "epoch": 1.271095829113775, + "grad_norm": 0.6417478919029236, + "learning_rate": 6.082845267536003e-06, + "loss": 0.1896, + "step": 13493 + }, + { + "epoch": 1.2711900332069428, + "grad_norm": 0.6431136131286621, + "learning_rate": 6.081455965080105e-06, + "loss": 0.1938, + "step": 13494 + }, + { + "epoch": 1.2712842373001108, + "grad_norm": 0.689927339553833, + "learning_rate": 6.080066751973073e-06, + "loss": 0.1996, + "step": 13495 + }, + { + "epoch": 1.2713784413932785, + "grad_norm": 0.6282612085342407, + "learning_rate": 6.078677628246577e-06, + "loss": 0.2118, + "step": 13496 + }, + { + "epoch": 1.2714726454864465, + "grad_norm": 0.6963714957237244, + "learning_rate": 6.077288593932298e-06, + "loss": 0.1892, + "step": 13497 + }, + { + "epoch": 1.2715668495796142, + "grad_norm": 0.6831355094909668, + "learning_rate": 6.075899649061907e-06, + "loss": 0.2163, + "step": 13498 + }, + { + "epoch": 1.2716610536727821, + "grad_norm": 0.7394452691078186, + "learning_rate": 6.07451079366707e-06, + "loss": 0.2024, + "step": 13499 + }, + { + "epoch": 1.2717552577659499, + "grad_norm": 0.6782084703445435, + "learning_rate": 6.073122027779459e-06, + "loss": 0.2091, + "step": 13500 + }, + { + "epoch": 1.2718494618591178, + "grad_norm": 0.6251419186592102, + "learning_rate": 6.071733351430739e-06, + "loss": 0.2007, + "step": 13501 + }, + { + "epoch": 1.2719436659522856, + "grad_norm": 0.6253593564033508, + "learning_rate": 6.070344764652577e-06, + "loss": 0.195, + "step": 13502 + }, + { + "epoch": 1.2720378700454535, + "grad_norm": 0.6126936078071594, + "learning_rate": 6.068956267476624e-06, + "loss": 0.1963, + "step": 13503 + }, + { + "epoch": 1.2721320741386213, + "grad_norm": 0.6560893058776855, + "learning_rate": 6.067567859934553e-06, + "loss": 0.1872, + "step": 13504 + }, + { + "epoch": 1.2722262782317892, + "grad_norm": 0.7075398564338684, + "learning_rate": 6.0661795420580185e-06, + "loss": 0.2172, + "step": 13505 + }, + { + "epoch": 1.272320482324957, + "grad_norm": 0.5794890522956848, + "learning_rate": 6.064791313878667e-06, + "loss": 0.1711, + "step": 13506 + }, + { + "epoch": 1.272414686418125, + "grad_norm": 0.6572920680046082, + "learning_rate": 6.063403175428166e-06, + "loss": 0.1983, + "step": 13507 + }, + { + "epoch": 1.2725088905112927, + "grad_norm": 0.6745020151138306, + "learning_rate": 6.0620151267381585e-06, + "loss": 0.2091, + "step": 13508 + }, + { + "epoch": 1.2726030946044606, + "grad_norm": 0.6793653964996338, + "learning_rate": 6.060627167840294e-06, + "loss": 0.2253, + "step": 13509 + }, + { + "epoch": 1.2726972986976284, + "grad_norm": 0.7092230916023254, + "learning_rate": 6.059239298766226e-06, + "loss": 0.254, + "step": 13510 + }, + { + "epoch": 1.2727915027907963, + "grad_norm": 0.6768363118171692, + "learning_rate": 6.057851519547595e-06, + "loss": 0.2019, + "step": 13511 + }, + { + "epoch": 1.272885706883964, + "grad_norm": 0.5751312971115112, + "learning_rate": 6.0564638302160474e-06, + "loss": 0.1833, + "step": 13512 + }, + { + "epoch": 1.272979910977132, + "grad_norm": 0.6538204550743103, + "learning_rate": 6.05507623080322e-06, + "loss": 0.1964, + "step": 13513 + }, + { + "epoch": 1.2730741150702998, + "grad_norm": 0.7615554332733154, + "learning_rate": 6.053688721340758e-06, + "loss": 0.2163, + "step": 13514 + }, + { + "epoch": 1.2731683191634677, + "grad_norm": 0.6217525005340576, + "learning_rate": 6.052301301860296e-06, + "loss": 0.2232, + "step": 13515 + }, + { + "epoch": 1.2732625232566355, + "grad_norm": 0.62110435962677, + "learning_rate": 6.050913972393468e-06, + "loss": 0.1924, + "step": 13516 + }, + { + "epoch": 1.2733567273498034, + "grad_norm": 0.6933391690254211, + "learning_rate": 6.049526732971911e-06, + "loss": 0.2219, + "step": 13517 + }, + { + "epoch": 1.2734509314429712, + "grad_norm": 0.6840419769287109, + "learning_rate": 6.048139583627252e-06, + "loss": 0.1849, + "step": 13518 + }, + { + "epoch": 1.2735451355361391, + "grad_norm": 0.6440111398696899, + "learning_rate": 6.046752524391122e-06, + "loss": 0.1937, + "step": 13519 + }, + { + "epoch": 1.2736393396293069, + "grad_norm": 0.6548159122467041, + "learning_rate": 6.045365555295151e-06, + "loss": 0.2187, + "step": 13520 + }, + { + "epoch": 1.2737335437224746, + "grad_norm": 0.7605534791946411, + "learning_rate": 6.04397867637096e-06, + "loss": 0.1825, + "step": 13521 + }, + { + "epoch": 1.2738277478156426, + "grad_norm": 0.6676763892173767, + "learning_rate": 6.042591887650175e-06, + "loss": 0.2044, + "step": 13522 + }, + { + "epoch": 1.2739219519088105, + "grad_norm": 0.633726179599762, + "learning_rate": 6.04120518916441e-06, + "loss": 0.1913, + "step": 13523 + }, + { + "epoch": 1.2740161560019783, + "grad_norm": 0.7962849140167236, + "learning_rate": 6.039818580945293e-06, + "loss": 0.2209, + "step": 13524 + }, + { + "epoch": 1.274110360095146, + "grad_norm": 0.6224062442779541, + "learning_rate": 6.038432063024437e-06, + "loss": 0.2078, + "step": 13525 + }, + { + "epoch": 1.274204564188314, + "grad_norm": 0.6988505125045776, + "learning_rate": 6.037045635433454e-06, + "loss": 0.2157, + "step": 13526 + }, + { + "epoch": 1.274298768281482, + "grad_norm": 0.6654592752456665, + "learning_rate": 6.035659298203963e-06, + "loss": 0.1987, + "step": 13527 + }, + { + "epoch": 1.2743929723746497, + "grad_norm": 0.6687403917312622, + "learning_rate": 6.03427305136757e-06, + "loss": 0.2183, + "step": 13528 + }, + { + "epoch": 1.2744871764678174, + "grad_norm": 0.6197983026504517, + "learning_rate": 6.03288689495588e-06, + "loss": 0.21, + "step": 13529 + }, + { + "epoch": 1.2745813805609854, + "grad_norm": 0.6117048263549805, + "learning_rate": 6.031500829000509e-06, + "loss": 0.2103, + "step": 13530 + }, + { + "epoch": 1.2746755846541533, + "grad_norm": 0.7111307382583618, + "learning_rate": 6.030114853533057e-06, + "loss": 0.2159, + "step": 13531 + }, + { + "epoch": 1.274769788747321, + "grad_norm": 0.6280469298362732, + "learning_rate": 6.028728968585125e-06, + "loss": 0.1944, + "step": 13532 + }, + { + "epoch": 1.2748639928404888, + "grad_norm": 1.2519681453704834, + "learning_rate": 6.0273431741883115e-06, + "loss": 0.202, + "step": 13533 + }, + { + "epoch": 1.2749581969336568, + "grad_norm": 0.641857385635376, + "learning_rate": 6.02595747037422e-06, + "loss": 0.2074, + "step": 13534 + }, + { + "epoch": 1.2750524010268247, + "grad_norm": 0.6303014159202576, + "learning_rate": 6.024571857174443e-06, + "loss": 0.1903, + "step": 13535 + }, + { + "epoch": 1.2751466051199924, + "grad_norm": 0.7295275330543518, + "learning_rate": 6.023186334620574e-06, + "loss": 0.1933, + "step": 13536 + }, + { + "epoch": 1.2752408092131602, + "grad_norm": 0.6660329699516296, + "learning_rate": 6.0218009027442105e-06, + "loss": 0.2011, + "step": 13537 + }, + { + "epoch": 1.2753350133063281, + "grad_norm": 0.6415559649467468, + "learning_rate": 6.020415561576938e-06, + "loss": 0.2096, + "step": 13538 + }, + { + "epoch": 1.275429217399496, + "grad_norm": 0.6854776740074158, + "learning_rate": 6.019030311150342e-06, + "loss": 0.2016, + "step": 13539 + }, + { + "epoch": 1.2755234214926638, + "grad_norm": 0.6685968637466431, + "learning_rate": 6.017645151496015e-06, + "loss": 0.2119, + "step": 13540 + }, + { + "epoch": 1.2756176255858316, + "grad_norm": 0.6606480479240417, + "learning_rate": 6.0162600826455375e-06, + "loss": 0.1943, + "step": 13541 + }, + { + "epoch": 1.2757118296789995, + "grad_norm": 0.954875648021698, + "learning_rate": 6.014875104630493e-06, + "loss": 0.2248, + "step": 13542 + }, + { + "epoch": 1.2758060337721675, + "grad_norm": 0.7003217339515686, + "learning_rate": 6.013490217482452e-06, + "loss": 0.2327, + "step": 13543 + }, + { + "epoch": 1.2759002378653352, + "grad_norm": 0.6598342657089233, + "learning_rate": 6.0121054212330066e-06, + "loss": 0.2404, + "step": 13544 + }, + { + "epoch": 1.275994441958503, + "grad_norm": 0.6676200032234192, + "learning_rate": 6.010720715913723e-06, + "loss": 0.2383, + "step": 13545 + }, + { + "epoch": 1.276088646051671, + "grad_norm": 0.6191518306732178, + "learning_rate": 6.009336101556171e-06, + "loss": 0.1803, + "step": 13546 + }, + { + "epoch": 1.276182850144839, + "grad_norm": 0.6574147343635559, + "learning_rate": 6.007951578191935e-06, + "loss": 0.2073, + "step": 13547 + }, + { + "epoch": 1.2762770542380066, + "grad_norm": 0.5903410911560059, + "learning_rate": 6.006567145852575e-06, + "loss": 0.1949, + "step": 13548 + }, + { + "epoch": 1.2763712583311744, + "grad_norm": 0.7233710289001465, + "learning_rate": 6.0051828045696555e-06, + "loss": 0.2356, + "step": 13549 + }, + { + "epoch": 1.2764654624243423, + "grad_norm": 0.6821407675743103, + "learning_rate": 6.0037985543747524e-06, + "loss": 0.2295, + "step": 13550 + }, + { + "epoch": 1.2765596665175103, + "grad_norm": 0.6873706579208374, + "learning_rate": 6.00241439529942e-06, + "loss": 0.2101, + "step": 13551 + }, + { + "epoch": 1.276653870610678, + "grad_norm": 0.5550234913825989, + "learning_rate": 6.001030327375222e-06, + "loss": 0.1893, + "step": 13552 + }, + { + "epoch": 1.2767480747038458, + "grad_norm": 0.7132258415222168, + "learning_rate": 5.999646350633715e-06, + "loss": 0.2239, + "step": 13553 + }, + { + "epoch": 1.2768422787970137, + "grad_norm": 0.6445050239562988, + "learning_rate": 5.9982624651064605e-06, + "loss": 0.2303, + "step": 13554 + }, + { + "epoch": 1.2769364828901817, + "grad_norm": 0.5875824093818665, + "learning_rate": 5.99687867082501e-06, + "loss": 0.1981, + "step": 13555 + }, + { + "epoch": 1.2770306869833494, + "grad_norm": 0.5886663794517517, + "learning_rate": 5.995494967820915e-06, + "loss": 0.1927, + "step": 13556 + }, + { + "epoch": 1.2771248910765172, + "grad_norm": 0.6198879480361938, + "learning_rate": 5.994111356125729e-06, + "loss": 0.1979, + "step": 13557 + }, + { + "epoch": 1.2772190951696851, + "grad_norm": 0.7277728915214539, + "learning_rate": 5.992727835771002e-06, + "loss": 0.2291, + "step": 13558 + }, + { + "epoch": 1.277313299262853, + "grad_norm": 0.6491997838020325, + "learning_rate": 5.9913444067882735e-06, + "loss": 0.2108, + "step": 13559 + }, + { + "epoch": 1.2774075033560208, + "grad_norm": 0.7521607279777527, + "learning_rate": 5.989961069209094e-06, + "loss": 0.2438, + "step": 13560 + }, + { + "epoch": 1.2775017074491886, + "grad_norm": 0.6627978086471558, + "learning_rate": 5.9885778230650024e-06, + "loss": 0.2285, + "step": 13561 + }, + { + "epoch": 1.2775959115423565, + "grad_norm": 0.6448333859443665, + "learning_rate": 5.9871946683875444e-06, + "loss": 0.2119, + "step": 13562 + }, + { + "epoch": 1.2776901156355245, + "grad_norm": 0.6985797882080078, + "learning_rate": 5.985811605208247e-06, + "loss": 0.2284, + "step": 13563 + }, + { + "epoch": 1.2777843197286922, + "grad_norm": 0.640409529209137, + "learning_rate": 5.984428633558661e-06, + "loss": 0.1924, + "step": 13564 + }, + { + "epoch": 1.27787852382186, + "grad_norm": 0.6922116279602051, + "learning_rate": 5.983045753470308e-06, + "loss": 0.2006, + "step": 13565 + }, + { + "epoch": 1.277972727915028, + "grad_norm": 0.6037483811378479, + "learning_rate": 5.981662964974721e-06, + "loss": 0.1798, + "step": 13566 + }, + { + "epoch": 1.2780669320081959, + "grad_norm": 0.7847719788551331, + "learning_rate": 5.980280268103439e-06, + "loss": 0.2183, + "step": 13567 + }, + { + "epoch": 1.2781611361013636, + "grad_norm": 0.7188629508018494, + "learning_rate": 5.978897662887982e-06, + "loss": 0.1801, + "step": 13568 + }, + { + "epoch": 1.2782553401945314, + "grad_norm": 0.6025604009628296, + "learning_rate": 5.9775151493598735e-06, + "loss": 0.178, + "step": 13569 + }, + { + "epoch": 1.2783495442876993, + "grad_norm": 0.6027699708938599, + "learning_rate": 5.9761327275506435e-06, + "loss": 0.1783, + "step": 13570 + }, + { + "epoch": 1.2784437483808673, + "grad_norm": 0.6953678727149963, + "learning_rate": 5.9747503974918105e-06, + "loss": 0.2163, + "step": 13571 + }, + { + "epoch": 1.278537952474035, + "grad_norm": 0.5966035723686218, + "learning_rate": 5.973368159214893e-06, + "loss": 0.1852, + "step": 13572 + }, + { + "epoch": 1.2786321565672027, + "grad_norm": 0.6455698609352112, + "learning_rate": 5.971986012751407e-06, + "loss": 0.2162, + "step": 13573 + }, + { + "epoch": 1.2787263606603707, + "grad_norm": 0.624878466129303, + "learning_rate": 5.970603958132871e-06, + "loss": 0.2072, + "step": 13574 + }, + { + "epoch": 1.2788205647535387, + "grad_norm": 0.6461345553398132, + "learning_rate": 5.969221995390797e-06, + "loss": 0.1863, + "step": 13575 + }, + { + "epoch": 1.2789147688467064, + "grad_norm": 0.6307018995285034, + "learning_rate": 5.967840124556693e-06, + "loss": 0.1924, + "step": 13576 + }, + { + "epoch": 1.2790089729398741, + "grad_norm": 0.6108476519584656, + "learning_rate": 5.966458345662072e-06, + "loss": 0.1875, + "step": 13577 + }, + { + "epoch": 1.279103177033042, + "grad_norm": 0.6464184522628784, + "learning_rate": 5.965076658738439e-06, + "loss": 0.204, + "step": 13578 + }, + { + "epoch": 1.27919738112621, + "grad_norm": 0.6568793058395386, + "learning_rate": 5.963695063817297e-06, + "loss": 0.2057, + "step": 13579 + }, + { + "epoch": 1.2792915852193778, + "grad_norm": 0.6844866275787354, + "learning_rate": 5.9623135609301495e-06, + "loss": 0.2215, + "step": 13580 + }, + { + "epoch": 1.2793857893125455, + "grad_norm": 0.6494469046592712, + "learning_rate": 5.960932150108498e-06, + "loss": 0.225, + "step": 13581 + }, + { + "epoch": 1.2794799934057135, + "grad_norm": 0.695708155632019, + "learning_rate": 5.959550831383842e-06, + "loss": 0.2069, + "step": 13582 + }, + { + "epoch": 1.2795741974988815, + "grad_norm": 0.7102089524269104, + "learning_rate": 5.9581696047876714e-06, + "loss": 0.2391, + "step": 13583 + }, + { + "epoch": 1.2796684015920492, + "grad_norm": 0.6285651326179504, + "learning_rate": 5.956788470351489e-06, + "loss": 0.204, + "step": 13584 + }, + { + "epoch": 1.279762605685217, + "grad_norm": 0.6996090412139893, + "learning_rate": 5.955407428106781e-06, + "loss": 0.2024, + "step": 13585 + }, + { + "epoch": 1.279856809778385, + "grad_norm": 0.7680042386054993, + "learning_rate": 5.954026478085035e-06, + "loss": 0.2115, + "step": 13586 + }, + { + "epoch": 1.2799510138715529, + "grad_norm": 0.6397778987884521, + "learning_rate": 5.952645620317748e-06, + "loss": 0.1866, + "step": 13587 + }, + { + "epoch": 1.2800452179647206, + "grad_norm": 0.6503490805625916, + "learning_rate": 5.951264854836398e-06, + "loss": 0.1947, + "step": 13588 + }, + { + "epoch": 1.2801394220578883, + "grad_norm": 0.645462155342102, + "learning_rate": 5.949884181672469e-06, + "loss": 0.1991, + "step": 13589 + }, + { + "epoch": 1.2802336261510563, + "grad_norm": 0.7461054921150208, + "learning_rate": 5.9485036008574475e-06, + "loss": 0.2373, + "step": 13590 + }, + { + "epoch": 1.280327830244224, + "grad_norm": 0.6423100233078003, + "learning_rate": 5.947123112422808e-06, + "loss": 0.1909, + "step": 13591 + }, + { + "epoch": 1.280422034337392, + "grad_norm": 0.634678065776825, + "learning_rate": 5.94574271640003e-06, + "loss": 0.2063, + "step": 13592 + }, + { + "epoch": 1.2805162384305597, + "grad_norm": 0.6117005348205566, + "learning_rate": 5.944362412820586e-06, + "loss": 0.218, + "step": 13593 + }, + { + "epoch": 1.2806104425237277, + "grad_norm": 0.6724438071250916, + "learning_rate": 5.942982201715954e-06, + "loss": 0.2084, + "step": 13594 + }, + { + "epoch": 1.2807046466168954, + "grad_norm": 0.684286892414093, + "learning_rate": 5.941602083117601e-06, + "loss": 0.2231, + "step": 13595 + }, + { + "epoch": 1.2807988507100634, + "grad_norm": 0.7269381880760193, + "learning_rate": 5.9402220570569945e-06, + "loss": 0.1986, + "step": 13596 + }, + { + "epoch": 1.2808930548032311, + "grad_norm": 0.6402116417884827, + "learning_rate": 5.9388421235656065e-06, + "loss": 0.216, + "step": 13597 + }, + { + "epoch": 1.280987258896399, + "grad_norm": 0.5580009818077087, + "learning_rate": 5.9374622826748994e-06, + "loss": 0.1756, + "step": 13598 + }, + { + "epoch": 1.2810814629895668, + "grad_norm": 0.6470601558685303, + "learning_rate": 5.936082534416332e-06, + "loss": 0.2119, + "step": 13599 + }, + { + "epoch": 1.2811756670827348, + "grad_norm": 0.7546946406364441, + "learning_rate": 5.934702878821371e-06, + "loss": 0.229, + "step": 13600 + }, + { + "epoch": 1.2812698711759025, + "grad_norm": 0.5874007344245911, + "learning_rate": 5.933323315921471e-06, + "loss": 0.2011, + "step": 13601 + }, + { + "epoch": 1.2813640752690705, + "grad_norm": 0.5410974025726318, + "learning_rate": 5.931943845748089e-06, + "loss": 0.1747, + "step": 13602 + }, + { + "epoch": 1.2814582793622382, + "grad_norm": 0.5804749727249146, + "learning_rate": 5.9305644683326755e-06, + "loss": 0.187, + "step": 13603 + }, + { + "epoch": 1.2815524834554062, + "grad_norm": 0.6447020173072815, + "learning_rate": 5.929185183706689e-06, + "loss": 0.2153, + "step": 13604 + }, + { + "epoch": 1.281646687548574, + "grad_norm": 0.6586759090423584, + "learning_rate": 5.927805991901576e-06, + "loss": 0.2184, + "step": 13605 + }, + { + "epoch": 1.2817408916417419, + "grad_norm": 0.5643503665924072, + "learning_rate": 5.926426892948779e-06, + "loss": 0.1957, + "step": 13606 + }, + { + "epoch": 1.2818350957349096, + "grad_norm": 0.6107988357543945, + "learning_rate": 5.925047886879756e-06, + "loss": 0.185, + "step": 13607 + }, + { + "epoch": 1.2819292998280776, + "grad_norm": 0.6334426403045654, + "learning_rate": 5.92366897372594e-06, + "loss": 0.2218, + "step": 13608 + }, + { + "epoch": 1.2820235039212453, + "grad_norm": 0.6359449028968811, + "learning_rate": 5.922290153518772e-06, + "loss": 0.2093, + "step": 13609 + }, + { + "epoch": 1.2821177080144133, + "grad_norm": 0.5687635540962219, + "learning_rate": 5.9209114262897e-06, + "loss": 0.1728, + "step": 13610 + }, + { + "epoch": 1.282211912107581, + "grad_norm": 0.6605468988418579, + "learning_rate": 5.919532792070154e-06, + "loss": 0.2201, + "step": 13611 + }, + { + "epoch": 1.282306116200749, + "grad_norm": 0.6663120985031128, + "learning_rate": 5.918154250891573e-06, + "loss": 0.1738, + "step": 13612 + }, + { + "epoch": 1.2824003202939167, + "grad_norm": 0.7010716199874878, + "learning_rate": 5.9167758027853824e-06, + "loss": 0.1961, + "step": 13613 + }, + { + "epoch": 1.2824945243870847, + "grad_norm": 0.5978226065635681, + "learning_rate": 5.9153974477830226e-06, + "loss": 0.1954, + "step": 13614 + }, + { + "epoch": 1.2825887284802524, + "grad_norm": 0.6706185936927795, + "learning_rate": 5.914019185915918e-06, + "loss": 0.2116, + "step": 13615 + }, + { + "epoch": 1.2826829325734204, + "grad_norm": 0.641293466091156, + "learning_rate": 5.912641017215493e-06, + "loss": 0.1982, + "step": 13616 + }, + { + "epoch": 1.282777136666588, + "grad_norm": 0.5829498171806335, + "learning_rate": 5.9112629417131736e-06, + "loss": 0.195, + "step": 13617 + }, + { + "epoch": 1.282871340759756, + "grad_norm": 0.6060450077056885, + "learning_rate": 5.909884959440385e-06, + "loss": 0.1757, + "step": 13618 + }, + { + "epoch": 1.2829655448529238, + "grad_norm": 0.62343430519104, + "learning_rate": 5.908507070428542e-06, + "loss": 0.1968, + "step": 13619 + }, + { + "epoch": 1.2830597489460918, + "grad_norm": 0.7056381106376648, + "learning_rate": 5.907129274709068e-06, + "loss": 0.2494, + "step": 13620 + }, + { + "epoch": 1.2831539530392595, + "grad_norm": 0.6520238518714905, + "learning_rate": 5.905751572313376e-06, + "loss": 0.2319, + "step": 13621 + }, + { + "epoch": 1.2832481571324275, + "grad_norm": 0.711362361907959, + "learning_rate": 5.904373963272882e-06, + "loss": 0.2195, + "step": 13622 + }, + { + "epoch": 1.2833423612255952, + "grad_norm": 0.6747616529464722, + "learning_rate": 5.902996447618989e-06, + "loss": 0.2164, + "step": 13623 + }, + { + "epoch": 1.2834365653187632, + "grad_norm": 0.6405224204063416, + "learning_rate": 5.901619025383121e-06, + "loss": 0.1935, + "step": 13624 + }, + { + "epoch": 1.283530769411931, + "grad_norm": 0.5808284282684326, + "learning_rate": 5.900241696596673e-06, + "loss": 0.1784, + "step": 13625 + }, + { + "epoch": 1.2836249735050989, + "grad_norm": 0.6893551349639893, + "learning_rate": 5.898864461291052e-06, + "loss": 0.2121, + "step": 13626 + }, + { + "epoch": 1.2837191775982666, + "grad_norm": 0.6487012505531311, + "learning_rate": 5.89748731949767e-06, + "loss": 0.2099, + "step": 13627 + }, + { + "epoch": 1.2838133816914346, + "grad_norm": 0.6735448837280273, + "learning_rate": 5.896110271247919e-06, + "loss": 0.2086, + "step": 13628 + }, + { + "epoch": 1.2839075857846023, + "grad_norm": 0.6319160461425781, + "learning_rate": 5.8947333165732006e-06, + "loss": 0.1938, + "step": 13629 + }, + { + "epoch": 1.2840017898777702, + "grad_norm": 0.5781620740890503, + "learning_rate": 5.893356455504911e-06, + "loss": 0.2113, + "step": 13630 + }, + { + "epoch": 1.284095993970938, + "grad_norm": 0.7069408893585205, + "learning_rate": 5.891979688074446e-06, + "loss": 0.1988, + "step": 13631 + }, + { + "epoch": 1.284190198064106, + "grad_norm": 0.5846056938171387, + "learning_rate": 5.890603014313199e-06, + "loss": 0.188, + "step": 13632 + }, + { + "epoch": 1.2842844021572737, + "grad_norm": 0.5766475200653076, + "learning_rate": 5.889226434252554e-06, + "loss": 0.1715, + "step": 13633 + }, + { + "epoch": 1.2843786062504416, + "grad_norm": 0.6382126212120056, + "learning_rate": 5.887849947923907e-06, + "loss": 0.2044, + "step": 13634 + }, + { + "epoch": 1.2844728103436094, + "grad_norm": 0.6012054681777954, + "learning_rate": 5.886473555358641e-06, + "loss": 0.1729, + "step": 13635 + }, + { + "epoch": 1.2845670144367773, + "grad_norm": 0.6249715685844421, + "learning_rate": 5.885097256588137e-06, + "loss": 0.2089, + "step": 13636 + }, + { + "epoch": 1.284661218529945, + "grad_norm": 0.6467832922935486, + "learning_rate": 5.883721051643782e-06, + "loss": 0.2052, + "step": 13637 + }, + { + "epoch": 1.284755422623113, + "grad_norm": 0.6491999626159668, + "learning_rate": 5.8823449405569525e-06, + "loss": 0.2208, + "step": 13638 + }, + { + "epoch": 1.2848496267162808, + "grad_norm": 0.9389375448226929, + "learning_rate": 5.8809689233590235e-06, + "loss": 0.2331, + "step": 13639 + }, + { + "epoch": 1.2849438308094487, + "grad_norm": 0.646766185760498, + "learning_rate": 5.879593000081376e-06, + "loss": 0.1751, + "step": 13640 + }, + { + "epoch": 1.2850380349026165, + "grad_norm": 0.7001878619194031, + "learning_rate": 5.878217170755383e-06, + "loss": 0.2209, + "step": 13641 + }, + { + "epoch": 1.2851322389957844, + "grad_norm": 0.6537686586380005, + "learning_rate": 5.87684143541241e-06, + "loss": 0.1908, + "step": 13642 + }, + { + "epoch": 1.2852264430889522, + "grad_norm": 0.5841006636619568, + "learning_rate": 5.875465794083827e-06, + "loss": 0.198, + "step": 13643 + }, + { + "epoch": 1.2853206471821201, + "grad_norm": 0.6095043420791626, + "learning_rate": 5.8740902468010075e-06, + "loss": 0.2225, + "step": 13644 + }, + { + "epoch": 1.2854148512752879, + "grad_norm": 0.583604097366333, + "learning_rate": 5.872714793595309e-06, + "loss": 0.1786, + "step": 13645 + }, + { + "epoch": 1.2855090553684558, + "grad_norm": 0.6523245573043823, + "learning_rate": 5.8713394344980915e-06, + "loss": 0.1986, + "step": 13646 + }, + { + "epoch": 1.2856032594616236, + "grad_norm": 0.7673590183258057, + "learning_rate": 5.869964169540726e-06, + "loss": 0.2112, + "step": 13647 + }, + { + "epoch": 1.2856974635547915, + "grad_norm": 0.6839298605918884, + "learning_rate": 5.868588998754563e-06, + "loss": 0.2285, + "step": 13648 + }, + { + "epoch": 1.2857916676479593, + "grad_norm": 0.7030475735664368, + "learning_rate": 5.867213922170958e-06, + "loss": 0.2442, + "step": 13649 + }, + { + "epoch": 1.2858858717411272, + "grad_norm": 0.6308186650276184, + "learning_rate": 5.86583893982127e-06, + "loss": 0.2036, + "step": 13650 + }, + { + "epoch": 1.285980075834295, + "grad_norm": 0.6027628779411316, + "learning_rate": 5.864464051736847e-06, + "loss": 0.1992, + "step": 13651 + }, + { + "epoch": 1.286074279927463, + "grad_norm": 0.653380811214447, + "learning_rate": 5.8630892579490396e-06, + "loss": 0.2127, + "step": 13652 + }, + { + "epoch": 1.2861684840206307, + "grad_norm": 0.7756394147872925, + "learning_rate": 5.8617145584891935e-06, + "loss": 0.2122, + "step": 13653 + }, + { + "epoch": 1.2862626881137986, + "grad_norm": 0.6326111555099487, + "learning_rate": 5.860339953388656e-06, + "loss": 0.2074, + "step": 13654 + }, + { + "epoch": 1.2863568922069664, + "grad_norm": 0.6357513070106506, + "learning_rate": 5.8589654426787715e-06, + "loss": 0.1884, + "step": 13655 + }, + { + "epoch": 1.2864510963001343, + "grad_norm": 0.6499336361885071, + "learning_rate": 5.857591026390877e-06, + "loss": 0.2115, + "step": 13656 + }, + { + "epoch": 1.286545300393302, + "grad_norm": 0.636991024017334, + "learning_rate": 5.856216704556313e-06, + "loss": 0.1918, + "step": 13657 + }, + { + "epoch": 1.28663950448647, + "grad_norm": 0.6284456849098206, + "learning_rate": 5.854842477206419e-06, + "loss": 0.1919, + "step": 13658 + }, + { + "epoch": 1.2867337085796378, + "grad_norm": 0.6406119465827942, + "learning_rate": 5.853468344372524e-06, + "loss": 0.2064, + "step": 13659 + }, + { + "epoch": 1.2868279126728055, + "grad_norm": 0.6799442172050476, + "learning_rate": 5.852094306085966e-06, + "loss": 0.1912, + "step": 13660 + }, + { + "epoch": 1.2869221167659735, + "grad_norm": 0.6351364850997925, + "learning_rate": 5.850720362378074e-06, + "loss": 0.2067, + "step": 13661 + }, + { + "epoch": 1.2870163208591414, + "grad_norm": 0.6258965134620667, + "learning_rate": 5.8493465132801745e-06, + "loss": 0.2074, + "step": 13662 + }, + { + "epoch": 1.2871105249523092, + "grad_norm": 0.6602813005447388, + "learning_rate": 5.847972758823588e-06, + "loss": 0.1906, + "step": 13663 + }, + { + "epoch": 1.287204729045477, + "grad_norm": 0.6425490379333496, + "learning_rate": 5.846599099039649e-06, + "loss": 0.1922, + "step": 13664 + }, + { + "epoch": 1.2872989331386449, + "grad_norm": 0.6882752180099487, + "learning_rate": 5.845225533959673e-06, + "loss": 0.2176, + "step": 13665 + }, + { + "epoch": 1.2873931372318128, + "grad_norm": 0.6884061694145203, + "learning_rate": 5.843852063614977e-06, + "loss": 0.2046, + "step": 13666 + }, + { + "epoch": 1.2874873413249805, + "grad_norm": 0.6444416642189026, + "learning_rate": 5.842478688036887e-06, + "loss": 0.1875, + "step": 13667 + }, + { + "epoch": 1.2875815454181483, + "grad_norm": 0.6255632638931274, + "learning_rate": 5.841105407256711e-06, + "loss": 0.2066, + "step": 13668 + }, + { + "epoch": 1.2876757495113162, + "grad_norm": 0.6718289256095886, + "learning_rate": 5.83973222130576e-06, + "loss": 0.2069, + "step": 13669 + }, + { + "epoch": 1.2877699536044842, + "grad_norm": 0.6975240707397461, + "learning_rate": 5.838359130215352e-06, + "loss": 0.206, + "step": 13670 + }, + { + "epoch": 1.287864157697652, + "grad_norm": 0.5970094203948975, + "learning_rate": 5.836986134016793e-06, + "loss": 0.1783, + "step": 13671 + }, + { + "epoch": 1.2879583617908197, + "grad_norm": 0.6502562761306763, + "learning_rate": 5.835613232741386e-06, + "loss": 0.1959, + "step": 13672 + }, + { + "epoch": 1.2880525658839876, + "grad_norm": 0.6309152841567993, + "learning_rate": 5.8342404264204365e-06, + "loss": 0.1994, + "step": 13673 + }, + { + "epoch": 1.2881467699771556, + "grad_norm": 0.693664014339447, + "learning_rate": 5.832867715085251e-06, + "loss": 0.2097, + "step": 13674 + }, + { + "epoch": 1.2882409740703233, + "grad_norm": 0.7204006314277649, + "learning_rate": 5.831495098767124e-06, + "loss": 0.2076, + "step": 13675 + }, + { + "epoch": 1.288335178163491, + "grad_norm": 0.6644877195358276, + "learning_rate": 5.830122577497353e-06, + "loss": 0.2134, + "step": 13676 + }, + { + "epoch": 1.288429382256659, + "grad_norm": 0.8421399593353271, + "learning_rate": 5.828750151307241e-06, + "loss": 0.2231, + "step": 13677 + }, + { + "epoch": 1.288523586349827, + "grad_norm": 0.7198042869567871, + "learning_rate": 5.827377820228073e-06, + "loss": 0.2206, + "step": 13678 + }, + { + "epoch": 1.2886177904429947, + "grad_norm": 0.707188606262207, + "learning_rate": 5.826005584291144e-06, + "loss": 0.2178, + "step": 13679 + }, + { + "epoch": 1.2887119945361625, + "grad_norm": 0.6541299223899841, + "learning_rate": 5.824633443527748e-06, + "loss": 0.2257, + "step": 13680 + }, + { + "epoch": 1.2888061986293304, + "grad_norm": 0.6398420929908752, + "learning_rate": 5.82326139796916e-06, + "loss": 0.2146, + "step": 13681 + }, + { + "epoch": 1.2889004027224984, + "grad_norm": 0.7083244919776917, + "learning_rate": 5.821889447646678e-06, + "loss": 0.2084, + "step": 13682 + }, + { + "epoch": 1.2889946068156661, + "grad_norm": 0.6632961630821228, + "learning_rate": 5.820517592591573e-06, + "loss": 0.2084, + "step": 13683 + }, + { + "epoch": 1.2890888109088339, + "grad_norm": 0.6450549960136414, + "learning_rate": 5.819145832835131e-06, + "loss": 0.1961, + "step": 13684 + }, + { + "epoch": 1.2891830150020018, + "grad_norm": 0.6083947420120239, + "learning_rate": 5.817774168408632e-06, + "loss": 0.1724, + "step": 13685 + }, + { + "epoch": 1.2892772190951698, + "grad_norm": 0.7011882066726685, + "learning_rate": 5.816402599343348e-06, + "loss": 0.2077, + "step": 13686 + }, + { + "epoch": 1.2893714231883375, + "grad_norm": 0.7546470165252686, + "learning_rate": 5.815031125670554e-06, + "loss": 0.2163, + "step": 13687 + }, + { + "epoch": 1.2894656272815053, + "grad_norm": 0.6526637673377991, + "learning_rate": 5.813659747421527e-06, + "loss": 0.2132, + "step": 13688 + }, + { + "epoch": 1.2895598313746732, + "grad_norm": 0.6755617260932922, + "learning_rate": 5.812288464627528e-06, + "loss": 0.1823, + "step": 13689 + }, + { + "epoch": 1.2896540354678412, + "grad_norm": 0.6853237152099609, + "learning_rate": 5.810917277319827e-06, + "loss": 0.1956, + "step": 13690 + }, + { + "epoch": 1.289748239561009, + "grad_norm": 0.6459473967552185, + "learning_rate": 5.809546185529697e-06, + "loss": 0.1937, + "step": 13691 + }, + { + "epoch": 1.2898424436541767, + "grad_norm": 0.6184282302856445, + "learning_rate": 5.808175189288394e-06, + "loss": 0.2082, + "step": 13692 + }, + { + "epoch": 1.2899366477473446, + "grad_norm": 0.6942335367202759, + "learning_rate": 5.806804288627171e-06, + "loss": 0.2055, + "step": 13693 + }, + { + "epoch": 1.2900308518405126, + "grad_norm": 0.600774347782135, + "learning_rate": 5.805433483577303e-06, + "loss": 0.206, + "step": 13694 + }, + { + "epoch": 1.2901250559336803, + "grad_norm": 0.7072567343711853, + "learning_rate": 5.804062774170038e-06, + "loss": 0.1871, + "step": 13695 + }, + { + "epoch": 1.290219260026848, + "grad_norm": 0.6618983745574951, + "learning_rate": 5.802692160436628e-06, + "loss": 0.2237, + "step": 13696 + }, + { + "epoch": 1.290313464120016, + "grad_norm": 0.6313806176185608, + "learning_rate": 5.801321642408328e-06, + "loss": 0.1948, + "step": 13697 + }, + { + "epoch": 1.290407668213184, + "grad_norm": 0.6524603962898254, + "learning_rate": 5.799951220116391e-06, + "loss": 0.1927, + "step": 13698 + }, + { + "epoch": 1.2905018723063517, + "grad_norm": 0.6209326386451721, + "learning_rate": 5.798580893592058e-06, + "loss": 0.1866, + "step": 13699 + }, + { + "epoch": 1.2905960763995195, + "grad_norm": 0.6452550888061523, + "learning_rate": 5.797210662866579e-06, + "loss": 0.1991, + "step": 13700 + }, + { + "epoch": 1.2906902804926874, + "grad_norm": 0.5927140116691589, + "learning_rate": 5.795840527971199e-06, + "loss": 0.1664, + "step": 13701 + }, + { + "epoch": 1.2907844845858554, + "grad_norm": 0.6361879706382751, + "learning_rate": 5.794470488937154e-06, + "loss": 0.2066, + "step": 13702 + }, + { + "epoch": 1.2908786886790231, + "grad_norm": 0.7191894054412842, + "learning_rate": 5.793100545795687e-06, + "loss": 0.1902, + "step": 13703 + }, + { + "epoch": 1.2909728927721908, + "grad_norm": 0.612771213054657, + "learning_rate": 5.791730698578035e-06, + "loss": 0.1967, + "step": 13704 + }, + { + "epoch": 1.2910670968653588, + "grad_norm": 0.6511619687080383, + "learning_rate": 5.7903609473154295e-06, + "loss": 0.1983, + "step": 13705 + }, + { + "epoch": 1.2911613009585268, + "grad_norm": 0.6181254386901855, + "learning_rate": 5.788991292039103e-06, + "loss": 0.1811, + "step": 13706 + }, + { + "epoch": 1.2912555050516945, + "grad_norm": 0.7074618339538574, + "learning_rate": 5.7876217327802935e-06, + "loss": 0.2239, + "step": 13707 + }, + { + "epoch": 1.2913497091448622, + "grad_norm": 0.6534847021102905, + "learning_rate": 5.786252269570219e-06, + "loss": 0.1901, + "step": 13708 + }, + { + "epoch": 1.2914439132380302, + "grad_norm": 0.6207404732704163, + "learning_rate": 5.784882902440108e-06, + "loss": 0.1915, + "step": 13709 + }, + { + "epoch": 1.2915381173311982, + "grad_norm": 0.7236878275871277, + "learning_rate": 5.7835136314211894e-06, + "loss": 0.2041, + "step": 13710 + }, + { + "epoch": 1.291632321424366, + "grad_norm": 0.7188735604286194, + "learning_rate": 5.782144456544681e-06, + "loss": 0.2091, + "step": 13711 + }, + { + "epoch": 1.2917265255175336, + "grad_norm": 0.6913007497787476, + "learning_rate": 5.780775377841799e-06, + "loss": 0.1969, + "step": 13712 + }, + { + "epoch": 1.2918207296107016, + "grad_norm": 0.6481049060821533, + "learning_rate": 5.779406395343763e-06, + "loss": 0.211, + "step": 13713 + }, + { + "epoch": 1.2919149337038696, + "grad_norm": 0.6210039258003235, + "learning_rate": 5.778037509081793e-06, + "loss": 0.1969, + "step": 13714 + }, + { + "epoch": 1.2920091377970373, + "grad_norm": 0.5863572955131531, + "learning_rate": 5.776668719087092e-06, + "loss": 0.2072, + "step": 13715 + }, + { + "epoch": 1.292103341890205, + "grad_norm": 0.7057060599327087, + "learning_rate": 5.775300025390876e-06, + "loss": 0.2216, + "step": 13716 + }, + { + "epoch": 1.292197545983373, + "grad_norm": 0.6310413479804993, + "learning_rate": 5.773931428024357e-06, + "loss": 0.2117, + "step": 13717 + }, + { + "epoch": 1.292291750076541, + "grad_norm": 0.6071088910102844, + "learning_rate": 5.772562927018734e-06, + "loss": 0.1998, + "step": 13718 + }, + { + "epoch": 1.2923859541697087, + "grad_norm": 0.6977028846740723, + "learning_rate": 5.771194522405215e-06, + "loss": 0.2134, + "step": 13719 + }, + { + "epoch": 1.2924801582628764, + "grad_norm": 0.6454960703849792, + "learning_rate": 5.769826214215003e-06, + "loss": 0.1865, + "step": 13720 + }, + { + "epoch": 1.2925743623560444, + "grad_norm": 0.9823235273361206, + "learning_rate": 5.768458002479292e-06, + "loss": 0.1691, + "step": 13721 + }, + { + "epoch": 1.2926685664492124, + "grad_norm": 0.5987262725830078, + "learning_rate": 5.767089887229287e-06, + "loss": 0.1599, + "step": 13722 + }, + { + "epoch": 1.29276277054238, + "grad_norm": 0.6353163719177246, + "learning_rate": 5.765721868496175e-06, + "loss": 0.2048, + "step": 13723 + }, + { + "epoch": 1.2928569746355478, + "grad_norm": 0.6583973169326782, + "learning_rate": 5.764353946311152e-06, + "loss": 0.2125, + "step": 13724 + }, + { + "epoch": 1.2929511787287158, + "grad_norm": 0.6165882349014282, + "learning_rate": 5.7629861207054135e-06, + "loss": 0.2145, + "step": 13725 + }, + { + "epoch": 1.2930453828218837, + "grad_norm": 0.646360456943512, + "learning_rate": 5.761618391710142e-06, + "loss": 0.2082, + "step": 13726 + }, + { + "epoch": 1.2931395869150515, + "grad_norm": 0.6963431239128113, + "learning_rate": 5.760250759356525e-06, + "loss": 0.1844, + "step": 13727 + }, + { + "epoch": 1.2932337910082192, + "grad_norm": 0.7241135835647583, + "learning_rate": 5.758883223675751e-06, + "loss": 0.2107, + "step": 13728 + }, + { + "epoch": 1.2933279951013872, + "grad_norm": 0.7736554145812988, + "learning_rate": 5.7575157846989945e-06, + "loss": 0.1818, + "step": 13729 + }, + { + "epoch": 1.293422199194555, + "grad_norm": 0.5883830189704895, + "learning_rate": 5.7561484424574385e-06, + "loss": 0.1905, + "step": 13730 + }, + { + "epoch": 1.2935164032877229, + "grad_norm": 0.5926435589790344, + "learning_rate": 5.754781196982266e-06, + "loss": 0.1995, + "step": 13731 + }, + { + "epoch": 1.2936106073808906, + "grad_norm": 0.6940343379974365, + "learning_rate": 5.753414048304649e-06, + "loss": 0.2249, + "step": 13732 + }, + { + "epoch": 1.2937048114740586, + "grad_norm": 0.6366355419158936, + "learning_rate": 5.75204699645575e-06, + "loss": 0.188, + "step": 13733 + }, + { + "epoch": 1.2937990155672263, + "grad_norm": 0.8059617280960083, + "learning_rate": 5.750680041466756e-06, + "loss": 0.2258, + "step": 13734 + }, + { + "epoch": 1.2938932196603943, + "grad_norm": 0.656348466873169, + "learning_rate": 5.7493131833688306e-06, + "loss": 0.2244, + "step": 13735 + }, + { + "epoch": 1.293987423753562, + "grad_norm": 0.7119974493980408, + "learning_rate": 5.747946422193133e-06, + "loss": 0.2015, + "step": 13736 + }, + { + "epoch": 1.29408162784673, + "grad_norm": 0.5967922806739807, + "learning_rate": 5.746579757970834e-06, + "loss": 0.1944, + "step": 13737 + }, + { + "epoch": 1.2941758319398977, + "grad_norm": 0.6940702199935913, + "learning_rate": 5.745213190733099e-06, + "loss": 0.2004, + "step": 13738 + }, + { + "epoch": 1.2942700360330657, + "grad_norm": 0.6587916612625122, + "learning_rate": 5.7438467205110785e-06, + "loss": 0.2202, + "step": 13739 + }, + { + "epoch": 1.2943642401262334, + "grad_norm": 0.6120479106903076, + "learning_rate": 5.742480347335935e-06, + "loss": 0.186, + "step": 13740 + }, + { + "epoch": 1.2944584442194014, + "grad_norm": 0.6482627987861633, + "learning_rate": 5.7411140712388284e-06, + "loss": 0.1993, + "step": 13741 + }, + { + "epoch": 1.294552648312569, + "grad_norm": 0.687244176864624, + "learning_rate": 5.739747892250902e-06, + "loss": 0.208, + "step": 13742 + }, + { + "epoch": 1.294646852405737, + "grad_norm": 0.6568077802658081, + "learning_rate": 5.738381810403314e-06, + "loss": 0.1788, + "step": 13743 + }, + { + "epoch": 1.2947410564989048, + "grad_norm": 0.6496370434761047, + "learning_rate": 5.737015825727215e-06, + "loss": 0.1862, + "step": 13744 + }, + { + "epoch": 1.2948352605920728, + "grad_norm": 0.699585497379303, + "learning_rate": 5.735649938253743e-06, + "loss": 0.2315, + "step": 13745 + }, + { + "epoch": 1.2949294646852405, + "grad_norm": 0.6499279737472534, + "learning_rate": 5.734284148014049e-06, + "loss": 0.1862, + "step": 13746 + }, + { + "epoch": 1.2950236687784085, + "grad_norm": 0.6824267506599426, + "learning_rate": 5.7329184550392756e-06, + "loss": 0.2133, + "step": 13747 + }, + { + "epoch": 1.2951178728715762, + "grad_norm": 0.6080122590065002, + "learning_rate": 5.731552859360563e-06, + "loss": 0.2011, + "step": 13748 + }, + { + "epoch": 1.2952120769647442, + "grad_norm": 0.6081357002258301, + "learning_rate": 5.730187361009036e-06, + "loss": 0.1969, + "step": 13749 + }, + { + "epoch": 1.295306281057912, + "grad_norm": 0.6595403552055359, + "learning_rate": 5.72882196001585e-06, + "loss": 0.1994, + "step": 13750 + }, + { + "epoch": 1.2954004851510799, + "grad_norm": 0.7420370578765869, + "learning_rate": 5.727456656412129e-06, + "loss": 0.2013, + "step": 13751 + }, + { + "epoch": 1.2954946892442476, + "grad_norm": 0.6704427003860474, + "learning_rate": 5.726091450228999e-06, + "loss": 0.1823, + "step": 13752 + }, + { + "epoch": 1.2955888933374156, + "grad_norm": 0.6598203182220459, + "learning_rate": 5.724726341497594e-06, + "loss": 0.2126, + "step": 13753 + }, + { + "epoch": 1.2956830974305833, + "grad_norm": 0.6840913891792297, + "learning_rate": 5.723361330249044e-06, + "loss": 0.1949, + "step": 13754 + }, + { + "epoch": 1.2957773015237513, + "grad_norm": 0.6585990786552429, + "learning_rate": 5.721996416514466e-06, + "loss": 0.1956, + "step": 13755 + }, + { + "epoch": 1.295871505616919, + "grad_norm": 0.6387620568275452, + "learning_rate": 5.720631600324986e-06, + "loss": 0.2115, + "step": 13756 + }, + { + "epoch": 1.295965709710087, + "grad_norm": 0.6068645119667053, + "learning_rate": 5.719266881711727e-06, + "loss": 0.1771, + "step": 13757 + }, + { + "epoch": 1.2960599138032547, + "grad_norm": 0.766531765460968, + "learning_rate": 5.7179022607058e-06, + "loss": 0.2383, + "step": 13758 + }, + { + "epoch": 1.2961541178964227, + "grad_norm": 0.6244555711746216, + "learning_rate": 5.716537737338324e-06, + "loss": 0.1807, + "step": 13759 + }, + { + "epoch": 1.2962483219895904, + "grad_norm": 0.5754210948944092, + "learning_rate": 5.715173311640415e-06, + "loss": 0.176, + "step": 13760 + }, + { + "epoch": 1.2963425260827584, + "grad_norm": 0.6409897208213806, + "learning_rate": 5.713808983643179e-06, + "loss": 0.2236, + "step": 13761 + }, + { + "epoch": 1.296436730175926, + "grad_norm": 0.7322799563407898, + "learning_rate": 5.712444753377728e-06, + "loss": 0.2276, + "step": 13762 + }, + { + "epoch": 1.296530934269094, + "grad_norm": 0.648629367351532, + "learning_rate": 5.711080620875165e-06, + "loss": 0.2307, + "step": 13763 + }, + { + "epoch": 1.2966251383622618, + "grad_norm": 0.5882104635238647, + "learning_rate": 5.709716586166598e-06, + "loss": 0.1862, + "step": 13764 + }, + { + "epoch": 1.2967193424554297, + "grad_norm": 0.6721855401992798, + "learning_rate": 5.708352649283131e-06, + "loss": 0.2068, + "step": 13765 + }, + { + "epoch": 1.2968135465485975, + "grad_norm": 0.6700777411460876, + "learning_rate": 5.706988810255856e-06, + "loss": 0.1987, + "step": 13766 + }, + { + "epoch": 1.2969077506417654, + "grad_norm": 0.7214812636375427, + "learning_rate": 5.705625069115877e-06, + "loss": 0.2063, + "step": 13767 + }, + { + "epoch": 1.2970019547349332, + "grad_norm": 0.6548411846160889, + "learning_rate": 5.70426142589429e-06, + "loss": 0.2105, + "step": 13768 + }, + { + "epoch": 1.2970961588281011, + "grad_norm": 0.6924688220024109, + "learning_rate": 5.702897880622182e-06, + "loss": 0.1953, + "step": 13769 + }, + { + "epoch": 1.2971903629212689, + "grad_norm": 0.6035183668136597, + "learning_rate": 5.701534433330649e-06, + "loss": 0.1868, + "step": 13770 + }, + { + "epoch": 1.2972845670144368, + "grad_norm": 0.6803979277610779, + "learning_rate": 5.700171084050783e-06, + "loss": 0.2206, + "step": 13771 + }, + { + "epoch": 1.2973787711076046, + "grad_norm": 0.661728024482727, + "learning_rate": 5.698807832813664e-06, + "loss": 0.2133, + "step": 13772 + }, + { + "epoch": 1.2974729752007725, + "grad_norm": 0.8313462138175964, + "learning_rate": 5.697444679650372e-06, + "loss": 0.2311, + "step": 13773 + }, + { + "epoch": 1.2975671792939403, + "grad_norm": 0.61605304479599, + "learning_rate": 5.696081624592002e-06, + "loss": 0.2055, + "step": 13774 + }, + { + "epoch": 1.2976613833871082, + "grad_norm": 0.645388662815094, + "learning_rate": 5.6947186676696295e-06, + "loss": 0.1816, + "step": 13775 + }, + { + "epoch": 1.297755587480276, + "grad_norm": 0.7084368467330933, + "learning_rate": 5.693355808914325e-06, + "loss": 0.2265, + "step": 13776 + }, + { + "epoch": 1.297849791573444, + "grad_norm": 0.6451153755187988, + "learning_rate": 5.691993048357168e-06, + "loss": 0.181, + "step": 13777 + }, + { + "epoch": 1.2979439956666117, + "grad_norm": 0.7079511284828186, + "learning_rate": 5.690630386029235e-06, + "loss": 0.2225, + "step": 13778 + }, + { + "epoch": 1.2980381997597796, + "grad_norm": 0.6755489110946655, + "learning_rate": 5.689267821961591e-06, + "loss": 0.2087, + "step": 13779 + }, + { + "epoch": 1.2981324038529474, + "grad_norm": 0.7519961595535278, + "learning_rate": 5.687905356185306e-06, + "loss": 0.2136, + "step": 13780 + }, + { + "epoch": 1.2982266079461153, + "grad_norm": 0.612177312374115, + "learning_rate": 5.686542988731451e-06, + "loss": 0.2046, + "step": 13781 + }, + { + "epoch": 1.298320812039283, + "grad_norm": 0.6108668446540833, + "learning_rate": 5.685180719631085e-06, + "loss": 0.1926, + "step": 13782 + }, + { + "epoch": 1.298415016132451, + "grad_norm": 0.6176031231880188, + "learning_rate": 5.6838185489152696e-06, + "loss": 0.1759, + "step": 13783 + }, + { + "epoch": 1.2985092202256188, + "grad_norm": 0.6078556180000305, + "learning_rate": 5.6824564766150724e-06, + "loss": 0.1891, + "step": 13784 + }, + { + "epoch": 1.2986034243187867, + "grad_norm": 0.6300124526023865, + "learning_rate": 5.68109450276154e-06, + "loss": 0.1937, + "step": 13785 + }, + { + "epoch": 1.2986976284119545, + "grad_norm": 0.6361894011497498, + "learning_rate": 5.679732627385732e-06, + "loss": 0.226, + "step": 13786 + }, + { + "epoch": 1.2987918325051224, + "grad_norm": 0.7193924188613892, + "learning_rate": 5.6783708505187065e-06, + "loss": 0.2157, + "step": 13787 + }, + { + "epoch": 1.2988860365982902, + "grad_norm": 0.7525556087493896, + "learning_rate": 5.677009172191508e-06, + "loss": 0.2335, + "step": 13788 + }, + { + "epoch": 1.2989802406914581, + "grad_norm": 0.6641455888748169, + "learning_rate": 5.675647592435177e-06, + "loss": 0.1991, + "step": 13789 + }, + { + "epoch": 1.2990744447846259, + "grad_norm": 0.6445804238319397, + "learning_rate": 5.674286111280778e-06, + "loss": 0.2174, + "step": 13790 + }, + { + "epoch": 1.2991686488777938, + "grad_norm": 0.7037086486816406, + "learning_rate": 5.672924728759346e-06, + "loss": 0.2001, + "step": 13791 + }, + { + "epoch": 1.2992628529709616, + "grad_norm": 0.653767466545105, + "learning_rate": 5.671563444901917e-06, + "loss": 0.2076, + "step": 13792 + }, + { + "epoch": 1.2993570570641295, + "grad_norm": 0.753557026386261, + "learning_rate": 5.670202259739537e-06, + "loss": 0.2196, + "step": 13793 + }, + { + "epoch": 1.2994512611572973, + "grad_norm": 0.6928074955940247, + "learning_rate": 5.668841173303244e-06, + "loss": 0.2221, + "step": 13794 + }, + { + "epoch": 1.2995454652504652, + "grad_norm": 0.6821420788764954, + "learning_rate": 5.667480185624068e-06, + "loss": 0.213, + "step": 13795 + }, + { + "epoch": 1.299639669343633, + "grad_norm": 0.6530633568763733, + "learning_rate": 5.666119296733042e-06, + "loss": 0.207, + "step": 13796 + }, + { + "epoch": 1.299733873436801, + "grad_norm": 0.5936562418937683, + "learning_rate": 5.6647585066612045e-06, + "loss": 0.2037, + "step": 13797 + }, + { + "epoch": 1.2998280775299687, + "grad_norm": 1.7025306224822998, + "learning_rate": 5.663397815439573e-06, + "loss": 0.203, + "step": 13798 + }, + { + "epoch": 1.2999222816231364, + "grad_norm": 0.63300621509552, + "learning_rate": 5.662037223099177e-06, + "loss": 0.2105, + "step": 13799 + }, + { + "epoch": 1.3000164857163043, + "grad_norm": 0.5539723634719849, + "learning_rate": 5.660676729671046e-06, + "loss": 0.191, + "step": 13800 + }, + { + "epoch": 1.3001106898094723, + "grad_norm": 0.7422416806221008, + "learning_rate": 5.659316335186194e-06, + "loss": 0.2484, + "step": 13801 + }, + { + "epoch": 1.30020489390264, + "grad_norm": 0.6566030383110046, + "learning_rate": 5.657956039675645e-06, + "loss": 0.2175, + "step": 13802 + }, + { + "epoch": 1.3002990979958078, + "grad_norm": 0.6515787839889526, + "learning_rate": 5.656595843170409e-06, + "loss": 0.1981, + "step": 13803 + }, + { + "epoch": 1.3003933020889757, + "grad_norm": 1.1477673053741455, + "learning_rate": 5.655235745701506e-06, + "loss": 0.2261, + "step": 13804 + }, + { + "epoch": 1.3004875061821437, + "grad_norm": 0.7515124678611755, + "learning_rate": 5.653875747299951e-06, + "loss": 0.176, + "step": 13805 + }, + { + "epoch": 1.3005817102753114, + "grad_norm": 0.7207934856414795, + "learning_rate": 5.652515847996744e-06, + "loss": 0.2471, + "step": 13806 + }, + { + "epoch": 1.3006759143684792, + "grad_norm": 0.6418449282646179, + "learning_rate": 5.651156047822903e-06, + "loss": 0.22, + "step": 13807 + }, + { + "epoch": 1.3007701184616471, + "grad_norm": 0.5678517818450928, + "learning_rate": 5.64979634680943e-06, + "loss": 0.1719, + "step": 13808 + }, + { + "epoch": 1.300864322554815, + "grad_norm": 0.6331684589385986, + "learning_rate": 5.6484367449873265e-06, + "loss": 0.1984, + "step": 13809 + }, + { + "epoch": 1.3009585266479828, + "grad_norm": 0.6593651175498962, + "learning_rate": 5.6470772423875954e-06, + "loss": 0.1984, + "step": 13810 + }, + { + "epoch": 1.3010527307411506, + "grad_norm": 0.5979335904121399, + "learning_rate": 5.645717839041238e-06, + "loss": 0.1898, + "step": 13811 + }, + { + "epoch": 1.3011469348343185, + "grad_norm": 0.676989734172821, + "learning_rate": 5.644358534979248e-06, + "loss": 0.2151, + "step": 13812 + }, + { + "epoch": 1.3012411389274865, + "grad_norm": 0.546859085559845, + "learning_rate": 5.642999330232617e-06, + "loss": 0.1553, + "step": 13813 + }, + { + "epoch": 1.3013353430206542, + "grad_norm": 0.6827539801597595, + "learning_rate": 5.6416402248323375e-06, + "loss": 0.2003, + "step": 13814 + }, + { + "epoch": 1.301429547113822, + "grad_norm": 0.638532817363739, + "learning_rate": 5.6402812188094055e-06, + "loss": 0.2062, + "step": 13815 + }, + { + "epoch": 1.30152375120699, + "grad_norm": 0.7166432738304138, + "learning_rate": 5.6389223121948014e-06, + "loss": 0.2417, + "step": 13816 + }, + { + "epoch": 1.301617955300158, + "grad_norm": 0.6156617403030396, + "learning_rate": 5.637563505019512e-06, + "loss": 0.2008, + "step": 13817 + }, + { + "epoch": 1.3017121593933256, + "grad_norm": 0.6777030825614929, + "learning_rate": 5.636204797314526e-06, + "loss": 0.2132, + "step": 13818 + }, + { + "epoch": 1.3018063634864934, + "grad_norm": 0.6603621244430542, + "learning_rate": 5.634846189110814e-06, + "loss": 0.2028, + "step": 13819 + }, + { + "epoch": 1.3019005675796613, + "grad_norm": 0.6802692413330078, + "learning_rate": 5.633487680439362e-06, + "loss": 0.2, + "step": 13820 + }, + { + "epoch": 1.3019947716728293, + "grad_norm": 0.7136988639831543, + "learning_rate": 5.632129271331146e-06, + "loss": 0.2039, + "step": 13821 + }, + { + "epoch": 1.302088975765997, + "grad_norm": 0.6380264759063721, + "learning_rate": 5.630770961817134e-06, + "loss": 0.2167, + "step": 13822 + }, + { + "epoch": 1.3021831798591648, + "grad_norm": 0.5513056516647339, + "learning_rate": 5.629412751928301e-06, + "loss": 0.1999, + "step": 13823 + }, + { + "epoch": 1.3022773839523327, + "grad_norm": 0.6224511861801147, + "learning_rate": 5.628054641695622e-06, + "loss": 0.1798, + "step": 13824 + }, + { + "epoch": 1.3023715880455007, + "grad_norm": 0.6389832496643066, + "learning_rate": 5.626696631150053e-06, + "loss": 0.2107, + "step": 13825 + }, + { + "epoch": 1.3024657921386684, + "grad_norm": 0.6147251725196838, + "learning_rate": 5.625338720322564e-06, + "loss": 0.1967, + "step": 13826 + }, + { + "epoch": 1.3025599962318362, + "grad_norm": 0.6606024503707886, + "learning_rate": 5.6239809092441225e-06, + "loss": 0.23, + "step": 13827 + }, + { + "epoch": 1.3026542003250041, + "grad_norm": 0.6261757016181946, + "learning_rate": 5.622623197945684e-06, + "loss": 0.1968, + "step": 13828 + }, + { + "epoch": 1.302748404418172, + "grad_norm": 0.6423500180244446, + "learning_rate": 5.6212655864581976e-06, + "loss": 0.2067, + "step": 13829 + }, + { + "epoch": 1.3028426085113398, + "grad_norm": 0.6579574942588806, + "learning_rate": 5.619908074812637e-06, + "loss": 0.2011, + "step": 13830 + }, + { + "epoch": 1.3029368126045076, + "grad_norm": 0.6220484375953674, + "learning_rate": 5.618550663039945e-06, + "loss": 0.2038, + "step": 13831 + }, + { + "epoch": 1.3030310166976755, + "grad_norm": 0.6784374117851257, + "learning_rate": 5.6171933511710705e-06, + "loss": 0.2172, + "step": 13832 + }, + { + "epoch": 1.3031252207908435, + "grad_norm": 0.7078768610954285, + "learning_rate": 5.6158361392369655e-06, + "loss": 0.2181, + "step": 13833 + }, + { + "epoch": 1.3032194248840112, + "grad_norm": 0.6488476395606995, + "learning_rate": 5.61447902726858e-06, + "loss": 0.2415, + "step": 13834 + }, + { + "epoch": 1.303313628977179, + "grad_norm": 0.654567539691925, + "learning_rate": 5.6131220152968525e-06, + "loss": 0.1912, + "step": 13835 + }, + { + "epoch": 1.303407833070347, + "grad_norm": 0.6786851286888123, + "learning_rate": 5.611765103352727e-06, + "loss": 0.2128, + "step": 13836 + }, + { + "epoch": 1.3035020371635149, + "grad_norm": 0.6657833456993103, + "learning_rate": 5.6104082914671465e-06, + "loss": 0.1913, + "step": 13837 + }, + { + "epoch": 1.3035962412566826, + "grad_norm": 0.7241305112838745, + "learning_rate": 5.609051579671043e-06, + "loss": 0.2056, + "step": 13838 + }, + { + "epoch": 1.3036904453498503, + "grad_norm": 0.6167240738868713, + "learning_rate": 5.607694967995354e-06, + "loss": 0.1905, + "step": 13839 + }, + { + "epoch": 1.3037846494430183, + "grad_norm": 0.7255936861038208, + "learning_rate": 5.606338456471017e-06, + "loss": 0.2272, + "step": 13840 + }, + { + "epoch": 1.3038788535361863, + "grad_norm": 0.6236115097999573, + "learning_rate": 5.604982045128953e-06, + "loss": 0.2019, + "step": 13841 + }, + { + "epoch": 1.303973057629354, + "grad_norm": 0.6566281318664551, + "learning_rate": 5.6036257340000995e-06, + "loss": 0.2097, + "step": 13842 + }, + { + "epoch": 1.3040672617225217, + "grad_norm": 0.6480876803398132, + "learning_rate": 5.6022695231153754e-06, + "loss": 0.1963, + "step": 13843 + }, + { + "epoch": 1.3041614658156897, + "grad_norm": 0.661703884601593, + "learning_rate": 5.600913412505707e-06, + "loss": 0.1948, + "step": 13844 + }, + { + "epoch": 1.3042556699088577, + "grad_norm": 0.6555445194244385, + "learning_rate": 5.599557402202019e-06, + "loss": 0.1893, + "step": 13845 + }, + { + "epoch": 1.3043498740020254, + "grad_norm": 0.6487663984298706, + "learning_rate": 5.598201492235224e-06, + "loss": 0.1893, + "step": 13846 + }, + { + "epoch": 1.3044440780951931, + "grad_norm": 0.7296391725540161, + "learning_rate": 5.596845682636245e-06, + "loss": 0.2082, + "step": 13847 + }, + { + "epoch": 1.304538282188361, + "grad_norm": 0.8082395792007446, + "learning_rate": 5.595489973435995e-06, + "loss": 0.2096, + "step": 13848 + }, + { + "epoch": 1.304632486281529, + "grad_norm": 0.6186919212341309, + "learning_rate": 5.594134364665383e-06, + "loss": 0.2006, + "step": 13849 + }, + { + "epoch": 1.3047266903746968, + "grad_norm": 1.2049318552017212, + "learning_rate": 5.592778856355321e-06, + "loss": 0.2264, + "step": 13850 + }, + { + "epoch": 1.3048208944678645, + "grad_norm": 0.7251076102256775, + "learning_rate": 5.591423448536719e-06, + "loss": 0.2148, + "step": 13851 + }, + { + "epoch": 1.3049150985610325, + "grad_norm": 0.608109176158905, + "learning_rate": 5.590068141240483e-06, + "loss": 0.1774, + "step": 13852 + }, + { + "epoch": 1.3050093026542005, + "grad_norm": 0.6641905307769775, + "learning_rate": 5.588712934497509e-06, + "loss": 0.1819, + "step": 13853 + }, + { + "epoch": 1.3051035067473682, + "grad_norm": 0.6662217974662781, + "learning_rate": 5.587357828338702e-06, + "loss": 0.2048, + "step": 13854 + }, + { + "epoch": 1.305197710840536, + "grad_norm": 0.6918042898178101, + "learning_rate": 5.586002822794964e-06, + "loss": 0.2212, + "step": 13855 + }, + { + "epoch": 1.305291914933704, + "grad_norm": 0.6342300772666931, + "learning_rate": 5.584647917897185e-06, + "loss": 0.2008, + "step": 13856 + }, + { + "epoch": 1.3053861190268718, + "grad_norm": 0.540745735168457, + "learning_rate": 5.58329311367626e-06, + "loss": 0.1788, + "step": 13857 + }, + { + "epoch": 1.3054803231200396, + "grad_norm": 0.6319723129272461, + "learning_rate": 5.5819384101630895e-06, + "loss": 0.1926, + "step": 13858 + }, + { + "epoch": 1.3055745272132073, + "grad_norm": 0.604201078414917, + "learning_rate": 5.58058380738855e-06, + "loss": 0.1842, + "step": 13859 + }, + { + "epoch": 1.3056687313063753, + "grad_norm": 0.5605629682540894, + "learning_rate": 5.5792293053835354e-06, + "loss": 0.1952, + "step": 13860 + }, + { + "epoch": 1.3057629353995432, + "grad_norm": 0.6751049757003784, + "learning_rate": 5.577874904178932e-06, + "loss": 0.2054, + "step": 13861 + }, + { + "epoch": 1.305857139492711, + "grad_norm": 0.6867413520812988, + "learning_rate": 5.5765206038056175e-06, + "loss": 0.2158, + "step": 13862 + }, + { + "epoch": 1.3059513435858787, + "grad_norm": 0.6905479431152344, + "learning_rate": 5.5751664042944745e-06, + "loss": 0.1897, + "step": 13863 + }, + { + "epoch": 1.3060455476790467, + "grad_norm": 0.602834939956665, + "learning_rate": 5.573812305676383e-06, + "loss": 0.1759, + "step": 13864 + }, + { + "epoch": 1.3061397517722146, + "grad_norm": 0.7618058919906616, + "learning_rate": 5.572458307982216e-06, + "loss": 0.2228, + "step": 13865 + }, + { + "epoch": 1.3062339558653824, + "grad_norm": 0.6712250709533691, + "learning_rate": 5.57110441124284e-06, + "loss": 0.2045, + "step": 13866 + }, + { + "epoch": 1.3063281599585501, + "grad_norm": 0.6632776260375977, + "learning_rate": 5.56975061548914e-06, + "loss": 0.2149, + "step": 13867 + }, + { + "epoch": 1.306422364051718, + "grad_norm": 0.6257486939430237, + "learning_rate": 5.5683969207519795e-06, + "loss": 0.2044, + "step": 13868 + }, + { + "epoch": 1.3065165681448858, + "grad_norm": 0.6668272018432617, + "learning_rate": 5.567043327062212e-06, + "loss": 0.2594, + "step": 13869 + }, + { + "epoch": 1.3066107722380538, + "grad_norm": 0.6111574172973633, + "learning_rate": 5.5656898344507224e-06, + "loss": 0.1894, + "step": 13870 + }, + { + "epoch": 1.3067049763312215, + "grad_norm": 0.636658787727356, + "learning_rate": 5.564336442948362e-06, + "loss": 0.2069, + "step": 13871 + }, + { + "epoch": 1.3067991804243895, + "grad_norm": 0.6372724175453186, + "learning_rate": 5.562983152585986e-06, + "loss": 0.2114, + "step": 13872 + }, + { + "epoch": 1.3068933845175572, + "grad_norm": 0.7113171219825745, + "learning_rate": 5.561629963394457e-06, + "loss": 0.2186, + "step": 13873 + }, + { + "epoch": 1.3069875886107252, + "grad_norm": 0.6116046905517578, + "learning_rate": 5.560276875404631e-06, + "loss": 0.2081, + "step": 13874 + }, + { + "epoch": 1.307081792703893, + "grad_norm": 0.6473698616027832, + "learning_rate": 5.5589238886473565e-06, + "loss": 0.1983, + "step": 13875 + }, + { + "epoch": 1.3071759967970609, + "grad_norm": 0.723466157913208, + "learning_rate": 5.557571003153485e-06, + "loss": 0.2395, + "step": 13876 + }, + { + "epoch": 1.3072702008902286, + "grad_norm": 0.7086714506149292, + "learning_rate": 5.5562182189538684e-06, + "loss": 0.2052, + "step": 13877 + }, + { + "epoch": 1.3073644049833966, + "grad_norm": 0.6335019469261169, + "learning_rate": 5.554865536079346e-06, + "loss": 0.1835, + "step": 13878 + }, + { + "epoch": 1.3074586090765643, + "grad_norm": 0.631964921951294, + "learning_rate": 5.5535129545607625e-06, + "loss": 0.2088, + "step": 13879 + }, + { + "epoch": 1.3075528131697323, + "grad_norm": 0.6842601299285889, + "learning_rate": 5.552160474428965e-06, + "loss": 0.2314, + "step": 13880 + }, + { + "epoch": 1.3076470172629, + "grad_norm": 0.6788905262947083, + "learning_rate": 5.550808095714784e-06, + "loss": 0.1907, + "step": 13881 + }, + { + "epoch": 1.307741221356068, + "grad_norm": 0.6198939681053162, + "learning_rate": 5.549455818449061e-06, + "loss": 0.2042, + "step": 13882 + }, + { + "epoch": 1.3078354254492357, + "grad_norm": 0.6979933977127075, + "learning_rate": 5.548103642662626e-06, + "loss": 0.1756, + "step": 13883 + }, + { + "epoch": 1.3079296295424037, + "grad_norm": 0.779558539390564, + "learning_rate": 5.546751568386313e-06, + "loss": 0.2351, + "step": 13884 + }, + { + "epoch": 1.3080238336355714, + "grad_norm": 0.6771942377090454, + "learning_rate": 5.545399595650956e-06, + "loss": 0.2274, + "step": 13885 + }, + { + "epoch": 1.3081180377287394, + "grad_norm": 0.6119150519371033, + "learning_rate": 5.544047724487371e-06, + "loss": 0.1871, + "step": 13886 + }, + { + "epoch": 1.308212241821907, + "grad_norm": 0.6790033578872681, + "learning_rate": 5.542695954926391e-06, + "loss": 0.199, + "step": 13887 + }, + { + "epoch": 1.308306445915075, + "grad_norm": 0.6950446963310242, + "learning_rate": 5.54134428699884e-06, + "loss": 0.2046, + "step": 13888 + }, + { + "epoch": 1.3084006500082428, + "grad_norm": 0.6052107214927673, + "learning_rate": 5.5399927207355305e-06, + "loss": 0.2023, + "step": 13889 + }, + { + "epoch": 1.3084948541014108, + "grad_norm": 0.649046003818512, + "learning_rate": 5.538641256167285e-06, + "loss": 0.205, + "step": 13890 + }, + { + "epoch": 1.3085890581945785, + "grad_norm": 0.6405214071273804, + "learning_rate": 5.537289893324922e-06, + "loss": 0.1841, + "step": 13891 + }, + { + "epoch": 1.3086832622877465, + "grad_norm": 0.6337867379188538, + "learning_rate": 5.535938632239253e-06, + "loss": 0.179, + "step": 13892 + }, + { + "epoch": 1.3087774663809142, + "grad_norm": 0.7240697145462036, + "learning_rate": 5.53458747294108e-06, + "loss": 0.2294, + "step": 13893 + }, + { + "epoch": 1.3088716704740821, + "grad_norm": 0.7006561756134033, + "learning_rate": 5.533236415461221e-06, + "loss": 0.1989, + "step": 13894 + }, + { + "epoch": 1.3089658745672499, + "grad_norm": 0.6407173871994019, + "learning_rate": 5.531885459830481e-06, + "loss": 0.1989, + "step": 13895 + }, + { + "epoch": 1.3090600786604178, + "grad_norm": 0.5988685488700867, + "learning_rate": 5.53053460607966e-06, + "loss": 0.1932, + "step": 13896 + }, + { + "epoch": 1.3091542827535856, + "grad_norm": 0.6343111395835876, + "learning_rate": 5.529183854239563e-06, + "loss": 0.1818, + "step": 13897 + }, + { + "epoch": 1.3092484868467535, + "grad_norm": 0.6254624128341675, + "learning_rate": 5.527833204340992e-06, + "loss": 0.2141, + "step": 13898 + }, + { + "epoch": 1.3093426909399213, + "grad_norm": 0.6266101002693176, + "learning_rate": 5.526482656414737e-06, + "loss": 0.1938, + "step": 13899 + }, + { + "epoch": 1.3094368950330892, + "grad_norm": 0.6078035235404968, + "learning_rate": 5.525132210491595e-06, + "loss": 0.1739, + "step": 13900 + }, + { + "epoch": 1.309531099126257, + "grad_norm": 0.5897152423858643, + "learning_rate": 5.523781866602362e-06, + "loss": 0.1692, + "step": 13901 + }, + { + "epoch": 1.309625303219425, + "grad_norm": 0.6647896766662598, + "learning_rate": 5.522431624777822e-06, + "loss": 0.1866, + "step": 13902 + }, + { + "epoch": 1.3097195073125927, + "grad_norm": 0.7052650451660156, + "learning_rate": 5.521081485048766e-06, + "loss": 0.1921, + "step": 13903 + }, + { + "epoch": 1.3098137114057606, + "grad_norm": 0.5926150679588318, + "learning_rate": 5.519731447445983e-06, + "loss": 0.185, + "step": 13904 + }, + { + "epoch": 1.3099079154989284, + "grad_norm": 0.7053104639053345, + "learning_rate": 5.518381512000252e-06, + "loss": 0.2235, + "step": 13905 + }, + { + "epoch": 1.3100021195920963, + "grad_norm": 0.6310542821884155, + "learning_rate": 5.517031678742344e-06, + "loss": 0.187, + "step": 13906 + }, + { + "epoch": 1.310096323685264, + "grad_norm": 0.6946753859519958, + "learning_rate": 5.515681947703055e-06, + "loss": 0.2191, + "step": 13907 + }, + { + "epoch": 1.310190527778432, + "grad_norm": 0.6307712197303772, + "learning_rate": 5.514332318913154e-06, + "loss": 0.2025, + "step": 13908 + }, + { + "epoch": 1.3102847318715998, + "grad_norm": 0.667898416519165, + "learning_rate": 5.512982792403405e-06, + "loss": 0.2105, + "step": 13909 + }, + { + "epoch": 1.3103789359647677, + "grad_norm": 0.6130701899528503, + "learning_rate": 5.511633368204596e-06, + "loss": 0.1994, + "step": 13910 + }, + { + "epoch": 1.3104731400579355, + "grad_norm": 0.6175673007965088, + "learning_rate": 5.510284046347487e-06, + "loss": 0.1743, + "step": 13911 + }, + { + "epoch": 1.3105673441511034, + "grad_norm": 0.6781598925590515, + "learning_rate": 5.508934826862842e-06, + "loss": 0.2126, + "step": 13912 + }, + { + "epoch": 1.3106615482442712, + "grad_norm": 0.6490588188171387, + "learning_rate": 5.507585709781427e-06, + "loss": 0.2067, + "step": 13913 + }, + { + "epoch": 1.3107557523374391, + "grad_norm": 0.6518041491508484, + "learning_rate": 5.5062366951340105e-06, + "loss": 0.2147, + "step": 13914 + }, + { + "epoch": 1.3108499564306069, + "grad_norm": 0.6279114484786987, + "learning_rate": 5.504887782951343e-06, + "loss": 0.2067, + "step": 13915 + }, + { + "epoch": 1.3109441605237748, + "grad_norm": 0.6991190910339355, + "learning_rate": 5.503538973264185e-06, + "loss": 0.2022, + "step": 13916 + }, + { + "epoch": 1.3110383646169426, + "grad_norm": 0.6147141456604004, + "learning_rate": 5.502190266103298e-06, + "loss": 0.1905, + "step": 13917 + }, + { + "epoch": 1.3111325687101105, + "grad_norm": 0.6829583644866943, + "learning_rate": 5.5008416614994235e-06, + "loss": 0.1844, + "step": 13918 + }, + { + "epoch": 1.3112267728032783, + "grad_norm": 0.6109396815299988, + "learning_rate": 5.499493159483318e-06, + "loss": 0.2091, + "step": 13919 + }, + { + "epoch": 1.3113209768964462, + "grad_norm": 0.6405788064002991, + "learning_rate": 5.498144760085732e-06, + "loss": 0.2102, + "step": 13920 + }, + { + "epoch": 1.311415180989614, + "grad_norm": 0.702020525932312, + "learning_rate": 5.496796463337404e-06, + "loss": 0.188, + "step": 13921 + }, + { + "epoch": 1.311509385082782, + "grad_norm": 0.7264140844345093, + "learning_rate": 5.495448269269085e-06, + "loss": 0.1992, + "step": 13922 + }, + { + "epoch": 1.3116035891759497, + "grad_norm": 0.5828751921653748, + "learning_rate": 5.494100177911508e-06, + "loss": 0.1694, + "step": 13923 + }, + { + "epoch": 1.3116977932691176, + "grad_norm": 0.6680941581726074, + "learning_rate": 5.492752189295415e-06, + "loss": 0.2218, + "step": 13924 + }, + { + "epoch": 1.3117919973622854, + "grad_norm": 0.6263971328735352, + "learning_rate": 5.491404303451546e-06, + "loss": 0.2012, + "step": 13925 + }, + { + "epoch": 1.3118862014554533, + "grad_norm": 0.6877226829528809, + "learning_rate": 5.49005652041063e-06, + "loss": 0.209, + "step": 13926 + }, + { + "epoch": 1.311980405548621, + "grad_norm": 0.704127311706543, + "learning_rate": 5.488708840203398e-06, + "loss": 0.2108, + "step": 13927 + }, + { + "epoch": 1.312074609641789, + "grad_norm": 0.6672877669334412, + "learning_rate": 5.487361262860587e-06, + "loss": 0.2166, + "step": 13928 + }, + { + "epoch": 1.3121688137349568, + "grad_norm": 0.6394596099853516, + "learning_rate": 5.486013788412912e-06, + "loss": 0.1595, + "step": 13929 + }, + { + "epoch": 1.3122630178281247, + "grad_norm": 0.659265398979187, + "learning_rate": 5.484666416891109e-06, + "loss": 0.1943, + "step": 13930 + }, + { + "epoch": 1.3123572219212924, + "grad_norm": 0.6087794899940491, + "learning_rate": 5.483319148325891e-06, + "loss": 0.1968, + "step": 13931 + }, + { + "epoch": 1.3124514260144604, + "grad_norm": 0.7124664783477783, + "learning_rate": 5.481971982747985e-06, + "loss": 0.2051, + "step": 13932 + }, + { + "epoch": 1.3125456301076281, + "grad_norm": 0.5750867128372192, + "learning_rate": 5.4806249201881e-06, + "loss": 0.1912, + "step": 13933 + }, + { + "epoch": 1.312639834200796, + "grad_norm": 0.6283868551254272, + "learning_rate": 5.479277960676959e-06, + "loss": 0.2112, + "step": 13934 + }, + { + "epoch": 1.3127340382939638, + "grad_norm": 0.6514611840248108, + "learning_rate": 5.4779311042452735e-06, + "loss": 0.1917, + "step": 13935 + }, + { + "epoch": 1.3128282423871318, + "grad_norm": 0.6681877374649048, + "learning_rate": 5.476584350923749e-06, + "loss": 0.2039, + "step": 13936 + }, + { + "epoch": 1.3129224464802995, + "grad_norm": 0.6883458495140076, + "learning_rate": 5.4752377007430966e-06, + "loss": 0.2169, + "step": 13937 + }, + { + "epoch": 1.3130166505734673, + "grad_norm": 0.610941469669342, + "learning_rate": 5.4738911537340275e-06, + "loss": 0.1983, + "step": 13938 + }, + { + "epoch": 1.3131108546666352, + "grad_norm": 0.6219098567962646, + "learning_rate": 5.472544709927234e-06, + "loss": 0.1824, + "step": 13939 + }, + { + "epoch": 1.3132050587598032, + "grad_norm": 0.7274148464202881, + "learning_rate": 5.471198369353425e-06, + "loss": 0.2126, + "step": 13940 + }, + { + "epoch": 1.313299262852971, + "grad_norm": 0.7503207921981812, + "learning_rate": 5.469852132043301e-06, + "loss": 0.2256, + "step": 13941 + }, + { + "epoch": 1.3133934669461387, + "grad_norm": 0.6928357481956482, + "learning_rate": 5.468505998027549e-06, + "loss": 0.206, + "step": 13942 + }, + { + "epoch": 1.3134876710393066, + "grad_norm": 0.6850817799568176, + "learning_rate": 5.467159967336868e-06, + "loss": 0.2092, + "step": 13943 + }, + { + "epoch": 1.3135818751324746, + "grad_norm": 0.6698310971260071, + "learning_rate": 5.465814040001955e-06, + "loss": 0.2052, + "step": 13944 + }, + { + "epoch": 1.3136760792256423, + "grad_norm": 0.6120530962944031, + "learning_rate": 5.464468216053493e-06, + "loss": 0.1855, + "step": 13945 + }, + { + "epoch": 1.31377028331881, + "grad_norm": 0.6629364490509033, + "learning_rate": 5.463122495522164e-06, + "loss": 0.2162, + "step": 13946 + }, + { + "epoch": 1.313864487411978, + "grad_norm": 0.6264070272445679, + "learning_rate": 5.461776878438665e-06, + "loss": 0.1888, + "step": 13947 + }, + { + "epoch": 1.313958691505146, + "grad_norm": 0.5900087952613831, + "learning_rate": 5.460431364833673e-06, + "loss": 0.1791, + "step": 13948 + }, + { + "epoch": 1.3140528955983137, + "grad_norm": 0.7165248394012451, + "learning_rate": 5.459085954737857e-06, + "loss": 0.1723, + "step": 13949 + }, + { + "epoch": 1.3141470996914815, + "grad_norm": 0.6292446851730347, + "learning_rate": 5.4577406481819125e-06, + "loss": 0.2123, + "step": 13950 + }, + { + "epoch": 1.3142413037846494, + "grad_norm": 0.6633747220039368, + "learning_rate": 5.456395445196506e-06, + "loss": 0.2055, + "step": 13951 + }, + { + "epoch": 1.3143355078778174, + "grad_norm": 0.6716272234916687, + "learning_rate": 5.455050345812306e-06, + "loss": 0.2315, + "step": 13952 + }, + { + "epoch": 1.3144297119709851, + "grad_norm": 0.693078875541687, + "learning_rate": 5.453705350059988e-06, + "loss": 0.2241, + "step": 13953 + }, + { + "epoch": 1.3145239160641529, + "grad_norm": 0.8464998602867126, + "learning_rate": 5.452360457970222e-06, + "loss": 0.2064, + "step": 13954 + }, + { + "epoch": 1.3146181201573208, + "grad_norm": 0.6772229075431824, + "learning_rate": 5.451015669573666e-06, + "loss": 0.2122, + "step": 13955 + }, + { + "epoch": 1.3147123242504888, + "grad_norm": 0.6633017063140869, + "learning_rate": 5.449670984900988e-06, + "loss": 0.2097, + "step": 13956 + }, + { + "epoch": 1.3148065283436565, + "grad_norm": 0.7297033071517944, + "learning_rate": 5.448326403982854e-06, + "loss": 0.2433, + "step": 13957 + }, + { + "epoch": 1.3149007324368243, + "grad_norm": 0.5902170538902283, + "learning_rate": 5.446981926849912e-06, + "loss": 0.1731, + "step": 13958 + }, + { + "epoch": 1.3149949365299922, + "grad_norm": 0.6542876362800598, + "learning_rate": 5.445637553532825e-06, + "loss": 0.1998, + "step": 13959 + }, + { + "epoch": 1.3150891406231602, + "grad_norm": 0.7086982727050781, + "learning_rate": 5.444293284062248e-06, + "loss": 0.215, + "step": 13960 + }, + { + "epoch": 1.315183344716328, + "grad_norm": 0.7683100700378418, + "learning_rate": 5.442949118468826e-06, + "loss": 0.1876, + "step": 13961 + }, + { + "epoch": 1.3152775488094957, + "grad_norm": 0.7870001792907715, + "learning_rate": 5.441605056783216e-06, + "loss": 0.2091, + "step": 13962 + }, + { + "epoch": 1.3153717529026636, + "grad_norm": 0.6686782240867615, + "learning_rate": 5.440261099036056e-06, + "loss": 0.1932, + "step": 13963 + }, + { + "epoch": 1.3154659569958316, + "grad_norm": 0.7002503275871277, + "learning_rate": 5.4389172452579956e-06, + "loss": 0.2336, + "step": 13964 + }, + { + "epoch": 1.3155601610889993, + "grad_norm": 0.743320643901825, + "learning_rate": 5.437573495479678e-06, + "loss": 0.2307, + "step": 13965 + }, + { + "epoch": 1.315654365182167, + "grad_norm": 0.6570819020271301, + "learning_rate": 5.436229849731738e-06, + "loss": 0.1951, + "step": 13966 + }, + { + "epoch": 1.315748569275335, + "grad_norm": 0.6411550641059875, + "learning_rate": 5.434886308044816e-06, + "loss": 0.1872, + "step": 13967 + }, + { + "epoch": 1.315842773368503, + "grad_norm": 0.6603879928588867, + "learning_rate": 5.433542870449549e-06, + "loss": 0.2077, + "step": 13968 + }, + { + "epoch": 1.3159369774616707, + "grad_norm": 0.6315999031066895, + "learning_rate": 5.432199536976563e-06, + "loss": 0.2121, + "step": 13969 + }, + { + "epoch": 1.3160311815548384, + "grad_norm": 0.6271128058433533, + "learning_rate": 5.430856307656495e-06, + "loss": 0.2052, + "step": 13970 + }, + { + "epoch": 1.3161253856480064, + "grad_norm": 0.6487036347389221, + "learning_rate": 5.429513182519967e-06, + "loss": 0.195, + "step": 13971 + }, + { + "epoch": 1.3162195897411744, + "grad_norm": 0.6588313579559326, + "learning_rate": 5.42817016159761e-06, + "loss": 0.1976, + "step": 13972 + }, + { + "epoch": 1.316313793834342, + "grad_norm": 0.6937627196311951, + "learning_rate": 5.426827244920041e-06, + "loss": 0.2223, + "step": 13973 + }, + { + "epoch": 1.3164079979275098, + "grad_norm": 0.6528636813163757, + "learning_rate": 5.425484432517882e-06, + "loss": 0.2179, + "step": 13974 + }, + { + "epoch": 1.3165022020206778, + "grad_norm": 0.6710507273674011, + "learning_rate": 5.424141724421757e-06, + "loss": 0.1884, + "step": 13975 + }, + { + "epoch": 1.3165964061138458, + "grad_norm": 0.65396648645401, + "learning_rate": 5.422799120662273e-06, + "loss": 0.1754, + "step": 13976 + }, + { + "epoch": 1.3166906102070135, + "grad_norm": 0.6715162396430969, + "learning_rate": 5.421456621270048e-06, + "loss": 0.1903, + "step": 13977 + }, + { + "epoch": 1.3167848143001812, + "grad_norm": 0.6242902278900146, + "learning_rate": 5.420114226275697e-06, + "loss": 0.2103, + "step": 13978 + }, + { + "epoch": 1.3168790183933492, + "grad_norm": 0.68259197473526, + "learning_rate": 5.418771935709821e-06, + "loss": 0.2247, + "step": 13979 + }, + { + "epoch": 1.3169732224865172, + "grad_norm": 0.63609379529953, + "learning_rate": 5.417429749603029e-06, + "loss": 0.2048, + "step": 13980 + }, + { + "epoch": 1.317067426579685, + "grad_norm": 0.6944010853767395, + "learning_rate": 5.416087667985931e-06, + "loss": 0.2493, + "step": 13981 + }, + { + "epoch": 1.3171616306728526, + "grad_norm": 0.7394487261772156, + "learning_rate": 5.414745690889122e-06, + "loss": 0.215, + "step": 13982 + }, + { + "epoch": 1.3172558347660206, + "grad_norm": 0.6646633148193359, + "learning_rate": 5.413403818343195e-06, + "loss": 0.2234, + "step": 13983 + }, + { + "epoch": 1.3173500388591886, + "grad_norm": 0.5564711093902588, + "learning_rate": 5.412062050378764e-06, + "loss": 0.1872, + "step": 13984 + }, + { + "epoch": 1.3174442429523563, + "grad_norm": 0.691422164440155, + "learning_rate": 5.410720387026411e-06, + "loss": 0.2264, + "step": 13985 + }, + { + "epoch": 1.317538447045524, + "grad_norm": 0.7079923152923584, + "learning_rate": 5.4093788283167235e-06, + "loss": 0.2186, + "step": 13986 + }, + { + "epoch": 1.317632651138692, + "grad_norm": 0.7273160815238953, + "learning_rate": 5.408037374280306e-06, + "loss": 0.1958, + "step": 13987 + }, + { + "epoch": 1.31772685523186, + "grad_norm": 0.7292044758796692, + "learning_rate": 5.406696024947739e-06, + "loss": 0.2151, + "step": 13988 + }, + { + "epoch": 1.3178210593250277, + "grad_norm": 0.7436671853065491, + "learning_rate": 5.4053547803495966e-06, + "loss": 0.2196, + "step": 13989 + }, + { + "epoch": 1.3179152634181954, + "grad_norm": 0.6820604801177979, + "learning_rate": 5.4040136405164815e-06, + "loss": 0.2045, + "step": 13990 + }, + { + "epoch": 1.3180094675113634, + "grad_norm": 0.6173348426818848, + "learning_rate": 5.40267260547896e-06, + "loss": 0.1934, + "step": 13991 + }, + { + "epoch": 1.3181036716045313, + "grad_norm": 0.5947291851043701, + "learning_rate": 5.401331675267612e-06, + "loss": 0.1781, + "step": 13992 + }, + { + "epoch": 1.318197875697699, + "grad_norm": 0.6224139332771301, + "learning_rate": 5.399990849913013e-06, + "loss": 0.1892, + "step": 13993 + }, + { + "epoch": 1.3182920797908668, + "grad_norm": 0.6537177562713623, + "learning_rate": 5.3986501294457396e-06, + "loss": 0.1859, + "step": 13994 + }, + { + "epoch": 1.3183862838840348, + "grad_norm": 0.6509084105491638, + "learning_rate": 5.397309513896356e-06, + "loss": 0.1821, + "step": 13995 + }, + { + "epoch": 1.3184804879772027, + "grad_norm": 0.575751543045044, + "learning_rate": 5.395969003295434e-06, + "loss": 0.1887, + "step": 13996 + }, + { + "epoch": 1.3185746920703705, + "grad_norm": 1.0063947439193726, + "learning_rate": 5.394628597673541e-06, + "loss": 0.2199, + "step": 13997 + }, + { + "epoch": 1.3186688961635382, + "grad_norm": 0.7250304818153381, + "learning_rate": 5.393288297061237e-06, + "loss": 0.2445, + "step": 13998 + }, + { + "epoch": 1.3187631002567062, + "grad_norm": 0.6793080568313599, + "learning_rate": 5.391948101489083e-06, + "loss": 0.2043, + "step": 13999 + }, + { + "epoch": 1.3188573043498741, + "grad_norm": 0.6111436486244202, + "learning_rate": 5.3906080109876435e-06, + "loss": 0.2122, + "step": 14000 + }, + { + "epoch": 1.3189515084430419, + "grad_norm": 0.6745696663856506, + "learning_rate": 5.389268025587465e-06, + "loss": 0.2288, + "step": 14001 + }, + { + "epoch": 1.3190457125362096, + "grad_norm": 0.6720831990242004, + "learning_rate": 5.38792814531911e-06, + "loss": 0.1859, + "step": 14002 + }, + { + "epoch": 1.3191399166293776, + "grad_norm": 0.6277100443840027, + "learning_rate": 5.386588370213124e-06, + "loss": 0.1928, + "step": 14003 + }, + { + "epoch": 1.3192341207225453, + "grad_norm": 0.6584670543670654, + "learning_rate": 5.385248700300055e-06, + "loss": 0.2018, + "step": 14004 + }, + { + "epoch": 1.3193283248157133, + "grad_norm": 0.5974646210670471, + "learning_rate": 5.383909135610459e-06, + "loss": 0.1668, + "step": 14005 + }, + { + "epoch": 1.319422528908881, + "grad_norm": 0.7366122603416443, + "learning_rate": 5.382569676174868e-06, + "loss": 0.211, + "step": 14006 + }, + { + "epoch": 1.319516733002049, + "grad_norm": 0.6093755960464478, + "learning_rate": 5.3812303220238295e-06, + "loss": 0.1678, + "step": 14007 + }, + { + "epoch": 1.3196109370952167, + "grad_norm": 0.6341589093208313, + "learning_rate": 5.379891073187888e-06, + "loss": 0.1884, + "step": 14008 + }, + { + "epoch": 1.3197051411883847, + "grad_norm": 0.6902104616165161, + "learning_rate": 5.378551929697571e-06, + "loss": 0.2128, + "step": 14009 + }, + { + "epoch": 1.3197993452815524, + "grad_norm": 0.6470848321914673, + "learning_rate": 5.377212891583419e-06, + "loss": 0.1853, + "step": 14010 + }, + { + "epoch": 1.3198935493747204, + "grad_norm": 0.6759555339813232, + "learning_rate": 5.375873958875961e-06, + "loss": 0.2055, + "step": 14011 + }, + { + "epoch": 1.319987753467888, + "grad_norm": 0.6148934960365295, + "learning_rate": 5.374535131605731e-06, + "loss": 0.2123, + "step": 14012 + }, + { + "epoch": 1.320081957561056, + "grad_norm": 0.6794771552085876, + "learning_rate": 5.3731964098032495e-06, + "loss": 0.2107, + "step": 14013 + }, + { + "epoch": 1.3201761616542238, + "grad_norm": 0.6981692314147949, + "learning_rate": 5.3718577934990446e-06, + "loss": 0.1738, + "step": 14014 + }, + { + "epoch": 1.3202703657473918, + "grad_norm": 0.6872310042381287, + "learning_rate": 5.370519282723644e-06, + "loss": 0.1796, + "step": 14015 + }, + { + "epoch": 1.3203645698405595, + "grad_norm": 0.6473656296730042, + "learning_rate": 5.36918087750756e-06, + "loss": 0.2132, + "step": 14016 + }, + { + "epoch": 1.3204587739337275, + "grad_norm": 0.6503841876983643, + "learning_rate": 5.367842577881313e-06, + "loss": 0.2144, + "step": 14017 + }, + { + "epoch": 1.3205529780268952, + "grad_norm": 0.666084349155426, + "learning_rate": 5.3665043838754235e-06, + "loss": 0.2195, + "step": 14018 + }, + { + "epoch": 1.3206471821200632, + "grad_norm": 0.7067003846168518, + "learning_rate": 5.365166295520395e-06, + "loss": 0.2094, + "step": 14019 + }, + { + "epoch": 1.320741386213231, + "grad_norm": 0.6496502161026001, + "learning_rate": 5.363828312846744e-06, + "loss": 0.2137, + "step": 14020 + }, + { + "epoch": 1.3208355903063989, + "grad_norm": 0.6001673340797424, + "learning_rate": 5.362490435884982e-06, + "loss": 0.2141, + "step": 14021 + }, + { + "epoch": 1.3209297943995666, + "grad_norm": 0.6753737926483154, + "learning_rate": 5.361152664665608e-06, + "loss": 0.2276, + "step": 14022 + }, + { + "epoch": 1.3210239984927346, + "grad_norm": 0.595683217048645, + "learning_rate": 5.3598149992191204e-06, + "loss": 0.1954, + "step": 14023 + }, + { + "epoch": 1.3211182025859023, + "grad_norm": 0.6467253565788269, + "learning_rate": 5.358477439576035e-06, + "loss": 0.2154, + "step": 14024 + }, + { + "epoch": 1.3212124066790703, + "grad_norm": 0.7814813852310181, + "learning_rate": 5.357139985766843e-06, + "loss": 0.1985, + "step": 14025 + }, + { + "epoch": 1.321306610772238, + "grad_norm": 0.6634308695793152, + "learning_rate": 5.355802637822031e-06, + "loss": 0.1991, + "step": 14026 + }, + { + "epoch": 1.321400814865406, + "grad_norm": 0.6398400068283081, + "learning_rate": 5.354465395772108e-06, + "loss": 0.1898, + "step": 14027 + }, + { + "epoch": 1.3214950189585737, + "grad_norm": 0.6499345302581787, + "learning_rate": 5.35312825964756e-06, + "loss": 0.2161, + "step": 14028 + }, + { + "epoch": 1.3215892230517416, + "grad_norm": 0.6460532546043396, + "learning_rate": 5.351791229478866e-06, + "loss": 0.2072, + "step": 14029 + }, + { + "epoch": 1.3216834271449094, + "grad_norm": 0.6906152963638306, + "learning_rate": 5.350454305296529e-06, + "loss": 0.2221, + "step": 14030 + }, + { + "epoch": 1.3217776312380773, + "grad_norm": 0.7326046228408813, + "learning_rate": 5.349117487131024e-06, + "loss": 0.2123, + "step": 14031 + }, + { + "epoch": 1.321871835331245, + "grad_norm": 0.6765629053115845, + "learning_rate": 5.347780775012828e-06, + "loss": 0.2138, + "step": 14032 + }, + { + "epoch": 1.321966039424413, + "grad_norm": 0.6705592274665833, + "learning_rate": 5.346444168972426e-06, + "loss": 0.2313, + "step": 14033 + }, + { + "epoch": 1.3220602435175808, + "grad_norm": 0.6718334555625916, + "learning_rate": 5.345107669040298e-06, + "loss": 0.2331, + "step": 14034 + }, + { + "epoch": 1.3221544476107487, + "grad_norm": 0.6745386123657227, + "learning_rate": 5.34377127524691e-06, + "loss": 0.1979, + "step": 14035 + }, + { + "epoch": 1.3222486517039165, + "grad_norm": 0.6806681156158447, + "learning_rate": 5.342434987622738e-06, + "loss": 0.2322, + "step": 14036 + }, + { + "epoch": 1.3223428557970844, + "grad_norm": 0.6321537494659424, + "learning_rate": 5.3410988061982545e-06, + "loss": 0.1948, + "step": 14037 + }, + { + "epoch": 1.3224370598902522, + "grad_norm": 0.6890372633934021, + "learning_rate": 5.3397627310039215e-06, + "loss": 0.2133, + "step": 14038 + }, + { + "epoch": 1.3225312639834201, + "grad_norm": 0.5901936292648315, + "learning_rate": 5.338426762070204e-06, + "loss": 0.1788, + "step": 14039 + }, + { + "epoch": 1.3226254680765879, + "grad_norm": 0.7906286120414734, + "learning_rate": 5.337090899427568e-06, + "loss": 0.207, + "step": 14040 + }, + { + "epoch": 1.3227196721697558, + "grad_norm": 0.7934432625770569, + "learning_rate": 5.335755143106469e-06, + "loss": 0.2294, + "step": 14041 + }, + { + "epoch": 1.3228138762629236, + "grad_norm": 0.6328506469726562, + "learning_rate": 5.33441949313737e-06, + "loss": 0.205, + "step": 14042 + }, + { + "epoch": 1.3229080803560915, + "grad_norm": 0.5785630941390991, + "learning_rate": 5.333083949550717e-06, + "loss": 0.1797, + "step": 14043 + }, + { + "epoch": 1.3230022844492593, + "grad_norm": 0.6074365973472595, + "learning_rate": 5.3317485123769685e-06, + "loss": 0.1917, + "step": 14044 + }, + { + "epoch": 1.3230964885424272, + "grad_norm": 0.6671708822250366, + "learning_rate": 5.330413181646577e-06, + "loss": 0.2148, + "step": 14045 + }, + { + "epoch": 1.323190692635595, + "grad_norm": 0.6195216774940491, + "learning_rate": 5.329077957389982e-06, + "loss": 0.1682, + "step": 14046 + }, + { + "epoch": 1.323284896728763, + "grad_norm": 0.7096898555755615, + "learning_rate": 5.327742839637637e-06, + "loss": 0.19, + "step": 14047 + }, + { + "epoch": 1.3233791008219307, + "grad_norm": 0.9569317102432251, + "learning_rate": 5.32640782841998e-06, + "loss": 0.2011, + "step": 14048 + }, + { + "epoch": 1.3234733049150986, + "grad_norm": 0.6340876817703247, + "learning_rate": 5.325072923767449e-06, + "loss": 0.1843, + "step": 14049 + }, + { + "epoch": 1.3235675090082664, + "grad_norm": 0.6779504418373108, + "learning_rate": 5.323738125710492e-06, + "loss": 0.2316, + "step": 14050 + }, + { + "epoch": 1.3236617131014343, + "grad_norm": 0.6356896162033081, + "learning_rate": 5.322403434279531e-06, + "loss": 0.2039, + "step": 14051 + }, + { + "epoch": 1.323755917194602, + "grad_norm": 0.5953193306922913, + "learning_rate": 5.321068849505011e-06, + "loss": 0.2044, + "step": 14052 + }, + { + "epoch": 1.32385012128777, + "grad_norm": 0.6592140793800354, + "learning_rate": 5.319734371417353e-06, + "loss": 0.1971, + "step": 14053 + }, + { + "epoch": 1.3239443253809378, + "grad_norm": 0.6701865196228027, + "learning_rate": 5.318400000046991e-06, + "loss": 0.189, + "step": 14054 + }, + { + "epoch": 1.3240385294741057, + "grad_norm": 0.6589224934577942, + "learning_rate": 5.317065735424352e-06, + "loss": 0.2135, + "step": 14055 + }, + { + "epoch": 1.3241327335672735, + "grad_norm": 0.7157045006752014, + "learning_rate": 5.315731577579853e-06, + "loss": 0.2019, + "step": 14056 + }, + { + "epoch": 1.3242269376604414, + "grad_norm": 0.6550059914588928, + "learning_rate": 5.314397526543917e-06, + "loss": 0.1998, + "step": 14057 + }, + { + "epoch": 1.3243211417536092, + "grad_norm": 0.6386902332305908, + "learning_rate": 5.313063582346969e-06, + "loss": 0.2047, + "step": 14058 + }, + { + "epoch": 1.3244153458467771, + "grad_norm": 0.6854854822158813, + "learning_rate": 5.311729745019416e-06, + "loss": 0.2276, + "step": 14059 + }, + { + "epoch": 1.3245095499399449, + "grad_norm": 0.6264967918395996, + "learning_rate": 5.310396014591675e-06, + "loss": 0.1905, + "step": 14060 + }, + { + "epoch": 1.3246037540331128, + "grad_norm": 0.674786388874054, + "learning_rate": 5.309062391094161e-06, + "loss": 0.2458, + "step": 14061 + }, + { + "epoch": 1.3246979581262806, + "grad_norm": 0.6399668455123901, + "learning_rate": 5.3077288745572794e-06, + "loss": 0.1985, + "step": 14062 + }, + { + "epoch": 1.3247921622194485, + "grad_norm": 0.6307435631752014, + "learning_rate": 5.306395465011429e-06, + "loss": 0.2084, + "step": 14063 + }, + { + "epoch": 1.3248863663126162, + "grad_norm": 0.686042845249176, + "learning_rate": 5.305062162487028e-06, + "loss": 0.1987, + "step": 14064 + }, + { + "epoch": 1.3249805704057842, + "grad_norm": 0.6557334065437317, + "learning_rate": 5.30372896701447e-06, + "loss": 0.2039, + "step": 14065 + }, + { + "epoch": 1.325074774498952, + "grad_norm": 0.7092450857162476, + "learning_rate": 5.302395878624148e-06, + "loss": 0.1889, + "step": 14066 + }, + { + "epoch": 1.32516897859212, + "grad_norm": 0.7731506824493408, + "learning_rate": 5.3010628973464716e-06, + "loss": 0.222, + "step": 14067 + }, + { + "epoch": 1.3252631826852876, + "grad_norm": 0.6983768343925476, + "learning_rate": 5.299730023211829e-06, + "loss": 0.2242, + "step": 14068 + }, + { + "epoch": 1.3253573867784556, + "grad_norm": 0.6925291419029236, + "learning_rate": 5.2983972562506025e-06, + "loss": 0.1994, + "step": 14069 + }, + { + "epoch": 1.3254515908716233, + "grad_norm": 0.6160935163497925, + "learning_rate": 5.297064596493198e-06, + "loss": 0.1922, + "step": 14070 + }, + { + "epoch": 1.3255457949647913, + "grad_norm": 0.6398634314537048, + "learning_rate": 5.2957320439699944e-06, + "loss": 0.2239, + "step": 14071 + }, + { + "epoch": 1.325639999057959, + "grad_norm": 0.5992037057876587, + "learning_rate": 5.2943995987113706e-06, + "loss": 0.1808, + "step": 14072 + }, + { + "epoch": 1.3257342031511268, + "grad_norm": 0.6360060572624207, + "learning_rate": 5.293067260747714e-06, + "loss": 0.2062, + "step": 14073 + }, + { + "epoch": 1.3258284072442947, + "grad_norm": 0.6519641280174255, + "learning_rate": 5.291735030109407e-06, + "loss": 0.2274, + "step": 14074 + }, + { + "epoch": 1.3259226113374627, + "grad_norm": 0.6662633419036865, + "learning_rate": 5.290402906826819e-06, + "loss": 0.2107, + "step": 14075 + }, + { + "epoch": 1.3260168154306304, + "grad_norm": 0.6444947123527527, + "learning_rate": 5.289070890930328e-06, + "loss": 0.2291, + "step": 14076 + }, + { + "epoch": 1.3261110195237982, + "grad_norm": 0.6289883255958557, + "learning_rate": 5.28773898245031e-06, + "loss": 0.183, + "step": 14077 + }, + { + "epoch": 1.3262052236169661, + "grad_norm": 0.6775181293487549, + "learning_rate": 5.286407181417128e-06, + "loss": 0.1887, + "step": 14078 + }, + { + "epoch": 1.326299427710134, + "grad_norm": 0.7216343283653259, + "learning_rate": 5.285075487861151e-06, + "loss": 0.1822, + "step": 14079 + }, + { + "epoch": 1.3263936318033018, + "grad_norm": 0.5782265663146973, + "learning_rate": 5.283743901812748e-06, + "loss": 0.1906, + "step": 14080 + }, + { + "epoch": 1.3264878358964696, + "grad_norm": 0.6510884761810303, + "learning_rate": 5.282412423302274e-06, + "loss": 0.2207, + "step": 14081 + }, + { + "epoch": 1.3265820399896375, + "grad_norm": 0.625861406326294, + "learning_rate": 5.281081052360098e-06, + "loss": 0.1882, + "step": 14082 + }, + { + "epoch": 1.3266762440828055, + "grad_norm": 0.602749764919281, + "learning_rate": 5.279749789016567e-06, + "loss": 0.1887, + "step": 14083 + }, + { + "epoch": 1.3267704481759732, + "grad_norm": 0.6150605082511902, + "learning_rate": 5.278418633302041e-06, + "loss": 0.1932, + "step": 14084 + }, + { + "epoch": 1.326864652269141, + "grad_norm": 0.7779563665390015, + "learning_rate": 5.277087585246876e-06, + "loss": 0.2358, + "step": 14085 + }, + { + "epoch": 1.326958856362309, + "grad_norm": 0.7492592930793762, + "learning_rate": 5.275756644881416e-06, + "loss": 0.2374, + "step": 14086 + }, + { + "epoch": 1.3270530604554769, + "grad_norm": 0.6166776418685913, + "learning_rate": 5.274425812236013e-06, + "loss": 0.196, + "step": 14087 + }, + { + "epoch": 1.3271472645486446, + "grad_norm": 0.7160036563873291, + "learning_rate": 5.2730950873410045e-06, + "loss": 0.2326, + "step": 14088 + }, + { + "epoch": 1.3272414686418124, + "grad_norm": 0.6125269532203674, + "learning_rate": 5.27176447022674e-06, + "loss": 0.2014, + "step": 14089 + }, + { + "epoch": 1.3273356727349803, + "grad_norm": 0.6119320392608643, + "learning_rate": 5.270433960923562e-06, + "loss": 0.2027, + "step": 14090 + }, + { + "epoch": 1.3274298768281483, + "grad_norm": 0.6584473252296448, + "learning_rate": 5.269103559461799e-06, + "loss": 0.2095, + "step": 14091 + }, + { + "epoch": 1.327524080921316, + "grad_norm": 0.5868294835090637, + "learning_rate": 5.2677732658717965e-06, + "loss": 0.1906, + "step": 14092 + }, + { + "epoch": 1.3276182850144838, + "grad_norm": 0.6940475106239319, + "learning_rate": 5.266443080183879e-06, + "loss": 0.1973, + "step": 14093 + }, + { + "epoch": 1.3277124891076517, + "grad_norm": 0.7088175415992737, + "learning_rate": 5.265113002428377e-06, + "loss": 0.2628, + "step": 14094 + }, + { + "epoch": 1.3278066932008197, + "grad_norm": 0.5621417760848999, + "learning_rate": 5.263783032635627e-06, + "loss": 0.1865, + "step": 14095 + }, + { + "epoch": 1.3279008972939874, + "grad_norm": 0.6609739661216736, + "learning_rate": 5.262453170835944e-06, + "loss": 0.2021, + "step": 14096 + }, + { + "epoch": 1.3279951013871552, + "grad_norm": 0.7245957851409912, + "learning_rate": 5.261123417059656e-06, + "loss": 0.1936, + "step": 14097 + }, + { + "epoch": 1.3280893054803231, + "grad_norm": 1.0036548376083374, + "learning_rate": 5.2597937713370865e-06, + "loss": 0.1882, + "step": 14098 + }, + { + "epoch": 1.328183509573491, + "grad_norm": 0.6218024492263794, + "learning_rate": 5.258464233698546e-06, + "loss": 0.1988, + "step": 14099 + }, + { + "epoch": 1.3282777136666588, + "grad_norm": 0.6181890964508057, + "learning_rate": 5.257134804174353e-06, + "loss": 0.18, + "step": 14100 + }, + { + "epoch": 1.3283719177598265, + "grad_norm": 0.6878771781921387, + "learning_rate": 5.255805482794827e-06, + "loss": 0.1988, + "step": 14101 + }, + { + "epoch": 1.3284661218529945, + "grad_norm": 0.659048855304718, + "learning_rate": 5.2544762695902715e-06, + "loss": 0.1829, + "step": 14102 + }, + { + "epoch": 1.3285603259461625, + "grad_norm": 0.6403631567955017, + "learning_rate": 5.25314716459099e-06, + "loss": 0.2128, + "step": 14103 + }, + { + "epoch": 1.3286545300393302, + "grad_norm": 0.6543837785720825, + "learning_rate": 5.2518181678273e-06, + "loss": 0.1944, + "step": 14104 + }, + { + "epoch": 1.328748734132498, + "grad_norm": 0.605215847492218, + "learning_rate": 5.250489279329501e-06, + "loss": 0.2278, + "step": 14105 + }, + { + "epoch": 1.328842938225666, + "grad_norm": 0.6835913062095642, + "learning_rate": 5.249160499127883e-06, + "loss": 0.1957, + "step": 14106 + }, + { + "epoch": 1.3289371423188339, + "grad_norm": 0.6238964796066284, + "learning_rate": 5.247831827252761e-06, + "loss": 0.1934, + "step": 14107 + }, + { + "epoch": 1.3290313464120016, + "grad_norm": 0.639494001865387, + "learning_rate": 5.246503263734422e-06, + "loss": 0.2141, + "step": 14108 + }, + { + "epoch": 1.3291255505051693, + "grad_norm": 0.6721289753913879, + "learning_rate": 5.245174808603153e-06, + "loss": 0.1906, + "step": 14109 + }, + { + "epoch": 1.3292197545983373, + "grad_norm": 0.6276894211769104, + "learning_rate": 5.24384646188926e-06, + "loss": 0.1941, + "step": 14110 + }, + { + "epoch": 1.3293139586915053, + "grad_norm": 0.6959485411643982, + "learning_rate": 5.242518223623022e-06, + "loss": 0.1932, + "step": 14111 + }, + { + "epoch": 1.329408162784673, + "grad_norm": 0.677562415599823, + "learning_rate": 5.2411900938347224e-06, + "loss": 0.1928, + "step": 14112 + }, + { + "epoch": 1.3295023668778407, + "grad_norm": 0.6668716669082642, + "learning_rate": 5.239862072554649e-06, + "loss": 0.1907, + "step": 14113 + }, + { + "epoch": 1.3295965709710087, + "grad_norm": 0.6348514556884766, + "learning_rate": 5.238534159813085e-06, + "loss": 0.2039, + "step": 14114 + }, + { + "epoch": 1.3296907750641767, + "grad_norm": 0.6691227555274963, + "learning_rate": 5.237206355640303e-06, + "loss": 0.2223, + "step": 14115 + }, + { + "epoch": 1.3297849791573444, + "grad_norm": 0.6502666473388672, + "learning_rate": 5.235878660066582e-06, + "loss": 0.2171, + "step": 14116 + }, + { + "epoch": 1.3298791832505121, + "grad_norm": 0.5818041563034058, + "learning_rate": 5.234551073122199e-06, + "loss": 0.2066, + "step": 14117 + }, + { + "epoch": 1.32997338734368, + "grad_norm": 0.7705210447311401, + "learning_rate": 5.2332235948374165e-06, + "loss": 0.2094, + "step": 14118 + }, + { + "epoch": 1.330067591436848, + "grad_norm": 0.6761450171470642, + "learning_rate": 5.23189622524251e-06, + "loss": 0.2167, + "step": 14119 + }, + { + "epoch": 1.3301617955300158, + "grad_norm": 0.6264675855636597, + "learning_rate": 5.230568964367746e-06, + "loss": 0.1916, + "step": 14120 + }, + { + "epoch": 1.3302559996231835, + "grad_norm": 0.5645196437835693, + "learning_rate": 5.229241812243382e-06, + "loss": 0.2047, + "step": 14121 + }, + { + "epoch": 1.3303502037163515, + "grad_norm": 0.6507833003997803, + "learning_rate": 5.227914768899687e-06, + "loss": 0.2037, + "step": 14122 + }, + { + "epoch": 1.3304444078095194, + "grad_norm": 0.6598931550979614, + "learning_rate": 5.226587834366911e-06, + "loss": 0.2128, + "step": 14123 + }, + { + "epoch": 1.3305386119026872, + "grad_norm": 0.6404772996902466, + "learning_rate": 5.225261008675315e-06, + "loss": 0.185, + "step": 14124 + }, + { + "epoch": 1.330632815995855, + "grad_norm": 0.6718472242355347, + "learning_rate": 5.223934291855155e-06, + "loss": 0.1919, + "step": 14125 + }, + { + "epoch": 1.3307270200890229, + "grad_norm": 0.6085643172264099, + "learning_rate": 5.222607683936678e-06, + "loss": 0.1915, + "step": 14126 + }, + { + "epoch": 1.3308212241821908, + "grad_norm": 0.7111129760742188, + "learning_rate": 5.221281184950135e-06, + "loss": 0.2161, + "step": 14127 + }, + { + "epoch": 1.3309154282753586, + "grad_norm": 0.6403906345367432, + "learning_rate": 5.21995479492577e-06, + "loss": 0.2101, + "step": 14128 + }, + { + "epoch": 1.3310096323685263, + "grad_norm": 0.7159192562103271, + "learning_rate": 5.218628513893829e-06, + "loss": 0.1727, + "step": 14129 + }, + { + "epoch": 1.3311038364616943, + "grad_norm": 0.6612852215766907, + "learning_rate": 5.217302341884554e-06, + "loss": 0.2318, + "step": 14130 + }, + { + "epoch": 1.3311980405548622, + "grad_norm": 0.6722060441970825, + "learning_rate": 5.215976278928179e-06, + "loss": 0.2087, + "step": 14131 + }, + { + "epoch": 1.33129224464803, + "grad_norm": 0.6088632941246033, + "learning_rate": 5.214650325054947e-06, + "loss": 0.1798, + "step": 14132 + }, + { + "epoch": 1.3313864487411977, + "grad_norm": 0.7001636624336243, + "learning_rate": 5.2133244802950855e-06, + "loss": 0.1932, + "step": 14133 + }, + { + "epoch": 1.3314806528343657, + "grad_norm": 0.6143582463264465, + "learning_rate": 5.211998744678828e-06, + "loss": 0.186, + "step": 14134 + }, + { + "epoch": 1.3315748569275336, + "grad_norm": 0.678594708442688, + "learning_rate": 5.210673118236407e-06, + "loss": 0.2423, + "step": 14135 + }, + { + "epoch": 1.3316690610207014, + "grad_norm": 1.064894199371338, + "learning_rate": 5.209347600998043e-06, + "loss": 0.1962, + "step": 14136 + }, + { + "epoch": 1.331763265113869, + "grad_norm": 0.6495254635810852, + "learning_rate": 5.208022192993963e-06, + "loss": 0.2058, + "step": 14137 + }, + { + "epoch": 1.331857469207037, + "grad_norm": 0.6739198565483093, + "learning_rate": 5.206696894254389e-06, + "loss": 0.2022, + "step": 14138 + }, + { + "epoch": 1.331951673300205, + "grad_norm": 0.6579096913337708, + "learning_rate": 5.205371704809537e-06, + "loss": 0.2033, + "step": 14139 + }, + { + "epoch": 1.3320458773933728, + "grad_norm": 0.5793762803077698, + "learning_rate": 5.204046624689625e-06, + "loss": 0.1724, + "step": 14140 + }, + { + "epoch": 1.3321400814865405, + "grad_norm": 0.6498485803604126, + "learning_rate": 5.202721653924872e-06, + "loss": 0.188, + "step": 14141 + }, + { + "epoch": 1.3322342855797085, + "grad_norm": 0.587631344795227, + "learning_rate": 5.201396792545483e-06, + "loss": 0.1688, + "step": 14142 + }, + { + "epoch": 1.3323284896728762, + "grad_norm": 0.7063856720924377, + "learning_rate": 5.200072040581661e-06, + "loss": 0.2104, + "step": 14143 + }, + { + "epoch": 1.3324226937660442, + "grad_norm": 0.6430703997612, + "learning_rate": 5.1987473980636275e-06, + "loss": 0.1869, + "step": 14144 + }, + { + "epoch": 1.332516897859212, + "grad_norm": 0.6656419038772583, + "learning_rate": 5.197422865021577e-06, + "loss": 0.2291, + "step": 14145 + }, + { + "epoch": 1.3326111019523799, + "grad_norm": 0.6350959539413452, + "learning_rate": 5.196098441485707e-06, + "loss": 0.2052, + "step": 14146 + }, + { + "epoch": 1.3327053060455476, + "grad_norm": 0.7392879128456116, + "learning_rate": 5.19477412748623e-06, + "loss": 0.2164, + "step": 14147 + }, + { + "epoch": 1.3327995101387156, + "grad_norm": 0.6880599856376648, + "learning_rate": 5.1934499230533335e-06, + "loss": 0.2199, + "step": 14148 + }, + { + "epoch": 1.3328937142318833, + "grad_norm": 0.6650460958480835, + "learning_rate": 5.192125828217203e-06, + "loss": 0.184, + "step": 14149 + }, + { + "epoch": 1.3329879183250513, + "grad_norm": 0.667985737323761, + "learning_rate": 5.190801843008049e-06, + "loss": 0.1913, + "step": 14150 + }, + { + "epoch": 1.333082122418219, + "grad_norm": 0.6559428572654724, + "learning_rate": 5.189477967456049e-06, + "loss": 0.2034, + "step": 14151 + }, + { + "epoch": 1.333176326511387, + "grad_norm": 0.671495258808136, + "learning_rate": 5.188154201591387e-06, + "loss": 0.1852, + "step": 14152 + }, + { + "epoch": 1.3332705306045547, + "grad_norm": 0.7077657580375671, + "learning_rate": 5.186830545444252e-06, + "loss": 0.2078, + "step": 14153 + }, + { + "epoch": 1.3333647346977227, + "grad_norm": 0.6105444431304932, + "learning_rate": 5.185506999044828e-06, + "loss": 0.206, + "step": 14154 + }, + { + "epoch": 1.3334589387908904, + "grad_norm": 0.7339476346969604, + "learning_rate": 5.184183562423286e-06, + "loss": 0.2156, + "step": 14155 + }, + { + "epoch": 1.3335531428840584, + "grad_norm": 0.6311266422271729, + "learning_rate": 5.182860235609807e-06, + "loss": 0.2014, + "step": 14156 + }, + { + "epoch": 1.333647346977226, + "grad_norm": 0.6553947329521179, + "learning_rate": 5.181537018634568e-06, + "loss": 0.2231, + "step": 14157 + }, + { + "epoch": 1.333741551070394, + "grad_norm": 0.6227595210075378, + "learning_rate": 5.180213911527733e-06, + "loss": 0.2078, + "step": 14158 + }, + { + "epoch": 1.3338357551635618, + "grad_norm": 0.6377168297767639, + "learning_rate": 5.178890914319474e-06, + "loss": 0.1951, + "step": 14159 + }, + { + "epoch": 1.3339299592567297, + "grad_norm": 0.7014271020889282, + "learning_rate": 5.177568027039963e-06, + "loss": 0.2146, + "step": 14160 + }, + { + "epoch": 1.3340241633498975, + "grad_norm": 0.6947354674339294, + "learning_rate": 5.176245249719355e-06, + "loss": 0.1796, + "step": 14161 + }, + { + "epoch": 1.3341183674430654, + "grad_norm": 0.728440523147583, + "learning_rate": 5.174922582387819e-06, + "loss": 0.2094, + "step": 14162 + }, + { + "epoch": 1.3342125715362332, + "grad_norm": 0.6573033928871155, + "learning_rate": 5.173600025075507e-06, + "loss": 0.1899, + "step": 14163 + }, + { + "epoch": 1.3343067756294011, + "grad_norm": 0.6331418752670288, + "learning_rate": 5.1722775778125815e-06, + "loss": 0.2263, + "step": 14164 + }, + { + "epoch": 1.3344009797225689, + "grad_norm": 0.6571514010429382, + "learning_rate": 5.17095524062919e-06, + "loss": 0.1897, + "step": 14165 + }, + { + "epoch": 1.3344951838157368, + "grad_norm": 0.6402251720428467, + "learning_rate": 5.169633013555487e-06, + "loss": 0.1998, + "step": 14166 + }, + { + "epoch": 1.3345893879089046, + "grad_norm": 0.7158523797988892, + "learning_rate": 5.168310896621626e-06, + "loss": 0.2141, + "step": 14167 + }, + { + "epoch": 1.3346835920020725, + "grad_norm": 0.5873801112174988, + "learning_rate": 5.166988889857745e-06, + "loss": 0.2081, + "step": 14168 + }, + { + "epoch": 1.3347777960952403, + "grad_norm": 0.7097365856170654, + "learning_rate": 5.165666993293992e-06, + "loss": 0.2091, + "step": 14169 + }, + { + "epoch": 1.3348720001884082, + "grad_norm": 0.873871922492981, + "learning_rate": 5.1643452069605105e-06, + "loss": 0.1907, + "step": 14170 + }, + { + "epoch": 1.334966204281576, + "grad_norm": 0.7245442867279053, + "learning_rate": 5.163023530887435e-06, + "loss": 0.2258, + "step": 14171 + }, + { + "epoch": 1.335060408374744, + "grad_norm": 0.6157423853874207, + "learning_rate": 5.161701965104906e-06, + "loss": 0.1792, + "step": 14172 + }, + { + "epoch": 1.3351546124679117, + "grad_norm": 0.656654953956604, + "learning_rate": 5.1603805096430506e-06, + "loss": 0.206, + "step": 14173 + }, + { + "epoch": 1.3352488165610796, + "grad_norm": 0.6471630334854126, + "learning_rate": 5.159059164532005e-06, + "loss": 0.1657, + "step": 14174 + }, + { + "epoch": 1.3353430206542474, + "grad_norm": 0.6502017378807068, + "learning_rate": 5.1577379298019e-06, + "loss": 0.1684, + "step": 14175 + }, + { + "epoch": 1.3354372247474153, + "grad_norm": 0.6079532504081726, + "learning_rate": 5.1564168054828555e-06, + "loss": 0.1905, + "step": 14176 + }, + { + "epoch": 1.335531428840583, + "grad_norm": 0.692517876625061, + "learning_rate": 5.155095791605e-06, + "loss": 0.2175, + "step": 14177 + }, + { + "epoch": 1.335625632933751, + "grad_norm": 0.6979208588600159, + "learning_rate": 5.153774888198455e-06, + "loss": 0.1987, + "step": 14178 + }, + { + "epoch": 1.3357198370269188, + "grad_norm": 0.5712969303131104, + "learning_rate": 5.1524540952933345e-06, + "loss": 0.173, + "step": 14179 + }, + { + "epoch": 1.3358140411200867, + "grad_norm": 0.6468443274497986, + "learning_rate": 5.151133412919758e-06, + "loss": 0.2328, + "step": 14180 + }, + { + "epoch": 1.3359082452132545, + "grad_norm": 0.6241378784179688, + "learning_rate": 5.149812841107842e-06, + "loss": 0.2098, + "step": 14181 + }, + { + "epoch": 1.3360024493064224, + "grad_norm": 0.5934635996818542, + "learning_rate": 5.1484923798876926e-06, + "loss": 0.1844, + "step": 14182 + }, + { + "epoch": 1.3360966533995902, + "grad_norm": 0.6325280070304871, + "learning_rate": 5.147172029289415e-06, + "loss": 0.1992, + "step": 14183 + }, + { + "epoch": 1.3361908574927581, + "grad_norm": 0.6002156138420105, + "learning_rate": 5.145851789343126e-06, + "loss": 0.177, + "step": 14184 + }, + { + "epoch": 1.3362850615859259, + "grad_norm": 0.6655933856964111, + "learning_rate": 5.1445316600789245e-06, + "loss": 0.1864, + "step": 14185 + }, + { + "epoch": 1.3363792656790938, + "grad_norm": 0.6828367710113525, + "learning_rate": 5.1432116415269014e-06, + "loss": 0.1963, + "step": 14186 + }, + { + "epoch": 1.3364734697722616, + "grad_norm": 0.7178785800933838, + "learning_rate": 5.141891733717173e-06, + "loss": 0.1999, + "step": 14187 + }, + { + "epoch": 1.3365676738654295, + "grad_norm": 0.6871474385261536, + "learning_rate": 5.140571936679825e-06, + "loss": 0.2338, + "step": 14188 + }, + { + "epoch": 1.3366618779585973, + "grad_norm": 0.6616499423980713, + "learning_rate": 5.139252250444948e-06, + "loss": 0.2309, + "step": 14189 + }, + { + "epoch": 1.3367560820517652, + "grad_norm": 0.6596075892448425, + "learning_rate": 5.137932675042638e-06, + "loss": 0.1888, + "step": 14190 + }, + { + "epoch": 1.336850286144933, + "grad_norm": 0.6166261434555054, + "learning_rate": 5.136613210502986e-06, + "loss": 0.1731, + "step": 14191 + }, + { + "epoch": 1.336944490238101, + "grad_norm": 0.7741532921791077, + "learning_rate": 5.13529385685607e-06, + "loss": 0.1808, + "step": 14192 + }, + { + "epoch": 1.3370386943312687, + "grad_norm": 0.6858149766921997, + "learning_rate": 5.133974614131978e-06, + "loss": 0.2063, + "step": 14193 + }, + { + "epoch": 1.3371328984244366, + "grad_norm": 0.9508662223815918, + "learning_rate": 5.132655482360795e-06, + "loss": 0.1987, + "step": 14194 + }, + { + "epoch": 1.3372271025176043, + "grad_norm": 0.6633398532867432, + "learning_rate": 5.1313364615725895e-06, + "loss": 0.217, + "step": 14195 + }, + { + "epoch": 1.3373213066107723, + "grad_norm": 0.6646075248718262, + "learning_rate": 5.1300175517974415e-06, + "loss": 0.195, + "step": 14196 + }, + { + "epoch": 1.33741551070394, + "grad_norm": 0.6978083252906799, + "learning_rate": 5.128698753065429e-06, + "loss": 0.226, + "step": 14197 + }, + { + "epoch": 1.337509714797108, + "grad_norm": 0.5929834842681885, + "learning_rate": 5.127380065406615e-06, + "loss": 0.2008, + "step": 14198 + }, + { + "epoch": 1.3376039188902757, + "grad_norm": 0.6304779052734375, + "learning_rate": 5.126061488851072e-06, + "loss": 0.1965, + "step": 14199 + }, + { + "epoch": 1.3376981229834437, + "grad_norm": 0.6507366895675659, + "learning_rate": 5.124743023428867e-06, + "loss": 0.1939, + "step": 14200 + }, + { + "epoch": 1.3377923270766114, + "grad_norm": 0.6634584665298462, + "learning_rate": 5.123424669170058e-06, + "loss": 0.201, + "step": 14201 + }, + { + "epoch": 1.3378865311697794, + "grad_norm": 0.7605651617050171, + "learning_rate": 5.122106426104713e-06, + "loss": 0.2121, + "step": 14202 + }, + { + "epoch": 1.3379807352629471, + "grad_norm": 0.8019225597381592, + "learning_rate": 5.1207882942628795e-06, + "loss": 0.2436, + "step": 14203 + }, + { + "epoch": 1.338074939356115, + "grad_norm": 0.6724850535392761, + "learning_rate": 5.1194702736746235e-06, + "loss": 0.2057, + "step": 14204 + }, + { + "epoch": 1.3381691434492828, + "grad_norm": 0.6152219772338867, + "learning_rate": 5.118152364369991e-06, + "loss": 0.1956, + "step": 14205 + }, + { + "epoch": 1.3382633475424508, + "grad_norm": 0.6260862350463867, + "learning_rate": 5.116834566379032e-06, + "loss": 0.1862, + "step": 14206 + }, + { + "epoch": 1.3383575516356185, + "grad_norm": 0.617588222026825, + "learning_rate": 5.115516879731801e-06, + "loss": 0.1997, + "step": 14207 + }, + { + "epoch": 1.3384517557287865, + "grad_norm": 0.7096170783042908, + "learning_rate": 5.114199304458336e-06, + "loss": 0.2151, + "step": 14208 + }, + { + "epoch": 1.3385459598219542, + "grad_norm": 0.6094770431518555, + "learning_rate": 5.112881840588683e-06, + "loss": 0.2004, + "step": 14209 + }, + { + "epoch": 1.3386401639151222, + "grad_norm": 0.6757852435112, + "learning_rate": 5.111564488152885e-06, + "loss": 0.2301, + "step": 14210 + }, + { + "epoch": 1.33873436800829, + "grad_norm": 0.8722229599952698, + "learning_rate": 5.110247247180974e-06, + "loss": 0.2083, + "step": 14211 + }, + { + "epoch": 1.3388285721014577, + "grad_norm": 0.6497089862823486, + "learning_rate": 5.108930117702992e-06, + "loss": 0.2004, + "step": 14212 + }, + { + "epoch": 1.3389227761946256, + "grad_norm": 0.6538980007171631, + "learning_rate": 5.107613099748963e-06, + "loss": 0.2081, + "step": 14213 + }, + { + "epoch": 1.3390169802877936, + "grad_norm": 0.8271617293357849, + "learning_rate": 5.106296193348921e-06, + "loss": 0.2366, + "step": 14214 + }, + { + "epoch": 1.3391111843809613, + "grad_norm": 0.593307375907898, + "learning_rate": 5.104979398532899e-06, + "loss": 0.1701, + "step": 14215 + }, + { + "epoch": 1.339205388474129, + "grad_norm": 0.6513305902481079, + "learning_rate": 5.103662715330912e-06, + "loss": 0.2111, + "step": 14216 + }, + { + "epoch": 1.339299592567297, + "grad_norm": 0.6432667970657349, + "learning_rate": 5.102346143772988e-06, + "loss": 0.1873, + "step": 14217 + }, + { + "epoch": 1.339393796660465, + "grad_norm": 0.6495787501335144, + "learning_rate": 5.10102968388915e-06, + "loss": 0.2132, + "step": 14218 + }, + { + "epoch": 1.3394880007536327, + "grad_norm": 0.6786971688270569, + "learning_rate": 5.099713335709409e-06, + "loss": 0.1986, + "step": 14219 + }, + { + "epoch": 1.3395822048468005, + "grad_norm": 0.572080671787262, + "learning_rate": 5.098397099263781e-06, + "loss": 0.19, + "step": 14220 + }, + { + "epoch": 1.3396764089399684, + "grad_norm": 0.8995034098625183, + "learning_rate": 5.097080974582283e-06, + "loss": 0.2059, + "step": 14221 + }, + { + "epoch": 1.3397706130331364, + "grad_norm": 0.7047062516212463, + "learning_rate": 5.095764961694923e-06, + "loss": 0.2367, + "step": 14222 + }, + { + "epoch": 1.3398648171263041, + "grad_norm": 0.6372144818305969, + "learning_rate": 5.094449060631697e-06, + "loss": 0.2181, + "step": 14223 + }, + { + "epoch": 1.3399590212194719, + "grad_norm": 0.6339496374130249, + "learning_rate": 5.093133271422628e-06, + "loss": 0.1862, + "step": 14224 + }, + { + "epoch": 1.3400532253126398, + "grad_norm": 0.7703357338905334, + "learning_rate": 5.091817594097708e-06, + "loss": 0.2211, + "step": 14225 + }, + { + "epoch": 1.3401474294058078, + "grad_norm": 0.66709965467453, + "learning_rate": 5.09050202868693e-06, + "loss": 0.206, + "step": 14226 + }, + { + "epoch": 1.3402416334989755, + "grad_norm": 0.6412050724029541, + "learning_rate": 5.089186575220307e-06, + "loss": 0.1962, + "step": 14227 + }, + { + "epoch": 1.3403358375921433, + "grad_norm": 0.665880560874939, + "learning_rate": 5.087871233727824e-06, + "loss": 0.2207, + "step": 14228 + }, + { + "epoch": 1.3404300416853112, + "grad_norm": 0.6656131148338318, + "learning_rate": 5.086556004239471e-06, + "loss": 0.211, + "step": 14229 + }, + { + "epoch": 1.3405242457784792, + "grad_norm": 0.6042746305465698, + "learning_rate": 5.085240886785239e-06, + "loss": 0.2029, + "step": 14230 + }, + { + "epoch": 1.340618449871647, + "grad_norm": 0.6668102741241455, + "learning_rate": 5.083925881395119e-06, + "loss": 0.1974, + "step": 14231 + }, + { + "epoch": 1.3407126539648146, + "grad_norm": 0.6841421127319336, + "learning_rate": 5.082610988099088e-06, + "loss": 0.2134, + "step": 14232 + }, + { + "epoch": 1.3408068580579826, + "grad_norm": 0.6074081659317017, + "learning_rate": 5.081296206927132e-06, + "loss": 0.2043, + "step": 14233 + }, + { + "epoch": 1.3409010621511506, + "grad_norm": 0.6494576334953308, + "learning_rate": 5.079981537909233e-06, + "loss": 0.1955, + "step": 14234 + }, + { + "epoch": 1.3409952662443183, + "grad_norm": 0.6407979130744934, + "learning_rate": 5.078666981075359e-06, + "loss": 0.172, + "step": 14235 + }, + { + "epoch": 1.341089470337486, + "grad_norm": 0.6389541625976562, + "learning_rate": 5.077352536455491e-06, + "loss": 0.1847, + "step": 14236 + }, + { + "epoch": 1.341183674430654, + "grad_norm": 0.6453443169593811, + "learning_rate": 5.076038204079601e-06, + "loss": 0.2112, + "step": 14237 + }, + { + "epoch": 1.341277878523822, + "grad_norm": 0.7297416925430298, + "learning_rate": 5.07472398397765e-06, + "loss": 0.2461, + "step": 14238 + }, + { + "epoch": 1.3413720826169897, + "grad_norm": 0.6424687504768372, + "learning_rate": 5.07340987617961e-06, + "loss": 0.181, + "step": 14239 + }, + { + "epoch": 1.3414662867101574, + "grad_norm": 0.6002586483955383, + "learning_rate": 5.0720958807154485e-06, + "loss": 0.1755, + "step": 14240 + }, + { + "epoch": 1.3415604908033254, + "grad_norm": 0.6982072591781616, + "learning_rate": 5.0707819976151175e-06, + "loss": 0.1645, + "step": 14241 + }, + { + "epoch": 1.3416546948964934, + "grad_norm": 0.7341618537902832, + "learning_rate": 5.069468226908585e-06, + "loss": 0.2407, + "step": 14242 + }, + { + "epoch": 1.341748898989661, + "grad_norm": 0.797859787940979, + "learning_rate": 5.068154568625797e-06, + "loss": 0.1932, + "step": 14243 + }, + { + "epoch": 1.3418431030828288, + "grad_norm": 0.661933958530426, + "learning_rate": 5.066841022796716e-06, + "loss": 0.2066, + "step": 14244 + }, + { + "epoch": 1.3419373071759968, + "grad_norm": 0.614425778388977, + "learning_rate": 5.065527589451286e-06, + "loss": 0.2158, + "step": 14245 + }, + { + "epoch": 1.3420315112691648, + "grad_norm": 0.6313828229904175, + "learning_rate": 5.064214268619458e-06, + "loss": 0.1832, + "step": 14246 + }, + { + "epoch": 1.3421257153623325, + "grad_norm": 0.652474582195282, + "learning_rate": 5.062901060331181e-06, + "loss": 0.2091, + "step": 14247 + }, + { + "epoch": 1.3422199194555002, + "grad_norm": 0.6407245993614197, + "learning_rate": 5.0615879646163926e-06, + "loss": 0.2062, + "step": 14248 + }, + { + "epoch": 1.3423141235486682, + "grad_norm": 0.7015902400016785, + "learning_rate": 5.060274981505035e-06, + "loss": 0.1906, + "step": 14249 + }, + { + "epoch": 1.3424083276418362, + "grad_norm": 0.7468515634536743, + "learning_rate": 5.058962111027051e-06, + "loss": 0.2116, + "step": 14250 + }, + { + "epoch": 1.342502531735004, + "grad_norm": 0.6226329207420349, + "learning_rate": 5.057649353212371e-06, + "loss": 0.2064, + "step": 14251 + }, + { + "epoch": 1.3425967358281716, + "grad_norm": 0.6461039781570435, + "learning_rate": 5.05633670809093e-06, + "loss": 0.2217, + "step": 14252 + }, + { + "epoch": 1.3426909399213396, + "grad_norm": 0.6401330232620239, + "learning_rate": 5.055024175692655e-06, + "loss": 0.2199, + "step": 14253 + }, + { + "epoch": 1.3427851440145075, + "grad_norm": 0.6089904308319092, + "learning_rate": 5.053711756047476e-06, + "loss": 0.1928, + "step": 14254 + }, + { + "epoch": 1.3428793481076753, + "grad_norm": 0.6451438069343567, + "learning_rate": 5.0523994491853225e-06, + "loss": 0.1826, + "step": 14255 + }, + { + "epoch": 1.342973552200843, + "grad_norm": 0.6480076313018799, + "learning_rate": 5.051087255136109e-06, + "loss": 0.2015, + "step": 14256 + }, + { + "epoch": 1.343067756294011, + "grad_norm": 0.6460951566696167, + "learning_rate": 5.049775173929761e-06, + "loss": 0.1955, + "step": 14257 + }, + { + "epoch": 1.343161960387179, + "grad_norm": 0.6903976798057556, + "learning_rate": 5.048463205596197e-06, + "loss": 0.2334, + "step": 14258 + }, + { + "epoch": 1.3432561644803467, + "grad_norm": 0.6762567758560181, + "learning_rate": 5.047151350165327e-06, + "loss": 0.2068, + "step": 14259 + }, + { + "epoch": 1.3433503685735144, + "grad_norm": 0.6496309638023376, + "learning_rate": 5.045839607667065e-06, + "loss": 0.1789, + "step": 14260 + }, + { + "epoch": 1.3434445726666824, + "grad_norm": 0.6982924342155457, + "learning_rate": 5.044527978131326e-06, + "loss": 0.2129, + "step": 14261 + }, + { + "epoch": 1.3435387767598503, + "grad_norm": 0.7475513219833374, + "learning_rate": 5.043216461588012e-06, + "loss": 0.2316, + "step": 14262 + }, + { + "epoch": 1.343632980853018, + "grad_norm": 0.6481485962867737, + "learning_rate": 5.041905058067023e-06, + "loss": 0.2212, + "step": 14263 + }, + { + "epoch": 1.3437271849461858, + "grad_norm": 0.630750298500061, + "learning_rate": 5.040593767598272e-06, + "loss": 0.1975, + "step": 14264 + }, + { + "epoch": 1.3438213890393538, + "grad_norm": 0.683118999004364, + "learning_rate": 5.039282590211654e-06, + "loss": 0.2035, + "step": 14265 + }, + { + "epoch": 1.3439155931325217, + "grad_norm": 0.6639612317085266, + "learning_rate": 5.037971525937056e-06, + "loss": 0.2073, + "step": 14266 + }, + { + "epoch": 1.3440097972256895, + "grad_norm": 0.619592547416687, + "learning_rate": 5.036660574804391e-06, + "loss": 0.1966, + "step": 14267 + }, + { + "epoch": 1.3441040013188572, + "grad_norm": 0.622658371925354, + "learning_rate": 5.035349736843539e-06, + "loss": 0.2005, + "step": 14268 + }, + { + "epoch": 1.3441982054120252, + "grad_norm": 0.66368168592453, + "learning_rate": 5.034039012084388e-06, + "loss": 0.2344, + "step": 14269 + }, + { + "epoch": 1.3442924095051931, + "grad_norm": 0.6013482213020325, + "learning_rate": 5.032728400556827e-06, + "loss": 0.1912, + "step": 14270 + }, + { + "epoch": 1.3443866135983609, + "grad_norm": 0.6582673192024231, + "learning_rate": 5.031417902290745e-06, + "loss": 0.1966, + "step": 14271 + }, + { + "epoch": 1.3444808176915286, + "grad_norm": 0.6961972713470459, + "learning_rate": 5.030107517316014e-06, + "loss": 0.2309, + "step": 14272 + }, + { + "epoch": 1.3445750217846966, + "grad_norm": 0.6419699192047119, + "learning_rate": 5.028797245662518e-06, + "loss": 0.198, + "step": 14273 + }, + { + "epoch": 1.3446692258778645, + "grad_norm": 0.6680853366851807, + "learning_rate": 5.027487087360138e-06, + "loss": 0.1949, + "step": 14274 + }, + { + "epoch": 1.3447634299710323, + "grad_norm": 0.702159583568573, + "learning_rate": 5.026177042438738e-06, + "loss": 0.2036, + "step": 14275 + }, + { + "epoch": 1.3448576340642, + "grad_norm": 0.6947282552719116, + "learning_rate": 5.024867110928193e-06, + "loss": 0.2071, + "step": 14276 + }, + { + "epoch": 1.344951838157368, + "grad_norm": 0.6485502123832703, + "learning_rate": 5.0235572928583766e-06, + "loss": 0.1854, + "step": 14277 + }, + { + "epoch": 1.345046042250536, + "grad_norm": 0.5859835147857666, + "learning_rate": 5.022247588259146e-06, + "loss": 0.1833, + "step": 14278 + }, + { + "epoch": 1.3451402463437037, + "grad_norm": 0.6901947855949402, + "learning_rate": 5.020937997160369e-06, + "loss": 0.2525, + "step": 14279 + }, + { + "epoch": 1.3452344504368714, + "grad_norm": 0.6361029744148254, + "learning_rate": 5.019628519591908e-06, + "loss": 0.2009, + "step": 14280 + }, + { + "epoch": 1.3453286545300394, + "grad_norm": 0.7549346685409546, + "learning_rate": 5.018319155583621e-06, + "loss": 0.2108, + "step": 14281 + }, + { + "epoch": 1.345422858623207, + "grad_norm": 0.5660572648048401, + "learning_rate": 5.017009905165357e-06, + "loss": 0.1904, + "step": 14282 + }, + { + "epoch": 1.345517062716375, + "grad_norm": 0.710254967212677, + "learning_rate": 5.015700768366973e-06, + "loss": 0.2061, + "step": 14283 + }, + { + "epoch": 1.3456112668095428, + "grad_norm": 0.731178879737854, + "learning_rate": 5.014391745218325e-06, + "loss": 0.2061, + "step": 14284 + }, + { + "epoch": 1.3457054709027108, + "grad_norm": 0.7249963879585266, + "learning_rate": 5.013082835749252e-06, + "loss": 0.2092, + "step": 14285 + }, + { + "epoch": 1.3457996749958785, + "grad_norm": 0.636004626750946, + "learning_rate": 5.011774039989601e-06, + "loss": 0.2026, + "step": 14286 + }, + { + "epoch": 1.3458938790890465, + "grad_norm": 0.670627236366272, + "learning_rate": 5.010465357969221e-06, + "loss": 0.235, + "step": 14287 + }, + { + "epoch": 1.3459880831822142, + "grad_norm": 0.6595027446746826, + "learning_rate": 5.0091567897179435e-06, + "loss": 0.1939, + "step": 14288 + }, + { + "epoch": 1.3460822872753821, + "grad_norm": 0.8129286170005798, + "learning_rate": 5.00784833526561e-06, + "loss": 0.2327, + "step": 14289 + }, + { + "epoch": 1.3461764913685499, + "grad_norm": 0.6742135882377625, + "learning_rate": 5.006539994642059e-06, + "loss": 0.2083, + "step": 14290 + }, + { + "epoch": 1.3462706954617178, + "grad_norm": 0.6889356374740601, + "learning_rate": 5.005231767877114e-06, + "loss": 0.2256, + "step": 14291 + }, + { + "epoch": 1.3463648995548856, + "grad_norm": 0.5851003527641296, + "learning_rate": 5.003923655000613e-06, + "loss": 0.1693, + "step": 14292 + }, + { + "epoch": 1.3464591036480535, + "grad_norm": 0.6751152873039246, + "learning_rate": 5.002615656042376e-06, + "loss": 0.2319, + "step": 14293 + }, + { + "epoch": 1.3465533077412213, + "grad_norm": 0.7556543350219727, + "learning_rate": 5.001307771032231e-06, + "loss": 0.2017, + "step": 14294 + }, + { + "epoch": 1.3466475118343892, + "grad_norm": 0.6427310109138489, + "learning_rate": 5.000000000000003e-06, + "loss": 0.2051, + "step": 14295 + }, + { + "epoch": 1.346741715927557, + "grad_norm": 0.7643883228302002, + "learning_rate": 4.998692342975503e-06, + "loss": 0.2072, + "step": 14296 + }, + { + "epoch": 1.346835920020725, + "grad_norm": 0.7492932677268982, + "learning_rate": 4.997384799988553e-06, + "loss": 0.2097, + "step": 14297 + }, + { + "epoch": 1.3469301241138927, + "grad_norm": 0.678108811378479, + "learning_rate": 4.996077371068969e-06, + "loss": 0.2083, + "step": 14298 + }, + { + "epoch": 1.3470243282070606, + "grad_norm": 0.9397788643836975, + "learning_rate": 4.9947700562465576e-06, + "loss": 0.1968, + "step": 14299 + }, + { + "epoch": 1.3471185323002284, + "grad_norm": 0.708877444267273, + "learning_rate": 4.993462855551129e-06, + "loss": 0.2006, + "step": 14300 + }, + { + "epoch": 1.3472127363933963, + "grad_norm": 0.6410995125770569, + "learning_rate": 4.992155769012493e-06, + "loss": 0.2128, + "step": 14301 + }, + { + "epoch": 1.347306940486564, + "grad_norm": 0.7376231551170349, + "learning_rate": 4.990848796660451e-06, + "loss": 0.2444, + "step": 14302 + }, + { + "epoch": 1.347401144579732, + "grad_norm": 0.6626389622688293, + "learning_rate": 4.989541938524796e-06, + "loss": 0.1856, + "step": 14303 + }, + { + "epoch": 1.3474953486728998, + "grad_norm": 0.6304222941398621, + "learning_rate": 4.98823519463534e-06, + "loss": 0.1866, + "step": 14304 + }, + { + "epoch": 1.3475895527660677, + "grad_norm": 0.5900949835777283, + "learning_rate": 4.986928565021874e-06, + "loss": 0.2034, + "step": 14305 + }, + { + "epoch": 1.3476837568592355, + "grad_norm": 0.6261608600616455, + "learning_rate": 4.985622049714185e-06, + "loss": 0.2052, + "step": 14306 + }, + { + "epoch": 1.3477779609524034, + "grad_norm": 0.6565543413162231, + "learning_rate": 4.984315648742068e-06, + "loss": 0.1795, + "step": 14307 + }, + { + "epoch": 1.3478721650455712, + "grad_norm": 0.7810153961181641, + "learning_rate": 4.983009362135315e-06, + "loss": 0.2179, + "step": 14308 + }, + { + "epoch": 1.3479663691387391, + "grad_norm": 0.6561579704284668, + "learning_rate": 4.9817031899237035e-06, + "loss": 0.2061, + "step": 14309 + }, + { + "epoch": 1.3480605732319069, + "grad_norm": 0.6692266464233398, + "learning_rate": 4.98039713213702e-06, + "loss": 0.2063, + "step": 14310 + }, + { + "epoch": 1.3481547773250748, + "grad_norm": 0.6777259111404419, + "learning_rate": 4.97909118880505e-06, + "loss": 0.2199, + "step": 14311 + }, + { + "epoch": 1.3482489814182426, + "grad_norm": 0.6281754374504089, + "learning_rate": 4.97778535995756e-06, + "loss": 0.2021, + "step": 14312 + }, + { + "epoch": 1.3483431855114105, + "grad_norm": 0.6651512980461121, + "learning_rate": 4.976479645624332e-06, + "loss": 0.2239, + "step": 14313 + }, + { + "epoch": 1.3484373896045783, + "grad_norm": 0.6048790812492371, + "learning_rate": 4.975174045835141e-06, + "loss": 0.1762, + "step": 14314 + }, + { + "epoch": 1.3485315936977462, + "grad_norm": 0.6577955484390259, + "learning_rate": 4.973868560619749e-06, + "loss": 0.1938, + "step": 14315 + }, + { + "epoch": 1.348625797790914, + "grad_norm": 0.6613869667053223, + "learning_rate": 4.972563190007927e-06, + "loss": 0.2085, + "step": 14316 + }, + { + "epoch": 1.348720001884082, + "grad_norm": 0.774599552154541, + "learning_rate": 4.971257934029442e-06, + "loss": 0.2079, + "step": 14317 + }, + { + "epoch": 1.3488142059772497, + "grad_norm": 0.6561988592147827, + "learning_rate": 4.96995279271405e-06, + "loss": 0.2109, + "step": 14318 + }, + { + "epoch": 1.3489084100704176, + "grad_norm": 0.6106441020965576, + "learning_rate": 4.968647766091514e-06, + "loss": 0.1914, + "step": 14319 + }, + { + "epoch": 1.3490026141635854, + "grad_norm": 0.6479611992835999, + "learning_rate": 4.9673428541915934e-06, + "loss": 0.1861, + "step": 14320 + }, + { + "epoch": 1.3490968182567533, + "grad_norm": 0.6346365213394165, + "learning_rate": 4.966038057044039e-06, + "loss": 0.1925, + "step": 14321 + }, + { + "epoch": 1.349191022349921, + "grad_norm": 0.6005086898803711, + "learning_rate": 4.964733374678599e-06, + "loss": 0.1784, + "step": 14322 + }, + { + "epoch": 1.349285226443089, + "grad_norm": 0.6776334643363953, + "learning_rate": 4.963428807125024e-06, + "loss": 0.2008, + "step": 14323 + }, + { + "epoch": 1.3493794305362568, + "grad_norm": 0.6856703162193298, + "learning_rate": 4.962124354413066e-06, + "loss": 0.2233, + "step": 14324 + }, + { + "epoch": 1.3494736346294247, + "grad_norm": 0.6597799062728882, + "learning_rate": 4.960820016572459e-06, + "loss": 0.2146, + "step": 14325 + }, + { + "epoch": 1.3495678387225924, + "grad_norm": 0.6462965607643127, + "learning_rate": 4.959515793632951e-06, + "loss": 0.2043, + "step": 14326 + }, + { + "epoch": 1.3496620428157604, + "grad_norm": 0.7108558416366577, + "learning_rate": 4.95821168562428e-06, + "loss": 0.2151, + "step": 14327 + }, + { + "epoch": 1.3497562469089281, + "grad_norm": 0.6805381774902344, + "learning_rate": 4.9569076925761775e-06, + "loss": 0.1905, + "step": 14328 + }, + { + "epoch": 1.349850451002096, + "grad_norm": 0.6004214882850647, + "learning_rate": 4.955603814518378e-06, + "loss": 0.1929, + "step": 14329 + }, + { + "epoch": 1.3499446550952638, + "grad_norm": 0.6933792233467102, + "learning_rate": 4.954300051480617e-06, + "loss": 0.2059, + "step": 14330 + }, + { + "epoch": 1.3500388591884318, + "grad_norm": 0.6043687462806702, + "learning_rate": 4.952996403492614e-06, + "loss": 0.1781, + "step": 14331 + }, + { + "epoch": 1.3501330632815995, + "grad_norm": 0.6481543183326721, + "learning_rate": 4.951692870584103e-06, + "loss": 0.1779, + "step": 14332 + }, + { + "epoch": 1.3502272673747675, + "grad_norm": 0.7032328248023987, + "learning_rate": 4.950389452784796e-06, + "loss": 0.2296, + "step": 14333 + }, + { + "epoch": 1.3503214714679352, + "grad_norm": 0.6082679033279419, + "learning_rate": 4.949086150124421e-06, + "loss": 0.1965, + "step": 14334 + }, + { + "epoch": 1.3504156755611032, + "grad_norm": 0.639819860458374, + "learning_rate": 4.9477829626326965e-06, + "loss": 0.2027, + "step": 14335 + }, + { + "epoch": 1.350509879654271, + "grad_norm": 0.6563467979431152, + "learning_rate": 4.9464798903393295e-06, + "loss": 0.224, + "step": 14336 + }, + { + "epoch": 1.350604083747439, + "grad_norm": 0.6432325839996338, + "learning_rate": 4.945176933274036e-06, + "loss": 0.1891, + "step": 14337 + }, + { + "epoch": 1.3506982878406066, + "grad_norm": 0.5720778107643127, + "learning_rate": 4.943874091466531e-06, + "loss": 0.2023, + "step": 14338 + }, + { + "epoch": 1.3507924919337746, + "grad_norm": 0.6740782260894775, + "learning_rate": 4.942571364946511e-06, + "loss": 0.1683, + "step": 14339 + }, + { + "epoch": 1.3508866960269423, + "grad_norm": 0.6470614671707153, + "learning_rate": 4.9412687537436865e-06, + "loss": 0.2147, + "step": 14340 + }, + { + "epoch": 1.3509809001201103, + "grad_norm": 0.5793033242225647, + "learning_rate": 4.939966257887762e-06, + "loss": 0.1833, + "step": 14341 + }, + { + "epoch": 1.351075104213278, + "grad_norm": 0.6842806339263916, + "learning_rate": 4.938663877408432e-06, + "loss": 0.1908, + "step": 14342 + }, + { + "epoch": 1.351169308306446, + "grad_norm": 0.6015111207962036, + "learning_rate": 4.937361612335384e-06, + "loss": 0.1764, + "step": 14343 + }, + { + "epoch": 1.3512635123996137, + "grad_norm": 0.5633746981620789, + "learning_rate": 4.936059462698329e-06, + "loss": 0.1553, + "step": 14344 + }, + { + "epoch": 1.3513577164927817, + "grad_norm": 0.6662188768386841, + "learning_rate": 4.934757428526951e-06, + "loss": 0.2186, + "step": 14345 + }, + { + "epoch": 1.3514519205859494, + "grad_norm": 0.6492788791656494, + "learning_rate": 4.933455509850933e-06, + "loss": 0.2015, + "step": 14346 + }, + { + "epoch": 1.3515461246791174, + "grad_norm": 0.607024610042572, + "learning_rate": 4.932153706699964e-06, + "loss": 0.2103, + "step": 14347 + }, + { + "epoch": 1.3516403287722851, + "grad_norm": 0.6619748473167419, + "learning_rate": 4.930852019103732e-06, + "loss": 0.2104, + "step": 14348 + }, + { + "epoch": 1.351734532865453, + "grad_norm": 0.7663852572441101, + "learning_rate": 4.929550447091911e-06, + "loss": 0.2038, + "step": 14349 + }, + { + "epoch": 1.3518287369586208, + "grad_norm": 0.6927057504653931, + "learning_rate": 4.92824899069418e-06, + "loss": 0.1969, + "step": 14350 + }, + { + "epoch": 1.3519229410517886, + "grad_norm": 0.6270477771759033, + "learning_rate": 4.92694764994022e-06, + "loss": 0.1893, + "step": 14351 + }, + { + "epoch": 1.3520171451449565, + "grad_norm": 0.6877565979957581, + "learning_rate": 4.925646424859696e-06, + "loss": 0.2246, + "step": 14352 + }, + { + "epoch": 1.3521113492381245, + "grad_norm": 0.6178605556488037, + "learning_rate": 4.92434531548228e-06, + "loss": 0.2082, + "step": 14353 + }, + { + "epoch": 1.3522055533312922, + "grad_norm": 0.6962509155273438, + "learning_rate": 4.923044321837645e-06, + "loss": 0.227, + "step": 14354 + }, + { + "epoch": 1.35229975742446, + "grad_norm": 0.7804440855979919, + "learning_rate": 4.921743443955447e-06, + "loss": 0.2344, + "step": 14355 + }, + { + "epoch": 1.352393961517628, + "grad_norm": 0.6728218793869019, + "learning_rate": 4.920442681865351e-06, + "loss": 0.223, + "step": 14356 + }, + { + "epoch": 1.3524881656107959, + "grad_norm": 0.6981995701789856, + "learning_rate": 4.9191420355970246e-06, + "loss": 0.2027, + "step": 14357 + }, + { + "epoch": 1.3525823697039636, + "grad_norm": 0.6677607297897339, + "learning_rate": 4.917841505180115e-06, + "loss": 0.2086, + "step": 14358 + }, + { + "epoch": 1.3526765737971314, + "grad_norm": 0.7647161483764648, + "learning_rate": 4.916541090644271e-06, + "loss": 0.1902, + "step": 14359 + }, + { + "epoch": 1.3527707778902993, + "grad_norm": 0.736425518989563, + "learning_rate": 4.9152407920191604e-06, + "loss": 0.1852, + "step": 14360 + }, + { + "epoch": 1.3528649819834673, + "grad_norm": 0.7363700866699219, + "learning_rate": 4.913940609334423e-06, + "loss": 0.1981, + "step": 14361 + }, + { + "epoch": 1.352959186076635, + "grad_norm": 0.6129119992256165, + "learning_rate": 4.912640542619702e-06, + "loss": 0.184, + "step": 14362 + }, + { + "epoch": 1.3530533901698027, + "grad_norm": 0.6737158894538879, + "learning_rate": 4.911340591904644e-06, + "loss": 0.2427, + "step": 14363 + }, + { + "epoch": 1.3531475942629707, + "grad_norm": 0.636288046836853, + "learning_rate": 4.910040757218894e-06, + "loss": 0.1717, + "step": 14364 + }, + { + "epoch": 1.3532417983561387, + "grad_norm": 0.6598813533782959, + "learning_rate": 4.908741038592084e-06, + "loss": 0.1993, + "step": 14365 + }, + { + "epoch": 1.3533360024493064, + "grad_norm": 0.6319726705551147, + "learning_rate": 4.907441436053852e-06, + "loss": 0.1955, + "step": 14366 + }, + { + "epoch": 1.3534302065424741, + "grad_norm": 0.7078801393508911, + "learning_rate": 4.906141949633832e-06, + "loss": 0.1933, + "step": 14367 + }, + { + "epoch": 1.353524410635642, + "grad_norm": 0.706860363483429, + "learning_rate": 4.904842579361653e-06, + "loss": 0.208, + "step": 14368 + }, + { + "epoch": 1.35361861472881, + "grad_norm": 0.6156027913093567, + "learning_rate": 4.903543325266941e-06, + "loss": 0.1861, + "step": 14369 + }, + { + "epoch": 1.3537128188219778, + "grad_norm": 0.6501897573471069, + "learning_rate": 4.902244187379327e-06, + "loss": 0.203, + "step": 14370 + }, + { + "epoch": 1.3538070229151455, + "grad_norm": 0.6195113658905029, + "learning_rate": 4.900945165728426e-06, + "loss": 0.176, + "step": 14371 + }, + { + "epoch": 1.3539012270083135, + "grad_norm": 0.7405468821525574, + "learning_rate": 4.899646260343866e-06, + "loss": 0.2067, + "step": 14372 + }, + { + "epoch": 1.3539954311014815, + "grad_norm": 0.9037397503852844, + "learning_rate": 4.898347471255253e-06, + "loss": 0.2006, + "step": 14373 + }, + { + "epoch": 1.3540896351946492, + "grad_norm": 0.6409209966659546, + "learning_rate": 4.897048798492209e-06, + "loss": 0.217, + "step": 14374 + }, + { + "epoch": 1.354183839287817, + "grad_norm": 0.6748372912406921, + "learning_rate": 4.895750242084347e-06, + "loss": 0.214, + "step": 14375 + }, + { + "epoch": 1.354278043380985, + "grad_norm": 0.6084638237953186, + "learning_rate": 4.894451802061271e-06, + "loss": 0.2, + "step": 14376 + }, + { + "epoch": 1.3543722474741529, + "grad_norm": 0.6398811936378479, + "learning_rate": 4.893153478452588e-06, + "loss": 0.2296, + "step": 14377 + }, + { + "epoch": 1.3544664515673206, + "grad_norm": 0.677300214767456, + "learning_rate": 4.891855271287909e-06, + "loss": 0.2095, + "step": 14378 + }, + { + "epoch": 1.3545606556604883, + "grad_norm": 0.7081146240234375, + "learning_rate": 4.890557180596826e-06, + "loss": 0.1932, + "step": 14379 + }, + { + "epoch": 1.3546548597536563, + "grad_norm": 0.6429355144500732, + "learning_rate": 4.8892592064089394e-06, + "loss": 0.2209, + "step": 14380 + }, + { + "epoch": 1.3547490638468243, + "grad_norm": 0.658106803894043, + "learning_rate": 4.887961348753852e-06, + "loss": 0.2157, + "step": 14381 + }, + { + "epoch": 1.354843267939992, + "grad_norm": 0.5953934788703918, + "learning_rate": 4.8866636076611515e-06, + "loss": 0.1764, + "step": 14382 + }, + { + "epoch": 1.3549374720331597, + "grad_norm": 0.6248713731765747, + "learning_rate": 4.88536598316042e-06, + "loss": 0.2047, + "step": 14383 + }, + { + "epoch": 1.3550316761263277, + "grad_norm": 0.6855310797691345, + "learning_rate": 4.884068475281264e-06, + "loss": 0.1919, + "step": 14384 + }, + { + "epoch": 1.3551258802194956, + "grad_norm": 0.7052231431007385, + "learning_rate": 4.882771084053257e-06, + "loss": 0.2219, + "step": 14385 + }, + { + "epoch": 1.3552200843126634, + "grad_norm": 0.6247349381446838, + "learning_rate": 4.88147380950598e-06, + "loss": 0.1908, + "step": 14386 + }, + { + "epoch": 1.3553142884058311, + "grad_norm": 0.6833993196487427, + "learning_rate": 4.880176651669015e-06, + "loss": 0.1899, + "step": 14387 + }, + { + "epoch": 1.355408492498999, + "grad_norm": 0.627804160118103, + "learning_rate": 4.878879610571946e-06, + "loss": 0.1818, + "step": 14388 + }, + { + "epoch": 1.355502696592167, + "grad_norm": 0.6863963603973389, + "learning_rate": 4.877582686244337e-06, + "loss": 0.2066, + "step": 14389 + }, + { + "epoch": 1.3555969006853348, + "grad_norm": 0.683441698551178, + "learning_rate": 4.876285878715764e-06, + "loss": 0.2144, + "step": 14390 + }, + { + "epoch": 1.3556911047785025, + "grad_norm": 0.6739716529846191, + "learning_rate": 4.8749891880158015e-06, + "loss": 0.2108, + "step": 14391 + }, + { + "epoch": 1.3557853088716705, + "grad_norm": 0.6303501725196838, + "learning_rate": 4.873692614174008e-06, + "loss": 0.2197, + "step": 14392 + }, + { + "epoch": 1.3558795129648384, + "grad_norm": 0.6293514966964722, + "learning_rate": 4.87239615721995e-06, + "loss": 0.1909, + "step": 14393 + }, + { + "epoch": 1.3559737170580062, + "grad_norm": 0.6505168676376343, + "learning_rate": 4.871099817183195e-06, + "loss": 0.203, + "step": 14394 + }, + { + "epoch": 1.356067921151174, + "grad_norm": 0.5902092456817627, + "learning_rate": 4.869803594093291e-06, + "loss": 0.184, + "step": 14395 + }, + { + "epoch": 1.3561621252443419, + "grad_norm": 0.6372312903404236, + "learning_rate": 4.868507487979799e-06, + "loss": 0.2336, + "step": 14396 + }, + { + "epoch": 1.3562563293375098, + "grad_norm": 0.6709840297698975, + "learning_rate": 4.867211498872276e-06, + "loss": 0.2035, + "step": 14397 + }, + { + "epoch": 1.3563505334306776, + "grad_norm": 0.6329782009124756, + "learning_rate": 4.865915626800269e-06, + "loss": 0.2051, + "step": 14398 + }, + { + "epoch": 1.3564447375238453, + "grad_norm": 0.6777759194374084, + "learning_rate": 4.864619871793319e-06, + "loss": 0.2065, + "step": 14399 + }, + { + "epoch": 1.3565389416170133, + "grad_norm": 0.6504156589508057, + "learning_rate": 4.863324233880985e-06, + "loss": 0.1853, + "step": 14400 + }, + { + "epoch": 1.3566331457101812, + "grad_norm": 0.6352477073669434, + "learning_rate": 4.862028713092802e-06, + "loss": 0.2052, + "step": 14401 + }, + { + "epoch": 1.356727349803349, + "grad_norm": 0.7121332883834839, + "learning_rate": 4.860733309458308e-06, + "loss": 0.2366, + "step": 14402 + }, + { + "epoch": 1.3568215538965167, + "grad_norm": 0.6510499715805054, + "learning_rate": 4.859438023007041e-06, + "loss": 0.215, + "step": 14403 + }, + { + "epoch": 1.3569157579896847, + "grad_norm": 0.5392564535140991, + "learning_rate": 4.858142853768541e-06, + "loss": 0.1715, + "step": 14404 + }, + { + "epoch": 1.3570099620828526, + "grad_norm": 0.632756233215332, + "learning_rate": 4.856847801772333e-06, + "loss": 0.2118, + "step": 14405 + }, + { + "epoch": 1.3571041661760204, + "grad_norm": 0.6321601867675781, + "learning_rate": 4.855552867047949e-06, + "loss": 0.194, + "step": 14406 + }, + { + "epoch": 1.357198370269188, + "grad_norm": 0.633243203163147, + "learning_rate": 4.854258049624919e-06, + "loss": 0.2026, + "step": 14407 + }, + { + "epoch": 1.357292574362356, + "grad_norm": 0.6365585327148438, + "learning_rate": 4.852963349532761e-06, + "loss": 0.2002, + "step": 14408 + }, + { + "epoch": 1.357386778455524, + "grad_norm": 0.749879777431488, + "learning_rate": 4.851668766800998e-06, + "loss": 0.2071, + "step": 14409 + }, + { + "epoch": 1.3574809825486918, + "grad_norm": 0.7128868699073792, + "learning_rate": 4.850374301459152e-06, + "loss": 0.2049, + "step": 14410 + }, + { + "epoch": 1.3575751866418595, + "grad_norm": 0.6514534950256348, + "learning_rate": 4.849079953536733e-06, + "loss": 0.223, + "step": 14411 + }, + { + "epoch": 1.3576693907350275, + "grad_norm": 0.6466713547706604, + "learning_rate": 4.847785723063261e-06, + "loss": 0.1996, + "step": 14412 + }, + { + "epoch": 1.3577635948281954, + "grad_norm": 0.6942420601844788, + "learning_rate": 4.846491610068238e-06, + "loss": 0.1871, + "step": 14413 + }, + { + "epoch": 1.3578577989213632, + "grad_norm": 0.6090893745422363, + "learning_rate": 4.845197614581177e-06, + "loss": 0.2192, + "step": 14414 + }, + { + "epoch": 1.357952003014531, + "grad_norm": 0.5880915522575378, + "learning_rate": 4.843903736631585e-06, + "loss": 0.1927, + "step": 14415 + }, + { + "epoch": 1.3580462071076989, + "grad_norm": 0.7274464964866638, + "learning_rate": 4.84260997624896e-06, + "loss": 0.2063, + "step": 14416 + }, + { + "epoch": 1.3581404112008668, + "grad_norm": 0.6381358504295349, + "learning_rate": 4.841316333462803e-06, + "loss": 0.1974, + "step": 14417 + }, + { + "epoch": 1.3582346152940346, + "grad_norm": 0.6187579035758972, + "learning_rate": 4.840022808302616e-06, + "loss": 0.2264, + "step": 14418 + }, + { + "epoch": 1.3583288193872023, + "grad_norm": 0.6147592067718506, + "learning_rate": 4.838729400797884e-06, + "loss": 0.2046, + "step": 14419 + }, + { + "epoch": 1.3584230234803703, + "grad_norm": 0.6889422535896301, + "learning_rate": 4.837436110978104e-06, + "loss": 0.1896, + "step": 14420 + }, + { + "epoch": 1.358517227573538, + "grad_norm": 0.6578227877616882, + "learning_rate": 4.836142938872769e-06, + "loss": 0.1819, + "step": 14421 + }, + { + "epoch": 1.358611431666706, + "grad_norm": 0.6827560067176819, + "learning_rate": 4.83484988451136e-06, + "loss": 0.1985, + "step": 14422 + }, + { + "epoch": 1.3587056357598737, + "grad_norm": 0.6549921631813049, + "learning_rate": 4.833556947923359e-06, + "loss": 0.1961, + "step": 14423 + }, + { + "epoch": 1.3587998398530416, + "grad_norm": 0.6283363699913025, + "learning_rate": 4.832264129138249e-06, + "loss": 0.2096, + "step": 14424 + }, + { + "epoch": 1.3588940439462094, + "grad_norm": 0.6205511093139648, + "learning_rate": 4.830971428185514e-06, + "loss": 0.2339, + "step": 14425 + }, + { + "epoch": 1.3589882480393773, + "grad_norm": 0.5928133130073547, + "learning_rate": 4.829678845094619e-06, + "loss": 0.1698, + "step": 14426 + }, + { + "epoch": 1.359082452132545, + "grad_norm": 0.6639288067817688, + "learning_rate": 4.828386379895043e-06, + "loss": 0.1921, + "step": 14427 + }, + { + "epoch": 1.359176656225713, + "grad_norm": 0.8254418969154358, + "learning_rate": 4.82709403261626e-06, + "loss": 0.1797, + "step": 14428 + }, + { + "epoch": 1.3592708603188808, + "grad_norm": 0.7011244893074036, + "learning_rate": 4.825801803287728e-06, + "loss": 0.205, + "step": 14429 + }, + { + "epoch": 1.3593650644120487, + "grad_norm": 0.840096652507782, + "learning_rate": 4.824509691938918e-06, + "loss": 0.1902, + "step": 14430 + }, + { + "epoch": 1.3594592685052165, + "grad_norm": 0.6223124861717224, + "learning_rate": 4.823217698599294e-06, + "loss": 0.1894, + "step": 14431 + }, + { + "epoch": 1.3595534725983844, + "grad_norm": 0.6484492421150208, + "learning_rate": 4.82192582329831e-06, + "loss": 0.1826, + "step": 14432 + }, + { + "epoch": 1.3596476766915522, + "grad_norm": 0.607126772403717, + "learning_rate": 4.820634066065424e-06, + "loss": 0.2202, + "step": 14433 + }, + { + "epoch": 1.3597418807847201, + "grad_norm": 0.6780961751937866, + "learning_rate": 4.819342426930096e-06, + "loss": 0.1947, + "step": 14434 + }, + { + "epoch": 1.3598360848778879, + "grad_norm": 0.6182903051376343, + "learning_rate": 4.818050905921768e-06, + "loss": 0.1917, + "step": 14435 + }, + { + "epoch": 1.3599302889710558, + "grad_norm": 0.646899402141571, + "learning_rate": 4.816759503069894e-06, + "loss": 0.2014, + "step": 14436 + }, + { + "epoch": 1.3600244930642236, + "grad_norm": 0.6887809634208679, + "learning_rate": 4.815468218403923e-06, + "loss": 0.2184, + "step": 14437 + }, + { + "epoch": 1.3601186971573915, + "grad_norm": 0.5613549947738647, + "learning_rate": 4.814177051953296e-06, + "loss": 0.1781, + "step": 14438 + }, + { + "epoch": 1.3602129012505593, + "grad_norm": 0.6463780403137207, + "learning_rate": 4.812886003747443e-06, + "loss": 0.2099, + "step": 14439 + }, + { + "epoch": 1.3603071053437272, + "grad_norm": 0.6642791032791138, + "learning_rate": 4.811595073815819e-06, + "loss": 0.214, + "step": 14440 + }, + { + "epoch": 1.360401309436895, + "grad_norm": 0.575738251209259, + "learning_rate": 4.8103042621878515e-06, + "loss": 0.1846, + "step": 14441 + }, + { + "epoch": 1.360495513530063, + "grad_norm": 0.6691033840179443, + "learning_rate": 4.809013568892969e-06, + "loss": 0.202, + "step": 14442 + }, + { + "epoch": 1.3605897176232307, + "grad_norm": 0.659534752368927, + "learning_rate": 4.807722993960605e-06, + "loss": 0.1923, + "step": 14443 + }, + { + "epoch": 1.3606839217163986, + "grad_norm": 0.7245131134986877, + "learning_rate": 4.806432537420191e-06, + "loss": 0.1847, + "step": 14444 + }, + { + "epoch": 1.3607781258095664, + "grad_norm": 0.634385347366333, + "learning_rate": 4.805142199301144e-06, + "loss": 0.1814, + "step": 14445 + }, + { + "epoch": 1.3608723299027343, + "grad_norm": 0.6050102114677429, + "learning_rate": 4.803851979632887e-06, + "loss": 0.1936, + "step": 14446 + }, + { + "epoch": 1.360966533995902, + "grad_norm": 0.6620184779167175, + "learning_rate": 4.802561878444846e-06, + "loss": 0.206, + "step": 14447 + }, + { + "epoch": 1.36106073808907, + "grad_norm": 0.8133841156959534, + "learning_rate": 4.801271895766429e-06, + "loss": 0.2338, + "step": 14448 + }, + { + "epoch": 1.3611549421822378, + "grad_norm": 0.5997223258018494, + "learning_rate": 4.79998203162705e-06, + "loss": 0.2005, + "step": 14449 + }, + { + "epoch": 1.3612491462754057, + "grad_norm": 0.7362989187240601, + "learning_rate": 4.798692286056129e-06, + "loss": 0.2131, + "step": 14450 + }, + { + "epoch": 1.3613433503685735, + "grad_norm": 0.7201138138771057, + "learning_rate": 4.797402659083064e-06, + "loss": 0.2652, + "step": 14451 + }, + { + "epoch": 1.3614375544617414, + "grad_norm": 0.7321445345878601, + "learning_rate": 4.796113150737267e-06, + "loss": 0.2121, + "step": 14452 + }, + { + "epoch": 1.3615317585549092, + "grad_norm": 0.6794480085372925, + "learning_rate": 4.794823761048134e-06, + "loss": 0.219, + "step": 14453 + }, + { + "epoch": 1.3616259626480771, + "grad_norm": 0.6611748337745667, + "learning_rate": 4.793534490045071e-06, + "loss": 0.1921, + "step": 14454 + }, + { + "epoch": 1.3617201667412449, + "grad_norm": 0.6154462695121765, + "learning_rate": 4.792245337757476e-06, + "loss": 0.1971, + "step": 14455 + }, + { + "epoch": 1.3618143708344128, + "grad_norm": 0.6860254406929016, + "learning_rate": 4.7909563042147375e-06, + "loss": 0.2116, + "step": 14456 + }, + { + "epoch": 1.3619085749275806, + "grad_norm": 0.6639959812164307, + "learning_rate": 4.789667389446252e-06, + "loss": 0.1944, + "step": 14457 + }, + { + "epoch": 1.3620027790207485, + "grad_norm": 0.7993869781494141, + "learning_rate": 4.788378593481411e-06, + "loss": 0.2247, + "step": 14458 + }, + { + "epoch": 1.3620969831139162, + "grad_norm": 0.7196150422096252, + "learning_rate": 4.787089916349594e-06, + "loss": 0.1944, + "step": 14459 + }, + { + "epoch": 1.3621911872070842, + "grad_norm": 0.6921483278274536, + "learning_rate": 4.7858013580801895e-06, + "loss": 0.2085, + "step": 14460 + }, + { + "epoch": 1.362285391300252, + "grad_norm": 0.6668462157249451, + "learning_rate": 4.784512918702582e-06, + "loss": 0.2286, + "step": 14461 + }, + { + "epoch": 1.36237959539342, + "grad_norm": 0.7428612112998962, + "learning_rate": 4.783224598246146e-06, + "loss": 0.1979, + "step": 14462 + }, + { + "epoch": 1.3624737994865876, + "grad_norm": 0.5864380598068237, + "learning_rate": 4.781936396740252e-06, + "loss": 0.1703, + "step": 14463 + }, + { + "epoch": 1.3625680035797556, + "grad_norm": 0.6457656025886536, + "learning_rate": 4.780648314214279e-06, + "loss": 0.2028, + "step": 14464 + }, + { + "epoch": 1.3626622076729233, + "grad_norm": 0.5920119285583496, + "learning_rate": 4.779360350697599e-06, + "loss": 0.1912, + "step": 14465 + }, + { + "epoch": 1.3627564117660913, + "grad_norm": 0.5777595043182373, + "learning_rate": 4.778072506219575e-06, + "loss": 0.183, + "step": 14466 + }, + { + "epoch": 1.362850615859259, + "grad_norm": 0.6431694626808167, + "learning_rate": 4.776784780809571e-06, + "loss": 0.2048, + "step": 14467 + }, + { + "epoch": 1.362944819952427, + "grad_norm": 0.6484269499778748, + "learning_rate": 4.775497174496958e-06, + "loss": 0.2075, + "step": 14468 + }, + { + "epoch": 1.3630390240455947, + "grad_norm": 0.6658286452293396, + "learning_rate": 4.774209687311085e-06, + "loss": 0.203, + "step": 14469 + }, + { + "epoch": 1.3631332281387627, + "grad_norm": 0.6317464113235474, + "learning_rate": 4.772922319281312e-06, + "loss": 0.2124, + "step": 14470 + }, + { + "epoch": 1.3632274322319304, + "grad_norm": 0.5848267674446106, + "learning_rate": 4.771635070436998e-06, + "loss": 0.1766, + "step": 14471 + }, + { + "epoch": 1.3633216363250984, + "grad_norm": 0.6971248388290405, + "learning_rate": 4.770347940807488e-06, + "loss": 0.1944, + "step": 14472 + }, + { + "epoch": 1.3634158404182661, + "grad_norm": 0.6944621801376343, + "learning_rate": 4.769060930422132e-06, + "loss": 0.1833, + "step": 14473 + }, + { + "epoch": 1.363510044511434, + "grad_norm": 0.6648178100585938, + "learning_rate": 4.76777403931028e-06, + "loss": 0.1825, + "step": 14474 + }, + { + "epoch": 1.3636042486046018, + "grad_norm": 0.632527232170105, + "learning_rate": 4.76648726750127e-06, + "loss": 0.195, + "step": 14475 + }, + { + "epoch": 1.3636984526977698, + "grad_norm": 0.6708411574363708, + "learning_rate": 4.765200615024439e-06, + "loss": 0.1986, + "step": 14476 + }, + { + "epoch": 1.3637926567909375, + "grad_norm": 0.8647854924201965, + "learning_rate": 4.7639140819091365e-06, + "loss": 0.2349, + "step": 14477 + }, + { + "epoch": 1.3638868608841055, + "grad_norm": 0.6907919645309448, + "learning_rate": 4.762627668184692e-06, + "loss": 0.2094, + "step": 14478 + }, + { + "epoch": 1.3639810649772732, + "grad_norm": 0.6024240851402283, + "learning_rate": 4.761341373880427e-06, + "loss": 0.1988, + "step": 14479 + }, + { + "epoch": 1.3640752690704412, + "grad_norm": 0.6763929724693298, + "learning_rate": 4.760055199025688e-06, + "loss": 0.2245, + "step": 14480 + }, + { + "epoch": 1.364169473163609, + "grad_norm": 0.6466813683509827, + "learning_rate": 4.758769143649795e-06, + "loss": 0.1956, + "step": 14481 + }, + { + "epoch": 1.3642636772567769, + "grad_norm": 0.6607981324195862, + "learning_rate": 4.757483207782068e-06, + "loss": 0.1926, + "step": 14482 + }, + { + "epoch": 1.3643578813499446, + "grad_norm": 0.5871493816375732, + "learning_rate": 4.75619739145183e-06, + "loss": 0.1769, + "step": 14483 + }, + { + "epoch": 1.3644520854431126, + "grad_norm": 0.7563601732254028, + "learning_rate": 4.754911694688405e-06, + "loss": 0.2157, + "step": 14484 + }, + { + "epoch": 1.3645462895362803, + "grad_norm": 0.6513185501098633, + "learning_rate": 4.753626117521103e-06, + "loss": 0.2007, + "step": 14485 + }, + { + "epoch": 1.3646404936294483, + "grad_norm": 0.5829171538352966, + "learning_rate": 4.752340659979239e-06, + "loss": 0.1691, + "step": 14486 + }, + { + "epoch": 1.364734697722616, + "grad_norm": 0.5896437764167786, + "learning_rate": 4.751055322092126e-06, + "loss": 0.1949, + "step": 14487 + }, + { + "epoch": 1.364828901815784, + "grad_norm": 0.6572392582893372, + "learning_rate": 4.7497701038890664e-06, + "loss": 0.1934, + "step": 14488 + }, + { + "epoch": 1.3649231059089517, + "grad_norm": 0.6673678755760193, + "learning_rate": 4.748485005399367e-06, + "loss": 0.2097, + "step": 14489 + }, + { + "epoch": 1.3650173100021195, + "grad_norm": 0.6368674635887146, + "learning_rate": 4.7472000266523364e-06, + "loss": 0.2025, + "step": 14490 + }, + { + "epoch": 1.3651115140952874, + "grad_norm": 0.7557327747344971, + "learning_rate": 4.745915167677264e-06, + "loss": 0.2029, + "step": 14491 + }, + { + "epoch": 1.3652057181884554, + "grad_norm": 0.6197017431259155, + "learning_rate": 4.744630428503455e-06, + "loss": 0.2242, + "step": 14492 + }, + { + "epoch": 1.3652999222816231, + "grad_norm": 0.6237503886222839, + "learning_rate": 4.743345809160197e-06, + "loss": 0.2016, + "step": 14493 + }, + { + "epoch": 1.3653941263747909, + "grad_norm": 0.6416782736778259, + "learning_rate": 4.742061309676783e-06, + "loss": 0.2004, + "step": 14494 + }, + { + "epoch": 1.3654883304679588, + "grad_norm": 0.6770577430725098, + "learning_rate": 4.740776930082508e-06, + "loss": 0.2125, + "step": 14495 + }, + { + "epoch": 1.3655825345611268, + "grad_norm": 0.6506226658821106, + "learning_rate": 4.739492670406648e-06, + "loss": 0.19, + "step": 14496 + }, + { + "epoch": 1.3656767386542945, + "grad_norm": 0.5910604000091553, + "learning_rate": 4.7382085306784895e-06, + "loss": 0.1606, + "step": 14497 + }, + { + "epoch": 1.3657709427474622, + "grad_norm": 0.5863729119300842, + "learning_rate": 4.736924510927319e-06, + "loss": 0.1861, + "step": 14498 + }, + { + "epoch": 1.3658651468406302, + "grad_norm": 0.6519709229469299, + "learning_rate": 4.735640611182405e-06, + "loss": 0.1939, + "step": 14499 + }, + { + "epoch": 1.3659593509337982, + "grad_norm": 0.7937561869621277, + "learning_rate": 4.734356831473027e-06, + "loss": 0.229, + "step": 14500 + }, + { + "epoch": 1.366053555026966, + "grad_norm": 0.7065807580947876, + "learning_rate": 4.733073171828461e-06, + "loss": 0.1873, + "step": 14501 + }, + { + "epoch": 1.3661477591201336, + "grad_norm": 0.6730491518974304, + "learning_rate": 4.7317896322779715e-06, + "loss": 0.2056, + "step": 14502 + }, + { + "epoch": 1.3662419632133016, + "grad_norm": 0.6207157373428345, + "learning_rate": 4.730506212850822e-06, + "loss": 0.1908, + "step": 14503 + }, + { + "epoch": 1.3663361673064696, + "grad_norm": 0.6879037022590637, + "learning_rate": 4.729222913576279e-06, + "loss": 0.1863, + "step": 14504 + }, + { + "epoch": 1.3664303713996373, + "grad_norm": 0.5929299592971802, + "learning_rate": 4.72793973448361e-06, + "loss": 0.1964, + "step": 14505 + }, + { + "epoch": 1.366524575492805, + "grad_norm": 0.6904355883598328, + "learning_rate": 4.726656675602065e-06, + "loss": 0.1881, + "step": 14506 + }, + { + "epoch": 1.366618779585973, + "grad_norm": 0.6281517744064331, + "learning_rate": 4.725373736960903e-06, + "loss": 0.1956, + "step": 14507 + }, + { + "epoch": 1.366712983679141, + "grad_norm": 0.6572969555854797, + "learning_rate": 4.7240909185893804e-06, + "loss": 0.2053, + "step": 14508 + }, + { + "epoch": 1.3668071877723087, + "grad_norm": 0.6697401404380798, + "learning_rate": 4.7228082205167414e-06, + "loss": 0.2059, + "step": 14509 + }, + { + "epoch": 1.3669013918654764, + "grad_norm": 0.5750896334648132, + "learning_rate": 4.721525642772236e-06, + "loss": 0.1748, + "step": 14510 + }, + { + "epoch": 1.3669955959586444, + "grad_norm": 0.6491417288780212, + "learning_rate": 4.7202431853851116e-06, + "loss": 0.1849, + "step": 14511 + }, + { + "epoch": 1.3670898000518124, + "grad_norm": 0.6947598457336426, + "learning_rate": 4.718960848384605e-06, + "loss": 0.1903, + "step": 14512 + }, + { + "epoch": 1.36718400414498, + "grad_norm": 0.7724454402923584, + "learning_rate": 4.717678631799959e-06, + "loss": 0.2262, + "step": 14513 + }, + { + "epoch": 1.3672782082381478, + "grad_norm": 0.6440337300300598, + "learning_rate": 4.716396535660412e-06, + "loss": 0.1996, + "step": 14514 + }, + { + "epoch": 1.3673724123313158, + "grad_norm": 0.6808662414550781, + "learning_rate": 4.715114559995197e-06, + "loss": 0.1859, + "step": 14515 + }, + { + "epoch": 1.3674666164244837, + "grad_norm": 1.4499611854553223, + "learning_rate": 4.713832704833534e-06, + "loss": 0.2303, + "step": 14516 + }, + { + "epoch": 1.3675608205176515, + "grad_norm": 0.7500669360160828, + "learning_rate": 4.712550970204669e-06, + "loss": 0.2175, + "step": 14517 + }, + { + "epoch": 1.3676550246108192, + "grad_norm": 0.6843932271003723, + "learning_rate": 4.711269356137819e-06, + "loss": 0.1801, + "step": 14518 + }, + { + "epoch": 1.3677492287039872, + "grad_norm": 0.7365272045135498, + "learning_rate": 4.709987862662199e-06, + "loss": 0.2196, + "step": 14519 + }, + { + "epoch": 1.3678434327971551, + "grad_norm": 0.690955638885498, + "learning_rate": 4.708706489807046e-06, + "loss": 0.2039, + "step": 14520 + }, + { + "epoch": 1.3679376368903229, + "grad_norm": 1.121690273284912, + "learning_rate": 4.707425237601566e-06, + "loss": 0.2303, + "step": 14521 + }, + { + "epoch": 1.3680318409834906, + "grad_norm": 0.6832505464553833, + "learning_rate": 4.706144106074972e-06, + "loss": 0.1973, + "step": 14522 + }, + { + "epoch": 1.3681260450766586, + "grad_norm": 0.6005591154098511, + "learning_rate": 4.704863095256481e-06, + "loss": 0.1916, + "step": 14523 + }, + { + "epoch": 1.3682202491698265, + "grad_norm": 0.7369061708450317, + "learning_rate": 4.7035822051753035e-06, + "loss": 0.1958, + "step": 14524 + }, + { + "epoch": 1.3683144532629943, + "grad_norm": 0.556412398815155, + "learning_rate": 4.702301435860639e-06, + "loss": 0.1805, + "step": 14525 + }, + { + "epoch": 1.368408657356162, + "grad_norm": 0.7060864567756653, + "learning_rate": 4.7010207873416945e-06, + "loss": 0.2371, + "step": 14526 + }, + { + "epoch": 1.36850286144933, + "grad_norm": 0.6817953586578369, + "learning_rate": 4.699740259647676e-06, + "loss": 0.2131, + "step": 14527 + }, + { + "epoch": 1.368597065542498, + "grad_norm": 0.7367711663246155, + "learning_rate": 4.698459852807772e-06, + "loss": 0.2062, + "step": 14528 + }, + { + "epoch": 1.3686912696356657, + "grad_norm": 0.5938516855239868, + "learning_rate": 4.697179566851182e-06, + "loss": 0.2106, + "step": 14529 + }, + { + "epoch": 1.3687854737288334, + "grad_norm": 0.7921022176742554, + "learning_rate": 4.695899401807104e-06, + "loss": 0.2256, + "step": 14530 + }, + { + "epoch": 1.3688796778220014, + "grad_norm": 0.6134877800941467, + "learning_rate": 4.694619357704718e-06, + "loss": 0.1965, + "step": 14531 + }, + { + "epoch": 1.3689738819151693, + "grad_norm": 0.7247481942176819, + "learning_rate": 4.693339434573219e-06, + "loss": 0.1929, + "step": 14532 + }, + { + "epoch": 1.369068086008337, + "grad_norm": 0.6652804017066956, + "learning_rate": 4.692059632441783e-06, + "loss": 0.1963, + "step": 14533 + }, + { + "epoch": 1.3691622901015048, + "grad_norm": 0.6574836373329163, + "learning_rate": 4.690779951339598e-06, + "loss": 0.1866, + "step": 14534 + }, + { + "epoch": 1.3692564941946728, + "grad_norm": 0.645953893661499, + "learning_rate": 4.689500391295844e-06, + "loss": 0.2129, + "step": 14535 + }, + { + "epoch": 1.3693506982878407, + "grad_norm": 0.609062910079956, + "learning_rate": 4.688220952339691e-06, + "loss": 0.1809, + "step": 14536 + }, + { + "epoch": 1.3694449023810085, + "grad_norm": 0.6129270792007446, + "learning_rate": 4.6869416345003136e-06, + "loss": 0.199, + "step": 14537 + }, + { + "epoch": 1.3695391064741762, + "grad_norm": 0.6726804971694946, + "learning_rate": 4.6856624378068886e-06, + "loss": 0.2218, + "step": 14538 + }, + { + "epoch": 1.3696333105673442, + "grad_norm": 0.6016710996627808, + "learning_rate": 4.684383362288575e-06, + "loss": 0.1836, + "step": 14539 + }, + { + "epoch": 1.3697275146605121, + "grad_norm": 0.6364812850952148, + "learning_rate": 4.683104407974545e-06, + "loss": 0.2038, + "step": 14540 + }, + { + "epoch": 1.3698217187536799, + "grad_norm": 0.8082037568092346, + "learning_rate": 4.681825574893953e-06, + "loss": 0.2099, + "step": 14541 + }, + { + "epoch": 1.3699159228468476, + "grad_norm": 0.5484848022460938, + "learning_rate": 4.680546863075968e-06, + "loss": 0.1739, + "step": 14542 + }, + { + "epoch": 1.3700101269400156, + "grad_norm": 0.63649982213974, + "learning_rate": 4.679268272549737e-06, + "loss": 0.1933, + "step": 14543 + }, + { + "epoch": 1.3701043310331835, + "grad_norm": 0.6741502285003662, + "learning_rate": 4.677989803344416e-06, + "loss": 0.2078, + "step": 14544 + }, + { + "epoch": 1.3701985351263513, + "grad_norm": 0.6861146688461304, + "learning_rate": 4.6767114554891634e-06, + "loss": 0.1827, + "step": 14545 + }, + { + "epoch": 1.370292739219519, + "grad_norm": 1.6177656650543213, + "learning_rate": 4.675433229013119e-06, + "loss": 0.2072, + "step": 14546 + }, + { + "epoch": 1.370386943312687, + "grad_norm": 0.7017530202865601, + "learning_rate": 4.674155123945431e-06, + "loss": 0.215, + "step": 14547 + }, + { + "epoch": 1.370481147405855, + "grad_norm": 0.5845447778701782, + "learning_rate": 4.6728771403152465e-06, + "loss": 0.1644, + "step": 14548 + }, + { + "epoch": 1.3705753514990227, + "grad_norm": 0.6896893382072449, + "learning_rate": 4.671599278151697e-06, + "loss": 0.2359, + "step": 14549 + }, + { + "epoch": 1.3706695555921904, + "grad_norm": 0.6154652833938599, + "learning_rate": 4.6703215374839265e-06, + "loss": 0.2019, + "step": 14550 + }, + { + "epoch": 1.3707637596853584, + "grad_norm": 0.6628214120864868, + "learning_rate": 4.66904391834107e-06, + "loss": 0.1817, + "step": 14551 + }, + { + "epoch": 1.3708579637785263, + "grad_norm": 0.6417022347450256, + "learning_rate": 4.6677664207522535e-06, + "loss": 0.2035, + "step": 14552 + }, + { + "epoch": 1.370952167871694, + "grad_norm": 0.6868710517883301, + "learning_rate": 4.6664890447466085e-06, + "loss": 0.204, + "step": 14553 + }, + { + "epoch": 1.3710463719648618, + "grad_norm": 0.6142571568489075, + "learning_rate": 4.665211790353265e-06, + "loss": 0.1924, + "step": 14554 + }, + { + "epoch": 1.3711405760580297, + "grad_norm": 0.7381865978240967, + "learning_rate": 4.663934657601344e-06, + "loss": 0.197, + "step": 14555 + }, + { + "epoch": 1.3712347801511977, + "grad_norm": 0.6057921051979065, + "learning_rate": 4.662657646519957e-06, + "loss": 0.2076, + "step": 14556 + }, + { + "epoch": 1.3713289842443654, + "grad_norm": 0.6941007971763611, + "learning_rate": 4.661380757138238e-06, + "loss": 0.2078, + "step": 14557 + }, + { + "epoch": 1.3714231883375332, + "grad_norm": 0.6600576639175415, + "learning_rate": 4.660103989485294e-06, + "loss": 0.204, + "step": 14558 + }, + { + "epoch": 1.3715173924307011, + "grad_norm": 0.5912672877311707, + "learning_rate": 4.6588273435902295e-06, + "loss": 0.178, + "step": 14559 + }, + { + "epoch": 1.3716115965238689, + "grad_norm": 0.6179364919662476, + "learning_rate": 4.65755081948217e-06, + "loss": 0.1926, + "step": 14560 + }, + { + "epoch": 1.3717058006170368, + "grad_norm": 0.7076285481452942, + "learning_rate": 4.6562744171902144e-06, + "loss": 0.2357, + "step": 14561 + }, + { + "epoch": 1.3718000047102046, + "grad_norm": 0.6799371242523193, + "learning_rate": 4.6549981367434615e-06, + "loss": 0.1939, + "step": 14562 + }, + { + "epoch": 1.3718942088033725, + "grad_norm": 0.5989409685134888, + "learning_rate": 4.6537219781710176e-06, + "loss": 0.18, + "step": 14563 + }, + { + "epoch": 1.3719884128965403, + "grad_norm": 0.6571546792984009, + "learning_rate": 4.652445941501984e-06, + "loss": 0.1963, + "step": 14564 + }, + { + "epoch": 1.3720826169897082, + "grad_norm": 0.6644889116287231, + "learning_rate": 4.65117002676545e-06, + "loss": 0.2163, + "step": 14565 + }, + { + "epoch": 1.372176821082876, + "grad_norm": 0.6616732478141785, + "learning_rate": 4.649894233990512e-06, + "loss": 0.1853, + "step": 14566 + }, + { + "epoch": 1.372271025176044, + "grad_norm": 0.6491537094116211, + "learning_rate": 4.648618563206263e-06, + "loss": 0.1922, + "step": 14567 + }, + { + "epoch": 1.3723652292692117, + "grad_norm": 0.6798130869865417, + "learning_rate": 4.647343014441782e-06, + "loss": 0.2214, + "step": 14568 + }, + { + "epoch": 1.3724594333623796, + "grad_norm": 0.7253738045692444, + "learning_rate": 4.646067587726159e-06, + "loss": 0.227, + "step": 14569 + }, + { + "epoch": 1.3725536374555474, + "grad_norm": 0.7082908749580383, + "learning_rate": 4.644792283088479e-06, + "loss": 0.2212, + "step": 14570 + }, + { + "epoch": 1.3726478415487153, + "grad_norm": 0.7631024718284607, + "learning_rate": 4.643517100557814e-06, + "loss": 0.2174, + "step": 14571 + }, + { + "epoch": 1.372742045641883, + "grad_norm": 0.6379669904708862, + "learning_rate": 4.642242040163245e-06, + "loss": 0.1863, + "step": 14572 + }, + { + "epoch": 1.372836249735051, + "grad_norm": 0.593402624130249, + "learning_rate": 4.640967101933841e-06, + "loss": 0.1695, + "step": 14573 + }, + { + "epoch": 1.3729304538282188, + "grad_norm": 0.6995849013328552, + "learning_rate": 4.6396922858986745e-06, + "loss": 0.2205, + "step": 14574 + }, + { + "epoch": 1.3730246579213867, + "grad_norm": 0.7164720892906189, + "learning_rate": 4.638417592086818e-06, + "loss": 0.205, + "step": 14575 + }, + { + "epoch": 1.3731188620145545, + "grad_norm": 0.681452214717865, + "learning_rate": 4.637143020527329e-06, + "loss": 0.2013, + "step": 14576 + }, + { + "epoch": 1.3732130661077224, + "grad_norm": 0.6576045751571655, + "learning_rate": 4.635868571249271e-06, + "loss": 0.1955, + "step": 14577 + }, + { + "epoch": 1.3733072702008902, + "grad_norm": 0.6932193040847778, + "learning_rate": 4.634594244281711e-06, + "loss": 0.2218, + "step": 14578 + }, + { + "epoch": 1.3734014742940581, + "grad_norm": 0.6721796989440918, + "learning_rate": 4.633320039653695e-06, + "loss": 0.2075, + "step": 14579 + }, + { + "epoch": 1.3734956783872259, + "grad_norm": 0.675926923751831, + "learning_rate": 4.632045957394286e-06, + "loss": 0.1931, + "step": 14580 + }, + { + "epoch": 1.3735898824803938, + "grad_norm": 0.6279994249343872, + "learning_rate": 4.630771997532526e-06, + "loss": 0.1912, + "step": 14581 + }, + { + "epoch": 1.3736840865735616, + "grad_norm": 0.6979064345359802, + "learning_rate": 4.629498160097473e-06, + "loss": 0.2116, + "step": 14582 + }, + { + "epoch": 1.3737782906667295, + "grad_norm": 0.5609949231147766, + "learning_rate": 4.628224445118165e-06, + "loss": 0.1645, + "step": 14583 + }, + { + "epoch": 1.3738724947598973, + "grad_norm": 0.6987993121147156, + "learning_rate": 4.626950852623645e-06, + "loss": 0.2077, + "step": 14584 + }, + { + "epoch": 1.3739666988530652, + "grad_norm": 0.797755777835846, + "learning_rate": 4.62567738264296e-06, + "loss": 0.1966, + "step": 14585 + }, + { + "epoch": 1.374060902946233, + "grad_norm": 0.6203672885894775, + "learning_rate": 4.624404035205139e-06, + "loss": 0.1979, + "step": 14586 + }, + { + "epoch": 1.374155107039401, + "grad_norm": 0.6443046927452087, + "learning_rate": 4.623130810339219e-06, + "loss": 0.2079, + "step": 14587 + }, + { + "epoch": 1.3742493111325687, + "grad_norm": 0.8126762509346008, + "learning_rate": 4.621857708074236e-06, + "loss": 0.2083, + "step": 14588 + }, + { + "epoch": 1.3743435152257366, + "grad_norm": 0.7203518152236938, + "learning_rate": 4.62058472843921e-06, + "loss": 0.2132, + "step": 14589 + }, + { + "epoch": 1.3744377193189043, + "grad_norm": 0.673168420791626, + "learning_rate": 4.619311871463172e-06, + "loss": 0.1998, + "step": 14590 + }, + { + "epoch": 1.3745319234120723, + "grad_norm": 0.6929827332496643, + "learning_rate": 4.618039137175149e-06, + "loss": 0.2139, + "step": 14591 + }, + { + "epoch": 1.37462612750524, + "grad_norm": 0.6129652857780457, + "learning_rate": 4.616766525604157e-06, + "loss": 0.1924, + "step": 14592 + }, + { + "epoch": 1.374720331598408, + "grad_norm": 0.6677462458610535, + "learning_rate": 4.615494036779206e-06, + "loss": 0.206, + "step": 14593 + }, + { + "epoch": 1.3748145356915757, + "grad_norm": 0.6840497851371765, + "learning_rate": 4.614221670729325e-06, + "loss": 0.2012, + "step": 14594 + }, + { + "epoch": 1.3749087397847437, + "grad_norm": 0.6231470108032227, + "learning_rate": 4.61294942748352e-06, + "loss": 0.2111, + "step": 14595 + }, + { + "epoch": 1.3750029438779114, + "grad_norm": 0.6912837028503418, + "learning_rate": 4.611677307070792e-06, + "loss": 0.2024, + "step": 14596 + }, + { + "epoch": 1.3750971479710794, + "grad_norm": 0.6089299917221069, + "learning_rate": 4.610405309520162e-06, + "loss": 0.211, + "step": 14597 + }, + { + "epoch": 1.3751913520642471, + "grad_norm": 0.6354824900627136, + "learning_rate": 4.609133434860626e-06, + "loss": 0.2091, + "step": 14598 + }, + { + "epoch": 1.375285556157415, + "grad_norm": 0.6570692658424377, + "learning_rate": 4.607861683121176e-06, + "loss": 0.1945, + "step": 14599 + }, + { + "epoch": 1.3753797602505828, + "grad_norm": 0.643208920955658, + "learning_rate": 4.606590054330827e-06, + "loss": 0.1923, + "step": 14600 + }, + { + "epoch": 1.3754739643437508, + "grad_norm": 0.6634544730186462, + "learning_rate": 4.605318548518567e-06, + "loss": 0.1826, + "step": 14601 + }, + { + "epoch": 1.3755681684369185, + "grad_norm": 0.6547496914863586, + "learning_rate": 4.604047165713382e-06, + "loss": 0.2249, + "step": 14602 + }, + { + "epoch": 1.3756623725300865, + "grad_norm": 0.629440426826477, + "learning_rate": 4.602775905944268e-06, + "loss": 0.1893, + "step": 14603 + }, + { + "epoch": 1.3757565766232542, + "grad_norm": 0.6593378782272339, + "learning_rate": 4.601504769240212e-06, + "loss": 0.1874, + "step": 14604 + }, + { + "epoch": 1.3758507807164222, + "grad_norm": 0.6591216921806335, + "learning_rate": 4.600233755630194e-06, + "loss": 0.2197, + "step": 14605 + }, + { + "epoch": 1.37594498480959, + "grad_norm": 0.5971760153770447, + "learning_rate": 4.5989628651431975e-06, + "loss": 0.1856, + "step": 14606 + }, + { + "epoch": 1.376039188902758, + "grad_norm": 0.5898448824882507, + "learning_rate": 4.597692097808203e-06, + "loss": 0.1814, + "step": 14607 + }, + { + "epoch": 1.3761333929959256, + "grad_norm": 0.8175941705703735, + "learning_rate": 4.596421453654181e-06, + "loss": 0.2445, + "step": 14608 + }, + { + "epoch": 1.3762275970890936, + "grad_norm": 0.8518378734588623, + "learning_rate": 4.595150932710107e-06, + "loss": 0.2034, + "step": 14609 + }, + { + "epoch": 1.3763218011822613, + "grad_norm": 0.6331554651260376, + "learning_rate": 4.593880535004952e-06, + "loss": 0.1975, + "step": 14610 + }, + { + "epoch": 1.3764160052754293, + "grad_norm": 0.6141476631164551, + "learning_rate": 4.592610260567679e-06, + "loss": 0.1889, + "step": 14611 + }, + { + "epoch": 1.376510209368597, + "grad_norm": 0.6104586124420166, + "learning_rate": 4.591340109427259e-06, + "loss": 0.2009, + "step": 14612 + }, + { + "epoch": 1.376604413461765, + "grad_norm": 0.666061282157898, + "learning_rate": 4.590070081612645e-06, + "loss": 0.2276, + "step": 14613 + }, + { + "epoch": 1.3766986175549327, + "grad_norm": 0.7101260423660278, + "learning_rate": 4.5888001771528e-06, + "loss": 0.1969, + "step": 14614 + }, + { + "epoch": 1.3767928216481007, + "grad_norm": 0.6600731015205383, + "learning_rate": 4.587530396076683e-06, + "loss": 0.2017, + "step": 14615 + }, + { + "epoch": 1.3768870257412684, + "grad_norm": 0.7134162783622742, + "learning_rate": 4.5862607384132395e-06, + "loss": 0.1936, + "step": 14616 + }, + { + "epoch": 1.3769812298344364, + "grad_norm": 0.6632982492446899, + "learning_rate": 4.584991204191424e-06, + "loss": 0.2248, + "step": 14617 + }, + { + "epoch": 1.3770754339276041, + "grad_norm": 0.6679424047470093, + "learning_rate": 4.583721793440188e-06, + "loss": 0.2035, + "step": 14618 + }, + { + "epoch": 1.377169638020772, + "grad_norm": 0.5961079001426697, + "learning_rate": 4.582452506188467e-06, + "loss": 0.2105, + "step": 14619 + }, + { + "epoch": 1.3772638421139398, + "grad_norm": 0.6959073543548584, + "learning_rate": 4.581183342465211e-06, + "loss": 0.2066, + "step": 14620 + }, + { + "epoch": 1.3773580462071078, + "grad_norm": 0.6452242732048035, + "learning_rate": 4.579914302299352e-06, + "loss": 0.2099, + "step": 14621 + }, + { + "epoch": 1.3774522503002755, + "grad_norm": 0.6384733319282532, + "learning_rate": 4.578645385719832e-06, + "loss": 0.1713, + "step": 14622 + }, + { + "epoch": 1.3775464543934435, + "grad_norm": 0.7627642154693604, + "learning_rate": 4.577376592755578e-06, + "loss": 0.1767, + "step": 14623 + }, + { + "epoch": 1.3776406584866112, + "grad_norm": 0.6539168357849121, + "learning_rate": 4.576107923435524e-06, + "loss": 0.2151, + "step": 14624 + }, + { + "epoch": 1.3777348625797792, + "grad_norm": 0.7202804684638977, + "learning_rate": 4.574839377788601e-06, + "loss": 0.2211, + "step": 14625 + }, + { + "epoch": 1.377829066672947, + "grad_norm": 0.6229695677757263, + "learning_rate": 4.573570955843728e-06, + "loss": 0.1937, + "step": 14626 + }, + { + "epoch": 1.3779232707661149, + "grad_norm": 0.6452165246009827, + "learning_rate": 4.572302657629828e-06, + "loss": 0.1662, + "step": 14627 + }, + { + "epoch": 1.3780174748592826, + "grad_norm": 0.7254365682601929, + "learning_rate": 4.571034483175826e-06, + "loss": 0.2067, + "step": 14628 + }, + { + "epoch": 1.3781116789524503, + "grad_norm": 0.6990476250648499, + "learning_rate": 4.56976643251063e-06, + "loss": 0.1784, + "step": 14629 + }, + { + "epoch": 1.3782058830456183, + "grad_norm": 0.6916759610176086, + "learning_rate": 4.568498505663157e-06, + "loss": 0.1989, + "step": 14630 + }, + { + "epoch": 1.3783000871387863, + "grad_norm": 0.6801608204841614, + "learning_rate": 4.567230702662322e-06, + "loss": 0.1871, + "step": 14631 + }, + { + "epoch": 1.378394291231954, + "grad_norm": 0.7456927299499512, + "learning_rate": 4.565963023537029e-06, + "loss": 0.2056, + "step": 14632 + }, + { + "epoch": 1.3784884953251217, + "grad_norm": 0.7449910044670105, + "learning_rate": 4.564695468316175e-06, + "loss": 0.1914, + "step": 14633 + }, + { + "epoch": 1.3785826994182897, + "grad_norm": 0.6321032643318176, + "learning_rate": 4.563428037028677e-06, + "loss": 0.2096, + "step": 14634 + }, + { + "epoch": 1.3786769035114577, + "grad_norm": 0.6685911417007446, + "learning_rate": 4.56216072970343e-06, + "loss": 0.1888, + "step": 14635 + }, + { + "epoch": 1.3787711076046254, + "grad_norm": 0.670963704586029, + "learning_rate": 4.560893546369318e-06, + "loss": 0.2113, + "step": 14636 + }, + { + "epoch": 1.3788653116977931, + "grad_norm": 0.6442837119102478, + "learning_rate": 4.559626487055254e-06, + "loss": 0.1871, + "step": 14637 + }, + { + "epoch": 1.378959515790961, + "grad_norm": 0.6563900113105774, + "learning_rate": 4.558359551790119e-06, + "loss": 0.1958, + "step": 14638 + }, + { + "epoch": 1.379053719884129, + "grad_norm": 0.7210825681686401, + "learning_rate": 4.5570927406027955e-06, + "loss": 0.2129, + "step": 14639 + }, + { + "epoch": 1.3791479239772968, + "grad_norm": 0.6628056764602661, + "learning_rate": 4.555826053522182e-06, + "loss": 0.2185, + "step": 14640 + }, + { + "epoch": 1.3792421280704645, + "grad_norm": 0.6475352644920349, + "learning_rate": 4.554559490577154e-06, + "loss": 0.1932, + "step": 14641 + }, + { + "epoch": 1.3793363321636325, + "grad_norm": 0.651343584060669, + "learning_rate": 4.553293051796587e-06, + "loss": 0.1904, + "step": 14642 + }, + { + "epoch": 1.3794305362568005, + "grad_norm": 0.6923327445983887, + "learning_rate": 4.552026737209362e-06, + "loss": 0.1776, + "step": 14643 + }, + { + "epoch": 1.3795247403499682, + "grad_norm": 0.6525247693061829, + "learning_rate": 4.5507605468443575e-06, + "loss": 0.1962, + "step": 14644 + }, + { + "epoch": 1.379618944443136, + "grad_norm": 0.6223458051681519, + "learning_rate": 4.549494480730435e-06, + "loss": 0.1838, + "step": 14645 + }, + { + "epoch": 1.379713148536304, + "grad_norm": 0.6536427140235901, + "learning_rate": 4.548228538896468e-06, + "loss": 0.1914, + "step": 14646 + }, + { + "epoch": 1.3798073526294719, + "grad_norm": 0.6627947688102722, + "learning_rate": 4.546962721371326e-06, + "loss": 0.2063, + "step": 14647 + }, + { + "epoch": 1.3799015567226396, + "grad_norm": 0.6367174983024597, + "learning_rate": 4.545697028183863e-06, + "loss": 0.1973, + "step": 14648 + }, + { + "epoch": 1.3799957608158073, + "grad_norm": 0.6368837952613831, + "learning_rate": 4.544431459362943e-06, + "loss": 0.1913, + "step": 14649 + }, + { + "epoch": 1.3800899649089753, + "grad_norm": 0.6525770425796509, + "learning_rate": 4.543166014937427e-06, + "loss": 0.202, + "step": 14650 + }, + { + "epoch": 1.3801841690021432, + "grad_norm": 0.668809711933136, + "learning_rate": 4.541900694936161e-06, + "loss": 0.1944, + "step": 14651 + }, + { + "epoch": 1.380278373095311, + "grad_norm": 0.6212360262870789, + "learning_rate": 4.540635499388004e-06, + "loss": 0.1893, + "step": 14652 + }, + { + "epoch": 1.3803725771884787, + "grad_norm": 0.6254076361656189, + "learning_rate": 4.539370428321798e-06, + "loss": 0.1778, + "step": 14653 + }, + { + "epoch": 1.3804667812816467, + "grad_norm": 0.6460777521133423, + "learning_rate": 4.538105481766389e-06, + "loss": 0.2101, + "step": 14654 + }, + { + "epoch": 1.3805609853748146, + "grad_norm": 0.6969109177589417, + "learning_rate": 4.536840659750628e-06, + "loss": 0.2052, + "step": 14655 + }, + { + "epoch": 1.3806551894679824, + "grad_norm": 0.6455351710319519, + "learning_rate": 4.535575962303344e-06, + "loss": 0.2069, + "step": 14656 + }, + { + "epoch": 1.3807493935611501, + "grad_norm": 0.626264750957489, + "learning_rate": 4.534311389453383e-06, + "loss": 0.1732, + "step": 14657 + }, + { + "epoch": 1.380843597654318, + "grad_norm": 0.6563705801963806, + "learning_rate": 4.533046941229571e-06, + "loss": 0.1942, + "step": 14658 + }, + { + "epoch": 1.380937801747486, + "grad_norm": 0.6639990210533142, + "learning_rate": 4.531782617660744e-06, + "loss": 0.2129, + "step": 14659 + }, + { + "epoch": 1.3810320058406538, + "grad_norm": 0.731053352355957, + "learning_rate": 4.530518418775734e-06, + "loss": 0.2214, + "step": 14660 + }, + { + "epoch": 1.3811262099338215, + "grad_norm": 0.7711619138717651, + "learning_rate": 4.529254344603358e-06, + "loss": 0.1677, + "step": 14661 + }, + { + "epoch": 1.3812204140269895, + "grad_norm": 0.7596476078033447, + "learning_rate": 4.527990395172448e-06, + "loss": 0.2339, + "step": 14662 + }, + { + "epoch": 1.3813146181201574, + "grad_norm": 0.6887319684028625, + "learning_rate": 4.526726570511816e-06, + "loss": 0.1914, + "step": 14663 + }, + { + "epoch": 1.3814088222133252, + "grad_norm": 0.7025315165519714, + "learning_rate": 4.525462870650282e-06, + "loss": 0.1815, + "step": 14664 + }, + { + "epoch": 1.381503026306493, + "grad_norm": 0.652842104434967, + "learning_rate": 4.524199295616666e-06, + "loss": 0.216, + "step": 14665 + }, + { + "epoch": 1.3815972303996609, + "grad_norm": 0.7267138361930847, + "learning_rate": 4.522935845439771e-06, + "loss": 0.1935, + "step": 14666 + }, + { + "epoch": 1.3816914344928288, + "grad_norm": 0.6524989604949951, + "learning_rate": 4.521672520148408e-06, + "loss": 0.2066, + "step": 14667 + }, + { + "epoch": 1.3817856385859966, + "grad_norm": 0.6495004296302795, + "learning_rate": 4.520409319771388e-06, + "loss": 0.1989, + "step": 14668 + }, + { + "epoch": 1.3818798426791643, + "grad_norm": 0.7611294388771057, + "learning_rate": 4.519146244337506e-06, + "loss": 0.2103, + "step": 14669 + }, + { + "epoch": 1.3819740467723323, + "grad_norm": 0.6148377656936646, + "learning_rate": 4.517883293875567e-06, + "loss": 0.2041, + "step": 14670 + }, + { + "epoch": 1.3820682508655002, + "grad_norm": 0.621082067489624, + "learning_rate": 4.51662046841437e-06, + "loss": 0.1843, + "step": 14671 + }, + { + "epoch": 1.382162454958668, + "grad_norm": 0.7805433869361877, + "learning_rate": 4.515357767982706e-06, + "loss": 0.1999, + "step": 14672 + }, + { + "epoch": 1.3822566590518357, + "grad_norm": 0.7082175612449646, + "learning_rate": 4.5140951926093615e-06, + "loss": 0.2322, + "step": 14673 + }, + { + "epoch": 1.3823508631450037, + "grad_norm": 0.704624593257904, + "learning_rate": 4.512832742323137e-06, + "loss": 0.2121, + "step": 14674 + }, + { + "epoch": 1.3824450672381716, + "grad_norm": 0.6305441856384277, + "learning_rate": 4.5115704171528105e-06, + "loss": 0.1668, + "step": 14675 + }, + { + "epoch": 1.3825392713313394, + "grad_norm": 0.6836445927619934, + "learning_rate": 4.510308217127162e-06, + "loss": 0.201, + "step": 14676 + }, + { + "epoch": 1.382633475424507, + "grad_norm": 0.7017547488212585, + "learning_rate": 4.509046142274981e-06, + "loss": 0.2054, + "step": 14677 + }, + { + "epoch": 1.382727679517675, + "grad_norm": 0.6677689552307129, + "learning_rate": 4.507784192625041e-06, + "loss": 0.219, + "step": 14678 + }, + { + "epoch": 1.382821883610843, + "grad_norm": 0.6143436431884766, + "learning_rate": 4.5065223682061075e-06, + "loss": 0.2066, + "step": 14679 + }, + { + "epoch": 1.3829160877040108, + "grad_norm": 2.129192590713501, + "learning_rate": 4.505260669046968e-06, + "loss": 0.1941, + "step": 14680 + }, + { + "epoch": 1.3830102917971785, + "grad_norm": 0.6475026607513428, + "learning_rate": 4.503999095176382e-06, + "loss": 0.2101, + "step": 14681 + }, + { + "epoch": 1.3831044958903465, + "grad_norm": 0.6193498969078064, + "learning_rate": 4.502737646623114e-06, + "loss": 0.1986, + "step": 14682 + }, + { + "epoch": 1.3831986999835144, + "grad_norm": 0.6228429079055786, + "learning_rate": 4.5014763234159285e-06, + "loss": 0.2105, + "step": 14683 + }, + { + "epoch": 1.3832929040766822, + "grad_norm": 0.6349548101425171, + "learning_rate": 4.50021512558359e-06, + "loss": 0.1967, + "step": 14684 + }, + { + "epoch": 1.3833871081698499, + "grad_norm": 0.7757889628410339, + "learning_rate": 4.498954053154849e-06, + "loss": 0.2268, + "step": 14685 + }, + { + "epoch": 1.3834813122630178, + "grad_norm": 0.6858643889427185, + "learning_rate": 4.497693106158462e-06, + "loss": 0.2143, + "step": 14686 + }, + { + "epoch": 1.3835755163561858, + "grad_norm": 0.6161576509475708, + "learning_rate": 4.496432284623186e-06, + "loss": 0.1807, + "step": 14687 + }, + { + "epoch": 1.3836697204493535, + "grad_norm": 0.6419311761856079, + "learning_rate": 4.495171588577763e-06, + "loss": 0.2197, + "step": 14688 + }, + { + "epoch": 1.3837639245425213, + "grad_norm": 0.6505982875823975, + "learning_rate": 4.493911018050941e-06, + "loss": 0.1718, + "step": 14689 + }, + { + "epoch": 1.3838581286356892, + "grad_norm": 0.7509188652038574, + "learning_rate": 4.492650573071465e-06, + "loss": 0.2127, + "step": 14690 + }, + { + "epoch": 1.3839523327288572, + "grad_norm": 0.6153393983840942, + "learning_rate": 4.49139025366807e-06, + "loss": 0.1963, + "step": 14691 + }, + { + "epoch": 1.384046536822025, + "grad_norm": 0.6083927750587463, + "learning_rate": 4.490130059869501e-06, + "loss": 0.188, + "step": 14692 + }, + { + "epoch": 1.3841407409151927, + "grad_norm": 0.6401993036270142, + "learning_rate": 4.488869991704483e-06, + "loss": 0.2107, + "step": 14693 + }, + { + "epoch": 1.3842349450083606, + "grad_norm": 0.6734372973442078, + "learning_rate": 4.4876100492017535e-06, + "loss": 0.2009, + "step": 14694 + }, + { + "epoch": 1.3843291491015286, + "grad_norm": 0.7103176712989807, + "learning_rate": 4.486350232390043e-06, + "loss": 0.1718, + "step": 14695 + }, + { + "epoch": 1.3844233531946963, + "grad_norm": 0.7211053967475891, + "learning_rate": 4.485090541298071e-06, + "loss": 0.2088, + "step": 14696 + }, + { + "epoch": 1.384517557287864, + "grad_norm": 0.6296860575675964, + "learning_rate": 4.483830975954566e-06, + "loss": 0.196, + "step": 14697 + }, + { + "epoch": 1.384611761381032, + "grad_norm": 0.6704601645469666, + "learning_rate": 4.482571536388244e-06, + "loss": 0.185, + "step": 14698 + }, + { + "epoch": 1.3847059654741998, + "grad_norm": 0.6774282455444336, + "learning_rate": 4.481312222627823e-06, + "loss": 0.1811, + "step": 14699 + }, + { + "epoch": 1.3848001695673677, + "grad_norm": 0.6754395365715027, + "learning_rate": 4.480053034702021e-06, + "loss": 0.1911, + "step": 14700 + }, + { + "epoch": 1.3848943736605355, + "grad_norm": 0.6918966770172119, + "learning_rate": 4.478793972639544e-06, + "loss": 0.1954, + "step": 14701 + }, + { + "epoch": 1.3849885777537034, + "grad_norm": 0.6784451603889465, + "learning_rate": 4.477535036469106e-06, + "loss": 0.234, + "step": 14702 + }, + { + "epoch": 1.3850827818468712, + "grad_norm": 0.6600942015647888, + "learning_rate": 4.476276226219406e-06, + "loss": 0.2479, + "step": 14703 + }, + { + "epoch": 1.3851769859400391, + "grad_norm": 0.5945283770561218, + "learning_rate": 4.475017541919151e-06, + "loss": 0.1902, + "step": 14704 + }, + { + "epoch": 1.3852711900332069, + "grad_norm": 0.705072820186615, + "learning_rate": 4.473758983597044e-06, + "loss": 0.1962, + "step": 14705 + }, + { + "epoch": 1.3853653941263748, + "grad_norm": 0.6650243997573853, + "learning_rate": 4.472500551281776e-06, + "loss": 0.1855, + "step": 14706 + }, + { + "epoch": 1.3854595982195426, + "grad_norm": 0.6843200922012329, + "learning_rate": 4.471242245002043e-06, + "loss": 0.2173, + "step": 14707 + }, + { + "epoch": 1.3855538023127105, + "grad_norm": 0.650365948677063, + "learning_rate": 4.4699840647865414e-06, + "loss": 0.2053, + "step": 14708 + }, + { + "epoch": 1.3856480064058783, + "grad_norm": 0.6803451776504517, + "learning_rate": 4.46872601066395e-06, + "loss": 0.2103, + "step": 14709 + }, + { + "epoch": 1.3857422104990462, + "grad_norm": 0.6168040037155151, + "learning_rate": 4.4674680826629626e-06, + "loss": 0.1888, + "step": 14710 + }, + { + "epoch": 1.385836414592214, + "grad_norm": 0.6308485865592957, + "learning_rate": 4.466210280812261e-06, + "loss": 0.179, + "step": 14711 + }, + { + "epoch": 1.385930618685382, + "grad_norm": 0.6388193368911743, + "learning_rate": 4.464952605140525e-06, + "loss": 0.205, + "step": 14712 + }, + { + "epoch": 1.3860248227785497, + "grad_norm": 0.7123376727104187, + "learning_rate": 4.46369505567642e-06, + "loss": 0.2082, + "step": 14713 + }, + { + "epoch": 1.3861190268717176, + "grad_norm": 0.6630170941352844, + "learning_rate": 4.462437632448639e-06, + "loss": 0.175, + "step": 14714 + }, + { + "epoch": 1.3862132309648854, + "grad_norm": 0.7099169492721558, + "learning_rate": 4.461180335485843e-06, + "loss": 0.194, + "step": 14715 + }, + { + "epoch": 1.3863074350580533, + "grad_norm": 0.7670761942863464, + "learning_rate": 4.459923164816694e-06, + "loss": 0.1955, + "step": 14716 + }, + { + "epoch": 1.386401639151221, + "grad_norm": 0.651181697845459, + "learning_rate": 4.458666120469872e-06, + "loss": 0.2138, + "step": 14717 + }, + { + "epoch": 1.386495843244389, + "grad_norm": 0.6150261163711548, + "learning_rate": 4.457409202474033e-06, + "loss": 0.1928, + "step": 14718 + }, + { + "epoch": 1.3865900473375568, + "grad_norm": 0.7484995722770691, + "learning_rate": 4.456152410857828e-06, + "loss": 0.1955, + "step": 14719 + }, + { + "epoch": 1.3866842514307247, + "grad_norm": 0.6253624558448792, + "learning_rate": 4.454895745649929e-06, + "loss": 0.1983, + "step": 14720 + }, + { + "epoch": 1.3867784555238925, + "grad_norm": 0.6854663491249084, + "learning_rate": 4.453639206878982e-06, + "loss": 0.2288, + "step": 14721 + }, + { + "epoch": 1.3868726596170604, + "grad_norm": 0.6492166519165039, + "learning_rate": 4.452382794573636e-06, + "loss": 0.193, + "step": 14722 + }, + { + "epoch": 1.3869668637102281, + "grad_norm": 0.6651354432106018, + "learning_rate": 4.451126508762542e-06, + "loss": 0.1984, + "step": 14723 + }, + { + "epoch": 1.387061067803396, + "grad_norm": 0.7173478603363037, + "learning_rate": 4.449870349474349e-06, + "loss": 0.2114, + "step": 14724 + }, + { + "epoch": 1.3871552718965638, + "grad_norm": 0.6437345147132874, + "learning_rate": 4.448614316737692e-06, + "loss": 0.2038, + "step": 14725 + }, + { + "epoch": 1.3872494759897318, + "grad_norm": 0.8587262630462646, + "learning_rate": 4.4473584105812125e-06, + "loss": 0.1895, + "step": 14726 + }, + { + "epoch": 1.3873436800828995, + "grad_norm": 0.6873801946640015, + "learning_rate": 4.446102631033553e-06, + "loss": 0.1808, + "step": 14727 + }, + { + "epoch": 1.3874378841760675, + "grad_norm": 0.6526833772659302, + "learning_rate": 4.444846978123339e-06, + "loss": 0.1973, + "step": 14728 + }, + { + "epoch": 1.3875320882692352, + "grad_norm": 0.745194137096405, + "learning_rate": 4.4435914518792055e-06, + "loss": 0.2133, + "step": 14729 + }, + { + "epoch": 1.3876262923624032, + "grad_norm": 1.006110429763794, + "learning_rate": 4.4423360523297835e-06, + "loss": 0.2134, + "step": 14730 + }, + { + "epoch": 1.387720496455571, + "grad_norm": 0.7415178418159485, + "learning_rate": 4.44108077950369e-06, + "loss": 0.2155, + "step": 14731 + }, + { + "epoch": 1.387814700548739, + "grad_norm": 0.6302402019500732, + "learning_rate": 4.439825633429558e-06, + "loss": 0.196, + "step": 14732 + }, + { + "epoch": 1.3879089046419066, + "grad_norm": 0.759882926940918, + "learning_rate": 4.438570614135994e-06, + "loss": 0.1934, + "step": 14733 + }, + { + "epoch": 1.3880031087350746, + "grad_norm": 0.7392933368682861, + "learning_rate": 4.437315721651623e-06, + "loss": 0.2178, + "step": 14734 + }, + { + "epoch": 1.3880973128282423, + "grad_norm": 0.6102693676948547, + "learning_rate": 4.4360609560050585e-06, + "loss": 0.1844, + "step": 14735 + }, + { + "epoch": 1.3881915169214103, + "grad_norm": 0.5957104563713074, + "learning_rate": 4.434806317224905e-06, + "loss": 0.1815, + "step": 14736 + }, + { + "epoch": 1.388285721014578, + "grad_norm": 0.6966644525527954, + "learning_rate": 4.433551805339779e-06, + "loss": 0.2062, + "step": 14737 + }, + { + "epoch": 1.388379925107746, + "grad_norm": 0.5965523719787598, + "learning_rate": 4.4322974203782776e-06, + "loss": 0.1767, + "step": 14738 + }, + { + "epoch": 1.3884741292009137, + "grad_norm": 0.6402685642242432, + "learning_rate": 4.431043162369005e-06, + "loss": 0.1839, + "step": 14739 + }, + { + "epoch": 1.3885683332940817, + "grad_norm": 0.7522590160369873, + "learning_rate": 4.429789031340565e-06, + "loss": 0.2459, + "step": 14740 + }, + { + "epoch": 1.3886625373872494, + "grad_norm": 0.6181792616844177, + "learning_rate": 4.428535027321544e-06, + "loss": 0.1985, + "step": 14741 + }, + { + "epoch": 1.3887567414804174, + "grad_norm": 0.6564207673072815, + "learning_rate": 4.427281150340547e-06, + "loss": 0.1993, + "step": 14742 + }, + { + "epoch": 1.3888509455735851, + "grad_norm": 0.6626052856445312, + "learning_rate": 4.426027400426152e-06, + "loss": 0.227, + "step": 14743 + }, + { + "epoch": 1.388945149666753, + "grad_norm": 0.6792217493057251, + "learning_rate": 4.424773777606955e-06, + "loss": 0.2034, + "step": 14744 + }, + { + "epoch": 1.3890393537599208, + "grad_norm": 0.6681301593780518, + "learning_rate": 4.42352028191154e-06, + "loss": 0.2135, + "step": 14745 + }, + { + "epoch": 1.3891335578530888, + "grad_norm": 0.6481303572654724, + "learning_rate": 4.422266913368484e-06, + "loss": 0.1823, + "step": 14746 + }, + { + "epoch": 1.3892277619462565, + "grad_norm": 0.6512411832809448, + "learning_rate": 4.4210136720063665e-06, + "loss": 0.1708, + "step": 14747 + }, + { + "epoch": 1.3893219660394245, + "grad_norm": 0.5567074418067932, + "learning_rate": 4.4197605578537715e-06, + "loss": 0.1779, + "step": 14748 + }, + { + "epoch": 1.3894161701325922, + "grad_norm": 0.6283254027366638, + "learning_rate": 4.418507570939261e-06, + "loss": 0.1952, + "step": 14749 + }, + { + "epoch": 1.3895103742257602, + "grad_norm": 0.6755726933479309, + "learning_rate": 4.41725471129141e-06, + "loss": 0.2124, + "step": 14750 + }, + { + "epoch": 1.389604578318928, + "grad_norm": 0.6136601567268372, + "learning_rate": 4.416001978938789e-06, + "loss": 0.1875, + "step": 14751 + }, + { + "epoch": 1.3896987824120959, + "grad_norm": 0.6659301519393921, + "learning_rate": 4.414749373909959e-06, + "loss": 0.206, + "step": 14752 + }, + { + "epoch": 1.3897929865052636, + "grad_norm": 0.6104385256767273, + "learning_rate": 4.413496896233474e-06, + "loss": 0.1812, + "step": 14753 + }, + { + "epoch": 1.3898871905984316, + "grad_norm": 0.7210302948951721, + "learning_rate": 4.412244545937906e-06, + "loss": 0.1923, + "step": 14754 + }, + { + "epoch": 1.3899813946915993, + "grad_norm": 0.6369888782501221, + "learning_rate": 4.4109923230518045e-06, + "loss": 0.2016, + "step": 14755 + }, + { + "epoch": 1.3900755987847673, + "grad_norm": 0.6671775579452515, + "learning_rate": 4.409740227603715e-06, + "loss": 0.206, + "step": 14756 + }, + { + "epoch": 1.390169802877935, + "grad_norm": 0.6954997181892395, + "learning_rate": 4.4084882596222e-06, + "loss": 0.2128, + "step": 14757 + }, + { + "epoch": 1.390264006971103, + "grad_norm": 0.6379964351654053, + "learning_rate": 4.4072364191358006e-06, + "loss": 0.2058, + "step": 14758 + }, + { + "epoch": 1.3903582110642707, + "grad_norm": 0.6217644810676575, + "learning_rate": 4.405984706173052e-06, + "loss": 0.171, + "step": 14759 + }, + { + "epoch": 1.3904524151574387, + "grad_norm": 1.2221981287002563, + "learning_rate": 4.404733120762512e-06, + "loss": 0.1833, + "step": 14760 + }, + { + "epoch": 1.3905466192506064, + "grad_norm": 0.6969944834709167, + "learning_rate": 4.4034816629327095e-06, + "loss": 0.2197, + "step": 14761 + }, + { + "epoch": 1.3906408233437744, + "grad_norm": 0.8289557099342346, + "learning_rate": 4.402230332712176e-06, + "loss": 0.209, + "step": 14762 + }, + { + "epoch": 1.390735027436942, + "grad_norm": 0.9347938299179077, + "learning_rate": 4.400979130129449e-06, + "loss": 0.2159, + "step": 14763 + }, + { + "epoch": 1.39082923153011, + "grad_norm": 0.7640525102615356, + "learning_rate": 4.39972805521306e-06, + "loss": 0.1962, + "step": 14764 + }, + { + "epoch": 1.3909234356232778, + "grad_norm": 0.6633443236351013, + "learning_rate": 4.398477107991529e-06, + "loss": 0.1916, + "step": 14765 + }, + { + "epoch": 1.3910176397164458, + "grad_norm": 0.6027562618255615, + "learning_rate": 4.397226288493384e-06, + "loss": 0.1719, + "step": 14766 + }, + { + "epoch": 1.3911118438096135, + "grad_norm": 0.6049935817718506, + "learning_rate": 4.3959755967471465e-06, + "loss": 0.1979, + "step": 14767 + }, + { + "epoch": 1.3912060479027812, + "grad_norm": 0.736307680606842, + "learning_rate": 4.394725032781328e-06, + "loss": 0.2163, + "step": 14768 + }, + { + "epoch": 1.3913002519959492, + "grad_norm": 0.6756777167320251, + "learning_rate": 4.393474596624449e-06, + "loss": 0.2139, + "step": 14769 + }, + { + "epoch": 1.3913944560891172, + "grad_norm": 0.6903586387634277, + "learning_rate": 4.3922242883050226e-06, + "loss": 0.2032, + "step": 14770 + }, + { + "epoch": 1.391488660182285, + "grad_norm": 0.6733536124229431, + "learning_rate": 4.390974107851552e-06, + "loss": 0.1946, + "step": 14771 + }, + { + "epoch": 1.3915828642754526, + "grad_norm": 0.6567028164863586, + "learning_rate": 4.389724055292549e-06, + "loss": 0.205, + "step": 14772 + }, + { + "epoch": 1.3916770683686206, + "grad_norm": 0.6626735925674438, + "learning_rate": 4.388474130656512e-06, + "loss": 0.1845, + "step": 14773 + }, + { + "epoch": 1.3917712724617886, + "grad_norm": 0.668620765209198, + "learning_rate": 4.387224333971946e-06, + "loss": 0.2279, + "step": 14774 + }, + { + "epoch": 1.3918654765549563, + "grad_norm": 0.6802232265472412, + "learning_rate": 4.3859746652673405e-06, + "loss": 0.2005, + "step": 14775 + }, + { + "epoch": 1.391959680648124, + "grad_norm": 0.7972701787948608, + "learning_rate": 4.3847251245711965e-06, + "loss": 0.2036, + "step": 14776 + }, + { + "epoch": 1.392053884741292, + "grad_norm": 0.7019937038421631, + "learning_rate": 4.383475711912007e-06, + "loss": 0.1799, + "step": 14777 + }, + { + "epoch": 1.39214808883446, + "grad_norm": 0.672942042350769, + "learning_rate": 4.3822264273182536e-06, + "loss": 0.1762, + "step": 14778 + }, + { + "epoch": 1.3922422929276277, + "grad_norm": 0.6258542537689209, + "learning_rate": 4.380977270818426e-06, + "loss": 0.1916, + "step": 14779 + }, + { + "epoch": 1.3923364970207954, + "grad_norm": 0.6168133020401001, + "learning_rate": 4.379728242441011e-06, + "loss": 0.1858, + "step": 14780 + }, + { + "epoch": 1.3924307011139634, + "grad_norm": 0.640653669834137, + "learning_rate": 4.378479342214479e-06, + "loss": 0.1749, + "step": 14781 + }, + { + "epoch": 1.3925249052071313, + "grad_norm": 0.6405872106552124, + "learning_rate": 4.377230570167316e-06, + "loss": 0.1764, + "step": 14782 + }, + { + "epoch": 1.392619109300299, + "grad_norm": 0.6704461574554443, + "learning_rate": 4.375981926327988e-06, + "loss": 0.1984, + "step": 14783 + }, + { + "epoch": 1.3927133133934668, + "grad_norm": 0.6826492547988892, + "learning_rate": 4.374733410724969e-06, + "loss": 0.1943, + "step": 14784 + }, + { + "epoch": 1.3928075174866348, + "grad_norm": 0.8903515934944153, + "learning_rate": 4.373485023386733e-06, + "loss": 0.2058, + "step": 14785 + }, + { + "epoch": 1.3929017215798027, + "grad_norm": 0.6510199308395386, + "learning_rate": 4.3722367643417365e-06, + "loss": 0.1864, + "step": 14786 + }, + { + "epoch": 1.3929959256729705, + "grad_norm": 0.641514241695404, + "learning_rate": 4.370988633618445e-06, + "loss": 0.2059, + "step": 14787 + }, + { + "epoch": 1.3930901297661382, + "grad_norm": 0.6693086624145508, + "learning_rate": 4.369740631245321e-06, + "loss": 0.2201, + "step": 14788 + }, + { + "epoch": 1.3931843338593062, + "grad_norm": 0.5776128768920898, + "learning_rate": 4.368492757250814e-06, + "loss": 0.1851, + "step": 14789 + }, + { + "epoch": 1.3932785379524741, + "grad_norm": 0.6831786632537842, + "learning_rate": 4.367245011663383e-06, + "loss": 0.1958, + "step": 14790 + }, + { + "epoch": 1.3933727420456419, + "grad_norm": 0.6280180811882019, + "learning_rate": 4.365997394511479e-06, + "loss": 0.1823, + "step": 14791 + }, + { + "epoch": 1.3934669461388096, + "grad_norm": 0.6874216794967651, + "learning_rate": 4.364749905823549e-06, + "loss": 0.223, + "step": 14792 + }, + { + "epoch": 1.3935611502319776, + "grad_norm": 0.648345947265625, + "learning_rate": 4.3635025456280275e-06, + "loss": 0.2489, + "step": 14793 + }, + { + "epoch": 1.3936553543251455, + "grad_norm": 0.6531689167022705, + "learning_rate": 4.3622553139533726e-06, + "loss": 0.1857, + "step": 14794 + }, + { + "epoch": 1.3937495584183133, + "grad_norm": 0.6035807132720947, + "learning_rate": 4.361008210828016e-06, + "loss": 0.185, + "step": 14795 + }, + { + "epoch": 1.393843762511481, + "grad_norm": 0.6098361015319824, + "learning_rate": 4.3597612362803854e-06, + "loss": 0.1778, + "step": 14796 + }, + { + "epoch": 1.393937966604649, + "grad_norm": 0.6855610609054565, + "learning_rate": 4.358514390338929e-06, + "loss": 0.2105, + "step": 14797 + }, + { + "epoch": 1.394032170697817, + "grad_norm": 0.5912315249443054, + "learning_rate": 4.357267673032069e-06, + "loss": 0.2294, + "step": 14798 + }, + { + "epoch": 1.3941263747909847, + "grad_norm": 0.6672865152359009, + "learning_rate": 4.35602108438823e-06, + "loss": 0.1979, + "step": 14799 + }, + { + "epoch": 1.3942205788841524, + "grad_norm": 0.7342973351478577, + "learning_rate": 4.354774624435838e-06, + "loss": 0.2132, + "step": 14800 + }, + { + "epoch": 1.3943147829773204, + "grad_norm": 0.6629995107650757, + "learning_rate": 4.353528293203318e-06, + "loss": 0.2207, + "step": 14801 + }, + { + "epoch": 1.3944089870704883, + "grad_norm": 0.6592957377433777, + "learning_rate": 4.352282090719083e-06, + "loss": 0.1848, + "step": 14802 + }, + { + "epoch": 1.394503191163656, + "grad_norm": 0.6145618557929993, + "learning_rate": 4.351036017011551e-06, + "loss": 0.1883, + "step": 14803 + }, + { + "epoch": 1.3945973952568238, + "grad_norm": 0.5742574334144592, + "learning_rate": 4.349790072109136e-06, + "loss": 0.1716, + "step": 14804 + }, + { + "epoch": 1.3946915993499918, + "grad_norm": 0.6625123620033264, + "learning_rate": 4.348544256040244e-06, + "loss": 0.2041, + "step": 14805 + }, + { + "epoch": 1.3947858034431597, + "grad_norm": 0.6488248705863953, + "learning_rate": 4.347298568833281e-06, + "loss": 0.2035, + "step": 14806 + }, + { + "epoch": 1.3948800075363275, + "grad_norm": 0.747687816619873, + "learning_rate": 4.346053010516657e-06, + "loss": 0.223, + "step": 14807 + }, + { + "epoch": 1.3949742116294952, + "grad_norm": 0.6713259220123291, + "learning_rate": 4.344807581118765e-06, + "loss": 0.1567, + "step": 14808 + }, + { + "epoch": 1.3950684157226632, + "grad_norm": 0.6101518869400024, + "learning_rate": 4.343562280668006e-06, + "loss": 0.2271, + "step": 14809 + }, + { + "epoch": 1.3951626198158311, + "grad_norm": 0.6190088987350464, + "learning_rate": 4.34231710919278e-06, + "loss": 0.1702, + "step": 14810 + }, + { + "epoch": 1.3952568239089989, + "grad_norm": 0.6656995415687561, + "learning_rate": 4.341072066721468e-06, + "loss": 0.1987, + "step": 14811 + }, + { + "epoch": 1.3953510280021666, + "grad_norm": 0.6109845638275146, + "learning_rate": 4.339827153282469e-06, + "loss": 0.1997, + "step": 14812 + }, + { + "epoch": 1.3954452320953346, + "grad_norm": 0.6390396952629089, + "learning_rate": 4.338582368904161e-06, + "loss": 0.1853, + "step": 14813 + }, + { + "epoch": 1.3955394361885025, + "grad_norm": 0.6447051763534546, + "learning_rate": 4.337337713614933e-06, + "loss": 0.2291, + "step": 14814 + }, + { + "epoch": 1.3956336402816703, + "grad_norm": 0.6061784029006958, + "learning_rate": 4.33609318744316e-06, + "loss": 0.196, + "step": 14815 + }, + { + "epoch": 1.395727844374838, + "grad_norm": 0.7119218707084656, + "learning_rate": 4.334848790417222e-06, + "loss": 0.2341, + "step": 14816 + }, + { + "epoch": 1.395822048468006, + "grad_norm": 0.7430229187011719, + "learning_rate": 4.333604522565496e-06, + "loss": 0.2351, + "step": 14817 + }, + { + "epoch": 1.395916252561174, + "grad_norm": 0.790756106376648, + "learning_rate": 4.332360383916347e-06, + "loss": 0.1935, + "step": 14818 + }, + { + "epoch": 1.3960104566543416, + "grad_norm": 0.5808947086334229, + "learning_rate": 4.331116374498145e-06, + "loss": 0.2016, + "step": 14819 + }, + { + "epoch": 1.3961046607475094, + "grad_norm": 0.6004825830459595, + "learning_rate": 4.329872494339262e-06, + "loss": 0.1751, + "step": 14820 + }, + { + "epoch": 1.3961988648406773, + "grad_norm": 0.6784185171127319, + "learning_rate": 4.328628743468051e-06, + "loss": 0.2106, + "step": 14821 + }, + { + "epoch": 1.3962930689338453, + "grad_norm": 0.6481770873069763, + "learning_rate": 4.32738512191288e-06, + "loss": 0.2326, + "step": 14822 + }, + { + "epoch": 1.396387273027013, + "grad_norm": 0.6379073858261108, + "learning_rate": 4.326141629702096e-06, + "loss": 0.2036, + "step": 14823 + }, + { + "epoch": 1.3964814771201808, + "grad_norm": 0.6903553009033203, + "learning_rate": 4.3248982668640585e-06, + "loss": 0.2207, + "step": 14824 + }, + { + "epoch": 1.3965756812133487, + "grad_norm": 0.7104666829109192, + "learning_rate": 4.32365503342712e-06, + "loss": 0.205, + "step": 14825 + }, + { + "epoch": 1.3966698853065167, + "grad_norm": 0.7103066444396973, + "learning_rate": 4.322411929419623e-06, + "loss": 0.1741, + "step": 14826 + }, + { + "epoch": 1.3967640893996844, + "grad_norm": 0.759509265422821, + "learning_rate": 4.321168954869913e-06, + "loss": 0.2221, + "step": 14827 + }, + { + "epoch": 1.3968582934928522, + "grad_norm": 0.6699749231338501, + "learning_rate": 4.319926109806339e-06, + "loss": 0.2105, + "step": 14828 + }, + { + "epoch": 1.3969524975860201, + "grad_norm": 0.642577588558197, + "learning_rate": 4.318683394257229e-06, + "loss": 0.1994, + "step": 14829 + }, + { + "epoch": 1.397046701679188, + "grad_norm": 0.6244491934776306, + "learning_rate": 4.317440808250924e-06, + "loss": 0.1963, + "step": 14830 + }, + { + "epoch": 1.3971409057723558, + "grad_norm": 0.6352891325950623, + "learning_rate": 4.316198351815761e-06, + "loss": 0.2109, + "step": 14831 + }, + { + "epoch": 1.3972351098655236, + "grad_norm": 0.6272392868995667, + "learning_rate": 4.314956024980066e-06, + "loss": 0.2042, + "step": 14832 + }, + { + "epoch": 1.3973293139586915, + "grad_norm": 0.711333692073822, + "learning_rate": 4.313713827772158e-06, + "loss": 0.1999, + "step": 14833 + }, + { + "epoch": 1.3974235180518593, + "grad_norm": 0.7087350487709045, + "learning_rate": 4.312471760220376e-06, + "loss": 0.2068, + "step": 14834 + }, + { + "epoch": 1.3975177221450272, + "grad_norm": 0.8110741972923279, + "learning_rate": 4.3112298223530335e-06, + "loss": 0.2539, + "step": 14835 + }, + { + "epoch": 1.397611926238195, + "grad_norm": 0.7614244222640991, + "learning_rate": 4.309988014198442e-06, + "loss": 0.2175, + "step": 14836 + }, + { + "epoch": 1.397706130331363, + "grad_norm": 0.6266860365867615, + "learning_rate": 4.30874633578493e-06, + "loss": 0.179, + "step": 14837 + }, + { + "epoch": 1.3978003344245307, + "grad_norm": 0.7377480268478394, + "learning_rate": 4.307504787140805e-06, + "loss": 0.2445, + "step": 14838 + }, + { + "epoch": 1.3978945385176986, + "grad_norm": 0.755540668964386, + "learning_rate": 4.306263368294369e-06, + "loss": 0.2168, + "step": 14839 + }, + { + "epoch": 1.3979887426108664, + "grad_norm": 0.7447509765625, + "learning_rate": 4.305022079273935e-06, + "loss": 0.2055, + "step": 14840 + }, + { + "epoch": 1.3980829467040343, + "grad_norm": 0.6671401262283325, + "learning_rate": 4.303780920107807e-06, + "loss": 0.1839, + "step": 14841 + }, + { + "epoch": 1.398177150797202, + "grad_norm": 0.7704858183860779, + "learning_rate": 4.302539890824282e-06, + "loss": 0.2014, + "step": 14842 + }, + { + "epoch": 1.39827135489037, + "grad_norm": 0.6026281714439392, + "learning_rate": 4.3012989914516575e-06, + "loss": 0.1825, + "step": 14843 + }, + { + "epoch": 1.3983655589835378, + "grad_norm": 0.7292162775993347, + "learning_rate": 4.300058222018233e-06, + "loss": 0.2114, + "step": 14844 + }, + { + "epoch": 1.3984597630767057, + "grad_norm": 0.6992906928062439, + "learning_rate": 4.2988175825522924e-06, + "loss": 0.1925, + "step": 14845 + }, + { + "epoch": 1.3985539671698735, + "grad_norm": 0.551853358745575, + "learning_rate": 4.297577073082129e-06, + "loss": 0.1658, + "step": 14846 + }, + { + "epoch": 1.3986481712630414, + "grad_norm": 0.772172212600708, + "learning_rate": 4.296336693636029e-06, + "loss": 0.221, + "step": 14847 + }, + { + "epoch": 1.3987423753562092, + "grad_norm": 0.7460121512413025, + "learning_rate": 4.295096444242272e-06, + "loss": 0.2145, + "step": 14848 + }, + { + "epoch": 1.3988365794493771, + "grad_norm": 0.6584548354148865, + "learning_rate": 4.293856324929137e-06, + "loss": 0.1972, + "step": 14849 + }, + { + "epoch": 1.3989307835425449, + "grad_norm": 0.7437769174575806, + "learning_rate": 4.292616335724908e-06, + "loss": 0.2435, + "step": 14850 + }, + { + "epoch": 1.3990249876357128, + "grad_norm": 0.7641134858131409, + "learning_rate": 4.291376476657847e-06, + "loss": 0.2023, + "step": 14851 + }, + { + "epoch": 1.3991191917288806, + "grad_norm": 0.6327671408653259, + "learning_rate": 4.290136747756236e-06, + "loss": 0.2069, + "step": 14852 + }, + { + "epoch": 1.3992133958220485, + "grad_norm": 0.582234799861908, + "learning_rate": 4.288897149048334e-06, + "loss": 0.1903, + "step": 14853 + }, + { + "epoch": 1.3993075999152162, + "grad_norm": 0.6909761428833008, + "learning_rate": 4.287657680562412e-06, + "loss": 0.2087, + "step": 14854 + }, + { + "epoch": 1.3994018040083842, + "grad_norm": 0.7836440205574036, + "learning_rate": 4.286418342326727e-06, + "loss": 0.2565, + "step": 14855 + }, + { + "epoch": 1.399496008101552, + "grad_norm": 0.6624511480331421, + "learning_rate": 4.285179134369539e-06, + "loss": 0.1986, + "step": 14856 + }, + { + "epoch": 1.39959021219472, + "grad_norm": 0.645569920539856, + "learning_rate": 4.283940056719109e-06, + "loss": 0.2202, + "step": 14857 + }, + { + "epoch": 1.3996844162878876, + "grad_norm": 0.6882280111312866, + "learning_rate": 4.282701109403683e-06, + "loss": 0.1751, + "step": 14858 + }, + { + "epoch": 1.3997786203810556, + "grad_norm": 0.6280230283737183, + "learning_rate": 4.281462292451512e-06, + "loss": 0.1894, + "step": 14859 + }, + { + "epoch": 1.3998728244742233, + "grad_norm": 0.7303670644760132, + "learning_rate": 4.28022360589085e-06, + "loss": 0.2032, + "step": 14860 + }, + { + "epoch": 1.3999670285673913, + "grad_norm": 0.6263623237609863, + "learning_rate": 4.278985049749931e-06, + "loss": 0.1623, + "step": 14861 + }, + { + "epoch": 1.400061232660559, + "grad_norm": 0.7629262208938599, + "learning_rate": 4.277746624057003e-06, + "loss": 0.2052, + "step": 14862 + }, + { + "epoch": 1.400155436753727, + "grad_norm": 0.6788503527641296, + "learning_rate": 4.2765083288403e-06, + "loss": 0.1945, + "step": 14863 + }, + { + "epoch": 1.4002496408468947, + "grad_norm": 0.6659932136535645, + "learning_rate": 4.275270164128057e-06, + "loss": 0.2245, + "step": 14864 + }, + { + "epoch": 1.4003438449400627, + "grad_norm": 0.8249582052230835, + "learning_rate": 4.274032129948512e-06, + "loss": 0.1851, + "step": 14865 + }, + { + "epoch": 1.4004380490332304, + "grad_norm": 0.6572917103767395, + "learning_rate": 4.272794226329887e-06, + "loss": 0.1768, + "step": 14866 + }, + { + "epoch": 1.4005322531263984, + "grad_norm": 0.6544477939605713, + "learning_rate": 4.271556453300411e-06, + "loss": 0.2145, + "step": 14867 + }, + { + "epoch": 1.4006264572195661, + "grad_norm": 0.6506238579750061, + "learning_rate": 4.2703188108883096e-06, + "loss": 0.2071, + "step": 14868 + }, + { + "epoch": 1.400720661312734, + "grad_norm": 0.6459051966667175, + "learning_rate": 4.269081299121797e-06, + "loss": 0.2219, + "step": 14869 + }, + { + "epoch": 1.4008148654059018, + "grad_norm": 0.661430299282074, + "learning_rate": 4.267843918029094e-06, + "loss": 0.1978, + "step": 14870 + }, + { + "epoch": 1.4009090694990698, + "grad_norm": 0.6441071629524231, + "learning_rate": 4.266606667638418e-06, + "loss": 0.1898, + "step": 14871 + }, + { + "epoch": 1.4010032735922375, + "grad_norm": 0.6597195863723755, + "learning_rate": 4.265369547977978e-06, + "loss": 0.2211, + "step": 14872 + }, + { + "epoch": 1.4010974776854055, + "grad_norm": 0.6670068502426147, + "learning_rate": 4.264132559075972e-06, + "loss": 0.1806, + "step": 14873 + }, + { + "epoch": 1.4011916817785732, + "grad_norm": 0.7369149327278137, + "learning_rate": 4.262895700960623e-06, + "loss": 0.1813, + "step": 14874 + }, + { + "epoch": 1.4012858858717412, + "grad_norm": 0.6494218707084656, + "learning_rate": 4.261658973660124e-06, + "loss": 0.2155, + "step": 14875 + }, + { + "epoch": 1.401380089964909, + "grad_norm": 0.6344252824783325, + "learning_rate": 4.260422377202668e-06, + "loss": 0.1963, + "step": 14876 + }, + { + "epoch": 1.4014742940580769, + "grad_norm": 0.7019323110580444, + "learning_rate": 4.259185911616465e-06, + "loss": 0.2249, + "step": 14877 + }, + { + "epoch": 1.4015684981512446, + "grad_norm": 0.6826676726341248, + "learning_rate": 4.257949576929702e-06, + "loss": 0.1947, + "step": 14878 + }, + { + "epoch": 1.4016627022444126, + "grad_norm": 0.6142680644989014, + "learning_rate": 4.256713373170565e-06, + "loss": 0.1994, + "step": 14879 + }, + { + "epoch": 1.4017569063375803, + "grad_norm": 0.6876006722450256, + "learning_rate": 4.255477300367246e-06, + "loss": 0.2296, + "step": 14880 + }, + { + "epoch": 1.4018511104307483, + "grad_norm": 0.7480155825614929, + "learning_rate": 4.2542413585479305e-06, + "loss": 0.2161, + "step": 14881 + }, + { + "epoch": 1.401945314523916, + "grad_norm": 0.6555769443511963, + "learning_rate": 4.253005547740796e-06, + "loss": 0.1943, + "step": 14882 + }, + { + "epoch": 1.402039518617084, + "grad_norm": 0.6414076685905457, + "learning_rate": 4.251769867974022e-06, + "loss": 0.1755, + "step": 14883 + }, + { + "epoch": 1.4021337227102517, + "grad_norm": 0.6698834300041199, + "learning_rate": 4.250534319275789e-06, + "loss": 0.2087, + "step": 14884 + }, + { + "epoch": 1.4022279268034197, + "grad_norm": 0.6921601891517639, + "learning_rate": 4.249298901674261e-06, + "loss": 0.2169, + "step": 14885 + }, + { + "epoch": 1.4023221308965874, + "grad_norm": 0.6521345973014832, + "learning_rate": 4.248063615197613e-06, + "loss": 0.1967, + "step": 14886 + }, + { + "epoch": 1.4024163349897554, + "grad_norm": 0.6779271960258484, + "learning_rate": 4.246828459874013e-06, + "loss": 0.2298, + "step": 14887 + }, + { + "epoch": 1.4025105390829231, + "grad_norm": 0.6023015975952148, + "learning_rate": 4.2455934357316176e-06, + "loss": 0.1777, + "step": 14888 + }, + { + "epoch": 1.402604743176091, + "grad_norm": 0.6468263864517212, + "learning_rate": 4.244358542798591e-06, + "loss": 0.1821, + "step": 14889 + }, + { + "epoch": 1.4026989472692588, + "grad_norm": 0.6531599760055542, + "learning_rate": 4.243123781103096e-06, + "loss": 0.197, + "step": 14890 + }, + { + "epoch": 1.4027931513624268, + "grad_norm": 0.6184870600700378, + "learning_rate": 4.241889150673281e-06, + "loss": 0.1917, + "step": 14891 + }, + { + "epoch": 1.4028873554555945, + "grad_norm": 0.6046724915504456, + "learning_rate": 4.240654651537294e-06, + "loss": 0.1834, + "step": 14892 + }, + { + "epoch": 1.4029815595487625, + "grad_norm": 0.644598126411438, + "learning_rate": 4.239420283723289e-06, + "loss": 0.2027, + "step": 14893 + }, + { + "epoch": 1.4030757636419302, + "grad_norm": 0.6100983619689941, + "learning_rate": 4.238186047259414e-06, + "loss": 0.1999, + "step": 14894 + }, + { + "epoch": 1.4031699677350982, + "grad_norm": 0.610946774482727, + "learning_rate": 4.236951942173803e-06, + "loss": 0.1955, + "step": 14895 + }, + { + "epoch": 1.403264171828266, + "grad_norm": 0.6465967893600464, + "learning_rate": 4.235717968494601e-06, + "loss": 0.1841, + "step": 14896 + }, + { + "epoch": 1.4033583759214339, + "grad_norm": 0.5939137935638428, + "learning_rate": 4.234484126249949e-06, + "loss": 0.1968, + "step": 14897 + }, + { + "epoch": 1.4034525800146016, + "grad_norm": 0.7123169898986816, + "learning_rate": 4.233250415467971e-06, + "loss": 0.2198, + "step": 14898 + }, + { + "epoch": 1.4035467841077696, + "grad_norm": 0.6703714728355408, + "learning_rate": 4.232016836176802e-06, + "loss": 0.237, + "step": 14899 + }, + { + "epoch": 1.4036409882009373, + "grad_norm": 0.7021177411079407, + "learning_rate": 4.230783388404573e-06, + "loss": 0.2179, + "step": 14900 + }, + { + "epoch": 1.4037351922941053, + "grad_norm": 0.6722543835639954, + "learning_rate": 4.229550072179401e-06, + "loss": 0.2293, + "step": 14901 + }, + { + "epoch": 1.403829396387273, + "grad_norm": 0.6194466352462769, + "learning_rate": 4.228316887529416e-06, + "loss": 0.1964, + "step": 14902 + }, + { + "epoch": 1.4039236004804407, + "grad_norm": 0.7914484143257141, + "learning_rate": 4.2270838344827285e-06, + "loss": 0.2117, + "step": 14903 + }, + { + "epoch": 1.4040178045736087, + "grad_norm": 0.6260371804237366, + "learning_rate": 4.225850913067457e-06, + "loss": 0.1961, + "step": 14904 + }, + { + "epoch": 1.4041120086667767, + "grad_norm": 0.5706886649131775, + "learning_rate": 4.224618123311718e-06, + "loss": 0.1881, + "step": 14905 + }, + { + "epoch": 1.4042062127599444, + "grad_norm": 0.6768814325332642, + "learning_rate": 4.2233854652436145e-06, + "loss": 0.1978, + "step": 14906 + }, + { + "epoch": 1.4043004168531121, + "grad_norm": 0.6236286163330078, + "learning_rate": 4.222152938891255e-06, + "loss": 0.2103, + "step": 14907 + }, + { + "epoch": 1.40439462094628, + "grad_norm": 0.6196622252464294, + "learning_rate": 4.2209205442827494e-06, + "loss": 0.1989, + "step": 14908 + }, + { + "epoch": 1.404488825039448, + "grad_norm": 0.6381906270980835, + "learning_rate": 4.219688281446188e-06, + "loss": 0.2065, + "step": 14909 + }, + { + "epoch": 1.4045830291326158, + "grad_norm": 0.6047495603561401, + "learning_rate": 4.218456150409673e-06, + "loss": 0.1887, + "step": 14910 + }, + { + "epoch": 1.4046772332257835, + "grad_norm": 0.5788443684577942, + "learning_rate": 4.217224151201303e-06, + "loss": 0.1988, + "step": 14911 + }, + { + "epoch": 1.4047714373189515, + "grad_norm": 0.698949933052063, + "learning_rate": 4.2159922838491675e-06, + "loss": 0.1992, + "step": 14912 + }, + { + "epoch": 1.4048656414121194, + "grad_norm": 0.667930543422699, + "learning_rate": 4.214760548381344e-06, + "loss": 0.1912, + "step": 14913 + }, + { + "epoch": 1.4049598455052872, + "grad_norm": 0.7808729410171509, + "learning_rate": 4.2135289448259346e-06, + "loss": 0.1954, + "step": 14914 + }, + { + "epoch": 1.405054049598455, + "grad_norm": 0.6170253157615662, + "learning_rate": 4.212297473211014e-06, + "loss": 0.1929, + "step": 14915 + }, + { + "epoch": 1.4051482536916229, + "grad_norm": 0.6978352665901184, + "learning_rate": 4.2110661335646585e-06, + "loss": 0.2145, + "step": 14916 + }, + { + "epoch": 1.4052424577847908, + "grad_norm": 0.6556413173675537, + "learning_rate": 4.2098349259149475e-06, + "loss": 0.2212, + "step": 14917 + }, + { + "epoch": 1.4053366618779586, + "grad_norm": 0.6603764891624451, + "learning_rate": 4.208603850289958e-06, + "loss": 0.231, + "step": 14918 + }, + { + "epoch": 1.4054308659711263, + "grad_norm": 0.6079287528991699, + "learning_rate": 4.207372906717755e-06, + "loss": 0.1884, + "step": 14919 + }, + { + "epoch": 1.4055250700642943, + "grad_norm": 0.6266393661499023, + "learning_rate": 4.206142095226408e-06, + "loss": 0.1858, + "step": 14920 + }, + { + "epoch": 1.4056192741574622, + "grad_norm": 0.6841825246810913, + "learning_rate": 4.204911415843985e-06, + "loss": 0.198, + "step": 14921 + }, + { + "epoch": 1.40571347825063, + "grad_norm": 0.6131460666656494, + "learning_rate": 4.2036808685985395e-06, + "loss": 0.2017, + "step": 14922 + }, + { + "epoch": 1.4058076823437977, + "grad_norm": 0.6430876851081848, + "learning_rate": 4.202450453518136e-06, + "loss": 0.213, + "step": 14923 + }, + { + "epoch": 1.4059018864369657, + "grad_norm": 0.5820490717887878, + "learning_rate": 4.201220170630831e-06, + "loss": 0.2016, + "step": 14924 + }, + { + "epoch": 1.4059960905301336, + "grad_norm": 0.6232588887214661, + "learning_rate": 4.1999900199646705e-06, + "loss": 0.1939, + "step": 14925 + }, + { + "epoch": 1.4060902946233014, + "grad_norm": 0.6412118673324585, + "learning_rate": 4.198760001547707e-06, + "loss": 0.2192, + "step": 14926 + }, + { + "epoch": 1.4061844987164691, + "grad_norm": 0.6292945742607117, + "learning_rate": 4.197530115407991e-06, + "loss": 0.1728, + "step": 14927 + }, + { + "epoch": 1.406278702809637, + "grad_norm": 0.7885642647743225, + "learning_rate": 4.196300361573559e-06, + "loss": 0.1836, + "step": 14928 + }, + { + "epoch": 1.406372906902805, + "grad_norm": 0.6131870150566101, + "learning_rate": 4.195070740072454e-06, + "loss": 0.19, + "step": 14929 + }, + { + "epoch": 1.4064671109959728, + "grad_norm": 0.681182861328125, + "learning_rate": 4.193841250932718e-06, + "loss": 0.1778, + "step": 14930 + }, + { + "epoch": 1.4065613150891405, + "grad_norm": 0.6163570880889893, + "learning_rate": 4.19261189418238e-06, + "loss": 0.1817, + "step": 14931 + }, + { + "epoch": 1.4066555191823085, + "grad_norm": 0.6059005260467529, + "learning_rate": 4.191382669849469e-06, + "loss": 0.2087, + "step": 14932 + }, + { + "epoch": 1.4067497232754764, + "grad_norm": 0.5996084809303284, + "learning_rate": 4.190153577962015e-06, + "loss": 0.1999, + "step": 14933 + }, + { + "epoch": 1.4068439273686442, + "grad_norm": 0.6310654282569885, + "learning_rate": 4.188924618548049e-06, + "loss": 0.1936, + "step": 14934 + }, + { + "epoch": 1.406938131461812, + "grad_norm": 0.6701446771621704, + "learning_rate": 4.187695791635585e-06, + "loss": 0.2152, + "step": 14935 + }, + { + "epoch": 1.4070323355549799, + "grad_norm": 0.6180370450019836, + "learning_rate": 4.186467097252646e-06, + "loss": 0.1728, + "step": 14936 + }, + { + "epoch": 1.4071265396481478, + "grad_norm": 0.6977247595787048, + "learning_rate": 4.185238535427253e-06, + "loss": 0.2078, + "step": 14937 + }, + { + "epoch": 1.4072207437413156, + "grad_norm": 0.7770658731460571, + "learning_rate": 4.184010106187409e-06, + "loss": 0.2163, + "step": 14938 + }, + { + "epoch": 1.4073149478344833, + "grad_norm": 0.6317363381385803, + "learning_rate": 4.182781809561129e-06, + "loss": 0.1835, + "step": 14939 + }, + { + "epoch": 1.4074091519276513, + "grad_norm": 0.707531213760376, + "learning_rate": 4.181553645576424e-06, + "loss": 0.2072, + "step": 14940 + }, + { + "epoch": 1.4075033560208192, + "grad_norm": 0.6822757124900818, + "learning_rate": 4.1803256142612914e-06, + "loss": 0.2022, + "step": 14941 + }, + { + "epoch": 1.407597560113987, + "grad_norm": 0.6395441293716431, + "learning_rate": 4.179097715643737e-06, + "loss": 0.1643, + "step": 14942 + }, + { + "epoch": 1.4076917642071547, + "grad_norm": 0.7331756353378296, + "learning_rate": 4.177869949751755e-06, + "loss": 0.226, + "step": 14943 + }, + { + "epoch": 1.4077859683003227, + "grad_norm": 0.6839967966079712, + "learning_rate": 4.176642316613342e-06, + "loss": 0.2189, + "step": 14944 + }, + { + "epoch": 1.4078801723934906, + "grad_norm": 0.6518542170524597, + "learning_rate": 4.175414816256494e-06, + "loss": 0.1798, + "step": 14945 + }, + { + "epoch": 1.4079743764866584, + "grad_norm": 0.6746199727058411, + "learning_rate": 4.174187448709192e-06, + "loss": 0.1834, + "step": 14946 + }, + { + "epoch": 1.408068580579826, + "grad_norm": 0.7846388816833496, + "learning_rate": 4.172960213999425e-06, + "loss": 0.2066, + "step": 14947 + }, + { + "epoch": 1.408162784672994, + "grad_norm": 0.6446897387504578, + "learning_rate": 4.171733112155182e-06, + "loss": 0.2383, + "step": 14948 + }, + { + "epoch": 1.408256988766162, + "grad_norm": 0.6351571679115295, + "learning_rate": 4.170506143204432e-06, + "loss": 0.1955, + "step": 14949 + }, + { + "epoch": 1.4083511928593297, + "grad_norm": 0.6581653356552124, + "learning_rate": 4.169279307175159e-06, + "loss": 0.1846, + "step": 14950 + }, + { + "epoch": 1.4084453969524975, + "grad_norm": 0.6995300650596619, + "learning_rate": 4.16805260409534e-06, + "loss": 0.2146, + "step": 14951 + }, + { + "epoch": 1.4085396010456654, + "grad_norm": 0.6600801348686218, + "learning_rate": 4.166826033992939e-06, + "loss": 0.2, + "step": 14952 + }, + { + "epoch": 1.4086338051388334, + "grad_norm": 0.690256655216217, + "learning_rate": 4.165599596895919e-06, + "loss": 0.196, + "step": 14953 + }, + { + "epoch": 1.4087280092320011, + "grad_norm": 0.6114471554756165, + "learning_rate": 4.164373292832258e-06, + "loss": 0.1857, + "step": 14954 + }, + { + "epoch": 1.4088222133251689, + "grad_norm": 0.6888704895973206, + "learning_rate": 4.163147121829911e-06, + "loss": 0.2215, + "step": 14955 + }, + { + "epoch": 1.4089164174183368, + "grad_norm": 0.7572583556175232, + "learning_rate": 4.161921083916833e-06, + "loss": 0.1904, + "step": 14956 + }, + { + "epoch": 1.4090106215115048, + "grad_norm": 0.6744598746299744, + "learning_rate": 4.160695179120983e-06, + "loss": 0.206, + "step": 14957 + }, + { + "epoch": 1.4091048256046725, + "grad_norm": 0.6713089346885681, + "learning_rate": 4.159469407470318e-06, + "loss": 0.188, + "step": 14958 + }, + { + "epoch": 1.4091990296978403, + "grad_norm": 0.6134618520736694, + "learning_rate": 4.158243768992778e-06, + "loss": 0.1675, + "step": 14959 + }, + { + "epoch": 1.4092932337910082, + "grad_norm": 0.6265523433685303, + "learning_rate": 4.1570182637163155e-06, + "loss": 0.1993, + "step": 14960 + }, + { + "epoch": 1.4093874378841762, + "grad_norm": 0.6656590104103088, + "learning_rate": 4.155792891668876e-06, + "loss": 0.1601, + "step": 14961 + }, + { + "epoch": 1.409481641977344, + "grad_norm": 0.6455593109130859, + "learning_rate": 4.154567652878394e-06, + "loss": 0.191, + "step": 14962 + }, + { + "epoch": 1.4095758460705117, + "grad_norm": 0.6836377382278442, + "learning_rate": 4.15334254737281e-06, + "loss": 0.1979, + "step": 14963 + }, + { + "epoch": 1.4096700501636796, + "grad_norm": 0.6486380696296692, + "learning_rate": 4.15211757518006e-06, + "loss": 0.2023, + "step": 14964 + }, + { + "epoch": 1.4097642542568476, + "grad_norm": 0.6408259868621826, + "learning_rate": 4.1508927363280705e-06, + "loss": 0.2027, + "step": 14965 + }, + { + "epoch": 1.4098584583500153, + "grad_norm": 0.6494690179824829, + "learning_rate": 4.149668030844772e-06, + "loss": 0.2095, + "step": 14966 + }, + { + "epoch": 1.409952662443183, + "grad_norm": 0.6200953722000122, + "learning_rate": 4.1484434587580935e-06, + "loss": 0.2054, + "step": 14967 + }, + { + "epoch": 1.410046866536351, + "grad_norm": 0.6189218759536743, + "learning_rate": 4.147219020095955e-06, + "loss": 0.1962, + "step": 14968 + }, + { + "epoch": 1.410141070629519, + "grad_norm": 0.6298625469207764, + "learning_rate": 4.145994714886266e-06, + "loss": 0.2122, + "step": 14969 + }, + { + "epoch": 1.4102352747226867, + "grad_norm": 0.7043928503990173, + "learning_rate": 4.144770543156959e-06, + "loss": 0.2267, + "step": 14970 + }, + { + "epoch": 1.4103294788158545, + "grad_norm": 0.7395245432853699, + "learning_rate": 4.143546504935938e-06, + "loss": 0.2588, + "step": 14971 + }, + { + "epoch": 1.4104236829090224, + "grad_norm": 0.6496203541755676, + "learning_rate": 4.1423226002511105e-06, + "loss": 0.1913, + "step": 14972 + }, + { + "epoch": 1.4105178870021902, + "grad_norm": 0.6287446022033691, + "learning_rate": 4.141098829130386e-06, + "loss": 0.2041, + "step": 14973 + }, + { + "epoch": 1.4106120910953581, + "grad_norm": 0.6247981786727905, + "learning_rate": 4.139875191601674e-06, + "loss": 0.1992, + "step": 14974 + }, + { + "epoch": 1.4107062951885259, + "grad_norm": 0.669303834438324, + "learning_rate": 4.1386516876928675e-06, + "loss": 0.207, + "step": 14975 + }, + { + "epoch": 1.4108004992816938, + "grad_norm": 0.7559048533439636, + "learning_rate": 4.137428317431866e-06, + "loss": 0.2009, + "step": 14976 + }, + { + "epoch": 1.4108947033748616, + "grad_norm": 0.6676762104034424, + "learning_rate": 4.136205080846569e-06, + "loss": 0.1957, + "step": 14977 + }, + { + "epoch": 1.4109889074680295, + "grad_norm": 0.6410189270973206, + "learning_rate": 4.134981977964862e-06, + "loss": 0.2169, + "step": 14978 + }, + { + "epoch": 1.4110831115611973, + "grad_norm": 0.6580454707145691, + "learning_rate": 4.133759008814636e-06, + "loss": 0.2029, + "step": 14979 + }, + { + "epoch": 1.4111773156543652, + "grad_norm": 0.6293115019798279, + "learning_rate": 4.13253617342378e-06, + "loss": 0.1742, + "step": 14980 + }, + { + "epoch": 1.411271519747533, + "grad_norm": 1.1270183324813843, + "learning_rate": 4.131313471820171e-06, + "loss": 0.2201, + "step": 14981 + }, + { + "epoch": 1.411365723840701, + "grad_norm": 0.6704361438751221, + "learning_rate": 4.130090904031694e-06, + "loss": 0.1931, + "step": 14982 + }, + { + "epoch": 1.4114599279338687, + "grad_norm": 0.6646503806114197, + "learning_rate": 4.128868470086218e-06, + "loss": 0.1955, + "step": 14983 + }, + { + "epoch": 1.4115541320270366, + "grad_norm": 0.6644883155822754, + "learning_rate": 4.127646170011621e-06, + "loss": 0.2071, + "step": 14984 + }, + { + "epoch": 1.4116483361202044, + "grad_norm": 0.6573548316955566, + "learning_rate": 4.1264240038357775e-06, + "loss": 0.1831, + "step": 14985 + }, + { + "epoch": 1.4117425402133723, + "grad_norm": 0.8367358446121216, + "learning_rate": 4.125201971586546e-06, + "loss": 0.2103, + "step": 14986 + }, + { + "epoch": 1.41183674430654, + "grad_norm": 0.7015900015830994, + "learning_rate": 4.123980073291796e-06, + "loss": 0.2075, + "step": 14987 + }, + { + "epoch": 1.411930948399708, + "grad_norm": 0.7227832078933716, + "learning_rate": 4.12275830897939e-06, + "loss": 0.1934, + "step": 14988 + }, + { + "epoch": 1.4120251524928757, + "grad_norm": 0.7276207804679871, + "learning_rate": 4.121536678677181e-06, + "loss": 0.1913, + "step": 14989 + }, + { + "epoch": 1.4121193565860437, + "grad_norm": 0.6449877023696899, + "learning_rate": 4.120315182413027e-06, + "loss": 0.2095, + "step": 14990 + }, + { + "epoch": 1.4122135606792114, + "grad_norm": 0.6663293242454529, + "learning_rate": 4.119093820214783e-06, + "loss": 0.2052, + "step": 14991 + }, + { + "epoch": 1.4123077647723794, + "grad_norm": 0.6059328317642212, + "learning_rate": 4.117872592110296e-06, + "loss": 0.1906, + "step": 14992 + }, + { + "epoch": 1.4124019688655471, + "grad_norm": 0.5791782140731812, + "learning_rate": 4.116651498127403e-06, + "loss": 0.1996, + "step": 14993 + }, + { + "epoch": 1.412496172958715, + "grad_norm": 0.6193673014640808, + "learning_rate": 4.115430538293962e-06, + "loss": 0.1535, + "step": 14994 + }, + { + "epoch": 1.4125903770518828, + "grad_norm": 0.7219436764717102, + "learning_rate": 4.114209712637806e-06, + "loss": 0.1959, + "step": 14995 + }, + { + "epoch": 1.4126845811450508, + "grad_norm": 0.6128940582275391, + "learning_rate": 4.112989021186766e-06, + "loss": 0.1865, + "step": 14996 + }, + { + "epoch": 1.4127787852382185, + "grad_norm": 1.6297779083251953, + "learning_rate": 4.111768463968682e-06, + "loss": 0.1732, + "step": 14997 + }, + { + "epoch": 1.4128729893313865, + "grad_norm": 0.635675311088562, + "learning_rate": 4.110548041011387e-06, + "loss": 0.1906, + "step": 14998 + }, + { + "epoch": 1.4129671934245542, + "grad_norm": 0.6329396963119507, + "learning_rate": 4.1093277523427e-06, + "loss": 0.1984, + "step": 14999 + }, + { + "epoch": 1.4130613975177222, + "grad_norm": 0.6555243134498596, + "learning_rate": 4.108107597990451e-06, + "loss": 0.2251, + "step": 15000 + }, + { + "epoch": 1.41315560161089, + "grad_norm": 0.5970786213874817, + "learning_rate": 4.106887577982464e-06, + "loss": 0.1892, + "step": 15001 + }, + { + "epoch": 1.413249805704058, + "grad_norm": 0.622941792011261, + "learning_rate": 4.105667692346551e-06, + "loss": 0.1785, + "step": 15002 + }, + { + "epoch": 1.4133440097972256, + "grad_norm": 0.5970519781112671, + "learning_rate": 4.1044479411105295e-06, + "loss": 0.1794, + "step": 15003 + }, + { + "epoch": 1.4134382138903936, + "grad_norm": 0.7031348347663879, + "learning_rate": 4.103228324302218e-06, + "loss": 0.2173, + "step": 15004 + }, + { + "epoch": 1.4135324179835613, + "grad_norm": 0.6484546661376953, + "learning_rate": 4.1020088419494156e-06, + "loss": 0.1931, + "step": 15005 + }, + { + "epoch": 1.4136266220767293, + "grad_norm": 0.6778308153152466, + "learning_rate": 4.100789494079932e-06, + "loss": 0.1821, + "step": 15006 + }, + { + "epoch": 1.413720826169897, + "grad_norm": 0.6300435662269592, + "learning_rate": 4.099570280721577e-06, + "loss": 0.1813, + "step": 15007 + }, + { + "epoch": 1.413815030263065, + "grad_norm": 0.6649633049964905, + "learning_rate": 4.098351201902143e-06, + "loss": 0.1821, + "step": 15008 + }, + { + "epoch": 1.4139092343562327, + "grad_norm": 0.6492722630500793, + "learning_rate": 4.097132257649422e-06, + "loss": 0.2151, + "step": 15009 + }, + { + "epoch": 1.4140034384494007, + "grad_norm": 0.6942569017410278, + "learning_rate": 4.095913447991222e-06, + "loss": 0.1975, + "step": 15010 + }, + { + "epoch": 1.4140976425425684, + "grad_norm": 0.6553627252578735, + "learning_rate": 4.094694772955325e-06, + "loss": 0.1769, + "step": 15011 + }, + { + "epoch": 1.4141918466357364, + "grad_norm": 0.6954379081726074, + "learning_rate": 4.093476232569519e-06, + "loss": 0.1863, + "step": 15012 + }, + { + "epoch": 1.4142860507289041, + "grad_norm": 0.7249972820281982, + "learning_rate": 4.092257826861587e-06, + "loss": 0.2101, + "step": 15013 + }, + { + "epoch": 1.414380254822072, + "grad_norm": 0.5798452496528625, + "learning_rate": 4.091039555859317e-06, + "loss": 0.1684, + "step": 15014 + }, + { + "epoch": 1.4144744589152398, + "grad_norm": 0.6643664240837097, + "learning_rate": 4.08982141959048e-06, + "loss": 0.1948, + "step": 15015 + }, + { + "epoch": 1.4145686630084078, + "grad_norm": 0.6006837487220764, + "learning_rate": 4.088603418082856e-06, + "loss": 0.1747, + "step": 15016 + }, + { + "epoch": 1.4146628671015755, + "grad_norm": 0.6077139377593994, + "learning_rate": 4.087385551364219e-06, + "loss": 0.2052, + "step": 15017 + }, + { + "epoch": 1.4147570711947435, + "grad_norm": 0.656932532787323, + "learning_rate": 4.086167819462332e-06, + "loss": 0.1915, + "step": 15018 + }, + { + "epoch": 1.4148512752879112, + "grad_norm": 0.6349786520004272, + "learning_rate": 4.0849502224049655e-06, + "loss": 0.1946, + "step": 15019 + }, + { + "epoch": 1.4149454793810792, + "grad_norm": 0.6988180875778198, + "learning_rate": 4.083732760219884e-06, + "loss": 0.1825, + "step": 15020 + }, + { + "epoch": 1.415039683474247, + "grad_norm": 0.7015491127967834, + "learning_rate": 4.082515432934842e-06, + "loss": 0.2342, + "step": 15021 + }, + { + "epoch": 1.4151338875674149, + "grad_norm": 0.6982563734054565, + "learning_rate": 4.081298240577603e-06, + "loss": 0.2207, + "step": 15022 + }, + { + "epoch": 1.4152280916605826, + "grad_norm": 0.7221394777297974, + "learning_rate": 4.0800811831759145e-06, + "loss": 0.2016, + "step": 15023 + }, + { + "epoch": 1.4153222957537506, + "grad_norm": 0.6412881016731262, + "learning_rate": 4.07886426075753e-06, + "loss": 0.2026, + "step": 15024 + }, + { + "epoch": 1.4154164998469183, + "grad_norm": 0.6701502203941345, + "learning_rate": 4.077647473350201e-06, + "loss": 0.2252, + "step": 15025 + }, + { + "epoch": 1.4155107039400863, + "grad_norm": 0.6229698657989502, + "learning_rate": 4.076430820981666e-06, + "loss": 0.1761, + "step": 15026 + }, + { + "epoch": 1.415604908033254, + "grad_norm": 0.7415137887001038, + "learning_rate": 4.075214303679669e-06, + "loss": 0.2075, + "step": 15027 + }, + { + "epoch": 1.415699112126422, + "grad_norm": 1.9039305448532104, + "learning_rate": 4.073997921471951e-06, + "loss": 0.2086, + "step": 15028 + }, + { + "epoch": 1.4157933162195897, + "grad_norm": 0.6051740646362305, + "learning_rate": 4.072781674386243e-06, + "loss": 0.1788, + "step": 15029 + }, + { + "epoch": 1.4158875203127577, + "grad_norm": 0.6752517223358154, + "learning_rate": 4.0715655624502805e-06, + "loss": 0.2028, + "step": 15030 + }, + { + "epoch": 1.4159817244059254, + "grad_norm": 0.7240444421768188, + "learning_rate": 4.0703495856917926e-06, + "loss": 0.2339, + "step": 15031 + }, + { + "epoch": 1.4160759284990934, + "grad_norm": 0.6766936182975769, + "learning_rate": 4.069133744138506e-06, + "loss": 0.2161, + "step": 15032 + }, + { + "epoch": 1.416170132592261, + "grad_norm": 0.6281977295875549, + "learning_rate": 4.067918037818138e-06, + "loss": 0.1734, + "step": 15033 + }, + { + "epoch": 1.416264336685429, + "grad_norm": 0.6772282123565674, + "learning_rate": 4.0667024667584145e-06, + "loss": 0.2405, + "step": 15034 + }, + { + "epoch": 1.4163585407785968, + "grad_norm": 0.566911518573761, + "learning_rate": 4.0654870309870535e-06, + "loss": 0.1648, + "step": 15035 + }, + { + "epoch": 1.4164527448717648, + "grad_norm": 0.6752029061317444, + "learning_rate": 4.064271730531761e-06, + "loss": 0.2056, + "step": 15036 + }, + { + "epoch": 1.4165469489649325, + "grad_norm": 0.6714105010032654, + "learning_rate": 4.0630565654202545e-06, + "loss": 0.1809, + "step": 15037 + }, + { + "epoch": 1.4166411530581005, + "grad_norm": 0.6412304639816284, + "learning_rate": 4.061841535680243e-06, + "loss": 0.1811, + "step": 15038 + }, + { + "epoch": 1.4167353571512682, + "grad_norm": 0.7268863916397095, + "learning_rate": 4.060626641339425e-06, + "loss": 0.2007, + "step": 15039 + }, + { + "epoch": 1.4168295612444362, + "grad_norm": 0.6820250749588013, + "learning_rate": 4.059411882425504e-06, + "loss": 0.2006, + "step": 15040 + }, + { + "epoch": 1.416923765337604, + "grad_norm": 0.5848016738891602, + "learning_rate": 4.058197258966183e-06, + "loss": 0.1965, + "step": 15041 + }, + { + "epoch": 1.4170179694307716, + "grad_norm": 0.638249397277832, + "learning_rate": 4.056982770989151e-06, + "loss": 0.1976, + "step": 15042 + }, + { + "epoch": 1.4171121735239396, + "grad_norm": 0.7490016222000122, + "learning_rate": 4.055768418522102e-06, + "loss": 0.2164, + "step": 15043 + }, + { + "epoch": 1.4172063776171075, + "grad_norm": 0.6565622687339783, + "learning_rate": 4.054554201592731e-06, + "loss": 0.2184, + "step": 15044 + }, + { + "epoch": 1.4173005817102753, + "grad_norm": 0.6109570264816284, + "learning_rate": 4.053340120228714e-06, + "loss": 0.1941, + "step": 15045 + }, + { + "epoch": 1.417394785803443, + "grad_norm": 0.6661521196365356, + "learning_rate": 4.052126174457739e-06, + "loss": 0.2016, + "step": 15046 + }, + { + "epoch": 1.417488989896611, + "grad_norm": 0.6523756980895996, + "learning_rate": 4.05091236430749e-06, + "loss": 0.2075, + "step": 15047 + }, + { + "epoch": 1.417583193989779, + "grad_norm": 0.7215223908424377, + "learning_rate": 4.049698689805639e-06, + "loss": 0.2237, + "step": 15048 + }, + { + "epoch": 1.4176773980829467, + "grad_norm": 0.6820694804191589, + "learning_rate": 4.048485150979854e-06, + "loss": 0.2189, + "step": 15049 + }, + { + "epoch": 1.4177716021761144, + "grad_norm": 0.6522488594055176, + "learning_rate": 4.0472717478578185e-06, + "loss": 0.1967, + "step": 15050 + }, + { + "epoch": 1.4178658062692824, + "grad_norm": 0.6289029121398926, + "learning_rate": 4.046058480467192e-06, + "loss": 0.2103, + "step": 15051 + }, + { + "epoch": 1.4179600103624503, + "grad_norm": 0.6443459391593933, + "learning_rate": 4.044845348835637e-06, + "loss": 0.1938, + "step": 15052 + }, + { + "epoch": 1.418054214455618, + "grad_norm": 0.6975728273391724, + "learning_rate": 4.0436323529908195e-06, + "loss": 0.2133, + "step": 15053 + }, + { + "epoch": 1.4181484185487858, + "grad_norm": 0.5422704219818115, + "learning_rate": 4.042419492960398e-06, + "loss": 0.1493, + "step": 15054 + }, + { + "epoch": 1.4182426226419538, + "grad_norm": 0.7132324576377869, + "learning_rate": 4.041206768772023e-06, + "loss": 0.1918, + "step": 15055 + }, + { + "epoch": 1.4183368267351217, + "grad_norm": 0.6467621922492981, + "learning_rate": 4.039994180453348e-06, + "loss": 0.1918, + "step": 15056 + }, + { + "epoch": 1.4184310308282895, + "grad_norm": 0.7953943610191345, + "learning_rate": 4.038781728032027e-06, + "loss": 0.2027, + "step": 15057 + }, + { + "epoch": 1.4185252349214572, + "grad_norm": 0.6614832282066345, + "learning_rate": 4.0375694115356986e-06, + "loss": 0.2045, + "step": 15058 + }, + { + "epoch": 1.4186194390146252, + "grad_norm": 0.6388512253761292, + "learning_rate": 4.036357230992009e-06, + "loss": 0.1934, + "step": 15059 + }, + { + "epoch": 1.4187136431077931, + "grad_norm": 0.6661310791969299, + "learning_rate": 4.0351451864286e-06, + "loss": 0.2051, + "step": 15060 + }, + { + "epoch": 1.4188078472009609, + "grad_norm": 0.6153001189231873, + "learning_rate": 4.033933277873104e-06, + "loss": 0.1955, + "step": 15061 + }, + { + "epoch": 1.4189020512941286, + "grad_norm": 0.689215362071991, + "learning_rate": 4.032721505353157e-06, + "loss": 0.2164, + "step": 15062 + }, + { + "epoch": 1.4189962553872966, + "grad_norm": 0.6913338303565979, + "learning_rate": 4.031509868896386e-06, + "loss": 0.1969, + "step": 15063 + }, + { + "epoch": 1.4190904594804645, + "grad_norm": 0.6429192423820496, + "learning_rate": 4.03029836853042e-06, + "loss": 0.2035, + "step": 15064 + }, + { + "epoch": 1.4191846635736323, + "grad_norm": 0.632829487323761, + "learning_rate": 4.029087004282888e-06, + "loss": 0.1768, + "step": 15065 + }, + { + "epoch": 1.4192788676668, + "grad_norm": 0.6011409759521484, + "learning_rate": 4.027875776181402e-06, + "loss": 0.1851, + "step": 15066 + }, + { + "epoch": 1.419373071759968, + "grad_norm": 0.6467154026031494, + "learning_rate": 4.026664684253584e-06, + "loss": 0.1947, + "step": 15067 + }, + { + "epoch": 1.419467275853136, + "grad_norm": 0.673510730266571, + "learning_rate": 4.025453728527053e-06, + "loss": 0.1949, + "step": 15068 + }, + { + "epoch": 1.4195614799463037, + "grad_norm": 0.656452476978302, + "learning_rate": 4.024242909029414e-06, + "loss": 0.2034, + "step": 15069 + }, + { + "epoch": 1.4196556840394714, + "grad_norm": 0.6704924702644348, + "learning_rate": 4.023032225788278e-06, + "loss": 0.2027, + "step": 15070 + }, + { + "epoch": 1.4197498881326394, + "grad_norm": 0.6681562662124634, + "learning_rate": 4.021821678831255e-06, + "loss": 0.2195, + "step": 15071 + }, + { + "epoch": 1.4198440922258073, + "grad_norm": 0.5850578546524048, + "learning_rate": 4.020611268185942e-06, + "loss": 0.2025, + "step": 15072 + }, + { + "epoch": 1.419938296318975, + "grad_norm": 0.608880341053009, + "learning_rate": 4.0194009938799365e-06, + "loss": 0.1768, + "step": 15073 + }, + { + "epoch": 1.4200325004121428, + "grad_norm": 0.7008351683616638, + "learning_rate": 4.018190855940837e-06, + "loss": 0.2058, + "step": 15074 + }, + { + "epoch": 1.4201267045053108, + "grad_norm": 0.616750955581665, + "learning_rate": 4.01698085439624e-06, + "loss": 0.1581, + "step": 15075 + }, + { + "epoch": 1.4202209085984787, + "grad_norm": 0.7935291528701782, + "learning_rate": 4.015770989273731e-06, + "loss": 0.1921, + "step": 15076 + }, + { + "epoch": 1.4203151126916465, + "grad_norm": 0.5856050848960876, + "learning_rate": 4.014561260600896e-06, + "loss": 0.2114, + "step": 15077 + }, + { + "epoch": 1.4204093167848142, + "grad_norm": 0.6580625176429749, + "learning_rate": 4.0133516684053264e-06, + "loss": 0.2019, + "step": 15078 + }, + { + "epoch": 1.4205035208779822, + "grad_norm": 0.7024085521697998, + "learning_rate": 4.012142212714593e-06, + "loss": 0.2201, + "step": 15079 + }, + { + "epoch": 1.42059772497115, + "grad_norm": 0.7355886697769165, + "learning_rate": 4.010932893556278e-06, + "loss": 0.2039, + "step": 15080 + }, + { + "epoch": 1.4206919290643178, + "grad_norm": 0.6439719796180725, + "learning_rate": 4.009723710957957e-06, + "loss": 0.2211, + "step": 15081 + }, + { + "epoch": 1.4207861331574856, + "grad_norm": 0.608222484588623, + "learning_rate": 4.008514664947198e-06, + "loss": 0.1888, + "step": 15082 + }, + { + "epoch": 1.4208803372506535, + "grad_norm": 0.6365930438041687, + "learning_rate": 4.00730575555157e-06, + "loss": 0.1883, + "step": 15083 + }, + { + "epoch": 1.4209745413438215, + "grad_norm": 0.5954399108886719, + "learning_rate": 4.006096982798642e-06, + "loss": 0.2025, + "step": 15084 + }, + { + "epoch": 1.4210687454369892, + "grad_norm": 0.6623506546020508, + "learning_rate": 4.004888346715972e-06, + "loss": 0.2047, + "step": 15085 + }, + { + "epoch": 1.421162949530157, + "grad_norm": 0.6372186541557312, + "learning_rate": 4.0036798473311125e-06, + "loss": 0.1964, + "step": 15086 + }, + { + "epoch": 1.421257153623325, + "grad_norm": 0.7034417390823364, + "learning_rate": 4.002471484671634e-06, + "loss": 0.2035, + "step": 15087 + }, + { + "epoch": 1.421351357716493, + "grad_norm": 0.6938350796699524, + "learning_rate": 4.0012632587650804e-06, + "loss": 0.1868, + "step": 15088 + }, + { + "epoch": 1.4214455618096606, + "grad_norm": 0.6624138951301575, + "learning_rate": 4.000055169638994e-06, + "loss": 0.2198, + "step": 15089 + }, + { + "epoch": 1.4215397659028284, + "grad_norm": 0.6621091961860657, + "learning_rate": 3.998847217320937e-06, + "loss": 0.212, + "step": 15090 + }, + { + "epoch": 1.4216339699959963, + "grad_norm": 0.6874463558197021, + "learning_rate": 3.997639401838444e-06, + "loss": 0.2276, + "step": 15091 + }, + { + "epoch": 1.4217281740891643, + "grad_norm": 0.7190878391265869, + "learning_rate": 3.9964317232190516e-06, + "loss": 0.1954, + "step": 15092 + }, + { + "epoch": 1.421822378182332, + "grad_norm": 0.749897301197052, + "learning_rate": 3.995224181490301e-06, + "loss": 0.2388, + "step": 15093 + }, + { + "epoch": 1.4219165822754998, + "grad_norm": 0.7081459164619446, + "learning_rate": 3.994016776679729e-06, + "loss": 0.2082, + "step": 15094 + }, + { + "epoch": 1.4220107863686677, + "grad_norm": 0.7634052634239197, + "learning_rate": 3.992809508814859e-06, + "loss": 0.2275, + "step": 15095 + }, + { + "epoch": 1.4221049904618357, + "grad_norm": 0.7295844554901123, + "learning_rate": 3.991602377923222e-06, + "loss": 0.22, + "step": 15096 + }, + { + "epoch": 1.4221991945550034, + "grad_norm": 0.6493740081787109, + "learning_rate": 3.990395384032348e-06, + "loss": 0.2059, + "step": 15097 + }, + { + "epoch": 1.4222933986481712, + "grad_norm": 0.5462346076965332, + "learning_rate": 3.989188527169749e-06, + "loss": 0.1789, + "step": 15098 + }, + { + "epoch": 1.4223876027413391, + "grad_norm": 0.6551412343978882, + "learning_rate": 3.987981807362948e-06, + "loss": 0.2172, + "step": 15099 + }, + { + "epoch": 1.422481806834507, + "grad_norm": 0.5961939096450806, + "learning_rate": 3.986775224639463e-06, + "loss": 0.1929, + "step": 15100 + }, + { + "epoch": 1.4225760109276748, + "grad_norm": 0.9297695159912109, + "learning_rate": 3.985568779026798e-06, + "loss": 0.1809, + "step": 15101 + }, + { + "epoch": 1.4226702150208426, + "grad_norm": 0.6887254118919373, + "learning_rate": 3.984362470552471e-06, + "loss": 0.2126, + "step": 15102 + }, + { + "epoch": 1.4227644191140105, + "grad_norm": 0.6743797659873962, + "learning_rate": 3.98315629924398e-06, + "loss": 0.1958, + "step": 15103 + }, + { + "epoch": 1.4228586232071785, + "grad_norm": 0.6334644556045532, + "learning_rate": 3.981950265128829e-06, + "loss": 0.1921, + "step": 15104 + }, + { + "epoch": 1.4229528273003462, + "grad_norm": 0.6796209216117859, + "learning_rate": 3.980744368234524e-06, + "loss": 0.2051, + "step": 15105 + }, + { + "epoch": 1.423047031393514, + "grad_norm": 0.6645632982254028, + "learning_rate": 3.979538608588552e-06, + "loss": 0.207, + "step": 15106 + }, + { + "epoch": 1.423141235486682, + "grad_norm": 0.6518348455429077, + "learning_rate": 3.97833298621841e-06, + "loss": 0.2077, + "step": 15107 + }, + { + "epoch": 1.4232354395798499, + "grad_norm": 0.6250102519989014, + "learning_rate": 3.977127501151593e-06, + "loss": 0.2145, + "step": 15108 + }, + { + "epoch": 1.4233296436730176, + "grad_norm": 0.6662275791168213, + "learning_rate": 3.975922153415579e-06, + "loss": 0.2448, + "step": 15109 + }, + { + "epoch": 1.4234238477661854, + "grad_norm": 0.6598562598228455, + "learning_rate": 3.974716943037858e-06, + "loss": 0.2005, + "step": 15110 + }, + { + "epoch": 1.4235180518593533, + "grad_norm": 0.7167573571205139, + "learning_rate": 3.973511870045911e-06, + "loss": 0.214, + "step": 15111 + }, + { + "epoch": 1.423612255952521, + "grad_norm": 0.8242859244346619, + "learning_rate": 3.972306934467214e-06, + "loss": 0.21, + "step": 15112 + }, + { + "epoch": 1.423706460045689, + "grad_norm": 0.6715300679206848, + "learning_rate": 3.9711021363292376e-06, + "loss": 0.1957, + "step": 15113 + }, + { + "epoch": 1.4238006641388568, + "grad_norm": 0.620347797870636, + "learning_rate": 3.969897475659457e-06, + "loss": 0.196, + "step": 15114 + }, + { + "epoch": 1.4238948682320247, + "grad_norm": 0.633220911026001, + "learning_rate": 3.968692952485341e-06, + "loss": 0.2279, + "step": 15115 + }, + { + "epoch": 1.4239890723251925, + "grad_norm": 0.6173372864723206, + "learning_rate": 3.967488566834352e-06, + "loss": 0.172, + "step": 15116 + }, + { + "epoch": 1.4240832764183604, + "grad_norm": 0.8390200734138489, + "learning_rate": 3.966284318733952e-06, + "loss": 0.2059, + "step": 15117 + }, + { + "epoch": 1.4241774805115281, + "grad_norm": 0.7647731900215149, + "learning_rate": 3.965080208211605e-06, + "loss": 0.1673, + "step": 15118 + }, + { + "epoch": 1.424271684604696, + "grad_norm": 0.6739823818206787, + "learning_rate": 3.963876235294758e-06, + "loss": 0.2161, + "step": 15119 + }, + { + "epoch": 1.4243658886978638, + "grad_norm": 0.6182190179824829, + "learning_rate": 3.962672400010868e-06, + "loss": 0.194, + "step": 15120 + }, + { + "epoch": 1.4244600927910318, + "grad_norm": 0.6688817143440247, + "learning_rate": 3.961468702387389e-06, + "loss": 0.2185, + "step": 15121 + }, + { + "epoch": 1.4245542968841995, + "grad_norm": 0.6367483139038086, + "learning_rate": 3.960265142451758e-06, + "loss": 0.198, + "step": 15122 + }, + { + "epoch": 1.4246485009773675, + "grad_norm": 0.6875576972961426, + "learning_rate": 3.959061720231422e-06, + "loss": 0.1994, + "step": 15123 + }, + { + "epoch": 1.4247427050705352, + "grad_norm": 0.6497072577476501, + "learning_rate": 3.957858435753825e-06, + "loss": 0.1734, + "step": 15124 + }, + { + "epoch": 1.4248369091637032, + "grad_norm": 0.6306024789810181, + "learning_rate": 3.9566552890464e-06, + "loss": 0.1932, + "step": 15125 + }, + { + "epoch": 1.424931113256871, + "grad_norm": 0.5928530693054199, + "learning_rate": 3.955452280136575e-06, + "loss": 0.1797, + "step": 15126 + }, + { + "epoch": 1.425025317350039, + "grad_norm": 0.6352770328521729, + "learning_rate": 3.954249409051791e-06, + "loss": 0.1943, + "step": 15127 + }, + { + "epoch": 1.4251195214432066, + "grad_norm": 0.6394056677818298, + "learning_rate": 3.953046675819472e-06, + "loss": 0.1926, + "step": 15128 + }, + { + "epoch": 1.4252137255363746, + "grad_norm": 0.7113199234008789, + "learning_rate": 3.951844080467032e-06, + "loss": 0.1988, + "step": 15129 + }, + { + "epoch": 1.4253079296295423, + "grad_norm": 0.7182572484016418, + "learning_rate": 3.950641623021909e-06, + "loss": 0.1948, + "step": 15130 + }, + { + "epoch": 1.4254021337227103, + "grad_norm": 0.6903520822525024, + "learning_rate": 3.949439303511512e-06, + "loss": 0.2071, + "step": 15131 + }, + { + "epoch": 1.425496337815878, + "grad_norm": 0.7144119143486023, + "learning_rate": 3.9482371219632535e-06, + "loss": 0.2064, + "step": 15132 + }, + { + "epoch": 1.425590541909046, + "grad_norm": 0.6513559222221375, + "learning_rate": 3.947035078404546e-06, + "loss": 0.2022, + "step": 15133 + }, + { + "epoch": 1.4256847460022137, + "grad_norm": 0.5769354104995728, + "learning_rate": 3.945833172862806e-06, + "loss": 0.1454, + "step": 15134 + }, + { + "epoch": 1.4257789500953817, + "grad_norm": 0.6987826228141785, + "learning_rate": 3.944631405365427e-06, + "loss": 0.2258, + "step": 15135 + }, + { + "epoch": 1.4258731541885494, + "grad_norm": 0.6186956763267517, + "learning_rate": 3.9434297759398164e-06, + "loss": 0.1752, + "step": 15136 + }, + { + "epoch": 1.4259673582817174, + "grad_norm": 0.7385438084602356, + "learning_rate": 3.942228284613379e-06, + "loss": 0.2365, + "step": 15137 + }, + { + "epoch": 1.4260615623748851, + "grad_norm": 0.6618998646736145, + "learning_rate": 3.9410269314135e-06, + "loss": 0.1915, + "step": 15138 + }, + { + "epoch": 1.426155766468053, + "grad_norm": 0.6850101351737976, + "learning_rate": 3.939825716367578e-06, + "loss": 0.2124, + "step": 15139 + }, + { + "epoch": 1.4262499705612208, + "grad_norm": 0.7157747149467468, + "learning_rate": 3.938624639503006e-06, + "loss": 0.2039, + "step": 15140 + }, + { + "epoch": 1.4263441746543888, + "grad_norm": 0.6254139542579651, + "learning_rate": 3.9374237008471615e-06, + "loss": 0.2011, + "step": 15141 + }, + { + "epoch": 1.4264383787475565, + "grad_norm": 0.615426778793335, + "learning_rate": 3.9362229004274376e-06, + "loss": 0.1881, + "step": 15142 + }, + { + "epoch": 1.4265325828407245, + "grad_norm": 0.7962415218353271, + "learning_rate": 3.935022238271205e-06, + "loss": 0.1924, + "step": 15143 + }, + { + "epoch": 1.4266267869338922, + "grad_norm": 0.6106542944908142, + "learning_rate": 3.933821714405846e-06, + "loss": 0.2219, + "step": 15144 + }, + { + "epoch": 1.4267209910270602, + "grad_norm": 0.7156044244766235, + "learning_rate": 3.932621328858735e-06, + "loss": 0.1895, + "step": 15145 + }, + { + "epoch": 1.426815195120228, + "grad_norm": 0.720986008644104, + "learning_rate": 3.931421081657238e-06, + "loss": 0.2261, + "step": 15146 + }, + { + "epoch": 1.4269093992133959, + "grad_norm": 0.7468794584274292, + "learning_rate": 3.930220972828726e-06, + "loss": 0.2667, + "step": 15147 + }, + { + "epoch": 1.4270036033065636, + "grad_norm": 0.6073909401893616, + "learning_rate": 3.929021002400568e-06, + "loss": 0.1955, + "step": 15148 + }, + { + "epoch": 1.4270978073997316, + "grad_norm": 0.7410126328468323, + "learning_rate": 3.927821170400115e-06, + "loss": 0.2043, + "step": 15149 + }, + { + "epoch": 1.4271920114928993, + "grad_norm": 0.5966479778289795, + "learning_rate": 3.9266214768547335e-06, + "loss": 0.1908, + "step": 15150 + }, + { + "epoch": 1.4272862155860673, + "grad_norm": 0.7098649144172668, + "learning_rate": 3.9254219217917725e-06, + "loss": 0.1902, + "step": 15151 + }, + { + "epoch": 1.427380419679235, + "grad_norm": 0.677027702331543, + "learning_rate": 3.924222505238588e-06, + "loss": 0.2138, + "step": 15152 + }, + { + "epoch": 1.427474623772403, + "grad_norm": 0.7108759880065918, + "learning_rate": 3.923023227222526e-06, + "loss": 0.1998, + "step": 15153 + }, + { + "epoch": 1.4275688278655707, + "grad_norm": 0.6519859433174133, + "learning_rate": 3.9218240877709305e-06, + "loss": 0.2162, + "step": 15154 + }, + { + "epoch": 1.4276630319587387, + "grad_norm": 0.7533174753189087, + "learning_rate": 3.92062508691115e-06, + "loss": 0.1936, + "step": 15155 + }, + { + "epoch": 1.4277572360519064, + "grad_norm": 0.587651252746582, + "learning_rate": 3.919426224670515e-06, + "loss": 0.1622, + "step": 15156 + }, + { + "epoch": 1.4278514401450744, + "grad_norm": 0.7207445502281189, + "learning_rate": 3.918227501076367e-06, + "loss": 0.1996, + "step": 15157 + }, + { + "epoch": 1.427945644238242, + "grad_norm": 0.5945635437965393, + "learning_rate": 3.917028916156041e-06, + "loss": 0.1784, + "step": 15158 + }, + { + "epoch": 1.42803984833141, + "grad_norm": 0.6569984555244446, + "learning_rate": 3.915830469936858e-06, + "loss": 0.1736, + "step": 15159 + }, + { + "epoch": 1.4281340524245778, + "grad_norm": 0.6692624092102051, + "learning_rate": 3.914632162446153e-06, + "loss": 0.2072, + "step": 15160 + }, + { + "epoch": 1.4282282565177458, + "grad_norm": 0.7507838010787964, + "learning_rate": 3.913433993711246e-06, + "loss": 0.2067, + "step": 15161 + }, + { + "epoch": 1.4283224606109135, + "grad_norm": 0.6753888130187988, + "learning_rate": 3.912235963759456e-06, + "loss": 0.1837, + "step": 15162 + }, + { + "epoch": 1.4284166647040815, + "grad_norm": 0.6562243700027466, + "learning_rate": 3.9110380726181e-06, + "loss": 0.1946, + "step": 15163 + }, + { + "epoch": 1.4285108687972492, + "grad_norm": 0.6343908905982971, + "learning_rate": 3.9098403203144965e-06, + "loss": 0.2137, + "step": 15164 + }, + { + "epoch": 1.4286050728904172, + "grad_norm": 0.6291208863258362, + "learning_rate": 3.908642706875951e-06, + "loss": 0.1765, + "step": 15165 + }, + { + "epoch": 1.428699276983585, + "grad_norm": 0.636816143989563, + "learning_rate": 3.907445232329766e-06, + "loss": 0.1738, + "step": 15166 + }, + { + "epoch": 1.4287934810767529, + "grad_norm": 0.6997500658035278, + "learning_rate": 3.90624789670326e-06, + "loss": 0.2083, + "step": 15167 + }, + { + "epoch": 1.4288876851699206, + "grad_norm": 0.7286765575408936, + "learning_rate": 3.905050700023726e-06, + "loss": 0.2262, + "step": 15168 + }, + { + "epoch": 1.4289818892630886, + "grad_norm": 0.6835652589797974, + "learning_rate": 3.903853642318453e-06, + "loss": 0.1858, + "step": 15169 + }, + { + "epoch": 1.4290760933562563, + "grad_norm": 0.6714411973953247, + "learning_rate": 3.902656723614754e-06, + "loss": 0.1996, + "step": 15170 + }, + { + "epoch": 1.4291702974494243, + "grad_norm": 0.6755989789962769, + "learning_rate": 3.90145994393991e-06, + "loss": 0.1941, + "step": 15171 + }, + { + "epoch": 1.429264501542592, + "grad_norm": 0.6982343792915344, + "learning_rate": 3.900263303321209e-06, + "loss": 0.2028, + "step": 15172 + }, + { + "epoch": 1.42935870563576, + "grad_norm": 0.604402482509613, + "learning_rate": 3.899066801785937e-06, + "loss": 0.1917, + "step": 15173 + }, + { + "epoch": 1.4294529097289277, + "grad_norm": 0.66719651222229, + "learning_rate": 3.897870439361381e-06, + "loss": 0.2128, + "step": 15174 + }, + { + "epoch": 1.4295471138220956, + "grad_norm": 0.6596137881278992, + "learning_rate": 3.896674216074812e-06, + "loss": 0.1853, + "step": 15175 + }, + { + "epoch": 1.4296413179152634, + "grad_norm": 0.6531592607498169, + "learning_rate": 3.895478131953511e-06, + "loss": 0.212, + "step": 15176 + }, + { + "epoch": 1.4297355220084313, + "grad_norm": 0.6396514773368835, + "learning_rate": 3.894282187024752e-06, + "loss": 0.222, + "step": 15177 + }, + { + "epoch": 1.429829726101599, + "grad_norm": 0.7242485880851746, + "learning_rate": 3.893086381315798e-06, + "loss": 0.2016, + "step": 15178 + }, + { + "epoch": 1.429923930194767, + "grad_norm": 0.6706082820892334, + "learning_rate": 3.89189071485392e-06, + "loss": 0.2185, + "step": 15179 + }, + { + "epoch": 1.4300181342879348, + "grad_norm": 0.6603735089302063, + "learning_rate": 3.890695187666382e-06, + "loss": 0.2115, + "step": 15180 + }, + { + "epoch": 1.4301123383811025, + "grad_norm": 0.7358943223953247, + "learning_rate": 3.8894997997804405e-06, + "loss": 0.2371, + "step": 15181 + }, + { + "epoch": 1.4302065424742705, + "grad_norm": 0.6819736361503601, + "learning_rate": 3.8883045512233564e-06, + "loss": 0.2149, + "step": 15182 + }, + { + "epoch": 1.4303007465674384, + "grad_norm": 0.6299451589584351, + "learning_rate": 3.887109442022377e-06, + "loss": 0.1961, + "step": 15183 + }, + { + "epoch": 1.4303949506606062, + "grad_norm": 0.5874504446983337, + "learning_rate": 3.8859144722047545e-06, + "loss": 0.1801, + "step": 15184 + }, + { + "epoch": 1.430489154753774, + "grad_norm": 0.72264164686203, + "learning_rate": 3.884719641797743e-06, + "loss": 0.1887, + "step": 15185 + }, + { + "epoch": 1.4305833588469419, + "grad_norm": 0.6677408814430237, + "learning_rate": 3.883524950828578e-06, + "loss": 0.1973, + "step": 15186 + }, + { + "epoch": 1.4306775629401098, + "grad_norm": 0.676523745059967, + "learning_rate": 3.8823303993245025e-06, + "loss": 0.2153, + "step": 15187 + }, + { + "epoch": 1.4307717670332776, + "grad_norm": 0.6481882929801941, + "learning_rate": 3.881135987312758e-06, + "loss": 0.1937, + "step": 15188 + }, + { + "epoch": 1.4308659711264453, + "grad_norm": 0.6754376292228699, + "learning_rate": 3.879941714820573e-06, + "loss": 0.2053, + "step": 15189 + }, + { + "epoch": 1.4309601752196133, + "grad_norm": 0.6309954524040222, + "learning_rate": 3.878747581875185e-06, + "loss": 0.1989, + "step": 15190 + }, + { + "epoch": 1.4310543793127812, + "grad_norm": 0.6919754147529602, + "learning_rate": 3.877553588503817e-06, + "loss": 0.2226, + "step": 15191 + }, + { + "epoch": 1.431148583405949, + "grad_norm": 0.9672317504882812, + "learning_rate": 3.876359734733697e-06, + "loss": 0.1738, + "step": 15192 + }, + { + "epoch": 1.4312427874991167, + "grad_norm": 0.641103982925415, + "learning_rate": 3.875166020592043e-06, + "loss": 0.2022, + "step": 15193 + }, + { + "epoch": 1.4313369915922847, + "grad_norm": 0.8770423531532288, + "learning_rate": 3.8739724461060755e-06, + "loss": 0.2035, + "step": 15194 + }, + { + "epoch": 1.4314311956854526, + "grad_norm": 0.6562981009483337, + "learning_rate": 3.872779011303014e-06, + "loss": 0.208, + "step": 15195 + }, + { + "epoch": 1.4315253997786204, + "grad_norm": 0.5817837119102478, + "learning_rate": 3.871585716210063e-06, + "loss": 0.1814, + "step": 15196 + }, + { + "epoch": 1.431619603871788, + "grad_norm": 0.6063922643661499, + "learning_rate": 3.870392560854437e-06, + "loss": 0.1735, + "step": 15197 + }, + { + "epoch": 1.431713807964956, + "grad_norm": 0.6343390941619873, + "learning_rate": 3.869199545263342e-06, + "loss": 0.1839, + "step": 15198 + }, + { + "epoch": 1.431808012058124, + "grad_norm": 0.6845496296882629, + "learning_rate": 3.868006669463977e-06, + "loss": 0.2165, + "step": 15199 + }, + { + "epoch": 1.4319022161512918, + "grad_norm": 0.752907395362854, + "learning_rate": 3.866813933483542e-06, + "loss": 0.2132, + "step": 15200 + }, + { + "epoch": 1.4319964202444595, + "grad_norm": 0.8951588273048401, + "learning_rate": 3.865621337349238e-06, + "loss": 0.2211, + "step": 15201 + }, + { + "epoch": 1.4320906243376275, + "grad_norm": 0.6657394170761108, + "learning_rate": 3.864428881088256e-06, + "loss": 0.1817, + "step": 15202 + }, + { + "epoch": 1.4321848284307954, + "grad_norm": 0.6672415733337402, + "learning_rate": 3.8632365647277756e-06, + "loss": 0.2022, + "step": 15203 + }, + { + "epoch": 1.4322790325239632, + "grad_norm": 0.8055460453033447, + "learning_rate": 3.862044388295e-06, + "loss": 0.1993, + "step": 15204 + }, + { + "epoch": 1.432373236617131, + "grad_norm": 0.7843263149261475, + "learning_rate": 3.860852351817105e-06, + "loss": 0.2299, + "step": 15205 + }, + { + "epoch": 1.4324674407102989, + "grad_norm": 0.6628491878509521, + "learning_rate": 3.8596604553212625e-06, + "loss": 0.1919, + "step": 15206 + }, + { + "epoch": 1.4325616448034668, + "grad_norm": 0.9990058541297913, + "learning_rate": 3.858468698834666e-06, + "loss": 0.1996, + "step": 15207 + }, + { + "epoch": 1.4326558488966346, + "grad_norm": 0.6103582382202148, + "learning_rate": 3.857277082384481e-06, + "loss": 0.1912, + "step": 15208 + }, + { + "epoch": 1.4327500529898023, + "grad_norm": 0.6782674193382263, + "learning_rate": 3.856085605997871e-06, + "loss": 0.2114, + "step": 15209 + }, + { + "epoch": 1.4328442570829703, + "grad_norm": 0.6303362846374512, + "learning_rate": 3.854894269702019e-06, + "loss": 0.217, + "step": 15210 + }, + { + "epoch": 1.4329384611761382, + "grad_norm": 0.7728652954101562, + "learning_rate": 3.85370307352408e-06, + "loss": 0.1905, + "step": 15211 + }, + { + "epoch": 1.433032665269306, + "grad_norm": 0.7087612748146057, + "learning_rate": 3.852512017491214e-06, + "loss": 0.19, + "step": 15212 + }, + { + "epoch": 1.4331268693624737, + "grad_norm": 0.6688991189002991, + "learning_rate": 3.8513211016305805e-06, + "loss": 0.196, + "step": 15213 + }, + { + "epoch": 1.4332210734556416, + "grad_norm": 0.686552107334137, + "learning_rate": 3.850130325969339e-06, + "loss": 0.1668, + "step": 15214 + }, + { + "epoch": 1.4333152775488096, + "grad_norm": 0.5662250518798828, + "learning_rate": 3.848939690534633e-06, + "loss": 0.1536, + "step": 15215 + }, + { + "epoch": 1.4334094816419773, + "grad_norm": 0.7102068662643433, + "learning_rate": 3.847749195353615e-06, + "loss": 0.2247, + "step": 15216 + }, + { + "epoch": 1.433503685735145, + "grad_norm": 0.6884145140647888, + "learning_rate": 3.846558840453434e-06, + "loss": 0.2108, + "step": 15217 + }, + { + "epoch": 1.433597889828313, + "grad_norm": 0.6879662871360779, + "learning_rate": 3.845368625861223e-06, + "loss": 0.2053, + "step": 15218 + }, + { + "epoch": 1.433692093921481, + "grad_norm": 0.6586831212043762, + "learning_rate": 3.844178551604128e-06, + "loss": 0.2093, + "step": 15219 + }, + { + "epoch": 1.4337862980146487, + "grad_norm": 0.7226083874702454, + "learning_rate": 3.842988617709283e-06, + "loss": 0.2202, + "step": 15220 + }, + { + "epoch": 1.4338805021078165, + "grad_norm": 0.6367272734642029, + "learning_rate": 3.841798824203818e-06, + "loss": 0.1957, + "step": 15221 + }, + { + "epoch": 1.4339747062009844, + "grad_norm": 0.6065813899040222, + "learning_rate": 3.840609171114867e-06, + "loss": 0.182, + "step": 15222 + }, + { + "epoch": 1.4340689102941524, + "grad_norm": 0.6629114151000977, + "learning_rate": 3.839419658469548e-06, + "loss": 0.2054, + "step": 15223 + }, + { + "epoch": 1.4341631143873201, + "grad_norm": 0.6589508056640625, + "learning_rate": 3.838230286294989e-06, + "loss": 0.231, + "step": 15224 + }, + { + "epoch": 1.4342573184804879, + "grad_norm": 0.6896048188209534, + "learning_rate": 3.837041054618312e-06, + "loss": 0.2144, + "step": 15225 + }, + { + "epoch": 1.4343515225736558, + "grad_norm": 0.7509519457817078, + "learning_rate": 3.8358519634666265e-06, + "loss": 0.2081, + "step": 15226 + }, + { + "epoch": 1.4344457266668238, + "grad_norm": 0.631653368473053, + "learning_rate": 3.83466301286705e-06, + "loss": 0.1847, + "step": 15227 + }, + { + "epoch": 1.4345399307599915, + "grad_norm": 0.6637871861457825, + "learning_rate": 3.833474202846695e-06, + "loss": 0.1942, + "step": 15228 + }, + { + "epoch": 1.4346341348531593, + "grad_norm": 0.6666586399078369, + "learning_rate": 3.8322855334326615e-06, + "loss": 0.2135, + "step": 15229 + }, + { + "epoch": 1.4347283389463272, + "grad_norm": 0.6628562808036804, + "learning_rate": 3.831097004652059e-06, + "loss": 0.1864, + "step": 15230 + }, + { + "epoch": 1.4348225430394952, + "grad_norm": 0.6609824895858765, + "learning_rate": 3.829908616531982e-06, + "loss": 0.2022, + "step": 15231 + }, + { + "epoch": 1.434916747132663, + "grad_norm": 0.651069164276123, + "learning_rate": 3.828720369099536e-06, + "loss": 0.1908, + "step": 15232 + }, + { + "epoch": 1.4350109512258307, + "grad_norm": 0.5750835537910461, + "learning_rate": 3.827532262381803e-06, + "loss": 0.1863, + "step": 15233 + }, + { + "epoch": 1.4351051553189986, + "grad_norm": 0.7546768188476562, + "learning_rate": 3.826344296405883e-06, + "loss": 0.2507, + "step": 15234 + }, + { + "epoch": 1.4351993594121666, + "grad_norm": 0.6671732664108276, + "learning_rate": 3.825156471198863e-06, + "loss": 0.2065, + "step": 15235 + }, + { + "epoch": 1.4352935635053343, + "grad_norm": 0.6724646687507629, + "learning_rate": 3.823968786787821e-06, + "loss": 0.1941, + "step": 15236 + }, + { + "epoch": 1.435387767598502, + "grad_norm": 0.687477171421051, + "learning_rate": 3.822781243199844e-06, + "loss": 0.177, + "step": 15237 + }, + { + "epoch": 1.43548197169167, + "grad_norm": 0.6782063841819763, + "learning_rate": 3.82159384046201e-06, + "loss": 0.2003, + "step": 15238 + }, + { + "epoch": 1.435576175784838, + "grad_norm": 0.6828404068946838, + "learning_rate": 3.820406578601389e-06, + "loss": 0.1944, + "step": 15239 + }, + { + "epoch": 1.4356703798780057, + "grad_norm": 0.6947293281555176, + "learning_rate": 3.819219457645053e-06, + "loss": 0.2044, + "step": 15240 + }, + { + "epoch": 1.4357645839711735, + "grad_norm": 0.6581609845161438, + "learning_rate": 3.818032477620079e-06, + "loss": 0.2079, + "step": 15241 + }, + { + "epoch": 1.4358587880643414, + "grad_norm": 0.6846004724502563, + "learning_rate": 3.816845638553523e-06, + "loss": 0.2115, + "step": 15242 + }, + { + "epoch": 1.4359529921575094, + "grad_norm": 0.6613644957542419, + "learning_rate": 3.8156589404724405e-06, + "loss": 0.1928, + "step": 15243 + }, + { + "epoch": 1.4360471962506771, + "grad_norm": 0.6498963236808777, + "learning_rate": 3.8144723834039076e-06, + "loss": 0.209, + "step": 15244 + }, + { + "epoch": 1.4361414003438449, + "grad_norm": 0.6460498571395874, + "learning_rate": 3.8132859673749688e-06, + "loss": 0.1703, + "step": 15245 + }, + { + "epoch": 1.4362356044370128, + "grad_norm": 0.7108054161071777, + "learning_rate": 3.812099692412672e-06, + "loss": 0.2185, + "step": 15246 + }, + { + "epoch": 1.4363298085301808, + "grad_norm": 0.6266716718673706, + "learning_rate": 3.8109135585440794e-06, + "loss": 0.2004, + "step": 15247 + }, + { + "epoch": 1.4364240126233485, + "grad_norm": 0.6401249170303345, + "learning_rate": 3.8097275657962284e-06, + "loss": 0.2087, + "step": 15248 + }, + { + "epoch": 1.4365182167165162, + "grad_norm": 0.730238676071167, + "learning_rate": 3.8085417141961554e-06, + "loss": 0.2222, + "step": 15249 + }, + { + "epoch": 1.4366124208096842, + "grad_norm": 0.6710932850837708, + "learning_rate": 3.8073560037709134e-06, + "loss": 0.2167, + "step": 15250 + }, + { + "epoch": 1.436706624902852, + "grad_norm": 0.6229298114776611, + "learning_rate": 3.8061704345475325e-06, + "loss": 0.1963, + "step": 15251 + }, + { + "epoch": 1.43680082899602, + "grad_norm": 0.8475515246391296, + "learning_rate": 3.80498500655304e-06, + "loss": 0.2124, + "step": 15252 + }, + { + "epoch": 1.4368950330891876, + "grad_norm": 0.5910754799842834, + "learning_rate": 3.8037997198144716e-06, + "loss": 0.1641, + "step": 15253 + }, + { + "epoch": 1.4369892371823556, + "grad_norm": 0.6734824776649475, + "learning_rate": 3.8026145743588548e-06, + "loss": 0.1906, + "step": 15254 + }, + { + "epoch": 1.4370834412755233, + "grad_norm": 0.5976563096046448, + "learning_rate": 3.8014295702132063e-06, + "loss": 0.1886, + "step": 15255 + }, + { + "epoch": 1.4371776453686913, + "grad_norm": 0.592056393623352, + "learning_rate": 3.8002447074045503e-06, + "loss": 0.1858, + "step": 15256 + }, + { + "epoch": 1.437271849461859, + "grad_norm": 0.662977933883667, + "learning_rate": 3.7990599859599066e-06, + "loss": 0.19, + "step": 15257 + }, + { + "epoch": 1.437366053555027, + "grad_norm": 0.6898549795150757, + "learning_rate": 3.7978754059062818e-06, + "loss": 0.2209, + "step": 15258 + }, + { + "epoch": 1.4374602576481947, + "grad_norm": 0.6281675696372986, + "learning_rate": 3.796690967270689e-06, + "loss": 0.1903, + "step": 15259 + }, + { + "epoch": 1.4375544617413627, + "grad_norm": 0.652545690536499, + "learning_rate": 3.7955066700801392e-06, + "loss": 0.1953, + "step": 15260 + }, + { + "epoch": 1.4376486658345304, + "grad_norm": 0.700383186340332, + "learning_rate": 3.79432251436163e-06, + "loss": 0.2275, + "step": 15261 + }, + { + "epoch": 1.4377428699276984, + "grad_norm": 0.5970531702041626, + "learning_rate": 3.793138500142168e-06, + "loss": 0.1798, + "step": 15262 + }, + { + "epoch": 1.4378370740208661, + "grad_norm": 0.7306908369064331, + "learning_rate": 3.791954627448743e-06, + "loss": 0.2266, + "step": 15263 + }, + { + "epoch": 1.437931278114034, + "grad_norm": 0.7055124640464783, + "learning_rate": 3.7907708963083544e-06, + "loss": 0.224, + "step": 15264 + }, + { + "epoch": 1.4380254822072018, + "grad_norm": 0.619551420211792, + "learning_rate": 3.789587306747995e-06, + "loss": 0.206, + "step": 15265 + }, + { + "epoch": 1.4381196863003698, + "grad_norm": 0.5783557891845703, + "learning_rate": 3.788403858794647e-06, + "loss": 0.1809, + "step": 15266 + }, + { + "epoch": 1.4382138903935375, + "grad_norm": 0.7996221780776978, + "learning_rate": 3.7872205524753e-06, + "loss": 0.2468, + "step": 15267 + }, + { + "epoch": 1.4383080944867055, + "grad_norm": 0.5824506282806396, + "learning_rate": 3.78603738781693e-06, + "loss": 0.2103, + "step": 15268 + }, + { + "epoch": 1.4384022985798732, + "grad_norm": 0.7703114748001099, + "learning_rate": 3.7848543648465163e-06, + "loss": 0.191, + "step": 15269 + }, + { + "epoch": 1.4384965026730412, + "grad_norm": 0.6755668520927429, + "learning_rate": 3.783671483591039e-06, + "loss": 0.1848, + "step": 15270 + }, + { + "epoch": 1.438590706766209, + "grad_norm": 0.6245675086975098, + "learning_rate": 3.782488744077463e-06, + "loss": 0.1826, + "step": 15271 + }, + { + "epoch": 1.4386849108593769, + "grad_norm": 0.7833127975463867, + "learning_rate": 3.7813061463327617e-06, + "loss": 0.1886, + "step": 15272 + }, + { + "epoch": 1.4387791149525446, + "grad_norm": 0.7326774597167969, + "learning_rate": 3.7801236903838946e-06, + "loss": 0.2157, + "step": 15273 + }, + { + "epoch": 1.4388733190457126, + "grad_norm": 0.7009978294372559, + "learning_rate": 3.7789413762578263e-06, + "loss": 0.2013, + "step": 15274 + }, + { + "epoch": 1.4389675231388803, + "grad_norm": 0.6456125378608704, + "learning_rate": 3.77775920398152e-06, + "loss": 0.1898, + "step": 15275 + }, + { + "epoch": 1.4390617272320483, + "grad_norm": 0.6135196089744568, + "learning_rate": 3.7765771735819223e-06, + "loss": 0.198, + "step": 15276 + }, + { + "epoch": 1.439155931325216, + "grad_norm": 0.713524580001831, + "learning_rate": 3.775395285085991e-06, + "loss": 0.202, + "step": 15277 + }, + { + "epoch": 1.439250135418384, + "grad_norm": 0.6370148658752441, + "learning_rate": 3.774213538520676e-06, + "loss": 0.1717, + "step": 15278 + }, + { + "epoch": 1.4393443395115517, + "grad_norm": 0.603708803653717, + "learning_rate": 3.7730319339129175e-06, + "loss": 0.1816, + "step": 15279 + }, + { + "epoch": 1.4394385436047197, + "grad_norm": 0.6384181976318359, + "learning_rate": 3.771850471289661e-06, + "loss": 0.179, + "step": 15280 + }, + { + "epoch": 1.4395327476978874, + "grad_norm": 0.6581946015357971, + "learning_rate": 3.7706691506778494e-06, + "loss": 0.1931, + "step": 15281 + }, + { + "epoch": 1.4396269517910554, + "grad_norm": 0.7333237528800964, + "learning_rate": 3.7694879721044155e-06, + "loss": 0.1842, + "step": 15282 + }, + { + "epoch": 1.4397211558842231, + "grad_norm": 0.6282057762145996, + "learning_rate": 3.768306935596283e-06, + "loss": 0.1916, + "step": 15283 + }, + { + "epoch": 1.439815359977391, + "grad_norm": 0.672105610370636, + "learning_rate": 3.767126041180398e-06, + "loss": 0.2026, + "step": 15284 + }, + { + "epoch": 1.4399095640705588, + "grad_norm": 0.7663817405700684, + "learning_rate": 3.7659452888836787e-06, + "loss": 0.2057, + "step": 15285 + }, + { + "epoch": 1.4400037681637268, + "grad_norm": 0.6527891755104065, + "learning_rate": 3.7647646787330404e-06, + "loss": 0.2168, + "step": 15286 + }, + { + "epoch": 1.4400979722568945, + "grad_norm": 0.6544922590255737, + "learning_rate": 3.763584210755418e-06, + "loss": 0.182, + "step": 15287 + }, + { + "epoch": 1.4401921763500625, + "grad_norm": 0.6385776400566101, + "learning_rate": 3.76240388497772e-06, + "loss": 0.1706, + "step": 15288 + }, + { + "epoch": 1.4402863804432302, + "grad_norm": 0.6630579829216003, + "learning_rate": 3.7612237014268538e-06, + "loss": 0.1916, + "step": 15289 + }, + { + "epoch": 1.4403805845363982, + "grad_norm": 0.641896665096283, + "learning_rate": 3.7600436601297417e-06, + "loss": 0.1957, + "step": 15290 + }, + { + "epoch": 1.440474788629566, + "grad_norm": 0.822803258895874, + "learning_rate": 3.758863761113285e-06, + "loss": 0.2088, + "step": 15291 + }, + { + "epoch": 1.4405689927227339, + "grad_norm": 0.6640996336936951, + "learning_rate": 3.757684004404383e-06, + "loss": 0.189, + "step": 15292 + }, + { + "epoch": 1.4406631968159016, + "grad_norm": 0.6632574796676636, + "learning_rate": 3.7565043900299392e-06, + "loss": 0.2002, + "step": 15293 + }, + { + "epoch": 1.4407574009090696, + "grad_norm": 0.6997697353363037, + "learning_rate": 3.755324918016855e-06, + "loss": 0.1976, + "step": 15294 + }, + { + "epoch": 1.4408516050022373, + "grad_norm": 0.6222142577171326, + "learning_rate": 3.7541455883920166e-06, + "loss": 0.1836, + "step": 15295 + }, + { + "epoch": 1.4409458090954053, + "grad_norm": 0.6241096258163452, + "learning_rate": 3.7529664011823186e-06, + "loss": 0.1911, + "step": 15296 + }, + { + "epoch": 1.441040013188573, + "grad_norm": 0.6768137812614441, + "learning_rate": 3.7517873564146503e-06, + "loss": 0.1914, + "step": 15297 + }, + { + "epoch": 1.441134217281741, + "grad_norm": 0.6475250720977783, + "learning_rate": 3.7506084541158903e-06, + "loss": 0.1831, + "step": 15298 + }, + { + "epoch": 1.4412284213749087, + "grad_norm": 0.6157442927360535, + "learning_rate": 3.7494296943129227e-06, + "loss": 0.1805, + "step": 15299 + }, + { + "epoch": 1.4413226254680767, + "grad_norm": 0.731525182723999, + "learning_rate": 3.7482510770326286e-06, + "loss": 0.2078, + "step": 15300 + }, + { + "epoch": 1.4414168295612444, + "grad_norm": 0.6318780779838562, + "learning_rate": 3.7470726023018745e-06, + "loss": 0.2073, + "step": 15301 + }, + { + "epoch": 1.4415110336544124, + "grad_norm": 0.686691164970398, + "learning_rate": 3.7458942701475385e-06, + "loss": 0.1412, + "step": 15302 + }, + { + "epoch": 1.44160523774758, + "grad_norm": 0.67552250623703, + "learning_rate": 3.744716080596482e-06, + "loss": 0.1999, + "step": 15303 + }, + { + "epoch": 1.441699441840748, + "grad_norm": 0.6695284247398376, + "learning_rate": 3.743538033675573e-06, + "loss": 0.2182, + "step": 15304 + }, + { + "epoch": 1.4417936459339158, + "grad_norm": 0.626176655292511, + "learning_rate": 3.7423601294116775e-06, + "loss": 0.199, + "step": 15305 + }, + { + "epoch": 1.4418878500270838, + "grad_norm": 0.6299342513084412, + "learning_rate": 3.741182367831644e-06, + "loss": 0.2096, + "step": 15306 + }, + { + "epoch": 1.4419820541202515, + "grad_norm": 0.6912399530410767, + "learning_rate": 3.740004748962335e-06, + "loss": 0.2153, + "step": 15307 + }, + { + "epoch": 1.4420762582134194, + "grad_norm": 0.6924498081207275, + "learning_rate": 3.7388272728305954e-06, + "loss": 0.2195, + "step": 15308 + }, + { + "epoch": 1.4421704623065872, + "grad_norm": 0.8784087300300598, + "learning_rate": 3.7376499394632783e-06, + "loss": 0.2118, + "step": 15309 + }, + { + "epoch": 1.4422646663997551, + "grad_norm": 0.6489197015762329, + "learning_rate": 3.7364727488872297e-06, + "loss": 0.1989, + "step": 15310 + }, + { + "epoch": 1.4423588704929229, + "grad_norm": 0.6295911073684692, + "learning_rate": 3.735295701129287e-06, + "loss": 0.1993, + "step": 15311 + }, + { + "epoch": 1.4424530745860908, + "grad_norm": 0.6100906133651733, + "learning_rate": 3.7341187962162928e-06, + "loss": 0.1964, + "step": 15312 + }, + { + "epoch": 1.4425472786792586, + "grad_norm": 0.6560591459274292, + "learning_rate": 3.7329420341750776e-06, + "loss": 0.2164, + "step": 15313 + }, + { + "epoch": 1.4426414827724265, + "grad_norm": 0.7599140405654907, + "learning_rate": 3.7317654150324765e-06, + "loss": 0.2352, + "step": 15314 + }, + { + "epoch": 1.4427356868655943, + "grad_norm": 0.6573992371559143, + "learning_rate": 3.7305889388153215e-06, + "loss": 0.1993, + "step": 15315 + }, + { + "epoch": 1.4428298909587622, + "grad_norm": 0.6520240902900696, + "learning_rate": 3.729412605550431e-06, + "loss": 0.2125, + "step": 15316 + }, + { + "epoch": 1.44292409505193, + "grad_norm": 0.6830780506134033, + "learning_rate": 3.72823641526463e-06, + "loss": 0.225, + "step": 15317 + }, + { + "epoch": 1.443018299145098, + "grad_norm": 0.6395137906074524, + "learning_rate": 3.7270603679847416e-06, + "loss": 0.1844, + "step": 15318 + }, + { + "epoch": 1.4431125032382657, + "grad_norm": 0.7229591608047485, + "learning_rate": 3.725884463737576e-06, + "loss": 0.1989, + "step": 15319 + }, + { + "epoch": 1.4432067073314334, + "grad_norm": 0.7150613069534302, + "learning_rate": 3.7247087025499464e-06, + "loss": 0.2221, + "step": 15320 + }, + { + "epoch": 1.4433009114246014, + "grad_norm": 0.6940883994102478, + "learning_rate": 3.723533084448667e-06, + "loss": 0.2131, + "step": 15321 + }, + { + "epoch": 1.4433951155177693, + "grad_norm": 0.6280433535575867, + "learning_rate": 3.7223576094605407e-06, + "loss": 0.1907, + "step": 15322 + }, + { + "epoch": 1.443489319610937, + "grad_norm": 0.6110469102859497, + "learning_rate": 3.721182277612363e-06, + "loss": 0.1655, + "step": 15323 + }, + { + "epoch": 1.4435835237041048, + "grad_norm": 0.5971839427947998, + "learning_rate": 3.720007088930945e-06, + "loss": 0.1979, + "step": 15324 + }, + { + "epoch": 1.4436777277972728, + "grad_norm": 0.7039603590965271, + "learning_rate": 3.7188320434430793e-06, + "loss": 0.2198, + "step": 15325 + }, + { + "epoch": 1.4437719318904407, + "grad_norm": 0.6401283740997314, + "learning_rate": 3.717657141175549e-06, + "loss": 0.1962, + "step": 15326 + }, + { + "epoch": 1.4438661359836085, + "grad_norm": 0.6265789270401001, + "learning_rate": 3.716482382155159e-06, + "loss": 0.1759, + "step": 15327 + }, + { + "epoch": 1.4439603400767762, + "grad_norm": 0.7420952320098877, + "learning_rate": 3.7153077664086877e-06, + "loss": 0.2096, + "step": 15328 + }, + { + "epoch": 1.4440545441699442, + "grad_norm": 0.6508326530456543, + "learning_rate": 3.7141332939629127e-06, + "loss": 0.1857, + "step": 15329 + }, + { + "epoch": 1.4441487482631121, + "grad_norm": 0.632019579410553, + "learning_rate": 3.712958964844626e-06, + "loss": 0.203, + "step": 15330 + }, + { + "epoch": 1.4442429523562799, + "grad_norm": 0.625632107257843, + "learning_rate": 3.7117847790805983e-06, + "loss": 0.2182, + "step": 15331 + }, + { + "epoch": 1.4443371564494476, + "grad_norm": 0.7232470512390137, + "learning_rate": 3.7106107366975995e-06, + "loss": 0.2072, + "step": 15332 + }, + { + "epoch": 1.4444313605426156, + "grad_norm": 0.5715131163597107, + "learning_rate": 3.7094368377224023e-06, + "loss": 0.1758, + "step": 15333 + }, + { + "epoch": 1.4445255646357835, + "grad_norm": 0.7722396850585938, + "learning_rate": 3.7082630821817778e-06, + "loss": 0.2351, + "step": 15334 + }, + { + "epoch": 1.4446197687289513, + "grad_norm": 0.661654531955719, + "learning_rate": 3.7070894701024806e-06, + "loss": 0.2122, + "step": 15335 + }, + { + "epoch": 1.444713972822119, + "grad_norm": 0.6670462489128113, + "learning_rate": 3.705916001511277e-06, + "loss": 0.2032, + "step": 15336 + }, + { + "epoch": 1.444808176915287, + "grad_norm": 0.8102666735649109, + "learning_rate": 3.704742676434925e-06, + "loss": 0.2276, + "step": 15337 + }, + { + "epoch": 1.444902381008455, + "grad_norm": 0.9872673153877258, + "learning_rate": 3.7035694949001733e-06, + "loss": 0.2211, + "step": 15338 + }, + { + "epoch": 1.4449965851016227, + "grad_norm": 0.6364572644233704, + "learning_rate": 3.7023964569337735e-06, + "loss": 0.2019, + "step": 15339 + }, + { + "epoch": 1.4450907891947904, + "grad_norm": 0.5677757859230042, + "learning_rate": 3.701223562562478e-06, + "loss": 0.1901, + "step": 15340 + }, + { + "epoch": 1.4451849932879584, + "grad_norm": 0.6046751737594604, + "learning_rate": 3.700050811813024e-06, + "loss": 0.2002, + "step": 15341 + }, + { + "epoch": 1.4452791973811263, + "grad_norm": 0.629215657711029, + "learning_rate": 3.6988782047121565e-06, + "loss": 0.1929, + "step": 15342 + }, + { + "epoch": 1.445373401474294, + "grad_norm": 0.8797250986099243, + "learning_rate": 3.697705741286608e-06, + "loss": 0.1939, + "step": 15343 + }, + { + "epoch": 1.4454676055674618, + "grad_norm": 0.6953577399253845, + "learning_rate": 3.6965334215631157e-06, + "loss": 0.2147, + "step": 15344 + }, + { + "epoch": 1.4455618096606297, + "grad_norm": 0.6435692310333252, + "learning_rate": 3.695361245568413e-06, + "loss": 0.2092, + "step": 15345 + }, + { + "epoch": 1.4456560137537977, + "grad_norm": 0.6839839220046997, + "learning_rate": 3.69418921332922e-06, + "loss": 0.2239, + "step": 15346 + }, + { + "epoch": 1.4457502178469654, + "grad_norm": 0.6410571336746216, + "learning_rate": 3.69301732487227e-06, + "loss": 0.1996, + "step": 15347 + }, + { + "epoch": 1.4458444219401332, + "grad_norm": 0.6643087863922119, + "learning_rate": 3.6918455802242745e-06, + "loss": 0.1942, + "step": 15348 + }, + { + "epoch": 1.4459386260333011, + "grad_norm": 0.7473112940788269, + "learning_rate": 3.690673979411955e-06, + "loss": 0.2349, + "step": 15349 + }, + { + "epoch": 1.446032830126469, + "grad_norm": 0.6772686243057251, + "learning_rate": 3.6895025224620307e-06, + "loss": 0.1812, + "step": 15350 + }, + { + "epoch": 1.4461270342196368, + "grad_norm": 0.652247965335846, + "learning_rate": 3.6883312094012047e-06, + "loss": 0.2005, + "step": 15351 + }, + { + "epoch": 1.4462212383128046, + "grad_norm": 0.7764046788215637, + "learning_rate": 3.6871600402561914e-06, + "loss": 0.1991, + "step": 15352 + }, + { + "epoch": 1.4463154424059725, + "grad_norm": 0.670874834060669, + "learning_rate": 3.685989015053689e-06, + "loss": 0.1719, + "step": 15353 + }, + { + "epoch": 1.4464096464991405, + "grad_norm": 0.6050533056259155, + "learning_rate": 3.6848181338204025e-06, + "loss": 0.22, + "step": 15354 + }, + { + "epoch": 1.4465038505923082, + "grad_norm": 0.7051668167114258, + "learning_rate": 3.683647396583032e-06, + "loss": 0.2152, + "step": 15355 + }, + { + "epoch": 1.446598054685476, + "grad_norm": 0.6484372019767761, + "learning_rate": 3.6824768033682655e-06, + "loss": 0.2102, + "step": 15356 + }, + { + "epoch": 1.446692258778644, + "grad_norm": 0.6553364396095276, + "learning_rate": 3.681306354202798e-06, + "loss": 0.1903, + "step": 15357 + }, + { + "epoch": 1.446786462871812, + "grad_norm": 0.6942259669303894, + "learning_rate": 3.680136049113322e-06, + "loss": 0.2106, + "step": 15358 + }, + { + "epoch": 1.4468806669649796, + "grad_norm": 0.6313175559043884, + "learning_rate": 3.6789658881265135e-06, + "loss": 0.1863, + "step": 15359 + }, + { + "epoch": 1.4469748710581474, + "grad_norm": 0.6666918992996216, + "learning_rate": 3.6777958712690597e-06, + "loss": 0.1896, + "step": 15360 + }, + { + "epoch": 1.4470690751513153, + "grad_norm": 0.6780277490615845, + "learning_rate": 3.67662599856764e-06, + "loss": 0.1693, + "step": 15361 + }, + { + "epoch": 1.4471632792444833, + "grad_norm": 0.6547707915306091, + "learning_rate": 3.675456270048927e-06, + "loss": 0.2084, + "step": 15362 + }, + { + "epoch": 1.447257483337651, + "grad_norm": 0.7450213432312012, + "learning_rate": 3.6742866857395855e-06, + "loss": 0.2341, + "step": 15363 + }, + { + "epoch": 1.4473516874308188, + "grad_norm": 0.6595832109451294, + "learning_rate": 3.6731172456662967e-06, + "loss": 0.1958, + "step": 15364 + }, + { + "epoch": 1.4474458915239867, + "grad_norm": 0.7072004079818726, + "learning_rate": 3.67194794985572e-06, + "loss": 0.2198, + "step": 15365 + }, + { + "epoch": 1.4475400956171547, + "grad_norm": 0.6067395806312561, + "learning_rate": 3.670778798334509e-06, + "loss": 0.2142, + "step": 15366 + }, + { + "epoch": 1.4476342997103224, + "grad_norm": 0.5847840905189514, + "learning_rate": 3.6696097911293373e-06, + "loss": 0.1829, + "step": 15367 + }, + { + "epoch": 1.4477285038034902, + "grad_norm": 0.6200958490371704, + "learning_rate": 3.668440928266852e-06, + "loss": 0.1959, + "step": 15368 + }, + { + "epoch": 1.4478227078966581, + "grad_norm": 0.7499897480010986, + "learning_rate": 3.667272209773699e-06, + "loss": 0.1951, + "step": 15369 + }, + { + "epoch": 1.447916911989826, + "grad_norm": 0.6599944829940796, + "learning_rate": 3.66610363567654e-06, + "loss": 0.2204, + "step": 15370 + }, + { + "epoch": 1.4480111160829938, + "grad_norm": 0.6325149536132812, + "learning_rate": 3.6649352060020137e-06, + "loss": 0.1736, + "step": 15371 + }, + { + "epoch": 1.4481053201761616, + "grad_norm": 0.6277233958244324, + "learning_rate": 3.663766920776759e-06, + "loss": 0.1954, + "step": 15372 + }, + { + "epoch": 1.4481995242693295, + "grad_norm": 0.6093193888664246, + "learning_rate": 3.6625987800274177e-06, + "loss": 0.1901, + "step": 15373 + }, + { + "epoch": 1.4482937283624975, + "grad_norm": 0.6746008396148682, + "learning_rate": 3.6614307837806283e-06, + "loss": 0.2099, + "step": 15374 + }, + { + "epoch": 1.4483879324556652, + "grad_norm": 0.6880073547363281, + "learning_rate": 3.660262932063017e-06, + "loss": 0.2604, + "step": 15375 + }, + { + "epoch": 1.448482136548833, + "grad_norm": 0.65242600440979, + "learning_rate": 3.6590952249012145e-06, + "loss": 0.1847, + "step": 15376 + }, + { + "epoch": 1.448576340642001, + "grad_norm": 0.7235094308853149, + "learning_rate": 3.657927662321851e-06, + "loss": 0.2094, + "step": 15377 + }, + { + "epoch": 1.4486705447351689, + "grad_norm": 0.663043737411499, + "learning_rate": 3.6567602443515416e-06, + "loss": 0.1809, + "step": 15378 + }, + { + "epoch": 1.4487647488283366, + "grad_norm": 0.6839006543159485, + "learning_rate": 3.655592971016909e-06, + "loss": 0.2093, + "step": 15379 + }, + { + "epoch": 1.4488589529215044, + "grad_norm": 0.7242204546928406, + "learning_rate": 3.654425842344571e-06, + "loss": 0.204, + "step": 15380 + }, + { + "epoch": 1.4489531570146723, + "grad_norm": 0.8180120587348938, + "learning_rate": 3.653258858361135e-06, + "loss": 0.1928, + "step": 15381 + }, + { + "epoch": 1.4490473611078403, + "grad_norm": 0.6022229194641113, + "learning_rate": 3.6520920190932152e-06, + "loss": 0.1854, + "step": 15382 + }, + { + "epoch": 1.449141565201008, + "grad_norm": 0.6465762853622437, + "learning_rate": 3.6509253245674113e-06, + "loss": 0.2374, + "step": 15383 + }, + { + "epoch": 1.4492357692941757, + "grad_norm": 0.6528346538543701, + "learning_rate": 3.6497587748103326e-06, + "loss": 0.207, + "step": 15384 + }, + { + "epoch": 1.4493299733873437, + "grad_norm": 0.6788538694381714, + "learning_rate": 3.6485923698485714e-06, + "loss": 0.1962, + "step": 15385 + }, + { + "epoch": 1.4494241774805117, + "grad_norm": 0.8187673687934875, + "learning_rate": 3.647426109708727e-06, + "loss": 0.2087, + "step": 15386 + }, + { + "epoch": 1.4495183815736794, + "grad_norm": 0.6599162817001343, + "learning_rate": 3.646259994417395e-06, + "loss": 0.2136, + "step": 15387 + }, + { + "epoch": 1.4496125856668471, + "grad_norm": 0.6779152154922485, + "learning_rate": 3.645094024001158e-06, + "loss": 0.2303, + "step": 15388 + }, + { + "epoch": 1.449706789760015, + "grad_norm": 0.6403079032897949, + "learning_rate": 3.6439281984866047e-06, + "loss": 0.1861, + "step": 15389 + }, + { + "epoch": 1.4498009938531828, + "grad_norm": 0.653789222240448, + "learning_rate": 3.6427625179003223e-06, + "loss": 0.1925, + "step": 15390 + }, + { + "epoch": 1.4498951979463508, + "grad_norm": 0.6621856689453125, + "learning_rate": 3.6415969822688824e-06, + "loss": 0.1931, + "step": 15391 + }, + { + "epoch": 1.4499894020395185, + "grad_norm": 0.6642829775810242, + "learning_rate": 3.6404315916188684e-06, + "loss": 0.1967, + "step": 15392 + }, + { + "epoch": 1.4500836061326865, + "grad_norm": 0.6505551934242249, + "learning_rate": 3.6392663459768452e-06, + "loss": 0.1911, + "step": 15393 + }, + { + "epoch": 1.4501778102258542, + "grad_norm": 0.6382884979248047, + "learning_rate": 3.6381012453693874e-06, + "loss": 0.1986, + "step": 15394 + }, + { + "epoch": 1.4502720143190222, + "grad_norm": 0.687646746635437, + "learning_rate": 3.6369362898230633e-06, + "loss": 0.2077, + "step": 15395 + }, + { + "epoch": 1.45036621841219, + "grad_norm": 0.6266944408416748, + "learning_rate": 3.6357714793644283e-06, + "loss": 0.1938, + "step": 15396 + }, + { + "epoch": 1.450460422505358, + "grad_norm": 0.666293203830719, + "learning_rate": 3.6346068140200474e-06, + "loss": 0.1862, + "step": 15397 + }, + { + "epoch": 1.4505546265985256, + "grad_norm": 0.6245224475860596, + "learning_rate": 3.633442293816478e-06, + "loss": 0.1859, + "step": 15398 + }, + { + "epoch": 1.4506488306916936, + "grad_norm": 0.6878665685653687, + "learning_rate": 3.632277918780267e-06, + "loss": 0.1804, + "step": 15399 + }, + { + "epoch": 1.4507430347848613, + "grad_norm": 0.6279546022415161, + "learning_rate": 3.6311136889379674e-06, + "loss": 0.1944, + "step": 15400 + }, + { + "epoch": 1.4508372388780293, + "grad_norm": 0.6689943075180054, + "learning_rate": 3.6299496043161285e-06, + "loss": 0.2103, + "step": 15401 + }, + { + "epoch": 1.450931442971197, + "grad_norm": 0.5821841359138489, + "learning_rate": 3.62878566494129e-06, + "loss": 0.1745, + "step": 15402 + }, + { + "epoch": 1.451025647064365, + "grad_norm": 0.6357420086860657, + "learning_rate": 3.6276218708399858e-06, + "loss": 0.2151, + "step": 15403 + }, + { + "epoch": 1.4511198511575327, + "grad_norm": 0.6925756335258484, + "learning_rate": 3.6264582220387634e-06, + "loss": 0.2016, + "step": 15404 + }, + { + "epoch": 1.4512140552507007, + "grad_norm": 0.6702736616134644, + "learning_rate": 3.625294718564152e-06, + "loss": 0.1967, + "step": 15405 + }, + { + "epoch": 1.4513082593438684, + "grad_norm": 0.6685515642166138, + "learning_rate": 3.624131360442671e-06, + "loss": 0.2129, + "step": 15406 + }, + { + "epoch": 1.4514024634370364, + "grad_norm": 0.6161972880363464, + "learning_rate": 3.622968147700864e-06, + "loss": 0.1789, + "step": 15407 + }, + { + "epoch": 1.4514966675302041, + "grad_norm": 0.6239951848983765, + "learning_rate": 3.621805080365245e-06, + "loss": 0.2033, + "step": 15408 + }, + { + "epoch": 1.451590871623372, + "grad_norm": 0.7480084896087646, + "learning_rate": 3.6206421584623296e-06, + "loss": 0.2225, + "step": 15409 + }, + { + "epoch": 1.4516850757165398, + "grad_norm": 0.7976895570755005, + "learning_rate": 3.6194793820186404e-06, + "loss": 0.199, + "step": 15410 + }, + { + "epoch": 1.4517792798097078, + "grad_norm": 0.6239093542098999, + "learning_rate": 3.618316751060692e-06, + "loss": 0.1986, + "step": 15411 + }, + { + "epoch": 1.4518734839028755, + "grad_norm": 0.6483273506164551, + "learning_rate": 3.6171542656149873e-06, + "loss": 0.1849, + "step": 15412 + }, + { + "epoch": 1.4519676879960435, + "grad_norm": 0.6124324202537537, + "learning_rate": 3.6159919257080366e-06, + "loss": 0.1855, + "step": 15413 + }, + { + "epoch": 1.4520618920892112, + "grad_norm": 0.6394926309585571, + "learning_rate": 3.6148297313663473e-06, + "loss": 0.1806, + "step": 15414 + }, + { + "epoch": 1.4521560961823792, + "grad_norm": 0.592024028301239, + "learning_rate": 3.613667682616411e-06, + "loss": 0.1929, + "step": 15415 + }, + { + "epoch": 1.452250300275547, + "grad_norm": 0.6319133043289185, + "learning_rate": 3.6125057794847286e-06, + "loss": 0.2147, + "step": 15416 + }, + { + "epoch": 1.4523445043687149, + "grad_norm": 0.6954970955848694, + "learning_rate": 3.611344021997796e-06, + "loss": 0.2286, + "step": 15417 + }, + { + "epoch": 1.4524387084618826, + "grad_norm": 0.6587648391723633, + "learning_rate": 3.610182410182096e-06, + "loss": 0.1838, + "step": 15418 + }, + { + "epoch": 1.4525329125550506, + "grad_norm": 0.691964864730835, + "learning_rate": 3.609020944064119e-06, + "loss": 0.2106, + "step": 15419 + }, + { + "epoch": 1.4526271166482183, + "grad_norm": 0.600549042224884, + "learning_rate": 3.6078596236703524e-06, + "loss": 0.1668, + "step": 15420 + }, + { + "epoch": 1.4527213207413863, + "grad_norm": 0.6247270107269287, + "learning_rate": 3.6066984490272684e-06, + "loss": 0.2028, + "step": 15421 + }, + { + "epoch": 1.452815524834554, + "grad_norm": 0.6843393445014954, + "learning_rate": 3.6055374201613503e-06, + "loss": 0.2436, + "step": 15422 + }, + { + "epoch": 1.452909728927722, + "grad_norm": 0.6278879046440125, + "learning_rate": 3.6043765370990657e-06, + "loss": 0.1806, + "step": 15423 + }, + { + "epoch": 1.4530039330208897, + "grad_norm": 0.8175564408302307, + "learning_rate": 3.6032157998668894e-06, + "loss": 0.2068, + "step": 15424 + }, + { + "epoch": 1.4530981371140577, + "grad_norm": 0.7069247961044312, + "learning_rate": 3.602055208491283e-06, + "loss": 0.1962, + "step": 15425 + }, + { + "epoch": 1.4531923412072254, + "grad_norm": 0.7326929569244385, + "learning_rate": 3.6008947629987124e-06, + "loss": 0.2096, + "step": 15426 + }, + { + "epoch": 1.4532865453003934, + "grad_norm": 0.6834288239479065, + "learning_rate": 3.5997344634156405e-06, + "loss": 0.2133, + "step": 15427 + }, + { + "epoch": 1.453380749393561, + "grad_norm": 0.6272591948509216, + "learning_rate": 3.598574309768519e-06, + "loss": 0.1773, + "step": 15428 + }, + { + "epoch": 1.453474953486729, + "grad_norm": 0.8258611559867859, + "learning_rate": 3.5974143020838017e-06, + "loss": 0.2431, + "step": 15429 + }, + { + "epoch": 1.4535691575798968, + "grad_norm": 0.6214420199394226, + "learning_rate": 3.596254440387944e-06, + "loss": 0.1845, + "step": 15430 + }, + { + "epoch": 1.4536633616730648, + "grad_norm": 0.692866861820221, + "learning_rate": 3.595094724707385e-06, + "loss": 0.19, + "step": 15431 + }, + { + "epoch": 1.4537575657662325, + "grad_norm": 0.6115120053291321, + "learning_rate": 3.593935155068575e-06, + "loss": 0.1704, + "step": 15432 + }, + { + "epoch": 1.4538517698594005, + "grad_norm": 0.598218560218811, + "learning_rate": 3.5927757314979485e-06, + "loss": 0.1791, + "step": 15433 + }, + { + "epoch": 1.4539459739525682, + "grad_norm": 0.700184166431427, + "learning_rate": 3.5916164540219435e-06, + "loss": 0.1959, + "step": 15434 + }, + { + "epoch": 1.4540401780457362, + "grad_norm": 0.7386876344680786, + "learning_rate": 3.590457322666997e-06, + "loss": 0.1972, + "step": 15435 + }, + { + "epoch": 1.454134382138904, + "grad_norm": 0.6803439855575562, + "learning_rate": 3.5892983374595335e-06, + "loss": 0.2096, + "step": 15436 + }, + { + "epoch": 1.4542285862320719, + "grad_norm": 0.6819314360618591, + "learning_rate": 3.588139498425981e-06, + "loss": 0.1913, + "step": 15437 + }, + { + "epoch": 1.4543227903252396, + "grad_norm": 0.639312744140625, + "learning_rate": 3.586980805592769e-06, + "loss": 0.1833, + "step": 15438 + }, + { + "epoch": 1.4544169944184075, + "grad_norm": 0.6426438689231873, + "learning_rate": 3.5858222589863077e-06, + "loss": 0.1959, + "step": 15439 + }, + { + "epoch": 1.4545111985115753, + "grad_norm": 0.7659159302711487, + "learning_rate": 3.5846638586330196e-06, + "loss": 0.2239, + "step": 15440 + }, + { + "epoch": 1.4546054026047432, + "grad_norm": 0.7493621110916138, + "learning_rate": 3.583505604559321e-06, + "loss": 0.1819, + "step": 15441 + }, + { + "epoch": 1.454699606697911, + "grad_norm": 0.6186150312423706, + "learning_rate": 3.582347496791616e-06, + "loss": 0.1737, + "step": 15442 + }, + { + "epoch": 1.454793810791079, + "grad_norm": 0.6180188655853271, + "learning_rate": 3.5811895353563073e-06, + "loss": 0.1866, + "step": 15443 + }, + { + "epoch": 1.4548880148842467, + "grad_norm": 0.8320621252059937, + "learning_rate": 3.5800317202798117e-06, + "loss": 0.2005, + "step": 15444 + }, + { + "epoch": 1.4549822189774146, + "grad_norm": 0.6498815417289734, + "learning_rate": 3.578874051588521e-06, + "loss": 0.1834, + "step": 15445 + }, + { + "epoch": 1.4550764230705824, + "grad_norm": 0.6876739859580994, + "learning_rate": 3.5777165293088255e-06, + "loss": 0.252, + "step": 15446 + }, + { + "epoch": 1.4551706271637503, + "grad_norm": 0.5977943539619446, + "learning_rate": 3.5765591534671316e-06, + "loss": 0.1823, + "step": 15447 + }, + { + "epoch": 1.455264831256918, + "grad_norm": 0.6090304255485535, + "learning_rate": 3.575401924089824e-06, + "loss": 0.1784, + "step": 15448 + }, + { + "epoch": 1.455359035350086, + "grad_norm": 0.6334052681922913, + "learning_rate": 3.574244841203285e-06, + "loss": 0.1734, + "step": 15449 + }, + { + "epoch": 1.4554532394432538, + "grad_norm": 0.6220776438713074, + "learning_rate": 3.573087904833901e-06, + "loss": 0.1721, + "step": 15450 + }, + { + "epoch": 1.4555474435364217, + "grad_norm": 0.6752516627311707, + "learning_rate": 3.571931115008055e-06, + "loss": 0.1778, + "step": 15451 + }, + { + "epoch": 1.4556416476295895, + "grad_norm": 0.6394069194793701, + "learning_rate": 3.5707744717521174e-06, + "loss": 0.1843, + "step": 15452 + }, + { + "epoch": 1.4557358517227574, + "grad_norm": 0.6487554311752319, + "learning_rate": 3.5696179750924653e-06, + "loss": 0.198, + "step": 15453 + }, + { + "epoch": 1.4558300558159252, + "grad_norm": 0.8762438893318176, + "learning_rate": 3.5684616250554716e-06, + "loss": 0.2065, + "step": 15454 + }, + { + "epoch": 1.4559242599090931, + "grad_norm": 0.6362253427505493, + "learning_rate": 3.5673054216674965e-06, + "loss": 0.1931, + "step": 15455 + }, + { + "epoch": 1.4560184640022609, + "grad_norm": 1.2721117734909058, + "learning_rate": 3.566149364954905e-06, + "loss": 0.2472, + "step": 15456 + }, + { + "epoch": 1.4561126680954288, + "grad_norm": 0.6947647929191589, + "learning_rate": 3.564993454944062e-06, + "loss": 0.2066, + "step": 15457 + }, + { + "epoch": 1.4562068721885966, + "grad_norm": 0.7357280254364014, + "learning_rate": 3.5638376916613173e-06, + "loss": 0.1993, + "step": 15458 + }, + { + "epoch": 1.4563010762817643, + "grad_norm": 0.655251681804657, + "learning_rate": 3.562682075133026e-06, + "loss": 0.2073, + "step": 15459 + }, + { + "epoch": 1.4563952803749323, + "grad_norm": 0.6016684174537659, + "learning_rate": 3.5615266053855423e-06, + "loss": 0.1954, + "step": 15460 + }, + { + "epoch": 1.4564894844681002, + "grad_norm": 0.6429963111877441, + "learning_rate": 3.5603712824452065e-06, + "loss": 0.1911, + "step": 15461 + }, + { + "epoch": 1.456583688561268, + "grad_norm": 0.6290830373764038, + "learning_rate": 3.559216106338368e-06, + "loss": 0.208, + "step": 15462 + }, + { + "epoch": 1.4566778926544357, + "grad_norm": 0.6305145025253296, + "learning_rate": 3.5580610770913593e-06, + "loss": 0.1744, + "step": 15463 + }, + { + "epoch": 1.4567720967476037, + "grad_norm": 0.6087737679481506, + "learning_rate": 3.556906194730524e-06, + "loss": 0.1798, + "step": 15464 + }, + { + "epoch": 1.4568663008407716, + "grad_norm": 0.6650774478912354, + "learning_rate": 3.5557514592821883e-06, + "loss": 0.2201, + "step": 15465 + }, + { + "epoch": 1.4569605049339394, + "grad_norm": 0.6110175251960754, + "learning_rate": 3.5545968707726864e-06, + "loss": 0.19, + "step": 15466 + }, + { + "epoch": 1.457054709027107, + "grad_norm": 0.6831899285316467, + "learning_rate": 3.5534424292283476e-06, + "loss": 0.2129, + "step": 15467 + }, + { + "epoch": 1.457148913120275, + "grad_norm": 0.6254287958145142, + "learning_rate": 3.5522881346754865e-06, + "loss": 0.1837, + "step": 15468 + }, + { + "epoch": 1.457243117213443, + "grad_norm": 0.6465926766395569, + "learning_rate": 3.5511339871404282e-06, + "loss": 0.1862, + "step": 15469 + }, + { + "epoch": 1.4573373213066108, + "grad_norm": 0.6309822797775269, + "learning_rate": 3.5499799866494912e-06, + "loss": 0.1732, + "step": 15470 + }, + { + "epoch": 1.4574315253997785, + "grad_norm": 0.6424391865730286, + "learning_rate": 3.548826133228983e-06, + "loss": 0.2, + "step": 15471 + }, + { + "epoch": 1.4575257294929465, + "grad_norm": 0.6469354629516602, + "learning_rate": 3.5476724269052187e-06, + "loss": 0.1803, + "step": 15472 + }, + { + "epoch": 1.4576199335861144, + "grad_norm": 0.7723602056503296, + "learning_rate": 3.546518867704499e-06, + "loss": 0.2271, + "step": 15473 + }, + { + "epoch": 1.4577141376792822, + "grad_norm": 0.7271719574928284, + "learning_rate": 3.545365455653129e-06, + "loss": 0.2317, + "step": 15474 + }, + { + "epoch": 1.45780834177245, + "grad_norm": 0.5881963968276978, + "learning_rate": 3.544212190777413e-06, + "loss": 0.2, + "step": 15475 + }, + { + "epoch": 1.4579025458656178, + "grad_norm": 0.5888350605964661, + "learning_rate": 3.5430590731036397e-06, + "loss": 0.1744, + "step": 15476 + }, + { + "epoch": 1.4579967499587858, + "grad_norm": 0.7601636052131653, + "learning_rate": 3.5419061026581046e-06, + "loss": 0.2323, + "step": 15477 + }, + { + "epoch": 1.4580909540519535, + "grad_norm": 0.631746232509613, + "learning_rate": 3.540753279467102e-06, + "loss": 0.1992, + "step": 15478 + }, + { + "epoch": 1.4581851581451213, + "grad_norm": 0.6227158308029175, + "learning_rate": 3.53960060355691e-06, + "loss": 0.1852, + "step": 15479 + }, + { + "epoch": 1.4582793622382892, + "grad_norm": 0.7304214835166931, + "learning_rate": 3.5384480749538163e-06, + "loss": 0.1964, + "step": 15480 + }, + { + "epoch": 1.4583735663314572, + "grad_norm": 0.5746965408325195, + "learning_rate": 3.537295693684102e-06, + "loss": 0.2189, + "step": 15481 + }, + { + "epoch": 1.458467770424625, + "grad_norm": 0.6734116077423096, + "learning_rate": 3.536143459774041e-06, + "loss": 0.1947, + "step": 15482 + }, + { + "epoch": 1.4585619745177927, + "grad_norm": 0.6554303765296936, + "learning_rate": 3.5349913732498984e-06, + "loss": 0.2011, + "step": 15483 + }, + { + "epoch": 1.4586561786109606, + "grad_norm": 0.659274160861969, + "learning_rate": 3.533839434137959e-06, + "loss": 0.1915, + "step": 15484 + }, + { + "epoch": 1.4587503827041286, + "grad_norm": 0.6228361129760742, + "learning_rate": 3.5326876424644798e-06, + "loss": 0.1799, + "step": 15485 + }, + { + "epoch": 1.4588445867972963, + "grad_norm": 0.6653767824172974, + "learning_rate": 3.5315359982557175e-06, + "loss": 0.1954, + "step": 15486 + }, + { + "epoch": 1.458938790890464, + "grad_norm": 0.6754751205444336, + "learning_rate": 3.5303845015379444e-06, + "loss": 0.1894, + "step": 15487 + }, + { + "epoch": 1.459032994983632, + "grad_norm": 0.6641196012496948, + "learning_rate": 3.52923315233741e-06, + "loss": 0.1968, + "step": 15488 + }, + { + "epoch": 1.4591271990768, + "grad_norm": 0.6987813115119934, + "learning_rate": 3.5280819506803645e-06, + "loss": 0.1802, + "step": 15489 + }, + { + "epoch": 1.4592214031699677, + "grad_norm": 0.6381571888923645, + "learning_rate": 3.5269308965930593e-06, + "loss": 0.1725, + "step": 15490 + }, + { + "epoch": 1.4593156072631355, + "grad_norm": 0.9219656586647034, + "learning_rate": 3.525779990101744e-06, + "loss": 0.1928, + "step": 15491 + }, + { + "epoch": 1.4594098113563034, + "grad_norm": 0.6034792065620422, + "learning_rate": 3.5246292312326536e-06, + "loss": 0.1639, + "step": 15492 + }, + { + "epoch": 1.4595040154494714, + "grad_norm": 0.6326768398284912, + "learning_rate": 3.5234786200120306e-06, + "loss": 0.1729, + "step": 15493 + }, + { + "epoch": 1.4595982195426391, + "grad_norm": 0.7265027165412903, + "learning_rate": 3.522328156466116e-06, + "loss": 0.2226, + "step": 15494 + }, + { + "epoch": 1.4596924236358069, + "grad_norm": 0.7184128165245056, + "learning_rate": 3.5211778406211326e-06, + "loss": 0.2039, + "step": 15495 + }, + { + "epoch": 1.4597866277289748, + "grad_norm": 0.6886721849441528, + "learning_rate": 3.5200276725033156e-06, + "loss": 0.2496, + "step": 15496 + }, + { + "epoch": 1.4598808318221428, + "grad_norm": 0.7013434767723083, + "learning_rate": 3.518877652138891e-06, + "loss": 0.1773, + "step": 15497 + }, + { + "epoch": 1.4599750359153105, + "grad_norm": 0.6575307846069336, + "learning_rate": 3.5177277795540763e-06, + "loss": 0.2187, + "step": 15498 + }, + { + "epoch": 1.4600692400084783, + "grad_norm": 0.7372726202011108, + "learning_rate": 3.5165780547750937e-06, + "loss": 0.2015, + "step": 15499 + }, + { + "epoch": 1.4601634441016462, + "grad_norm": 0.6284604668617249, + "learning_rate": 3.515428477828161e-06, + "loss": 0.2035, + "step": 15500 + }, + { + "epoch": 1.4602576481948142, + "grad_norm": 0.6217027306556702, + "learning_rate": 3.5142790487394883e-06, + "loss": 0.1751, + "step": 15501 + }, + { + "epoch": 1.460351852287982, + "grad_norm": 0.6114416122436523, + "learning_rate": 3.5131297675352803e-06, + "loss": 0.2129, + "step": 15502 + }, + { + "epoch": 1.4604460563811497, + "grad_norm": 0.6586828827857971, + "learning_rate": 3.511980634241745e-06, + "loss": 0.1893, + "step": 15503 + }, + { + "epoch": 1.4605402604743176, + "grad_norm": 0.7110599875450134, + "learning_rate": 3.5108316488850892e-06, + "loss": 0.1996, + "step": 15504 + }, + { + "epoch": 1.4606344645674856, + "grad_norm": 0.7150020003318787, + "learning_rate": 3.5096828114915036e-06, + "loss": 0.2008, + "step": 15505 + }, + { + "epoch": 1.4607286686606533, + "grad_norm": 0.7586897015571594, + "learning_rate": 3.508534122087187e-06, + "loss": 0.2009, + "step": 15506 + }, + { + "epoch": 1.460822872753821, + "grad_norm": 0.6721200942993164, + "learning_rate": 3.5073855806983358e-06, + "loss": 0.215, + "step": 15507 + }, + { + "epoch": 1.460917076846989, + "grad_norm": 0.6009416580200195, + "learning_rate": 3.5062371873511315e-06, + "loss": 0.2065, + "step": 15508 + }, + { + "epoch": 1.461011280940157, + "grad_norm": 0.616571843624115, + "learning_rate": 3.5050889420717615e-06, + "loss": 0.1866, + "step": 15509 + }, + { + "epoch": 1.4611054850333247, + "grad_norm": 0.6780067682266235, + "learning_rate": 3.503940844886411e-06, + "loss": 0.2358, + "step": 15510 + }, + { + "epoch": 1.4611996891264925, + "grad_norm": 0.6394216418266296, + "learning_rate": 3.502792895821253e-06, + "loss": 0.181, + "step": 15511 + }, + { + "epoch": 1.4612938932196604, + "grad_norm": 0.6644461750984192, + "learning_rate": 3.5016450949024682e-06, + "loss": 0.2183, + "step": 15512 + }, + { + "epoch": 1.4613880973128284, + "grad_norm": 0.61826491355896, + "learning_rate": 3.500497442156222e-06, + "loss": 0.2063, + "step": 15513 + }, + { + "epoch": 1.461482301405996, + "grad_norm": 0.6185021996498108, + "learning_rate": 3.499349937608685e-06, + "loss": 0.1936, + "step": 15514 + }, + { + "epoch": 1.4615765054991638, + "grad_norm": 0.6552667617797852, + "learning_rate": 3.4982025812860267e-06, + "loss": 0.1884, + "step": 15515 + }, + { + "epoch": 1.4616707095923318, + "grad_norm": 0.6592229604721069, + "learning_rate": 3.497055373214402e-06, + "loss": 0.2048, + "step": 15516 + }, + { + "epoch": 1.4617649136854998, + "grad_norm": 0.699374794960022, + "learning_rate": 3.495908313419971e-06, + "loss": 0.2093, + "step": 15517 + }, + { + "epoch": 1.4618591177786675, + "grad_norm": 0.6737303733825684, + "learning_rate": 3.4947614019288932e-06, + "loss": 0.2065, + "step": 15518 + }, + { + "epoch": 1.4619533218718352, + "grad_norm": 0.7677621245384216, + "learning_rate": 3.493614638767312e-06, + "loss": 0.1853, + "step": 15519 + }, + { + "epoch": 1.4620475259650032, + "grad_norm": 0.6520556211471558, + "learning_rate": 3.4924680239613796e-06, + "loss": 0.1791, + "step": 15520 + }, + { + "epoch": 1.4621417300581712, + "grad_norm": 0.6047016978263855, + "learning_rate": 3.491321557537244e-06, + "loss": 0.1773, + "step": 15521 + }, + { + "epoch": 1.462235934151339, + "grad_norm": 0.5962021350860596, + "learning_rate": 3.490175239521042e-06, + "loss": 0.1791, + "step": 15522 + }, + { + "epoch": 1.4623301382445066, + "grad_norm": 0.6825196146965027, + "learning_rate": 3.4890290699389062e-06, + "loss": 0.2071, + "step": 15523 + }, + { + "epoch": 1.4624243423376746, + "grad_norm": 0.6571821570396423, + "learning_rate": 3.4878830488169836e-06, + "loss": 0.1994, + "step": 15524 + }, + { + "epoch": 1.4625185464308426, + "grad_norm": 0.6083043813705444, + "learning_rate": 3.4867371761813982e-06, + "loss": 0.1825, + "step": 15525 + }, + { + "epoch": 1.4626127505240103, + "grad_norm": 0.6372992992401123, + "learning_rate": 3.4855914520582755e-06, + "loss": 0.2084, + "step": 15526 + }, + { + "epoch": 1.462706954617178, + "grad_norm": 0.6596412062644958, + "learning_rate": 3.484445876473742e-06, + "loss": 0.2024, + "step": 15527 + }, + { + "epoch": 1.462801158710346, + "grad_norm": 0.692679762840271, + "learning_rate": 3.4833004494539224e-06, + "loss": 0.2168, + "step": 15528 + }, + { + "epoch": 1.4628953628035137, + "grad_norm": 0.6300631761550903, + "learning_rate": 3.4821551710249278e-06, + "loss": 0.1813, + "step": 15529 + }, + { + "epoch": 1.4629895668966817, + "grad_norm": 0.7117627859115601, + "learning_rate": 3.4810100412128743e-06, + "loss": 0.194, + "step": 15530 + }, + { + "epoch": 1.4630837709898494, + "grad_norm": 0.6359407901763916, + "learning_rate": 3.479865060043878e-06, + "loss": 0.1849, + "step": 15531 + }, + { + "epoch": 1.4631779750830174, + "grad_norm": 0.7331950068473816, + "learning_rate": 3.478720227544038e-06, + "loss": 0.2259, + "step": 15532 + }, + { + "epoch": 1.4632721791761851, + "grad_norm": 0.6989693641662598, + "learning_rate": 3.477575543739463e-06, + "loss": 0.1972, + "step": 15533 + }, + { + "epoch": 1.463366383269353, + "grad_norm": 0.5889098644256592, + "learning_rate": 3.476431008656256e-06, + "loss": 0.1584, + "step": 15534 + }, + { + "epoch": 1.4634605873625208, + "grad_norm": 0.5969628095626831, + "learning_rate": 3.4752866223205062e-06, + "loss": 0.2111, + "step": 15535 + }, + { + "epoch": 1.4635547914556888, + "grad_norm": 0.6702145338058472, + "learning_rate": 3.4741423847583134e-06, + "loss": 0.1985, + "step": 15536 + }, + { + "epoch": 1.4636489955488565, + "grad_norm": 0.6264545321464539, + "learning_rate": 3.4729982959957697e-06, + "loss": 0.1927, + "step": 15537 + }, + { + "epoch": 1.4637431996420245, + "grad_norm": 0.6307673454284668, + "learning_rate": 3.471854356058956e-06, + "loss": 0.1882, + "step": 15538 + }, + { + "epoch": 1.4638374037351922, + "grad_norm": 0.7243738174438477, + "learning_rate": 3.470710564973958e-06, + "loss": 0.1891, + "step": 15539 + }, + { + "epoch": 1.4639316078283602, + "grad_norm": 0.7178024053573608, + "learning_rate": 3.4695669227668603e-06, + "loss": 0.1897, + "step": 15540 + }, + { + "epoch": 1.464025811921528, + "grad_norm": 0.6225504279136658, + "learning_rate": 3.4684234294637377e-06, + "loss": 0.1976, + "step": 15541 + }, + { + "epoch": 1.4641200160146959, + "grad_norm": 1.8272501230239868, + "learning_rate": 3.4672800850906574e-06, + "loss": 0.2049, + "step": 15542 + }, + { + "epoch": 1.4642142201078636, + "grad_norm": 0.6613777279853821, + "learning_rate": 3.4661368896736945e-06, + "loss": 0.1942, + "step": 15543 + }, + { + "epoch": 1.4643084242010316, + "grad_norm": 0.6397969722747803, + "learning_rate": 3.4649938432389184e-06, + "loss": 0.2033, + "step": 15544 + }, + { + "epoch": 1.4644026282941993, + "grad_norm": 0.6330974698066711, + "learning_rate": 3.463850945812387e-06, + "loss": 0.1836, + "step": 15545 + }, + { + "epoch": 1.4644968323873673, + "grad_norm": 0.6854223608970642, + "learning_rate": 3.4627081974201617e-06, + "loss": 0.1958, + "step": 15546 + }, + { + "epoch": 1.464591036480535, + "grad_norm": 0.5868847370147705, + "learning_rate": 3.4615655980883036e-06, + "loss": 0.1762, + "step": 15547 + }, + { + "epoch": 1.464685240573703, + "grad_norm": 0.6223350167274475, + "learning_rate": 3.4604231478428572e-06, + "loss": 0.1743, + "step": 15548 + }, + { + "epoch": 1.4647794446668707, + "grad_norm": 5.741677284240723, + "learning_rate": 3.459280846709877e-06, + "loss": 0.2156, + "step": 15549 + }, + { + "epoch": 1.4648736487600387, + "grad_norm": 0.6658562421798706, + "learning_rate": 3.458138694715413e-06, + "loss": 0.1989, + "step": 15550 + }, + { + "epoch": 1.4649678528532064, + "grad_norm": 0.6318368911743164, + "learning_rate": 3.4569966918855e-06, + "loss": 0.1604, + "step": 15551 + }, + { + "epoch": 1.4650620569463744, + "grad_norm": 0.6312087178230286, + "learning_rate": 3.4558548382461843e-06, + "loss": 0.1975, + "step": 15552 + }, + { + "epoch": 1.465156261039542, + "grad_norm": 0.6483545899391174, + "learning_rate": 3.4547131338234963e-06, + "loss": 0.1997, + "step": 15553 + }, + { + "epoch": 1.46525046513271, + "grad_norm": 0.6374752521514893, + "learning_rate": 3.453571578643472e-06, + "loss": 0.2125, + "step": 15554 + }, + { + "epoch": 1.4653446692258778, + "grad_norm": 0.6611496806144714, + "learning_rate": 3.4524301727321418e-06, + "loss": 0.1718, + "step": 15555 + }, + { + "epoch": 1.4654388733190458, + "grad_norm": 0.6343547701835632, + "learning_rate": 3.451288916115527e-06, + "loss": 0.1852, + "step": 15556 + }, + { + "epoch": 1.4655330774122135, + "grad_norm": 0.7141991853713989, + "learning_rate": 3.4501478088196526e-06, + "loss": 0.2021, + "step": 15557 + }, + { + "epoch": 1.4656272815053815, + "grad_norm": 0.7054789662361145, + "learning_rate": 3.449006850870542e-06, + "loss": 0.2539, + "step": 15558 + }, + { + "epoch": 1.4657214855985492, + "grad_norm": 0.6991019248962402, + "learning_rate": 3.4478660422942024e-06, + "loss": 0.2024, + "step": 15559 + }, + { + "epoch": 1.4658156896917172, + "grad_norm": 0.6906258463859558, + "learning_rate": 3.4467253831166502e-06, + "loss": 0.1809, + "step": 15560 + }, + { + "epoch": 1.465909893784885, + "grad_norm": 0.6545451879501343, + "learning_rate": 3.4455848733638974e-06, + "loss": 0.1724, + "step": 15561 + }, + { + "epoch": 1.4660040978780529, + "grad_norm": 0.7555856704711914, + "learning_rate": 3.4444445130619452e-06, + "loss": 0.1936, + "step": 15562 + }, + { + "epoch": 1.4660983019712206, + "grad_norm": 0.7309272885322571, + "learning_rate": 3.443304302236791e-06, + "loss": 0.1918, + "step": 15563 + }, + { + "epoch": 1.4661925060643886, + "grad_norm": 0.6634095311164856, + "learning_rate": 3.442164240914445e-06, + "loss": 0.2035, + "step": 15564 + }, + { + "epoch": 1.4662867101575563, + "grad_norm": 0.664567232131958, + "learning_rate": 3.441024329120897e-06, + "loss": 0.1951, + "step": 15565 + }, + { + "epoch": 1.4663809142507243, + "grad_norm": 0.6086796522140503, + "learning_rate": 3.4398845668821336e-06, + "loss": 0.2143, + "step": 15566 + }, + { + "epoch": 1.466475118343892, + "grad_norm": 0.6950627565383911, + "learning_rate": 3.438744954224147e-06, + "loss": 0.211, + "step": 15567 + }, + { + "epoch": 1.46656932243706, + "grad_norm": 0.7872999906539917, + "learning_rate": 3.4376054911729273e-06, + "loss": 0.1856, + "step": 15568 + }, + { + "epoch": 1.4666635265302277, + "grad_norm": 0.6647602319717407, + "learning_rate": 3.4364661777544472e-06, + "loss": 0.1879, + "step": 15569 + }, + { + "epoch": 1.4667577306233957, + "grad_norm": 0.6785277724266052, + "learning_rate": 3.4353270139946894e-06, + "loss": 0.1962, + "step": 15570 + }, + { + "epoch": 1.4668519347165634, + "grad_norm": 0.6362676024436951, + "learning_rate": 3.4341879999196316e-06, + "loss": 0.1907, + "step": 15571 + }, + { + "epoch": 1.4669461388097313, + "grad_norm": 0.6003015041351318, + "learning_rate": 3.4330491355552378e-06, + "loss": 0.1985, + "step": 15572 + }, + { + "epoch": 1.467040342902899, + "grad_norm": 0.6760033965110779, + "learning_rate": 3.431910420927479e-06, + "loss": 0.2223, + "step": 15573 + }, + { + "epoch": 1.467134546996067, + "grad_norm": 0.637233555316925, + "learning_rate": 3.430771856062325e-06, + "loss": 0.1917, + "step": 15574 + }, + { + "epoch": 1.4672287510892348, + "grad_norm": 0.5501244068145752, + "learning_rate": 3.4296334409857277e-06, + "loss": 0.1792, + "step": 15575 + }, + { + "epoch": 1.4673229551824027, + "grad_norm": 0.7237962484359741, + "learning_rate": 3.4284951757236506e-06, + "loss": 0.231, + "step": 15576 + }, + { + "epoch": 1.4674171592755705, + "grad_norm": 0.679079532623291, + "learning_rate": 3.427357060302049e-06, + "loss": 0.2195, + "step": 15577 + }, + { + "epoch": 1.4675113633687384, + "grad_norm": 0.6358046531677246, + "learning_rate": 3.426219094746871e-06, + "loss": 0.2156, + "step": 15578 + }, + { + "epoch": 1.4676055674619062, + "grad_norm": 0.7107905745506287, + "learning_rate": 3.4250812790840583e-06, + "loss": 0.224, + "step": 15579 + }, + { + "epoch": 1.4676997715550741, + "grad_norm": 0.7959108948707581, + "learning_rate": 3.4239436133395675e-06, + "loss": 0.1987, + "step": 15580 + }, + { + "epoch": 1.4677939756482419, + "grad_norm": 0.6889875531196594, + "learning_rate": 3.4228060975393318e-06, + "loss": 0.2116, + "step": 15581 + }, + { + "epoch": 1.4678881797414098, + "grad_norm": 1.0199923515319824, + "learning_rate": 3.4216687317092854e-06, + "loss": 0.1957, + "step": 15582 + }, + { + "epoch": 1.4679823838345776, + "grad_norm": 0.6064413189888, + "learning_rate": 3.420531515875366e-06, + "loss": 0.1675, + "step": 15583 + }, + { + "epoch": 1.4680765879277455, + "grad_norm": 0.7486051321029663, + "learning_rate": 3.4193944500635057e-06, + "loss": 0.2061, + "step": 15584 + }, + { + "epoch": 1.4681707920209133, + "grad_norm": 0.7588637471199036, + "learning_rate": 3.418257534299627e-06, + "loss": 0.2225, + "step": 15585 + }, + { + "epoch": 1.4682649961140812, + "grad_norm": 0.6680378317832947, + "learning_rate": 3.417120768609655e-06, + "loss": 0.1869, + "step": 15586 + }, + { + "epoch": 1.468359200207249, + "grad_norm": 0.6012219190597534, + "learning_rate": 3.4159841530195127e-06, + "loss": 0.1842, + "step": 15587 + }, + { + "epoch": 1.468453404300417, + "grad_norm": 0.7349840402603149, + "learning_rate": 3.4148476875551117e-06, + "loss": 0.2114, + "step": 15588 + }, + { + "epoch": 1.4685476083935847, + "grad_norm": 0.7364935278892517, + "learning_rate": 3.4137113722423677e-06, + "loss": 0.1734, + "step": 15589 + }, + { + "epoch": 1.4686418124867526, + "grad_norm": 0.6966760754585266, + "learning_rate": 3.4125752071071926e-06, + "loss": 0.236, + "step": 15590 + }, + { + "epoch": 1.4687360165799204, + "grad_norm": 0.6806097626686096, + "learning_rate": 3.4114391921754874e-06, + "loss": 0.2071, + "step": 15591 + }, + { + "epoch": 1.4688302206730883, + "grad_norm": 0.6912007927894592, + "learning_rate": 3.4103033274731624e-06, + "loss": 0.1879, + "step": 15592 + }, + { + "epoch": 1.468924424766256, + "grad_norm": 0.677087128162384, + "learning_rate": 3.4091676130261077e-06, + "loss": 0.2074, + "step": 15593 + }, + { + "epoch": 1.469018628859424, + "grad_norm": 0.6801339983940125, + "learning_rate": 3.408032048860226e-06, + "loss": 0.1929, + "step": 15594 + }, + { + "epoch": 1.4691128329525918, + "grad_norm": 0.77293461561203, + "learning_rate": 3.40689663500141e-06, + "loss": 0.23, + "step": 15595 + }, + { + "epoch": 1.4692070370457597, + "grad_norm": 0.6681442856788635, + "learning_rate": 3.4057613714755444e-06, + "loss": 0.2075, + "step": 15596 + }, + { + "epoch": 1.4693012411389275, + "grad_norm": 0.6287442445755005, + "learning_rate": 3.4046262583085188e-06, + "loss": 0.1815, + "step": 15597 + }, + { + "epoch": 1.4693954452320952, + "grad_norm": 0.6025934219360352, + "learning_rate": 3.4034912955262167e-06, + "loss": 0.1653, + "step": 15598 + }, + { + "epoch": 1.4694896493252632, + "grad_norm": 0.6409719586372375, + "learning_rate": 3.4023564831545107e-06, + "loss": 0.1911, + "step": 15599 + }, + { + "epoch": 1.4695838534184311, + "grad_norm": 0.6095578670501709, + "learning_rate": 3.4012218212192816e-06, + "loss": 0.1902, + "step": 15600 + }, + { + "epoch": 1.4696780575115989, + "grad_norm": 0.6680271625518799, + "learning_rate": 3.4000873097464036e-06, + "loss": 0.2042, + "step": 15601 + }, + { + "epoch": 1.4697722616047666, + "grad_norm": 0.6479302048683167, + "learning_rate": 3.3989529487617414e-06, + "loss": 0.2193, + "step": 15602 + }, + { + "epoch": 1.4698664656979346, + "grad_norm": 0.795434296131134, + "learning_rate": 3.3978187382911543e-06, + "loss": 0.191, + "step": 15603 + }, + { + "epoch": 1.4699606697911025, + "grad_norm": 0.69087815284729, + "learning_rate": 3.396684678360517e-06, + "loss": 0.2474, + "step": 15604 + }, + { + "epoch": 1.4700548738842703, + "grad_norm": 0.6540675163269043, + "learning_rate": 3.395550768995681e-06, + "loss": 0.1983, + "step": 15605 + }, + { + "epoch": 1.470149077977438, + "grad_norm": 0.6747296452522278, + "learning_rate": 3.3944170102224983e-06, + "loss": 0.2376, + "step": 15606 + }, + { + "epoch": 1.470243282070606, + "grad_norm": 0.6314950585365295, + "learning_rate": 3.3932834020668236e-06, + "loss": 0.1864, + "step": 15607 + }, + { + "epoch": 1.470337486163774, + "grad_norm": 0.6356666684150696, + "learning_rate": 3.392149944554508e-06, + "loss": 0.1986, + "step": 15608 + }, + { + "epoch": 1.4704316902569416, + "grad_norm": 0.6738851070404053, + "learning_rate": 3.3910166377113894e-06, + "loss": 0.2308, + "step": 15609 + }, + { + "epoch": 1.4705258943501094, + "grad_norm": 0.6042155623435974, + "learning_rate": 3.389883481563312e-06, + "loss": 0.1921, + "step": 15610 + }, + { + "epoch": 1.4706200984432773, + "grad_norm": 0.6968407034873962, + "learning_rate": 3.3887504761361178e-06, + "loss": 0.2222, + "step": 15611 + }, + { + "epoch": 1.4707143025364453, + "grad_norm": 0.6901258230209351, + "learning_rate": 3.3876176214556345e-06, + "loss": 0.204, + "step": 15612 + }, + { + "epoch": 1.470808506629613, + "grad_norm": 0.6558237671852112, + "learning_rate": 3.3864849175476957e-06, + "loss": 0.1914, + "step": 15613 + }, + { + "epoch": 1.4709027107227808, + "grad_norm": 0.639708399772644, + "learning_rate": 3.3853523644381314e-06, + "loss": 0.1884, + "step": 15614 + }, + { + "epoch": 1.4709969148159487, + "grad_norm": 0.6424216032028198, + "learning_rate": 3.3842199621527593e-06, + "loss": 0.1792, + "step": 15615 + }, + { + "epoch": 1.4710911189091167, + "grad_norm": 0.6951286196708679, + "learning_rate": 3.3830877107174042e-06, + "loss": 0.1884, + "step": 15616 + }, + { + "epoch": 1.4711853230022844, + "grad_norm": 0.6806188225746155, + "learning_rate": 3.3819556101578853e-06, + "loss": 0.2069, + "step": 15617 + }, + { + "epoch": 1.4712795270954522, + "grad_norm": 0.6278419494628906, + "learning_rate": 3.3808236605000143e-06, + "loss": 0.1935, + "step": 15618 + }, + { + "epoch": 1.4713737311886201, + "grad_norm": 0.6258873343467712, + "learning_rate": 3.379691861769594e-06, + "loss": 0.1801, + "step": 15619 + }, + { + "epoch": 1.471467935281788, + "grad_norm": 0.6044372320175171, + "learning_rate": 3.3785602139924432e-06, + "loss": 0.1706, + "step": 15620 + }, + { + "epoch": 1.4715621393749558, + "grad_norm": 0.6459899544715881, + "learning_rate": 3.377428717194361e-06, + "loss": 0.1998, + "step": 15621 + }, + { + "epoch": 1.4716563434681236, + "grad_norm": 0.6790457367897034, + "learning_rate": 3.3762973714011426e-06, + "loss": 0.2171, + "step": 15622 + }, + { + "epoch": 1.4717505475612915, + "grad_norm": 0.6954322457313538, + "learning_rate": 3.375166176638588e-06, + "loss": 0.205, + "step": 15623 + }, + { + "epoch": 1.4718447516544595, + "grad_norm": 0.5843218564987183, + "learning_rate": 3.374035132932493e-06, + "loss": 0.1933, + "step": 15624 + }, + { + "epoch": 1.4719389557476272, + "grad_norm": 0.6136788725852966, + "learning_rate": 3.37290424030864e-06, + "loss": 0.1785, + "step": 15625 + }, + { + "epoch": 1.472033159840795, + "grad_norm": 0.646652340888977, + "learning_rate": 3.371773498792821e-06, + "loss": 0.1969, + "step": 15626 + }, + { + "epoch": 1.472127363933963, + "grad_norm": 0.6401978135108948, + "learning_rate": 3.3706429084108196e-06, + "loss": 0.1831, + "step": 15627 + }, + { + "epoch": 1.472221568027131, + "grad_norm": 0.9398323893547058, + "learning_rate": 3.3695124691884084e-06, + "loss": 0.1983, + "step": 15628 + }, + { + "epoch": 1.4723157721202986, + "grad_norm": 0.6923646330833435, + "learning_rate": 3.368382181151367e-06, + "loss": 0.2101, + "step": 15629 + }, + { + "epoch": 1.4724099762134664, + "grad_norm": 0.6531728506088257, + "learning_rate": 3.367252044325473e-06, + "loss": 0.2026, + "step": 15630 + }, + { + "epoch": 1.4725041803066343, + "grad_norm": 0.6412551999092102, + "learning_rate": 3.366122058736485e-06, + "loss": 0.1883, + "step": 15631 + }, + { + "epoch": 1.4725983843998023, + "grad_norm": 0.6572036743164062, + "learning_rate": 3.3649922244101784e-06, + "loss": 0.1873, + "step": 15632 + }, + { + "epoch": 1.47269258849297, + "grad_norm": 0.6967328190803528, + "learning_rate": 3.3638625413723058e-06, + "loss": 0.2083, + "step": 15633 + }, + { + "epoch": 1.4727867925861378, + "grad_norm": 0.7096132636070251, + "learning_rate": 3.362733009648631e-06, + "loss": 0.2084, + "step": 15634 + }, + { + "epoch": 1.4728809966793057, + "grad_norm": 0.6445586085319519, + "learning_rate": 3.3616036292649113e-06, + "loss": 0.1814, + "step": 15635 + }, + { + "epoch": 1.4729752007724737, + "grad_norm": 0.6005943417549133, + "learning_rate": 3.3604744002468916e-06, + "loss": 0.1743, + "step": 15636 + }, + { + "epoch": 1.4730694048656414, + "grad_norm": 0.7228039503097534, + "learning_rate": 3.359345322620324e-06, + "loss": 0.2059, + "step": 15637 + }, + { + "epoch": 1.4731636089588092, + "grad_norm": 0.661433219909668, + "learning_rate": 3.358216396410956e-06, + "loss": 0.1939, + "step": 15638 + }, + { + "epoch": 1.4732578130519771, + "grad_norm": 0.6859742999076843, + "learning_rate": 3.3570876216445238e-06, + "loss": 0.2036, + "step": 15639 + }, + { + "epoch": 1.473352017145145, + "grad_norm": 0.6108340620994568, + "learning_rate": 3.3559589983467667e-06, + "loss": 0.1735, + "step": 15640 + }, + { + "epoch": 1.4734462212383128, + "grad_norm": 0.6382995247840881, + "learning_rate": 3.354830526543422e-06, + "loss": 0.1681, + "step": 15641 + }, + { + "epoch": 1.4735404253314806, + "grad_norm": 0.6694854497909546, + "learning_rate": 3.35370220626022e-06, + "loss": 0.1964, + "step": 15642 + }, + { + "epoch": 1.4736346294246485, + "grad_norm": 0.6144790053367615, + "learning_rate": 3.352574037522881e-06, + "loss": 0.1782, + "step": 15643 + }, + { + "epoch": 1.4737288335178165, + "grad_norm": 0.6441357135772705, + "learning_rate": 3.3514460203571365e-06, + "loss": 0.2078, + "step": 15644 + }, + { + "epoch": 1.4738230376109842, + "grad_norm": 0.6974300742149353, + "learning_rate": 3.3503181547887066e-06, + "loss": 0.1978, + "step": 15645 + }, + { + "epoch": 1.473917241704152, + "grad_norm": 0.6606546640396118, + "learning_rate": 3.349190440843304e-06, + "loss": 0.1778, + "step": 15646 + }, + { + "epoch": 1.47401144579732, + "grad_norm": 0.6658607721328735, + "learning_rate": 3.348062878546645e-06, + "loss": 0.2211, + "step": 15647 + }, + { + "epoch": 1.4741056498904879, + "grad_norm": 0.6312874555587769, + "learning_rate": 3.346935467924444e-06, + "loss": 0.1869, + "step": 15648 + }, + { + "epoch": 1.4741998539836556, + "grad_norm": 0.7111707329750061, + "learning_rate": 3.345808209002399e-06, + "loss": 0.1978, + "step": 15649 + }, + { + "epoch": 1.4742940580768233, + "grad_norm": 0.6756532788276672, + "learning_rate": 3.3446811018062177e-06, + "loss": 0.1899, + "step": 15650 + }, + { + "epoch": 1.4743882621699913, + "grad_norm": 0.7289779186248779, + "learning_rate": 3.343554146361604e-06, + "loss": 0.1924, + "step": 15651 + }, + { + "epoch": 1.4744824662631593, + "grad_norm": 0.6526416540145874, + "learning_rate": 3.3424273426942467e-06, + "loss": 0.1909, + "step": 15652 + }, + { + "epoch": 1.474576670356327, + "grad_norm": 0.7258555293083191, + "learning_rate": 3.341300690829842e-06, + "loss": 0.2143, + "step": 15653 + }, + { + "epoch": 1.4746708744494947, + "grad_norm": 0.6213929057121277, + "learning_rate": 3.340174190794082e-06, + "loss": 0.1957, + "step": 15654 + }, + { + "epoch": 1.4747650785426627, + "grad_norm": 0.6945509910583496, + "learning_rate": 3.3390478426126473e-06, + "loss": 0.1821, + "step": 15655 + }, + { + "epoch": 1.4748592826358307, + "grad_norm": 0.6115299463272095, + "learning_rate": 3.337921646311223e-06, + "loss": 0.2055, + "step": 15656 + }, + { + "epoch": 1.4749534867289984, + "grad_norm": 0.6380189061164856, + "learning_rate": 3.3367956019154923e-06, + "loss": 0.1935, + "step": 15657 + }, + { + "epoch": 1.4750476908221661, + "grad_norm": 0.6088358163833618, + "learning_rate": 3.3356697094511257e-06, + "loss": 0.1916, + "step": 15658 + }, + { + "epoch": 1.475141894915334, + "grad_norm": 0.8672076463699341, + "learning_rate": 3.334543968943791e-06, + "loss": 0.2283, + "step": 15659 + }, + { + "epoch": 1.475236099008502, + "grad_norm": 0.6404767632484436, + "learning_rate": 3.3334183804191677e-06, + "loss": 0.2344, + "step": 15660 + }, + { + "epoch": 1.4753303031016698, + "grad_norm": 0.5931467413902283, + "learning_rate": 3.332292943902915e-06, + "loss": 0.1667, + "step": 15661 + }, + { + "epoch": 1.4754245071948375, + "grad_norm": 0.641546905040741, + "learning_rate": 3.3311676594206934e-06, + "loss": 0.1975, + "step": 15662 + }, + { + "epoch": 1.4755187112880055, + "grad_norm": 0.7403029799461365, + "learning_rate": 3.3300425269981608e-06, + "loss": 0.2304, + "step": 15663 + }, + { + "epoch": 1.4756129153811732, + "grad_norm": 0.6915774345397949, + "learning_rate": 3.328917546660978e-06, + "loss": 0.1847, + "step": 15664 + }, + { + "epoch": 1.4757071194743412, + "grad_norm": 0.6517333984375, + "learning_rate": 3.327792718434789e-06, + "loss": 0.1939, + "step": 15665 + }, + { + "epoch": 1.475801323567509, + "grad_norm": 0.6195977926254272, + "learning_rate": 3.3266680423452444e-06, + "loss": 0.2018, + "step": 15666 + }, + { + "epoch": 1.4758955276606769, + "grad_norm": 0.6296122670173645, + "learning_rate": 3.325543518417993e-06, + "loss": 0.2061, + "step": 15667 + }, + { + "epoch": 1.4759897317538446, + "grad_norm": 0.656207799911499, + "learning_rate": 3.324419146678668e-06, + "loss": 0.1637, + "step": 15668 + }, + { + "epoch": 1.4760839358470126, + "grad_norm": 0.7016220688819885, + "learning_rate": 3.32329492715291e-06, + "loss": 0.2095, + "step": 15669 + }, + { + "epoch": 1.4761781399401803, + "grad_norm": 0.7365010976791382, + "learning_rate": 3.322170859866357e-06, + "loss": 0.2163, + "step": 15670 + }, + { + "epoch": 1.4762723440333483, + "grad_norm": 0.6614882349967957, + "learning_rate": 3.3210469448446325e-06, + "loss": 0.1864, + "step": 15671 + }, + { + "epoch": 1.476366548126516, + "grad_norm": 0.9303832054138184, + "learning_rate": 3.3199231821133705e-06, + "loss": 0.2342, + "step": 15672 + }, + { + "epoch": 1.476460752219684, + "grad_norm": 0.6372376680374146, + "learning_rate": 3.318799571698187e-06, + "loss": 0.1941, + "step": 15673 + }, + { + "epoch": 1.4765549563128517, + "grad_norm": 0.6549563407897949, + "learning_rate": 3.317676113624706e-06, + "loss": 0.1996, + "step": 15674 + }, + { + "epoch": 1.4766491604060197, + "grad_norm": 0.5790454149246216, + "learning_rate": 3.316552807918548e-06, + "loss": 0.1679, + "step": 15675 + }, + { + "epoch": 1.4767433644991874, + "grad_norm": 0.710728108882904, + "learning_rate": 3.3154296546053175e-06, + "loss": 0.2109, + "step": 15676 + }, + { + "epoch": 1.4768375685923554, + "grad_norm": 0.6472678780555725, + "learning_rate": 3.3143066537106306e-06, + "loss": 0.2079, + "step": 15677 + }, + { + "epoch": 1.4769317726855231, + "grad_norm": 0.6699776649475098, + "learning_rate": 3.313183805260094e-06, + "loss": 0.2014, + "step": 15678 + }, + { + "epoch": 1.477025976778691, + "grad_norm": 0.6860296130180359, + "learning_rate": 3.3120611092793043e-06, + "loss": 0.197, + "step": 15679 + }, + { + "epoch": 1.4771201808718588, + "grad_norm": 0.6158367395401001, + "learning_rate": 3.3109385657938642e-06, + "loss": 0.1856, + "step": 15680 + }, + { + "epoch": 1.4772143849650268, + "grad_norm": 0.7331546545028687, + "learning_rate": 3.3098161748293745e-06, + "loss": 0.2572, + "step": 15681 + }, + { + "epoch": 1.4773085890581945, + "grad_norm": 0.6883033514022827, + "learning_rate": 3.308693936411421e-06, + "loss": 0.1991, + "step": 15682 + }, + { + "epoch": 1.4774027931513625, + "grad_norm": 0.6336426138877869, + "learning_rate": 3.307571850565592e-06, + "loss": 0.2014, + "step": 15683 + }, + { + "epoch": 1.4774969972445302, + "grad_norm": 0.7869249582290649, + "learning_rate": 3.3064499173174734e-06, + "loss": 0.2091, + "step": 15684 + }, + { + "epoch": 1.4775912013376982, + "grad_norm": 0.6841399073600769, + "learning_rate": 3.3053281366926526e-06, + "loss": 0.1912, + "step": 15685 + }, + { + "epoch": 1.477685405430866, + "grad_norm": 0.64698326587677, + "learning_rate": 3.3042065087167008e-06, + "loss": 0.1586, + "step": 15686 + }, + { + "epoch": 1.4777796095240339, + "grad_norm": 0.6012874245643616, + "learning_rate": 3.3030850334151952e-06, + "loss": 0.1657, + "step": 15687 + }, + { + "epoch": 1.4778738136172016, + "grad_norm": 0.6796317100524902, + "learning_rate": 3.3019637108137113e-06, + "loss": 0.2049, + "step": 15688 + }, + { + "epoch": 1.4779680177103696, + "grad_norm": 0.5895856022834778, + "learning_rate": 3.30084254093781e-06, + "loss": 0.191, + "step": 15689 + }, + { + "epoch": 1.4780622218035373, + "grad_norm": 0.5807430148124695, + "learning_rate": 3.29972152381306e-06, + "loss": 0.1844, + "step": 15690 + }, + { + "epoch": 1.4781564258967053, + "grad_norm": 0.913033664226532, + "learning_rate": 3.2986006594650245e-06, + "loss": 0.1654, + "step": 15691 + }, + { + "epoch": 1.478250629989873, + "grad_norm": 0.8326259255409241, + "learning_rate": 3.297479947919253e-06, + "loss": 0.2274, + "step": 15692 + }, + { + "epoch": 1.478344834083041, + "grad_norm": 0.6548287868499756, + "learning_rate": 3.296359389201307e-06, + "loss": 0.222, + "step": 15693 + }, + { + "epoch": 1.4784390381762087, + "grad_norm": 0.6795612573623657, + "learning_rate": 3.295238983336736e-06, + "loss": 0.2183, + "step": 15694 + }, + { + "epoch": 1.4785332422693767, + "grad_norm": 0.7227304577827454, + "learning_rate": 3.2941187303510845e-06, + "loss": 0.1771, + "step": 15695 + }, + { + "epoch": 1.4786274463625444, + "grad_norm": 0.6277872323989868, + "learning_rate": 3.2929986302698913e-06, + "loss": 0.2068, + "step": 15696 + }, + { + "epoch": 1.4787216504557124, + "grad_norm": 0.6779505014419556, + "learning_rate": 3.2918786831187088e-06, + "loss": 0.1899, + "step": 15697 + }, + { + "epoch": 1.47881585454888, + "grad_norm": 0.6374543905258179, + "learning_rate": 3.2907588889230667e-06, + "loss": 0.191, + "step": 15698 + }, + { + "epoch": 1.478910058642048, + "grad_norm": 0.6203227043151855, + "learning_rate": 3.2896392477084905e-06, + "loss": 0.2057, + "step": 15699 + }, + { + "epoch": 1.4790042627352158, + "grad_norm": 0.6810561418533325, + "learning_rate": 3.2885197595005246e-06, + "loss": 0.1882, + "step": 15700 + }, + { + "epoch": 1.4790984668283838, + "grad_norm": 0.6487362384796143, + "learning_rate": 3.287400424324687e-06, + "loss": 0.2098, + "step": 15701 + }, + { + "epoch": 1.4791926709215515, + "grad_norm": 0.6776620149612427, + "learning_rate": 3.2862812422064983e-06, + "loss": 0.1931, + "step": 15702 + }, + { + "epoch": 1.4792868750147194, + "grad_norm": 0.6505196690559387, + "learning_rate": 3.28516221317148e-06, + "loss": 0.1834, + "step": 15703 + }, + { + "epoch": 1.4793810791078872, + "grad_norm": 0.6435573101043701, + "learning_rate": 3.2840433372451506e-06, + "loss": 0.2049, + "step": 15704 + }, + { + "epoch": 1.4794752832010551, + "grad_norm": 0.6023697257041931, + "learning_rate": 3.282924614453017e-06, + "loss": 0.185, + "step": 15705 + }, + { + "epoch": 1.4795694872942229, + "grad_norm": 0.6520904898643494, + "learning_rate": 3.2818060448205902e-06, + "loss": 0.1867, + "step": 15706 + }, + { + "epoch": 1.4796636913873908, + "grad_norm": 0.7034657597541809, + "learning_rate": 3.280687628373379e-06, + "loss": 0.1919, + "step": 15707 + }, + { + "epoch": 1.4797578954805586, + "grad_norm": 0.7249178290367126, + "learning_rate": 3.2795693651368776e-06, + "loss": 0.2048, + "step": 15708 + }, + { + "epoch": 1.4798520995737265, + "grad_norm": 0.6190721392631531, + "learning_rate": 3.2784512551365886e-06, + "loss": 0.2211, + "step": 15709 + }, + { + "epoch": 1.4799463036668943, + "grad_norm": 0.6146898865699768, + "learning_rate": 3.2773332983980087e-06, + "loss": 0.1842, + "step": 15710 + }, + { + "epoch": 1.4800405077600622, + "grad_norm": 0.7510910034179688, + "learning_rate": 3.276215494946624e-06, + "loss": 0.2133, + "step": 15711 + }, + { + "epoch": 1.48013471185323, + "grad_norm": 0.6793009042739868, + "learning_rate": 3.2750978448079276e-06, + "loss": 0.1825, + "step": 15712 + }, + { + "epoch": 1.480228915946398, + "grad_norm": 0.7033307552337646, + "learning_rate": 3.273980348007396e-06, + "loss": 0.1974, + "step": 15713 + }, + { + "epoch": 1.4803231200395657, + "grad_norm": 0.7821509838104248, + "learning_rate": 3.2728630045705166e-06, + "loss": 0.2525, + "step": 15714 + }, + { + "epoch": 1.4804173241327336, + "grad_norm": 0.8932392597198486, + "learning_rate": 3.2717458145227666e-06, + "loss": 0.1852, + "step": 15715 + }, + { + "epoch": 1.4805115282259014, + "grad_norm": 0.7076621651649475, + "learning_rate": 3.270628777889614e-06, + "loss": 0.1849, + "step": 15716 + }, + { + "epoch": 1.4806057323190693, + "grad_norm": 0.7159755229949951, + "learning_rate": 3.269511894696532e-06, + "loss": 0.205, + "step": 15717 + }, + { + "epoch": 1.480699936412237, + "grad_norm": 0.7273610234260559, + "learning_rate": 3.2683951649689914e-06, + "loss": 0.1993, + "step": 15718 + }, + { + "epoch": 1.480794140505405, + "grad_norm": 0.623056948184967, + "learning_rate": 3.2672785887324487e-06, + "loss": 0.1808, + "step": 15719 + }, + { + "epoch": 1.4808883445985728, + "grad_norm": 0.6815441250801086, + "learning_rate": 3.2661621660123666e-06, + "loss": 0.2028, + "step": 15720 + }, + { + "epoch": 1.4809825486917407, + "grad_norm": 0.6188926100730896, + "learning_rate": 3.2650458968342048e-06, + "loss": 0.1581, + "step": 15721 + }, + { + "epoch": 1.4810767527849085, + "grad_norm": 0.6613251566886902, + "learning_rate": 3.263929781223412e-06, + "loss": 0.2107, + "step": 15722 + }, + { + "epoch": 1.4811709568780764, + "grad_norm": 0.6647250056266785, + "learning_rate": 3.2628138192054336e-06, + "loss": 0.2139, + "step": 15723 + }, + { + "epoch": 1.4812651609712442, + "grad_norm": 0.6295896172523499, + "learning_rate": 3.2616980108057204e-06, + "loss": 0.2137, + "step": 15724 + }, + { + "epoch": 1.4813593650644121, + "grad_norm": 0.6682237386703491, + "learning_rate": 3.2605823560497163e-06, + "loss": 0.1959, + "step": 15725 + }, + { + "epoch": 1.4814535691575799, + "grad_norm": 0.6330900192260742, + "learning_rate": 3.259466854962855e-06, + "loss": 0.209, + "step": 15726 + }, + { + "epoch": 1.4815477732507478, + "grad_norm": 0.6337481737136841, + "learning_rate": 3.258351507570573e-06, + "loss": 0.1802, + "step": 15727 + }, + { + "epoch": 1.4816419773439156, + "grad_norm": 0.7027248740196228, + "learning_rate": 3.2572363138983054e-06, + "loss": 0.209, + "step": 15728 + }, + { + "epoch": 1.4817361814370835, + "grad_norm": 0.6419001221656799, + "learning_rate": 3.2561212739714752e-06, + "loss": 0.206, + "step": 15729 + }, + { + "epoch": 1.4818303855302513, + "grad_norm": 0.7860915064811707, + "learning_rate": 3.255006387815509e-06, + "loss": 0.2209, + "step": 15730 + }, + { + "epoch": 1.4819245896234192, + "grad_norm": 0.6406052112579346, + "learning_rate": 3.253891655455833e-06, + "loss": 0.2221, + "step": 15731 + }, + { + "epoch": 1.482018793716587, + "grad_norm": 0.6322653293609619, + "learning_rate": 3.2527770769178558e-06, + "loss": 0.2067, + "step": 15732 + }, + { + "epoch": 1.4821129978097547, + "grad_norm": 0.7294014096260071, + "learning_rate": 3.2516626522269965e-06, + "loss": 0.2085, + "step": 15733 + }, + { + "epoch": 1.4822072019029227, + "grad_norm": 0.8285627365112305, + "learning_rate": 3.250548381408668e-06, + "loss": 0.2149, + "step": 15734 + }, + { + "epoch": 1.4823014059960906, + "grad_norm": 0.6386702656745911, + "learning_rate": 3.249434264488276e-06, + "loss": 0.1966, + "step": 15735 + }, + { + "epoch": 1.4823956100892584, + "grad_norm": 0.6847113966941833, + "learning_rate": 3.2483203014912145e-06, + "loss": 0.2005, + "step": 15736 + }, + { + "epoch": 1.482489814182426, + "grad_norm": 0.7019851207733154, + "learning_rate": 3.2472064924428994e-06, + "loss": 0.211, + "step": 15737 + }, + { + "epoch": 1.482584018275594, + "grad_norm": 0.7889997363090515, + "learning_rate": 3.2460928373687197e-06, + "loss": 0.2056, + "step": 15738 + }, + { + "epoch": 1.482678222368762, + "grad_norm": 0.6844862103462219, + "learning_rate": 3.2449793362940617e-06, + "loss": 0.2051, + "step": 15739 + }, + { + "epoch": 1.4827724264619297, + "grad_norm": 0.6309056282043457, + "learning_rate": 3.243865989244328e-06, + "loss": 0.2041, + "step": 15740 + }, + { + "epoch": 1.4828666305550975, + "grad_norm": 0.5994337201118469, + "learning_rate": 3.242752796244898e-06, + "loss": 0.1808, + "step": 15741 + }, + { + "epoch": 1.4829608346482654, + "grad_norm": 0.5997973084449768, + "learning_rate": 3.2416397573211523e-06, + "loss": 0.1873, + "step": 15742 + }, + { + "epoch": 1.4830550387414334, + "grad_norm": 0.5694478154182434, + "learning_rate": 3.2405268724984706e-06, + "loss": 0.1771, + "step": 15743 + }, + { + "epoch": 1.4831492428346011, + "grad_norm": 0.6496260166168213, + "learning_rate": 3.2394141418022353e-06, + "loss": 0.1973, + "step": 15744 + }, + { + "epoch": 1.4832434469277689, + "grad_norm": 1.0584670305252075, + "learning_rate": 3.2383015652578077e-06, + "loss": 0.1838, + "step": 15745 + }, + { + "epoch": 1.4833376510209368, + "grad_norm": 0.6428386569023132, + "learning_rate": 3.2371891428905623e-06, + "loss": 0.2189, + "step": 15746 + }, + { + "epoch": 1.4834318551141048, + "grad_norm": 0.7382312417030334, + "learning_rate": 3.2360768747258674e-06, + "loss": 0.1986, + "step": 15747 + }, + { + "epoch": 1.4835260592072725, + "grad_norm": 1.1544512510299683, + "learning_rate": 3.2349647607890756e-06, + "loss": 0.1839, + "step": 15748 + }, + { + "epoch": 1.4836202633004403, + "grad_norm": 0.662822961807251, + "learning_rate": 3.2338528011055503e-06, + "loss": 0.167, + "step": 15749 + }, + { + "epoch": 1.4837144673936082, + "grad_norm": 0.636023759841919, + "learning_rate": 3.2327409957006493e-06, + "loss": 0.1873, + "step": 15750 + }, + { + "epoch": 1.4838086714867762, + "grad_norm": 0.7087684273719788, + "learning_rate": 3.231629344599715e-06, + "loss": 0.201, + "step": 15751 + }, + { + "epoch": 1.483902875579944, + "grad_norm": 0.7018948793411255, + "learning_rate": 3.230517847828103e-06, + "loss": 0.192, + "step": 15752 + }, + { + "epoch": 1.4839970796731117, + "grad_norm": 0.6414608955383301, + "learning_rate": 3.22940650541115e-06, + "loss": 0.2144, + "step": 15753 + }, + { + "epoch": 1.4840912837662796, + "grad_norm": 0.7314609289169312, + "learning_rate": 3.228295317374199e-06, + "loss": 0.182, + "step": 15754 + }, + { + "epoch": 1.4841854878594476, + "grad_norm": 0.6438114643096924, + "learning_rate": 3.2271842837425917e-06, + "loss": 0.2136, + "step": 15755 + }, + { + "epoch": 1.4842796919526153, + "grad_norm": 0.662824809551239, + "learning_rate": 3.226073404541652e-06, + "loss": 0.2214, + "step": 15756 + }, + { + "epoch": 1.484373896045783, + "grad_norm": 0.667413055896759, + "learning_rate": 3.224962679796716e-06, + "loss": 0.1954, + "step": 15757 + }, + { + "epoch": 1.484468100138951, + "grad_norm": 0.6730595827102661, + "learning_rate": 3.223852109533112e-06, + "loss": 0.2017, + "step": 15758 + }, + { + "epoch": 1.484562304232119, + "grad_norm": 0.6493986248970032, + "learning_rate": 3.222741693776156e-06, + "loss": 0.1947, + "step": 15759 + }, + { + "epoch": 1.4846565083252867, + "grad_norm": 0.7482509016990662, + "learning_rate": 3.2216314325511744e-06, + "loss": 0.2332, + "step": 15760 + }, + { + "epoch": 1.4847507124184545, + "grad_norm": 0.6473538875579834, + "learning_rate": 3.2205213258834754e-06, + "loss": 0.1908, + "step": 15761 + }, + { + "epoch": 1.4848449165116224, + "grad_norm": 0.6552290916442871, + "learning_rate": 3.219411373798378e-06, + "loss": 0.2022, + "step": 15762 + }, + { + "epoch": 1.4849391206047904, + "grad_norm": 0.6187937259674072, + "learning_rate": 3.2183015763211843e-06, + "loss": 0.1894, + "step": 15763 + }, + { + "epoch": 1.4850333246979581, + "grad_norm": 0.627206027507782, + "learning_rate": 3.217191933477203e-06, + "loss": 0.2141, + "step": 15764 + }, + { + "epoch": 1.4851275287911259, + "grad_norm": 0.6970780491828918, + "learning_rate": 3.2160824452917382e-06, + "loss": 0.2285, + "step": 15765 + }, + { + "epoch": 1.4852217328842938, + "grad_norm": 0.6371577978134155, + "learning_rate": 3.214973111790083e-06, + "loss": 0.1976, + "step": 15766 + }, + { + "epoch": 1.4853159369774618, + "grad_norm": 0.6760706901550293, + "learning_rate": 3.2138639329975328e-06, + "loss": 0.2077, + "step": 15767 + }, + { + "epoch": 1.4854101410706295, + "grad_norm": 0.6337851881980896, + "learning_rate": 3.212754908939384e-06, + "loss": 0.1768, + "step": 15768 + }, + { + "epoch": 1.4855043451637973, + "grad_norm": 1.500899076461792, + "learning_rate": 3.2116460396409165e-06, + "loss": 0.2179, + "step": 15769 + }, + { + "epoch": 1.4855985492569652, + "grad_norm": 0.7386575937271118, + "learning_rate": 3.2105373251274172e-06, + "loss": 0.234, + "step": 15770 + }, + { + "epoch": 1.4856927533501332, + "grad_norm": 0.6634941697120667, + "learning_rate": 3.2094287654241706e-06, + "loss": 0.2026, + "step": 15771 + }, + { + "epoch": 1.485786957443301, + "grad_norm": 0.6990938186645508, + "learning_rate": 3.2083203605564473e-06, + "loss": 0.1851, + "step": 15772 + }, + { + "epoch": 1.4858811615364687, + "grad_norm": 0.6416527032852173, + "learning_rate": 3.2072121105495224e-06, + "loss": 0.1925, + "step": 15773 + }, + { + "epoch": 1.4859753656296366, + "grad_norm": 0.6244597434997559, + "learning_rate": 3.2061040154286706e-06, + "loss": 0.1996, + "step": 15774 + }, + { + "epoch": 1.4860695697228046, + "grad_norm": 0.6215859055519104, + "learning_rate": 3.2049960752191533e-06, + "loss": 0.2081, + "step": 15775 + }, + { + "epoch": 1.4861637738159723, + "grad_norm": 0.7005284428596497, + "learning_rate": 3.2038882899462287e-06, + "loss": 0.2045, + "step": 15776 + }, + { + "epoch": 1.48625797790914, + "grad_norm": 0.6543198823928833, + "learning_rate": 3.2027806596351675e-06, + "loss": 0.1912, + "step": 15777 + }, + { + "epoch": 1.486352182002308, + "grad_norm": 0.752830445766449, + "learning_rate": 3.2016731843112193e-06, + "loss": 0.2148, + "step": 15778 + }, + { + "epoch": 1.486446386095476, + "grad_norm": 0.6643003821372986, + "learning_rate": 3.2005658639996296e-06, + "loss": 0.2081, + "step": 15779 + }, + { + "epoch": 1.4865405901886437, + "grad_norm": 0.6779760718345642, + "learning_rate": 3.1994586987256602e-06, + "loss": 0.1909, + "step": 15780 + }, + { + "epoch": 1.4866347942818114, + "grad_norm": 0.6295567154884338, + "learning_rate": 3.1983516885145503e-06, + "loss": 0.2061, + "step": 15781 + }, + { + "epoch": 1.4867289983749794, + "grad_norm": 0.6216533780097961, + "learning_rate": 3.1972448333915376e-06, + "loss": 0.1962, + "step": 15782 + }, + { + "epoch": 1.4868232024681474, + "grad_norm": 0.7079263925552368, + "learning_rate": 3.196138133381863e-06, + "loss": 0.2288, + "step": 15783 + }, + { + "epoch": 1.486917406561315, + "grad_norm": 0.6711618900299072, + "learning_rate": 3.1950315885107652e-06, + "loss": 0.2099, + "step": 15784 + }, + { + "epoch": 1.4870116106544828, + "grad_norm": 0.6961907744407654, + "learning_rate": 3.193925198803467e-06, + "loss": 0.2168, + "step": 15785 + }, + { + "epoch": 1.4871058147476508, + "grad_norm": 0.6956088542938232, + "learning_rate": 3.1928189642852e-06, + "loss": 0.2104, + "step": 15786 + }, + { + "epoch": 1.4872000188408188, + "grad_norm": 0.6182660460472107, + "learning_rate": 3.191712884981193e-06, + "loss": 0.1768, + "step": 15787 + }, + { + "epoch": 1.4872942229339865, + "grad_norm": 0.5577138066291809, + "learning_rate": 3.1906069609166568e-06, + "loss": 0.1569, + "step": 15788 + }, + { + "epoch": 1.4873884270271542, + "grad_norm": 0.6199938058853149, + "learning_rate": 3.189501192116813e-06, + "loss": 0.1885, + "step": 15789 + }, + { + "epoch": 1.4874826311203222, + "grad_norm": 0.7469103932380676, + "learning_rate": 3.1883955786068776e-06, + "loss": 0.1955, + "step": 15790 + }, + { + "epoch": 1.4875768352134902, + "grad_norm": 0.6799483895301819, + "learning_rate": 3.1872901204120554e-06, + "loss": 0.2181, + "step": 15791 + }, + { + "epoch": 1.487671039306658, + "grad_norm": 0.6947216987609863, + "learning_rate": 3.186184817557556e-06, + "loss": 0.1664, + "step": 15792 + }, + { + "epoch": 1.4877652433998256, + "grad_norm": 0.6201277375221252, + "learning_rate": 3.1850796700685783e-06, + "loss": 0.2054, + "step": 15793 + }, + { + "epoch": 1.4878594474929936, + "grad_norm": 0.6932709217071533, + "learning_rate": 3.183974677970324e-06, + "loss": 0.2093, + "step": 15794 + }, + { + "epoch": 1.4879536515861616, + "grad_norm": 0.6539770364761353, + "learning_rate": 3.182869841287991e-06, + "loss": 0.211, + "step": 15795 + }, + { + "epoch": 1.4880478556793293, + "grad_norm": 0.6298494338989258, + "learning_rate": 3.1817651600467647e-06, + "loss": 0.2015, + "step": 15796 + }, + { + "epoch": 1.488142059772497, + "grad_norm": 0.6212891340255737, + "learning_rate": 3.1806606342718383e-06, + "loss": 0.1465, + "step": 15797 + }, + { + "epoch": 1.488236263865665, + "grad_norm": 0.6836594343185425, + "learning_rate": 3.179556263988398e-06, + "loss": 0.2242, + "step": 15798 + }, + { + "epoch": 1.488330467958833, + "grad_norm": 0.695442259311676, + "learning_rate": 3.178452049221621e-06, + "loss": 0.1824, + "step": 15799 + }, + { + "epoch": 1.4884246720520007, + "grad_norm": 0.7029857039451599, + "learning_rate": 3.1773479899966897e-06, + "loss": 0.2015, + "step": 15800 + }, + { + "epoch": 1.4885188761451684, + "grad_norm": 0.5796753764152527, + "learning_rate": 3.1762440863387723e-06, + "loss": 0.1608, + "step": 15801 + }, + { + "epoch": 1.4886130802383364, + "grad_norm": 0.7375254034996033, + "learning_rate": 3.175140338273046e-06, + "loss": 0.2268, + "step": 15802 + }, + { + "epoch": 1.4887072843315041, + "grad_norm": 0.7068525552749634, + "learning_rate": 3.174036745824671e-06, + "loss": 0.188, + "step": 15803 + }, + { + "epoch": 1.488801488424672, + "grad_norm": 0.7670924663543701, + "learning_rate": 3.1729333090188153e-06, + "loss": 0.2065, + "step": 15804 + }, + { + "epoch": 1.4888956925178398, + "grad_norm": 0.6729752421379089, + "learning_rate": 3.1718300278806424e-06, + "loss": 0.1976, + "step": 15805 + }, + { + "epoch": 1.4889898966110078, + "grad_norm": 0.7018054723739624, + "learning_rate": 3.1707269024353003e-06, + "loss": 0.2203, + "step": 15806 + }, + { + "epoch": 1.4890841007041755, + "grad_norm": 0.6012523174285889, + "learning_rate": 3.169623932707947e-06, + "loss": 0.2016, + "step": 15807 + }, + { + "epoch": 1.4891783047973435, + "grad_norm": 0.643194854259491, + "learning_rate": 3.1685211187237354e-06, + "loss": 0.1731, + "step": 15808 + }, + { + "epoch": 1.4892725088905112, + "grad_norm": 0.6318308711051941, + "learning_rate": 3.167418460507803e-06, + "loss": 0.1695, + "step": 15809 + }, + { + "epoch": 1.4893667129836792, + "grad_norm": 0.6564774513244629, + "learning_rate": 3.1663159580852976e-06, + "loss": 0.2038, + "step": 15810 + }, + { + "epoch": 1.489460917076847, + "grad_norm": 0.6514729857444763, + "learning_rate": 3.1652136114813592e-06, + "loss": 0.1919, + "step": 15811 + }, + { + "epoch": 1.4895551211700149, + "grad_norm": 0.6297662854194641, + "learning_rate": 3.164111420721121e-06, + "loss": 0.1821, + "step": 15812 + }, + { + "epoch": 1.4896493252631826, + "grad_norm": 0.6865100264549255, + "learning_rate": 3.1630093858297074e-06, + "loss": 0.1833, + "step": 15813 + }, + { + "epoch": 1.4897435293563506, + "grad_norm": 0.6756977438926697, + "learning_rate": 3.1619075068322603e-06, + "loss": 0.1944, + "step": 15814 + }, + { + "epoch": 1.4898377334495183, + "grad_norm": 0.6531872153282166, + "learning_rate": 3.1608057837538976e-06, + "loss": 0.172, + "step": 15815 + }, + { + "epoch": 1.4899319375426863, + "grad_norm": 0.7377223372459412, + "learning_rate": 3.1597042166197334e-06, + "loss": 0.179, + "step": 15816 + }, + { + "epoch": 1.490026141635854, + "grad_norm": 0.6673306822776794, + "learning_rate": 3.158602805454898e-06, + "loss": 0.1883, + "step": 15817 + }, + { + "epoch": 1.490120345729022, + "grad_norm": 0.5960646867752075, + "learning_rate": 3.1575015502844995e-06, + "loss": 0.1905, + "step": 15818 + }, + { + "epoch": 1.4902145498221897, + "grad_norm": 0.6355289220809937, + "learning_rate": 3.156400451133641e-06, + "loss": 0.1965, + "step": 15819 + }, + { + "epoch": 1.4903087539153577, + "grad_norm": 0.7027295827865601, + "learning_rate": 3.1552995080274418e-06, + "loss": 0.2007, + "step": 15820 + }, + { + "epoch": 1.4904029580085254, + "grad_norm": 0.6796391606330872, + "learning_rate": 3.154198720991001e-06, + "loss": 0.185, + "step": 15821 + }, + { + "epoch": 1.4904971621016934, + "grad_norm": 0.6522025465965271, + "learning_rate": 3.1530980900494125e-06, + "loss": 0.2083, + "step": 15822 + }, + { + "epoch": 1.490591366194861, + "grad_norm": 0.6371049284934998, + "learning_rate": 3.1519976152277765e-06, + "loss": 0.1548, + "step": 15823 + }, + { + "epoch": 1.490685570288029, + "grad_norm": 0.6852626800537109, + "learning_rate": 3.1508972965511886e-06, + "loss": 0.1886, + "step": 15824 + }, + { + "epoch": 1.4907797743811968, + "grad_norm": 0.7229998111724854, + "learning_rate": 3.149797134044731e-06, + "loss": 0.198, + "step": 15825 + }, + { + "epoch": 1.4908739784743648, + "grad_norm": 0.6964998841285706, + "learning_rate": 3.148697127733493e-06, + "loss": 0.211, + "step": 15826 + }, + { + "epoch": 1.4909681825675325, + "grad_norm": 0.7541191577911377, + "learning_rate": 3.14759727764256e-06, + "loss": 0.192, + "step": 15827 + }, + { + "epoch": 1.4910623866607005, + "grad_norm": 0.7376460433006287, + "learning_rate": 3.1464975837970035e-06, + "loss": 0.2077, + "step": 15828 + }, + { + "epoch": 1.4911565907538682, + "grad_norm": 0.6410512924194336, + "learning_rate": 3.1453980462219e-06, + "loss": 0.1933, + "step": 15829 + }, + { + "epoch": 1.4912507948470362, + "grad_norm": 0.6522585153579712, + "learning_rate": 3.1442986649423266e-06, + "loss": 0.2313, + "step": 15830 + }, + { + "epoch": 1.491344998940204, + "grad_norm": 0.6673697829246521, + "learning_rate": 3.143199439983342e-06, + "loss": 0.1832, + "step": 15831 + }, + { + "epoch": 1.4914392030333719, + "grad_norm": 0.6484084725379944, + "learning_rate": 3.1421003713700184e-06, + "loss": 0.1725, + "step": 15832 + }, + { + "epoch": 1.4915334071265396, + "grad_norm": 0.6711766123771667, + "learning_rate": 3.1410014591274086e-06, + "loss": 0.2068, + "step": 15833 + }, + { + "epoch": 1.4916276112197075, + "grad_norm": 0.6606464982032776, + "learning_rate": 3.139902703280573e-06, + "loss": 0.179, + "step": 15834 + }, + { + "epoch": 1.4917218153128753, + "grad_norm": 0.6559191346168518, + "learning_rate": 3.138804103854568e-06, + "loss": 0.2205, + "step": 15835 + }, + { + "epoch": 1.4918160194060432, + "grad_norm": 0.7219986319541931, + "learning_rate": 3.137705660874438e-06, + "loss": 0.2169, + "step": 15836 + }, + { + "epoch": 1.491910223499211, + "grad_norm": 0.7164807915687561, + "learning_rate": 3.1366073743652313e-06, + "loss": 0.245, + "step": 15837 + }, + { + "epoch": 1.492004427592379, + "grad_norm": 0.6560298204421997, + "learning_rate": 3.1355092443519942e-06, + "loss": 0.222, + "step": 15838 + }, + { + "epoch": 1.4920986316855467, + "grad_norm": 0.6005601286888123, + "learning_rate": 3.1344112708597596e-06, + "loss": 0.1816, + "step": 15839 + }, + { + "epoch": 1.4921928357787146, + "grad_norm": 0.6249958872795105, + "learning_rate": 3.133313453913569e-06, + "loss": 0.1985, + "step": 15840 + }, + { + "epoch": 1.4922870398718824, + "grad_norm": 0.6505100727081299, + "learning_rate": 3.1322157935384477e-06, + "loss": 0.2066, + "step": 15841 + }, + { + "epoch": 1.4923812439650503, + "grad_norm": 0.7956879138946533, + "learning_rate": 3.1311182897594304e-06, + "loss": 0.2042, + "step": 15842 + }, + { + "epoch": 1.492475448058218, + "grad_norm": 0.6278946399688721, + "learning_rate": 3.130020942601536e-06, + "loss": 0.1901, + "step": 15843 + }, + { + "epoch": 1.492569652151386, + "grad_norm": 0.6191295981407166, + "learning_rate": 3.1289237520897885e-06, + "loss": 0.1918, + "step": 15844 + }, + { + "epoch": 1.4926638562445538, + "grad_norm": 0.6507278084754944, + "learning_rate": 3.1278267182492107e-06, + "loss": 0.1757, + "step": 15845 + }, + { + "epoch": 1.4927580603377217, + "grad_norm": 0.656032919883728, + "learning_rate": 3.1267298411048066e-06, + "loss": 0.2026, + "step": 15846 + }, + { + "epoch": 1.4928522644308895, + "grad_norm": 0.6542507410049438, + "learning_rate": 3.1256331206815926e-06, + "loss": 0.2006, + "step": 15847 + }, + { + "epoch": 1.4929464685240574, + "grad_norm": 0.6375792622566223, + "learning_rate": 3.124536557004578e-06, + "loss": 0.2012, + "step": 15848 + }, + { + "epoch": 1.4930406726172252, + "grad_norm": 0.6325379014015198, + "learning_rate": 3.12344015009876e-06, + "loss": 0.2118, + "step": 15849 + }, + { + "epoch": 1.4931348767103931, + "grad_norm": 0.6247312426567078, + "learning_rate": 3.1223438999891408e-06, + "loss": 0.1976, + "step": 15850 + }, + { + "epoch": 1.4932290808035609, + "grad_norm": 0.6271814703941345, + "learning_rate": 3.12124780670072e-06, + "loss": 0.1816, + "step": 15851 + }, + { + "epoch": 1.4933232848967288, + "grad_norm": 0.7313740253448486, + "learning_rate": 3.120151870258489e-06, + "loss": 0.205, + "step": 15852 + }, + { + "epoch": 1.4934174889898966, + "grad_norm": 0.6758707761764526, + "learning_rate": 3.119056090687428e-06, + "loss": 0.1929, + "step": 15853 + }, + { + "epoch": 1.4935116930830645, + "grad_norm": 0.6302174925804138, + "learning_rate": 3.1179604680125363e-06, + "loss": 0.1889, + "step": 15854 + }, + { + "epoch": 1.4936058971762323, + "grad_norm": 0.638113796710968, + "learning_rate": 3.1168650022587885e-06, + "loss": 0.1952, + "step": 15855 + }, + { + "epoch": 1.4937001012694002, + "grad_norm": 0.6867721676826477, + "learning_rate": 3.1157696934511572e-06, + "loss": 0.2184, + "step": 15856 + }, + { + "epoch": 1.493794305362568, + "grad_norm": 0.696965217590332, + "learning_rate": 3.1146745416146307e-06, + "loss": 0.196, + "step": 15857 + }, + { + "epoch": 1.493888509455736, + "grad_norm": 0.599137008190155, + "learning_rate": 3.1135795467741736e-06, + "loss": 0.1793, + "step": 15858 + }, + { + "epoch": 1.4939827135489037, + "grad_norm": 0.6748884320259094, + "learning_rate": 3.112484708954745e-06, + "loss": 0.1992, + "step": 15859 + }, + { + "epoch": 1.4940769176420716, + "grad_norm": 0.6321108937263489, + "learning_rate": 3.1113900281813237e-06, + "loss": 0.1855, + "step": 15860 + }, + { + "epoch": 1.4941711217352394, + "grad_norm": 0.6798797845840454, + "learning_rate": 3.110295504478864e-06, + "loss": 0.2328, + "step": 15861 + }, + { + "epoch": 1.4942653258284073, + "grad_norm": 0.6549807786941528, + "learning_rate": 3.1092011378723173e-06, + "loss": 0.1873, + "step": 15862 + }, + { + "epoch": 1.494359529921575, + "grad_norm": 0.7312194108963013, + "learning_rate": 3.1081069283866427e-06, + "loss": 0.2296, + "step": 15863 + }, + { + "epoch": 1.494453734014743, + "grad_norm": 0.625159740447998, + "learning_rate": 3.107012876046791e-06, + "loss": 0.1739, + "step": 15864 + }, + { + "epoch": 1.4945479381079108, + "grad_norm": 0.6311237812042236, + "learning_rate": 3.1059189808777036e-06, + "loss": 0.1682, + "step": 15865 + }, + { + "epoch": 1.4946421422010787, + "grad_norm": 0.6616176962852478, + "learning_rate": 3.1048252429043248e-06, + "loss": 0.1943, + "step": 15866 + }, + { + "epoch": 1.4947363462942465, + "grad_norm": 0.7420802712440491, + "learning_rate": 3.1037316621515976e-06, + "loss": 0.2159, + "step": 15867 + }, + { + "epoch": 1.4948305503874144, + "grad_norm": 0.5542891025543213, + "learning_rate": 3.10263823864445e-06, + "loss": 0.1802, + "step": 15868 + }, + { + "epoch": 1.4949247544805822, + "grad_norm": 0.6131301522254944, + "learning_rate": 3.1015449724078184e-06, + "loss": 0.1833, + "step": 15869 + }, + { + "epoch": 1.4950189585737501, + "grad_norm": 0.6341503262519836, + "learning_rate": 3.1004518634666323e-06, + "loss": 0.2234, + "step": 15870 + }, + { + "epoch": 1.4951131626669178, + "grad_norm": 0.7000523805618286, + "learning_rate": 3.099358911845811e-06, + "loss": 0.1884, + "step": 15871 + }, + { + "epoch": 1.4952073667600856, + "grad_norm": 0.6973499655723572, + "learning_rate": 3.098266117570282e-06, + "loss": 0.2032, + "step": 15872 + }, + { + "epoch": 1.4953015708532535, + "grad_norm": 1.0040256977081299, + "learning_rate": 3.0971734806649566e-06, + "loss": 0.1987, + "step": 15873 + }, + { + "epoch": 1.4953957749464215, + "grad_norm": 0.6852333545684814, + "learning_rate": 3.0960810011547503e-06, + "loss": 0.2058, + "step": 15874 + }, + { + "epoch": 1.4954899790395892, + "grad_norm": 0.6796458959579468, + "learning_rate": 3.0949886790645788e-06, + "loss": 0.2004, + "step": 15875 + }, + { + "epoch": 1.495584183132757, + "grad_norm": 0.6131322979927063, + "learning_rate": 3.09389651441934e-06, + "loss": 0.1955, + "step": 15876 + }, + { + "epoch": 1.495678387225925, + "grad_norm": 0.6436735391616821, + "learning_rate": 3.092804507243945e-06, + "loss": 0.2226, + "step": 15877 + }, + { + "epoch": 1.495772591319093, + "grad_norm": 0.6957037448883057, + "learning_rate": 3.091712657563285e-06, + "loss": 0.199, + "step": 15878 + }, + { + "epoch": 1.4958667954122606, + "grad_norm": 0.6697835922241211, + "learning_rate": 3.0906209654022613e-06, + "loss": 0.2007, + "step": 15879 + }, + { + "epoch": 1.4959609995054284, + "grad_norm": 0.6514902710914612, + "learning_rate": 3.0895294307857684e-06, + "loss": 0.1834, + "step": 15880 + }, + { + "epoch": 1.4960552035985963, + "grad_norm": 0.649848461151123, + "learning_rate": 3.0884380537386883e-06, + "loss": 0.2188, + "step": 15881 + }, + { + "epoch": 1.4961494076917643, + "grad_norm": 0.6319318413734436, + "learning_rate": 3.0873468342859125e-06, + "loss": 0.2028, + "step": 15882 + }, + { + "epoch": 1.496243611784932, + "grad_norm": 0.7015264630317688, + "learning_rate": 3.086255772452317e-06, + "loss": 0.1956, + "step": 15883 + }, + { + "epoch": 1.4963378158780998, + "grad_norm": 0.6468680500984192, + "learning_rate": 3.085164868262781e-06, + "loss": 0.1902, + "step": 15884 + }, + { + "epoch": 1.4964320199712677, + "grad_norm": 0.655238926410675, + "learning_rate": 3.0840741217421845e-06, + "loss": 0.2034, + "step": 15885 + }, + { + "epoch": 1.4965262240644357, + "grad_norm": 0.7291893362998962, + "learning_rate": 3.082983532915389e-06, + "loss": 0.1998, + "step": 15886 + }, + { + "epoch": 1.4966204281576034, + "grad_norm": 0.642575204372406, + "learning_rate": 3.0818931018072672e-06, + "loss": 0.1897, + "step": 15887 + }, + { + "epoch": 1.4967146322507712, + "grad_norm": 0.6005973219871521, + "learning_rate": 3.080802828442685e-06, + "loss": 0.1803, + "step": 15888 + }, + { + "epoch": 1.4968088363439391, + "grad_norm": 0.6991351842880249, + "learning_rate": 3.0797127128464966e-06, + "loss": 0.1731, + "step": 15889 + }, + { + "epoch": 1.496903040437107, + "grad_norm": 0.6910436153411865, + "learning_rate": 3.07862275504356e-06, + "loss": 0.2079, + "step": 15890 + }, + { + "epoch": 1.4969972445302748, + "grad_norm": 0.6432166695594788, + "learning_rate": 3.077532955058732e-06, + "loss": 0.1981, + "step": 15891 + }, + { + "epoch": 1.4970914486234426, + "grad_norm": 0.6242793798446655, + "learning_rate": 3.076443312916858e-06, + "loss": 0.1958, + "step": 15892 + }, + { + "epoch": 1.4971856527166105, + "grad_norm": 0.5713182091712952, + "learning_rate": 3.0753538286427773e-06, + "loss": 0.1909, + "step": 15893 + }, + { + "epoch": 1.4972798568097785, + "grad_norm": 0.6838253140449524, + "learning_rate": 3.074264502261346e-06, + "loss": 0.1997, + "step": 15894 + }, + { + "epoch": 1.4973740609029462, + "grad_norm": 0.6527348756790161, + "learning_rate": 3.0731753337973945e-06, + "loss": 0.2284, + "step": 15895 + }, + { + "epoch": 1.497468264996114, + "grad_norm": 0.6556838154792786, + "learning_rate": 3.0720863232757514e-06, + "loss": 0.2199, + "step": 15896 + }, + { + "epoch": 1.497562469089282, + "grad_norm": 0.6978013515472412, + "learning_rate": 3.07099747072126e-06, + "loss": 0.1974, + "step": 15897 + }, + { + "epoch": 1.4976566731824499, + "grad_norm": 0.5855468511581421, + "learning_rate": 3.069908776158743e-06, + "loss": 0.1787, + "step": 15898 + }, + { + "epoch": 1.4977508772756176, + "grad_norm": 0.6803790926933289, + "learning_rate": 3.0688202396130172e-06, + "loss": 0.1959, + "step": 15899 + }, + { + "epoch": 1.4978450813687854, + "grad_norm": 0.6151381731033325, + "learning_rate": 3.067731861108916e-06, + "loss": 0.1925, + "step": 15900 + }, + { + "epoch": 1.4979392854619533, + "grad_norm": 0.6695271730422974, + "learning_rate": 3.0666436406712485e-06, + "loss": 0.1757, + "step": 15901 + }, + { + "epoch": 1.4980334895551213, + "grad_norm": 0.606999397277832, + "learning_rate": 3.0655555783248248e-06, + "loss": 0.1845, + "step": 15902 + }, + { + "epoch": 1.498127693648289, + "grad_norm": 0.6408692002296448, + "learning_rate": 3.064467674094459e-06, + "loss": 0.2059, + "step": 15903 + }, + { + "epoch": 1.4982218977414568, + "grad_norm": 0.6725518107414246, + "learning_rate": 3.0633799280049604e-06, + "loss": 0.1927, + "step": 15904 + }, + { + "epoch": 1.4983161018346247, + "grad_norm": 0.6841219067573547, + "learning_rate": 3.0622923400811234e-06, + "loss": 0.1924, + "step": 15905 + }, + { + "epoch": 1.4984103059277927, + "grad_norm": 0.5929518938064575, + "learning_rate": 3.061204910347749e-06, + "loss": 0.1838, + "step": 15906 + }, + { + "epoch": 1.4985045100209604, + "grad_norm": 0.9958206415176392, + "learning_rate": 3.0601176388296382e-06, + "loss": 0.2065, + "step": 15907 + }, + { + "epoch": 1.4985987141141281, + "grad_norm": 0.6183670163154602, + "learning_rate": 3.059030525551575e-06, + "loss": 0.1894, + "step": 15908 + }, + { + "epoch": 1.498692918207296, + "grad_norm": 0.6932939291000366, + "learning_rate": 3.05794357053835e-06, + "loss": 0.1764, + "step": 15909 + }, + { + "epoch": 1.498787122300464, + "grad_norm": 0.642903745174408, + "learning_rate": 3.0568567738147505e-06, + "loss": 0.1892, + "step": 15910 + }, + { + "epoch": 1.4988813263936318, + "grad_norm": 0.677622377872467, + "learning_rate": 3.0557701354055516e-06, + "loss": 0.1842, + "step": 15911 + }, + { + "epoch": 1.4989755304867995, + "grad_norm": 0.7866834402084351, + "learning_rate": 3.0546836553355354e-06, + "loss": 0.2096, + "step": 15912 + }, + { + "epoch": 1.4990697345799675, + "grad_norm": 0.7292660474777222, + "learning_rate": 3.05359733362947e-06, + "loss": 0.2195, + "step": 15913 + }, + { + "epoch": 1.4991639386731355, + "grad_norm": 0.6775988936424255, + "learning_rate": 3.052511170312129e-06, + "loss": 0.2221, + "step": 15914 + }, + { + "epoch": 1.4992581427663032, + "grad_norm": 0.6540030241012573, + "learning_rate": 3.0514251654082803e-06, + "loss": 0.1985, + "step": 15915 + }, + { + "epoch": 1.499352346859471, + "grad_norm": 0.5825475454330444, + "learning_rate": 3.050339318942681e-06, + "loss": 0.1829, + "step": 15916 + }, + { + "epoch": 1.499446550952639, + "grad_norm": 0.8766392469406128, + "learning_rate": 3.0492536309400968e-06, + "loss": 0.2049, + "step": 15917 + }, + { + "epoch": 1.4995407550458069, + "grad_norm": 0.7007165551185608, + "learning_rate": 3.0481681014252763e-06, + "loss": 0.1925, + "step": 15918 + }, + { + "epoch": 1.4996349591389746, + "grad_norm": 0.7133749127388, + "learning_rate": 3.0470827304229734e-06, + "loss": 0.1924, + "step": 15919 + }, + { + "epoch": 1.4997291632321423, + "grad_norm": 0.6479188799858093, + "learning_rate": 3.0459975179579404e-06, + "loss": 0.1703, + "step": 15920 + }, + { + "epoch": 1.4998233673253103, + "grad_norm": 0.6277375817298889, + "learning_rate": 3.0449124640549154e-06, + "loss": 0.1952, + "step": 15921 + }, + { + "epoch": 1.4999175714184783, + "grad_norm": 0.6849794387817383, + "learning_rate": 3.0438275687386466e-06, + "loss": 0.2049, + "step": 15922 + }, + { + "epoch": 1.500011775511646, + "grad_norm": 0.6415727138519287, + "learning_rate": 3.0427428320338627e-06, + "loss": 0.1618, + "step": 15923 + }, + { + "epoch": 1.5001059796048137, + "grad_norm": 0.7276158332824707, + "learning_rate": 3.041658253965303e-06, + "loss": 0.1954, + "step": 15924 + }, + { + "epoch": 1.5002001836979817, + "grad_norm": 0.7348376512527466, + "learning_rate": 3.0405738345576987e-06, + "loss": 0.214, + "step": 15925 + }, + { + "epoch": 1.5002943877911497, + "grad_norm": 0.5843960046768188, + "learning_rate": 3.039489573835771e-06, + "loss": 0.1662, + "step": 15926 + }, + { + "epoch": 1.5003885918843174, + "grad_norm": 0.7853081226348877, + "learning_rate": 3.0384054718242453e-06, + "loss": 0.2091, + "step": 15927 + }, + { + "epoch": 1.5004827959774851, + "grad_norm": 0.7247233986854553, + "learning_rate": 3.037321528547845e-06, + "loss": 0.211, + "step": 15928 + }, + { + "epoch": 1.500577000070653, + "grad_norm": 0.7683022618293762, + "learning_rate": 3.0362377440312783e-06, + "loss": 0.2003, + "step": 15929 + }, + { + "epoch": 1.500671204163821, + "grad_norm": 0.7197926640510559, + "learning_rate": 3.0351541182992605e-06, + "loss": 0.2099, + "step": 15930 + }, + { + "epoch": 1.5007654082569888, + "grad_norm": 0.6092464923858643, + "learning_rate": 3.034070651376504e-06, + "loss": 0.1693, + "step": 15931 + }, + { + "epoch": 1.5008596123501565, + "grad_norm": 0.6016608476638794, + "learning_rate": 3.0329873432877087e-06, + "loss": 0.1725, + "step": 15932 + }, + { + "epoch": 1.5009538164433245, + "grad_norm": 0.6459596753120422, + "learning_rate": 3.031904194057571e-06, + "loss": 0.185, + "step": 15933 + }, + { + "epoch": 1.5010480205364924, + "grad_norm": 0.63637775182724, + "learning_rate": 3.030821203710801e-06, + "loss": 0.2081, + "step": 15934 + }, + { + "epoch": 1.5011422246296602, + "grad_norm": 0.6163280010223389, + "learning_rate": 3.029738372272084e-06, + "loss": 0.1679, + "step": 15935 + }, + { + "epoch": 1.501236428722828, + "grad_norm": 0.7546627521514893, + "learning_rate": 3.0286556997661064e-06, + "loss": 0.2143, + "step": 15936 + }, + { + "epoch": 1.5013306328159959, + "grad_norm": 0.6533668637275696, + "learning_rate": 3.027573186217567e-06, + "loss": 0.182, + "step": 15937 + }, + { + "epoch": 1.5014248369091638, + "grad_norm": 0.5748540759086609, + "learning_rate": 3.0264908316511422e-06, + "loss": 0.1821, + "step": 15938 + }, + { + "epoch": 1.5015190410023316, + "grad_norm": 0.6528898477554321, + "learning_rate": 3.0254086360915036e-06, + "loss": 0.221, + "step": 15939 + }, + { + "epoch": 1.5016132450954993, + "grad_norm": 0.6347132921218872, + "learning_rate": 3.024326599563342e-06, + "loss": 0.196, + "step": 15940 + }, + { + "epoch": 1.5017074491886673, + "grad_norm": 1.1245532035827637, + "learning_rate": 3.0232447220913207e-06, + "loss": 0.2049, + "step": 15941 + }, + { + "epoch": 1.5018016532818352, + "grad_norm": 0.7115646004676819, + "learning_rate": 3.0221630037001072e-06, + "loss": 0.2166, + "step": 15942 + }, + { + "epoch": 1.501895857375003, + "grad_norm": 0.6134077906608582, + "learning_rate": 3.0210814444143687e-06, + "loss": 0.1891, + "step": 15943 + }, + { + "epoch": 1.5019900614681707, + "grad_norm": 0.670565128326416, + "learning_rate": 3.0200000442587695e-06, + "loss": 0.2, + "step": 15944 + }, + { + "epoch": 1.5020842655613387, + "grad_norm": 0.6931569576263428, + "learning_rate": 3.0189188032579606e-06, + "loss": 0.2272, + "step": 15945 + }, + { + "epoch": 1.5021784696545066, + "grad_norm": 0.6226788759231567, + "learning_rate": 3.0178377214365994e-06, + "loss": 0.1822, + "step": 15946 + }, + { + "epoch": 1.5022726737476744, + "grad_norm": 0.6719318628311157, + "learning_rate": 3.0167567988193403e-06, + "loss": 0.1857, + "step": 15947 + }, + { + "epoch": 1.502366877840842, + "grad_norm": 0.6673892736434937, + "learning_rate": 3.0156760354308223e-06, + "loss": 0.1921, + "step": 15948 + }, + { + "epoch": 1.50246108193401, + "grad_norm": 0.5427615642547607, + "learning_rate": 3.0145954312956915e-06, + "loss": 0.1724, + "step": 15949 + }, + { + "epoch": 1.502555286027178, + "grad_norm": 0.6564127802848816, + "learning_rate": 3.0135149864385915e-06, + "loss": 0.2139, + "step": 15950 + }, + { + "epoch": 1.5026494901203458, + "grad_norm": 0.8202550411224365, + "learning_rate": 3.0124347008841515e-06, + "loss": 0.201, + "step": 15951 + }, + { + "epoch": 1.5027436942135135, + "grad_norm": 0.6755064725875854, + "learning_rate": 3.0113545746570107e-06, + "loss": 0.2421, + "step": 15952 + }, + { + "epoch": 1.5028378983066815, + "grad_norm": 0.6923037767410278, + "learning_rate": 3.0102746077817903e-06, + "loss": 0.2158, + "step": 15953 + }, + { + "epoch": 1.5029321023998494, + "grad_norm": 0.6704870462417603, + "learning_rate": 3.0091948002831183e-06, + "loss": 0.2081, + "step": 15954 + }, + { + "epoch": 1.5030263064930172, + "grad_norm": 0.6434040069580078, + "learning_rate": 3.0081151521856188e-06, + "loss": 0.1864, + "step": 15955 + }, + { + "epoch": 1.503120510586185, + "grad_norm": 0.571193516254425, + "learning_rate": 3.007035663513905e-06, + "loss": 0.1666, + "step": 15956 + }, + { + "epoch": 1.5032147146793529, + "grad_norm": 0.6750934720039368, + "learning_rate": 3.0059563342925956e-06, + "loss": 0.2158, + "step": 15957 + }, + { + "epoch": 1.5033089187725208, + "grad_norm": 0.5765085220336914, + "learning_rate": 3.0048771645462947e-06, + "loss": 0.2048, + "step": 15958 + }, + { + "epoch": 1.5034031228656886, + "grad_norm": 0.6211551427841187, + "learning_rate": 3.003798154299613e-06, + "loss": 0.1714, + "step": 15959 + }, + { + "epoch": 1.5034973269588563, + "grad_norm": 0.6313039064407349, + "learning_rate": 3.0027193035771576e-06, + "loss": 0.1599, + "step": 15960 + }, + { + "epoch": 1.5035915310520243, + "grad_norm": 0.6515534520149231, + "learning_rate": 3.001640612403518e-06, + "loss": 0.2064, + "step": 15961 + }, + { + "epoch": 1.5036857351451922, + "grad_norm": 0.68388831615448, + "learning_rate": 3.000562080803301e-06, + "loss": 0.2008, + "step": 15962 + }, + { + "epoch": 1.50377993923836, + "grad_norm": 0.6394569873809814, + "learning_rate": 2.9994837088010886e-06, + "loss": 0.1798, + "step": 15963 + }, + { + "epoch": 1.5038741433315277, + "grad_norm": 0.6411869525909424, + "learning_rate": 2.9984054964214747e-06, + "loss": 0.1959, + "step": 15964 + }, + { + "epoch": 1.5039683474246957, + "grad_norm": 0.6322797536849976, + "learning_rate": 2.9973274436890475e-06, + "loss": 0.1823, + "step": 15965 + }, + { + "epoch": 1.5040625515178636, + "grad_norm": 0.6207188367843628, + "learning_rate": 2.9962495506283805e-06, + "loss": 0.2079, + "step": 15966 + }, + { + "epoch": 1.5041567556110313, + "grad_norm": 0.6168134808540344, + "learning_rate": 2.995171817264055e-06, + "loss": 0.166, + "step": 15967 + }, + { + "epoch": 1.504250959704199, + "grad_norm": 0.6465839147567749, + "learning_rate": 2.994094243620649e-06, + "loss": 0.1947, + "step": 15968 + }, + { + "epoch": 1.504345163797367, + "grad_norm": 0.7306556105613708, + "learning_rate": 2.9930168297227257e-06, + "loss": 0.2244, + "step": 15969 + }, + { + "epoch": 1.504439367890535, + "grad_norm": 0.654899001121521, + "learning_rate": 2.9919395755948553e-06, + "loss": 0.1964, + "step": 15970 + }, + { + "epoch": 1.5045335719837027, + "grad_norm": 0.6639846563339233, + "learning_rate": 2.9908624812616038e-06, + "loss": 0.1942, + "step": 15971 + }, + { + "epoch": 1.5046277760768705, + "grad_norm": 0.6543169021606445, + "learning_rate": 2.9897855467475278e-06, + "loss": 0.1675, + "step": 15972 + }, + { + "epoch": 1.5047219801700384, + "grad_norm": 0.635490357875824, + "learning_rate": 2.988708772077177e-06, + "loss": 0.2129, + "step": 15973 + }, + { + "epoch": 1.5048161842632064, + "grad_norm": 0.6697912812232971, + "learning_rate": 2.9876321572751143e-06, + "loss": 0.1977, + "step": 15974 + }, + { + "epoch": 1.504910388356374, + "grad_norm": 0.76012122631073, + "learning_rate": 2.9865557023658843e-06, + "loss": 0.1981, + "step": 15975 + }, + { + "epoch": 1.5050045924495419, + "grad_norm": 0.7171528339385986, + "learning_rate": 2.9854794073740243e-06, + "loss": 0.2013, + "step": 15976 + }, + { + "epoch": 1.5050987965427098, + "grad_norm": 0.6229657530784607, + "learning_rate": 2.9844032723240877e-06, + "loss": 0.1954, + "step": 15977 + }, + { + "epoch": 1.5051930006358776, + "grad_norm": 0.6412497758865356, + "learning_rate": 2.983327297240607e-06, + "loss": 0.1931, + "step": 15978 + }, + { + "epoch": 1.5052872047290453, + "grad_norm": 0.7091441750526428, + "learning_rate": 2.9822514821481086e-06, + "loss": 0.2165, + "step": 15979 + }, + { + "epoch": 1.5053814088222133, + "grad_norm": 0.7062512040138245, + "learning_rate": 2.981175827071137e-06, + "loss": 0.1924, + "step": 15980 + }, + { + "epoch": 1.5054756129153812, + "grad_norm": 0.6737327575683594, + "learning_rate": 2.9801003320342104e-06, + "loss": 0.2075, + "step": 15981 + }, + { + "epoch": 1.505569817008549, + "grad_norm": 0.6810401678085327, + "learning_rate": 2.979024997061849e-06, + "loss": 0.2091, + "step": 15982 + }, + { + "epoch": 1.5056640211017167, + "grad_norm": 0.7100754976272583, + "learning_rate": 2.9779498221785774e-06, + "loss": 0.2057, + "step": 15983 + }, + { + "epoch": 1.5057582251948847, + "grad_norm": 0.6806728839874268, + "learning_rate": 2.9768748074089115e-06, + "loss": 0.2066, + "step": 15984 + }, + { + "epoch": 1.5058524292880526, + "grad_norm": 0.692293643951416, + "learning_rate": 2.9757999527773583e-06, + "loss": 0.186, + "step": 15985 + }, + { + "epoch": 1.5059466333812204, + "grad_norm": 0.652180016040802, + "learning_rate": 2.9747252583084297e-06, + "loss": 0.1849, + "step": 15986 + }, + { + "epoch": 1.506040837474388, + "grad_norm": 0.6136890053749084, + "learning_rate": 2.9736507240266332e-06, + "loss": 0.1991, + "step": 15987 + }, + { + "epoch": 1.506135041567556, + "grad_norm": 0.6313958168029785, + "learning_rate": 2.9725763499564643e-06, + "loss": 0.1889, + "step": 15988 + }, + { + "epoch": 1.506229245660724, + "grad_norm": 0.6914837956428528, + "learning_rate": 2.9715021361224216e-06, + "loss": 0.2058, + "step": 15989 + }, + { + "epoch": 1.5063234497538918, + "grad_norm": 0.6593897342681885, + "learning_rate": 2.9704280825490027e-06, + "loss": 0.194, + "step": 15990 + }, + { + "epoch": 1.5064176538470595, + "grad_norm": 0.6278865337371826, + "learning_rate": 2.9693541892606935e-06, + "loss": 0.2221, + "step": 15991 + }, + { + "epoch": 1.5065118579402275, + "grad_norm": 0.7037993669509888, + "learning_rate": 2.9682804562819835e-06, + "loss": 0.2224, + "step": 15992 + }, + { + "epoch": 1.5066060620333954, + "grad_norm": 0.6817741394042969, + "learning_rate": 2.9672068836373515e-06, + "loss": 0.1849, + "step": 15993 + }, + { + "epoch": 1.5067002661265632, + "grad_norm": 0.6765510439872742, + "learning_rate": 2.966133471351282e-06, + "loss": 0.2279, + "step": 15994 + }, + { + "epoch": 1.506794470219731, + "grad_norm": 0.879775881767273, + "learning_rate": 2.9650602194482448e-06, + "loss": 0.1813, + "step": 15995 + }, + { + "epoch": 1.5068886743128989, + "grad_norm": 0.6820473074913025, + "learning_rate": 2.9639871279527133e-06, + "loss": 0.1879, + "step": 15996 + }, + { + "epoch": 1.5069828784060668, + "grad_norm": 0.5898948907852173, + "learning_rate": 2.9629141968891604e-06, + "loss": 0.1889, + "step": 15997 + }, + { + "epoch": 1.5070770824992346, + "grad_norm": 0.6425492167472839, + "learning_rate": 2.9618414262820436e-06, + "loss": 0.2008, + "step": 15998 + }, + { + "epoch": 1.5071712865924023, + "grad_norm": 0.7598943710327148, + "learning_rate": 2.9607688161558266e-06, + "loss": 0.2493, + "step": 15999 + }, + { + "epoch": 1.5072654906855703, + "grad_norm": 0.6973783373832703, + "learning_rate": 2.959696366534971e-06, + "loss": 0.2033, + "step": 16000 + }, + { + "epoch": 1.5073596947787382, + "grad_norm": 0.6856785416603088, + "learning_rate": 2.9586240774439223e-06, + "loss": 0.1933, + "step": 16001 + }, + { + "epoch": 1.507453898871906, + "grad_norm": 0.6569346189498901, + "learning_rate": 2.957551948907138e-06, + "loss": 0.2198, + "step": 16002 + }, + { + "epoch": 1.5075481029650737, + "grad_norm": 0.6172292232513428, + "learning_rate": 2.9564799809490574e-06, + "loss": 0.2073, + "step": 16003 + }, + { + "epoch": 1.5076423070582416, + "grad_norm": 0.6552324295043945, + "learning_rate": 2.9554081735941263e-06, + "loss": 0.2161, + "step": 16004 + }, + { + "epoch": 1.5077365111514096, + "grad_norm": 0.6306608319282532, + "learning_rate": 2.9543365268667866e-06, + "loss": 0.2145, + "step": 16005 + }, + { + "epoch": 1.5078307152445773, + "grad_norm": 0.6723946928977966, + "learning_rate": 2.9532650407914676e-06, + "loss": 0.1839, + "step": 16006 + }, + { + "epoch": 1.507924919337745, + "grad_norm": 0.7015173435211182, + "learning_rate": 2.9521937153926028e-06, + "loss": 0.1978, + "step": 16007 + }, + { + "epoch": 1.508019123430913, + "grad_norm": 0.5856063365936279, + "learning_rate": 2.951122550694625e-06, + "loss": 0.1766, + "step": 16008 + }, + { + "epoch": 1.508113327524081, + "grad_norm": 0.6634768843650818, + "learning_rate": 2.9500515467219505e-06, + "loss": 0.2005, + "step": 16009 + }, + { + "epoch": 1.5082075316172487, + "grad_norm": 0.6804721355438232, + "learning_rate": 2.9489807034990037e-06, + "loss": 0.2082, + "step": 16010 + }, + { + "epoch": 1.5083017357104165, + "grad_norm": 0.7250627279281616, + "learning_rate": 2.9479100210502045e-06, + "loss": 0.1839, + "step": 16011 + }, + { + "epoch": 1.5083959398035844, + "grad_norm": 0.6678200364112854, + "learning_rate": 2.946839499399964e-06, + "loss": 0.1995, + "step": 16012 + }, + { + "epoch": 1.5084901438967524, + "grad_norm": 0.6348961591720581, + "learning_rate": 2.945769138572684e-06, + "loss": 0.1996, + "step": 16013 + }, + { + "epoch": 1.5085843479899201, + "grad_norm": 0.6270191073417664, + "learning_rate": 2.944698938592784e-06, + "loss": 0.1872, + "step": 16014 + }, + { + "epoch": 1.5086785520830879, + "grad_norm": 0.6524645090103149, + "learning_rate": 2.9436288994846583e-06, + "loss": 0.2006, + "step": 16015 + }, + { + "epoch": 1.5087727561762558, + "grad_norm": 0.6725365519523621, + "learning_rate": 2.9425590212727005e-06, + "loss": 0.2169, + "step": 16016 + }, + { + "epoch": 1.5088669602694238, + "grad_norm": 0.723200261592865, + "learning_rate": 2.9414893039813186e-06, + "loss": 0.1964, + "step": 16017 + }, + { + "epoch": 1.5089611643625915, + "grad_norm": 0.7292844653129578, + "learning_rate": 2.940419747634896e-06, + "loss": 0.2041, + "step": 16018 + }, + { + "epoch": 1.5090553684557593, + "grad_norm": 0.6949648857116699, + "learning_rate": 2.9393503522578183e-06, + "loss": 0.1989, + "step": 16019 + }, + { + "epoch": 1.5091495725489272, + "grad_norm": 0.6308997273445129, + "learning_rate": 2.938281117874472e-06, + "loss": 0.1818, + "step": 16020 + }, + { + "epoch": 1.5092437766420952, + "grad_norm": 0.7115275859832764, + "learning_rate": 2.937212044509241e-06, + "loss": 0.2033, + "step": 16021 + }, + { + "epoch": 1.509337980735263, + "grad_norm": 0.7062810063362122, + "learning_rate": 2.936143132186494e-06, + "loss": 0.1959, + "step": 16022 + }, + { + "epoch": 1.5094321848284307, + "grad_norm": 0.6680557727813721, + "learning_rate": 2.935074380930609e-06, + "loss": 0.1935, + "step": 16023 + }, + { + "epoch": 1.5095263889215986, + "grad_norm": 0.6889482736587524, + "learning_rate": 2.934005790765957e-06, + "loss": 0.1769, + "step": 16024 + }, + { + "epoch": 1.5096205930147666, + "grad_norm": 0.7399773597717285, + "learning_rate": 2.9329373617168976e-06, + "loss": 0.1852, + "step": 16025 + }, + { + "epoch": 1.5097147971079343, + "grad_norm": 0.6657445430755615, + "learning_rate": 2.9318690938077965e-06, + "loss": 0.2268, + "step": 16026 + }, + { + "epoch": 1.509809001201102, + "grad_norm": 0.6584805250167847, + "learning_rate": 2.9308009870630127e-06, + "loss": 0.2093, + "step": 16027 + }, + { + "epoch": 1.50990320529427, + "grad_norm": 0.6092638969421387, + "learning_rate": 2.929733041506897e-06, + "loss": 0.1733, + "step": 16028 + }, + { + "epoch": 1.509997409387438, + "grad_norm": 0.6681575775146484, + "learning_rate": 2.928665257163803e-06, + "loss": 0.1978, + "step": 16029 + }, + { + "epoch": 1.5100916134806057, + "grad_norm": 0.6568483114242554, + "learning_rate": 2.927597634058079e-06, + "loss": 0.2213, + "step": 16030 + }, + { + "epoch": 1.5101858175737735, + "grad_norm": 0.6793726682662964, + "learning_rate": 2.926530172214064e-06, + "loss": 0.2109, + "step": 16031 + }, + { + "epoch": 1.5102800216669414, + "grad_norm": 0.642000675201416, + "learning_rate": 2.925462871656104e-06, + "loss": 0.1953, + "step": 16032 + }, + { + "epoch": 1.5103742257601094, + "grad_norm": 0.6226553320884705, + "learning_rate": 2.9243957324085283e-06, + "loss": 0.1911, + "step": 16033 + }, + { + "epoch": 1.5104684298532771, + "grad_norm": 0.7639485001564026, + "learning_rate": 2.9233287544956747e-06, + "loss": 0.2093, + "step": 16034 + }, + { + "epoch": 1.5105626339464449, + "grad_norm": 0.6693103909492493, + "learning_rate": 2.922261937941867e-06, + "loss": 0.1792, + "step": 16035 + }, + { + "epoch": 1.5106568380396128, + "grad_norm": 0.5888148546218872, + "learning_rate": 2.921195282771433e-06, + "loss": 0.1778, + "step": 16036 + }, + { + "epoch": 1.5107510421327808, + "grad_norm": 0.6757652759552002, + "learning_rate": 2.920128789008698e-06, + "loss": 0.2134, + "step": 16037 + }, + { + "epoch": 1.5108452462259485, + "grad_norm": 0.7014308571815491, + "learning_rate": 2.9190624566779723e-06, + "loss": 0.2189, + "step": 16038 + }, + { + "epoch": 1.5109394503191163, + "grad_norm": 0.611684262752533, + "learning_rate": 2.9179962858035736e-06, + "loss": 0.1819, + "step": 16039 + }, + { + "epoch": 1.5110336544122842, + "grad_norm": 0.6903232932090759, + "learning_rate": 2.916930276409815e-06, + "loss": 0.2056, + "step": 16040 + }, + { + "epoch": 1.5111278585054522, + "grad_norm": 0.6848517060279846, + "learning_rate": 2.915864428520997e-06, + "loss": 0.1919, + "step": 16041 + }, + { + "epoch": 1.51122206259862, + "grad_norm": 0.6487221717834473, + "learning_rate": 2.9147987421614287e-06, + "loss": 0.1893, + "step": 16042 + }, + { + "epoch": 1.5113162666917876, + "grad_norm": 0.6601995229721069, + "learning_rate": 2.9137332173554043e-06, + "loss": 0.2007, + "step": 16043 + }, + { + "epoch": 1.5114104707849556, + "grad_norm": 0.7070102691650391, + "learning_rate": 2.912667854127221e-06, + "loss": 0.1818, + "step": 16044 + }, + { + "epoch": 1.5115046748781236, + "grad_norm": 0.6315514445304871, + "learning_rate": 2.9116026525011755e-06, + "loss": 0.1901, + "step": 16045 + }, + { + "epoch": 1.5115988789712913, + "grad_norm": 0.654120147228241, + "learning_rate": 2.9105376125015485e-06, + "loss": 0.1996, + "step": 16046 + }, + { + "epoch": 1.511693083064459, + "grad_norm": 0.7211007475852966, + "learning_rate": 2.9094727341526275e-06, + "loss": 0.1908, + "step": 16047 + }, + { + "epoch": 1.511787287157627, + "grad_norm": 0.6275424957275391, + "learning_rate": 2.9084080174786966e-06, + "loss": 0.1736, + "step": 16048 + }, + { + "epoch": 1.511881491250795, + "grad_norm": 0.5740016102790833, + "learning_rate": 2.9073434625040274e-06, + "loss": 0.1942, + "step": 16049 + }, + { + "epoch": 1.5119756953439627, + "grad_norm": 0.6556302309036255, + "learning_rate": 2.9062790692528963e-06, + "loss": 0.1856, + "step": 16050 + }, + { + "epoch": 1.5120698994371304, + "grad_norm": 0.6546376943588257, + "learning_rate": 2.9052148377495772e-06, + "loss": 0.1885, + "step": 16051 + }, + { + "epoch": 1.5121641035302984, + "grad_norm": 0.6720876097679138, + "learning_rate": 2.9041507680183313e-06, + "loss": 0.203, + "step": 16052 + }, + { + "epoch": 1.5122583076234664, + "grad_norm": 0.5759272575378418, + "learning_rate": 2.903086860083415e-06, + "loss": 0.1697, + "step": 16053 + }, + { + "epoch": 1.512352511716634, + "grad_norm": 0.708020031452179, + "learning_rate": 2.902023113969101e-06, + "loss": 0.1912, + "step": 16054 + }, + { + "epoch": 1.5124467158098018, + "grad_norm": 0.6964131593704224, + "learning_rate": 2.9009595296996372e-06, + "loss": 0.1885, + "step": 16055 + }, + { + "epoch": 1.5125409199029698, + "grad_norm": 0.6443415284156799, + "learning_rate": 2.899896107299268e-06, + "loss": 0.2038, + "step": 16056 + }, + { + "epoch": 1.5126351239961378, + "grad_norm": 0.8100490570068359, + "learning_rate": 2.8988328467922554e-06, + "loss": 0.2152, + "step": 16057 + }, + { + "epoch": 1.5127293280893055, + "grad_norm": 0.6466900706291199, + "learning_rate": 2.8977697482028356e-06, + "loss": 0.1781, + "step": 16058 + }, + { + "epoch": 1.5128235321824732, + "grad_norm": 0.6879572868347168, + "learning_rate": 2.8967068115552453e-06, + "loss": 0.1898, + "step": 16059 + }, + { + "epoch": 1.5129177362756412, + "grad_norm": 0.5906853675842285, + "learning_rate": 2.895644036873726e-06, + "loss": 0.201, + "step": 16060 + }, + { + "epoch": 1.5130119403688091, + "grad_norm": 0.6801936626434326, + "learning_rate": 2.8945814241825133e-06, + "loss": 0.1951, + "step": 16061 + }, + { + "epoch": 1.5131061444619769, + "grad_norm": 0.6945424675941467, + "learning_rate": 2.893518973505829e-06, + "loss": 0.2164, + "step": 16062 + }, + { + "epoch": 1.5132003485551446, + "grad_norm": 0.6183425784111023, + "learning_rate": 2.8924566848679024e-06, + "loss": 0.1894, + "step": 16063 + }, + { + "epoch": 1.5132945526483126, + "grad_norm": 0.6572199463844299, + "learning_rate": 2.8913945582929594e-06, + "loss": 0.1897, + "step": 16064 + }, + { + "epoch": 1.5133887567414805, + "grad_norm": 0.6754615902900696, + "learning_rate": 2.8903325938052108e-06, + "loss": 0.1929, + "step": 16065 + }, + { + "epoch": 1.5134829608346483, + "grad_norm": 0.7403187155723572, + "learning_rate": 2.8892707914288744e-06, + "loss": 0.2096, + "step": 16066 + }, + { + "epoch": 1.513577164927816, + "grad_norm": 0.6388410925865173, + "learning_rate": 2.888209151188163e-06, + "loss": 0.1893, + "step": 16067 + }, + { + "epoch": 1.513671369020984, + "grad_norm": 0.6676547527313232, + "learning_rate": 2.887147673107279e-06, + "loss": 0.1863, + "step": 16068 + }, + { + "epoch": 1.513765573114152, + "grad_norm": 0.6466985940933228, + "learning_rate": 2.886086357210429e-06, + "loss": 0.2171, + "step": 16069 + }, + { + "epoch": 1.5138597772073197, + "grad_norm": 0.7194705605506897, + "learning_rate": 2.885025203521814e-06, + "loss": 0.2174, + "step": 16070 + }, + { + "epoch": 1.5139539813004874, + "grad_norm": 0.6867430806159973, + "learning_rate": 2.883964212065625e-06, + "loss": 0.2138, + "step": 16071 + }, + { + "epoch": 1.5140481853936554, + "grad_norm": 0.6516374945640564, + "learning_rate": 2.8829033828660613e-06, + "loss": 0.2173, + "step": 16072 + }, + { + "epoch": 1.5141423894868233, + "grad_norm": 0.6495267152786255, + "learning_rate": 2.8818427159473027e-06, + "loss": 0.1956, + "step": 16073 + }, + { + "epoch": 1.514236593579991, + "grad_norm": 0.661798357963562, + "learning_rate": 2.8807822113335425e-06, + "loss": 0.2111, + "step": 16074 + }, + { + "epoch": 1.5143307976731588, + "grad_norm": 0.5697353482246399, + "learning_rate": 2.879721869048955e-06, + "loss": 0.1834, + "step": 16075 + }, + { + "epoch": 1.5144250017663268, + "grad_norm": 0.7053003907203674, + "learning_rate": 2.87866168911772e-06, + "loss": 0.1947, + "step": 16076 + }, + { + "epoch": 1.5145192058594947, + "grad_norm": 0.7660698890686035, + "learning_rate": 2.8776016715640155e-06, + "loss": 0.2051, + "step": 16077 + }, + { + "epoch": 1.5146134099526625, + "grad_norm": 0.6596674919128418, + "learning_rate": 2.8765418164120053e-06, + "loss": 0.2068, + "step": 16078 + }, + { + "epoch": 1.5147076140458302, + "grad_norm": 0.6137715578079224, + "learning_rate": 2.8754821236858577e-06, + "loss": 0.1715, + "step": 16079 + }, + { + "epoch": 1.5148018181389982, + "grad_norm": 0.5945097208023071, + "learning_rate": 2.874422593409738e-06, + "loss": 0.2105, + "step": 16080 + }, + { + "epoch": 1.5148960222321661, + "grad_norm": 0.6514506936073303, + "learning_rate": 2.8733632256078014e-06, + "loss": 0.2196, + "step": 16081 + }, + { + "epoch": 1.5149902263253339, + "grad_norm": 0.6846261620521545, + "learning_rate": 2.8723040203042074e-06, + "loss": 0.2014, + "step": 16082 + }, + { + "epoch": 1.5150844304185016, + "grad_norm": 0.5711610913276672, + "learning_rate": 2.8712449775231023e-06, + "loss": 0.1568, + "step": 16083 + }, + { + "epoch": 1.5151786345116696, + "grad_norm": 0.616041362285614, + "learning_rate": 2.8701860972886366e-06, + "loss": 0.1678, + "step": 16084 + }, + { + "epoch": 1.5152728386048375, + "grad_norm": 0.6680805087089539, + "learning_rate": 2.8691273796249562e-06, + "loss": 0.2017, + "step": 16085 + }, + { + "epoch": 1.5153670426980053, + "grad_norm": 0.6135803461074829, + "learning_rate": 2.868068824556197e-06, + "loss": 0.1855, + "step": 16086 + }, + { + "epoch": 1.515461246791173, + "grad_norm": 0.6743056178092957, + "learning_rate": 2.8670104321064995e-06, + "loss": 0.2018, + "step": 16087 + }, + { + "epoch": 1.515555450884341, + "grad_norm": 0.6641936898231506, + "learning_rate": 2.8659522022999977e-06, + "loss": 0.1871, + "step": 16088 + }, + { + "epoch": 1.515649654977509, + "grad_norm": 0.7147706747055054, + "learning_rate": 2.864894135160815e-06, + "loss": 0.2159, + "step": 16089 + }, + { + "epoch": 1.5157438590706767, + "grad_norm": 0.6171817183494568, + "learning_rate": 2.863836230713082e-06, + "loss": 0.1734, + "step": 16090 + }, + { + "epoch": 1.5158380631638444, + "grad_norm": 0.6959594488143921, + "learning_rate": 2.862778488980922e-06, + "loss": 0.1897, + "step": 16091 + }, + { + "epoch": 1.5159322672570124, + "grad_norm": 0.6163338422775269, + "learning_rate": 2.86172090998845e-06, + "loss": 0.197, + "step": 16092 + }, + { + "epoch": 1.5160264713501803, + "grad_norm": 0.6707096099853516, + "learning_rate": 2.860663493759774e-06, + "loss": 0.2224, + "step": 16093 + }, + { + "epoch": 1.516120675443348, + "grad_norm": 0.6781424880027771, + "learning_rate": 2.8596062403190196e-06, + "loss": 0.1859, + "step": 16094 + }, + { + "epoch": 1.5162148795365158, + "grad_norm": 0.5829667448997498, + "learning_rate": 2.858549149690284e-06, + "loss": 0.1801, + "step": 16095 + }, + { + "epoch": 1.5163090836296838, + "grad_norm": 0.6407762765884399, + "learning_rate": 2.8574922218976663e-06, + "loss": 0.1853, + "step": 16096 + }, + { + "epoch": 1.5164032877228517, + "grad_norm": 0.7396100163459778, + "learning_rate": 2.8564354569652785e-06, + "loss": 0.1907, + "step": 16097 + }, + { + "epoch": 1.5164974918160194, + "grad_norm": 0.6020578145980835, + "learning_rate": 2.85537885491721e-06, + "loss": 0.1865, + "step": 16098 + }, + { + "epoch": 1.5165916959091872, + "grad_norm": 0.6818649172782898, + "learning_rate": 2.8543224157775504e-06, + "loss": 0.1765, + "step": 16099 + }, + { + "epoch": 1.5166859000023551, + "grad_norm": 0.6281371116638184, + "learning_rate": 2.853266139570391e-06, + "loss": 0.1806, + "step": 16100 + }, + { + "epoch": 1.516780104095523, + "grad_norm": 0.6728824377059937, + "learning_rate": 2.852210026319818e-06, + "loss": 0.2119, + "step": 16101 + }, + { + "epoch": 1.5168743081886908, + "grad_norm": 0.6425108909606934, + "learning_rate": 2.851154076049909e-06, + "loss": 0.1929, + "step": 16102 + }, + { + "epoch": 1.5169685122818586, + "grad_norm": 0.6145150065422058, + "learning_rate": 2.850098288784742e-06, + "loss": 0.1863, + "step": 16103 + }, + { + "epoch": 1.5170627163750265, + "grad_norm": 0.661647617816925, + "learning_rate": 2.849042664548395e-06, + "loss": 0.2092, + "step": 16104 + }, + { + "epoch": 1.5171569204681945, + "grad_norm": 0.7248967885971069, + "learning_rate": 2.84798720336493e-06, + "loss": 0.1726, + "step": 16105 + }, + { + "epoch": 1.5172511245613622, + "grad_norm": 0.6972887516021729, + "learning_rate": 2.84693190525842e-06, + "loss": 0.2203, + "step": 16106 + }, + { + "epoch": 1.51734532865453, + "grad_norm": 0.6805843710899353, + "learning_rate": 2.8458767702529265e-06, + "loss": 0.1848, + "step": 16107 + }, + { + "epoch": 1.517439532747698, + "grad_norm": 0.6261547803878784, + "learning_rate": 2.8448217983725034e-06, + "loss": 0.2028, + "step": 16108 + }, + { + "epoch": 1.517533736840866, + "grad_norm": 0.6367445588111877, + "learning_rate": 2.84376698964121e-06, + "loss": 0.1844, + "step": 16109 + }, + { + "epoch": 1.5176279409340336, + "grad_norm": 0.6158941984176636, + "learning_rate": 2.8427123440830997e-06, + "loss": 0.2046, + "step": 16110 + }, + { + "epoch": 1.5177221450272014, + "grad_norm": 0.6778331398963928, + "learning_rate": 2.8416578617222156e-06, + "loss": 0.2139, + "step": 16111 + }, + { + "epoch": 1.5178163491203693, + "grad_norm": 0.6457752585411072, + "learning_rate": 2.8406035425826006e-06, + "loss": 0.1706, + "step": 16112 + }, + { + "epoch": 1.5179105532135373, + "grad_norm": 0.7141786217689514, + "learning_rate": 2.839549386688297e-06, + "loss": 0.2022, + "step": 16113 + }, + { + "epoch": 1.5180047573067048, + "grad_norm": 0.966853678226471, + "learning_rate": 2.838495394063344e-06, + "loss": 0.1911, + "step": 16114 + }, + { + "epoch": 1.5180989613998728, + "grad_norm": 0.6230742931365967, + "learning_rate": 2.837441564731769e-06, + "loss": 0.1786, + "step": 16115 + }, + { + "epoch": 1.5181931654930407, + "grad_norm": 0.6462972164154053, + "learning_rate": 2.836387898717603e-06, + "loss": 0.1913, + "step": 16116 + }, + { + "epoch": 1.5182873695862085, + "grad_norm": 0.745755672454834, + "learning_rate": 2.8353343960448754e-06, + "loss": 0.1984, + "step": 16117 + }, + { + "epoch": 1.5183815736793762, + "grad_norm": 0.6860038042068481, + "learning_rate": 2.8342810567376e-06, + "loss": 0.2104, + "step": 16118 + }, + { + "epoch": 1.5184757777725442, + "grad_norm": 0.7083778977394104, + "learning_rate": 2.833227880819799e-06, + "loss": 0.214, + "step": 16119 + }, + { + "epoch": 1.5185699818657121, + "grad_norm": 0.6527898907661438, + "learning_rate": 2.8321748683154893e-06, + "loss": 0.1839, + "step": 16120 + }, + { + "epoch": 1.5186641859588799, + "grad_norm": 0.6631081104278564, + "learning_rate": 2.8311220192486743e-06, + "loss": 0.2002, + "step": 16121 + }, + { + "epoch": 1.5187583900520476, + "grad_norm": 0.6198391318321228, + "learning_rate": 2.830069333643367e-06, + "loss": 0.203, + "step": 16122 + }, + { + "epoch": 1.5188525941452156, + "grad_norm": 0.709550678730011, + "learning_rate": 2.829016811523565e-06, + "loss": 0.1872, + "step": 16123 + }, + { + "epoch": 1.5189467982383835, + "grad_norm": 0.6676455736160278, + "learning_rate": 2.827964452913269e-06, + "loss": 0.2141, + "step": 16124 + }, + { + "epoch": 1.5190410023315513, + "grad_norm": 0.6386942863464355, + "learning_rate": 2.8269122578364792e-06, + "loss": 0.1799, + "step": 16125 + }, + { + "epoch": 1.519135206424719, + "grad_norm": 0.6231666207313538, + "learning_rate": 2.82586022631718e-06, + "loss": 0.2025, + "step": 16126 + }, + { + "epoch": 1.519229410517887, + "grad_norm": 0.6799577474594116, + "learning_rate": 2.8248083583793616e-06, + "loss": 0.2179, + "step": 16127 + }, + { + "epoch": 1.519323614611055, + "grad_norm": 0.6247674226760864, + "learning_rate": 2.823756654047014e-06, + "loss": 0.1725, + "step": 16128 + }, + { + "epoch": 1.5194178187042227, + "grad_norm": 0.6503288149833679, + "learning_rate": 2.8227051133441087e-06, + "loss": 0.2059, + "step": 16129 + }, + { + "epoch": 1.5195120227973904, + "grad_norm": 0.6307724118232727, + "learning_rate": 2.821653736294627e-06, + "loss": 0.2074, + "step": 16130 + }, + { + "epoch": 1.5196062268905584, + "grad_norm": 0.7084416747093201, + "learning_rate": 2.8206025229225453e-06, + "loss": 0.2136, + "step": 16131 + }, + { + "epoch": 1.5197004309837263, + "grad_norm": 0.6676310300827026, + "learning_rate": 2.819551473251828e-06, + "loss": 0.194, + "step": 16132 + }, + { + "epoch": 1.519794635076894, + "grad_norm": 0.6236906051635742, + "learning_rate": 2.8185005873064365e-06, + "loss": 0.1877, + "step": 16133 + }, + { + "epoch": 1.5198888391700618, + "grad_norm": 0.6586359143257141, + "learning_rate": 2.8174498651103445e-06, + "loss": 0.1878, + "step": 16134 + }, + { + "epoch": 1.5199830432632297, + "grad_norm": 0.6925024390220642, + "learning_rate": 2.816399306687503e-06, + "loss": 0.2178, + "step": 16135 + }, + { + "epoch": 1.5200772473563977, + "grad_norm": 0.7016938924789429, + "learning_rate": 2.8153489120618647e-06, + "loss": 0.1892, + "step": 16136 + }, + { + "epoch": 1.5201714514495654, + "grad_norm": 0.5964245200157166, + "learning_rate": 2.814298681257381e-06, + "loss": 0.1835, + "step": 16137 + }, + { + "epoch": 1.5202656555427332, + "grad_norm": 0.6487076282501221, + "learning_rate": 2.8132486142980052e-06, + "loss": 0.2053, + "step": 16138 + }, + { + "epoch": 1.5203598596359011, + "grad_norm": 0.6946191787719727, + "learning_rate": 2.812198711207671e-06, + "loss": 0.2294, + "step": 16139 + }, + { + "epoch": 1.520454063729069, + "grad_norm": 0.634026825428009, + "learning_rate": 2.8111489720103235e-06, + "loss": 0.1711, + "step": 16140 + }, + { + "epoch": 1.5205482678222368, + "grad_norm": 0.634139895439148, + "learning_rate": 2.8100993967298996e-06, + "loss": 0.1804, + "step": 16141 + }, + { + "epoch": 1.5206424719154046, + "grad_norm": 0.6851383447647095, + "learning_rate": 2.809049985390325e-06, + "loss": 0.1967, + "step": 16142 + }, + { + "epoch": 1.5207366760085725, + "grad_norm": 0.6436731219291687, + "learning_rate": 2.808000738015533e-06, + "loss": 0.1967, + "step": 16143 + }, + { + "epoch": 1.5208308801017405, + "grad_norm": 0.6751908659934998, + "learning_rate": 2.8069516546294494e-06, + "loss": 0.1777, + "step": 16144 + }, + { + "epoch": 1.5209250841949082, + "grad_norm": 0.6696066856384277, + "learning_rate": 2.80590273525599e-06, + "loss": 0.1933, + "step": 16145 + }, + { + "epoch": 1.521019288288076, + "grad_norm": 0.6453372836112976, + "learning_rate": 2.804853979919073e-06, + "loss": 0.1992, + "step": 16146 + }, + { + "epoch": 1.521113492381244, + "grad_norm": 0.6461153626441956, + "learning_rate": 2.8038053886426166e-06, + "loss": 0.1926, + "step": 16147 + }, + { + "epoch": 1.521207696474412, + "grad_norm": 0.6716781854629517, + "learning_rate": 2.802756961450522e-06, + "loss": 0.2198, + "step": 16148 + }, + { + "epoch": 1.5213019005675796, + "grad_norm": 0.5820134878158569, + "learning_rate": 2.8017086983667007e-06, + "loss": 0.1837, + "step": 16149 + }, + { + "epoch": 1.5213961046607474, + "grad_norm": 0.6865665912628174, + "learning_rate": 2.800660599415057e-06, + "loss": 0.2445, + "step": 16150 + }, + { + "epoch": 1.5214903087539153, + "grad_norm": 0.6357466578483582, + "learning_rate": 2.7996126646194844e-06, + "loss": 0.1814, + "step": 16151 + }, + { + "epoch": 1.5215845128470833, + "grad_norm": 0.848849892616272, + "learning_rate": 2.7985648940038766e-06, + "loss": 0.1885, + "step": 16152 + }, + { + "epoch": 1.521678716940251, + "grad_norm": 0.6532415151596069, + "learning_rate": 2.797517287592125e-06, + "loss": 0.1881, + "step": 16153 + }, + { + "epoch": 1.5217729210334188, + "grad_norm": 0.6398544311523438, + "learning_rate": 2.796469845408123e-06, + "loss": 0.1962, + "step": 16154 + }, + { + "epoch": 1.5218671251265867, + "grad_norm": 0.7329211831092834, + "learning_rate": 2.795422567475745e-06, + "loss": 0.2183, + "step": 16155 + }, + { + "epoch": 1.5219613292197547, + "grad_norm": 0.6775096654891968, + "learning_rate": 2.794375453818875e-06, + "loss": 0.1954, + "step": 16156 + }, + { + "epoch": 1.5220555333129224, + "grad_norm": 0.6235827803611755, + "learning_rate": 2.793328504461391e-06, + "loss": 0.1837, + "step": 16157 + }, + { + "epoch": 1.5221497374060902, + "grad_norm": 0.7192054986953735, + "learning_rate": 2.792281719427159e-06, + "loss": 0.1807, + "step": 16158 + }, + { + "epoch": 1.5222439414992581, + "grad_norm": 0.6491178870201111, + "learning_rate": 2.7912350987400515e-06, + "loss": 0.2009, + "step": 16159 + }, + { + "epoch": 1.522338145592426, + "grad_norm": 0.6319215893745422, + "learning_rate": 2.7901886424239346e-06, + "loss": 0.2008, + "step": 16160 + }, + { + "epoch": 1.5224323496855938, + "grad_norm": 0.5874225497245789, + "learning_rate": 2.7891423505026647e-06, + "loss": 0.1818, + "step": 16161 + }, + { + "epoch": 1.5225265537787616, + "grad_norm": 0.5624944567680359, + "learning_rate": 2.788096223000103e-06, + "loss": 0.1818, + "step": 16162 + }, + { + "epoch": 1.5226207578719295, + "grad_norm": 0.5381230711936951, + "learning_rate": 2.787050259940098e-06, + "loss": 0.1797, + "step": 16163 + }, + { + "epoch": 1.5227149619650975, + "grad_norm": 0.6449101567268372, + "learning_rate": 2.786004461346503e-06, + "loss": 0.1911, + "step": 16164 + }, + { + "epoch": 1.5228091660582652, + "grad_norm": 0.7586252689361572, + "learning_rate": 2.784958827243166e-06, + "loss": 0.2076, + "step": 16165 + }, + { + "epoch": 1.522903370151433, + "grad_norm": 0.7198145985603333, + "learning_rate": 2.7839133576539224e-06, + "loss": 0.185, + "step": 16166 + }, + { + "epoch": 1.522997574244601, + "grad_norm": 0.6372790336608887, + "learning_rate": 2.782868052602614e-06, + "loss": 0.1921, + "step": 16167 + }, + { + "epoch": 1.5230917783377689, + "grad_norm": 0.7216392755508423, + "learning_rate": 2.781822912113079e-06, + "loss": 0.1935, + "step": 16168 + }, + { + "epoch": 1.5231859824309366, + "grad_norm": 0.6526051163673401, + "learning_rate": 2.7807779362091415e-06, + "loss": 0.18, + "step": 16169 + }, + { + "epoch": 1.5232801865241044, + "grad_norm": 0.7025391459465027, + "learning_rate": 2.779733124914631e-06, + "loss": 0.2075, + "step": 16170 + }, + { + "epoch": 1.5233743906172723, + "grad_norm": 0.6882464289665222, + "learning_rate": 2.7786884782533765e-06, + "loss": 0.2282, + "step": 16171 + }, + { + "epoch": 1.5234685947104403, + "grad_norm": 0.7818648219108582, + "learning_rate": 2.777643996249191e-06, + "loss": 0.2138, + "step": 16172 + }, + { + "epoch": 1.523562798803608, + "grad_norm": 0.642633855342865, + "learning_rate": 2.7765996789258863e-06, + "loss": 0.1938, + "step": 16173 + }, + { + "epoch": 1.5236570028967757, + "grad_norm": 0.6424699425697327, + "learning_rate": 2.7755555263072866e-06, + "loss": 0.1951, + "step": 16174 + }, + { + "epoch": 1.5237512069899437, + "grad_norm": 0.6402602791786194, + "learning_rate": 2.774511538417193e-06, + "loss": 0.2002, + "step": 16175 + }, + { + "epoch": 1.5238454110831117, + "grad_norm": 0.6372763514518738, + "learning_rate": 2.7734677152794087e-06, + "loss": 0.1884, + "step": 16176 + }, + { + "epoch": 1.5239396151762794, + "grad_norm": 0.7095897197723389, + "learning_rate": 2.772424056917735e-06, + "loss": 0.1976, + "step": 16177 + }, + { + "epoch": 1.5240338192694471, + "grad_norm": 0.6663805246353149, + "learning_rate": 2.7713805633559755e-06, + "loss": 0.2092, + "step": 16178 + }, + { + "epoch": 1.524128023362615, + "grad_norm": 0.6282491087913513, + "learning_rate": 2.7703372346179145e-06, + "loss": 0.1948, + "step": 16179 + }, + { + "epoch": 1.524222227455783, + "grad_norm": 0.5726357102394104, + "learning_rate": 2.7692940707273453e-06, + "loss": 0.1569, + "step": 16180 + }, + { + "epoch": 1.5243164315489508, + "grad_norm": 0.5925726294517517, + "learning_rate": 2.7682510717080568e-06, + "loss": 0.192, + "step": 16181 + }, + { + "epoch": 1.5244106356421185, + "grad_norm": 0.5967519283294678, + "learning_rate": 2.767208237583825e-06, + "loss": 0.1785, + "step": 16182 + }, + { + "epoch": 1.5245048397352865, + "grad_norm": 0.7821174263954163, + "learning_rate": 2.7661655683784305e-06, + "loss": 0.2038, + "step": 16183 + }, + { + "epoch": 1.5245990438284545, + "grad_norm": 0.6633504033088684, + "learning_rate": 2.7651230641156524e-06, + "loss": 0.2039, + "step": 16184 + }, + { + "epoch": 1.5246932479216222, + "grad_norm": 0.6780624985694885, + "learning_rate": 2.7640807248192535e-06, + "loss": 0.2173, + "step": 16185 + }, + { + "epoch": 1.52478745201479, + "grad_norm": 0.6351194381713867, + "learning_rate": 2.7630385505130054e-06, + "loss": 0.1945, + "step": 16186 + }, + { + "epoch": 1.524881656107958, + "grad_norm": 0.6171590685844421, + "learning_rate": 2.761996541220674e-06, + "loss": 0.1956, + "step": 16187 + }, + { + "epoch": 1.5249758602011259, + "grad_norm": 1.4074138402938843, + "learning_rate": 2.7609546969660117e-06, + "loss": 0.166, + "step": 16188 + }, + { + "epoch": 1.5250700642942936, + "grad_norm": 0.6705545783042908, + "learning_rate": 2.7599130177727775e-06, + "loss": 0.1659, + "step": 16189 + }, + { + "epoch": 1.5251642683874613, + "grad_norm": 0.7179521322250366, + "learning_rate": 2.758871503664726e-06, + "loss": 0.2459, + "step": 16190 + }, + { + "epoch": 1.5252584724806293, + "grad_norm": 0.9105493426322937, + "learning_rate": 2.757830154665604e-06, + "loss": 0.2044, + "step": 16191 + }, + { + "epoch": 1.5253526765737973, + "grad_norm": 1.5305920839309692, + "learning_rate": 2.75678897079915e-06, + "loss": 0.1955, + "step": 16192 + }, + { + "epoch": 1.525446880666965, + "grad_norm": 0.601993203163147, + "learning_rate": 2.7557479520891104e-06, + "loss": 0.1768, + "step": 16193 + }, + { + "epoch": 1.5255410847601327, + "grad_norm": 0.7127960920333862, + "learning_rate": 2.754707098559225e-06, + "loss": 0.1897, + "step": 16194 + }, + { + "epoch": 1.5256352888533007, + "grad_norm": 0.6438483595848083, + "learning_rate": 2.7536664102332177e-06, + "loss": 0.1962, + "step": 16195 + }, + { + "epoch": 1.5257294929464686, + "grad_norm": 0.7449595928192139, + "learning_rate": 2.7526258871348245e-06, + "loss": 0.2078, + "step": 16196 + }, + { + "epoch": 1.5258236970396364, + "grad_norm": 0.6828126907348633, + "learning_rate": 2.7515855292877714e-06, + "loss": 0.1838, + "step": 16197 + }, + { + "epoch": 1.5259179011328041, + "grad_norm": 0.6403012871742249, + "learning_rate": 2.750545336715776e-06, + "loss": 0.2085, + "step": 16198 + }, + { + "epoch": 1.526012105225972, + "grad_norm": 0.6840063333511353, + "learning_rate": 2.7495053094425584e-06, + "loss": 0.2087, + "step": 16199 + }, + { + "epoch": 1.52610630931914, + "grad_norm": 0.6032674312591553, + "learning_rate": 2.748465447491835e-06, + "loss": 0.197, + "step": 16200 + }, + { + "epoch": 1.5262005134123078, + "grad_norm": 0.6746863126754761, + "learning_rate": 2.7474257508873117e-06, + "loss": 0.1897, + "step": 16201 + }, + { + "epoch": 1.5262947175054755, + "grad_norm": 0.6539238691329956, + "learning_rate": 2.7463862196527e-06, + "loss": 0.1987, + "step": 16202 + }, + { + "epoch": 1.5263889215986435, + "grad_norm": 0.7085531949996948, + "learning_rate": 2.745346853811698e-06, + "loss": 0.2066, + "step": 16203 + }, + { + "epoch": 1.5264831256918114, + "grad_norm": 0.6681137084960938, + "learning_rate": 2.7443076533880074e-06, + "loss": 0.2126, + "step": 16204 + }, + { + "epoch": 1.5265773297849792, + "grad_norm": 0.5994083285331726, + "learning_rate": 2.743268618405326e-06, + "loss": 0.1887, + "step": 16205 + }, + { + "epoch": 1.526671533878147, + "grad_norm": 0.653131365776062, + "learning_rate": 2.7422297488873395e-06, + "loss": 0.1942, + "step": 16206 + }, + { + "epoch": 1.5267657379713149, + "grad_norm": 0.6739969849586487, + "learning_rate": 2.7411910448577405e-06, + "loss": 0.1897, + "step": 16207 + }, + { + "epoch": 1.5268599420644828, + "grad_norm": 0.6109043955802917, + "learning_rate": 2.7401525063402137e-06, + "loss": 0.1867, + "step": 16208 + }, + { + "epoch": 1.5269541461576506, + "grad_norm": 0.6434895396232605, + "learning_rate": 2.7391141333584335e-06, + "loss": 0.1783, + "step": 16209 + }, + { + "epoch": 1.5270483502508183, + "grad_norm": 0.6863971948623657, + "learning_rate": 2.738075925936081e-06, + "loss": 0.2047, + "step": 16210 + }, + { + "epoch": 1.5271425543439863, + "grad_norm": 0.6185998320579529, + "learning_rate": 2.7370378840968315e-06, + "loss": 0.2065, + "step": 16211 + }, + { + "epoch": 1.5272367584371542, + "grad_norm": 0.623668909072876, + "learning_rate": 2.7360000078643512e-06, + "loss": 0.1946, + "step": 16212 + }, + { + "epoch": 1.527330962530322, + "grad_norm": 0.6324273347854614, + "learning_rate": 2.734962297262297e-06, + "loss": 0.1886, + "step": 16213 + }, + { + "epoch": 1.5274251666234897, + "grad_norm": 0.6629226803779602, + "learning_rate": 2.733924752314345e-06, + "loss": 0.212, + "step": 16214 + }, + { + "epoch": 1.5275193707166577, + "grad_norm": 0.6107110381126404, + "learning_rate": 2.7328873730441465e-06, + "loss": 0.1907, + "step": 16215 + }, + { + "epoch": 1.5276135748098256, + "grad_norm": 0.7059251070022583, + "learning_rate": 2.731850159475351e-06, + "loss": 0.2054, + "step": 16216 + }, + { + "epoch": 1.5277077789029934, + "grad_norm": 0.6416538953781128, + "learning_rate": 2.7308131116316117e-06, + "loss": 0.1819, + "step": 16217 + }, + { + "epoch": 1.527801982996161, + "grad_norm": 0.6149570345878601, + "learning_rate": 2.7297762295365794e-06, + "loss": 0.1737, + "step": 16218 + }, + { + "epoch": 1.527896187089329, + "grad_norm": 0.632332444190979, + "learning_rate": 2.7287395132138893e-06, + "loss": 0.1843, + "step": 16219 + }, + { + "epoch": 1.527990391182497, + "grad_norm": 0.7795052528381348, + "learning_rate": 2.727702962687183e-06, + "loss": 0.2041, + "step": 16220 + }, + { + "epoch": 1.5280845952756648, + "grad_norm": 0.6708505749702454, + "learning_rate": 2.7266665779800996e-06, + "loss": 0.1861, + "step": 16221 + }, + { + "epoch": 1.5281787993688325, + "grad_norm": 0.6182888746261597, + "learning_rate": 2.7256303591162637e-06, + "loss": 0.2017, + "step": 16222 + }, + { + "epoch": 1.5282730034620005, + "grad_norm": 0.7270907163619995, + "learning_rate": 2.724594306119306e-06, + "loss": 0.203, + "step": 16223 + }, + { + "epoch": 1.5283672075551684, + "grad_norm": 0.6367576718330383, + "learning_rate": 2.7235584190128517e-06, + "loss": 0.1878, + "step": 16224 + }, + { + "epoch": 1.5284614116483362, + "grad_norm": 0.6641680002212524, + "learning_rate": 2.7225226978205164e-06, + "loss": 0.1818, + "step": 16225 + }, + { + "epoch": 1.528555615741504, + "grad_norm": 0.6329187750816345, + "learning_rate": 2.7214871425659182e-06, + "loss": 0.1803, + "step": 16226 + }, + { + "epoch": 1.5286498198346719, + "grad_norm": 0.6695205569267273, + "learning_rate": 2.7204517532726724e-06, + "loss": 0.1793, + "step": 16227 + }, + { + "epoch": 1.5287440239278398, + "grad_norm": 0.6727643609046936, + "learning_rate": 2.719416529964385e-06, + "loss": 0.1958, + "step": 16228 + }, + { + "epoch": 1.5288382280210076, + "grad_norm": 0.6859456300735474, + "learning_rate": 2.718381472664654e-06, + "loss": 0.219, + "step": 16229 + }, + { + "epoch": 1.5289324321141753, + "grad_norm": 0.6441728472709656, + "learning_rate": 2.7173465813970934e-06, + "loss": 0.1676, + "step": 16230 + }, + { + "epoch": 1.5290266362073432, + "grad_norm": 0.6026852130889893, + "learning_rate": 2.716311856185293e-06, + "loss": 0.1913, + "step": 16231 + }, + { + "epoch": 1.5291208403005112, + "grad_norm": 0.7040894627571106, + "learning_rate": 2.715277297052844e-06, + "loss": 0.2152, + "step": 16232 + }, + { + "epoch": 1.529215044393679, + "grad_norm": 0.6501896381378174, + "learning_rate": 2.7142429040233387e-06, + "loss": 0.2194, + "step": 16233 + }, + { + "epoch": 1.5293092484868467, + "grad_norm": 0.6888206005096436, + "learning_rate": 2.713208677120365e-06, + "loss": 0.1928, + "step": 16234 + }, + { + "epoch": 1.5294034525800146, + "grad_norm": 0.7510620951652527, + "learning_rate": 2.7121746163675e-06, + "loss": 0.2071, + "step": 16235 + }, + { + "epoch": 1.5294976566731826, + "grad_norm": 0.5633466839790344, + "learning_rate": 2.7111407217883255e-06, + "loss": 0.1927, + "step": 16236 + }, + { + "epoch": 1.5295918607663503, + "grad_norm": 0.6861047148704529, + "learning_rate": 2.7101069934064174e-06, + "loss": 0.188, + "step": 16237 + }, + { + "epoch": 1.529686064859518, + "grad_norm": 0.6694995164871216, + "learning_rate": 2.7090734312453404e-06, + "loss": 0.2055, + "step": 16238 + }, + { + "epoch": 1.529780268952686, + "grad_norm": 0.6832932829856873, + "learning_rate": 2.708040035328665e-06, + "loss": 0.1818, + "step": 16239 + }, + { + "epoch": 1.529874473045854, + "grad_norm": 0.675410270690918, + "learning_rate": 2.707006805679958e-06, + "loss": 0.2025, + "step": 16240 + }, + { + "epoch": 1.5299686771390217, + "grad_norm": 0.6247263550758362, + "learning_rate": 2.7059737423227706e-06, + "loss": 0.1856, + "step": 16241 + }, + { + "epoch": 1.5300628812321895, + "grad_norm": 0.6771447658538818, + "learning_rate": 2.7049408452806656e-06, + "loss": 0.1922, + "step": 16242 + }, + { + "epoch": 1.5301570853253574, + "grad_norm": 0.7252166867256165, + "learning_rate": 2.7039081145771882e-06, + "loss": 0.1966, + "step": 16243 + }, + { + "epoch": 1.5302512894185254, + "grad_norm": 0.6284207701683044, + "learning_rate": 2.7028755502358907e-06, + "loss": 0.1801, + "step": 16244 + }, + { + "epoch": 1.5303454935116931, + "grad_norm": 0.5970359444618225, + "learning_rate": 2.7018431522803166e-06, + "loss": 0.1593, + "step": 16245 + }, + { + "epoch": 1.5304396976048609, + "grad_norm": 0.6221842765808105, + "learning_rate": 2.700810920734004e-06, + "loss": 0.1908, + "step": 16246 + }, + { + "epoch": 1.5305339016980288, + "grad_norm": 0.6366804242134094, + "learning_rate": 2.6997788556204906e-06, + "loss": 0.2117, + "step": 16247 + }, + { + "epoch": 1.5306281057911968, + "grad_norm": 0.6622896194458008, + "learning_rate": 2.6987469569633117e-06, + "loss": 0.2163, + "step": 16248 + }, + { + "epoch": 1.5307223098843645, + "grad_norm": 0.6698190569877625, + "learning_rate": 2.6977152247859917e-06, + "loss": 0.2053, + "step": 16249 + }, + { + "epoch": 1.5308165139775323, + "grad_norm": 0.7152218222618103, + "learning_rate": 2.696683659112057e-06, + "loss": 0.1911, + "step": 16250 + }, + { + "epoch": 1.5309107180707002, + "grad_norm": 0.7443094849586487, + "learning_rate": 2.6956522599650335e-06, + "loss": 0.2291, + "step": 16251 + }, + { + "epoch": 1.531004922163868, + "grad_norm": 0.7532116770744324, + "learning_rate": 2.6946210273684336e-06, + "loss": 0.201, + "step": 16252 + }, + { + "epoch": 1.5310991262570357, + "grad_norm": 0.6666504144668579, + "learning_rate": 2.6935899613457705e-06, + "loss": 0.1915, + "step": 16253 + }, + { + "epoch": 1.5311933303502037, + "grad_norm": 0.7396573424339294, + "learning_rate": 2.6925590619205553e-06, + "loss": 0.2026, + "step": 16254 + }, + { + "epoch": 1.5312875344433716, + "grad_norm": 0.6690954566001892, + "learning_rate": 2.6915283291162973e-06, + "loss": 0.1947, + "step": 16255 + }, + { + "epoch": 1.5313817385365394, + "grad_norm": 0.6184488534927368, + "learning_rate": 2.6904977629564942e-06, + "loss": 0.1685, + "step": 16256 + }, + { + "epoch": 1.531475942629707, + "grad_norm": 0.7449617981910706, + "learning_rate": 2.6894673634646464e-06, + "loss": 0.2041, + "step": 16257 + }, + { + "epoch": 1.531570146722875, + "grad_norm": 0.7334285974502563, + "learning_rate": 2.688437130664251e-06, + "loss": 0.1808, + "step": 16258 + }, + { + "epoch": 1.531664350816043, + "grad_norm": 0.7738111019134521, + "learning_rate": 2.687407064578793e-06, + "loss": 0.1998, + "step": 16259 + }, + { + "epoch": 1.5317585549092108, + "grad_norm": 0.6780102252960205, + "learning_rate": 2.6863771652317638e-06, + "loss": 0.2011, + "step": 16260 + }, + { + "epoch": 1.5318527590023785, + "grad_norm": 0.6936577558517456, + "learning_rate": 2.6853474326466487e-06, + "loss": 0.1961, + "step": 16261 + }, + { + "epoch": 1.5319469630955465, + "grad_norm": 0.6879404783248901, + "learning_rate": 2.684317866846923e-06, + "loss": 0.218, + "step": 16262 + }, + { + "epoch": 1.5320411671887144, + "grad_norm": 1.008069634437561, + "learning_rate": 2.6832884678560623e-06, + "loss": 0.2037, + "step": 16263 + }, + { + "epoch": 1.5321353712818822, + "grad_norm": 0.6407217383384705, + "learning_rate": 2.682259235697543e-06, + "loss": 0.1791, + "step": 16264 + }, + { + "epoch": 1.53222957537505, + "grad_norm": 0.6090212464332581, + "learning_rate": 2.6812301703948273e-06, + "loss": 0.1764, + "step": 16265 + }, + { + "epoch": 1.5323237794682179, + "grad_norm": 0.6163294911384583, + "learning_rate": 2.680201271971383e-06, + "loss": 0.1788, + "step": 16266 + }, + { + "epoch": 1.5324179835613858, + "grad_norm": 0.647861897945404, + "learning_rate": 2.679172540450672e-06, + "loss": 0.1781, + "step": 16267 + }, + { + "epoch": 1.5325121876545535, + "grad_norm": 0.6163336038589478, + "learning_rate": 2.67814397585615e-06, + "loss": 0.2099, + "step": 16268 + }, + { + "epoch": 1.5326063917477213, + "grad_norm": 0.7356839179992676, + "learning_rate": 2.6771155782112624e-06, + "loss": 0.2283, + "step": 16269 + }, + { + "epoch": 1.5327005958408892, + "grad_norm": 0.6488597989082336, + "learning_rate": 2.676087347539471e-06, + "loss": 0.1928, + "step": 16270 + }, + { + "epoch": 1.5327947999340572, + "grad_norm": 0.6527369618415833, + "learning_rate": 2.6750592838642144e-06, + "loss": 0.2036, + "step": 16271 + }, + { + "epoch": 1.532889004027225, + "grad_norm": 1.2340357303619385, + "learning_rate": 2.6740313872089306e-06, + "loss": 0.2188, + "step": 16272 + }, + { + "epoch": 1.5329832081203927, + "grad_norm": 0.6677947640419006, + "learning_rate": 2.6730036575970618e-06, + "loss": 0.1902, + "step": 16273 + }, + { + "epoch": 1.5330774122135606, + "grad_norm": 0.6580592393875122, + "learning_rate": 2.6719760950520445e-06, + "loss": 0.1929, + "step": 16274 + }, + { + "epoch": 1.5331716163067286, + "grad_norm": 0.6391635537147522, + "learning_rate": 2.670948699597302e-06, + "loss": 0.1764, + "step": 16275 + }, + { + "epoch": 1.5332658203998963, + "grad_norm": 0.6302698254585266, + "learning_rate": 2.6699214712562627e-06, + "loss": 0.186, + "step": 16276 + }, + { + "epoch": 1.533360024493064, + "grad_norm": 0.6598572134971619, + "learning_rate": 2.668894410052354e-06, + "loss": 0.201, + "step": 16277 + }, + { + "epoch": 1.533454228586232, + "grad_norm": 0.6469151377677917, + "learning_rate": 2.6678675160089872e-06, + "loss": 0.2002, + "step": 16278 + }, + { + "epoch": 1.5335484326794, + "grad_norm": 0.5996238589286804, + "learning_rate": 2.6668407891495806e-06, + "loss": 0.1552, + "step": 16279 + }, + { + "epoch": 1.5336426367725677, + "grad_norm": 0.5995888710021973, + "learning_rate": 2.6658142294975486e-06, + "loss": 0.1846, + "step": 16280 + }, + { + "epoch": 1.5337368408657355, + "grad_norm": 0.6957386136054993, + "learning_rate": 2.6647878370762903e-06, + "loss": 0.2026, + "step": 16281 + }, + { + "epoch": 1.5338310449589034, + "grad_norm": 0.5960253477096558, + "learning_rate": 2.663761611909218e-06, + "loss": 0.1952, + "step": 16282 + }, + { + "epoch": 1.5339252490520714, + "grad_norm": 0.6038472056388855, + "learning_rate": 2.662735554019722e-06, + "loss": 0.1941, + "step": 16283 + }, + { + "epoch": 1.5340194531452391, + "grad_norm": 0.7028207778930664, + "learning_rate": 2.6617096634312036e-06, + "loss": 0.1982, + "step": 16284 + }, + { + "epoch": 1.5341136572384069, + "grad_norm": 0.708573043346405, + "learning_rate": 2.660683940167057e-06, + "loss": 0.1869, + "step": 16285 + }, + { + "epoch": 1.5342078613315748, + "grad_norm": 0.6810007691383362, + "learning_rate": 2.659658384250663e-06, + "loss": 0.2024, + "step": 16286 + }, + { + "epoch": 1.5343020654247428, + "grad_norm": 0.6183075308799744, + "learning_rate": 2.6586329957054114e-06, + "loss": 0.1765, + "step": 16287 + }, + { + "epoch": 1.5343962695179105, + "grad_norm": 0.6494288444519043, + "learning_rate": 2.6576077745546823e-06, + "loss": 0.1976, + "step": 16288 + }, + { + "epoch": 1.5344904736110783, + "grad_norm": 0.6704617738723755, + "learning_rate": 2.65658272082185e-06, + "loss": 0.1741, + "step": 16289 + }, + { + "epoch": 1.5345846777042462, + "grad_norm": 0.6166954636573792, + "learning_rate": 2.655557834530288e-06, + "loss": 0.1712, + "step": 16290 + }, + { + "epoch": 1.5346788817974142, + "grad_norm": 0.7561928629875183, + "learning_rate": 2.6545331157033682e-06, + "loss": 0.191, + "step": 16291 + }, + { + "epoch": 1.534773085890582, + "grad_norm": 0.6683579087257385, + "learning_rate": 2.653508564364453e-06, + "loss": 0.195, + "step": 16292 + }, + { + "epoch": 1.5348672899837497, + "grad_norm": 0.6410741209983826, + "learning_rate": 2.652484180536902e-06, + "loss": 0.2003, + "step": 16293 + }, + { + "epoch": 1.5349614940769176, + "grad_norm": 0.6511902809143066, + "learning_rate": 2.6514599642440742e-06, + "loss": 0.1995, + "step": 16294 + }, + { + "epoch": 1.5350556981700856, + "grad_norm": 0.6683751940727234, + "learning_rate": 2.6504359155093273e-06, + "loss": 0.1971, + "step": 16295 + }, + { + "epoch": 1.5351499022632533, + "grad_norm": 0.6326442360877991, + "learning_rate": 2.649412034356005e-06, + "loss": 0.1763, + "step": 16296 + }, + { + "epoch": 1.535244106356421, + "grad_norm": 0.6577661633491516, + "learning_rate": 2.6483883208074557e-06, + "loss": 0.218, + "step": 16297 + }, + { + "epoch": 1.535338310449589, + "grad_norm": 0.673072338104248, + "learning_rate": 2.6473647748870258e-06, + "loss": 0.228, + "step": 16298 + }, + { + "epoch": 1.535432514542757, + "grad_norm": 0.6371365189552307, + "learning_rate": 2.6463413966180463e-06, + "loss": 0.1773, + "step": 16299 + }, + { + "epoch": 1.5355267186359247, + "grad_norm": 0.7425619959831238, + "learning_rate": 2.6453181860238563e-06, + "loss": 0.2231, + "step": 16300 + }, + { + "epoch": 1.5356209227290925, + "grad_norm": 0.6594802141189575, + "learning_rate": 2.6442951431277886e-06, + "loss": 0.2232, + "step": 16301 + }, + { + "epoch": 1.5357151268222604, + "grad_norm": 0.5638049840927124, + "learning_rate": 2.6432722679531654e-06, + "loss": 0.1602, + "step": 16302 + }, + { + "epoch": 1.5358093309154284, + "grad_norm": 0.6391991972923279, + "learning_rate": 2.64224956052331e-06, + "loss": 0.2083, + "step": 16303 + }, + { + "epoch": 1.535903535008596, + "grad_norm": 0.6430094242095947, + "learning_rate": 2.6412270208615477e-06, + "loss": 0.2043, + "step": 16304 + }, + { + "epoch": 1.5359977391017638, + "grad_norm": 0.6466431021690369, + "learning_rate": 2.6402046489911904e-06, + "loss": 0.1839, + "step": 16305 + }, + { + "epoch": 1.5360919431949318, + "grad_norm": 0.6663456559181213, + "learning_rate": 2.639182444935542e-06, + "loss": 0.1863, + "step": 16306 + }, + { + "epoch": 1.5361861472880998, + "grad_norm": 0.5823646187782288, + "learning_rate": 2.6381604087179247e-06, + "loss": 0.1977, + "step": 16307 + }, + { + "epoch": 1.5362803513812675, + "grad_norm": 0.6902181506156921, + "learning_rate": 2.6371385403616345e-06, + "loss": 0.2364, + "step": 16308 + }, + { + "epoch": 1.5363745554744352, + "grad_norm": 0.5892136693000793, + "learning_rate": 2.636116839889967e-06, + "loss": 0.1787, + "step": 16309 + }, + { + "epoch": 1.5364687595676032, + "grad_norm": 0.6292369961738586, + "learning_rate": 2.6350953073262297e-06, + "loss": 0.2, + "step": 16310 + }, + { + "epoch": 1.5365629636607712, + "grad_norm": 0.6879388689994812, + "learning_rate": 2.6340739426937103e-06, + "loss": 0.1945, + "step": 16311 + }, + { + "epoch": 1.536657167753939, + "grad_norm": 0.6777614951133728, + "learning_rate": 2.633052746015693e-06, + "loss": 0.1971, + "step": 16312 + }, + { + "epoch": 1.5367513718471066, + "grad_norm": 0.605811357498169, + "learning_rate": 2.6320317173154665e-06, + "loss": 0.1832, + "step": 16313 + }, + { + "epoch": 1.5368455759402746, + "grad_norm": 0.6477255821228027, + "learning_rate": 2.6310108566163138e-06, + "loss": 0.2055, + "step": 16314 + }, + { + "epoch": 1.5369397800334426, + "grad_norm": 0.625795841217041, + "learning_rate": 2.629990163941507e-06, + "loss": 0.1673, + "step": 16315 + }, + { + "epoch": 1.5370339841266103, + "grad_norm": 0.6637027859687805, + "learning_rate": 2.628969639314324e-06, + "loss": 0.1795, + "step": 16316 + }, + { + "epoch": 1.537128188219778, + "grad_norm": 0.6212215423583984, + "learning_rate": 2.6279492827580345e-06, + "loss": 0.1654, + "step": 16317 + }, + { + "epoch": 1.537222392312946, + "grad_norm": 0.6320905685424805, + "learning_rate": 2.626929094295899e-06, + "loss": 0.1973, + "step": 16318 + }, + { + "epoch": 1.537316596406114, + "grad_norm": 0.642948567867279, + "learning_rate": 2.625909073951184e-06, + "loss": 0.1787, + "step": 16319 + }, + { + "epoch": 1.5374108004992817, + "grad_norm": 0.6066054701805115, + "learning_rate": 2.624889221747149e-06, + "loss": 0.1766, + "step": 16320 + }, + { + "epoch": 1.5375050045924494, + "grad_norm": 0.7016599178314209, + "learning_rate": 2.623869537707042e-06, + "loss": 0.229, + "step": 16321 + }, + { + "epoch": 1.5375992086856174, + "grad_norm": 0.7277583479881287, + "learning_rate": 2.62285002185412e-06, + "loss": 0.1804, + "step": 16322 + }, + { + "epoch": 1.5376934127787854, + "grad_norm": 0.6963581442832947, + "learning_rate": 2.621830674211624e-06, + "loss": 0.1875, + "step": 16323 + }, + { + "epoch": 1.537787616871953, + "grad_norm": 0.7287015914916992, + "learning_rate": 2.6208114948027987e-06, + "loss": 0.2095, + "step": 16324 + }, + { + "epoch": 1.5378818209651208, + "grad_norm": 0.6737769246101379, + "learning_rate": 2.619792483650887e-06, + "loss": 0.2023, + "step": 16325 + }, + { + "epoch": 1.5379760250582888, + "grad_norm": 0.678702175617218, + "learning_rate": 2.6187736407791178e-06, + "loss": 0.1948, + "step": 16326 + }, + { + "epoch": 1.5380702291514567, + "grad_norm": 0.6418389081954956, + "learning_rate": 2.6177549662107237e-06, + "loss": 0.1974, + "step": 16327 + }, + { + "epoch": 1.5381644332446245, + "grad_norm": 0.6424870491027832, + "learning_rate": 2.616736459968936e-06, + "loss": 0.1767, + "step": 16328 + }, + { + "epoch": 1.5382586373377922, + "grad_norm": 0.7702372670173645, + "learning_rate": 2.6157181220769734e-06, + "loss": 0.2148, + "step": 16329 + }, + { + "epoch": 1.5383528414309602, + "grad_norm": 0.69382643699646, + "learning_rate": 2.6146999525580575e-06, + "loss": 0.2213, + "step": 16330 + }, + { + "epoch": 1.5384470455241281, + "grad_norm": 0.6558042168617249, + "learning_rate": 2.6136819514354075e-06, + "loss": 0.2048, + "step": 16331 + }, + { + "epoch": 1.5385412496172959, + "grad_norm": 0.6997061967849731, + "learning_rate": 2.612664118732232e-06, + "loss": 0.2212, + "step": 16332 + }, + { + "epoch": 1.5386354537104636, + "grad_norm": 0.5648784041404724, + "learning_rate": 2.611646454471736e-06, + "loss": 0.1708, + "step": 16333 + }, + { + "epoch": 1.5387296578036316, + "grad_norm": 1.061224102973938, + "learning_rate": 2.6106289586771273e-06, + "loss": 0.1672, + "step": 16334 + }, + { + "epoch": 1.5388238618967995, + "grad_norm": 0.6273217797279358, + "learning_rate": 2.609611631371609e-06, + "loss": 0.2216, + "step": 16335 + }, + { + "epoch": 1.5389180659899673, + "grad_norm": 0.5811769962310791, + "learning_rate": 2.6085944725783716e-06, + "loss": 0.187, + "step": 16336 + }, + { + "epoch": 1.539012270083135, + "grad_norm": 0.6060513257980347, + "learning_rate": 2.6075774823206122e-06, + "loss": 0.1984, + "step": 16337 + }, + { + "epoch": 1.539106474176303, + "grad_norm": 0.6198642253875732, + "learning_rate": 2.606560660621522e-06, + "loss": 0.2173, + "step": 16338 + }, + { + "epoch": 1.539200678269471, + "grad_norm": 0.6413329839706421, + "learning_rate": 2.6055440075042793e-06, + "loss": 0.1968, + "step": 16339 + }, + { + "epoch": 1.5392948823626387, + "grad_norm": 0.7054868340492249, + "learning_rate": 2.6045275229920686e-06, + "loss": 0.2208, + "step": 16340 + }, + { + "epoch": 1.5393890864558064, + "grad_norm": 0.6680474281311035, + "learning_rate": 2.6035112071080715e-06, + "loss": 0.1842, + "step": 16341 + }, + { + "epoch": 1.5394832905489744, + "grad_norm": 0.7053892612457275, + "learning_rate": 2.602495059875454e-06, + "loss": 0.2042, + "step": 16342 + }, + { + "epoch": 1.5395774946421423, + "grad_norm": 0.6567726731300354, + "learning_rate": 2.6014790813173907e-06, + "loss": 0.1894, + "step": 16343 + }, + { + "epoch": 1.53967169873531, + "grad_norm": 0.8481297492980957, + "learning_rate": 2.6004632714570486e-06, + "loss": 0.1972, + "step": 16344 + }, + { + "epoch": 1.5397659028284778, + "grad_norm": 0.6491158604621887, + "learning_rate": 2.5994476303175876e-06, + "loss": 0.1983, + "step": 16345 + }, + { + "epoch": 1.5398601069216458, + "grad_norm": 0.7349384427070618, + "learning_rate": 2.5984321579221593e-06, + "loss": 0.1801, + "step": 16346 + }, + { + "epoch": 1.5399543110148137, + "grad_norm": 0.7639728784561157, + "learning_rate": 2.5974168542939317e-06, + "loss": 0.1878, + "step": 16347 + }, + { + "epoch": 1.5400485151079815, + "grad_norm": 0.6427068114280701, + "learning_rate": 2.5964017194560466e-06, + "loss": 0.159, + "step": 16348 + }, + { + "epoch": 1.5401427192011492, + "grad_norm": 0.6762670874595642, + "learning_rate": 2.595386753431648e-06, + "loss": 0.1978, + "step": 16349 + }, + { + "epoch": 1.5402369232943172, + "grad_norm": 0.649671196937561, + "learning_rate": 2.5943719562438875e-06, + "loss": 0.1997, + "step": 16350 + }, + { + "epoch": 1.5403311273874851, + "grad_norm": 0.6771973371505737, + "learning_rate": 2.5933573279158996e-06, + "loss": 0.1911, + "step": 16351 + }, + { + "epoch": 1.5404253314806529, + "grad_norm": 0.6386894583702087, + "learning_rate": 2.5923428684708163e-06, + "loss": 0.1902, + "step": 16352 + }, + { + "epoch": 1.5405195355738206, + "grad_norm": 0.6674006581306458, + "learning_rate": 2.591328577931772e-06, + "loss": 0.1708, + "step": 16353 + }, + { + "epoch": 1.5406137396669886, + "grad_norm": 0.8000255227088928, + "learning_rate": 2.5903144563218974e-06, + "loss": 0.1919, + "step": 16354 + }, + { + "epoch": 1.5407079437601565, + "grad_norm": 0.6232596039772034, + "learning_rate": 2.589300503664308e-06, + "loss": 0.1949, + "step": 16355 + }, + { + "epoch": 1.5408021478533243, + "grad_norm": 0.7329729795455933, + "learning_rate": 2.5882867199821293e-06, + "loss": 0.2069, + "step": 16356 + }, + { + "epoch": 1.540896351946492, + "grad_norm": 0.6691109538078308, + "learning_rate": 2.587273105298479e-06, + "loss": 0.1986, + "step": 16357 + }, + { + "epoch": 1.54099055603966, + "grad_norm": 0.6441341042518616, + "learning_rate": 2.586259659636462e-06, + "loss": 0.2028, + "step": 16358 + }, + { + "epoch": 1.541084760132828, + "grad_norm": 0.7453688979148865, + "learning_rate": 2.58524638301919e-06, + "loss": 0.1959, + "step": 16359 + }, + { + "epoch": 1.5411789642259957, + "grad_norm": 0.6126582622528076, + "learning_rate": 2.584233275469772e-06, + "loss": 0.1821, + "step": 16360 + }, + { + "epoch": 1.5412731683191634, + "grad_norm": 0.6151001453399658, + "learning_rate": 2.5832203370112995e-06, + "loss": 0.1828, + "step": 16361 + }, + { + "epoch": 1.5413673724123313, + "grad_norm": 0.7793347835540771, + "learning_rate": 2.582207567666878e-06, + "loss": 0.1765, + "step": 16362 + }, + { + "epoch": 1.5414615765054993, + "grad_norm": 0.8821308016777039, + "learning_rate": 2.5811949674595916e-06, + "loss": 0.2105, + "step": 16363 + }, + { + "epoch": 1.541555780598667, + "grad_norm": 0.6928325295448303, + "learning_rate": 2.5801825364125343e-06, + "loss": 0.199, + "step": 16364 + }, + { + "epoch": 1.5416499846918348, + "grad_norm": 0.648412823677063, + "learning_rate": 2.5791702745487924e-06, + "loss": 0.1792, + "step": 16365 + }, + { + "epoch": 1.5417441887850027, + "grad_norm": 0.6915906667709351, + "learning_rate": 2.5781581818914424e-06, + "loss": 0.2278, + "step": 16366 + }, + { + "epoch": 1.5418383928781707, + "grad_norm": 0.6613881587982178, + "learning_rate": 2.577146258463563e-06, + "loss": 0.2078, + "step": 16367 + }, + { + "epoch": 1.5419325969713384, + "grad_norm": 0.5999644994735718, + "learning_rate": 2.5761345042882326e-06, + "loss": 0.1733, + "step": 16368 + }, + { + "epoch": 1.5420268010645062, + "grad_norm": 0.6650545001029968, + "learning_rate": 2.575122919388513e-06, + "loss": 0.188, + "step": 16369 + }, + { + "epoch": 1.5421210051576741, + "grad_norm": 0.640903651714325, + "learning_rate": 2.5741115037874776e-06, + "loss": 0.1903, + "step": 16370 + }, + { + "epoch": 1.542215209250842, + "grad_norm": 0.6816896200180054, + "learning_rate": 2.57310025750818e-06, + "loss": 0.199, + "step": 16371 + }, + { + "epoch": 1.5423094133440098, + "grad_norm": 0.6032940745353699, + "learning_rate": 2.5720891805736857e-06, + "loss": 0.2066, + "step": 16372 + }, + { + "epoch": 1.5424036174371776, + "grad_norm": 0.6310244798660278, + "learning_rate": 2.571078273007044e-06, + "loss": 0.186, + "step": 16373 + }, + { + "epoch": 1.5424978215303455, + "grad_norm": 0.6525641679763794, + "learning_rate": 2.570067534831305e-06, + "loss": 0.1781, + "step": 16374 + }, + { + "epoch": 1.5425920256235135, + "grad_norm": 0.6778793931007385, + "learning_rate": 2.5690569660695207e-06, + "loss": 0.1985, + "step": 16375 + }, + { + "epoch": 1.5426862297166812, + "grad_norm": 0.6526602506637573, + "learning_rate": 2.568046566744726e-06, + "loss": 0.2134, + "step": 16376 + }, + { + "epoch": 1.542780433809849, + "grad_norm": 0.7703606486320496, + "learning_rate": 2.5670363368799646e-06, + "loss": 0.2488, + "step": 16377 + }, + { + "epoch": 1.542874637903017, + "grad_norm": 0.6597605347633362, + "learning_rate": 2.5660262764982723e-06, + "loss": 0.2093, + "step": 16378 + }, + { + "epoch": 1.542968841996185, + "grad_norm": 0.6802690625190735, + "learning_rate": 2.565016385622675e-06, + "loss": 0.1996, + "step": 16379 + }, + { + "epoch": 1.5430630460893526, + "grad_norm": 0.6328654289245605, + "learning_rate": 2.5640066642762017e-06, + "loss": 0.1961, + "step": 16380 + }, + { + "epoch": 1.5431572501825204, + "grad_norm": 0.6502787470817566, + "learning_rate": 2.56299711248188e-06, + "loss": 0.2054, + "step": 16381 + }, + { + "epoch": 1.5432514542756883, + "grad_norm": 1.638429045677185, + "learning_rate": 2.561987730262723e-06, + "loss": 0.1946, + "step": 16382 + }, + { + "epoch": 1.5433456583688563, + "grad_norm": 0.5941994190216064, + "learning_rate": 2.5609785176417478e-06, + "loss": 0.1764, + "step": 16383 + }, + { + "epoch": 1.543439862462024, + "grad_norm": 0.6767863035202026, + "learning_rate": 2.559969474641971e-06, + "loss": 0.156, + "step": 16384 + }, + { + "epoch": 1.5435340665551918, + "grad_norm": 0.672737181186676, + "learning_rate": 2.5589606012863968e-06, + "loss": 0.1933, + "step": 16385 + }, + { + "epoch": 1.5436282706483597, + "grad_norm": 0.6486581563949585, + "learning_rate": 2.557951897598022e-06, + "loss": 0.1959, + "step": 16386 + }, + { + "epoch": 1.5437224747415277, + "grad_norm": 1.0440093278884888, + "learning_rate": 2.5569433635998597e-06, + "loss": 0.183, + "step": 16387 + }, + { + "epoch": 1.5438166788346954, + "grad_norm": 0.6651913523674011, + "learning_rate": 2.5559349993148984e-06, + "loss": 0.2024, + "step": 16388 + }, + { + "epoch": 1.5439108829278632, + "grad_norm": 0.669744610786438, + "learning_rate": 2.554926804766127e-06, + "loss": 0.2179, + "step": 16389 + }, + { + "epoch": 1.5440050870210311, + "grad_norm": 0.7099943161010742, + "learning_rate": 2.553918779976544e-06, + "loss": 0.2163, + "step": 16390 + }, + { + "epoch": 1.5440992911141989, + "grad_norm": 0.6635091304779053, + "learning_rate": 2.5529109249691285e-06, + "loss": 0.1972, + "step": 16391 + }, + { + "epoch": 1.5441934952073666, + "grad_norm": 0.6373885869979858, + "learning_rate": 2.5519032397668575e-06, + "loss": 0.1716, + "step": 16392 + }, + { + "epoch": 1.5442876993005346, + "grad_norm": 0.6005825996398926, + "learning_rate": 2.5508957243927126e-06, + "loss": 0.163, + "step": 16393 + }, + { + "epoch": 1.5443819033937025, + "grad_norm": 0.6473194360733032, + "learning_rate": 2.5498883788696673e-06, + "loss": 0.1759, + "step": 16394 + }, + { + "epoch": 1.5444761074868703, + "grad_norm": 1.1488068103790283, + "learning_rate": 2.5488812032206855e-06, + "loss": 0.1964, + "step": 16395 + }, + { + "epoch": 1.544570311580038, + "grad_norm": 0.7530942559242249, + "learning_rate": 2.547874197468736e-06, + "loss": 0.2287, + "step": 16396 + }, + { + "epoch": 1.544664515673206, + "grad_norm": 0.7974828481674194, + "learning_rate": 2.5468673616367835e-06, + "loss": 0.2141, + "step": 16397 + }, + { + "epoch": 1.544758719766374, + "grad_norm": 0.6990633010864258, + "learning_rate": 2.5458606957477784e-06, + "loss": 0.192, + "step": 16398 + }, + { + "epoch": 1.5448529238595416, + "grad_norm": 0.6368348002433777, + "learning_rate": 2.5448541998246767e-06, + "loss": 0.1847, + "step": 16399 + }, + { + "epoch": 1.5449471279527094, + "grad_norm": 0.7107001543045044, + "learning_rate": 2.543847873890433e-06, + "loss": 0.2334, + "step": 16400 + }, + { + "epoch": 1.5450413320458773, + "grad_norm": 0.6501368284225464, + "learning_rate": 2.5428417179679842e-06, + "loss": 0.2043, + "step": 16401 + }, + { + "epoch": 1.5451355361390453, + "grad_norm": 0.6266803741455078, + "learning_rate": 2.541835732080281e-06, + "loss": 0.2159, + "step": 16402 + }, + { + "epoch": 1.545229740232213, + "grad_norm": 0.646104633808136, + "learning_rate": 2.5408299162502546e-06, + "loss": 0.2266, + "step": 16403 + }, + { + "epoch": 1.5453239443253808, + "grad_norm": 0.6524372100830078, + "learning_rate": 2.5398242705008412e-06, + "loss": 0.2196, + "step": 16404 + }, + { + "epoch": 1.5454181484185487, + "grad_norm": 0.6856186389923096, + "learning_rate": 2.538818794854976e-06, + "loss": 0.2091, + "step": 16405 + }, + { + "epoch": 1.5455123525117167, + "grad_norm": 0.6180176138877869, + "learning_rate": 2.5378134893355777e-06, + "loss": 0.1891, + "step": 16406 + }, + { + "epoch": 1.5456065566048844, + "grad_norm": 0.6351768970489502, + "learning_rate": 2.536808353965572e-06, + "loss": 0.193, + "step": 16407 + }, + { + "epoch": 1.5457007606980522, + "grad_norm": 0.6795313954353333, + "learning_rate": 2.535803388767881e-06, + "loss": 0.2184, + "step": 16408 + }, + { + "epoch": 1.5457949647912201, + "grad_norm": 0.6426952481269836, + "learning_rate": 2.534798593765414e-06, + "loss": 0.197, + "step": 16409 + }, + { + "epoch": 1.545889168884388, + "grad_norm": 0.6773859262466431, + "learning_rate": 2.5337939689810864e-06, + "loss": 0.1714, + "step": 16410 + }, + { + "epoch": 1.5459833729775558, + "grad_norm": 0.7835344672203064, + "learning_rate": 2.5327895144378e-06, + "loss": 0.1668, + "step": 16411 + }, + { + "epoch": 1.5460775770707236, + "grad_norm": 0.6896972060203552, + "learning_rate": 2.5317852301584642e-06, + "loss": 0.2217, + "step": 16412 + }, + { + "epoch": 1.5461717811638915, + "grad_norm": 0.7565961480140686, + "learning_rate": 2.5307811161659724e-06, + "loss": 0.1882, + "step": 16413 + }, + { + "epoch": 1.5462659852570595, + "grad_norm": 0.6159785985946655, + "learning_rate": 2.5297771724832222e-06, + "loss": 0.1672, + "step": 16414 + }, + { + "epoch": 1.5463601893502272, + "grad_norm": 0.6429604291915894, + "learning_rate": 2.5287733991331074e-06, + "loss": 0.1837, + "step": 16415 + }, + { + "epoch": 1.546454393443395, + "grad_norm": 0.7100839018821716, + "learning_rate": 2.5277697961385118e-06, + "loss": 0.193, + "step": 16416 + }, + { + "epoch": 1.546548597536563, + "grad_norm": 0.6241758465766907, + "learning_rate": 2.5267663635223205e-06, + "loss": 0.1841, + "step": 16417 + }, + { + "epoch": 1.546642801629731, + "grad_norm": 0.6817502379417419, + "learning_rate": 2.525763101307417e-06, + "loss": 0.2044, + "step": 16418 + }, + { + "epoch": 1.5467370057228986, + "grad_norm": 0.750516414642334, + "learning_rate": 2.5247600095166702e-06, + "loss": 0.1999, + "step": 16419 + }, + { + "epoch": 1.5468312098160664, + "grad_norm": 0.6559773087501526, + "learning_rate": 2.5237570881729566e-06, + "loss": 0.1962, + "step": 16420 + }, + { + "epoch": 1.5469254139092343, + "grad_norm": 0.7364010810852051, + "learning_rate": 2.522754337299146e-06, + "loss": 0.1975, + "step": 16421 + }, + { + "epoch": 1.5470196180024023, + "grad_norm": 0.6457366943359375, + "learning_rate": 2.5217517569181003e-06, + "loss": 0.184, + "step": 16422 + }, + { + "epoch": 1.54711382209557, + "grad_norm": 0.6395455002784729, + "learning_rate": 2.5207493470526747e-06, + "loss": 0.1684, + "step": 16423 + }, + { + "epoch": 1.5472080261887378, + "grad_norm": 0.6169290542602539, + "learning_rate": 2.5197471077257362e-06, + "loss": 0.1963, + "step": 16424 + }, + { + "epoch": 1.5473022302819057, + "grad_norm": 0.6614916324615479, + "learning_rate": 2.518745038960132e-06, + "loss": 0.1964, + "step": 16425 + }, + { + "epoch": 1.5473964343750737, + "grad_norm": 0.6604982018470764, + "learning_rate": 2.5177431407787056e-06, + "loss": 0.192, + "step": 16426 + }, + { + "epoch": 1.5474906384682414, + "grad_norm": 0.6813318133354187, + "learning_rate": 2.516741413204312e-06, + "loss": 0.2218, + "step": 16427 + }, + { + "epoch": 1.5475848425614092, + "grad_norm": 0.6619900465011597, + "learning_rate": 2.515739856259788e-06, + "loss": 0.2102, + "step": 16428 + }, + { + "epoch": 1.5476790466545771, + "grad_norm": 0.6388282179832458, + "learning_rate": 2.5147384699679646e-06, + "loss": 0.1953, + "step": 16429 + }, + { + "epoch": 1.547773250747745, + "grad_norm": 0.7000366449356079, + "learning_rate": 2.5137372543516847e-06, + "loss": 0.2173, + "step": 16430 + }, + { + "epoch": 1.5478674548409128, + "grad_norm": 0.6196354031562805, + "learning_rate": 2.5127362094337737e-06, + "loss": 0.1891, + "step": 16431 + }, + { + "epoch": 1.5479616589340806, + "grad_norm": 0.6312999725341797, + "learning_rate": 2.5117353352370544e-06, + "loss": 0.1813, + "step": 16432 + }, + { + "epoch": 1.5480558630272485, + "grad_norm": 0.6917158365249634, + "learning_rate": 2.5107346317843494e-06, + "loss": 0.2173, + "step": 16433 + }, + { + "epoch": 1.5481500671204165, + "grad_norm": 0.8725519180297852, + "learning_rate": 2.50973409909848e-06, + "loss": 0.2205, + "step": 16434 + }, + { + "epoch": 1.5482442712135842, + "grad_norm": 0.6720922589302063, + "learning_rate": 2.5087337372022546e-06, + "loss": 0.1904, + "step": 16435 + }, + { + "epoch": 1.548338475306752, + "grad_norm": 0.6058753132820129, + "learning_rate": 2.5077335461184847e-06, + "loss": 0.1544, + "step": 16436 + }, + { + "epoch": 1.54843267939992, + "grad_norm": 0.6513206958770752, + "learning_rate": 2.506733525869981e-06, + "loss": 0.1971, + "step": 16437 + }, + { + "epoch": 1.5485268834930879, + "grad_norm": 0.6544703841209412, + "learning_rate": 2.505733676479537e-06, + "loss": 0.2054, + "step": 16438 + }, + { + "epoch": 1.5486210875862556, + "grad_norm": 0.7138603329658508, + "learning_rate": 2.504733997969957e-06, + "loss": 0.2071, + "step": 16439 + }, + { + "epoch": 1.5487152916794233, + "grad_norm": 0.7055628299713135, + "learning_rate": 2.5037344903640347e-06, + "loss": 0.2039, + "step": 16440 + }, + { + "epoch": 1.5488094957725913, + "grad_norm": 0.6753688454627991, + "learning_rate": 2.5027351536845578e-06, + "loss": 0.2096, + "step": 16441 + }, + { + "epoch": 1.5489036998657593, + "grad_norm": 0.6567484736442566, + "learning_rate": 2.5017359879543168e-06, + "loss": 0.2127, + "step": 16442 + }, + { + "epoch": 1.548997903958927, + "grad_norm": 0.6667615175247192, + "learning_rate": 2.500736993196088e-06, + "loss": 0.2014, + "step": 16443 + }, + { + "epoch": 1.5490921080520947, + "grad_norm": 0.6586611270904541, + "learning_rate": 2.499738169432654e-06, + "loss": 0.1909, + "step": 16444 + }, + { + "epoch": 1.5491863121452627, + "grad_norm": 0.6358745098114014, + "learning_rate": 2.498739516686792e-06, + "loss": 0.2005, + "step": 16445 + }, + { + "epoch": 1.5492805162384307, + "grad_norm": 0.6489126682281494, + "learning_rate": 2.497741034981267e-06, + "loss": 0.1889, + "step": 16446 + }, + { + "epoch": 1.5493747203315984, + "grad_norm": 0.6446957588195801, + "learning_rate": 2.4967427243388485e-06, + "loss": 0.2126, + "step": 16447 + }, + { + "epoch": 1.5494689244247661, + "grad_norm": 0.6304161548614502, + "learning_rate": 2.4957445847823036e-06, + "loss": 0.1778, + "step": 16448 + }, + { + "epoch": 1.549563128517934, + "grad_norm": 0.6196681261062622, + "learning_rate": 2.4947466163343838e-06, + "loss": 0.2067, + "step": 16449 + }, + { + "epoch": 1.549657332611102, + "grad_norm": 0.6782927513122559, + "learning_rate": 2.4937488190178518e-06, + "loss": 0.1987, + "step": 16450 + }, + { + "epoch": 1.5497515367042698, + "grad_norm": 0.7568427324295044, + "learning_rate": 2.4927511928554525e-06, + "loss": 0.1914, + "step": 16451 + }, + { + "epoch": 1.5498457407974375, + "grad_norm": 0.8635743856430054, + "learning_rate": 2.4917537378699386e-06, + "loss": 0.2025, + "step": 16452 + }, + { + "epoch": 1.5499399448906055, + "grad_norm": 0.6215642094612122, + "learning_rate": 2.490756454084049e-06, + "loss": 0.193, + "step": 16453 + }, + { + "epoch": 1.5500341489837735, + "grad_norm": 0.6086340546607971, + "learning_rate": 2.4897593415205255e-06, + "loss": 0.1849, + "step": 16454 + }, + { + "epoch": 1.5501283530769412, + "grad_norm": 0.6403914093971252, + "learning_rate": 2.488762400202106e-06, + "loss": 0.219, + "step": 16455 + }, + { + "epoch": 1.550222557170109, + "grad_norm": 0.6786202788352966, + "learning_rate": 2.4877656301515174e-06, + "loss": 0.2304, + "step": 16456 + }, + { + "epoch": 1.5503167612632769, + "grad_norm": 0.6342893242835999, + "learning_rate": 2.4867690313914906e-06, + "loss": 0.2067, + "step": 16457 + }, + { + "epoch": 1.5504109653564448, + "grad_norm": 0.6511574387550354, + "learning_rate": 2.485772603944753e-06, + "loss": 0.2091, + "step": 16458 + }, + { + "epoch": 1.5505051694496126, + "grad_norm": 0.682508111000061, + "learning_rate": 2.484776347834017e-06, + "loss": 0.1901, + "step": 16459 + }, + { + "epoch": 1.5505993735427803, + "grad_norm": 0.641307532787323, + "learning_rate": 2.483780263082003e-06, + "loss": 0.1877, + "step": 16460 + }, + { + "epoch": 1.5506935776359483, + "grad_norm": 0.6705555319786072, + "learning_rate": 2.4827843497114256e-06, + "loss": 0.2013, + "step": 16461 + }, + { + "epoch": 1.5507877817291162, + "grad_norm": 0.6825522184371948, + "learning_rate": 2.4817886077449917e-06, + "loss": 0.1911, + "step": 16462 + }, + { + "epoch": 1.550881985822284, + "grad_norm": 0.611046552658081, + "learning_rate": 2.480793037205398e-06, + "loss": 0.1725, + "step": 16463 + }, + { + "epoch": 1.5509761899154517, + "grad_norm": 0.672132670879364, + "learning_rate": 2.4797976381153575e-06, + "loss": 0.1983, + "step": 16464 + }, + { + "epoch": 1.5510703940086197, + "grad_norm": 0.7160225510597229, + "learning_rate": 2.4788024104975615e-06, + "loss": 0.229, + "step": 16465 + }, + { + "epoch": 1.5511645981017876, + "grad_norm": 0.7172216773033142, + "learning_rate": 2.477807354374696e-06, + "loss": 0.1916, + "step": 16466 + }, + { + "epoch": 1.5512588021949554, + "grad_norm": 0.6677335500717163, + "learning_rate": 2.476812469769463e-06, + "loss": 0.2029, + "step": 16467 + }, + { + "epoch": 1.5513530062881231, + "grad_norm": 0.6702734231948853, + "learning_rate": 2.4758177567045393e-06, + "loss": 0.1912, + "step": 16468 + }, + { + "epoch": 1.551447210381291, + "grad_norm": 0.7241215705871582, + "learning_rate": 2.474823215202602e-06, + "loss": 0.2051, + "step": 16469 + }, + { + "epoch": 1.551541414474459, + "grad_norm": 0.6005356907844543, + "learning_rate": 2.473828845286339e-06, + "loss": 0.1831, + "step": 16470 + }, + { + "epoch": 1.5516356185676268, + "grad_norm": 0.6623706221580505, + "learning_rate": 2.472834646978417e-06, + "loss": 0.1872, + "step": 16471 + }, + { + "epoch": 1.5517298226607945, + "grad_norm": 0.642845630645752, + "learning_rate": 2.4718406203015045e-06, + "loss": 0.2, + "step": 16472 + }, + { + "epoch": 1.5518240267539625, + "grad_norm": 0.6071888208389282, + "learning_rate": 2.4708467652782675e-06, + "loss": 0.1666, + "step": 16473 + }, + { + "epoch": 1.5519182308471304, + "grad_norm": 0.6258337497711182, + "learning_rate": 2.4698530819313714e-06, + "loss": 0.1925, + "step": 16474 + }, + { + "epoch": 1.5520124349402982, + "grad_norm": 0.5780388712882996, + "learning_rate": 2.468859570283467e-06, + "loss": 0.1917, + "step": 16475 + }, + { + "epoch": 1.552106639033466, + "grad_norm": 0.6399002075195312, + "learning_rate": 2.4678662303572122e-06, + "loss": 0.197, + "step": 16476 + }, + { + "epoch": 1.5522008431266339, + "grad_norm": 0.6279283761978149, + "learning_rate": 2.466873062175259e-06, + "loss": 0.189, + "step": 16477 + }, + { + "epoch": 1.5522950472198018, + "grad_norm": 0.6647738218307495, + "learning_rate": 2.4658800657602476e-06, + "loss": 0.1976, + "step": 16478 + }, + { + "epoch": 1.5523892513129696, + "grad_norm": 0.5951058268547058, + "learning_rate": 2.4648872411348223e-06, + "loss": 0.1845, + "step": 16479 + }, + { + "epoch": 1.5524834554061373, + "grad_norm": 0.6937251091003418, + "learning_rate": 2.4638945883216236e-06, + "loss": 0.1977, + "step": 16480 + }, + { + "epoch": 1.5525776594993053, + "grad_norm": 0.6020039916038513, + "learning_rate": 2.4629021073432803e-06, + "loss": 0.2067, + "step": 16481 + }, + { + "epoch": 1.5526718635924732, + "grad_norm": 0.6047183871269226, + "learning_rate": 2.461909798222428e-06, + "loss": 0.2017, + "step": 16482 + }, + { + "epoch": 1.552766067685641, + "grad_norm": 0.6549305319786072, + "learning_rate": 2.4609176609816876e-06, + "loss": 0.1903, + "step": 16483 + }, + { + "epoch": 1.5528602717788087, + "grad_norm": 0.6725935339927673, + "learning_rate": 2.4599256956436846e-06, + "loss": 0.1927, + "step": 16484 + }, + { + "epoch": 1.5529544758719767, + "grad_norm": 0.7078056335449219, + "learning_rate": 2.4589339022310386e-06, + "loss": 0.2203, + "step": 16485 + }, + { + "epoch": 1.5530486799651446, + "grad_norm": 0.7287416458129883, + "learning_rate": 2.45794228076636e-06, + "loss": 0.2246, + "step": 16486 + }, + { + "epoch": 1.5531428840583124, + "grad_norm": 0.6040006279945374, + "learning_rate": 2.4569508312722635e-06, + "loss": 0.2172, + "step": 16487 + }, + { + "epoch": 1.55323708815148, + "grad_norm": 0.6748787760734558, + "learning_rate": 2.455959553771351e-06, + "loss": 0.1957, + "step": 16488 + }, + { + "epoch": 1.553331292244648, + "grad_norm": 0.6507366895675659, + "learning_rate": 2.4549684482862278e-06, + "loss": 0.2456, + "step": 16489 + }, + { + "epoch": 1.553425496337816, + "grad_norm": 0.635037899017334, + "learning_rate": 2.453977514839496e-06, + "loss": 0.1962, + "step": 16490 + }, + { + "epoch": 1.5535197004309838, + "grad_norm": 0.6460270285606384, + "learning_rate": 2.4529867534537435e-06, + "loss": 0.1942, + "step": 16491 + }, + { + "epoch": 1.5536139045241515, + "grad_norm": 0.6219245195388794, + "learning_rate": 2.4519961641515677e-06, + "loss": 0.1803, + "step": 16492 + }, + { + "epoch": 1.5537081086173194, + "grad_norm": 0.6302159428596497, + "learning_rate": 2.4510057469555504e-06, + "loss": 0.1928, + "step": 16493 + }, + { + "epoch": 1.5538023127104874, + "grad_norm": 0.6582379341125488, + "learning_rate": 2.4500155018882755e-06, + "loss": 0.2043, + "step": 16494 + }, + { + "epoch": 1.5538965168036551, + "grad_norm": 0.7801764011383057, + "learning_rate": 2.449025428972327e-06, + "loss": 0.1884, + "step": 16495 + }, + { + "epoch": 1.5539907208968229, + "grad_norm": 0.5923693180084229, + "learning_rate": 2.4480355282302738e-06, + "loss": 0.1863, + "step": 16496 + }, + { + "epoch": 1.5540849249899908, + "grad_norm": 0.6438308358192444, + "learning_rate": 2.4470457996846896e-06, + "loss": 0.1843, + "step": 16497 + }, + { + "epoch": 1.5541791290831588, + "grad_norm": 0.8064566254615784, + "learning_rate": 2.4460562433581445e-06, + "loss": 0.2187, + "step": 16498 + }, + { + "epoch": 1.5542733331763265, + "grad_norm": 0.6894887685775757, + "learning_rate": 2.4450668592731976e-06, + "loss": 0.1998, + "step": 16499 + }, + { + "epoch": 1.5543675372694943, + "grad_norm": 0.6374278664588928, + "learning_rate": 2.4440776474524096e-06, + "loss": 0.2294, + "step": 16500 + }, + { + "epoch": 1.5544617413626622, + "grad_norm": 0.6965348124504089, + "learning_rate": 2.4430886079183402e-06, + "loss": 0.1854, + "step": 16501 + }, + { + "epoch": 1.5545559454558302, + "grad_norm": 0.6538490653038025, + "learning_rate": 2.4420997406935364e-06, + "loss": 0.1936, + "step": 16502 + }, + { + "epoch": 1.554650149548998, + "grad_norm": 0.6493823528289795, + "learning_rate": 2.4411110458005414e-06, + "loss": 0.1716, + "step": 16503 + }, + { + "epoch": 1.5547443536421657, + "grad_norm": 0.7109135985374451, + "learning_rate": 2.4401225232619117e-06, + "loss": 0.2086, + "step": 16504 + }, + { + "epoch": 1.5548385577353336, + "grad_norm": 0.6588006019592285, + "learning_rate": 2.4391341731001793e-06, + "loss": 0.2074, + "step": 16505 + }, + { + "epoch": 1.5549327618285016, + "grad_norm": 0.610553503036499, + "learning_rate": 2.438145995337875e-06, + "loss": 0.1769, + "step": 16506 + }, + { + "epoch": 1.5550269659216693, + "grad_norm": 0.6453066468238831, + "learning_rate": 2.437157989997542e-06, + "loss": 0.1916, + "step": 16507 + }, + { + "epoch": 1.555121170014837, + "grad_norm": 0.6897177696228027, + "learning_rate": 2.436170157101704e-06, + "loss": 0.1961, + "step": 16508 + }, + { + "epoch": 1.555215374108005, + "grad_norm": 0.647250235080719, + "learning_rate": 2.4351824966728775e-06, + "loss": 0.2084, + "step": 16509 + }, + { + "epoch": 1.555309578201173, + "grad_norm": 0.6197291612625122, + "learning_rate": 2.4341950087335954e-06, + "loss": 0.1854, + "step": 16510 + }, + { + "epoch": 1.5554037822943407, + "grad_norm": 0.6840108633041382, + "learning_rate": 2.4332076933063677e-06, + "loss": 0.1899, + "step": 16511 + }, + { + "epoch": 1.5554979863875085, + "grad_norm": 0.6628212928771973, + "learning_rate": 2.432220550413704e-06, + "loss": 0.1736, + "step": 16512 + }, + { + "epoch": 1.5555921904806764, + "grad_norm": 0.6374439597129822, + "learning_rate": 2.431233580078115e-06, + "loss": 0.1847, + "step": 16513 + }, + { + "epoch": 1.5556863945738444, + "grad_norm": 0.6410982012748718, + "learning_rate": 2.43024678232211e-06, + "loss": 0.2123, + "step": 16514 + }, + { + "epoch": 1.5557805986670121, + "grad_norm": 0.6972041726112366, + "learning_rate": 2.4292601571681805e-06, + "loss": 0.1872, + "step": 16515 + }, + { + "epoch": 1.5558748027601799, + "grad_norm": 0.5675978064537048, + "learning_rate": 2.4282737046388293e-06, + "loss": 0.1995, + "step": 16516 + }, + { + "epoch": 1.5559690068533478, + "grad_norm": 0.6456544399261475, + "learning_rate": 2.4272874247565492e-06, + "loss": 0.1867, + "step": 16517 + }, + { + "epoch": 1.5560632109465158, + "grad_norm": 0.6558977961540222, + "learning_rate": 2.4263013175438256e-06, + "loss": 0.1731, + "step": 16518 + }, + { + "epoch": 1.5561574150396835, + "grad_norm": 0.6935216188430786, + "learning_rate": 2.4253153830231446e-06, + "loss": 0.1999, + "step": 16519 + }, + { + "epoch": 1.5562516191328513, + "grad_norm": 0.723755955696106, + "learning_rate": 2.42432962121699e-06, + "loss": 0.1967, + "step": 16520 + }, + { + "epoch": 1.5563458232260192, + "grad_norm": 0.6899499297142029, + "learning_rate": 2.423344032147833e-06, + "loss": 0.21, + "step": 16521 + }, + { + "epoch": 1.5564400273191872, + "grad_norm": 0.6751235127449036, + "learning_rate": 2.4223586158381516e-06, + "loss": 0.2221, + "step": 16522 + }, + { + "epoch": 1.556534231412355, + "grad_norm": 0.641545295715332, + "learning_rate": 2.4213733723104115e-06, + "loss": 0.1913, + "step": 16523 + }, + { + "epoch": 1.5566284355055227, + "grad_norm": 0.5932109355926514, + "learning_rate": 2.420388301587079e-06, + "loss": 0.1629, + "step": 16524 + }, + { + "epoch": 1.5567226395986906, + "grad_norm": 0.5994042754173279, + "learning_rate": 2.419403403690618e-06, + "loss": 0.1886, + "step": 16525 + }, + { + "epoch": 1.5568168436918586, + "grad_norm": 0.7845425605773926, + "learning_rate": 2.4184186786434793e-06, + "loss": 0.1787, + "step": 16526 + }, + { + "epoch": 1.5569110477850263, + "grad_norm": 0.636869490146637, + "learning_rate": 2.417434126468123e-06, + "loss": 0.2078, + "step": 16527 + }, + { + "epoch": 1.557005251878194, + "grad_norm": 0.6482181549072266, + "learning_rate": 2.416449747186993e-06, + "loss": 0.1829, + "step": 16528 + }, + { + "epoch": 1.557099455971362, + "grad_norm": 0.6135715842247009, + "learning_rate": 2.415465540822537e-06, + "loss": 0.1757, + "step": 16529 + }, + { + "epoch": 1.5571936600645297, + "grad_norm": 0.6853678822517395, + "learning_rate": 2.414481507397198e-06, + "loss": 0.1954, + "step": 16530 + }, + { + "epoch": 1.5572878641576975, + "grad_norm": 0.619922935962677, + "learning_rate": 2.4134976469334104e-06, + "loss": 0.1763, + "step": 16531 + }, + { + "epoch": 1.5573820682508654, + "grad_norm": 1.4710171222686768, + "learning_rate": 2.4125139594536117e-06, + "loss": 0.1836, + "step": 16532 + }, + { + "epoch": 1.5574762723440334, + "grad_norm": 0.6790376305580139, + "learning_rate": 2.4115304449802255e-06, + "loss": 0.2082, + "step": 16533 + }, + { + "epoch": 1.5575704764372011, + "grad_norm": 0.6932697296142578, + "learning_rate": 2.410547103535682e-06, + "loss": 0.2046, + "step": 16534 + }, + { + "epoch": 1.5576646805303689, + "grad_norm": 0.6530988812446594, + "learning_rate": 2.409563935142404e-06, + "loss": 0.2091, + "step": 16535 + }, + { + "epoch": 1.5577588846235368, + "grad_norm": 0.6402018666267395, + "learning_rate": 2.4085809398228045e-06, + "loss": 0.1887, + "step": 16536 + }, + { + "epoch": 1.5578530887167048, + "grad_norm": 0.6248202323913574, + "learning_rate": 2.407598117599299e-06, + "loss": 0.1871, + "step": 16537 + }, + { + "epoch": 1.5579472928098725, + "grad_norm": 0.6733875274658203, + "learning_rate": 2.4066154684943023e-06, + "loss": 0.206, + "step": 16538 + }, + { + "epoch": 1.5580414969030403, + "grad_norm": 0.7862675786018372, + "learning_rate": 2.4056329925302135e-06, + "loss": 0.1907, + "step": 16539 + }, + { + "epoch": 1.5581357009962082, + "grad_norm": 0.6684961915016174, + "learning_rate": 2.404650689729436e-06, + "loss": 0.1944, + "step": 16540 + }, + { + "epoch": 1.5582299050893762, + "grad_norm": 0.6752827167510986, + "learning_rate": 2.403668560114374e-06, + "loss": 0.2158, + "step": 16541 + }, + { + "epoch": 1.558324109182544, + "grad_norm": 0.5895867943763733, + "learning_rate": 2.4026866037074158e-06, + "loss": 0.149, + "step": 16542 + }, + { + "epoch": 1.5584183132757117, + "grad_norm": 0.6277616620063782, + "learning_rate": 2.401704820530947e-06, + "loss": 0.182, + "step": 16543 + }, + { + "epoch": 1.5585125173688796, + "grad_norm": 0.5878896713256836, + "learning_rate": 2.400723210607364e-06, + "loss": 0.2018, + "step": 16544 + }, + { + "epoch": 1.5586067214620476, + "grad_norm": 0.6188334226608276, + "learning_rate": 2.3997417739590457e-06, + "loss": 0.2131, + "step": 16545 + }, + { + "epoch": 1.5587009255552153, + "grad_norm": 0.7101216912269592, + "learning_rate": 2.3987605106083623e-06, + "loss": 0.226, + "step": 16546 + }, + { + "epoch": 1.558795129648383, + "grad_norm": 0.6682934165000916, + "learning_rate": 2.3977794205777015e-06, + "loss": 0.1793, + "step": 16547 + }, + { + "epoch": 1.558889333741551, + "grad_norm": 0.6556659936904907, + "learning_rate": 2.396798503889426e-06, + "loss": 0.1864, + "step": 16548 + }, + { + "epoch": 1.558983537834719, + "grad_norm": 0.5727357268333435, + "learning_rate": 2.3958177605658985e-06, + "loss": 0.1634, + "step": 16549 + }, + { + "epoch": 1.5590777419278867, + "grad_norm": 0.6460142731666565, + "learning_rate": 2.394837190629491e-06, + "loss": 0.1767, + "step": 16550 + }, + { + "epoch": 1.5591719460210545, + "grad_norm": 0.6470164656639099, + "learning_rate": 2.393856794102557e-06, + "loss": 0.182, + "step": 16551 + }, + { + "epoch": 1.5592661501142224, + "grad_norm": 0.6724783182144165, + "learning_rate": 2.3928765710074486e-06, + "loss": 0.1817, + "step": 16552 + }, + { + "epoch": 1.5593603542073904, + "grad_norm": 0.6727150082588196, + "learning_rate": 2.391896521366519e-06, + "loss": 0.191, + "step": 16553 + }, + { + "epoch": 1.5594545583005581, + "grad_norm": 0.6314756274223328, + "learning_rate": 2.390916645202118e-06, + "loss": 0.1558, + "step": 16554 + }, + { + "epoch": 1.5595487623937259, + "grad_norm": 0.63203364610672, + "learning_rate": 2.3899369425365824e-06, + "loss": 0.2149, + "step": 16555 + }, + { + "epoch": 1.5596429664868938, + "grad_norm": 0.7332870960235596, + "learning_rate": 2.3889574133922532e-06, + "loss": 0.2019, + "step": 16556 + }, + { + "epoch": 1.5597371705800618, + "grad_norm": 0.6748360395431519, + "learning_rate": 2.387978057791469e-06, + "loss": 0.1948, + "step": 16557 + }, + { + "epoch": 1.5598313746732295, + "grad_norm": 0.6236615777015686, + "learning_rate": 2.386998875756554e-06, + "loss": 0.1884, + "step": 16558 + }, + { + "epoch": 1.5599255787663973, + "grad_norm": 0.6263788938522339, + "learning_rate": 2.386019867309839e-06, + "loss": 0.1792, + "step": 16559 + }, + { + "epoch": 1.5600197828595652, + "grad_norm": 0.6550841927528381, + "learning_rate": 2.3850410324736496e-06, + "loss": 0.1821, + "step": 16560 + }, + { + "epoch": 1.5601139869527332, + "grad_norm": 0.7686969637870789, + "learning_rate": 2.384062371270297e-06, + "loss": 0.2161, + "step": 16561 + }, + { + "epoch": 1.560208191045901, + "grad_norm": 0.5874224305152893, + "learning_rate": 2.3830838837221047e-06, + "loss": 0.1629, + "step": 16562 + }, + { + "epoch": 1.5603023951390687, + "grad_norm": 0.6518653035163879, + "learning_rate": 2.3821055698513763e-06, + "loss": 0.2069, + "step": 16563 + }, + { + "epoch": 1.5603965992322366, + "grad_norm": 0.6342228055000305, + "learning_rate": 2.3811274296804222e-06, + "loss": 0.2176, + "step": 16564 + }, + { + "epoch": 1.5604908033254046, + "grad_norm": 0.770420253276825, + "learning_rate": 2.380149463231548e-06, + "loss": 0.218, + "step": 16565 + }, + { + "epoch": 1.5605850074185723, + "grad_norm": 0.6777490377426147, + "learning_rate": 2.3791716705270484e-06, + "loss": 0.1879, + "step": 16566 + }, + { + "epoch": 1.56067921151174, + "grad_norm": 0.6285408139228821, + "learning_rate": 2.378194051589222e-06, + "loss": 0.2169, + "step": 16567 + }, + { + "epoch": 1.560773415604908, + "grad_norm": 0.7333067655563354, + "learning_rate": 2.377216606440357e-06, + "loss": 0.2246, + "step": 16568 + }, + { + "epoch": 1.560867619698076, + "grad_norm": 0.7452640533447266, + "learning_rate": 2.3762393351027424e-06, + "loss": 0.2258, + "step": 16569 + }, + { + "epoch": 1.5609618237912437, + "grad_norm": 0.6964256763458252, + "learning_rate": 2.3752622375986635e-06, + "loss": 0.2068, + "step": 16570 + }, + { + "epoch": 1.5610560278844114, + "grad_norm": 0.7265715003013611, + "learning_rate": 2.374285313950394e-06, + "loss": 0.2081, + "step": 16571 + }, + { + "epoch": 1.5611502319775794, + "grad_norm": 0.7704083323478699, + "learning_rate": 2.3733085641802168e-06, + "loss": 0.2038, + "step": 16572 + }, + { + "epoch": 1.5612444360707474, + "grad_norm": 0.6363699436187744, + "learning_rate": 2.3723319883103958e-06, + "loss": 0.2074, + "step": 16573 + }, + { + "epoch": 1.561338640163915, + "grad_norm": 0.6404546499252319, + "learning_rate": 2.371355586363202e-06, + "loss": 0.1877, + "step": 16574 + }, + { + "epoch": 1.5614328442570828, + "grad_norm": 0.7770257592201233, + "learning_rate": 2.3703793583609013e-06, + "loss": 0.2003, + "step": 16575 + }, + { + "epoch": 1.5615270483502508, + "grad_norm": 0.6772834062576294, + "learning_rate": 2.369403304325748e-06, + "loss": 0.1796, + "step": 16576 + }, + { + "epoch": 1.5616212524434188, + "grad_norm": 0.6184030175209045, + "learning_rate": 2.368427424280001e-06, + "loss": 0.2002, + "step": 16577 + }, + { + "epoch": 1.5617154565365865, + "grad_norm": 0.614189088344574, + "learning_rate": 2.3674517182459133e-06, + "loss": 0.1788, + "step": 16578 + }, + { + "epoch": 1.5618096606297542, + "grad_norm": 0.6026573777198792, + "learning_rate": 2.3664761862457276e-06, + "loss": 0.174, + "step": 16579 + }, + { + "epoch": 1.5619038647229222, + "grad_norm": 0.6518381237983704, + "learning_rate": 2.365500828301691e-06, + "loss": 0.1868, + "step": 16580 + }, + { + "epoch": 1.5619980688160902, + "grad_norm": 0.7336990833282471, + "learning_rate": 2.3645256444360443e-06, + "loss": 0.2223, + "step": 16581 + }, + { + "epoch": 1.562092272909258, + "grad_norm": 0.6668913960456848, + "learning_rate": 2.3635506346710224e-06, + "loss": 0.1703, + "step": 16582 + }, + { + "epoch": 1.5621864770024256, + "grad_norm": 0.6120978593826294, + "learning_rate": 2.362575799028849e-06, + "loss": 0.1978, + "step": 16583 + }, + { + "epoch": 1.5622806810955936, + "grad_norm": 0.6925798654556274, + "learning_rate": 2.361601137531766e-06, + "loss": 0.1987, + "step": 16584 + }, + { + "epoch": 1.5623748851887616, + "grad_norm": 0.7289588451385498, + "learning_rate": 2.360626650201989e-06, + "loss": 0.1949, + "step": 16585 + }, + { + "epoch": 1.5624690892819293, + "grad_norm": 0.6873010396957397, + "learning_rate": 2.359652337061734e-06, + "loss": 0.2177, + "step": 16586 + }, + { + "epoch": 1.562563293375097, + "grad_norm": 0.6446362733840942, + "learning_rate": 2.3586781981332276e-06, + "loss": 0.2019, + "step": 16587 + }, + { + "epoch": 1.562657497468265, + "grad_norm": 0.8122641444206238, + "learning_rate": 2.3577042334386744e-06, + "loss": 0.2087, + "step": 16588 + }, + { + "epoch": 1.562751701561433, + "grad_norm": 0.5987395644187927, + "learning_rate": 2.356730443000279e-06, + "loss": 0.209, + "step": 16589 + }, + { + "epoch": 1.5628459056546007, + "grad_norm": 0.6892624497413635, + "learning_rate": 2.3557568268402565e-06, + "loss": 0.1956, + "step": 16590 + }, + { + "epoch": 1.5629401097477684, + "grad_norm": 0.6857694387435913, + "learning_rate": 2.3547833849807987e-06, + "loss": 0.2126, + "step": 16591 + }, + { + "epoch": 1.5630343138409364, + "grad_norm": 0.634675920009613, + "learning_rate": 2.353810117444102e-06, + "loss": 0.2062, + "step": 16592 + }, + { + "epoch": 1.5631285179341043, + "grad_norm": 0.7113621830940247, + "learning_rate": 2.352837024252359e-06, + "loss": 0.2034, + "step": 16593 + }, + { + "epoch": 1.563222722027272, + "grad_norm": 0.6645920872688293, + "learning_rate": 2.35186410542776e-06, + "loss": 0.1701, + "step": 16594 + }, + { + "epoch": 1.5633169261204398, + "grad_norm": 0.6594851016998291, + "learning_rate": 2.3508913609924865e-06, + "loss": 0.1685, + "step": 16595 + }, + { + "epoch": 1.5634111302136078, + "grad_norm": 0.6979818940162659, + "learning_rate": 2.3499187909687193e-06, + "loss": 0.2235, + "step": 16596 + }, + { + "epoch": 1.5635053343067757, + "grad_norm": 0.6058693528175354, + "learning_rate": 2.348946395378637e-06, + "loss": 0.1942, + "step": 16597 + }, + { + "epoch": 1.5635995383999435, + "grad_norm": 0.6438829302787781, + "learning_rate": 2.347974174244406e-06, + "loss": 0.18, + "step": 16598 + }, + { + "epoch": 1.5636937424931112, + "grad_norm": 0.5994290113449097, + "learning_rate": 2.3470021275881995e-06, + "loss": 0.1753, + "step": 16599 + }, + { + "epoch": 1.5637879465862792, + "grad_norm": 0.6323649883270264, + "learning_rate": 2.3460302554321823e-06, + "loss": 0.1891, + "step": 16600 + }, + { + "epoch": 1.5638821506794471, + "grad_norm": 0.6612114906311035, + "learning_rate": 2.3450585577985087e-06, + "loss": 0.1971, + "step": 16601 + }, + { + "epoch": 1.5639763547726149, + "grad_norm": 0.7276307940483093, + "learning_rate": 2.3440870347093426e-06, + "loss": 0.1897, + "step": 16602 + }, + { + "epoch": 1.5640705588657826, + "grad_norm": 0.6305508017539978, + "learning_rate": 2.3431156861868288e-06, + "loss": 0.1801, + "step": 16603 + }, + { + "epoch": 1.5641647629589506, + "grad_norm": 0.6388223171234131, + "learning_rate": 2.3421445122531215e-06, + "loss": 0.1893, + "step": 16604 + }, + { + "epoch": 1.5642589670521185, + "grad_norm": 0.6965458989143372, + "learning_rate": 2.3411735129303604e-06, + "loss": 0.1973, + "step": 16605 + }, + { + "epoch": 1.5643531711452863, + "grad_norm": 0.6546480655670166, + "learning_rate": 2.340202688240687e-06, + "loss": 0.1759, + "step": 16606 + }, + { + "epoch": 1.564447375238454, + "grad_norm": 0.6175102591514587, + "learning_rate": 2.339232038206242e-06, + "loss": 0.1713, + "step": 16607 + }, + { + "epoch": 1.564541579331622, + "grad_norm": 0.7065578699111938, + "learning_rate": 2.33826156284915e-06, + "loss": 0.2033, + "step": 16608 + }, + { + "epoch": 1.56463578342479, + "grad_norm": 0.6182644963264465, + "learning_rate": 2.3372912621915445e-06, + "loss": 0.186, + "step": 16609 + }, + { + "epoch": 1.5647299875179577, + "grad_norm": 0.7014366388320923, + "learning_rate": 2.3363211362555515e-06, + "loss": 0.2152, + "step": 16610 + }, + { + "epoch": 1.5648241916111254, + "grad_norm": 0.7184340953826904, + "learning_rate": 2.335351185063286e-06, + "loss": 0.2095, + "step": 16611 + }, + { + "epoch": 1.5649183957042934, + "grad_norm": 0.677432656288147, + "learning_rate": 2.33438140863687e-06, + "loss": 0.2079, + "step": 16612 + }, + { + "epoch": 1.5650125997974613, + "grad_norm": 0.6766159534454346, + "learning_rate": 2.3334118069984102e-06, + "loss": 0.2053, + "step": 16613 + }, + { + "epoch": 1.565106803890629, + "grad_norm": 0.702167809009552, + "learning_rate": 2.3324423801700168e-06, + "loss": 0.1925, + "step": 16614 + }, + { + "epoch": 1.5652010079837968, + "grad_norm": 0.6005187630653381, + "learning_rate": 2.331473128173799e-06, + "loss": 0.1561, + "step": 16615 + }, + { + "epoch": 1.5652952120769648, + "grad_norm": 0.6294963359832764, + "learning_rate": 2.330504051031851e-06, + "loss": 0.1808, + "step": 16616 + }, + { + "epoch": 1.5653894161701327, + "grad_norm": 0.6566830277442932, + "learning_rate": 2.329535148766271e-06, + "loss": 0.1842, + "step": 16617 + }, + { + "epoch": 1.5654836202633005, + "grad_norm": 0.6712966561317444, + "learning_rate": 2.3285664213991555e-06, + "loss": 0.2009, + "step": 16618 + }, + { + "epoch": 1.5655778243564682, + "grad_norm": 0.6773460507392883, + "learning_rate": 2.327597868952587e-06, + "loss": 0.1939, + "step": 16619 + }, + { + "epoch": 1.5656720284496362, + "grad_norm": 0.6457515358924866, + "learning_rate": 2.326629491448652e-06, + "loss": 0.1829, + "step": 16620 + }, + { + "epoch": 1.5657662325428041, + "grad_norm": 0.6580275297164917, + "learning_rate": 2.3256612889094345e-06, + "loss": 0.213, + "step": 16621 + }, + { + "epoch": 1.5658604366359719, + "grad_norm": 0.5753109455108643, + "learning_rate": 2.3246932613570083e-06, + "loss": 0.1819, + "step": 16622 + }, + { + "epoch": 1.5659546407291396, + "grad_norm": 0.611003577709198, + "learning_rate": 2.323725408813441e-06, + "loss": 0.1751, + "step": 16623 + }, + { + "epoch": 1.5660488448223076, + "grad_norm": 0.6628442406654358, + "learning_rate": 2.322757731300811e-06, + "loss": 0.1938, + "step": 16624 + }, + { + "epoch": 1.5661430489154755, + "grad_norm": 0.6529707312583923, + "learning_rate": 2.3217902288411775e-06, + "loss": 0.2061, + "step": 16625 + }, + { + "epoch": 1.5662372530086432, + "grad_norm": 0.6242614388465881, + "learning_rate": 2.3208229014565963e-06, + "loss": 0.1691, + "step": 16626 + }, + { + "epoch": 1.566331457101811, + "grad_norm": 0.7480343580245972, + "learning_rate": 2.319855749169134e-06, + "loss": 0.26, + "step": 16627 + }, + { + "epoch": 1.566425661194979, + "grad_norm": 0.620380163192749, + "learning_rate": 2.318888772000839e-06, + "loss": 0.1691, + "step": 16628 + }, + { + "epoch": 1.566519865288147, + "grad_norm": 1.880743145942688, + "learning_rate": 2.3179219699737553e-06, + "loss": 0.1984, + "step": 16629 + }, + { + "epoch": 1.5666140693813146, + "grad_norm": 0.6099167466163635, + "learning_rate": 2.3169553431099325e-06, + "loss": 0.1953, + "step": 16630 + }, + { + "epoch": 1.5667082734744824, + "grad_norm": 1.4367554187774658, + "learning_rate": 2.315988891431412e-06, + "loss": 0.1781, + "step": 16631 + }, + { + "epoch": 1.5668024775676503, + "grad_norm": 0.6985436677932739, + "learning_rate": 2.315022614960225e-06, + "loss": 0.1975, + "step": 16632 + }, + { + "epoch": 1.5668966816608183, + "grad_norm": 0.730664074420929, + "learning_rate": 2.314056513718409e-06, + "loss": 0.2278, + "step": 16633 + }, + { + "epoch": 1.566990885753986, + "grad_norm": 0.5613934397697449, + "learning_rate": 2.313090587727992e-06, + "loss": 0.1831, + "step": 16634 + }, + { + "epoch": 1.5670850898471538, + "grad_norm": 0.7184987664222717, + "learning_rate": 2.3121248370109962e-06, + "loss": 0.2269, + "step": 16635 + }, + { + "epoch": 1.5671792939403217, + "grad_norm": 0.6556032299995422, + "learning_rate": 2.311159261589443e-06, + "loss": 0.1993, + "step": 16636 + }, + { + "epoch": 1.5672734980334897, + "grad_norm": 0.5733645558357239, + "learning_rate": 2.3101938614853524e-06, + "loss": 0.1839, + "step": 16637 + }, + { + "epoch": 1.5673677021266574, + "grad_norm": 0.6518971920013428, + "learning_rate": 2.309228636720732e-06, + "loss": 0.1738, + "step": 16638 + }, + { + "epoch": 1.5674619062198252, + "grad_norm": 0.6515443325042725, + "learning_rate": 2.3082635873175918e-06, + "loss": 0.1927, + "step": 16639 + }, + { + "epoch": 1.5675561103129931, + "grad_norm": 0.6167797446250916, + "learning_rate": 2.3072987132979407e-06, + "loss": 0.1726, + "step": 16640 + }, + { + "epoch": 1.567650314406161, + "grad_norm": 0.6479583382606506, + "learning_rate": 2.306334014683773e-06, + "loss": 0.1974, + "step": 16641 + }, + { + "epoch": 1.5677445184993288, + "grad_norm": 0.6952350735664368, + "learning_rate": 2.3053694914970914e-06, + "loss": 0.2245, + "step": 16642 + }, + { + "epoch": 1.5678387225924966, + "grad_norm": 0.6067778468132019, + "learning_rate": 2.3044051437598826e-06, + "loss": 0.1957, + "step": 16643 + }, + { + "epoch": 1.5679329266856645, + "grad_norm": 0.631545901298523, + "learning_rate": 2.30344097149414e-06, + "loss": 0.195, + "step": 16644 + }, + { + "epoch": 1.5680271307788325, + "grad_norm": 0.6462365984916687, + "learning_rate": 2.3024769747218433e-06, + "loss": 0.1954, + "step": 16645 + }, + { + "epoch": 1.5681213348720002, + "grad_norm": 0.7221664786338806, + "learning_rate": 2.301513153464977e-06, + "loss": 0.1836, + "step": 16646 + }, + { + "epoch": 1.568215538965168, + "grad_norm": 0.7191948294639587, + "learning_rate": 2.30054950774552e-06, + "loss": 0.2139, + "step": 16647 + }, + { + "epoch": 1.568309743058336, + "grad_norm": 0.7464423179626465, + "learning_rate": 2.2995860375854374e-06, + "loss": 0.2296, + "step": 16648 + }, + { + "epoch": 1.5684039471515039, + "grad_norm": 0.6501345038414001, + "learning_rate": 2.298622743006702e-06, + "loss": 0.2043, + "step": 16649 + }, + { + "epoch": 1.5684981512446716, + "grad_norm": 0.6614580154418945, + "learning_rate": 2.297659624031282e-06, + "loss": 0.196, + "step": 16650 + }, + { + "epoch": 1.5685923553378394, + "grad_norm": 0.6374590396881104, + "learning_rate": 2.296696680681132e-06, + "loss": 0.1881, + "step": 16651 + }, + { + "epoch": 1.5686865594310073, + "grad_norm": 0.6366037130355835, + "learning_rate": 2.295733912978213e-06, + "loss": 0.197, + "step": 16652 + }, + { + "epoch": 1.5687807635241753, + "grad_norm": 0.6388632655143738, + "learning_rate": 2.2947713209444733e-06, + "loss": 0.1718, + "step": 16653 + }, + { + "epoch": 1.568874967617343, + "grad_norm": 0.6700445413589478, + "learning_rate": 2.293808904601864e-06, + "loss": 0.1991, + "step": 16654 + }, + { + "epoch": 1.5689691717105108, + "grad_norm": 0.6413102746009827, + "learning_rate": 2.2928466639723323e-06, + "loss": 0.2413, + "step": 16655 + }, + { + "epoch": 1.5690633758036787, + "grad_norm": 0.8119991421699524, + "learning_rate": 2.2918845990778137e-06, + "loss": 0.245, + "step": 16656 + }, + { + "epoch": 1.5691575798968467, + "grad_norm": 0.5904214978218079, + "learning_rate": 2.2909227099402464e-06, + "loss": 0.1865, + "step": 16657 + }, + { + "epoch": 1.5692517839900144, + "grad_norm": 0.5855529308319092, + "learning_rate": 2.289960996581566e-06, + "loss": 0.1677, + "step": 16658 + }, + { + "epoch": 1.5693459880831822, + "grad_norm": 0.6935964822769165, + "learning_rate": 2.2889994590236965e-06, + "loss": 0.2137, + "step": 16659 + }, + { + "epoch": 1.5694401921763501, + "grad_norm": 0.6831810474395752, + "learning_rate": 2.288038097288564e-06, + "loss": 0.2173, + "step": 16660 + }, + { + "epoch": 1.569534396269518, + "grad_norm": 0.7017054557800293, + "learning_rate": 2.287076911398094e-06, + "loss": 0.2038, + "step": 16661 + }, + { + "epoch": 1.5696286003626858, + "grad_norm": 0.7106721997261047, + "learning_rate": 2.2861159013741975e-06, + "loss": 0.1934, + "step": 16662 + }, + { + "epoch": 1.5697228044558535, + "grad_norm": 0.7033999562263489, + "learning_rate": 2.2851550672387823e-06, + "loss": 0.2187, + "step": 16663 + }, + { + "epoch": 1.5698170085490215, + "grad_norm": 0.670293390750885, + "learning_rate": 2.284194409013769e-06, + "loss": 0.1615, + "step": 16664 + }, + { + "epoch": 1.5699112126421895, + "grad_norm": 0.645819365978241, + "learning_rate": 2.283233926721056e-06, + "loss": 0.1692, + "step": 16665 + }, + { + "epoch": 1.5700054167353572, + "grad_norm": 0.6455112099647522, + "learning_rate": 2.282273620382538e-06, + "loss": 0.2016, + "step": 16666 + }, + { + "epoch": 1.570099620828525, + "grad_norm": 0.6257402896881104, + "learning_rate": 2.281313490020122e-06, + "loss": 0.1738, + "step": 16667 + }, + { + "epoch": 1.570193824921693, + "grad_norm": 0.6529598236083984, + "learning_rate": 2.280353535655696e-06, + "loss": 0.1819, + "step": 16668 + }, + { + "epoch": 1.5702880290148606, + "grad_norm": 0.6802598834037781, + "learning_rate": 2.279393757311146e-06, + "loss": 0.1957, + "step": 16669 + }, + { + "epoch": 1.5703822331080284, + "grad_norm": 0.7839449048042297, + "learning_rate": 2.2784341550083577e-06, + "loss": 0.2185, + "step": 16670 + }, + { + "epoch": 1.5704764372011963, + "grad_norm": 0.7309479117393494, + "learning_rate": 2.2774747287692156e-06, + "loss": 0.2451, + "step": 16671 + }, + { + "epoch": 1.5705706412943643, + "grad_norm": 0.6225112080574036, + "learning_rate": 2.27651547861559e-06, + "loss": 0.2154, + "step": 16672 + }, + { + "epoch": 1.570664845387532, + "grad_norm": 0.652088463306427, + "learning_rate": 2.2755564045693558e-06, + "loss": 0.1877, + "step": 16673 + }, + { + "epoch": 1.5707590494806998, + "grad_norm": 0.6621286273002625, + "learning_rate": 2.274597506652384e-06, + "loss": 0.2105, + "step": 16674 + }, + { + "epoch": 1.5708532535738677, + "grad_norm": 0.8194100856781006, + "learning_rate": 2.2736387848865337e-06, + "loss": 0.204, + "step": 16675 + }, + { + "epoch": 1.5709474576670357, + "grad_norm": 0.6432061195373535, + "learning_rate": 2.2726802392936687e-06, + "loss": 0.1925, + "step": 16676 + }, + { + "epoch": 1.5710416617602034, + "grad_norm": 0.649989902973175, + "learning_rate": 2.2717218698956478e-06, + "loss": 0.185, + "step": 16677 + }, + { + "epoch": 1.5711358658533712, + "grad_norm": 0.6618895530700684, + "learning_rate": 2.270763676714317e-06, + "loss": 0.2027, + "step": 16678 + }, + { + "epoch": 1.5712300699465391, + "grad_norm": 0.729942262172699, + "learning_rate": 2.269805659771529e-06, + "loss": 0.18, + "step": 16679 + }, + { + "epoch": 1.571324274039707, + "grad_norm": 0.6783851981163025, + "learning_rate": 2.2688478190891283e-06, + "loss": 0.1899, + "step": 16680 + }, + { + "epoch": 1.5714184781328748, + "grad_norm": 0.5861131548881531, + "learning_rate": 2.267890154688952e-06, + "loss": 0.1833, + "step": 16681 + }, + { + "epoch": 1.5715126822260426, + "grad_norm": 0.6407277584075928, + "learning_rate": 2.2669326665928404e-06, + "loss": 0.2267, + "step": 16682 + }, + { + "epoch": 1.5716068863192105, + "grad_norm": 0.6662833094596863, + "learning_rate": 2.2659753548226203e-06, + "loss": 0.21, + "step": 16683 + }, + { + "epoch": 1.5717010904123785, + "grad_norm": 0.688582181930542, + "learning_rate": 2.265018219400127e-06, + "loss": 0.1982, + "step": 16684 + }, + { + "epoch": 1.5717952945055462, + "grad_norm": 0.6786016821861267, + "learning_rate": 2.264061260347177e-06, + "loss": 0.2037, + "step": 16685 + }, + { + "epoch": 1.571889498598714, + "grad_norm": 0.6584218144416809, + "learning_rate": 2.2631044776855936e-06, + "loss": 0.1759, + "step": 16686 + }, + { + "epoch": 1.571983702691882, + "grad_norm": 0.616054892539978, + "learning_rate": 2.262147871437197e-06, + "loss": 0.1661, + "step": 16687 + }, + { + "epoch": 1.5720779067850499, + "grad_norm": 0.6182340383529663, + "learning_rate": 2.261191441623792e-06, + "loss": 0.1773, + "step": 16688 + }, + { + "epoch": 1.5721721108782176, + "grad_norm": 0.6300203800201416, + "learning_rate": 2.2602351882671925e-06, + "loss": 0.2037, + "step": 16689 + }, + { + "epoch": 1.5722663149713854, + "grad_norm": 0.6373847723007202, + "learning_rate": 2.259279111389201e-06, + "loss": 0.1848, + "step": 16690 + }, + { + "epoch": 1.5723605190645533, + "grad_norm": 0.6501670479774475, + "learning_rate": 2.258323211011616e-06, + "loss": 0.2288, + "step": 16691 + }, + { + "epoch": 1.5724547231577213, + "grad_norm": 0.5920819044113159, + "learning_rate": 2.2573674871562357e-06, + "loss": 0.2012, + "step": 16692 + }, + { + "epoch": 1.572548927250889, + "grad_norm": 0.631029486656189, + "learning_rate": 2.256411939844849e-06, + "loss": 0.1904, + "step": 16693 + }, + { + "epoch": 1.5726431313440568, + "grad_norm": 0.6770601272583008, + "learning_rate": 2.2554565690992457e-06, + "loss": 0.194, + "step": 16694 + }, + { + "epoch": 1.5727373354372247, + "grad_norm": 0.7404630184173584, + "learning_rate": 2.2545013749412127e-06, + "loss": 0.2271, + "step": 16695 + }, + { + "epoch": 1.5728315395303927, + "grad_norm": 0.6604426503181458, + "learning_rate": 2.2535463573925244e-06, + "loss": 0.2113, + "step": 16696 + }, + { + "epoch": 1.5729257436235604, + "grad_norm": 0.6434115767478943, + "learning_rate": 2.252591516474959e-06, + "loss": 0.1751, + "step": 16697 + }, + { + "epoch": 1.5730199477167282, + "grad_norm": 0.6137346625328064, + "learning_rate": 2.2516368522102916e-06, + "loss": 0.1905, + "step": 16698 + }, + { + "epoch": 1.573114151809896, + "grad_norm": 0.650473952293396, + "learning_rate": 2.2506823646202834e-06, + "loss": 0.1753, + "step": 16699 + }, + { + "epoch": 1.573208355903064, + "grad_norm": 0.6308495998382568, + "learning_rate": 2.249728053726703e-06, + "loss": 0.1651, + "step": 16700 + }, + { + "epoch": 1.5733025599962318, + "grad_norm": 0.6628783941268921, + "learning_rate": 2.248773919551311e-06, + "loss": 0.1875, + "step": 16701 + }, + { + "epoch": 1.5733967640893995, + "grad_norm": 0.651411771774292, + "learning_rate": 2.2478199621158615e-06, + "loss": 0.1967, + "step": 16702 + }, + { + "epoch": 1.5734909681825675, + "grad_norm": 0.6001507639884949, + "learning_rate": 2.2468661814421e-06, + "loss": 0.1814, + "step": 16703 + }, + { + "epoch": 1.5735851722757355, + "grad_norm": 0.6134946942329407, + "learning_rate": 2.2459125775517854e-06, + "loss": 0.1804, + "step": 16704 + }, + { + "epoch": 1.5736793763689032, + "grad_norm": 0.6726778149604797, + "learning_rate": 2.2449591504666566e-06, + "loss": 0.2289, + "step": 16705 + }, + { + "epoch": 1.573773580462071, + "grad_norm": 0.5789441466331482, + "learning_rate": 2.244005900208447e-06, + "loss": 0.158, + "step": 16706 + }, + { + "epoch": 1.573867784555239, + "grad_norm": 0.60833340883255, + "learning_rate": 2.2430528267989028e-06, + "loss": 0.1838, + "step": 16707 + }, + { + "epoch": 1.5739619886484069, + "grad_norm": 0.6395013928413391, + "learning_rate": 2.242099930259751e-06, + "loss": 0.22, + "step": 16708 + }, + { + "epoch": 1.5740561927415746, + "grad_norm": 0.5981082320213318, + "learning_rate": 2.2411472106127152e-06, + "loss": 0.1982, + "step": 16709 + }, + { + "epoch": 1.5741503968347423, + "grad_norm": 0.6421570777893066, + "learning_rate": 2.2401946678795226e-06, + "loss": 0.2067, + "step": 16710 + }, + { + "epoch": 1.5742446009279103, + "grad_norm": 0.6160954236984253, + "learning_rate": 2.2392423020818954e-06, + "loss": 0.1701, + "step": 16711 + }, + { + "epoch": 1.5743388050210783, + "grad_norm": 0.6442878246307373, + "learning_rate": 2.2382901132415423e-06, + "loss": 0.2005, + "step": 16712 + }, + { + "epoch": 1.574433009114246, + "grad_norm": 0.7537941336631775, + "learning_rate": 2.23733810138018e-06, + "loss": 0.1967, + "step": 16713 + }, + { + "epoch": 1.5745272132074137, + "grad_norm": 0.6588831543922424, + "learning_rate": 2.2363862665195156e-06, + "loss": 0.2002, + "step": 16714 + }, + { + "epoch": 1.5746214173005817, + "grad_norm": 0.6572132706642151, + "learning_rate": 2.2354346086812483e-06, + "loss": 0.1875, + "step": 16715 + }, + { + "epoch": 1.5747156213937497, + "grad_norm": 0.6329585909843445, + "learning_rate": 2.2344831278870805e-06, + "loss": 0.1905, + "step": 16716 + }, + { + "epoch": 1.5748098254869174, + "grad_norm": 0.5940223336219788, + "learning_rate": 2.23353182415871e-06, + "loss": 0.1749, + "step": 16717 + }, + { + "epoch": 1.5749040295800851, + "grad_norm": 0.7670513987541199, + "learning_rate": 2.232580697517821e-06, + "loss": 0.1851, + "step": 16718 + }, + { + "epoch": 1.574998233673253, + "grad_norm": 0.7109354138374329, + "learning_rate": 2.231629747986106e-06, + "loss": 0.187, + "step": 16719 + }, + { + "epoch": 1.575092437766421, + "grad_norm": 0.6600790619850159, + "learning_rate": 2.2306789755852487e-06, + "loss": 0.2, + "step": 16720 + }, + { + "epoch": 1.5751866418595888, + "grad_norm": 0.765315055847168, + "learning_rate": 2.2297283803369265e-06, + "loss": 0.2186, + "step": 16721 + }, + { + "epoch": 1.5752808459527565, + "grad_norm": 0.5743632316589355, + "learning_rate": 2.2287779622628115e-06, + "loss": 0.1774, + "step": 16722 + }, + { + "epoch": 1.5753750500459245, + "grad_norm": 0.6753613948822021, + "learning_rate": 2.227827721384577e-06, + "loss": 0.2228, + "step": 16723 + }, + { + "epoch": 1.5754692541390924, + "grad_norm": 0.6676024794578552, + "learning_rate": 2.226877657723894e-06, + "loss": 0.1736, + "step": 16724 + }, + { + "epoch": 1.5755634582322602, + "grad_norm": 0.7018056511878967, + "learning_rate": 2.2259277713024176e-06, + "loss": 0.199, + "step": 16725 + }, + { + "epoch": 1.575657662325428, + "grad_norm": 0.6563654541969299, + "learning_rate": 2.2249780621418117e-06, + "loss": 0.2033, + "step": 16726 + }, + { + "epoch": 1.5757518664185959, + "grad_norm": 0.6973061561584473, + "learning_rate": 2.224028530263733e-06, + "loss": 0.1934, + "step": 16727 + }, + { + "epoch": 1.5758460705117638, + "grad_norm": 0.6676140427589417, + "learning_rate": 2.2230791756898273e-06, + "loss": 0.2212, + "step": 16728 + }, + { + "epoch": 1.5759402746049316, + "grad_norm": 0.613617479801178, + "learning_rate": 2.2221299984417434e-06, + "loss": 0.1762, + "step": 16729 + }, + { + "epoch": 1.5760344786980993, + "grad_norm": 0.6599076986312866, + "learning_rate": 2.221180998541126e-06, + "loss": 0.1748, + "step": 16730 + }, + { + "epoch": 1.5761286827912673, + "grad_norm": 0.6179454326629639, + "learning_rate": 2.22023217600961e-06, + "loss": 0.1827, + "step": 16731 + }, + { + "epoch": 1.5762228868844352, + "grad_norm": 0.6900299787521362, + "learning_rate": 2.2192835308688354e-06, + "loss": 0.2123, + "step": 16732 + }, + { + "epoch": 1.576317090977603, + "grad_norm": 0.6685370206832886, + "learning_rate": 2.2183350631404256e-06, + "loss": 0.194, + "step": 16733 + }, + { + "epoch": 1.5764112950707707, + "grad_norm": 0.6364673972129822, + "learning_rate": 2.217386772846012e-06, + "loss": 0.1748, + "step": 16734 + }, + { + "epoch": 1.5765054991639387, + "grad_norm": 0.6273661255836487, + "learning_rate": 2.2164386600072173e-06, + "loss": 0.1937, + "step": 16735 + }, + { + "epoch": 1.5765997032571066, + "grad_norm": 0.5692266821861267, + "learning_rate": 2.215490724645657e-06, + "loss": 0.1678, + "step": 16736 + }, + { + "epoch": 1.5766939073502744, + "grad_norm": 0.6795256733894348, + "learning_rate": 2.2145429667829464e-06, + "loss": 0.1849, + "step": 16737 + }, + { + "epoch": 1.576788111443442, + "grad_norm": 0.6403346061706543, + "learning_rate": 2.2135953864407e-06, + "loss": 0.1934, + "step": 16738 + }, + { + "epoch": 1.57688231553661, + "grad_norm": 0.6476584672927856, + "learning_rate": 2.2126479836405177e-06, + "loss": 0.2095, + "step": 16739 + }, + { + "epoch": 1.576976519629778, + "grad_norm": 0.6238870620727539, + "learning_rate": 2.211700758404005e-06, + "loss": 0.1858, + "step": 16740 + }, + { + "epoch": 1.5770707237229458, + "grad_norm": 0.6293942928314209, + "learning_rate": 2.2107537107527633e-06, + "loss": 0.1936, + "step": 16741 + }, + { + "epoch": 1.5771649278161135, + "grad_norm": 0.6041991114616394, + "learning_rate": 2.2098068407083815e-06, + "loss": 0.1931, + "step": 16742 + }, + { + "epoch": 1.5772591319092815, + "grad_norm": 0.6643226146697998, + "learning_rate": 2.208860148292448e-06, + "loss": 0.1961, + "step": 16743 + }, + { + "epoch": 1.5773533360024494, + "grad_norm": 0.6403629779815674, + "learning_rate": 2.207913633526556e-06, + "loss": 0.1793, + "step": 16744 + }, + { + "epoch": 1.5774475400956172, + "grad_norm": 0.6918803453445435, + "learning_rate": 2.206967296432285e-06, + "loss": 0.1982, + "step": 16745 + }, + { + "epoch": 1.577541744188785, + "grad_norm": 0.6899217367172241, + "learning_rate": 2.2060211370312077e-06, + "loss": 0.2132, + "step": 16746 + }, + { + "epoch": 1.5776359482819529, + "grad_norm": 0.6735967993736267, + "learning_rate": 2.2050751553449034e-06, + "loss": 0.1941, + "step": 16747 + }, + { + "epoch": 1.5777301523751208, + "grad_norm": 0.5775204300880432, + "learning_rate": 2.2041293513949436e-06, + "loss": 0.186, + "step": 16748 + }, + { + "epoch": 1.5778243564682886, + "grad_norm": 0.6950227618217468, + "learning_rate": 2.2031837252028878e-06, + "loss": 0.2158, + "step": 16749 + }, + { + "epoch": 1.5779185605614563, + "grad_norm": 0.6783015131950378, + "learning_rate": 2.2022382767903006e-06, + "loss": 0.2047, + "step": 16750 + }, + { + "epoch": 1.5780127646546243, + "grad_norm": 0.7202771306037903, + "learning_rate": 2.201293006178744e-06, + "loss": 0.2242, + "step": 16751 + }, + { + "epoch": 1.5781069687477922, + "grad_norm": 0.6268263459205627, + "learning_rate": 2.2003479133897643e-06, + "loss": 0.1812, + "step": 16752 + }, + { + "epoch": 1.57820117284096, + "grad_norm": 0.6736927628517151, + "learning_rate": 2.199402998444915e-06, + "loss": 0.2153, + "step": 16753 + }, + { + "epoch": 1.5782953769341277, + "grad_norm": 0.5757262110710144, + "learning_rate": 2.1984582613657436e-06, + "loss": 0.1706, + "step": 16754 + }, + { + "epoch": 1.5783895810272957, + "grad_norm": 0.6886053085327148, + "learning_rate": 2.197513702173786e-06, + "loss": 0.1889, + "step": 16755 + }, + { + "epoch": 1.5784837851204636, + "grad_norm": 0.7031805515289307, + "learning_rate": 2.1965693208905835e-06, + "loss": 0.2053, + "step": 16756 + }, + { + "epoch": 1.5785779892136313, + "grad_norm": 0.6282926797866821, + "learning_rate": 2.195625117537671e-06, + "loss": 0.1808, + "step": 16757 + }, + { + "epoch": 1.578672193306799, + "grad_norm": 0.7353420257568359, + "learning_rate": 2.1946810921365734e-06, + "loss": 0.2176, + "step": 16758 + }, + { + "epoch": 1.578766397399967, + "grad_norm": 0.6532454490661621, + "learning_rate": 2.1937372447088178e-06, + "loss": 0.1755, + "step": 16759 + }, + { + "epoch": 1.578860601493135, + "grad_norm": 0.6954646706581116, + "learning_rate": 2.192793575275928e-06, + "loss": 0.189, + "step": 16760 + }, + { + "epoch": 1.5789548055863027, + "grad_norm": 0.6403496861457825, + "learning_rate": 2.191850083859419e-06, + "loss": 0.1635, + "step": 16761 + }, + { + "epoch": 1.5790490096794705, + "grad_norm": 0.6036338210105896, + "learning_rate": 2.1909067704808007e-06, + "loss": 0.2017, + "step": 16762 + }, + { + "epoch": 1.5791432137726384, + "grad_norm": 0.6709696054458618, + "learning_rate": 2.1899636351615848e-06, + "loss": 0.1925, + "step": 16763 + }, + { + "epoch": 1.5792374178658064, + "grad_norm": 0.6412215828895569, + "learning_rate": 2.1890206779232804e-06, + "loss": 0.1981, + "step": 16764 + }, + { + "epoch": 1.5793316219589741, + "grad_norm": 0.6793758273124695, + "learning_rate": 2.1880778987873806e-06, + "loss": 0.2149, + "step": 16765 + }, + { + "epoch": 1.5794258260521419, + "grad_norm": 0.5914242267608643, + "learning_rate": 2.187135297775387e-06, + "loss": 0.1934, + "step": 16766 + }, + { + "epoch": 1.5795200301453098, + "grad_norm": 0.7470661401748657, + "learning_rate": 2.1861928749087936e-06, + "loss": 0.2182, + "step": 16767 + }, + { + "epoch": 1.5796142342384778, + "grad_norm": 0.6642493605613708, + "learning_rate": 2.1852506302090836e-06, + "loss": 0.1639, + "step": 16768 + }, + { + "epoch": 1.5797084383316455, + "grad_norm": 0.6759933233261108, + "learning_rate": 2.1843085636977458e-06, + "loss": 0.2289, + "step": 16769 + }, + { + "epoch": 1.5798026424248133, + "grad_norm": 0.6871334314346313, + "learning_rate": 2.1833666753962624e-06, + "loss": 0.1991, + "step": 16770 + }, + { + "epoch": 1.5798968465179812, + "grad_norm": 0.7563052773475647, + "learning_rate": 2.182424965326104e-06, + "loss": 0.2207, + "step": 16771 + }, + { + "epoch": 1.5799910506111492, + "grad_norm": 0.6464877128601074, + "learning_rate": 2.181483433508751e-06, + "loss": 0.1827, + "step": 16772 + }, + { + "epoch": 1.580085254704317, + "grad_norm": 0.6580135226249695, + "learning_rate": 2.180542079965663e-06, + "loss": 0.1866, + "step": 16773 + }, + { + "epoch": 1.5801794587974847, + "grad_norm": 0.6818664073944092, + "learning_rate": 2.17960090471831e-06, + "loss": 0.1923, + "step": 16774 + }, + { + "epoch": 1.5802736628906526, + "grad_norm": 0.6275060176849365, + "learning_rate": 2.1786599077881522e-06, + "loss": 0.2035, + "step": 16775 + }, + { + "epoch": 1.5803678669838206, + "grad_norm": 0.7090134024620056, + "learning_rate": 2.1777190891966425e-06, + "loss": 0.1996, + "step": 16776 + }, + { + "epoch": 1.5804620710769883, + "grad_norm": 0.738533079624176, + "learning_rate": 2.1767784489652345e-06, + "loss": 0.2026, + "step": 16777 + }, + { + "epoch": 1.580556275170156, + "grad_norm": 0.6837629675865173, + "learning_rate": 2.175837987115379e-06, + "loss": 0.2036, + "step": 16778 + }, + { + "epoch": 1.580650479263324, + "grad_norm": 0.6072348356246948, + "learning_rate": 2.174897703668516e-06, + "loss": 0.1913, + "step": 16779 + }, + { + "epoch": 1.580744683356492, + "grad_norm": 0.684169352054596, + "learning_rate": 2.173957598646087e-06, + "loss": 0.2253, + "step": 16780 + }, + { + "epoch": 1.5808388874496597, + "grad_norm": 0.6403519511222839, + "learning_rate": 2.1730176720695307e-06, + "loss": 0.2062, + "step": 16781 + }, + { + "epoch": 1.5809330915428275, + "grad_norm": 0.6331693530082703, + "learning_rate": 2.1720779239602753e-06, + "loss": 0.1934, + "step": 16782 + }, + { + "epoch": 1.5810272956359954, + "grad_norm": 0.6811143755912781, + "learning_rate": 2.1711383543397448e-06, + "loss": 0.2249, + "step": 16783 + }, + { + "epoch": 1.5811214997291634, + "grad_norm": 0.690163791179657, + "learning_rate": 2.170198963229372e-06, + "loss": 0.177, + "step": 16784 + }, + { + "epoch": 1.5812157038223311, + "grad_norm": 0.7179564833641052, + "learning_rate": 2.1692597506505717e-06, + "loss": 0.1992, + "step": 16785 + }, + { + "epoch": 1.5813099079154989, + "grad_norm": 0.6467909216880798, + "learning_rate": 2.1683207166247578e-06, + "loss": 0.2166, + "step": 16786 + }, + { + "epoch": 1.5814041120086668, + "grad_norm": 0.6622843146324158, + "learning_rate": 2.167381861173343e-06, + "loss": 0.1952, + "step": 16787 + }, + { + "epoch": 1.5814983161018348, + "grad_norm": 0.6338428854942322, + "learning_rate": 2.1664431843177382e-06, + "loss": 0.1996, + "step": 16788 + }, + { + "epoch": 1.5815925201950025, + "grad_norm": 0.6173291206359863, + "learning_rate": 2.1655046860793417e-06, + "loss": 0.1677, + "step": 16789 + }, + { + "epoch": 1.5816867242881703, + "grad_norm": 0.6547451019287109, + "learning_rate": 2.1645663664795534e-06, + "loss": 0.2005, + "step": 16790 + }, + { + "epoch": 1.5817809283813382, + "grad_norm": 0.6102684736251831, + "learning_rate": 2.1636282255397723e-06, + "loss": 0.1948, + "step": 16791 + }, + { + "epoch": 1.5818751324745062, + "grad_norm": 0.6864402890205383, + "learning_rate": 2.162690263281384e-06, + "loss": 0.2043, + "step": 16792 + }, + { + "epoch": 1.581969336567674, + "grad_norm": 0.6589213609695435, + "learning_rate": 2.1617524797257792e-06, + "loss": 0.1956, + "step": 16793 + }, + { + "epoch": 1.5820635406608416, + "grad_norm": 0.6801182627677917, + "learning_rate": 2.1608148748943424e-06, + "loss": 0.1761, + "step": 16794 + }, + { + "epoch": 1.5821577447540096, + "grad_norm": 0.6492635607719421, + "learning_rate": 2.1598774488084474e-06, + "loss": 0.1878, + "step": 16795 + }, + { + "epoch": 1.5822519488471776, + "grad_norm": 0.6200484037399292, + "learning_rate": 2.1589402014894714e-06, + "loss": 0.1949, + "step": 16796 + }, + { + "epoch": 1.5823461529403453, + "grad_norm": 0.6302139163017273, + "learning_rate": 2.158003132958787e-06, + "loss": 0.1821, + "step": 16797 + }, + { + "epoch": 1.582440357033513, + "grad_norm": 0.6472926735877991, + "learning_rate": 2.1570662432377576e-06, + "loss": 0.1864, + "step": 16798 + }, + { + "epoch": 1.582534561126681, + "grad_norm": 0.6446152329444885, + "learning_rate": 2.1561295323477472e-06, + "loss": 0.1838, + "step": 16799 + }, + { + "epoch": 1.582628765219849, + "grad_norm": 0.6802383661270142, + "learning_rate": 2.1551930003101163e-06, + "loss": 0.1973, + "step": 16800 + }, + { + "epoch": 1.5827229693130167, + "grad_norm": 0.6307654976844788, + "learning_rate": 2.1542566471462168e-06, + "loss": 0.2022, + "step": 16801 + }, + { + "epoch": 1.5828171734061844, + "grad_norm": 0.6357226371765137, + "learning_rate": 2.153320472877397e-06, + "loss": 0.1944, + "step": 16802 + }, + { + "epoch": 1.5829113774993524, + "grad_norm": 0.6586205959320068, + "learning_rate": 2.152384477525007e-06, + "loss": 0.2113, + "step": 16803 + }, + { + "epoch": 1.5830055815925204, + "grad_norm": 0.7027658224105835, + "learning_rate": 2.1514486611103892e-06, + "loss": 0.2011, + "step": 16804 + }, + { + "epoch": 1.5830997856856879, + "grad_norm": 0.6034343242645264, + "learning_rate": 2.1505130236548767e-06, + "loss": 0.1946, + "step": 16805 + }, + { + "epoch": 1.5831939897788558, + "grad_norm": 0.6628339290618896, + "learning_rate": 2.149577565179807e-06, + "loss": 0.2054, + "step": 16806 + }, + { + "epoch": 1.5832881938720238, + "grad_norm": 0.6580031514167786, + "learning_rate": 2.1486422857065128e-06, + "loss": 0.1951, + "step": 16807 + }, + { + "epoch": 1.5833823979651915, + "grad_norm": 0.7510112524032593, + "learning_rate": 2.1477071852563138e-06, + "loss": 0.1958, + "step": 16808 + }, + { + "epoch": 1.5834766020583593, + "grad_norm": 0.6545968055725098, + "learning_rate": 2.1467722638505352e-06, + "loss": 0.2193, + "step": 16809 + }, + { + "epoch": 1.5835708061515272, + "grad_norm": 0.6425247192382812, + "learning_rate": 2.1458375215104965e-06, + "loss": 0.1956, + "step": 16810 + }, + { + "epoch": 1.5836650102446952, + "grad_norm": 0.6466528177261353, + "learning_rate": 2.1449029582575064e-06, + "loss": 0.198, + "step": 16811 + }, + { + "epoch": 1.583759214337863, + "grad_norm": 0.6519531011581421, + "learning_rate": 2.143968574112879e-06, + "loss": 0.19, + "step": 16812 + }, + { + "epoch": 1.5838534184310307, + "grad_norm": 0.699285089969635, + "learning_rate": 2.143034369097916e-06, + "loss": 0.2098, + "step": 16813 + }, + { + "epoch": 1.5839476225241986, + "grad_norm": 0.6333559155464172, + "learning_rate": 2.1421003432339203e-06, + "loss": 0.1815, + "step": 16814 + }, + { + "epoch": 1.5840418266173666, + "grad_norm": 0.7741021513938904, + "learning_rate": 2.141166496542192e-06, + "loss": 0.1922, + "step": 16815 + }, + { + "epoch": 1.5841360307105343, + "grad_norm": 0.6113501787185669, + "learning_rate": 2.1402328290440176e-06, + "loss": 0.1773, + "step": 16816 + }, + { + "epoch": 1.584230234803702, + "grad_norm": 0.6935142874717712, + "learning_rate": 2.139299340760692e-06, + "loss": 0.205, + "step": 16817 + }, + { + "epoch": 1.58432443889687, + "grad_norm": 0.6054620146751404, + "learning_rate": 2.138366031713499e-06, + "loss": 0.1778, + "step": 16818 + }, + { + "epoch": 1.584418642990038, + "grad_norm": 0.5931374430656433, + "learning_rate": 2.1374329019237173e-06, + "loss": 0.2062, + "step": 16819 + }, + { + "epoch": 1.5845128470832057, + "grad_norm": 0.6228781938552856, + "learning_rate": 2.136499951412625e-06, + "loss": 0.1985, + "step": 16820 + }, + { + "epoch": 1.5846070511763735, + "grad_norm": 0.6543347835540771, + "learning_rate": 2.1355671802014976e-06, + "loss": 0.1828, + "step": 16821 + }, + { + "epoch": 1.5847012552695414, + "grad_norm": 0.6243523359298706, + "learning_rate": 2.1346345883116014e-06, + "loss": 0.1874, + "step": 16822 + }, + { + "epoch": 1.5847954593627094, + "grad_norm": 0.8013496398925781, + "learning_rate": 2.133702175764194e-06, + "loss": 0.239, + "step": 16823 + }, + { + "epoch": 1.5848896634558771, + "grad_norm": 0.8083053827285767, + "learning_rate": 2.1327699425805493e-06, + "loss": 0.1956, + "step": 16824 + }, + { + "epoch": 1.5849838675490449, + "grad_norm": 0.7009676098823547, + "learning_rate": 2.131837888781916e-06, + "loss": 0.2103, + "step": 16825 + }, + { + "epoch": 1.5850780716422128, + "grad_norm": 0.6287767887115479, + "learning_rate": 2.1309060143895443e-06, + "loss": 0.1802, + "step": 16826 + }, + { + "epoch": 1.5851722757353808, + "grad_norm": 0.6486721634864807, + "learning_rate": 2.129974319424686e-06, + "loss": 0.2046, + "step": 16827 + }, + { + "epoch": 1.5852664798285485, + "grad_norm": 0.6545873880386353, + "learning_rate": 2.129042803908586e-06, + "loss": 0.1912, + "step": 16828 + }, + { + "epoch": 1.5853606839217163, + "grad_norm": 0.6733275055885315, + "learning_rate": 2.1281114678624813e-06, + "loss": 0.1835, + "step": 16829 + }, + { + "epoch": 1.5854548880148842, + "grad_norm": 0.6418502330780029, + "learning_rate": 2.1271803113076085e-06, + "loss": 0.1892, + "step": 16830 + }, + { + "epoch": 1.5855490921080522, + "grad_norm": 0.7270448207855225, + "learning_rate": 2.1262493342652036e-06, + "loss": 0.2058, + "step": 16831 + }, + { + "epoch": 1.58564329620122, + "grad_norm": 0.6308165192604065, + "learning_rate": 2.1253185367564865e-06, + "loss": 0.1998, + "step": 16832 + }, + { + "epoch": 1.5857375002943876, + "grad_norm": 0.5984036922454834, + "learning_rate": 2.1243879188026873e-06, + "loss": 0.1696, + "step": 16833 + }, + { + "epoch": 1.5858317043875556, + "grad_norm": 0.6338097453117371, + "learning_rate": 2.123457480425025e-06, + "loss": 0.1807, + "step": 16834 + }, + { + "epoch": 1.5859259084807236, + "grad_norm": 0.7448612451553345, + "learning_rate": 2.12252722164471e-06, + "loss": 0.2285, + "step": 16835 + }, + { + "epoch": 1.5860201125738913, + "grad_norm": 0.6317852139472961, + "learning_rate": 2.1215971424829584e-06, + "loss": 0.1813, + "step": 16836 + }, + { + "epoch": 1.586114316667059, + "grad_norm": 0.6573054194450378, + "learning_rate": 2.1206672429609786e-06, + "loss": 0.2028, + "step": 16837 + }, + { + "epoch": 1.586208520760227, + "grad_norm": 0.7340366840362549, + "learning_rate": 2.1197375230999707e-06, + "loss": 0.2047, + "step": 16838 + }, + { + "epoch": 1.586302724853395, + "grad_norm": 0.6898893713951111, + "learning_rate": 2.1188079829211296e-06, + "loss": 0.1876, + "step": 16839 + }, + { + "epoch": 1.5863969289465627, + "grad_norm": 0.6378750801086426, + "learning_rate": 2.1178786224456603e-06, + "loss": 0.194, + "step": 16840 + }, + { + "epoch": 1.5864911330397304, + "grad_norm": 0.6489059329032898, + "learning_rate": 2.116949441694748e-06, + "loss": 0.1953, + "step": 16841 + }, + { + "epoch": 1.5865853371328984, + "grad_norm": 0.5951984524726868, + "learning_rate": 2.116020440689577e-06, + "loss": 0.1814, + "step": 16842 + }, + { + "epoch": 1.5866795412260664, + "grad_norm": 0.7144877314567566, + "learning_rate": 2.115091619451334e-06, + "loss": 0.225, + "step": 16843 + }, + { + "epoch": 1.586773745319234, + "grad_norm": 0.6485958099365234, + "learning_rate": 2.1141629780011975e-06, + "loss": 0.1985, + "step": 16844 + }, + { + "epoch": 1.5868679494124018, + "grad_norm": 0.7251630425453186, + "learning_rate": 2.1132345163603386e-06, + "loss": 0.1814, + "step": 16845 + }, + { + "epoch": 1.5869621535055698, + "grad_norm": 0.7341436743736267, + "learning_rate": 2.1123062345499303e-06, + "loss": 0.2144, + "step": 16846 + }, + { + "epoch": 1.5870563575987378, + "grad_norm": 0.730889081954956, + "learning_rate": 2.1113781325911397e-06, + "loss": 0.2146, + "step": 16847 + }, + { + "epoch": 1.5871505616919055, + "grad_norm": 0.6238629221916199, + "learning_rate": 2.110450210505126e-06, + "loss": 0.1813, + "step": 16848 + }, + { + "epoch": 1.5872447657850732, + "grad_norm": 0.6476354002952576, + "learning_rate": 2.109522468313049e-06, + "loss": 0.1905, + "step": 16849 + }, + { + "epoch": 1.5873389698782412, + "grad_norm": 0.7104042172431946, + "learning_rate": 2.1085949060360654e-06, + "loss": 0.2263, + "step": 16850 + }, + { + "epoch": 1.5874331739714092, + "grad_norm": 0.7384299039840698, + "learning_rate": 2.1076675236953194e-06, + "loss": 0.2109, + "step": 16851 + }, + { + "epoch": 1.5875273780645769, + "grad_norm": 0.609963595867157, + "learning_rate": 2.106740321311962e-06, + "loss": 0.1767, + "step": 16852 + }, + { + "epoch": 1.5876215821577446, + "grad_norm": 0.6352958679199219, + "learning_rate": 2.10581329890713e-06, + "loss": 0.1635, + "step": 16853 + }, + { + "epoch": 1.5877157862509126, + "grad_norm": 0.6387240886688232, + "learning_rate": 2.1048864565019635e-06, + "loss": 0.2126, + "step": 16854 + }, + { + "epoch": 1.5878099903440805, + "grad_norm": 0.7180310487747192, + "learning_rate": 2.1039597941175984e-06, + "loss": 0.2195, + "step": 16855 + }, + { + "epoch": 1.5879041944372483, + "grad_norm": 0.7523316144943237, + "learning_rate": 2.103033311775158e-06, + "loss": 0.1632, + "step": 16856 + }, + { + "epoch": 1.587998398530416, + "grad_norm": 0.6734952926635742, + "learning_rate": 2.102107009495772e-06, + "loss": 0.2272, + "step": 16857 + }, + { + "epoch": 1.588092602623584, + "grad_norm": 0.6469452977180481, + "learning_rate": 2.1011808873005626e-06, + "loss": 0.1658, + "step": 16858 + }, + { + "epoch": 1.588186806716752, + "grad_norm": 0.6462699770927429, + "learning_rate": 2.1002549452106422e-06, + "loss": 0.1922, + "step": 16859 + }, + { + "epoch": 1.5882810108099197, + "grad_norm": 0.6486620903015137, + "learning_rate": 2.099329183247126e-06, + "loss": 0.2065, + "step": 16860 + }, + { + "epoch": 1.5883752149030874, + "grad_norm": 0.6494008302688599, + "learning_rate": 2.098403601431126e-06, + "loss": 0.1947, + "step": 16861 + }, + { + "epoch": 1.5884694189962554, + "grad_norm": 0.6822574734687805, + "learning_rate": 2.0974781997837444e-06, + "loss": 0.1876, + "step": 16862 + }, + { + "epoch": 1.5885636230894233, + "grad_norm": 0.6509035229682922, + "learning_rate": 2.0965529783260783e-06, + "loss": 0.2122, + "step": 16863 + }, + { + "epoch": 1.588657827182591, + "grad_norm": 0.6158427596092224, + "learning_rate": 2.0956279370792276e-06, + "loss": 0.1934, + "step": 16864 + }, + { + "epoch": 1.5887520312757588, + "grad_norm": 0.6245700120925903, + "learning_rate": 2.094703076064286e-06, + "loss": 0.1884, + "step": 16865 + }, + { + "epoch": 1.5888462353689268, + "grad_norm": 0.7309079170227051, + "learning_rate": 2.093778395302338e-06, + "loss": 0.2161, + "step": 16866 + }, + { + "epoch": 1.5889404394620947, + "grad_norm": 0.6970020532608032, + "learning_rate": 2.0928538948144693e-06, + "loss": 0.1856, + "step": 16867 + }, + { + "epoch": 1.5890346435552625, + "grad_norm": 0.6402873396873474, + "learning_rate": 2.091929574621764e-06, + "loss": 0.1983, + "step": 16868 + }, + { + "epoch": 1.5891288476484302, + "grad_norm": 0.6710144877433777, + "learning_rate": 2.091005434745291e-06, + "loss": 0.1944, + "step": 16869 + }, + { + "epoch": 1.5892230517415982, + "grad_norm": 0.6847306489944458, + "learning_rate": 2.090081475206126e-06, + "loss": 0.2105, + "step": 16870 + }, + { + "epoch": 1.5893172558347661, + "grad_norm": 0.6467797756195068, + "learning_rate": 2.089157696025339e-06, + "loss": 0.1815, + "step": 16871 + }, + { + "epoch": 1.5894114599279339, + "grad_norm": 0.6493083834648132, + "learning_rate": 2.088234097223988e-06, + "loss": 0.1829, + "step": 16872 + }, + { + "epoch": 1.5895056640211016, + "grad_norm": 0.6799706816673279, + "learning_rate": 2.0873106788231346e-06, + "loss": 0.2089, + "step": 16873 + }, + { + "epoch": 1.5895998681142696, + "grad_norm": 0.6377043128013611, + "learning_rate": 2.086387440843839e-06, + "loss": 0.1904, + "step": 16874 + }, + { + "epoch": 1.5896940722074375, + "grad_norm": 0.6574447154998779, + "learning_rate": 2.0854643833071454e-06, + "loss": 0.2127, + "step": 16875 + }, + { + "epoch": 1.5897882763006053, + "grad_norm": 0.6165361404418945, + "learning_rate": 2.0845415062341035e-06, + "loss": 0.1894, + "step": 16876 + }, + { + "epoch": 1.589882480393773, + "grad_norm": 0.6518315076828003, + "learning_rate": 2.0836188096457586e-06, + "loss": 0.2247, + "step": 16877 + }, + { + "epoch": 1.589976684486941, + "grad_norm": 0.649908185005188, + "learning_rate": 2.082696293563149e-06, + "loss": 0.1771, + "step": 16878 + }, + { + "epoch": 1.590070888580109, + "grad_norm": 0.6095840334892273, + "learning_rate": 2.0817739580073026e-06, + "loss": 0.201, + "step": 16879 + }, + { + "epoch": 1.5901650926732767, + "grad_norm": 0.6645314693450928, + "learning_rate": 2.080851802999262e-06, + "loss": 0.2107, + "step": 16880 + }, + { + "epoch": 1.5902592967664444, + "grad_norm": 0.6977287530899048, + "learning_rate": 2.0799298285600455e-06, + "loss": 0.2013, + "step": 16881 + }, + { + "epoch": 1.5903535008596124, + "grad_norm": 0.7408657670021057, + "learning_rate": 2.0790080347106768e-06, + "loss": 0.1993, + "step": 16882 + }, + { + "epoch": 1.5904477049527803, + "grad_norm": 0.6141915321350098, + "learning_rate": 2.078086421472174e-06, + "loss": 0.1929, + "step": 16883 + }, + { + "epoch": 1.590541909045948, + "grad_norm": 0.6887730360031128, + "learning_rate": 2.0771649888655553e-06, + "loss": 0.1928, + "step": 16884 + }, + { + "epoch": 1.5906361131391158, + "grad_norm": 0.7160045504570007, + "learning_rate": 2.076243736911825e-06, + "loss": 0.2232, + "step": 16885 + }, + { + "epoch": 1.5907303172322838, + "grad_norm": 0.5830559134483337, + "learning_rate": 2.0753226656319914e-06, + "loss": 0.1984, + "step": 16886 + }, + { + "epoch": 1.5908245213254517, + "grad_norm": 0.6876028180122375, + "learning_rate": 2.0744017750470592e-06, + "loss": 0.1734, + "step": 16887 + }, + { + "epoch": 1.5909187254186195, + "grad_norm": 0.6495875120162964, + "learning_rate": 2.0734810651780213e-06, + "loss": 0.2092, + "step": 16888 + }, + { + "epoch": 1.5910129295117872, + "grad_norm": 0.6716800928115845, + "learning_rate": 2.0725605360458743e-06, + "loss": 0.2108, + "step": 16889 + }, + { + "epoch": 1.5911071336049551, + "grad_norm": 0.7414377331733704, + "learning_rate": 2.07164018767161e-06, + "loss": 0.2066, + "step": 16890 + }, + { + "epoch": 1.591201337698123, + "grad_norm": 0.5687053799629211, + "learning_rate": 2.070720020076207e-06, + "loss": 0.161, + "step": 16891 + }, + { + "epoch": 1.5912955417912908, + "grad_norm": 0.615281879901886, + "learning_rate": 2.0698000332806534e-06, + "loss": 0.1772, + "step": 16892 + }, + { + "epoch": 1.5913897458844586, + "grad_norm": 0.6414679884910583, + "learning_rate": 2.0688802273059218e-06, + "loss": 0.1807, + "step": 16893 + }, + { + "epoch": 1.5914839499776265, + "grad_norm": 0.6869304776191711, + "learning_rate": 2.067960602172985e-06, + "loss": 0.2049, + "step": 16894 + }, + { + "epoch": 1.5915781540707945, + "grad_norm": 0.7120691537857056, + "learning_rate": 2.0670411579028172e-06, + "loss": 0.2571, + "step": 16895 + }, + { + "epoch": 1.5916723581639622, + "grad_norm": 0.6378860473632812, + "learning_rate": 2.0661218945163776e-06, + "loss": 0.2169, + "step": 16896 + }, + { + "epoch": 1.59176656225713, + "grad_norm": 0.6451314091682434, + "learning_rate": 2.065202812034627e-06, + "loss": 0.2028, + "step": 16897 + }, + { + "epoch": 1.591860766350298, + "grad_norm": 0.6550354957580566, + "learning_rate": 2.064283910478527e-06, + "loss": 0.2118, + "step": 16898 + }, + { + "epoch": 1.591954970443466, + "grad_norm": 0.6120240092277527, + "learning_rate": 2.0633651898690256e-06, + "loss": 0.1837, + "step": 16899 + }, + { + "epoch": 1.5920491745366336, + "grad_norm": 0.6445263028144836, + "learning_rate": 2.06244665022707e-06, + "loss": 0.1903, + "step": 16900 + }, + { + "epoch": 1.5921433786298014, + "grad_norm": 0.6270973682403564, + "learning_rate": 2.06152829157361e-06, + "loss": 0.2017, + "step": 16901 + }, + { + "epoch": 1.5922375827229693, + "grad_norm": 0.668395459651947, + "learning_rate": 2.060610113929582e-06, + "loss": 0.1838, + "step": 16902 + }, + { + "epoch": 1.5923317868161373, + "grad_norm": 0.6957286596298218, + "learning_rate": 2.059692117315919e-06, + "loss": 0.161, + "step": 16903 + }, + { + "epoch": 1.592425990909305, + "grad_norm": 0.6116093993186951, + "learning_rate": 2.0587743017535564e-06, + "loss": 0.1982, + "step": 16904 + }, + { + "epoch": 1.5925201950024728, + "grad_norm": 0.8217107057571411, + "learning_rate": 2.0578566672634237e-06, + "loss": 0.2277, + "step": 16905 + }, + { + "epoch": 1.5926143990956407, + "grad_norm": 0.6316429972648621, + "learning_rate": 2.056939213866438e-06, + "loss": 0.1705, + "step": 16906 + }, + { + "epoch": 1.5927086031888087, + "grad_norm": 0.680424153804779, + "learning_rate": 2.0560219415835237e-06, + "loss": 0.1972, + "step": 16907 + }, + { + "epoch": 1.5928028072819764, + "grad_norm": 0.7076070308685303, + "learning_rate": 2.0551048504355965e-06, + "loss": 0.2183, + "step": 16908 + }, + { + "epoch": 1.5928970113751442, + "grad_norm": 0.6588847041130066, + "learning_rate": 2.054187940443563e-06, + "loss": 0.1858, + "step": 16909 + }, + { + "epoch": 1.5929912154683121, + "grad_norm": 0.5913216471672058, + "learning_rate": 2.0532712116283326e-06, + "loss": 0.1752, + "step": 16910 + }, + { + "epoch": 1.59308541956148, + "grad_norm": 0.6831443905830383, + "learning_rate": 2.0523546640108114e-06, + "loss": 0.2054, + "step": 16911 + }, + { + "epoch": 1.5931796236546478, + "grad_norm": 0.7347297072410583, + "learning_rate": 2.051438297611893e-06, + "loss": 0.2185, + "step": 16912 + }, + { + "epoch": 1.5932738277478156, + "grad_norm": 0.6393066644668579, + "learning_rate": 2.0505221124524743e-06, + "loss": 0.2076, + "step": 16913 + }, + { + "epoch": 1.5933680318409835, + "grad_norm": 0.6571082472801208, + "learning_rate": 2.049606108553448e-06, + "loss": 0.1824, + "step": 16914 + }, + { + "epoch": 1.5934622359341515, + "grad_norm": 0.7159751653671265, + "learning_rate": 2.048690285935697e-06, + "loss": 0.2031, + "step": 16915 + }, + { + "epoch": 1.5935564400273192, + "grad_norm": 0.6926108002662659, + "learning_rate": 2.0477746446200997e-06, + "loss": 0.1899, + "step": 16916 + }, + { + "epoch": 1.593650644120487, + "grad_norm": 0.6743190288543701, + "learning_rate": 2.0468591846275443e-06, + "loss": 0.1837, + "step": 16917 + }, + { + "epoch": 1.593744848213655, + "grad_norm": 0.6317774653434753, + "learning_rate": 2.0459439059789e-06, + "loss": 0.1819, + "step": 16918 + }, + { + "epoch": 1.5938390523068229, + "grad_norm": 0.677336573600769, + "learning_rate": 2.045028808695029e-06, + "loss": 0.1979, + "step": 16919 + }, + { + "epoch": 1.5939332563999906, + "grad_norm": 0.7855684757232666, + "learning_rate": 2.0441138927968094e-06, + "loss": 0.2128, + "step": 16920 + }, + { + "epoch": 1.5940274604931584, + "grad_norm": 0.6705695986747742, + "learning_rate": 2.043199158305098e-06, + "loss": 0.1975, + "step": 16921 + }, + { + "epoch": 1.5941216645863263, + "grad_norm": 0.7156023383140564, + "learning_rate": 2.0422846052407474e-06, + "loss": 0.202, + "step": 16922 + }, + { + "epoch": 1.5942158686794943, + "grad_norm": 0.7329447865486145, + "learning_rate": 2.0413702336246156e-06, + "loss": 0.2039, + "step": 16923 + }, + { + "epoch": 1.594310072772662, + "grad_norm": 0.7201051115989685, + "learning_rate": 2.0404560434775535e-06, + "loss": 0.1935, + "step": 16924 + }, + { + "epoch": 1.5944042768658298, + "grad_norm": 0.7321786880493164, + "learning_rate": 2.0395420348203996e-06, + "loss": 0.1947, + "step": 16925 + }, + { + "epoch": 1.5944984809589977, + "grad_norm": 0.6338954567909241, + "learning_rate": 2.0386282076739996e-06, + "loss": 0.1976, + "step": 16926 + }, + { + "epoch": 1.5945926850521657, + "grad_norm": 0.6547569036483765, + "learning_rate": 2.0377145620591907e-06, + "loss": 0.1811, + "step": 16927 + }, + { + "epoch": 1.5946868891453334, + "grad_norm": 0.6150227189064026, + "learning_rate": 2.0368010979968013e-06, + "loss": 0.1973, + "step": 16928 + }, + { + "epoch": 1.5947810932385011, + "grad_norm": 0.666617214679718, + "learning_rate": 2.0358878155076622e-06, + "loss": 0.204, + "step": 16929 + }, + { + "epoch": 1.594875297331669, + "grad_norm": 0.7155197262763977, + "learning_rate": 2.0349747146125996e-06, + "loss": 0.1949, + "step": 16930 + }, + { + "epoch": 1.594969501424837, + "grad_norm": 0.6648327112197876, + "learning_rate": 2.0340617953324305e-06, + "loss": 0.2177, + "step": 16931 + }, + { + "epoch": 1.5950637055180048, + "grad_norm": 0.6485323309898376, + "learning_rate": 2.033149057687974e-06, + "loss": 0.1997, + "step": 16932 + }, + { + "epoch": 1.5951579096111725, + "grad_norm": 0.6812450289726257, + "learning_rate": 2.0322365017000367e-06, + "loss": 0.2042, + "step": 16933 + }, + { + "epoch": 1.5952521137043405, + "grad_norm": 0.6267063617706299, + "learning_rate": 2.03132412738943e-06, + "loss": 0.1776, + "step": 16934 + }, + { + "epoch": 1.5953463177975085, + "grad_norm": 0.600155770778656, + "learning_rate": 2.03041193477696e-06, + "loss": 0.1498, + "step": 16935 + }, + { + "epoch": 1.5954405218906762, + "grad_norm": 0.6206088066101074, + "learning_rate": 2.0294999238834203e-06, + "loss": 0.162, + "step": 16936 + }, + { + "epoch": 1.595534725983844, + "grad_norm": 0.6867279410362244, + "learning_rate": 2.028588094729609e-06, + "loss": 0.2371, + "step": 16937 + }, + { + "epoch": 1.595628930077012, + "grad_norm": 0.6605044603347778, + "learning_rate": 2.0276764473363185e-06, + "loss": 0.1725, + "step": 16938 + }, + { + "epoch": 1.5957231341701799, + "grad_norm": 0.6010245680809021, + "learning_rate": 2.0267649817243327e-06, + "loss": 0.1632, + "step": 16939 + }, + { + "epoch": 1.5958173382633476, + "grad_norm": 0.7845609188079834, + "learning_rate": 2.0258536979144373e-06, + "loss": 0.2188, + "step": 16940 + }, + { + "epoch": 1.5959115423565153, + "grad_norm": 0.5976998209953308, + "learning_rate": 2.0249425959274116e-06, + "loss": 0.1867, + "step": 16941 + }, + { + "epoch": 1.5960057464496833, + "grad_norm": 0.6509695053100586, + "learning_rate": 2.0240316757840283e-06, + "loss": 0.1893, + "step": 16942 + }, + { + "epoch": 1.596099950542851, + "grad_norm": 0.6574884653091431, + "learning_rate": 2.023120937505054e-06, + "loss": 0.1892, + "step": 16943 + }, + { + "epoch": 1.5961941546360188, + "grad_norm": 0.6583541631698608, + "learning_rate": 2.0222103811112605e-06, + "loss": 0.186, + "step": 16944 + }, + { + "epoch": 1.5962883587291867, + "grad_norm": 0.6240532398223877, + "learning_rate": 2.02130000662341e-06, + "loss": 0.1952, + "step": 16945 + }, + { + "epoch": 1.5963825628223547, + "grad_norm": 0.594275712966919, + "learning_rate": 2.0203898140622568e-06, + "loss": 0.1833, + "step": 16946 + }, + { + "epoch": 1.5964767669155224, + "grad_norm": 0.6507365703582764, + "learning_rate": 2.0194798034485565e-06, + "loss": 0.2249, + "step": 16947 + }, + { + "epoch": 1.5965709710086902, + "grad_norm": 0.6944069266319275, + "learning_rate": 2.0185699748030607e-06, + "loss": 0.1928, + "step": 16948 + }, + { + "epoch": 1.5966651751018581, + "grad_norm": 0.6407192349433899, + "learning_rate": 2.017660328146511e-06, + "loss": 0.196, + "step": 16949 + }, + { + "epoch": 1.596759379195026, + "grad_norm": 0.6506879925727844, + "learning_rate": 2.0167508634996504e-06, + "loss": 0.2043, + "step": 16950 + }, + { + "epoch": 1.5968535832881938, + "grad_norm": 0.6605292558670044, + "learning_rate": 2.015841580883219e-06, + "loss": 0.1706, + "step": 16951 + }, + { + "epoch": 1.5969477873813616, + "grad_norm": 0.6389287114143372, + "learning_rate": 2.014932480317945e-06, + "loss": 0.2312, + "step": 16952 + }, + { + "epoch": 1.5970419914745295, + "grad_norm": 0.6113000512123108, + "learning_rate": 2.0140235618245585e-06, + "loss": 0.1951, + "step": 16953 + }, + { + "epoch": 1.5971361955676975, + "grad_norm": 0.6415550708770752, + "learning_rate": 2.0131148254237898e-06, + "loss": 0.1963, + "step": 16954 + }, + { + "epoch": 1.5972303996608652, + "grad_norm": 0.6620209217071533, + "learning_rate": 2.012206271136353e-06, + "loss": 0.1836, + "step": 16955 + }, + { + "epoch": 1.597324603754033, + "grad_norm": 0.6675146818161011, + "learning_rate": 2.0112978989829634e-06, + "loss": 0.1652, + "step": 16956 + }, + { + "epoch": 1.597418807847201, + "grad_norm": 0.6546598672866821, + "learning_rate": 2.0103897089843406e-06, + "loss": 0.2106, + "step": 16957 + }, + { + "epoch": 1.5975130119403689, + "grad_norm": 0.6360905766487122, + "learning_rate": 2.009481701161189e-06, + "loss": 0.1773, + "step": 16958 + }, + { + "epoch": 1.5976072160335366, + "grad_norm": 0.6721376776695251, + "learning_rate": 2.0085738755342067e-06, + "loss": 0.2002, + "step": 16959 + }, + { + "epoch": 1.5977014201267044, + "grad_norm": 0.6992481350898743, + "learning_rate": 2.0076662321241036e-06, + "loss": 0.1922, + "step": 16960 + }, + { + "epoch": 1.5977956242198723, + "grad_norm": 0.6002038717269897, + "learning_rate": 2.0067587709515714e-06, + "loss": 0.1759, + "step": 16961 + }, + { + "epoch": 1.5978898283130403, + "grad_norm": 0.7566602230072021, + "learning_rate": 2.0058514920372986e-06, + "loss": 0.1843, + "step": 16962 + }, + { + "epoch": 1.597984032406208, + "grad_norm": 0.6735486388206482, + "learning_rate": 2.004944395401974e-06, + "loss": 0.1815, + "step": 16963 + }, + { + "epoch": 1.5980782364993757, + "grad_norm": 0.6113762259483337, + "learning_rate": 2.0040374810662855e-06, + "loss": 0.1774, + "step": 16964 + }, + { + "epoch": 1.5981724405925437, + "grad_norm": 0.5759824514389038, + "learning_rate": 2.0031307490509054e-06, + "loss": 0.175, + "step": 16965 + }, + { + "epoch": 1.5982666446857117, + "grad_norm": 0.5985716581344604, + "learning_rate": 2.0022241993765124e-06, + "loss": 0.1791, + "step": 16966 + }, + { + "epoch": 1.5983608487788794, + "grad_norm": 0.6667709946632385, + "learning_rate": 2.0013178320637783e-06, + "loss": 0.2181, + "step": 16967 + }, + { + "epoch": 1.5984550528720471, + "grad_norm": 0.6682126522064209, + "learning_rate": 2.0004116471333644e-06, + "loss": 0.1807, + "step": 16968 + }, + { + "epoch": 1.598549256965215, + "grad_norm": 0.6660087704658508, + "learning_rate": 1.999505644605938e-06, + "loss": 0.1859, + "step": 16969 + }, + { + "epoch": 1.598643461058383, + "grad_norm": 0.6404968500137329, + "learning_rate": 1.9985998245021576e-06, + "loss": 0.2026, + "step": 16970 + }, + { + "epoch": 1.5987376651515508, + "grad_norm": 0.6893209218978882, + "learning_rate": 1.9976941868426735e-06, + "loss": 0.2068, + "step": 16971 + }, + { + "epoch": 1.5988318692447185, + "grad_norm": 0.6096252202987671, + "learning_rate": 1.9967887316481403e-06, + "loss": 0.1738, + "step": 16972 + }, + { + "epoch": 1.5989260733378865, + "grad_norm": 0.6376148462295532, + "learning_rate": 1.9958834589391983e-06, + "loss": 0.1631, + "step": 16973 + }, + { + "epoch": 1.5990202774310545, + "grad_norm": 0.6672173738479614, + "learning_rate": 1.994978368736492e-06, + "loss": 0.2128, + "step": 16974 + }, + { + "epoch": 1.5991144815242222, + "grad_norm": 0.7077866792678833, + "learning_rate": 1.9940734610606614e-06, + "loss": 0.185, + "step": 16975 + }, + { + "epoch": 1.59920868561739, + "grad_norm": 0.6899238228797913, + "learning_rate": 1.993168735932336e-06, + "loss": 0.2095, + "step": 16976 + }, + { + "epoch": 1.599302889710558, + "grad_norm": 0.6598714590072632, + "learning_rate": 1.992264193372145e-06, + "loss": 0.2066, + "step": 16977 + }, + { + "epoch": 1.5993970938037259, + "grad_norm": 0.7184349894523621, + "learning_rate": 1.9913598334007177e-06, + "loss": 0.2171, + "step": 16978 + }, + { + "epoch": 1.5994912978968936, + "grad_norm": 0.6424234509468079, + "learning_rate": 1.990455656038669e-06, + "loss": 0.2161, + "step": 16979 + }, + { + "epoch": 1.5995855019900613, + "grad_norm": 0.610517680644989, + "learning_rate": 1.9895516613066203e-06, + "loss": 0.2063, + "step": 16980 + }, + { + "epoch": 1.5996797060832293, + "grad_norm": 0.6170348525047302, + "learning_rate": 1.9886478492251805e-06, + "loss": 0.1742, + "step": 16981 + }, + { + "epoch": 1.5997739101763973, + "grad_norm": 0.7161915898323059, + "learning_rate": 1.987744219814961e-06, + "loss": 0.2132, + "step": 16982 + }, + { + "epoch": 1.599868114269565, + "grad_norm": 0.6135024428367615, + "learning_rate": 1.986840773096563e-06, + "loss": 0.187, + "step": 16983 + }, + { + "epoch": 1.5999623183627327, + "grad_norm": 0.6426527500152588, + "learning_rate": 1.9859375090905876e-06, + "loss": 0.1901, + "step": 16984 + }, + { + "epoch": 1.6000565224559007, + "grad_norm": 0.6360190510749817, + "learning_rate": 1.985034427817634e-06, + "loss": 0.192, + "step": 16985 + }, + { + "epoch": 1.6001507265490686, + "grad_norm": 0.7561869025230408, + "learning_rate": 1.984131529298288e-06, + "loss": 0.225, + "step": 16986 + }, + { + "epoch": 1.6002449306422364, + "grad_norm": 0.6566307544708252, + "learning_rate": 1.9832288135531385e-06, + "loss": 0.1912, + "step": 16987 + }, + { + "epoch": 1.6003391347354041, + "grad_norm": 0.631393551826477, + "learning_rate": 1.982326280602774e-06, + "loss": 0.1944, + "step": 16988 + }, + { + "epoch": 1.600433338828572, + "grad_norm": 0.6858162879943848, + "learning_rate": 1.9814239304677676e-06, + "loss": 0.1825, + "step": 16989 + }, + { + "epoch": 1.60052754292174, + "grad_norm": 0.6661040186882019, + "learning_rate": 1.980521763168697e-06, + "loss": 0.2158, + "step": 16990 + }, + { + "epoch": 1.6006217470149078, + "grad_norm": 0.6457971334457397, + "learning_rate": 1.979619778726134e-06, + "loss": 0.1717, + "step": 16991 + }, + { + "epoch": 1.6007159511080755, + "grad_norm": 0.6137654185295105, + "learning_rate": 1.978717977160641e-06, + "loss": 0.1697, + "step": 16992 + }, + { + "epoch": 1.6008101552012435, + "grad_norm": 0.6896564960479736, + "learning_rate": 1.9778163584927845e-06, + "loss": 0.1996, + "step": 16993 + }, + { + "epoch": 1.6009043592944114, + "grad_norm": 0.6603320837020874, + "learning_rate": 1.976914922743124e-06, + "loss": 0.2017, + "step": 16994 + }, + { + "epoch": 1.6009985633875792, + "grad_norm": 0.63653564453125, + "learning_rate": 1.9760136699322107e-06, + "loss": 0.1804, + "step": 16995 + }, + { + "epoch": 1.601092767480747, + "grad_norm": 0.6182230710983276, + "learning_rate": 1.97511260008059e-06, + "loss": 0.1739, + "step": 16996 + }, + { + "epoch": 1.6011869715739149, + "grad_norm": 0.6287508010864258, + "learning_rate": 1.9742117132088166e-06, + "loss": 0.1839, + "step": 16997 + }, + { + "epoch": 1.6012811756670828, + "grad_norm": 0.6807714700698853, + "learning_rate": 1.97331100933743e-06, + "loss": 0.2137, + "step": 16998 + }, + { + "epoch": 1.6013753797602506, + "grad_norm": 0.7110925316810608, + "learning_rate": 1.97241048848696e-06, + "loss": 0.2155, + "step": 16999 + }, + { + "epoch": 1.6014695838534183, + "grad_norm": 0.6958010792732239, + "learning_rate": 1.971510150677951e-06, + "loss": 0.2185, + "step": 17000 + }, + { + "epoch": 1.6015637879465863, + "grad_norm": 0.7163333296775818, + "learning_rate": 1.970609995930928e-06, + "loss": 0.2006, + "step": 17001 + }, + { + "epoch": 1.6016579920397542, + "grad_norm": 0.6332830786705017, + "learning_rate": 1.9697100242664112e-06, + "loss": 0.1886, + "step": 17002 + }, + { + "epoch": 1.601752196132922, + "grad_norm": 0.7441249489784241, + "learning_rate": 1.968810235704924e-06, + "loss": 0.1767, + "step": 17003 + }, + { + "epoch": 1.6018464002260897, + "grad_norm": 0.6677401661872864, + "learning_rate": 1.9679106302669882e-06, + "loss": 0.1984, + "step": 17004 + }, + { + "epoch": 1.6019406043192577, + "grad_norm": 0.6124136447906494, + "learning_rate": 1.9670112079731084e-06, + "loss": 0.1572, + "step": 17005 + }, + { + "epoch": 1.6020348084124256, + "grad_norm": 0.6781045198440552, + "learning_rate": 1.9661119688437968e-06, + "loss": 0.1985, + "step": 17006 + }, + { + "epoch": 1.6021290125055934, + "grad_norm": 0.6994994878768921, + "learning_rate": 1.965212912899559e-06, + "loss": 0.1934, + "step": 17007 + }, + { + "epoch": 1.602223216598761, + "grad_norm": 0.6974150538444519, + "learning_rate": 1.9643140401608906e-06, + "loss": 0.2143, + "step": 17008 + }, + { + "epoch": 1.602317420691929, + "grad_norm": 0.6470915079116821, + "learning_rate": 1.963415350648289e-06, + "loss": 0.1873, + "step": 17009 + }, + { + "epoch": 1.602411624785097, + "grad_norm": 0.6578443646430969, + "learning_rate": 1.9625168443822494e-06, + "loss": 0.2036, + "step": 17010 + }, + { + "epoch": 1.6025058288782648, + "grad_norm": 0.6516851186752319, + "learning_rate": 1.961618521383253e-06, + "loss": 0.2028, + "step": 17011 + }, + { + "epoch": 1.6026000329714325, + "grad_norm": 0.8125045299530029, + "learning_rate": 1.960720381671789e-06, + "loss": 0.2176, + "step": 17012 + }, + { + "epoch": 1.6026942370646005, + "grad_norm": 0.6228019595146179, + "learning_rate": 1.9598224252683297e-06, + "loss": 0.1897, + "step": 17013 + }, + { + "epoch": 1.6027884411577684, + "grad_norm": 0.8371632695198059, + "learning_rate": 1.958924652193355e-06, + "loss": 0.2218, + "step": 17014 + }, + { + "epoch": 1.6028826452509362, + "grad_norm": 0.73078453540802, + "learning_rate": 1.9580270624673346e-06, + "loss": 0.2083, + "step": 17015 + }, + { + "epoch": 1.602976849344104, + "grad_norm": 0.6354836225509644, + "learning_rate": 1.9571296561107333e-06, + "loss": 0.1784, + "step": 17016 + }, + { + "epoch": 1.6030710534372719, + "grad_norm": 0.6360405683517456, + "learning_rate": 1.956232433144014e-06, + "loss": 0.1628, + "step": 17017 + }, + { + "epoch": 1.6031652575304398, + "grad_norm": 0.614295482635498, + "learning_rate": 1.9553353935876373e-06, + "loss": 0.1904, + "step": 17018 + }, + { + "epoch": 1.6032594616236076, + "grad_norm": 0.6972296833992004, + "learning_rate": 1.9544385374620525e-06, + "loss": 0.2285, + "step": 17019 + }, + { + "epoch": 1.6033536657167753, + "grad_norm": 0.663750946521759, + "learning_rate": 1.9535418647877146e-06, + "loss": 0.2058, + "step": 17020 + }, + { + "epoch": 1.6034478698099432, + "grad_norm": 0.6997053623199463, + "learning_rate": 1.952645375585064e-06, + "loss": 0.2247, + "step": 17021 + }, + { + "epoch": 1.6035420739031112, + "grad_norm": 0.6695412993431091, + "learning_rate": 1.9517490698745466e-06, + "loss": 0.2161, + "step": 17022 + }, + { + "epoch": 1.603636277996279, + "grad_norm": 0.6272669434547424, + "learning_rate": 1.9508529476765946e-06, + "loss": 0.2141, + "step": 17023 + }, + { + "epoch": 1.6037304820894467, + "grad_norm": 0.6253587007522583, + "learning_rate": 1.949957009011644e-06, + "loss": 0.1798, + "step": 17024 + }, + { + "epoch": 1.6038246861826146, + "grad_norm": 0.5858707427978516, + "learning_rate": 1.949061253900125e-06, + "loss": 0.1818, + "step": 17025 + }, + { + "epoch": 1.6039188902757826, + "grad_norm": 0.6380616426467896, + "learning_rate": 1.9481656823624586e-06, + "loss": 0.1562, + "step": 17026 + }, + { + "epoch": 1.6040130943689503, + "grad_norm": 0.6433847546577454, + "learning_rate": 1.9472702944190657e-06, + "loss": 0.1839, + "step": 17027 + }, + { + "epoch": 1.604107298462118, + "grad_norm": 0.6600707173347473, + "learning_rate": 1.9463750900903676e-06, + "loss": 0.2091, + "step": 17028 + }, + { + "epoch": 1.604201502555286, + "grad_norm": 0.6305294632911682, + "learning_rate": 1.94548006939677e-06, + "loss": 0.1952, + "step": 17029 + }, + { + "epoch": 1.604295706648454, + "grad_norm": 0.6270998120307922, + "learning_rate": 1.9445852323586835e-06, + "loss": 0.2067, + "step": 17030 + }, + { + "epoch": 1.6043899107416217, + "grad_norm": 0.7592684626579285, + "learning_rate": 1.9436905789965144e-06, + "loss": 0.2037, + "step": 17031 + }, + { + "epoch": 1.6044841148347895, + "grad_norm": 0.690092921257019, + "learning_rate": 1.9427961093306592e-06, + "loss": 0.2143, + "step": 17032 + }, + { + "epoch": 1.6045783189279574, + "grad_norm": 0.7281653881072998, + "learning_rate": 1.941901823381508e-06, + "loss": 0.2066, + "step": 17033 + }, + { + "epoch": 1.6046725230211254, + "grad_norm": 0.6461312770843506, + "learning_rate": 1.941007721169462e-06, + "loss": 0.1812, + "step": 17034 + }, + { + "epoch": 1.6047667271142931, + "grad_norm": 0.6451976895332336, + "learning_rate": 1.9401138027149036e-06, + "loss": 0.1843, + "step": 17035 + }, + { + "epoch": 1.6048609312074609, + "grad_norm": 0.7128356695175171, + "learning_rate": 1.9392200680382113e-06, + "loss": 0.181, + "step": 17036 + }, + { + "epoch": 1.6049551353006288, + "grad_norm": 0.595065176486969, + "learning_rate": 1.938326517159772e-06, + "loss": 0.1907, + "step": 17037 + }, + { + "epoch": 1.6050493393937968, + "grad_norm": 0.6782664060592651, + "learning_rate": 1.9374331500999554e-06, + "loss": 0.1907, + "step": 17038 + }, + { + "epoch": 1.6051435434869645, + "grad_norm": 0.6576317548751831, + "learning_rate": 1.9365399668791274e-06, + "loss": 0.2098, + "step": 17039 + }, + { + "epoch": 1.6052377475801323, + "grad_norm": 0.5967773199081421, + "learning_rate": 1.9356469675176637e-06, + "loss": 0.1806, + "step": 17040 + }, + { + "epoch": 1.6053319516733002, + "grad_norm": 0.7943901419639587, + "learning_rate": 1.9347541520359203e-06, + "loss": 0.2358, + "step": 17041 + }, + { + "epoch": 1.6054261557664682, + "grad_norm": 0.671789288520813, + "learning_rate": 1.933861520454253e-06, + "loss": 0.2017, + "step": 17042 + }, + { + "epoch": 1.605520359859636, + "grad_norm": 0.7067272067070007, + "learning_rate": 1.9329690727930185e-06, + "loss": 0.2119, + "step": 17043 + }, + { + "epoch": 1.6056145639528037, + "grad_norm": 0.6046809554100037, + "learning_rate": 1.932076809072567e-06, + "loss": 0.1871, + "step": 17044 + }, + { + "epoch": 1.6057087680459716, + "grad_norm": 0.812608540058136, + "learning_rate": 1.931184729313239e-06, + "loss": 0.2227, + "step": 17045 + }, + { + "epoch": 1.6058029721391396, + "grad_norm": 0.7496895790100098, + "learning_rate": 1.9302928335353775e-06, + "loss": 0.212, + "step": 17046 + }, + { + "epoch": 1.6058971762323073, + "grad_norm": 0.5570060014724731, + "learning_rate": 1.929401121759322e-06, + "loss": 0.177, + "step": 17047 + }, + { + "epoch": 1.605991380325475, + "grad_norm": 0.71700119972229, + "learning_rate": 1.9285095940054e-06, + "loss": 0.1881, + "step": 17048 + }, + { + "epoch": 1.606085584418643, + "grad_norm": 0.6562269330024719, + "learning_rate": 1.9276182502939424e-06, + "loss": 0.1872, + "step": 17049 + }, + { + "epoch": 1.606179788511811, + "grad_norm": 0.7206482291221619, + "learning_rate": 1.926727090645275e-06, + "loss": 0.2001, + "step": 17050 + }, + { + "epoch": 1.6062739926049787, + "grad_norm": 0.6981536746025085, + "learning_rate": 1.9258361150797135e-06, + "loss": 0.2127, + "step": 17051 + }, + { + "epoch": 1.6063681966981465, + "grad_norm": 0.7051399946212769, + "learning_rate": 1.9249453236175774e-06, + "loss": 0.2152, + "step": 17052 + }, + { + "epoch": 1.6064624007913144, + "grad_norm": 0.6378706097602844, + "learning_rate": 1.9240547162791736e-06, + "loss": 0.1631, + "step": 17053 + }, + { + "epoch": 1.6065566048844824, + "grad_norm": 0.6317290663719177, + "learning_rate": 1.9231642930848126e-06, + "loss": 0.1741, + "step": 17054 + }, + { + "epoch": 1.6066508089776501, + "grad_norm": 0.6226803660392761, + "learning_rate": 1.922274054054799e-06, + "loss": 0.1939, + "step": 17055 + }, + { + "epoch": 1.6067450130708179, + "grad_norm": 0.6561529636383057, + "learning_rate": 1.9213839992094264e-06, + "loss": 0.1811, + "step": 17056 + }, + { + "epoch": 1.6068392171639858, + "grad_norm": 0.6007041931152344, + "learning_rate": 1.9204941285689926e-06, + "loss": 0.1885, + "step": 17057 + }, + { + "epoch": 1.6069334212571538, + "grad_norm": 0.6497009992599487, + "learning_rate": 1.9196044421537907e-06, + "loss": 0.1969, + "step": 17058 + }, + { + "epoch": 1.6070276253503215, + "grad_norm": 7.670639514923096, + "learning_rate": 1.918714939984102e-06, + "loss": 0.1921, + "step": 17059 + }, + { + "epoch": 1.6071218294434892, + "grad_norm": 0.618530809879303, + "learning_rate": 1.917825622080213e-06, + "loss": 0.1538, + "step": 17060 + }, + { + "epoch": 1.6072160335366572, + "grad_norm": 0.6352978944778442, + "learning_rate": 1.9169364884623953e-06, + "loss": 0.1918, + "step": 17061 + }, + { + "epoch": 1.6073102376298252, + "grad_norm": 0.6399124264717102, + "learning_rate": 1.9160475391509304e-06, + "loss": 0.2012, + "step": 17062 + }, + { + "epoch": 1.607404441722993, + "grad_norm": 0.7114628553390503, + "learning_rate": 1.9151587741660803e-06, + "loss": 0.191, + "step": 17063 + }, + { + "epoch": 1.6074986458161606, + "grad_norm": 0.6930683255195618, + "learning_rate": 1.914270193528114e-06, + "loss": 0.2051, + "step": 17064 + }, + { + "epoch": 1.6075928499093286, + "grad_norm": 0.717863917350769, + "learning_rate": 1.9133817972572944e-06, + "loss": 0.1928, + "step": 17065 + }, + { + "epoch": 1.6076870540024966, + "grad_norm": 0.6552441716194153, + "learning_rate": 1.9124935853738734e-06, + "loss": 0.1801, + "step": 17066 + }, + { + "epoch": 1.6077812580956643, + "grad_norm": 0.6529089212417603, + "learning_rate": 1.9116055578981075e-06, + "loss": 0.205, + "step": 17067 + }, + { + "epoch": 1.607875462188832, + "grad_norm": 0.6935768127441406, + "learning_rate": 1.9107177148502443e-06, + "loss": 0.2242, + "step": 17068 + }, + { + "epoch": 1.607969666282, + "grad_norm": 0.603960394859314, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.1864, + "step": 17069 + }, + { + "epoch": 1.608063870375168, + "grad_norm": 0.6930103898048401, + "learning_rate": 1.908942582119194e-06, + "loss": 0.2089, + "step": 17070 + }, + { + "epoch": 1.6081580744683357, + "grad_norm": 0.7470356225967407, + "learning_rate": 1.9080552924764874e-06, + "loss": 0.1875, + "step": 17071 + }, + { + "epoch": 1.6082522785615034, + "grad_norm": 0.6638361811637878, + "learning_rate": 1.9071681873426319e-06, + "loss": 0.1741, + "step": 17072 + }, + { + "epoch": 1.6083464826546714, + "grad_norm": 0.6275650858879089, + "learning_rate": 1.9062812667378573e-06, + "loss": 0.1891, + "step": 17073 + }, + { + "epoch": 1.6084406867478394, + "grad_norm": 0.7013517022132874, + "learning_rate": 1.9053945306823884e-06, + "loss": 0.2197, + "step": 17074 + }, + { + "epoch": 1.608534890841007, + "grad_norm": 0.6467006802558899, + "learning_rate": 1.9045079791964439e-06, + "loss": 0.2125, + "step": 17075 + }, + { + "epoch": 1.6086290949341748, + "grad_norm": 0.6520530581474304, + "learning_rate": 1.9036216123002326e-06, + "loss": 0.2251, + "step": 17076 + }, + { + "epoch": 1.6087232990273428, + "grad_norm": 0.6930379867553711, + "learning_rate": 1.9027354300139743e-06, + "loss": 0.1927, + "step": 17077 + }, + { + "epoch": 1.6088175031205108, + "grad_norm": 0.632620632648468, + "learning_rate": 1.9018494323578707e-06, + "loss": 0.1939, + "step": 17078 + }, + { + "epoch": 1.6089117072136785, + "grad_norm": 0.6659923791885376, + "learning_rate": 1.9009636193521198e-06, + "loss": 0.1935, + "step": 17079 + }, + { + "epoch": 1.6090059113068462, + "grad_norm": 0.690721333026886, + "learning_rate": 1.9000779910169277e-06, + "loss": 0.1875, + "step": 17080 + }, + { + "epoch": 1.6091001154000142, + "grad_norm": 0.6876536011695862, + "learning_rate": 1.8991925473724848e-06, + "loss": 0.2163, + "step": 17081 + }, + { + "epoch": 1.609194319493182, + "grad_norm": 0.9192591309547424, + "learning_rate": 1.8983072884389753e-06, + "loss": 0.1845, + "step": 17082 + }, + { + "epoch": 1.6092885235863497, + "grad_norm": 0.8170506358146667, + "learning_rate": 1.8974222142365938e-06, + "loss": 0.2015, + "step": 17083 + }, + { + "epoch": 1.6093827276795176, + "grad_norm": 0.6728833913803101, + "learning_rate": 1.8965373247855168e-06, + "loss": 0.1778, + "step": 17084 + }, + { + "epoch": 1.6094769317726856, + "grad_norm": 0.6478422284126282, + "learning_rate": 1.8956526201059179e-06, + "loss": 0.1671, + "step": 17085 + }, + { + "epoch": 1.6095711358658533, + "grad_norm": 0.7013288736343384, + "learning_rate": 1.8947681002179729e-06, + "loss": 0.1915, + "step": 17086 + }, + { + "epoch": 1.609665339959021, + "grad_norm": 0.6429296135902405, + "learning_rate": 1.893883765141854e-06, + "loss": 0.1703, + "step": 17087 + }, + { + "epoch": 1.609759544052189, + "grad_norm": 0.6315440535545349, + "learning_rate": 1.8929996148977181e-06, + "loss": 0.1646, + "step": 17088 + }, + { + "epoch": 1.609853748145357, + "grad_norm": 0.7238953709602356, + "learning_rate": 1.892115649505728e-06, + "loss": 0.1846, + "step": 17089 + }, + { + "epoch": 1.6099479522385247, + "grad_norm": 0.7134349942207336, + "learning_rate": 1.8912318689860444e-06, + "loss": 0.1904, + "step": 17090 + }, + { + "epoch": 1.6100421563316925, + "grad_norm": 0.6606116890907288, + "learning_rate": 1.8903482733588108e-06, + "loss": 0.2081, + "step": 17091 + }, + { + "epoch": 1.6101363604248604, + "grad_norm": 1.245687484741211, + "learning_rate": 1.8894648626441792e-06, + "loss": 0.2144, + "step": 17092 + }, + { + "epoch": 1.6102305645180284, + "grad_norm": 0.6560180187225342, + "learning_rate": 1.8885816368622945e-06, + "loss": 0.19, + "step": 17093 + }, + { + "epoch": 1.6103247686111961, + "grad_norm": 1.1060028076171875, + "learning_rate": 1.8876985960332894e-06, + "loss": 0.2134, + "step": 17094 + }, + { + "epoch": 1.6104189727043638, + "grad_norm": 0.6218519806861877, + "learning_rate": 1.8868157401773058e-06, + "loss": 0.1846, + "step": 17095 + }, + { + "epoch": 1.6105131767975318, + "grad_norm": 0.6682586073875427, + "learning_rate": 1.8859330693144674e-06, + "loss": 0.207, + "step": 17096 + }, + { + "epoch": 1.6106073808906998, + "grad_norm": 0.7348806262016296, + "learning_rate": 1.885050583464907e-06, + "loss": 0.1848, + "step": 17097 + }, + { + "epoch": 1.6107015849838675, + "grad_norm": 0.7002991437911987, + "learning_rate": 1.8841682826487396e-06, + "loss": 0.2116, + "step": 17098 + }, + { + "epoch": 1.6107957890770352, + "grad_norm": 0.6557016968727112, + "learning_rate": 1.883286166886088e-06, + "loss": 0.1983, + "step": 17099 + }, + { + "epoch": 1.6108899931702032, + "grad_norm": 0.9070132970809937, + "learning_rate": 1.8824042361970663e-06, + "loss": 0.1883, + "step": 17100 + }, + { + "epoch": 1.6109841972633712, + "grad_norm": 0.6374247670173645, + "learning_rate": 1.8815224906017792e-06, + "loss": 0.1972, + "step": 17101 + }, + { + "epoch": 1.611078401356539, + "grad_norm": 0.7107747197151184, + "learning_rate": 1.8806409301203355e-06, + "loss": 0.1891, + "step": 17102 + }, + { + "epoch": 1.6111726054497066, + "grad_norm": 0.6548818945884705, + "learning_rate": 1.8797595547728386e-06, + "loss": 0.2045, + "step": 17103 + }, + { + "epoch": 1.6112668095428746, + "grad_norm": 0.7211184501647949, + "learning_rate": 1.8788783645793784e-06, + "loss": 0.2218, + "step": 17104 + }, + { + "epoch": 1.6113610136360426, + "grad_norm": 0.5960847735404968, + "learning_rate": 1.8779973595600543e-06, + "loss": 0.1847, + "step": 17105 + }, + { + "epoch": 1.6114552177292103, + "grad_norm": 0.7239816188812256, + "learning_rate": 1.8771165397349478e-06, + "loss": 0.2077, + "step": 17106 + }, + { + "epoch": 1.611549421822378, + "grad_norm": 0.6416701674461365, + "learning_rate": 1.8762359051241473e-06, + "loss": 0.1878, + "step": 17107 + }, + { + "epoch": 1.611643625915546, + "grad_norm": 0.6655055284500122, + "learning_rate": 1.8753554557477337e-06, + "loss": 0.2004, + "step": 17108 + }, + { + "epoch": 1.611737830008714, + "grad_norm": 0.6688492894172668, + "learning_rate": 1.8744751916257785e-06, + "loss": 0.1896, + "step": 17109 + }, + { + "epoch": 1.6118320341018817, + "grad_norm": 0.7001370787620544, + "learning_rate": 1.8735951127783557e-06, + "loss": 0.2163, + "step": 17110 + }, + { + "epoch": 1.6119262381950494, + "grad_norm": 0.7874501943588257, + "learning_rate": 1.8727152192255339e-06, + "loss": 0.2219, + "step": 17111 + }, + { + "epoch": 1.6120204422882174, + "grad_norm": 0.673001766204834, + "learning_rate": 1.871835510987372e-06, + "loss": 0.2036, + "step": 17112 + }, + { + "epoch": 1.6121146463813854, + "grad_norm": 0.6884879469871521, + "learning_rate": 1.8709559880839312e-06, + "loss": 0.1888, + "step": 17113 + }, + { + "epoch": 1.612208850474553, + "grad_norm": 0.6903436183929443, + "learning_rate": 1.8700766505352686e-06, + "loss": 0.2128, + "step": 17114 + }, + { + "epoch": 1.6123030545677208, + "grad_norm": 0.6216661930084229, + "learning_rate": 1.8691974983614303e-06, + "loss": 0.1816, + "step": 17115 + }, + { + "epoch": 1.6123972586608888, + "grad_norm": 0.6292722821235657, + "learning_rate": 1.8683185315824592e-06, + "loss": 0.1862, + "step": 17116 + }, + { + "epoch": 1.6124914627540567, + "grad_norm": 0.5919272899627686, + "learning_rate": 1.867439750218406e-06, + "loss": 0.172, + "step": 17117 + }, + { + "epoch": 1.6125856668472245, + "grad_norm": 0.632280707359314, + "learning_rate": 1.8665611542893047e-06, + "loss": 0.1947, + "step": 17118 + }, + { + "epoch": 1.6126798709403922, + "grad_norm": 0.6402468681335449, + "learning_rate": 1.8656827438151815e-06, + "loss": 0.1773, + "step": 17119 + }, + { + "epoch": 1.6127740750335602, + "grad_norm": 0.628898024559021, + "learning_rate": 1.864804518816078e-06, + "loss": 0.198, + "step": 17120 + }, + { + "epoch": 1.6128682791267281, + "grad_norm": 0.6768243312835693, + "learning_rate": 1.8639264793120114e-06, + "loss": 0.1948, + "step": 17121 + }, + { + "epoch": 1.6129624832198959, + "grad_norm": 0.6429453492164612, + "learning_rate": 1.8630486253230017e-06, + "loss": 0.1894, + "step": 17122 + }, + { + "epoch": 1.6130566873130636, + "grad_norm": 0.6443604230880737, + "learning_rate": 1.8621709568690683e-06, + "loss": 0.1767, + "step": 17123 + }, + { + "epoch": 1.6131508914062316, + "grad_norm": 0.6549245715141296, + "learning_rate": 1.8612934739702237e-06, + "loss": 0.1852, + "step": 17124 + }, + { + "epoch": 1.6132450954993995, + "grad_norm": 0.6844817399978638, + "learning_rate": 1.8604161766464734e-06, + "loss": 0.1986, + "step": 17125 + }, + { + "epoch": 1.6133392995925673, + "grad_norm": 0.6662511229515076, + "learning_rate": 1.8595390649178214e-06, + "loss": 0.1713, + "step": 17126 + }, + { + "epoch": 1.613433503685735, + "grad_norm": 0.699615478515625, + "learning_rate": 1.8586621388042713e-06, + "loss": 0.2162, + "step": 17127 + }, + { + "epoch": 1.613527707778903, + "grad_norm": 0.7042679786682129, + "learning_rate": 1.857785398325812e-06, + "loss": 0.2162, + "step": 17128 + }, + { + "epoch": 1.613621911872071, + "grad_norm": 0.7047753930091858, + "learning_rate": 1.8569088435024385e-06, + "loss": 0.1899, + "step": 17129 + }, + { + "epoch": 1.6137161159652387, + "grad_norm": 0.6133434772491455, + "learning_rate": 1.8560324743541391e-06, + "loss": 0.1823, + "step": 17130 + }, + { + "epoch": 1.6138103200584064, + "grad_norm": 0.6867203712463379, + "learning_rate": 1.8551562909008925e-06, + "loss": 0.202, + "step": 17131 + }, + { + "epoch": 1.6139045241515744, + "grad_norm": 0.6246338486671448, + "learning_rate": 1.854280293162678e-06, + "loss": 0.1955, + "step": 17132 + }, + { + "epoch": 1.6139987282447423, + "grad_norm": 0.701133668422699, + "learning_rate": 1.8534044811594721e-06, + "loss": 0.1999, + "step": 17133 + }, + { + "epoch": 1.61409293233791, + "grad_norm": 0.6869482398033142, + "learning_rate": 1.852528854911242e-06, + "loss": 0.2133, + "step": 17134 + }, + { + "epoch": 1.6141871364310778, + "grad_norm": 0.6875615119934082, + "learning_rate": 1.851653414437956e-06, + "loss": 0.1913, + "step": 17135 + }, + { + "epoch": 1.6142813405242458, + "grad_norm": 0.6641170978546143, + "learning_rate": 1.8507781597595709e-06, + "loss": 0.204, + "step": 17136 + }, + { + "epoch": 1.6143755446174137, + "grad_norm": 0.5416911244392395, + "learning_rate": 1.8499030908960502e-06, + "loss": 0.1708, + "step": 17137 + }, + { + "epoch": 1.6144697487105815, + "grad_norm": 0.6619910001754761, + "learning_rate": 1.8490282078673405e-06, + "loss": 0.2056, + "step": 17138 + }, + { + "epoch": 1.6145639528037492, + "grad_norm": 0.6576054692268372, + "learning_rate": 1.8481535106933935e-06, + "loss": 0.1913, + "step": 17139 + }, + { + "epoch": 1.6146581568969172, + "grad_norm": 0.6916207075119019, + "learning_rate": 1.8472789993941554e-06, + "loss": 0.2053, + "step": 17140 + }, + { + "epoch": 1.6147523609900851, + "grad_norm": 0.60109943151474, + "learning_rate": 1.8464046739895625e-06, + "loss": 0.178, + "step": 17141 + }, + { + "epoch": 1.6148465650832529, + "grad_norm": 0.7228315472602844, + "learning_rate": 1.8455305344995523e-06, + "loss": 0.2126, + "step": 17142 + }, + { + "epoch": 1.6149407691764206, + "grad_norm": 0.6615751385688782, + "learning_rate": 1.844656580944061e-06, + "loss": 0.1949, + "step": 17143 + }, + { + "epoch": 1.6150349732695886, + "grad_norm": 0.6760201454162598, + "learning_rate": 1.843782813343008e-06, + "loss": 0.1635, + "step": 17144 + }, + { + "epoch": 1.6151291773627565, + "grad_norm": 0.668273389339447, + "learning_rate": 1.8429092317163244e-06, + "loss": 0.181, + "step": 17145 + }, + { + "epoch": 1.6152233814559243, + "grad_norm": 0.6613141894340515, + "learning_rate": 1.842035836083922e-06, + "loss": 0.2068, + "step": 17146 + }, + { + "epoch": 1.615317585549092, + "grad_norm": 0.6070382595062256, + "learning_rate": 1.84116262646572e-06, + "loss": 0.1963, + "step": 17147 + }, + { + "epoch": 1.61541178964226, + "grad_norm": 0.5954920649528503, + "learning_rate": 1.8402896028816298e-06, + "loss": 0.2059, + "step": 17148 + }, + { + "epoch": 1.615505993735428, + "grad_norm": 0.6403561234474182, + "learning_rate": 1.8394167653515537e-06, + "loss": 0.1692, + "step": 17149 + }, + { + "epoch": 1.6156001978285957, + "grad_norm": 1.1120206117630005, + "learning_rate": 1.8385441138953952e-06, + "loss": 0.1819, + "step": 17150 + }, + { + "epoch": 1.6156944019217634, + "grad_norm": 0.6109408736228943, + "learning_rate": 1.837671648533057e-06, + "loss": 0.1758, + "step": 17151 + }, + { + "epoch": 1.6157886060149314, + "grad_norm": 0.8498703241348267, + "learning_rate": 1.8367993692844244e-06, + "loss": 0.2524, + "step": 17152 + }, + { + "epoch": 1.6158828101080993, + "grad_norm": 0.6426171660423279, + "learning_rate": 1.8359272761693915e-06, + "loss": 0.1947, + "step": 17153 + }, + { + "epoch": 1.615977014201267, + "grad_norm": 0.7028778791427612, + "learning_rate": 1.8350553692078454e-06, + "loss": 0.2042, + "step": 17154 + }, + { + "epoch": 1.6160712182944348, + "grad_norm": 0.64783775806427, + "learning_rate": 1.834183648419664e-06, + "loss": 0.1795, + "step": 17155 + }, + { + "epoch": 1.6161654223876027, + "grad_norm": 0.6946761012077332, + "learning_rate": 1.833312113824719e-06, + "loss": 0.1976, + "step": 17156 + }, + { + "epoch": 1.6162596264807707, + "grad_norm": 0.7017341256141663, + "learning_rate": 1.8324407654428921e-06, + "loss": 0.2018, + "step": 17157 + }, + { + "epoch": 1.6163538305739384, + "grad_norm": 0.6161543726921082, + "learning_rate": 1.8315696032940478e-06, + "loss": 0.1805, + "step": 17158 + }, + { + "epoch": 1.6164480346671062, + "grad_norm": 0.6772741079330444, + "learning_rate": 1.8306986273980442e-06, + "loss": 0.2029, + "step": 17159 + }, + { + "epoch": 1.6165422387602741, + "grad_norm": 0.6760283708572388, + "learning_rate": 1.8298278377747513e-06, + "loss": 0.2163, + "step": 17160 + }, + { + "epoch": 1.616636442853442, + "grad_norm": 0.6412596702575684, + "learning_rate": 1.8289572344440198e-06, + "loss": 0.1903, + "step": 17161 + }, + { + "epoch": 1.6167306469466098, + "grad_norm": 0.7114439010620117, + "learning_rate": 1.828086817425696e-06, + "loss": 0.2194, + "step": 17162 + }, + { + "epoch": 1.6168248510397776, + "grad_norm": 0.6724783182144165, + "learning_rate": 1.8272165867396318e-06, + "loss": 0.2137, + "step": 17163 + }, + { + "epoch": 1.6169190551329455, + "grad_norm": 0.6525052785873413, + "learning_rate": 1.8263465424056714e-06, + "loss": 0.2101, + "step": 17164 + }, + { + "epoch": 1.6170132592261135, + "grad_norm": 0.6216458678245544, + "learning_rate": 1.8254766844436477e-06, + "loss": 0.1812, + "step": 17165 + }, + { + "epoch": 1.6171074633192812, + "grad_norm": 0.625395655632019, + "learning_rate": 1.8246070128733984e-06, + "loss": 0.1848, + "step": 17166 + }, + { + "epoch": 1.617201667412449, + "grad_norm": 0.706031858921051, + "learning_rate": 1.823737527714754e-06, + "loss": 0.231, + "step": 17167 + }, + { + "epoch": 1.617295871505617, + "grad_norm": 0.6518822908401489, + "learning_rate": 1.8228682289875376e-06, + "loss": 0.2093, + "step": 17168 + }, + { + "epoch": 1.617390075598785, + "grad_norm": 0.6999724507331848, + "learning_rate": 1.8219991167115702e-06, + "loss": 0.2365, + "step": 17169 + }, + { + "epoch": 1.6174842796919526, + "grad_norm": 0.7110457420349121, + "learning_rate": 1.8211301909066747e-06, + "loss": 0.2039, + "step": 17170 + }, + { + "epoch": 1.6175784837851204, + "grad_norm": 0.7404806017875671, + "learning_rate": 1.8202614515926565e-06, + "loss": 0.2306, + "step": 17171 + }, + { + "epoch": 1.6176726878782883, + "grad_norm": 0.6038290858268738, + "learning_rate": 1.8193928987893273e-06, + "loss": 0.1804, + "step": 17172 + }, + { + "epoch": 1.6177668919714563, + "grad_norm": 0.614624559879303, + "learning_rate": 1.8185245325164935e-06, + "loss": 0.2137, + "step": 17173 + }, + { + "epoch": 1.617861096064624, + "grad_norm": 0.6252606511116028, + "learning_rate": 1.8176563527939518e-06, + "loss": 0.1702, + "step": 17174 + }, + { + "epoch": 1.6179553001577918, + "grad_norm": 0.6974897384643555, + "learning_rate": 1.8167883596415014e-06, + "loss": 0.2177, + "step": 17175 + }, + { + "epoch": 1.6180495042509597, + "grad_norm": 0.6368634700775146, + "learning_rate": 1.8159205530789303e-06, + "loss": 0.1813, + "step": 17176 + }, + { + "epoch": 1.6181437083441277, + "grad_norm": 0.7428598403930664, + "learning_rate": 1.8150529331260292e-06, + "loss": 0.2222, + "step": 17177 + }, + { + "epoch": 1.6182379124372954, + "grad_norm": 0.6269071102142334, + "learning_rate": 1.8141854998025766e-06, + "loss": 0.2157, + "step": 17178 + }, + { + "epoch": 1.6183321165304632, + "grad_norm": 0.7417349815368652, + "learning_rate": 1.8133182531283555e-06, + "loss": 0.2044, + "step": 17179 + }, + { + "epoch": 1.6184263206236311, + "grad_norm": 0.6804085969924927, + "learning_rate": 1.8124511931231403e-06, + "loss": 0.1962, + "step": 17180 + }, + { + "epoch": 1.618520524716799, + "grad_norm": 0.6238267421722412, + "learning_rate": 1.8115843198066984e-06, + "loss": 0.163, + "step": 17181 + }, + { + "epoch": 1.6186147288099668, + "grad_norm": 0.7084079384803772, + "learning_rate": 1.8107176331987974e-06, + "loss": 0.1904, + "step": 17182 + }, + { + "epoch": 1.6187089329031346, + "grad_norm": 0.6880285143852234, + "learning_rate": 1.8098511333192026e-06, + "loss": 0.2023, + "step": 17183 + }, + { + "epoch": 1.6188031369963025, + "grad_norm": 0.6665380001068115, + "learning_rate": 1.8089848201876648e-06, + "loss": 0.2026, + "step": 17184 + }, + { + "epoch": 1.6188973410894705, + "grad_norm": 0.6488173604011536, + "learning_rate": 1.8081186938239437e-06, + "loss": 0.1846, + "step": 17185 + }, + { + "epoch": 1.6189915451826382, + "grad_norm": 0.6484012603759766, + "learning_rate": 1.8072527542477825e-06, + "loss": 0.2002, + "step": 17186 + }, + { + "epoch": 1.619085749275806, + "grad_norm": 1.004101276397705, + "learning_rate": 1.8063870014789297e-06, + "loss": 0.2189, + "step": 17187 + }, + { + "epoch": 1.619179953368974, + "grad_norm": 0.6525694727897644, + "learning_rate": 1.8055214355371265e-06, + "loss": 0.1867, + "step": 17188 + }, + { + "epoch": 1.6192741574621419, + "grad_norm": 0.6535707712173462, + "learning_rate": 1.8046560564421056e-06, + "loss": 0.1901, + "step": 17189 + }, + { + "epoch": 1.6193683615553096, + "grad_norm": 0.6303517818450928, + "learning_rate": 1.8037908642136004e-06, + "loss": 0.1854, + "step": 17190 + }, + { + "epoch": 1.6194625656484773, + "grad_norm": 0.7350384593009949, + "learning_rate": 1.8029258588713427e-06, + "loss": 0.1883, + "step": 17191 + }, + { + "epoch": 1.6195567697416453, + "grad_norm": 0.5876874327659607, + "learning_rate": 1.8020610404350492e-06, + "loss": 0.1805, + "step": 17192 + }, + { + "epoch": 1.6196509738348133, + "grad_norm": 0.6583977937698364, + "learning_rate": 1.8011964089244415e-06, + "loss": 0.2002, + "step": 17193 + }, + { + "epoch": 1.619745177927981, + "grad_norm": 0.7399848103523254, + "learning_rate": 1.8003319643592388e-06, + "loss": 0.1956, + "step": 17194 + }, + { + "epoch": 1.6198393820211487, + "grad_norm": 0.8242473602294922, + "learning_rate": 1.7994677067591493e-06, + "loss": 0.1885, + "step": 17195 + }, + { + "epoch": 1.6199335861143167, + "grad_norm": 0.6625346541404724, + "learning_rate": 1.7986036361438718e-06, + "loss": 0.2006, + "step": 17196 + }, + { + "epoch": 1.6200277902074847, + "grad_norm": 0.670408308506012, + "learning_rate": 1.7977397525331208e-06, + "loss": 0.179, + "step": 17197 + }, + { + "epoch": 1.6201219943006524, + "grad_norm": 0.7978299260139465, + "learning_rate": 1.7968760559465891e-06, + "loss": 0.2172, + "step": 17198 + }, + { + "epoch": 1.6202161983938201, + "grad_norm": 0.6636545062065125, + "learning_rate": 1.7960125464039636e-06, + "loss": 0.1752, + "step": 17199 + }, + { + "epoch": 1.620310402486988, + "grad_norm": 0.7238003611564636, + "learning_rate": 1.7951492239249457e-06, + "loss": 0.2306, + "step": 17200 + }, + { + "epoch": 1.620404606580156, + "grad_norm": 0.6232163906097412, + "learning_rate": 1.7942860885292135e-06, + "loss": 0.196, + "step": 17201 + }, + { + "epoch": 1.6204988106733238, + "grad_norm": 0.6866005063056946, + "learning_rate": 1.7934231402364466e-06, + "loss": 0.1997, + "step": 17202 + }, + { + "epoch": 1.6205930147664915, + "grad_norm": 0.6618146300315857, + "learning_rate": 1.7925603790663247e-06, + "loss": 0.1906, + "step": 17203 + }, + { + "epoch": 1.6206872188596595, + "grad_norm": 0.5978967547416687, + "learning_rate": 1.7916978050385215e-06, + "loss": 0.1895, + "step": 17204 + }, + { + "epoch": 1.6207814229528275, + "grad_norm": 0.7060754895210266, + "learning_rate": 1.7908354181726994e-06, + "loss": 0.2291, + "step": 17205 + }, + { + "epoch": 1.6208756270459952, + "grad_norm": 0.643155038356781, + "learning_rate": 1.7899732184885255e-06, + "loss": 0.2077, + "step": 17206 + }, + { + "epoch": 1.620969831139163, + "grad_norm": 0.6400883793830872, + "learning_rate": 1.7891112060056626e-06, + "loss": 0.1817, + "step": 17207 + }, + { + "epoch": 1.621064035232331, + "grad_norm": 0.6722229719161987, + "learning_rate": 1.7882493807437596e-06, + "loss": 0.1776, + "step": 17208 + }, + { + "epoch": 1.6211582393254989, + "grad_norm": 0.715329110622406, + "learning_rate": 1.7873877427224706e-06, + "loss": 0.2024, + "step": 17209 + }, + { + "epoch": 1.6212524434186666, + "grad_norm": 0.7191175222396851, + "learning_rate": 1.7865262919614446e-06, + "loss": 0.2078, + "step": 17210 + }, + { + "epoch": 1.6213466475118343, + "grad_norm": 0.6324275135993958, + "learning_rate": 1.785665028480319e-06, + "loss": 0.1847, + "step": 17211 + }, + { + "epoch": 1.6214408516050023, + "grad_norm": 0.5900797843933105, + "learning_rate": 1.7848039522987347e-06, + "loss": 0.1553, + "step": 17212 + }, + { + "epoch": 1.6215350556981702, + "grad_norm": 0.62995845079422, + "learning_rate": 1.7839430634363274e-06, + "loss": 0.2161, + "step": 17213 + }, + { + "epoch": 1.621629259791338, + "grad_norm": 0.686939537525177, + "learning_rate": 1.7830823619127246e-06, + "loss": 0.2007, + "step": 17214 + }, + { + "epoch": 1.6217234638845057, + "grad_norm": 0.6298737525939941, + "learning_rate": 1.7822218477475496e-06, + "loss": 0.1777, + "step": 17215 + }, + { + "epoch": 1.6218176679776737, + "grad_norm": 0.6338664889335632, + "learning_rate": 1.7813615209604252e-06, + "loss": 0.2047, + "step": 17216 + }, + { + "epoch": 1.6219118720708416, + "grad_norm": 0.6423327326774597, + "learning_rate": 1.7805013815709715e-06, + "loss": 0.1799, + "step": 17217 + }, + { + "epoch": 1.6220060761640094, + "grad_norm": 0.6747575402259827, + "learning_rate": 1.7796414295987952e-06, + "loss": 0.2167, + "step": 17218 + }, + { + "epoch": 1.6221002802571771, + "grad_norm": 0.6236206889152527, + "learning_rate": 1.778781665063507e-06, + "loss": 0.1851, + "step": 17219 + }, + { + "epoch": 1.622194484350345, + "grad_norm": 0.5348717570304871, + "learning_rate": 1.7779220879847136e-06, + "loss": 0.1685, + "step": 17220 + }, + { + "epoch": 1.6222886884435128, + "grad_norm": 0.6100355386734009, + "learning_rate": 1.7770626983820105e-06, + "loss": 0.1873, + "step": 17221 + }, + { + "epoch": 1.6223828925366806, + "grad_norm": 0.6110270619392395, + "learning_rate": 1.7762034962749941e-06, + "loss": 0.1763, + "step": 17222 + }, + { + "epoch": 1.6224770966298485, + "grad_norm": 0.6329869627952576, + "learning_rate": 1.7753444816832588e-06, + "loss": 0.1836, + "step": 17223 + }, + { + "epoch": 1.6225713007230165, + "grad_norm": 0.7160716652870178, + "learning_rate": 1.7744856546263868e-06, + "loss": 0.2174, + "step": 17224 + }, + { + "epoch": 1.6226655048161842, + "grad_norm": 0.6878299713134766, + "learning_rate": 1.7736270151239655e-06, + "loss": 0.2144, + "step": 17225 + }, + { + "epoch": 1.622759708909352, + "grad_norm": 0.6290382146835327, + "learning_rate": 1.7727685631955682e-06, + "loss": 0.1849, + "step": 17226 + }, + { + "epoch": 1.62285391300252, + "grad_norm": 0.6803714632987976, + "learning_rate": 1.7719102988607716e-06, + "loss": 0.2088, + "step": 17227 + }, + { + "epoch": 1.6229481170956879, + "grad_norm": 0.6757082343101501, + "learning_rate": 1.771052222139147e-06, + "loss": 0.19, + "step": 17228 + }, + { + "epoch": 1.6230423211888556, + "grad_norm": 0.6137799024581909, + "learning_rate": 1.7701943330502558e-06, + "loss": 0.1797, + "step": 17229 + }, + { + "epoch": 1.6231365252820233, + "grad_norm": 0.6618055701255798, + "learning_rate": 1.7693366316136618e-06, + "loss": 0.2008, + "step": 17230 + }, + { + "epoch": 1.6232307293751913, + "grad_norm": 0.5803620219230652, + "learning_rate": 1.7684791178489236e-06, + "loss": 0.1866, + "step": 17231 + }, + { + "epoch": 1.6233249334683593, + "grad_norm": 0.6533998250961304, + "learning_rate": 1.7676217917755889e-06, + "loss": 0.1974, + "step": 17232 + }, + { + "epoch": 1.623419137561527, + "grad_norm": 0.6308205127716064, + "learning_rate": 1.76676465341321e-06, + "loss": 0.2137, + "step": 17233 + }, + { + "epoch": 1.6235133416546947, + "grad_norm": 0.7170531749725342, + "learning_rate": 1.765907702781332e-06, + "loss": 0.181, + "step": 17234 + }, + { + "epoch": 1.6236075457478627, + "grad_norm": 0.6850894093513489, + "learning_rate": 1.7650509398994919e-06, + "loss": 0.1876, + "step": 17235 + }, + { + "epoch": 1.6237017498410307, + "grad_norm": 0.6540127992630005, + "learning_rate": 1.7641943647872217e-06, + "loss": 0.2041, + "step": 17236 + }, + { + "epoch": 1.6237959539341984, + "grad_norm": 0.6298664808273315, + "learning_rate": 1.7633379774640624e-06, + "loss": 0.2044, + "step": 17237 + }, + { + "epoch": 1.6238901580273661, + "grad_norm": 0.6866254210472107, + "learning_rate": 1.762481777949534e-06, + "loss": 0.191, + "step": 17238 + }, + { + "epoch": 1.623984362120534, + "grad_norm": 0.693366527557373, + "learning_rate": 1.7616257662631597e-06, + "loss": 0.1958, + "step": 17239 + }, + { + "epoch": 1.624078566213702, + "grad_norm": 0.6166165471076965, + "learning_rate": 1.7607699424244583e-06, + "loss": 0.1974, + "step": 17240 + }, + { + "epoch": 1.6241727703068698, + "grad_norm": 0.6323557496070862, + "learning_rate": 1.7599143064529467e-06, + "loss": 0.207, + "step": 17241 + }, + { + "epoch": 1.6242669744000375, + "grad_norm": 0.6584812998771667, + "learning_rate": 1.7590588583681301e-06, + "loss": 0.1971, + "step": 17242 + }, + { + "epoch": 1.6243611784932055, + "grad_norm": 0.6964744329452515, + "learning_rate": 1.7582035981895163e-06, + "loss": 0.1947, + "step": 17243 + }, + { + "epoch": 1.6244553825863735, + "grad_norm": 0.6552649140357971, + "learning_rate": 1.7573485259366086e-06, + "loss": 0.1832, + "step": 17244 + }, + { + "epoch": 1.6245495866795412, + "grad_norm": 0.6573282480239868, + "learning_rate": 1.7564936416289003e-06, + "loss": 0.2029, + "step": 17245 + }, + { + "epoch": 1.624643790772709, + "grad_norm": 0.7544728517532349, + "learning_rate": 1.7556389452858858e-06, + "loss": 0.2356, + "step": 17246 + }, + { + "epoch": 1.6247379948658769, + "grad_norm": 0.7063323259353638, + "learning_rate": 1.7547844369270551e-06, + "loss": 0.1909, + "step": 17247 + }, + { + "epoch": 1.6248321989590448, + "grad_norm": 0.6265422105789185, + "learning_rate": 1.7539301165718892e-06, + "loss": 0.1673, + "step": 17248 + }, + { + "epoch": 1.6249264030522126, + "grad_norm": 0.7208700180053711, + "learning_rate": 1.7530759842398693e-06, + "loss": 0.2349, + "step": 17249 + }, + { + "epoch": 1.6250206071453803, + "grad_norm": 0.621737003326416, + "learning_rate": 1.752222039950473e-06, + "loss": 0.1807, + "step": 17250 + }, + { + "epoch": 1.6251148112385483, + "grad_norm": 0.6207723021507263, + "learning_rate": 1.7513682837231672e-06, + "loss": 0.168, + "step": 17251 + }, + { + "epoch": 1.6252090153317162, + "grad_norm": 0.6770493984222412, + "learning_rate": 1.7505147155774216e-06, + "loss": 0.2156, + "step": 17252 + }, + { + "epoch": 1.625303219424884, + "grad_norm": 0.6314629912376404, + "learning_rate": 1.749661335532702e-06, + "loss": 0.2019, + "step": 17253 + }, + { + "epoch": 1.6253974235180517, + "grad_norm": 0.6600848436355591, + "learning_rate": 1.7488081436084613e-06, + "loss": 0.1967, + "step": 17254 + }, + { + "epoch": 1.6254916276112197, + "grad_norm": 0.6188461780548096, + "learning_rate": 1.7479551398241546e-06, + "loss": 0.1984, + "step": 17255 + }, + { + "epoch": 1.6255858317043876, + "grad_norm": 0.6627752780914307, + "learning_rate": 1.7471023241992325e-06, + "loss": 0.1576, + "step": 17256 + }, + { + "epoch": 1.6256800357975554, + "grad_norm": 0.5846261978149414, + "learning_rate": 1.746249696753143e-06, + "loss": 0.1853, + "step": 17257 + }, + { + "epoch": 1.6257742398907231, + "grad_norm": 0.6488931775093079, + "learning_rate": 1.7453972575053224e-06, + "loss": 0.1951, + "step": 17258 + }, + { + "epoch": 1.625868443983891, + "grad_norm": 0.6597906351089478, + "learning_rate": 1.744545006475211e-06, + "loss": 0.206, + "step": 17259 + }, + { + "epoch": 1.625962648077059, + "grad_norm": 0.6859642863273621, + "learning_rate": 1.7436929436822425e-06, + "loss": 0.1766, + "step": 17260 + }, + { + "epoch": 1.6260568521702268, + "grad_norm": 0.5902611017227173, + "learning_rate": 1.742841069145842e-06, + "loss": 0.1799, + "step": 17261 + }, + { + "epoch": 1.6261510562633945, + "grad_norm": 0.6606028079986572, + "learning_rate": 1.7419893828854339e-06, + "loss": 0.2033, + "step": 17262 + }, + { + "epoch": 1.6262452603565625, + "grad_norm": 0.6939049959182739, + "learning_rate": 1.741137884920443e-06, + "loss": 0.1956, + "step": 17263 + }, + { + "epoch": 1.6263394644497304, + "grad_norm": 0.6413030624389648, + "learning_rate": 1.740286575270277e-06, + "loss": 0.2069, + "step": 17264 + }, + { + "epoch": 1.6264336685428982, + "grad_norm": 0.6549431681632996, + "learning_rate": 1.7394354539543545e-06, + "loss": 0.2089, + "step": 17265 + }, + { + "epoch": 1.626527872636066, + "grad_norm": 0.6486162543296814, + "learning_rate": 1.738584520992076e-06, + "loss": 0.192, + "step": 17266 + }, + { + "epoch": 1.6266220767292339, + "grad_norm": 0.633261501789093, + "learning_rate": 1.7377337764028468e-06, + "loss": 0.1998, + "step": 17267 + }, + { + "epoch": 1.6267162808224018, + "grad_norm": 0.6332877278327942, + "learning_rate": 1.7368832202060682e-06, + "loss": 0.1714, + "step": 17268 + }, + { + "epoch": 1.6268104849155696, + "grad_norm": 0.6749432682991028, + "learning_rate": 1.7360328524211278e-06, + "loss": 0.2122, + "step": 17269 + }, + { + "epoch": 1.6269046890087373, + "grad_norm": 0.683599591255188, + "learning_rate": 1.7351826730674192e-06, + "loss": 0.2088, + "step": 17270 + }, + { + "epoch": 1.6269988931019053, + "grad_norm": 0.6694850325584412, + "learning_rate": 1.7343326821643314e-06, + "loss": 0.1943, + "step": 17271 + }, + { + "epoch": 1.6270930971950732, + "grad_norm": 0.6966886520385742, + "learning_rate": 1.7334828797312375e-06, + "loss": 0.214, + "step": 17272 + }, + { + "epoch": 1.627187301288241, + "grad_norm": 0.6932169795036316, + "learning_rate": 1.732633265787519e-06, + "loss": 0.2024, + "step": 17273 + }, + { + "epoch": 1.6272815053814087, + "grad_norm": 0.6216726899147034, + "learning_rate": 1.73178384035255e-06, + "loss": 0.1878, + "step": 17274 + }, + { + "epoch": 1.6273757094745767, + "grad_norm": 0.651899516582489, + "learning_rate": 1.7309346034456963e-06, + "loss": 0.1959, + "step": 17275 + }, + { + "epoch": 1.6274699135677446, + "grad_norm": 0.7007536292076111, + "learning_rate": 1.730085555086317e-06, + "loss": 0.1967, + "step": 17276 + }, + { + "epoch": 1.6275641176609124, + "grad_norm": 0.6730961203575134, + "learning_rate": 1.7292366952937823e-06, + "loss": 0.1766, + "step": 17277 + }, + { + "epoch": 1.62765832175408, + "grad_norm": 0.6632020473480225, + "learning_rate": 1.7283880240874418e-06, + "loss": 0.1818, + "step": 17278 + }, + { + "epoch": 1.627752525847248, + "grad_norm": 0.7121394276618958, + "learning_rate": 1.7275395414866437e-06, + "loss": 0.2033, + "step": 17279 + }, + { + "epoch": 1.627846729940416, + "grad_norm": 0.6867882609367371, + "learning_rate": 1.7266912475107378e-06, + "loss": 0.1997, + "step": 17280 + }, + { + "epoch": 1.6279409340335838, + "grad_norm": 0.6606661677360535, + "learning_rate": 1.725843142179069e-06, + "loss": 0.1945, + "step": 17281 + }, + { + "epoch": 1.6280351381267515, + "grad_norm": 0.7026911973953247, + "learning_rate": 1.7249952255109702e-06, + "loss": 0.2041, + "step": 17282 + }, + { + "epoch": 1.6281293422199195, + "grad_norm": 0.6324657201766968, + "learning_rate": 1.7241474975257777e-06, + "loss": 0.2068, + "step": 17283 + }, + { + "epoch": 1.6282235463130874, + "grad_norm": 0.6406705975532532, + "learning_rate": 1.7232999582428244e-06, + "loss": 0.1984, + "step": 17284 + }, + { + "epoch": 1.6283177504062551, + "grad_norm": 0.6478314995765686, + "learning_rate": 1.7224526076814284e-06, + "loss": 0.169, + "step": 17285 + }, + { + "epoch": 1.6284119544994229, + "grad_norm": 0.6279692053794861, + "learning_rate": 1.7216054458609155e-06, + "loss": 0.1907, + "step": 17286 + }, + { + "epoch": 1.6285061585925908, + "grad_norm": 0.6874887347221375, + "learning_rate": 1.7207584728006033e-06, + "loss": 0.206, + "step": 17287 + }, + { + "epoch": 1.6286003626857588, + "grad_norm": 0.6094188094139099, + "learning_rate": 1.7199116885197996e-06, + "loss": 0.1995, + "step": 17288 + }, + { + "epoch": 1.6286945667789265, + "grad_norm": 0.6797674298286438, + "learning_rate": 1.719065093037815e-06, + "loss": 0.2134, + "step": 17289 + }, + { + "epoch": 1.6287887708720943, + "grad_norm": 0.7516131401062012, + "learning_rate": 1.7182186863739548e-06, + "loss": 0.2025, + "step": 17290 + }, + { + "epoch": 1.6288829749652622, + "grad_norm": 0.6889533400535583, + "learning_rate": 1.7173724685475146e-06, + "loss": 0.1877, + "step": 17291 + }, + { + "epoch": 1.6289771790584302, + "grad_norm": 0.6603233814239502, + "learning_rate": 1.7165264395777913e-06, + "loss": 0.1825, + "step": 17292 + }, + { + "epoch": 1.629071383151598, + "grad_norm": 0.7016066908836365, + "learning_rate": 1.7156805994840787e-06, + "loss": 0.1807, + "step": 17293 + }, + { + "epoch": 1.6291655872447657, + "grad_norm": 0.7911700010299683, + "learning_rate": 1.7148349482856598e-06, + "loss": 0.1915, + "step": 17294 + }, + { + "epoch": 1.6292597913379336, + "grad_norm": 0.6259404420852661, + "learning_rate": 1.7139894860018158e-06, + "loss": 0.1686, + "step": 17295 + }, + { + "epoch": 1.6293539954311016, + "grad_norm": 0.6378378868103027, + "learning_rate": 1.713144212651825e-06, + "loss": 0.1911, + "step": 17296 + }, + { + "epoch": 1.6294481995242693, + "grad_norm": 0.6419534087181091, + "learning_rate": 1.712299128254965e-06, + "loss": 0.1972, + "step": 17297 + }, + { + "epoch": 1.629542403617437, + "grad_norm": 0.6143866777420044, + "learning_rate": 1.7114542328304995e-06, + "loss": 0.1806, + "step": 17298 + }, + { + "epoch": 1.629636607710605, + "grad_norm": 0.6389146447181702, + "learning_rate": 1.7106095263976951e-06, + "loss": 0.1858, + "step": 17299 + }, + { + "epoch": 1.629730811803773, + "grad_norm": 0.6596473455429077, + "learning_rate": 1.7097650089758167e-06, + "loss": 0.1979, + "step": 17300 + }, + { + "epoch": 1.6298250158969407, + "grad_norm": 0.679501473903656, + "learning_rate": 1.708920680584115e-06, + "loss": 0.194, + "step": 17301 + }, + { + "epoch": 1.6299192199901085, + "grad_norm": 0.720536470413208, + "learning_rate": 1.7080765412418443e-06, + "loss": 0.2055, + "step": 17302 + }, + { + "epoch": 1.6300134240832764, + "grad_norm": 0.6592379808425903, + "learning_rate": 1.7072325909682542e-06, + "loss": 0.1884, + "step": 17303 + }, + { + "epoch": 1.6301076281764444, + "grad_norm": 0.6442597508430481, + "learning_rate": 1.7063888297825825e-06, + "loss": 0.1956, + "step": 17304 + }, + { + "epoch": 1.6302018322696121, + "grad_norm": 0.6127653121948242, + "learning_rate": 1.7055452577040754e-06, + "loss": 0.1849, + "step": 17305 + }, + { + "epoch": 1.6302960363627799, + "grad_norm": 0.6029418110847473, + "learning_rate": 1.7047018747519617e-06, + "loss": 0.1716, + "step": 17306 + }, + { + "epoch": 1.6303902404559478, + "grad_norm": 0.6264557838439941, + "learning_rate": 1.7038586809454738e-06, + "loss": 0.1777, + "step": 17307 + }, + { + "epoch": 1.6304844445491158, + "grad_norm": 0.6971416473388672, + "learning_rate": 1.7030156763038408e-06, + "loss": 0.2296, + "step": 17308 + }, + { + "epoch": 1.6305786486422835, + "grad_norm": 0.6296881437301636, + "learning_rate": 1.7021728608462795e-06, + "loss": 0.2043, + "step": 17309 + }, + { + "epoch": 1.6306728527354513, + "grad_norm": 0.7314896583557129, + "learning_rate": 1.7013302345920103e-06, + "loss": 0.217, + "step": 17310 + }, + { + "epoch": 1.6307670568286192, + "grad_norm": 0.6422860622406006, + "learning_rate": 1.7004877975602474e-06, + "loss": 0.1991, + "step": 17311 + }, + { + "epoch": 1.6308612609217872, + "grad_norm": 0.68195641040802, + "learning_rate": 1.6996455497701958e-06, + "loss": 0.1947, + "step": 17312 + }, + { + "epoch": 1.630955465014955, + "grad_norm": 0.8967283368110657, + "learning_rate": 1.6988034912410622e-06, + "loss": 0.2223, + "step": 17313 + }, + { + "epoch": 1.6310496691081227, + "grad_norm": 0.6603388786315918, + "learning_rate": 1.6979616219920504e-06, + "loss": 0.1911, + "step": 17314 + }, + { + "epoch": 1.6311438732012906, + "grad_norm": 0.7128387093544006, + "learning_rate": 1.6971199420423522e-06, + "loss": 0.2124, + "step": 17315 + }, + { + "epoch": 1.6312380772944586, + "grad_norm": 0.6550495028495789, + "learning_rate": 1.6962784514111564e-06, + "loss": 0.1965, + "step": 17316 + }, + { + "epoch": 1.6313322813876263, + "grad_norm": 0.5830024480819702, + "learning_rate": 1.6954371501176569e-06, + "loss": 0.1824, + "step": 17317 + }, + { + "epoch": 1.631426485480794, + "grad_norm": 0.7229143381118774, + "learning_rate": 1.6945960381810345e-06, + "loss": 0.1783, + "step": 17318 + }, + { + "epoch": 1.631520689573962, + "grad_norm": 0.6041586995124817, + "learning_rate": 1.6937551156204647e-06, + "loss": 0.1838, + "step": 17319 + }, + { + "epoch": 1.63161489366713, + "grad_norm": 0.7068755626678467, + "learning_rate": 1.6929143824551241e-06, + "loss": 0.2129, + "step": 17320 + }, + { + "epoch": 1.6317090977602977, + "grad_norm": 0.6394292712211609, + "learning_rate": 1.6920738387041857e-06, + "loss": 0.1995, + "step": 17321 + }, + { + "epoch": 1.6318033018534654, + "grad_norm": 0.6189360618591309, + "learning_rate": 1.6912334843868083e-06, + "loss": 0.1742, + "step": 17322 + }, + { + "epoch": 1.6318975059466334, + "grad_norm": 0.7381575107574463, + "learning_rate": 1.690393319522159e-06, + "loss": 0.2174, + "step": 17323 + }, + { + "epoch": 1.6319917100398014, + "grad_norm": 0.6230690479278564, + "learning_rate": 1.6895533441293943e-06, + "loss": 0.2096, + "step": 17324 + }, + { + "epoch": 1.632085914132969, + "grad_norm": 0.6463793516159058, + "learning_rate": 1.6887135582276637e-06, + "loss": 0.1923, + "step": 17325 + }, + { + "epoch": 1.6321801182261368, + "grad_norm": 0.6478497982025146, + "learning_rate": 1.6878739618361173e-06, + "loss": 0.2073, + "step": 17326 + }, + { + "epoch": 1.6322743223193048, + "grad_norm": 0.706958532333374, + "learning_rate": 1.687034554973902e-06, + "loss": 0.2041, + "step": 17327 + }, + { + "epoch": 1.6323685264124728, + "grad_norm": 0.7188374400138855, + "learning_rate": 1.6861953376601525e-06, + "loss": 0.1882, + "step": 17328 + }, + { + "epoch": 1.6324627305056405, + "grad_norm": 0.6503280997276306, + "learning_rate": 1.685356309914007e-06, + "loss": 0.2046, + "step": 17329 + }, + { + "epoch": 1.6325569345988082, + "grad_norm": 0.677300751209259, + "learning_rate": 1.6845174717545997e-06, + "loss": 0.2, + "step": 17330 + }, + { + "epoch": 1.6326511386919762, + "grad_norm": 1.1053460836410522, + "learning_rate": 1.6836788232010537e-06, + "loss": 0.1837, + "step": 17331 + }, + { + "epoch": 1.6327453427851442, + "grad_norm": 0.6405396461486816, + "learning_rate": 1.6828403642724866e-06, + "loss": 0.2099, + "step": 17332 + }, + { + "epoch": 1.632839546878312, + "grad_norm": 0.6526809334754944, + "learning_rate": 1.6820020949880268e-06, + "loss": 0.1722, + "step": 17333 + }, + { + "epoch": 1.6329337509714796, + "grad_norm": 0.7204709649085999, + "learning_rate": 1.681164015366784e-06, + "loss": 0.2174, + "step": 17334 + }, + { + "epoch": 1.6330279550646476, + "grad_norm": 0.6111131906509399, + "learning_rate": 1.6803261254278635e-06, + "loss": 0.1891, + "step": 17335 + }, + { + "epoch": 1.6331221591578156, + "grad_norm": 0.6242854595184326, + "learning_rate": 1.6794884251903753e-06, + "loss": 0.1768, + "step": 17336 + }, + { + "epoch": 1.6332163632509833, + "grad_norm": 0.6689353585243225, + "learning_rate": 1.6786509146734197e-06, + "loss": 0.1646, + "step": 17337 + }, + { + "epoch": 1.633310567344151, + "grad_norm": 0.634945273399353, + "learning_rate": 1.6778135938960915e-06, + "loss": 0.1985, + "step": 17338 + }, + { + "epoch": 1.633404771437319, + "grad_norm": 0.6326664686203003, + "learning_rate": 1.676976462877483e-06, + "loss": 0.1998, + "step": 17339 + }, + { + "epoch": 1.633498975530487, + "grad_norm": 0.5912447571754456, + "learning_rate": 1.6761395216366861e-06, + "loss": 0.1844, + "step": 17340 + }, + { + "epoch": 1.6335931796236547, + "grad_norm": 0.7482324838638306, + "learning_rate": 1.6753027701927783e-06, + "loss": 0.2028, + "step": 17341 + }, + { + "epoch": 1.6336873837168224, + "grad_norm": 0.7161656022071838, + "learning_rate": 1.6744662085648423e-06, + "loss": 0.1886, + "step": 17342 + }, + { + "epoch": 1.6337815878099904, + "grad_norm": 0.6114586591720581, + "learning_rate": 1.6736298367719538e-06, + "loss": 0.166, + "step": 17343 + }, + { + "epoch": 1.6338757919031583, + "grad_norm": 0.612693190574646, + "learning_rate": 1.6727936548331803e-06, + "loss": 0.1909, + "step": 17344 + }, + { + "epoch": 1.633969995996326, + "grad_norm": 0.655239462852478, + "learning_rate": 1.6719576627675927e-06, + "loss": 0.1954, + "step": 17345 + }, + { + "epoch": 1.6340642000894938, + "grad_norm": 0.6061803102493286, + "learning_rate": 1.6711218605942458e-06, + "loss": 0.171, + "step": 17346 + }, + { + "epoch": 1.6341584041826618, + "grad_norm": 0.6310243010520935, + "learning_rate": 1.6702862483322025e-06, + "loss": 0.1749, + "step": 17347 + }, + { + "epoch": 1.6342526082758297, + "grad_norm": 0.6760424971580505, + "learning_rate": 1.669450826000517e-06, + "loss": 0.2135, + "step": 17348 + }, + { + "epoch": 1.6343468123689975, + "grad_norm": 0.6666778922080994, + "learning_rate": 1.6686155936182347e-06, + "loss": 0.2076, + "step": 17349 + }, + { + "epoch": 1.6344410164621652, + "grad_norm": 0.7133226990699768, + "learning_rate": 1.667780551204401e-06, + "loss": 0.2241, + "step": 17350 + }, + { + "epoch": 1.6345352205553332, + "grad_norm": 0.7003960013389587, + "learning_rate": 1.6669456987780585e-06, + "loss": 0.1679, + "step": 17351 + }, + { + "epoch": 1.6346294246485011, + "grad_norm": 0.6771157383918762, + "learning_rate": 1.6661110363582389e-06, + "loss": 0.184, + "step": 17352 + }, + { + "epoch": 1.6347236287416689, + "grad_norm": 0.7309131026268005, + "learning_rate": 1.6652765639639779e-06, + "loss": 0.2288, + "step": 17353 + }, + { + "epoch": 1.6348178328348366, + "grad_norm": 0.646629273891449, + "learning_rate": 1.6644422816143024e-06, + "loss": 0.2011, + "step": 17354 + }, + { + "epoch": 1.6349120369280046, + "grad_norm": 0.6323843598365784, + "learning_rate": 1.6636081893282342e-06, + "loss": 0.1726, + "step": 17355 + }, + { + "epoch": 1.6350062410211725, + "grad_norm": 0.6303738355636597, + "learning_rate": 1.6627742871247887e-06, + "loss": 0.173, + "step": 17356 + }, + { + "epoch": 1.6351004451143403, + "grad_norm": 0.6361851692199707, + "learning_rate": 1.6619405750229834e-06, + "loss": 0.1899, + "step": 17357 + }, + { + "epoch": 1.635194649207508, + "grad_norm": 0.6274666786193848, + "learning_rate": 1.6611070530418317e-06, + "loss": 0.1708, + "step": 17358 + }, + { + "epoch": 1.635288853300676, + "grad_norm": 0.6658263802528381, + "learning_rate": 1.6602737212003316e-06, + "loss": 0.1906, + "step": 17359 + }, + { + "epoch": 1.6353830573938437, + "grad_norm": 0.6502881050109863, + "learning_rate": 1.6594405795174896e-06, + "loss": 0.2335, + "step": 17360 + }, + { + "epoch": 1.6354772614870114, + "grad_norm": 0.7372328042984009, + "learning_rate": 1.6586076280123032e-06, + "loss": 0.2047, + "step": 17361 + }, + { + "epoch": 1.6355714655801794, + "grad_norm": 0.6899813413619995, + "learning_rate": 1.6577748667037596e-06, + "loss": 0.1947, + "step": 17362 + }, + { + "epoch": 1.6356656696733474, + "grad_norm": 0.6836014986038208, + "learning_rate": 1.6569422956108515e-06, + "loss": 0.2082, + "step": 17363 + }, + { + "epoch": 1.635759873766515, + "grad_norm": 0.7940660119056702, + "learning_rate": 1.656109914752565e-06, + "loss": 0.216, + "step": 17364 + }, + { + "epoch": 1.6358540778596828, + "grad_norm": 0.6817182898521423, + "learning_rate": 1.6552777241478735e-06, + "loss": 0.1848, + "step": 17365 + }, + { + "epoch": 1.6359482819528508, + "grad_norm": 0.6277564764022827, + "learning_rate": 1.6544457238157551e-06, + "loss": 0.2055, + "step": 17366 + }, + { + "epoch": 1.6360424860460188, + "grad_norm": 0.743478536605835, + "learning_rate": 1.6536139137751829e-06, + "loss": 0.2412, + "step": 17367 + }, + { + "epoch": 1.6361366901391865, + "grad_norm": 0.6931159496307373, + "learning_rate": 1.65278229404512e-06, + "loss": 0.1853, + "step": 17368 + }, + { + "epoch": 1.6362308942323542, + "grad_norm": 0.6792427897453308, + "learning_rate": 1.6519508646445293e-06, + "loss": 0.1983, + "step": 17369 + }, + { + "epoch": 1.6363250983255222, + "grad_norm": 0.7112650871276855, + "learning_rate": 1.6511196255923711e-06, + "loss": 0.2064, + "step": 17370 + }, + { + "epoch": 1.6364193024186902, + "grad_norm": 0.5833031535148621, + "learning_rate": 1.6502885769075983e-06, + "loss": 0.1814, + "step": 17371 + }, + { + "epoch": 1.636513506511858, + "grad_norm": 0.696064293384552, + "learning_rate": 1.6494577186091543e-06, + "loss": 0.1982, + "step": 17372 + }, + { + "epoch": 1.6366077106050256, + "grad_norm": 0.6086500883102417, + "learning_rate": 1.648627050715993e-06, + "loss": 0.1847, + "step": 17373 + }, + { + "epoch": 1.6367019146981936, + "grad_norm": 0.6393113732337952, + "learning_rate": 1.6477965732470502e-06, + "loss": 0.1968, + "step": 17374 + }, + { + "epoch": 1.6367961187913616, + "grad_norm": 0.6088919043540955, + "learning_rate": 1.646966286221261e-06, + "loss": 0.1949, + "step": 17375 + }, + { + "epoch": 1.6368903228845293, + "grad_norm": 0.7196995615959167, + "learning_rate": 1.646136189657558e-06, + "loss": 0.1961, + "step": 17376 + }, + { + "epoch": 1.636984526977697, + "grad_norm": 0.58502596616745, + "learning_rate": 1.6453062835748723e-06, + "loss": 0.1746, + "step": 17377 + }, + { + "epoch": 1.637078731070865, + "grad_norm": 0.6163436770439148, + "learning_rate": 1.6444765679921215e-06, + "loss": 0.1781, + "step": 17378 + }, + { + "epoch": 1.637172935164033, + "grad_norm": 0.6482148766517639, + "learning_rate": 1.643647042928227e-06, + "loss": 0.1929, + "step": 17379 + }, + { + "epoch": 1.6372671392572007, + "grad_norm": 0.680004894733429, + "learning_rate": 1.6428177084021058e-06, + "loss": 0.205, + "step": 17380 + }, + { + "epoch": 1.6373613433503684, + "grad_norm": 0.5815574526786804, + "learning_rate": 1.6419885644326627e-06, + "loss": 0.1688, + "step": 17381 + }, + { + "epoch": 1.6374555474435364, + "grad_norm": 0.688826858997345, + "learning_rate": 1.6411596110388062e-06, + "loss": 0.1762, + "step": 17382 + }, + { + "epoch": 1.6375497515367043, + "grad_norm": 0.6474208235740662, + "learning_rate": 1.6403308482394408e-06, + "loss": 0.1948, + "step": 17383 + }, + { + "epoch": 1.637643955629872, + "grad_norm": 0.6522237062454224, + "learning_rate": 1.6395022760534574e-06, + "loss": 0.184, + "step": 17384 + }, + { + "epoch": 1.6377381597230398, + "grad_norm": 0.6208203434944153, + "learning_rate": 1.638673894499755e-06, + "loss": 0.17, + "step": 17385 + }, + { + "epoch": 1.6378323638162078, + "grad_norm": 0.6058263182640076, + "learning_rate": 1.6378457035972161e-06, + "loss": 0.1881, + "step": 17386 + }, + { + "epoch": 1.6379265679093757, + "grad_norm": 0.7474789023399353, + "learning_rate": 1.6370177033647272e-06, + "loss": 0.2422, + "step": 17387 + }, + { + "epoch": 1.6380207720025435, + "grad_norm": 0.6309479475021362, + "learning_rate": 1.6361898938211707e-06, + "loss": 0.1976, + "step": 17388 + }, + { + "epoch": 1.6381149760957112, + "grad_norm": 0.7674265503883362, + "learning_rate": 1.635362274985417e-06, + "loss": 0.1952, + "step": 17389 + }, + { + "epoch": 1.6382091801888792, + "grad_norm": 0.686428964138031, + "learning_rate": 1.63453484687634e-06, + "loss": 0.1894, + "step": 17390 + }, + { + "epoch": 1.6383033842820471, + "grad_norm": 0.6450231075286865, + "learning_rate": 1.633707609512809e-06, + "loss": 0.1921, + "step": 17391 + }, + { + "epoch": 1.6383975883752149, + "grad_norm": 0.654132604598999, + "learning_rate": 1.6328805629136801e-06, + "loss": 0.2007, + "step": 17392 + }, + { + "epoch": 1.6384917924683826, + "grad_norm": 0.6398543119430542, + "learning_rate": 1.6320537070978138e-06, + "loss": 0.1883, + "step": 17393 + }, + { + "epoch": 1.6385859965615506, + "grad_norm": 0.7517440915107727, + "learning_rate": 1.631227042084067e-06, + "loss": 0.2274, + "step": 17394 + }, + { + "epoch": 1.6386802006547185, + "grad_norm": 0.6238316297531128, + "learning_rate": 1.6304005678912872e-06, + "loss": 0.18, + "step": 17395 + }, + { + "epoch": 1.6387744047478863, + "grad_norm": 0.7552535533905029, + "learning_rate": 1.6295742845383146e-06, + "loss": 0.2045, + "step": 17396 + }, + { + "epoch": 1.638868608841054, + "grad_norm": 0.6809061765670776, + "learning_rate": 1.6287481920439941e-06, + "loss": 0.2131, + "step": 17397 + }, + { + "epoch": 1.638962812934222, + "grad_norm": 0.6528740525245667, + "learning_rate": 1.627922290427163e-06, + "loss": 0.182, + "step": 17398 + }, + { + "epoch": 1.63905701702739, + "grad_norm": 0.6091368198394775, + "learning_rate": 1.6270965797066496e-06, + "loss": 0.1671, + "step": 17399 + }, + { + "epoch": 1.6391512211205577, + "grad_norm": 0.6520994305610657, + "learning_rate": 1.6262710599012832e-06, + "loss": 0.2017, + "step": 17400 + }, + { + "epoch": 1.6392454252137254, + "grad_norm": 0.6457948088645935, + "learning_rate": 1.6254457310298887e-06, + "loss": 0.1866, + "step": 17401 + }, + { + "epoch": 1.6393396293068934, + "grad_norm": 0.6623305678367615, + "learning_rate": 1.6246205931112802e-06, + "loss": 0.1944, + "step": 17402 + }, + { + "epoch": 1.6394338334000613, + "grad_norm": 0.6019333004951477, + "learning_rate": 1.6237956461642756e-06, + "loss": 0.1966, + "step": 17403 + }, + { + "epoch": 1.639528037493229, + "grad_norm": 0.6890769600868225, + "learning_rate": 1.6229708902076868e-06, + "loss": 0.194, + "step": 17404 + }, + { + "epoch": 1.6396222415863968, + "grad_norm": 0.7059999108314514, + "learning_rate": 1.6221463252603154e-06, + "loss": 0.1919, + "step": 17405 + }, + { + "epoch": 1.6397164456795648, + "grad_norm": 0.6976580619812012, + "learning_rate": 1.621321951340963e-06, + "loss": 0.191, + "step": 17406 + }, + { + "epoch": 1.6398106497727327, + "grad_norm": 0.627890408039093, + "learning_rate": 1.620497768468431e-06, + "loss": 0.1848, + "step": 17407 + }, + { + "epoch": 1.6399048538659005, + "grad_norm": 0.6436595916748047, + "learning_rate": 1.6196737766615068e-06, + "loss": 0.1969, + "step": 17408 + }, + { + "epoch": 1.6399990579590682, + "grad_norm": 0.5872290730476379, + "learning_rate": 1.6188499759389798e-06, + "loss": 0.1797, + "step": 17409 + }, + { + "epoch": 1.6400932620522362, + "grad_norm": 0.7314695715904236, + "learning_rate": 1.6180263663196382e-06, + "loss": 0.2144, + "step": 17410 + }, + { + "epoch": 1.6401874661454041, + "grad_norm": 0.6125447154045105, + "learning_rate": 1.6172029478222595e-06, + "loss": 0.1778, + "step": 17411 + }, + { + "epoch": 1.6402816702385719, + "grad_norm": 0.7243834137916565, + "learning_rate": 1.6163797204656117e-06, + "loss": 0.2015, + "step": 17412 + }, + { + "epoch": 1.6403758743317396, + "grad_norm": 0.6943835616111755, + "learning_rate": 1.6155566842684767e-06, + "loss": 0.189, + "step": 17413 + }, + { + "epoch": 1.6404700784249076, + "grad_norm": 0.8817789554595947, + "learning_rate": 1.614733839249617e-06, + "loss": 0.199, + "step": 17414 + }, + { + "epoch": 1.6405642825180755, + "grad_norm": 0.7127186059951782, + "learning_rate": 1.6139111854277901e-06, + "loss": 0.2081, + "step": 17415 + }, + { + "epoch": 1.6406584866112432, + "grad_norm": 0.6457104682922363, + "learning_rate": 1.6130887228217584e-06, + "loss": 0.1944, + "step": 17416 + }, + { + "epoch": 1.640752690704411, + "grad_norm": 0.7006966471672058, + "learning_rate": 1.6122664514502761e-06, + "loss": 0.1831, + "step": 17417 + }, + { + "epoch": 1.640846894797579, + "grad_norm": 0.6882437467575073, + "learning_rate": 1.6114443713320893e-06, + "loss": 0.2011, + "step": 17418 + }, + { + "epoch": 1.640941098890747, + "grad_norm": 0.7212029099464417, + "learning_rate": 1.610622482485943e-06, + "loss": 0.1879, + "step": 17419 + }, + { + "epoch": 1.6410353029839146, + "grad_norm": 0.572352945804596, + "learning_rate": 1.6098007849305819e-06, + "loss": 0.1688, + "step": 17420 + }, + { + "epoch": 1.6411295070770824, + "grad_norm": 0.6336104273796082, + "learning_rate": 1.6089792786847346e-06, + "loss": 0.1832, + "step": 17421 + }, + { + "epoch": 1.6412237111702503, + "grad_norm": 0.6650234460830688, + "learning_rate": 1.6081579637671385e-06, + "loss": 0.1827, + "step": 17422 + }, + { + "epoch": 1.6413179152634183, + "grad_norm": 0.6842238903045654, + "learning_rate": 1.6073368401965206e-06, + "loss": 0.2079, + "step": 17423 + }, + { + "epoch": 1.641412119356586, + "grad_norm": 0.8026834726333618, + "learning_rate": 1.6065159079915992e-06, + "loss": 0.1905, + "step": 17424 + }, + { + "epoch": 1.6415063234497538, + "grad_norm": 0.6949887275695801, + "learning_rate": 1.6056951671710997e-06, + "loss": 0.2144, + "step": 17425 + }, + { + "epoch": 1.6416005275429217, + "grad_norm": 0.7393868565559387, + "learning_rate": 1.604874617753729e-06, + "loss": 0.1922, + "step": 17426 + }, + { + "epoch": 1.6416947316360897, + "grad_norm": 0.6920140981674194, + "learning_rate": 1.604054259758201e-06, + "loss": 0.2111, + "step": 17427 + }, + { + "epoch": 1.6417889357292574, + "grad_norm": 0.6958423852920532, + "learning_rate": 1.603234093203222e-06, + "loss": 0.1982, + "step": 17428 + }, + { + "epoch": 1.6418831398224252, + "grad_norm": 0.6376303434371948, + "learning_rate": 1.6024141181074905e-06, + "loss": 0.1697, + "step": 17429 + }, + { + "epoch": 1.6419773439155931, + "grad_norm": 0.6730598211288452, + "learning_rate": 1.6015943344897022e-06, + "loss": 0.1958, + "step": 17430 + }, + { + "epoch": 1.642071548008761, + "grad_norm": 0.5982670783996582, + "learning_rate": 1.600774742368556e-06, + "loss": 0.1731, + "step": 17431 + }, + { + "epoch": 1.6421657521019288, + "grad_norm": 0.688256025314331, + "learning_rate": 1.5999553417627312e-06, + "loss": 0.1956, + "step": 17432 + }, + { + "epoch": 1.6422599561950966, + "grad_norm": 0.6104230284690857, + "learning_rate": 1.5991361326909162e-06, + "loss": 0.1702, + "step": 17433 + }, + { + "epoch": 1.6423541602882645, + "grad_norm": 0.6189106106758118, + "learning_rate": 1.5983171151717924e-06, + "loss": 0.2018, + "step": 17434 + }, + { + "epoch": 1.6424483643814325, + "grad_norm": 1.8000823259353638, + "learning_rate": 1.5974982892240309e-06, + "loss": 0.2136, + "step": 17435 + }, + { + "epoch": 1.6425425684746002, + "grad_norm": 0.6984451413154602, + "learning_rate": 1.596679654866302e-06, + "loss": 0.1844, + "step": 17436 + }, + { + "epoch": 1.642636772567768, + "grad_norm": 0.5976433753967285, + "learning_rate": 1.5958612121172723e-06, + "loss": 0.1893, + "step": 17437 + }, + { + "epoch": 1.642730976660936, + "grad_norm": 0.7203249335289001, + "learning_rate": 1.5950429609956065e-06, + "loss": 0.2086, + "step": 17438 + }, + { + "epoch": 1.6428251807541039, + "grad_norm": 0.6157548427581787, + "learning_rate": 1.5942249015199572e-06, + "loss": 0.1917, + "step": 17439 + }, + { + "epoch": 1.6429193848472716, + "grad_norm": 0.6497460603713989, + "learning_rate": 1.5934070337089802e-06, + "loss": 0.1953, + "step": 17440 + }, + { + "epoch": 1.6430135889404394, + "grad_norm": 0.5885839462280273, + "learning_rate": 1.5925893575813255e-06, + "loss": 0.181, + "step": 17441 + }, + { + "epoch": 1.6431077930336073, + "grad_norm": 0.6948521137237549, + "learning_rate": 1.5917718731556341e-06, + "loss": 0.1945, + "step": 17442 + }, + { + "epoch": 1.6432019971267753, + "grad_norm": 0.7173735499382019, + "learning_rate": 1.5909545804505477e-06, + "loss": 0.2126, + "step": 17443 + }, + { + "epoch": 1.643296201219943, + "grad_norm": 0.6352149844169617, + "learning_rate": 1.5901374794847035e-06, + "loss": 0.1879, + "step": 17444 + }, + { + "epoch": 1.6433904053131108, + "grad_norm": 0.6374592781066895, + "learning_rate": 1.5893205702767289e-06, + "loss": 0.1771, + "step": 17445 + }, + { + "epoch": 1.6434846094062787, + "grad_norm": 0.6871230602264404, + "learning_rate": 1.5885038528452524e-06, + "loss": 0.1803, + "step": 17446 + }, + { + "epoch": 1.6435788134994467, + "grad_norm": 0.6220190525054932, + "learning_rate": 1.5876873272088999e-06, + "loss": 0.169, + "step": 17447 + }, + { + "epoch": 1.6436730175926144, + "grad_norm": 0.6975589394569397, + "learning_rate": 1.586870993386286e-06, + "loss": 0.1916, + "step": 17448 + }, + { + "epoch": 1.6437672216857822, + "grad_norm": 0.646195113658905, + "learning_rate": 1.5860548513960184e-06, + "loss": 0.1975, + "step": 17449 + }, + { + "epoch": 1.6438614257789501, + "grad_norm": 0.6283693909645081, + "learning_rate": 1.5852389012567192e-06, + "loss": 0.1695, + "step": 17450 + }, + { + "epoch": 1.643955629872118, + "grad_norm": 0.757831871509552, + "learning_rate": 1.584423142986985e-06, + "loss": 0.1838, + "step": 17451 + }, + { + "epoch": 1.6440498339652858, + "grad_norm": 0.6395748853683472, + "learning_rate": 1.5836075766054148e-06, + "loss": 0.1896, + "step": 17452 + }, + { + "epoch": 1.6441440380584535, + "grad_norm": 0.6705828309059143, + "learning_rate": 1.5827922021306118e-06, + "loss": 0.1991, + "step": 17453 + }, + { + "epoch": 1.6442382421516215, + "grad_norm": 0.7036987543106079, + "learning_rate": 1.5819770195811646e-06, + "loss": 0.1925, + "step": 17454 + }, + { + "epoch": 1.6443324462447895, + "grad_norm": 0.6130074858665466, + "learning_rate": 1.581162028975658e-06, + "loss": 0.1694, + "step": 17455 + }, + { + "epoch": 1.6444266503379572, + "grad_norm": 0.6923983693122864, + "learning_rate": 1.5803472303326772e-06, + "loss": 0.2257, + "step": 17456 + }, + { + "epoch": 1.644520854431125, + "grad_norm": 0.6676929593086243, + "learning_rate": 1.579532623670802e-06, + "loss": 0.1798, + "step": 17457 + }, + { + "epoch": 1.644615058524293, + "grad_norm": 0.6116167902946472, + "learning_rate": 1.5787182090086038e-06, + "loss": 0.202, + "step": 17458 + }, + { + "epoch": 1.6447092626174609, + "grad_norm": 0.659077525138855, + "learning_rate": 1.5779039863646538e-06, + "loss": 0.2086, + "step": 17459 + }, + { + "epoch": 1.6448034667106286, + "grad_norm": 0.5949646830558777, + "learning_rate": 1.5770899557575215e-06, + "loss": 0.1849, + "step": 17460 + }, + { + "epoch": 1.6448976708037963, + "grad_norm": 0.5903298854827881, + "learning_rate": 1.5762761172057616e-06, + "loss": 0.1746, + "step": 17461 + }, + { + "epoch": 1.6449918748969643, + "grad_norm": 0.662360429763794, + "learning_rate": 1.5754624707279331e-06, + "loss": 0.1867, + "step": 17462 + }, + { + "epoch": 1.6450860789901323, + "grad_norm": 0.6063056588172913, + "learning_rate": 1.574649016342592e-06, + "loss": 0.1769, + "step": 17463 + }, + { + "epoch": 1.6451802830833, + "grad_norm": 0.6706985831260681, + "learning_rate": 1.5738357540682804e-06, + "loss": 0.191, + "step": 17464 + }, + { + "epoch": 1.6452744871764677, + "grad_norm": 0.5914628505706787, + "learning_rate": 1.5730226839235484e-06, + "loss": 0.1807, + "step": 17465 + }, + { + "epoch": 1.6453686912696357, + "grad_norm": 0.6614682674407959, + "learning_rate": 1.5722098059269285e-06, + "loss": 0.2187, + "step": 17466 + }, + { + "epoch": 1.6454628953628037, + "grad_norm": 0.7109770774841309, + "learning_rate": 1.571397120096959e-06, + "loss": 0.2209, + "step": 17467 + }, + { + "epoch": 1.6455570994559714, + "grad_norm": 0.7011008858680725, + "learning_rate": 1.5705846264521728e-06, + "loss": 0.1948, + "step": 17468 + }, + { + "epoch": 1.6456513035491391, + "grad_norm": 0.7054222226142883, + "learning_rate": 1.5697723250110907e-06, + "loss": 0.2142, + "step": 17469 + }, + { + "epoch": 1.645745507642307, + "grad_norm": 0.6907052993774414, + "learning_rate": 1.5689602157922379e-06, + "loss": 0.1878, + "step": 17470 + }, + { + "epoch": 1.645839711735475, + "grad_norm": 0.623249351978302, + "learning_rate": 1.5681482988141329e-06, + "loss": 0.1885, + "step": 17471 + }, + { + "epoch": 1.6459339158286428, + "grad_norm": 0.6415420174598694, + "learning_rate": 1.567336574095284e-06, + "loss": 0.1766, + "step": 17472 + }, + { + "epoch": 1.6460281199218105, + "grad_norm": 0.6554179787635803, + "learning_rate": 1.5665250416542055e-06, + "loss": 0.2015, + "step": 17473 + }, + { + "epoch": 1.6461223240149785, + "grad_norm": 0.6666958928108215, + "learning_rate": 1.565713701509397e-06, + "loss": 0.212, + "step": 17474 + }, + { + "epoch": 1.6462165281081464, + "grad_norm": 0.5999184250831604, + "learning_rate": 1.5649025536793616e-06, + "loss": 0.2156, + "step": 17475 + }, + { + "epoch": 1.6463107322013142, + "grad_norm": 0.6318747401237488, + "learning_rate": 1.564091598182591e-06, + "loss": 0.2039, + "step": 17476 + }, + { + "epoch": 1.646404936294482, + "grad_norm": 0.6377449631690979, + "learning_rate": 1.5632808350375773e-06, + "loss": 0.2152, + "step": 17477 + }, + { + "epoch": 1.6464991403876499, + "grad_norm": 0.6212782263755798, + "learning_rate": 1.562470264262812e-06, + "loss": 0.1791, + "step": 17478 + }, + { + "epoch": 1.6465933444808178, + "grad_norm": 0.9053308963775635, + "learning_rate": 1.5616598858767707e-06, + "loss": 0.1643, + "step": 17479 + }, + { + "epoch": 1.6466875485739856, + "grad_norm": 0.6468185782432556, + "learning_rate": 1.5608496998979338e-06, + "loss": 0.1917, + "step": 17480 + }, + { + "epoch": 1.6467817526671533, + "grad_norm": 0.6143595576286316, + "learning_rate": 1.560039706344777e-06, + "loss": 0.1831, + "step": 17481 + }, + { + "epoch": 1.6468759567603213, + "grad_norm": 0.5871161222457886, + "learning_rate": 1.5592299052357663e-06, + "loss": 0.1753, + "step": 17482 + }, + { + "epoch": 1.6469701608534892, + "grad_norm": 0.6375309228897095, + "learning_rate": 1.558420296589367e-06, + "loss": 0.1727, + "step": 17483 + }, + { + "epoch": 1.647064364946657, + "grad_norm": 0.7632774710655212, + "learning_rate": 1.5576108804240431e-06, + "loss": 0.1852, + "step": 17484 + }, + { + "epoch": 1.6471585690398247, + "grad_norm": 0.7169836759567261, + "learning_rate": 1.5568016567582446e-06, + "loss": 0.1984, + "step": 17485 + }, + { + "epoch": 1.6472527731329927, + "grad_norm": 0.6869579553604126, + "learning_rate": 1.5559926256104262e-06, + "loss": 0.2111, + "step": 17486 + }, + { + "epoch": 1.6473469772261606, + "grad_norm": 0.6542017459869385, + "learning_rate": 1.5551837869990372e-06, + "loss": 0.1911, + "step": 17487 + }, + { + "epoch": 1.6474411813193284, + "grad_norm": 0.5915003418922424, + "learning_rate": 1.5543751409425178e-06, + "loss": 0.1605, + "step": 17488 + }, + { + "epoch": 1.6475353854124961, + "grad_norm": 0.64701247215271, + "learning_rate": 1.5535666874593026e-06, + "loss": 0.1863, + "step": 17489 + }, + { + "epoch": 1.647629589505664, + "grad_norm": 0.6819285750389099, + "learning_rate": 1.552758426567833e-06, + "loss": 0.1917, + "step": 17490 + }, + { + "epoch": 1.647723793598832, + "grad_norm": 0.8095874786376953, + "learning_rate": 1.551950358286537e-06, + "loss": 0.1917, + "step": 17491 + }, + { + "epoch": 1.6478179976919998, + "grad_norm": 0.6300013661384583, + "learning_rate": 1.5511424826338318e-06, + "loss": 0.2012, + "step": 17492 + }, + { + "epoch": 1.6479122017851675, + "grad_norm": 0.7067247033119202, + "learning_rate": 1.5503347996281492e-06, + "loss": 0.199, + "step": 17493 + }, + { + "epoch": 1.6480064058783355, + "grad_norm": 0.6496374011039734, + "learning_rate": 1.5495273092879014e-06, + "loss": 0.201, + "step": 17494 + }, + { + "epoch": 1.6481006099715034, + "grad_norm": 0.7110437750816345, + "learning_rate": 1.548720011631497e-06, + "loss": 0.1987, + "step": 17495 + }, + { + "epoch": 1.6481948140646712, + "grad_norm": 0.6960927248001099, + "learning_rate": 1.5479129066773469e-06, + "loss": 0.208, + "step": 17496 + }, + { + "epoch": 1.648289018157839, + "grad_norm": 0.6513646841049194, + "learning_rate": 1.547105994443856e-06, + "loss": 0.2019, + "step": 17497 + }, + { + "epoch": 1.6483832222510069, + "grad_norm": 0.6682662963867188, + "learning_rate": 1.5462992749494187e-06, + "loss": 0.2005, + "step": 17498 + }, + { + "epoch": 1.6484774263441746, + "grad_norm": 0.7011797428131104, + "learning_rate": 1.5454927482124327e-06, + "loss": 0.2149, + "step": 17499 + }, + { + "epoch": 1.6485716304373423, + "grad_norm": 0.6496496200561523, + "learning_rate": 1.5446864142512885e-06, + "loss": 0.2006, + "step": 17500 + }, + { + "epoch": 1.6486658345305103, + "grad_norm": 0.7411315441131592, + "learning_rate": 1.5438802730843684e-06, + "loss": 0.1993, + "step": 17501 + }, + { + "epoch": 1.6487600386236783, + "grad_norm": 0.6486896872520447, + "learning_rate": 1.5430743247300561e-06, + "loss": 0.1885, + "step": 17502 + }, + { + "epoch": 1.648854242716846, + "grad_norm": 0.6243578195571899, + "learning_rate": 1.5422685692067297e-06, + "loss": 0.1858, + "step": 17503 + }, + { + "epoch": 1.6489484468100137, + "grad_norm": 0.7015368938446045, + "learning_rate": 1.5414630065327586e-06, + "loss": 0.2376, + "step": 17504 + }, + { + "epoch": 1.6490426509031817, + "grad_norm": 0.6350616216659546, + "learning_rate": 1.5406576367265135e-06, + "loss": 0.187, + "step": 17505 + }, + { + "epoch": 1.6491368549963497, + "grad_norm": 0.6465875506401062, + "learning_rate": 1.5398524598063558e-06, + "loss": 0.2049, + "step": 17506 + }, + { + "epoch": 1.6492310590895174, + "grad_norm": 0.6372957229614258, + "learning_rate": 1.5390474757906449e-06, + "loss": 0.1708, + "step": 17507 + }, + { + "epoch": 1.6493252631826851, + "grad_norm": 0.6691349148750305, + "learning_rate": 1.5382426846977394e-06, + "loss": 0.1829, + "step": 17508 + }, + { + "epoch": 1.649419467275853, + "grad_norm": 0.6700156331062317, + "learning_rate": 1.5374380865459847e-06, + "loss": 0.1845, + "step": 17509 + }, + { + "epoch": 1.649513671369021, + "grad_norm": 0.6702751517295837, + "learning_rate": 1.5366336813537298e-06, + "loss": 0.2013, + "step": 17510 + }, + { + "epoch": 1.6496078754621888, + "grad_norm": 0.6582763195037842, + "learning_rate": 1.5358294691393173e-06, + "loss": 0.2033, + "step": 17511 + }, + { + "epoch": 1.6497020795553565, + "grad_norm": 0.697255551815033, + "learning_rate": 1.5350254499210814e-06, + "loss": 0.1999, + "step": 17512 + }, + { + "epoch": 1.6497962836485245, + "grad_norm": 0.6184289455413818, + "learning_rate": 1.534221623717358e-06, + "loss": 0.1886, + "step": 17513 + }, + { + "epoch": 1.6498904877416924, + "grad_norm": 0.6472569108009338, + "learning_rate": 1.533417990546473e-06, + "loss": 0.1988, + "step": 17514 + }, + { + "epoch": 1.6499846918348602, + "grad_norm": 0.6780841946601868, + "learning_rate": 1.5326145504267532e-06, + "loss": 0.1746, + "step": 17515 + }, + { + "epoch": 1.650078895928028, + "grad_norm": 0.6907581686973572, + "learning_rate": 1.5318113033765137e-06, + "loss": 0.2006, + "step": 17516 + }, + { + "epoch": 1.6501731000211959, + "grad_norm": 0.6199663877487183, + "learning_rate": 1.5310082494140744e-06, + "loss": 0.1836, + "step": 17517 + }, + { + "epoch": 1.6502673041143638, + "grad_norm": 0.7279683351516724, + "learning_rate": 1.530205388557746e-06, + "loss": 0.1735, + "step": 17518 + }, + { + "epoch": 1.6503615082075316, + "grad_norm": 0.6609973311424255, + "learning_rate": 1.5294027208258311e-06, + "loss": 0.1869, + "step": 17519 + }, + { + "epoch": 1.6504557123006993, + "grad_norm": 0.7526587247848511, + "learning_rate": 1.5286002462366344e-06, + "loss": 0.1935, + "step": 17520 + }, + { + "epoch": 1.6505499163938673, + "grad_norm": 0.7436555624008179, + "learning_rate": 1.527797964808455e-06, + "loss": 0.1802, + "step": 17521 + }, + { + "epoch": 1.6506441204870352, + "grad_norm": 0.6578570008277893, + "learning_rate": 1.5269958765595826e-06, + "loss": 0.2053, + "step": 17522 + }, + { + "epoch": 1.650738324580203, + "grad_norm": 0.6650799512863159, + "learning_rate": 1.5261939815083083e-06, + "loss": 0.1958, + "step": 17523 + }, + { + "epoch": 1.6508325286733707, + "grad_norm": 0.6936365962028503, + "learning_rate": 1.5253922796729193e-06, + "loss": 0.2127, + "step": 17524 + }, + { + "epoch": 1.6509267327665387, + "grad_norm": 0.6985155344009399, + "learning_rate": 1.5245907710716912e-06, + "loss": 0.2033, + "step": 17525 + }, + { + "epoch": 1.6510209368597066, + "grad_norm": 0.5962125062942505, + "learning_rate": 1.5237894557228972e-06, + "loss": 0.2049, + "step": 17526 + }, + { + "epoch": 1.6511151409528744, + "grad_norm": 0.6634477972984314, + "learning_rate": 1.522988333644816e-06, + "loss": 0.1949, + "step": 17527 + }, + { + "epoch": 1.651209345046042, + "grad_norm": 0.6634848713874817, + "learning_rate": 1.5221874048557117e-06, + "loss": 0.1851, + "step": 17528 + }, + { + "epoch": 1.65130354913921, + "grad_norm": 0.6660844683647156, + "learning_rate": 1.5213866693738411e-06, + "loss": 0.1918, + "step": 17529 + }, + { + "epoch": 1.651397753232378, + "grad_norm": 0.7167404294013977, + "learning_rate": 1.5205861272174705e-06, + "loss": 0.2064, + "step": 17530 + }, + { + "epoch": 1.6514919573255458, + "grad_norm": 0.6679966449737549, + "learning_rate": 1.519785778404851e-06, + "loss": 0.1875, + "step": 17531 + }, + { + "epoch": 1.6515861614187135, + "grad_norm": 0.5997360944747925, + "learning_rate": 1.5189856229542255e-06, + "loss": 0.1859, + "step": 17532 + }, + { + "epoch": 1.6516803655118815, + "grad_norm": 0.6006638407707214, + "learning_rate": 1.5181856608838486e-06, + "loss": 0.1715, + "step": 17533 + }, + { + "epoch": 1.6517745696050494, + "grad_norm": 0.6586245894432068, + "learning_rate": 1.5173858922119555e-06, + "loss": 0.1769, + "step": 17534 + }, + { + "epoch": 1.6518687736982172, + "grad_norm": 0.6618895530700684, + "learning_rate": 1.5165863169567808e-06, + "loss": 0.2026, + "step": 17535 + }, + { + "epoch": 1.651962977791385, + "grad_norm": 0.7292799353599548, + "learning_rate": 1.5157869351365583e-06, + "loss": 0.2152, + "step": 17536 + }, + { + "epoch": 1.6520571818845529, + "grad_norm": 0.7579766511917114, + "learning_rate": 1.5149877467695173e-06, + "loss": 0.1741, + "step": 17537 + }, + { + "epoch": 1.6521513859777208, + "grad_norm": 0.6587605476379395, + "learning_rate": 1.5141887518738752e-06, + "loss": 0.1996, + "step": 17538 + }, + { + "epoch": 1.6522455900708886, + "grad_norm": 0.6389364004135132, + "learning_rate": 1.5133899504678529e-06, + "loss": 0.1825, + "step": 17539 + }, + { + "epoch": 1.6523397941640563, + "grad_norm": 0.6553727984428406, + "learning_rate": 1.512591342569667e-06, + "loss": 0.1833, + "step": 17540 + }, + { + "epoch": 1.6524339982572243, + "grad_norm": 0.629486083984375, + "learning_rate": 1.5117929281975218e-06, + "loss": 0.1702, + "step": 17541 + }, + { + "epoch": 1.6525282023503922, + "grad_norm": 0.6960748434066772, + "learning_rate": 1.5109947073696253e-06, + "loss": 0.2204, + "step": 17542 + }, + { + "epoch": 1.65262240644356, + "grad_norm": 0.5940431356430054, + "learning_rate": 1.5101966801041812e-06, + "loss": 0.2061, + "step": 17543 + }, + { + "epoch": 1.6527166105367277, + "grad_norm": 0.7151151895523071, + "learning_rate": 1.5093988464193787e-06, + "loss": 0.2134, + "step": 17544 + }, + { + "epoch": 1.6528108146298957, + "grad_norm": 0.6399179697036743, + "learning_rate": 1.5086012063334165e-06, + "loss": 0.2077, + "step": 17545 + }, + { + "epoch": 1.6529050187230636, + "grad_norm": 0.5704237222671509, + "learning_rate": 1.5078037598644767e-06, + "loss": 0.1568, + "step": 17546 + }, + { + "epoch": 1.6529992228162314, + "grad_norm": 0.6839672923088074, + "learning_rate": 1.5070065070307449e-06, + "loss": 0.2075, + "step": 17547 + }, + { + "epoch": 1.653093426909399, + "grad_norm": 0.6249094605445862, + "learning_rate": 1.506209447850402e-06, + "loss": 0.2364, + "step": 17548 + }, + { + "epoch": 1.653187631002567, + "grad_norm": 0.6789788603782654, + "learning_rate": 1.5054125823416166e-06, + "loss": 0.1876, + "step": 17549 + }, + { + "epoch": 1.653281835095735, + "grad_norm": 0.6458231806755066, + "learning_rate": 1.5046159105225622e-06, + "loss": 0.2022, + "step": 17550 + }, + { + "epoch": 1.6533760391889027, + "grad_norm": 0.6392346620559692, + "learning_rate": 1.5038194324114053e-06, + "loss": 0.2003, + "step": 17551 + }, + { + "epoch": 1.6534702432820705, + "grad_norm": 0.6021639704704285, + "learning_rate": 1.5030231480263024e-06, + "loss": 0.1804, + "step": 17552 + }, + { + "epoch": 1.6535644473752384, + "grad_norm": 0.6376745700836182, + "learning_rate": 1.5022270573854148e-06, + "loss": 0.1921, + "step": 17553 + }, + { + "epoch": 1.6536586514684064, + "grad_norm": 0.6818093061447144, + "learning_rate": 1.5014311605068898e-06, + "loss": 0.2161, + "step": 17554 + }, + { + "epoch": 1.6537528555615741, + "grad_norm": 0.700320303440094, + "learning_rate": 1.50063545740888e-06, + "loss": 0.1857, + "step": 17555 + }, + { + "epoch": 1.6538470596547419, + "grad_norm": 0.6865605711936951, + "learning_rate": 1.4998399481095248e-06, + "loss": 0.1815, + "step": 17556 + }, + { + "epoch": 1.6539412637479098, + "grad_norm": 0.5971400737762451, + "learning_rate": 1.4990446326269637e-06, + "loss": 0.1789, + "step": 17557 + }, + { + "epoch": 1.6540354678410778, + "grad_norm": 1.5523242950439453, + "learning_rate": 1.498249510979335e-06, + "loss": 0.2267, + "step": 17558 + }, + { + "epoch": 1.6541296719342455, + "grad_norm": 0.68794184923172, + "learning_rate": 1.497454583184763e-06, + "loss": 0.1871, + "step": 17559 + }, + { + "epoch": 1.6542238760274133, + "grad_norm": 0.6559624075889587, + "learning_rate": 1.4966598492613759e-06, + "loss": 0.1996, + "step": 17560 + }, + { + "epoch": 1.6543180801205812, + "grad_norm": 0.6656409502029419, + "learning_rate": 1.4958653092272968e-06, + "loss": 0.1851, + "step": 17561 + }, + { + "epoch": 1.6544122842137492, + "grad_norm": 0.6483403444290161, + "learning_rate": 1.4950709631006388e-06, + "loss": 0.2161, + "step": 17562 + }, + { + "epoch": 1.654506488306917, + "grad_norm": 0.580389678478241, + "learning_rate": 1.4942768108995166e-06, + "loss": 0.1693, + "step": 17563 + }, + { + "epoch": 1.6546006924000847, + "grad_norm": 0.6562122106552124, + "learning_rate": 1.4934828526420387e-06, + "loss": 0.1689, + "step": 17564 + }, + { + "epoch": 1.6546948964932526, + "grad_norm": 0.648463785648346, + "learning_rate": 1.4926890883463074e-06, + "loss": 0.1988, + "step": 17565 + }, + { + "epoch": 1.6547891005864206, + "grad_norm": 0.6589934825897217, + "learning_rate": 1.4918955180304173e-06, + "loss": 0.1664, + "step": 17566 + }, + { + "epoch": 1.6548833046795883, + "grad_norm": 0.6080276966094971, + "learning_rate": 1.4911021417124716e-06, + "loss": 0.1703, + "step": 17567 + }, + { + "epoch": 1.654977508772756, + "grad_norm": 0.6568965911865234, + "learning_rate": 1.4903089594105568e-06, + "loss": 0.2019, + "step": 17568 + }, + { + "epoch": 1.655071712865924, + "grad_norm": 0.6023063659667969, + "learning_rate": 1.4895159711427542e-06, + "loss": 0.17, + "step": 17569 + }, + { + "epoch": 1.655165916959092, + "grad_norm": 0.6778880953788757, + "learning_rate": 1.4887231769271526e-06, + "loss": 0.1857, + "step": 17570 + }, + { + "epoch": 1.6552601210522597, + "grad_norm": 0.634365975856781, + "learning_rate": 1.4879305767818264e-06, + "loss": 0.1952, + "step": 17571 + }, + { + "epoch": 1.6553543251454275, + "grad_norm": 0.6282731890678406, + "learning_rate": 1.4871381707248422e-06, + "loss": 0.1807, + "step": 17572 + }, + { + "epoch": 1.6554485292385954, + "grad_norm": 0.7004312872886658, + "learning_rate": 1.4863459587742778e-06, + "loss": 0.2079, + "step": 17573 + }, + { + "epoch": 1.6555427333317634, + "grad_norm": 0.6796080470085144, + "learning_rate": 1.4855539409481922e-06, + "loss": 0.2126, + "step": 17574 + }, + { + "epoch": 1.6556369374249311, + "grad_norm": 0.6595932245254517, + "learning_rate": 1.484762117264642e-06, + "loss": 0.194, + "step": 17575 + }, + { + "epoch": 1.6557311415180989, + "grad_norm": 0.5841668844223022, + "learning_rate": 1.483970487741685e-06, + "loss": 0.1847, + "step": 17576 + }, + { + "epoch": 1.6558253456112668, + "grad_norm": 0.8948422074317932, + "learning_rate": 1.4831790523973733e-06, + "loss": 0.1852, + "step": 17577 + }, + { + "epoch": 1.6559195497044348, + "grad_norm": 0.6995124816894531, + "learning_rate": 1.4823878112497493e-06, + "loss": 0.209, + "step": 17578 + }, + { + "epoch": 1.6560137537976025, + "grad_norm": 0.6554468274116516, + "learning_rate": 1.481596764316855e-06, + "loss": 0.2028, + "step": 17579 + }, + { + "epoch": 1.6561079578907703, + "grad_norm": 0.681593656539917, + "learning_rate": 1.4808059116167306e-06, + "loss": 0.2043, + "step": 17580 + }, + { + "epoch": 1.6562021619839382, + "grad_norm": 0.6716547012329102, + "learning_rate": 1.480015253167404e-06, + "loss": 0.1995, + "step": 17581 + }, + { + "epoch": 1.6562963660771062, + "grad_norm": 0.7072934508323669, + "learning_rate": 1.4792247889869072e-06, + "loss": 0.1853, + "step": 17582 + }, + { + "epoch": 1.656390570170274, + "grad_norm": 0.6691693067550659, + "learning_rate": 1.4784345190932637e-06, + "loss": 0.1829, + "step": 17583 + }, + { + "epoch": 1.6564847742634417, + "grad_norm": 0.6505060791969299, + "learning_rate": 1.4776444435044911e-06, + "loss": 0.1994, + "step": 17584 + }, + { + "epoch": 1.6565789783566096, + "grad_norm": 0.6570205092430115, + "learning_rate": 1.4768545622386066e-06, + "loss": 0.2214, + "step": 17585 + }, + { + "epoch": 1.6566731824497776, + "grad_norm": 0.6212388873100281, + "learning_rate": 1.4760648753136174e-06, + "loss": 0.1803, + "step": 17586 + }, + { + "epoch": 1.6567673865429453, + "grad_norm": 0.7011963129043579, + "learning_rate": 1.475275382747532e-06, + "loss": 0.2393, + "step": 17587 + }, + { + "epoch": 1.656861590636113, + "grad_norm": 0.7012268304824829, + "learning_rate": 1.474486084558353e-06, + "loss": 0.2197, + "step": 17588 + }, + { + "epoch": 1.656955794729281, + "grad_norm": 0.6832666397094727, + "learning_rate": 1.4736969807640744e-06, + "loss": 0.2358, + "step": 17589 + }, + { + "epoch": 1.657049998822449, + "grad_norm": 0.6616160869598389, + "learning_rate": 1.4729080713826938e-06, + "loss": 0.1826, + "step": 17590 + }, + { + "epoch": 1.6571442029156167, + "grad_norm": 0.6552468538284302, + "learning_rate": 1.4721193564321934e-06, + "loss": 0.188, + "step": 17591 + }, + { + "epoch": 1.6572384070087844, + "grad_norm": 0.7042325139045715, + "learning_rate": 1.4713308359305611e-06, + "loss": 0.1911, + "step": 17592 + }, + { + "epoch": 1.6573326111019524, + "grad_norm": 0.6441843509674072, + "learning_rate": 1.470542509895777e-06, + "loss": 0.214, + "step": 17593 + }, + { + "epoch": 1.6574268151951204, + "grad_norm": 0.689139187335968, + "learning_rate": 1.4697543783458124e-06, + "loss": 0.2179, + "step": 17594 + }, + { + "epoch": 1.657521019288288, + "grad_norm": 0.7108054161071777, + "learning_rate": 1.4689664412986437e-06, + "loss": 0.182, + "step": 17595 + }, + { + "epoch": 1.6576152233814558, + "grad_norm": 0.7257113456726074, + "learning_rate": 1.4681786987722302e-06, + "loss": 0.2171, + "step": 17596 + }, + { + "epoch": 1.6577094274746238, + "grad_norm": 0.6485233306884766, + "learning_rate": 1.4673911507845374e-06, + "loss": 0.198, + "step": 17597 + }, + { + "epoch": 1.6578036315677918, + "grad_norm": 0.6282594203948975, + "learning_rate": 1.4666037973535253e-06, + "loss": 0.1832, + "step": 17598 + }, + { + "epoch": 1.6578978356609595, + "grad_norm": 0.6632306575775146, + "learning_rate": 1.4658166384971406e-06, + "loss": 0.1808, + "step": 17599 + }, + { + "epoch": 1.6579920397541272, + "grad_norm": 0.7635298371315002, + "learning_rate": 1.4650296742333347e-06, + "loss": 0.2266, + "step": 17600 + }, + { + "epoch": 1.6580862438472952, + "grad_norm": 0.6387211680412292, + "learning_rate": 1.4642429045800544e-06, + "loss": 0.1981, + "step": 17601 + }, + { + "epoch": 1.6581804479404632, + "grad_norm": 0.7338160276412964, + "learning_rate": 1.463456329555235e-06, + "loss": 0.2161, + "step": 17602 + }, + { + "epoch": 1.658274652033631, + "grad_norm": 0.6713192462921143, + "learning_rate": 1.4626699491768126e-06, + "loss": 0.2033, + "step": 17603 + }, + { + "epoch": 1.6583688561267986, + "grad_norm": 0.6394454836845398, + "learning_rate": 1.4618837634627214e-06, + "loss": 0.2103, + "step": 17604 + }, + { + "epoch": 1.6584630602199666, + "grad_norm": 0.6917247772216797, + "learning_rate": 1.461097772430884e-06, + "loss": 0.2007, + "step": 17605 + }, + { + "epoch": 1.6585572643131345, + "grad_norm": 0.6584296226501465, + "learning_rate": 1.460311976099219e-06, + "loss": 0.2028, + "step": 17606 + }, + { + "epoch": 1.6586514684063023, + "grad_norm": 0.6669653058052063, + "learning_rate": 1.4595263744856524e-06, + "loss": 0.2034, + "step": 17607 + }, + { + "epoch": 1.65874567249947, + "grad_norm": 0.7105797529220581, + "learning_rate": 1.4587409676080932e-06, + "loss": 0.1733, + "step": 17608 + }, + { + "epoch": 1.658839876592638, + "grad_norm": 0.6380971670150757, + "learning_rate": 1.4579557554844437e-06, + "loss": 0.1845, + "step": 17609 + }, + { + "epoch": 1.658934080685806, + "grad_norm": 0.6251140236854553, + "learning_rate": 1.4571707381326184e-06, + "loss": 0.1932, + "step": 17610 + }, + { + "epoch": 1.6590282847789737, + "grad_norm": 0.7100645899772644, + "learning_rate": 1.4563859155705118e-06, + "loss": 0.2093, + "step": 17611 + }, + { + "epoch": 1.6591224888721414, + "grad_norm": 0.6430773138999939, + "learning_rate": 1.4556012878160152e-06, + "loss": 0.2009, + "step": 17612 + }, + { + "epoch": 1.6592166929653094, + "grad_norm": 0.6023344397544861, + "learning_rate": 1.454816854887028e-06, + "loss": 0.1848, + "step": 17613 + }, + { + "epoch": 1.6593108970584773, + "grad_norm": 0.7361556887626648, + "learning_rate": 1.4540326168014318e-06, + "loss": 0.219, + "step": 17614 + }, + { + "epoch": 1.659405101151645, + "grad_norm": 0.6500685811042786, + "learning_rate": 1.4532485735771052e-06, + "loss": 0.2, + "step": 17615 + }, + { + "epoch": 1.6594993052448128, + "grad_norm": 0.6812999248504639, + "learning_rate": 1.4524647252319302e-06, + "loss": 0.201, + "step": 17616 + }, + { + "epoch": 1.6595935093379808, + "grad_norm": 0.6409395933151245, + "learning_rate": 1.4516810717837804e-06, + "loss": 0.2078, + "step": 17617 + }, + { + "epoch": 1.6596877134311487, + "grad_norm": 0.7471181750297546, + "learning_rate": 1.4508976132505204e-06, + "loss": 0.2125, + "step": 17618 + }, + { + "epoch": 1.6597819175243165, + "grad_norm": 0.7450597882270813, + "learning_rate": 1.4501143496500158e-06, + "loss": 0.223, + "step": 17619 + }, + { + "epoch": 1.6598761216174842, + "grad_norm": 0.6431857347488403, + "learning_rate": 1.4493312810001293e-06, + "loss": 0.2345, + "step": 17620 + }, + { + "epoch": 1.6599703257106522, + "grad_norm": 0.6000074148178101, + "learning_rate": 1.4485484073187107e-06, + "loss": 0.1886, + "step": 17621 + }, + { + "epoch": 1.6600645298038201, + "grad_norm": 0.6162887811660767, + "learning_rate": 1.4477657286236135e-06, + "loss": 0.1787, + "step": 17622 + }, + { + "epoch": 1.6601587338969879, + "grad_norm": 0.6685981154441833, + "learning_rate": 1.4469832449326871e-06, + "loss": 0.2122, + "step": 17623 + }, + { + "epoch": 1.6602529379901556, + "grad_norm": 0.6540847420692444, + "learning_rate": 1.4462009562637668e-06, + "loss": 0.172, + "step": 17624 + }, + { + "epoch": 1.6603471420833236, + "grad_norm": 0.6713460087776184, + "learning_rate": 1.4454188626346966e-06, + "loss": 0.193, + "step": 17625 + }, + { + "epoch": 1.6604413461764915, + "grad_norm": 0.6472384929656982, + "learning_rate": 1.444636964063303e-06, + "loss": 0.2031, + "step": 17626 + }, + { + "epoch": 1.6605355502696593, + "grad_norm": 0.6349501609802246, + "learning_rate": 1.4438552605674182e-06, + "loss": 0.1907, + "step": 17627 + }, + { + "epoch": 1.660629754362827, + "grad_norm": 0.6609911322593689, + "learning_rate": 1.4430737521648685e-06, + "loss": 0.2009, + "step": 17628 + }, + { + "epoch": 1.660723958455995, + "grad_norm": 0.6586636304855347, + "learning_rate": 1.4422924388734682e-06, + "loss": 0.2147, + "step": 17629 + }, + { + "epoch": 1.660818162549163, + "grad_norm": 0.7303486466407776, + "learning_rate": 1.4415113207110376e-06, + "loss": 0.1848, + "step": 17630 + }, + { + "epoch": 1.6609123666423307, + "grad_norm": 0.6686525344848633, + "learning_rate": 1.4407303976953824e-06, + "loss": 0.1884, + "step": 17631 + }, + { + "epoch": 1.6610065707354984, + "grad_norm": 0.6891011595726013, + "learning_rate": 1.4399496698443104e-06, + "loss": 0.2007, + "step": 17632 + }, + { + "epoch": 1.6611007748286664, + "grad_norm": 0.6761496663093567, + "learning_rate": 1.4391691371756277e-06, + "loss": 0.1793, + "step": 17633 + }, + { + "epoch": 1.6611949789218343, + "grad_norm": 0.6334503889083862, + "learning_rate": 1.4383887997071255e-06, + "loss": 0.1817, + "step": 17634 + }, + { + "epoch": 1.6612891830150018, + "grad_norm": 0.6404047608375549, + "learning_rate": 1.4376086574566018e-06, + "loss": 0.1947, + "step": 17635 + }, + { + "epoch": 1.6613833871081698, + "grad_norm": 0.6635481715202332, + "learning_rate": 1.436828710441841e-06, + "loss": 0.2029, + "step": 17636 + }, + { + "epoch": 1.6614775912013378, + "grad_norm": 0.7378470301628113, + "learning_rate": 1.436048958680628e-06, + "loss": 0.2121, + "step": 17637 + }, + { + "epoch": 1.6615717952945055, + "grad_norm": 0.7180721759796143, + "learning_rate": 1.4352694021907455e-06, + "loss": 0.199, + "step": 17638 + }, + { + "epoch": 1.6616659993876732, + "grad_norm": 0.6997587084770203, + "learning_rate": 1.4344900409899643e-06, + "loss": 0.2158, + "step": 17639 + }, + { + "epoch": 1.6617602034808412, + "grad_norm": 0.6561702489852905, + "learning_rate": 1.433710875096057e-06, + "loss": 0.1951, + "step": 17640 + }, + { + "epoch": 1.6618544075740092, + "grad_norm": 0.6174152493476868, + "learning_rate": 1.432931904526792e-06, + "loss": 0.1697, + "step": 17641 + }, + { + "epoch": 1.661948611667177, + "grad_norm": 0.5758946537971497, + "learning_rate": 1.4321531292999269e-06, + "loss": 0.1786, + "step": 17642 + }, + { + "epoch": 1.6620428157603446, + "grad_norm": 0.9768653512001038, + "learning_rate": 1.43137454943322e-06, + "loss": 0.1608, + "step": 17643 + }, + { + "epoch": 1.6621370198535126, + "grad_norm": 0.6561266779899597, + "learning_rate": 1.4305961649444277e-06, + "loss": 0.2116, + "step": 17644 + }, + { + "epoch": 1.6622312239466805, + "grad_norm": 0.6503586769104004, + "learning_rate": 1.4298179758512954e-06, + "loss": 0.1835, + "step": 17645 + }, + { + "epoch": 1.6623254280398483, + "grad_norm": 0.6128221154212952, + "learning_rate": 1.429039982171563e-06, + "loss": 0.1791, + "step": 17646 + }, + { + "epoch": 1.662419632133016, + "grad_norm": 0.69385826587677, + "learning_rate": 1.4282621839229793e-06, + "loss": 0.2068, + "step": 17647 + }, + { + "epoch": 1.662513836226184, + "grad_norm": 0.6319029927253723, + "learning_rate": 1.427484581123274e-06, + "loss": 0.1791, + "step": 17648 + }, + { + "epoch": 1.662608040319352, + "grad_norm": 0.6989104151725769, + "learning_rate": 1.4267071737901728e-06, + "loss": 0.1845, + "step": 17649 + }, + { + "epoch": 1.6627022444125197, + "grad_norm": 0.5964011549949646, + "learning_rate": 1.4259299619414114e-06, + "loss": 0.17, + "step": 17650 + }, + { + "epoch": 1.6627964485056874, + "grad_norm": 0.7641205191612244, + "learning_rate": 1.4251529455947078e-06, + "loss": 0.1687, + "step": 17651 + }, + { + "epoch": 1.6628906525988554, + "grad_norm": 0.6229805946350098, + "learning_rate": 1.4243761247677734e-06, + "loss": 0.1904, + "step": 17652 + }, + { + "epoch": 1.6629848566920233, + "grad_norm": 0.6460482478141785, + "learning_rate": 1.4235994994783297e-06, + "loss": 0.1899, + "step": 17653 + }, + { + "epoch": 1.663079060785191, + "grad_norm": 0.6331815123558044, + "learning_rate": 1.4228230697440815e-06, + "loss": 0.2021, + "step": 17654 + }, + { + "epoch": 1.6631732648783588, + "grad_norm": 0.6770843863487244, + "learning_rate": 1.42204683558273e-06, + "loss": 0.2061, + "step": 17655 + }, + { + "epoch": 1.6632674689715268, + "grad_norm": 0.6638312935829163, + "learning_rate": 1.4212707970119765e-06, + "loss": 0.1951, + "step": 17656 + }, + { + "epoch": 1.6633616730646947, + "grad_norm": 0.693246066570282, + "learning_rate": 1.4204949540495183e-06, + "loss": 0.1961, + "step": 17657 + }, + { + "epoch": 1.6634558771578625, + "grad_norm": 0.6464352607727051, + "learning_rate": 1.4197193067130422e-06, + "loss": 0.1924, + "step": 17658 + }, + { + "epoch": 1.6635500812510302, + "grad_norm": 0.5881369113922119, + "learning_rate": 1.418943855020235e-06, + "loss": 0.1694, + "step": 17659 + }, + { + "epoch": 1.6636442853441982, + "grad_norm": 0.6073905229568481, + "learning_rate": 1.4181685989887806e-06, + "loss": 0.1885, + "step": 17660 + }, + { + "epoch": 1.6637384894373661, + "grad_norm": 0.7137670516967773, + "learning_rate": 1.4173935386363524e-06, + "loss": 0.2168, + "step": 17661 + }, + { + "epoch": 1.6638326935305339, + "grad_norm": 0.6231116056442261, + "learning_rate": 1.4166186739806242e-06, + "loss": 0.2214, + "step": 17662 + }, + { + "epoch": 1.6639268976237016, + "grad_norm": 0.6087729930877686, + "learning_rate": 1.4158440050392675e-06, + "loss": 0.1817, + "step": 17663 + }, + { + "epoch": 1.6640211017168696, + "grad_norm": 0.5989171862602234, + "learning_rate": 1.4150695318299412e-06, + "loss": 0.1787, + "step": 17664 + }, + { + "epoch": 1.6641153058100375, + "grad_norm": 0.665279746055603, + "learning_rate": 1.414295254370308e-06, + "loss": 0.1869, + "step": 17665 + }, + { + "epoch": 1.6642095099032053, + "grad_norm": 0.6534229516983032, + "learning_rate": 1.4135211726780196e-06, + "loss": 0.1715, + "step": 17666 + }, + { + "epoch": 1.664303713996373, + "grad_norm": 0.6989510655403137, + "learning_rate": 1.412747286770727e-06, + "loss": 0.1934, + "step": 17667 + }, + { + "epoch": 1.664397918089541, + "grad_norm": 0.6644416451454163, + "learning_rate": 1.4119735966660797e-06, + "loss": 0.1865, + "step": 17668 + }, + { + "epoch": 1.664492122182709, + "grad_norm": 0.6417335867881775, + "learning_rate": 1.4112001023817123e-06, + "loss": 0.207, + "step": 17669 + }, + { + "epoch": 1.6645863262758767, + "grad_norm": 0.6809008121490479, + "learning_rate": 1.4104268039352686e-06, + "loss": 0.1932, + "step": 17670 + }, + { + "epoch": 1.6646805303690444, + "grad_norm": 0.6541202068328857, + "learning_rate": 1.4096537013443755e-06, + "loss": 0.1886, + "step": 17671 + }, + { + "epoch": 1.6647747344622124, + "grad_norm": 0.7825481295585632, + "learning_rate": 1.4088807946266635e-06, + "loss": 0.2121, + "step": 17672 + }, + { + "epoch": 1.6648689385553803, + "grad_norm": 0.8834565877914429, + "learning_rate": 1.4081080837997585e-06, + "loss": 0.2159, + "step": 17673 + }, + { + "epoch": 1.664963142648548, + "grad_norm": 0.6058242321014404, + "learning_rate": 1.407335568881273e-06, + "loss": 0.1838, + "step": 17674 + }, + { + "epoch": 1.6650573467417158, + "grad_norm": 0.7065714597702026, + "learning_rate": 1.4065632498888294e-06, + "loss": 0.2108, + "step": 17675 + }, + { + "epoch": 1.6651515508348838, + "grad_norm": 0.6951255202293396, + "learning_rate": 1.4057911268400305e-06, + "loss": 0.1868, + "step": 17676 + }, + { + "epoch": 1.6652457549280517, + "grad_norm": 0.6267175674438477, + "learning_rate": 1.4050191997524843e-06, + "loss": 0.1854, + "step": 17677 + }, + { + "epoch": 1.6653399590212195, + "grad_norm": 0.7287028431892395, + "learning_rate": 1.4042474686437957e-06, + "loss": 0.2042, + "step": 17678 + }, + { + "epoch": 1.6654341631143872, + "grad_norm": 0.6523891091346741, + "learning_rate": 1.4034759335315562e-06, + "loss": 0.2005, + "step": 17679 + }, + { + "epoch": 1.6655283672075551, + "grad_norm": 0.6233696937561035, + "learning_rate": 1.4027045944333606e-06, + "loss": 0.1923, + "step": 17680 + }, + { + "epoch": 1.665622571300723, + "grad_norm": 0.6110519766807556, + "learning_rate": 1.401933451366798e-06, + "loss": 0.1944, + "step": 17681 + }, + { + "epoch": 1.6657167753938908, + "grad_norm": 0.6156954765319824, + "learning_rate": 1.401162504349448e-06, + "loss": 0.1741, + "step": 17682 + }, + { + "epoch": 1.6658109794870586, + "grad_norm": 0.6730902194976807, + "learning_rate": 1.400391753398891e-06, + "loss": 0.1828, + "step": 17683 + }, + { + "epoch": 1.6659051835802265, + "grad_norm": 0.7032499313354492, + "learning_rate": 1.3996211985327034e-06, + "loss": 0.1695, + "step": 17684 + }, + { + "epoch": 1.6659993876733945, + "grad_norm": 0.6364924311637878, + "learning_rate": 1.398850839768454e-06, + "loss": 0.1632, + "step": 17685 + }, + { + "epoch": 1.6660935917665622, + "grad_norm": 0.664461076259613, + "learning_rate": 1.398080677123702e-06, + "loss": 0.1858, + "step": 17686 + }, + { + "epoch": 1.66618779585973, + "grad_norm": 0.5921967625617981, + "learning_rate": 1.397310710616019e-06, + "loss": 0.167, + "step": 17687 + }, + { + "epoch": 1.666281999952898, + "grad_norm": 0.6417059302330017, + "learning_rate": 1.396540940262956e-06, + "loss": 0.1914, + "step": 17688 + }, + { + "epoch": 1.666376204046066, + "grad_norm": 0.6602506637573242, + "learning_rate": 1.3957713660820605e-06, + "loss": 0.1865, + "step": 17689 + }, + { + "epoch": 1.6664704081392336, + "grad_norm": 0.669553279876709, + "learning_rate": 1.3950019880908894e-06, + "loss": 0.1869, + "step": 17690 + }, + { + "epoch": 1.6665646122324014, + "grad_norm": 0.7122140526771545, + "learning_rate": 1.39423280630698e-06, + "loss": 0.2258, + "step": 17691 + }, + { + "epoch": 1.6666588163255693, + "grad_norm": 0.6299037337303162, + "learning_rate": 1.393463820747868e-06, + "loss": 0.1681, + "step": 17692 + }, + { + "epoch": 1.6667530204187373, + "grad_norm": 0.6788545250892639, + "learning_rate": 1.392695031431095e-06, + "loss": 0.2038, + "step": 17693 + }, + { + "epoch": 1.666847224511905, + "grad_norm": 0.630938708782196, + "learning_rate": 1.3919264383741871e-06, + "loss": 0.2085, + "step": 17694 + }, + { + "epoch": 1.6669414286050728, + "grad_norm": 0.7515336275100708, + "learning_rate": 1.3911580415946667e-06, + "loss": 0.2029, + "step": 17695 + }, + { + "epoch": 1.6670356326982407, + "grad_norm": 0.6248697638511658, + "learning_rate": 1.3903898411100568e-06, + "loss": 0.1826, + "step": 17696 + }, + { + "epoch": 1.6671298367914087, + "grad_norm": 0.6295955181121826, + "learning_rate": 1.3896218369378756e-06, + "loss": 0.1907, + "step": 17697 + }, + { + "epoch": 1.6672240408845764, + "grad_norm": 0.5748365521430969, + "learning_rate": 1.388854029095631e-06, + "loss": 0.1632, + "step": 17698 + }, + { + "epoch": 1.6673182449777442, + "grad_norm": 0.6384146809577942, + "learning_rate": 1.3880864176008312e-06, + "loss": 0.166, + "step": 17699 + }, + { + "epoch": 1.6674124490709121, + "grad_norm": 0.6905065178871155, + "learning_rate": 1.3873190024709816e-06, + "loss": 0.1982, + "step": 17700 + }, + { + "epoch": 1.66750665316408, + "grad_norm": 0.7195608019828796, + "learning_rate": 1.3865517837235776e-06, + "loss": 0.1983, + "step": 17701 + }, + { + "epoch": 1.6676008572572478, + "grad_norm": 0.6161376237869263, + "learning_rate": 1.3857847613761134e-06, + "loss": 0.204, + "step": 17702 + }, + { + "epoch": 1.6676950613504156, + "grad_norm": 0.6908882260322571, + "learning_rate": 1.3850179354460802e-06, + "loss": 0.2004, + "step": 17703 + }, + { + "epoch": 1.6677892654435835, + "grad_norm": 0.6385560631752014, + "learning_rate": 1.38425130595096e-06, + "loss": 0.1816, + "step": 17704 + }, + { + "epoch": 1.6678834695367515, + "grad_norm": 0.6749087572097778, + "learning_rate": 1.383484872908236e-06, + "loss": 0.166, + "step": 17705 + }, + { + "epoch": 1.6679776736299192, + "grad_norm": 0.6486237645149231, + "learning_rate": 1.3827186363353818e-06, + "loss": 0.2061, + "step": 17706 + }, + { + "epoch": 1.668071877723087, + "grad_norm": 0.6470707654953003, + "learning_rate": 1.3819525962498703e-06, + "loss": 0.1921, + "step": 17707 + }, + { + "epoch": 1.668166081816255, + "grad_norm": 0.6324625611305237, + "learning_rate": 1.3811867526691668e-06, + "loss": 0.1921, + "step": 17708 + }, + { + "epoch": 1.6682602859094229, + "grad_norm": 0.6587686538696289, + "learning_rate": 1.3804211056107332e-06, + "loss": 0.2075, + "step": 17709 + }, + { + "epoch": 1.6683544900025906, + "grad_norm": 0.7326374053955078, + "learning_rate": 1.3796556550920315e-06, + "loss": 0.1837, + "step": 17710 + }, + { + "epoch": 1.6684486940957584, + "grad_norm": 0.6420719027519226, + "learning_rate": 1.3788904011305105e-06, + "loss": 0.1979, + "step": 17711 + }, + { + "epoch": 1.6685428981889263, + "grad_norm": 0.6310940384864807, + "learning_rate": 1.3781253437436216e-06, + "loss": 0.2227, + "step": 17712 + }, + { + "epoch": 1.6686371022820943, + "grad_norm": 0.6826558709144592, + "learning_rate": 1.3773604829488107e-06, + "loss": 0.2038, + "step": 17713 + }, + { + "epoch": 1.668731306375262, + "grad_norm": 0.6317348480224609, + "learning_rate": 1.3765958187635131e-06, + "loss": 0.149, + "step": 17714 + }, + { + "epoch": 1.6688255104684298, + "grad_norm": 0.6829084753990173, + "learning_rate": 1.3758313512051702e-06, + "loss": 0.2007, + "step": 17715 + }, + { + "epoch": 1.6689197145615977, + "grad_norm": 0.6108052134513855, + "learning_rate": 1.375067080291208e-06, + "loss": 0.1698, + "step": 17716 + }, + { + "epoch": 1.6690139186547657, + "grad_norm": 0.7161389589309692, + "learning_rate": 1.374303006039054e-06, + "loss": 0.2011, + "step": 17717 + }, + { + "epoch": 1.6691081227479334, + "grad_norm": 0.6809845566749573, + "learning_rate": 1.3735391284661348e-06, + "loss": 0.2201, + "step": 17718 + }, + { + "epoch": 1.6692023268411011, + "grad_norm": 0.7134252786636353, + "learning_rate": 1.3727754475898615e-06, + "loss": 0.1986, + "step": 17719 + }, + { + "epoch": 1.669296530934269, + "grad_norm": 0.6815935373306274, + "learning_rate": 1.3720119634276507e-06, + "loss": 0.1852, + "step": 17720 + }, + { + "epoch": 1.669390735027437, + "grad_norm": 0.6288490891456604, + "learning_rate": 1.3712486759969124e-06, + "loss": 0.1788, + "step": 17721 + }, + { + "epoch": 1.6694849391206048, + "grad_norm": 0.6670456528663635, + "learning_rate": 1.3704855853150467e-06, + "loss": 0.2324, + "step": 17722 + }, + { + "epoch": 1.6695791432137725, + "grad_norm": 0.6168465614318848, + "learning_rate": 1.3697226913994555e-06, + "loss": 0.1728, + "step": 17723 + }, + { + "epoch": 1.6696733473069405, + "grad_norm": 0.6151394248008728, + "learning_rate": 1.368959994267537e-06, + "loss": 0.17, + "step": 17724 + }, + { + "epoch": 1.6697675514001085, + "grad_norm": 0.6413464546203613, + "learning_rate": 1.3681974939366772e-06, + "loss": 0.1906, + "step": 17725 + }, + { + "epoch": 1.6698617554932762, + "grad_norm": 0.715067982673645, + "learning_rate": 1.367435190424261e-06, + "loss": 0.2071, + "step": 17726 + }, + { + "epoch": 1.669955959586444, + "grad_norm": 0.6484623551368713, + "learning_rate": 1.366673083747676e-06, + "loss": 0.1819, + "step": 17727 + }, + { + "epoch": 1.670050163679612, + "grad_norm": 0.6648810505867004, + "learning_rate": 1.3659111739242981e-06, + "loss": 0.2219, + "step": 17728 + }, + { + "epoch": 1.6701443677727799, + "grad_norm": 0.6700533032417297, + "learning_rate": 1.365149460971492e-06, + "loss": 0.1767, + "step": 17729 + }, + { + "epoch": 1.6702385718659476, + "grad_norm": 0.6849918365478516, + "learning_rate": 1.3643879449066377e-06, + "loss": 0.1883, + "step": 17730 + }, + { + "epoch": 1.6703327759591153, + "grad_norm": 0.7058325409889221, + "learning_rate": 1.363626625747093e-06, + "loss": 0.2169, + "step": 17731 + }, + { + "epoch": 1.6704269800522833, + "grad_norm": 0.5893673896789551, + "learning_rate": 1.3628655035102167e-06, + "loss": 0.1624, + "step": 17732 + }, + { + "epoch": 1.6705211841454513, + "grad_norm": 0.6795918345451355, + "learning_rate": 1.3621045782133635e-06, + "loss": 0.2165, + "step": 17733 + }, + { + "epoch": 1.670615388238619, + "grad_norm": 0.8626487851142883, + "learning_rate": 1.3613438498738861e-06, + "loss": 0.2098, + "step": 17734 + }, + { + "epoch": 1.6707095923317867, + "grad_norm": 0.6874573826789856, + "learning_rate": 1.3605833185091278e-06, + "loss": 0.1691, + "step": 17735 + }, + { + "epoch": 1.6708037964249547, + "grad_norm": 0.6788274645805359, + "learning_rate": 1.3598229841364308e-06, + "loss": 0.1944, + "step": 17736 + }, + { + "epoch": 1.6708980005181227, + "grad_norm": 0.7230943441390991, + "learning_rate": 1.3590628467731337e-06, + "loss": 0.1851, + "step": 17737 + }, + { + "epoch": 1.6709922046112904, + "grad_norm": 0.888152003288269, + "learning_rate": 1.358302906436566e-06, + "loss": 0.1855, + "step": 17738 + }, + { + "epoch": 1.6710864087044581, + "grad_norm": 0.6920692920684814, + "learning_rate": 1.357543163144056e-06, + "loss": 0.2027, + "step": 17739 + }, + { + "epoch": 1.671180612797626, + "grad_norm": 0.6482422947883606, + "learning_rate": 1.35678361691293e-06, + "loss": 0.1988, + "step": 17740 + }, + { + "epoch": 1.671274816890794, + "grad_norm": 0.762971043586731, + "learning_rate": 1.3560242677605018e-06, + "loss": 0.2051, + "step": 17741 + }, + { + "epoch": 1.6713690209839618, + "grad_norm": 0.6341733336448669, + "learning_rate": 1.3552651157040897e-06, + "loss": 0.1822, + "step": 17742 + }, + { + "epoch": 1.6714632250771295, + "grad_norm": 0.7378972768783569, + "learning_rate": 1.3545061607610032e-06, + "loss": 0.2038, + "step": 17743 + }, + { + "epoch": 1.6715574291702975, + "grad_norm": 0.6339300274848938, + "learning_rate": 1.3537474029485452e-06, + "loss": 0.1796, + "step": 17744 + }, + { + "epoch": 1.6716516332634654, + "grad_norm": 0.7358366847038269, + "learning_rate": 1.3529888422840199e-06, + "loss": 0.2064, + "step": 17745 + }, + { + "epoch": 1.6717458373566332, + "grad_norm": 0.697050929069519, + "learning_rate": 1.3522304787847197e-06, + "loss": 0.2048, + "step": 17746 + }, + { + "epoch": 1.671840041449801, + "grad_norm": 0.630322277545929, + "learning_rate": 1.351472312467942e-06, + "loss": 0.2001, + "step": 17747 + }, + { + "epoch": 1.6719342455429689, + "grad_norm": 1.134198784828186, + "learning_rate": 1.3507143433509673e-06, + "loss": 0.22, + "step": 17748 + }, + { + "epoch": 1.6720284496361368, + "grad_norm": 0.6161059737205505, + "learning_rate": 1.349956571451082e-06, + "loss": 0.1805, + "step": 17749 + }, + { + "epoch": 1.6721226537293046, + "grad_norm": 0.6653415560722351, + "learning_rate": 1.3491989967855668e-06, + "loss": 0.185, + "step": 17750 + }, + { + "epoch": 1.6722168578224723, + "grad_norm": 0.7343589067459106, + "learning_rate": 1.348441619371691e-06, + "loss": 0.1658, + "step": 17751 + }, + { + "epoch": 1.6723110619156403, + "grad_norm": 0.6304425001144409, + "learning_rate": 1.3476844392267264e-06, + "loss": 0.1738, + "step": 17752 + }, + { + "epoch": 1.6724052660088082, + "grad_norm": 0.599528431892395, + "learning_rate": 1.3469274563679401e-06, + "loss": 0.1587, + "step": 17753 + }, + { + "epoch": 1.672499470101976, + "grad_norm": 0.7063929438591003, + "learning_rate": 1.3461706708125878e-06, + "loss": 0.181, + "step": 17754 + }, + { + "epoch": 1.6725936741951437, + "grad_norm": 0.732979953289032, + "learning_rate": 1.3454140825779294e-06, + "loss": 0.2248, + "step": 17755 + }, + { + "epoch": 1.6726878782883117, + "grad_norm": 0.7059483528137207, + "learning_rate": 1.3446576916812116e-06, + "loss": 0.2099, + "step": 17756 + }, + { + "epoch": 1.6727820823814796, + "grad_norm": 0.6447253823280334, + "learning_rate": 1.3439014981396847e-06, + "loss": 0.2043, + "step": 17757 + }, + { + "epoch": 1.6728762864746474, + "grad_norm": 0.6934409737586975, + "learning_rate": 1.343145501970593e-06, + "loss": 0.1824, + "step": 17758 + }, + { + "epoch": 1.672970490567815, + "grad_norm": 0.6274111866950989, + "learning_rate": 1.3423897031911703e-06, + "loss": 0.184, + "step": 17759 + }, + { + "epoch": 1.673064694660983, + "grad_norm": 0.6619078516960144, + "learning_rate": 1.3416341018186507e-06, + "loss": 0.1882, + "step": 17760 + }, + { + "epoch": 1.673158898754151, + "grad_norm": 0.7142669558525085, + "learning_rate": 1.3408786978702671e-06, + "loss": 0.196, + "step": 17761 + }, + { + "epoch": 1.6732531028473188, + "grad_norm": 0.6544268131256104, + "learning_rate": 1.3401234913632377e-06, + "loss": 0.1908, + "step": 17762 + }, + { + "epoch": 1.6733473069404865, + "grad_norm": 0.6190822124481201, + "learning_rate": 1.3393684823147857e-06, + "loss": 0.1853, + "step": 17763 + }, + { + "epoch": 1.6734415110336545, + "grad_norm": 0.7447613477706909, + "learning_rate": 1.3386136707421281e-06, + "loss": 0.2173, + "step": 17764 + }, + { + "epoch": 1.6735357151268224, + "grad_norm": 0.6771336197853088, + "learning_rate": 1.337859056662475e-06, + "loss": 0.1845, + "step": 17765 + }, + { + "epoch": 1.6736299192199902, + "grad_norm": 0.678631603717804, + "learning_rate": 1.3371046400930266e-06, + "loss": 0.2036, + "step": 17766 + }, + { + "epoch": 1.673724123313158, + "grad_norm": 0.7423925995826721, + "learning_rate": 1.3363504210509936e-06, + "loss": 0.2199, + "step": 17767 + }, + { + "epoch": 1.6738183274063259, + "grad_norm": 0.6490384936332703, + "learning_rate": 1.3355963995535703e-06, + "loss": 0.1887, + "step": 17768 + }, + { + "epoch": 1.6739125314994938, + "grad_norm": 0.6802162528038025, + "learning_rate": 1.3348425756179439e-06, + "loss": 0.1708, + "step": 17769 + }, + { + "epoch": 1.6740067355926616, + "grad_norm": 0.6522113680839539, + "learning_rate": 1.334088949261313e-06, + "loss": 0.1977, + "step": 17770 + }, + { + "epoch": 1.6741009396858293, + "grad_norm": 0.631261944770813, + "learning_rate": 1.3333355205008558e-06, + "loss": 0.1832, + "step": 17771 + }, + { + "epoch": 1.6741951437789973, + "grad_norm": 0.7070450782775879, + "learning_rate": 1.33258228935375e-06, + "loss": 0.1863, + "step": 17772 + }, + { + "epoch": 1.674289347872165, + "grad_norm": 0.6928162574768066, + "learning_rate": 1.3318292558371714e-06, + "loss": 0.1836, + "step": 17773 + }, + { + "epoch": 1.6743835519653327, + "grad_norm": 0.6214320063591003, + "learning_rate": 1.3310764199682946e-06, + "loss": 0.2011, + "step": 17774 + }, + { + "epoch": 1.6744777560585007, + "grad_norm": 0.6680625081062317, + "learning_rate": 1.3303237817642789e-06, + "loss": 0.1837, + "step": 17775 + }, + { + "epoch": 1.6745719601516686, + "grad_norm": 0.623860776424408, + "learning_rate": 1.3295713412422907e-06, + "loss": 0.1836, + "step": 17776 + }, + { + "epoch": 1.6746661642448364, + "grad_norm": 0.5853019952774048, + "learning_rate": 1.328819098419486e-06, + "loss": 0.1886, + "step": 17777 + }, + { + "epoch": 1.6747603683380041, + "grad_norm": 0.675097644329071, + "learning_rate": 1.3280670533130135e-06, + "loss": 0.1737, + "step": 17778 + }, + { + "epoch": 1.674854572431172, + "grad_norm": 0.7148059010505676, + "learning_rate": 1.327315205940024e-06, + "loss": 0.2086, + "step": 17779 + }, + { + "epoch": 1.67494877652434, + "grad_norm": 0.6851511597633362, + "learning_rate": 1.3265635563176626e-06, + "loss": 0.1969, + "step": 17780 + }, + { + "epoch": 1.6750429806175078, + "grad_norm": 0.6803012490272522, + "learning_rate": 1.3258121044630633e-06, + "loss": 0.1722, + "step": 17781 + }, + { + "epoch": 1.6751371847106755, + "grad_norm": 0.6636247634887695, + "learning_rate": 1.3250608503933637e-06, + "loss": 0.1943, + "step": 17782 + }, + { + "epoch": 1.6752313888038435, + "grad_norm": 0.6178423166275024, + "learning_rate": 1.3243097941256943e-06, + "loss": 0.1779, + "step": 17783 + }, + { + "epoch": 1.6753255928970114, + "grad_norm": 0.7024008631706238, + "learning_rate": 1.3235589356771771e-06, + "loss": 0.2151, + "step": 17784 + }, + { + "epoch": 1.6754197969901792, + "grad_norm": 0.6634231805801392, + "learning_rate": 1.3228082750649362e-06, + "loss": 0.1846, + "step": 17785 + }, + { + "epoch": 1.675514001083347, + "grad_norm": 0.7389572262763977, + "learning_rate": 1.3220578123060845e-06, + "loss": 0.2154, + "step": 17786 + }, + { + "epoch": 1.6756082051765149, + "grad_norm": 0.6536784172058105, + "learning_rate": 1.3213075474177373e-06, + "loss": 0.1915, + "step": 17787 + }, + { + "epoch": 1.6757024092696828, + "grad_norm": 0.6692500114440918, + "learning_rate": 1.3205574804169973e-06, + "loss": 0.201, + "step": 17788 + }, + { + "epoch": 1.6757966133628506, + "grad_norm": 0.6383397579193115, + "learning_rate": 1.3198076113209713e-06, + "loss": 0.1718, + "step": 17789 + }, + { + "epoch": 1.6758908174560183, + "grad_norm": 0.6334207057952881, + "learning_rate": 1.3190579401467562e-06, + "loss": 0.2114, + "step": 17790 + }, + { + "epoch": 1.6759850215491863, + "grad_norm": 0.7100887894630432, + "learning_rate": 1.3183084669114442e-06, + "loss": 0.193, + "step": 17791 + }, + { + "epoch": 1.6760792256423542, + "grad_norm": 0.6818283796310425, + "learning_rate": 1.317559191632125e-06, + "loss": 0.2141, + "step": 17792 + }, + { + "epoch": 1.676173429735522, + "grad_norm": 0.644614040851593, + "learning_rate": 1.3168101143258871e-06, + "loss": 0.1761, + "step": 17793 + }, + { + "epoch": 1.6762676338286897, + "grad_norm": 0.5869244933128357, + "learning_rate": 1.3160612350098045e-06, + "loss": 0.1881, + "step": 17794 + }, + { + "epoch": 1.6763618379218577, + "grad_norm": 0.7334598302841187, + "learning_rate": 1.3153125537009581e-06, + "loss": 0.2073, + "step": 17795 + }, + { + "epoch": 1.6764560420150256, + "grad_norm": 0.6654959917068481, + "learning_rate": 1.3145640704164143e-06, + "loss": 0.1905, + "step": 17796 + }, + { + "epoch": 1.6765502461081934, + "grad_norm": 0.6341982483863831, + "learning_rate": 1.3138157851732424e-06, + "loss": 0.1922, + "step": 17797 + }, + { + "epoch": 1.676644450201361, + "grad_norm": 0.6961618661880493, + "learning_rate": 1.3130676979885059e-06, + "loss": 0.1833, + "step": 17798 + }, + { + "epoch": 1.676738654294529, + "grad_norm": 0.7072240710258484, + "learning_rate": 1.3123198088792577e-06, + "loss": 0.1792, + "step": 17799 + }, + { + "epoch": 1.676832858387697, + "grad_norm": 0.6211265921592712, + "learning_rate": 1.311572117862554e-06, + "loss": 0.1976, + "step": 17800 + }, + { + "epoch": 1.6769270624808648, + "grad_norm": 0.7154965996742249, + "learning_rate": 1.3108246249554457e-06, + "loss": 0.1953, + "step": 17801 + }, + { + "epoch": 1.6770212665740325, + "grad_norm": 0.6488645076751709, + "learning_rate": 1.3100773301749715e-06, + "loss": 0.2021, + "step": 17802 + }, + { + "epoch": 1.6771154706672005, + "grad_norm": 0.6766074895858765, + "learning_rate": 1.309330233538173e-06, + "loss": 0.1749, + "step": 17803 + }, + { + "epoch": 1.6772096747603684, + "grad_norm": 0.6735768914222717, + "learning_rate": 1.308583335062088e-06, + "loss": 0.1853, + "step": 17804 + }, + { + "epoch": 1.6773038788535362, + "grad_norm": 0.6607761383056641, + "learning_rate": 1.307836634763745e-06, + "loss": 0.1873, + "step": 17805 + }, + { + "epoch": 1.677398082946704, + "grad_norm": 0.7383633255958557, + "learning_rate": 1.307090132660165e-06, + "loss": 0.1977, + "step": 17806 + }, + { + "epoch": 1.6774922870398719, + "grad_norm": 0.6929439902305603, + "learning_rate": 1.306343828768377e-06, + "loss": 0.181, + "step": 17807 + }, + { + "epoch": 1.6775864911330398, + "grad_norm": 0.5572612881660461, + "learning_rate": 1.3055977231053962e-06, + "loss": 0.1931, + "step": 17808 + }, + { + "epoch": 1.6776806952262076, + "grad_norm": 0.7010521292686462, + "learning_rate": 1.3048518156882285e-06, + "loss": 0.2156, + "step": 17809 + }, + { + "epoch": 1.6777748993193753, + "grad_norm": 0.6510460376739502, + "learning_rate": 1.30410610653389e-06, + "loss": 0.1952, + "step": 17810 + }, + { + "epoch": 1.6778691034125433, + "grad_norm": 0.6749451756477356, + "learning_rate": 1.303360595659382e-06, + "loss": 0.2302, + "step": 17811 + }, + { + "epoch": 1.6779633075057112, + "grad_norm": 0.6118618249893188, + "learning_rate": 1.3026152830816996e-06, + "loss": 0.1855, + "step": 17812 + }, + { + "epoch": 1.678057511598879, + "grad_norm": 0.5943413972854614, + "learning_rate": 1.3018701688178393e-06, + "loss": 0.1621, + "step": 17813 + }, + { + "epoch": 1.6781517156920467, + "grad_norm": 0.696884036064148, + "learning_rate": 1.301125252884794e-06, + "loss": 0.2018, + "step": 17814 + }, + { + "epoch": 1.6782459197852146, + "grad_norm": 0.6266376972198486, + "learning_rate": 1.3003805352995425e-06, + "loss": 0.1885, + "step": 17815 + }, + { + "epoch": 1.6783401238783826, + "grad_norm": 0.6385643482208252, + "learning_rate": 1.29963601607907e-06, + "loss": 0.1706, + "step": 17816 + }, + { + "epoch": 1.6784343279715503, + "grad_norm": 0.7097641825675964, + "learning_rate": 1.298891695240354e-06, + "loss": 0.2586, + "step": 17817 + }, + { + "epoch": 1.678528532064718, + "grad_norm": 0.6582649350166321, + "learning_rate": 1.2981475728003611e-06, + "loss": 0.2036, + "step": 17818 + }, + { + "epoch": 1.678622736157886, + "grad_norm": 0.6356112360954285, + "learning_rate": 1.297403648776061e-06, + "loss": 0.1818, + "step": 17819 + }, + { + "epoch": 1.678716940251054, + "grad_norm": 0.721987783908844, + "learning_rate": 1.296659923184419e-06, + "loss": 0.2498, + "step": 17820 + }, + { + "epoch": 1.6788111443442217, + "grad_norm": 0.6707038283348083, + "learning_rate": 1.2959163960423893e-06, + "loss": 0.1692, + "step": 17821 + }, + { + "epoch": 1.6789053484373895, + "grad_norm": 0.6467146873474121, + "learning_rate": 1.2951730673669261e-06, + "loss": 0.1937, + "step": 17822 + }, + { + "epoch": 1.6789995525305574, + "grad_norm": 0.6501058340072632, + "learning_rate": 1.2944299371749812e-06, + "loss": 0.2189, + "step": 17823 + }, + { + "epoch": 1.6790937566237254, + "grad_norm": 0.6409503817558289, + "learning_rate": 1.2936870054834982e-06, + "loss": 0.1925, + "step": 17824 + }, + { + "epoch": 1.6791879607168931, + "grad_norm": 0.6841661930084229, + "learning_rate": 1.2929442723094133e-06, + "loss": 0.1662, + "step": 17825 + }, + { + "epoch": 1.6792821648100609, + "grad_norm": 0.6333213448524475, + "learning_rate": 1.2922017376696638e-06, + "loss": 0.1735, + "step": 17826 + }, + { + "epoch": 1.6793763689032288, + "grad_norm": 0.5788587927818298, + "learning_rate": 1.2914594015811854e-06, + "loss": 0.1644, + "step": 17827 + }, + { + "epoch": 1.6794705729963968, + "grad_norm": 0.6400187015533447, + "learning_rate": 1.2907172640608978e-06, + "loss": 0.1988, + "step": 17828 + }, + { + "epoch": 1.6795647770895645, + "grad_norm": 0.6779884099960327, + "learning_rate": 1.2899753251257252e-06, + "loss": 0.2341, + "step": 17829 + }, + { + "epoch": 1.6796589811827323, + "grad_norm": 0.7089108228683472, + "learning_rate": 1.2892335847925886e-06, + "loss": 0.2071, + "step": 17830 + }, + { + "epoch": 1.6797531852759002, + "grad_norm": 0.6162636280059814, + "learning_rate": 1.2884920430783942e-06, + "loss": 0.1828, + "step": 17831 + }, + { + "epoch": 1.6798473893690682, + "grad_norm": 0.626998782157898, + "learning_rate": 1.2877507000000545e-06, + "loss": 0.1888, + "step": 17832 + }, + { + "epoch": 1.679941593462236, + "grad_norm": 0.5967768430709839, + "learning_rate": 1.2870095555744755e-06, + "loss": 0.1541, + "step": 17833 + }, + { + "epoch": 1.6800357975554037, + "grad_norm": 0.6613903045654297, + "learning_rate": 1.2862686098185506e-06, + "loss": 0.1899, + "step": 17834 + }, + { + "epoch": 1.6801300016485716, + "grad_norm": 0.6769422292709351, + "learning_rate": 1.2855278627491795e-06, + "loss": 0.1844, + "step": 17835 + }, + { + "epoch": 1.6802242057417396, + "grad_norm": 0.7312199473381042, + "learning_rate": 1.284787314383249e-06, + "loss": 0.1838, + "step": 17836 + }, + { + "epoch": 1.6803184098349073, + "grad_norm": 0.6149859428405762, + "learning_rate": 1.2840469647376452e-06, + "loss": 0.1689, + "step": 17837 + }, + { + "epoch": 1.680412613928075, + "grad_norm": 0.6647670865058899, + "learning_rate": 1.2833068138292526e-06, + "loss": 0.1969, + "step": 17838 + }, + { + "epoch": 1.680506818021243, + "grad_norm": 0.6740341782569885, + "learning_rate": 1.2825668616749431e-06, + "loss": 0.1796, + "step": 17839 + }, + { + "epoch": 1.680601022114411, + "grad_norm": 0.7125089168548584, + "learning_rate": 1.2818271082915911e-06, + "loss": 0.1774, + "step": 17840 + }, + { + "epoch": 1.6806952262075787, + "grad_norm": 0.6824539303779602, + "learning_rate": 1.2810875536960654e-06, + "loss": 0.2208, + "step": 17841 + }, + { + "epoch": 1.6807894303007465, + "grad_norm": 0.7195398807525635, + "learning_rate": 1.2803481979052257e-06, + "loss": 0.2147, + "step": 17842 + }, + { + "epoch": 1.6808836343939144, + "grad_norm": 0.623807430267334, + "learning_rate": 1.279609040935933e-06, + "loss": 0.1857, + "step": 17843 + }, + { + "epoch": 1.6809778384870824, + "grad_norm": 0.6549000144004822, + "learning_rate": 1.2788700828050415e-06, + "loss": 0.1797, + "step": 17844 + }, + { + "epoch": 1.6810720425802501, + "grad_norm": 0.6454442739486694, + "learning_rate": 1.2781313235294002e-06, + "loss": 0.181, + "step": 17845 + }, + { + "epoch": 1.6811662466734179, + "grad_norm": 0.7384193539619446, + "learning_rate": 1.2773927631258487e-06, + "loss": 0.23, + "step": 17846 + }, + { + "epoch": 1.6812604507665858, + "grad_norm": 0.6494235992431641, + "learning_rate": 1.276654401611237e-06, + "loss": 0.2326, + "step": 17847 + }, + { + "epoch": 1.6813546548597538, + "grad_norm": 0.672817587852478, + "learning_rate": 1.2759162390023961e-06, + "loss": 0.1924, + "step": 17848 + }, + { + "epoch": 1.6814488589529215, + "grad_norm": 0.6448466777801514, + "learning_rate": 1.2751782753161535e-06, + "loss": 0.1845, + "step": 17849 + }, + { + "epoch": 1.6815430630460892, + "grad_norm": 0.6963547468185425, + "learning_rate": 1.2744405105693402e-06, + "loss": 0.2093, + "step": 17850 + }, + { + "epoch": 1.6816372671392572, + "grad_norm": 0.6959251761436462, + "learning_rate": 1.2737029447787797e-06, + "loss": 0.1904, + "step": 17851 + }, + { + "epoch": 1.6817314712324252, + "grad_norm": 0.6796267628669739, + "learning_rate": 1.272965577961286e-06, + "loss": 0.1694, + "step": 17852 + }, + { + "epoch": 1.681825675325593, + "grad_norm": 0.6188008785247803, + "learning_rate": 1.2722284101336724e-06, + "loss": 0.1954, + "step": 17853 + }, + { + "epoch": 1.6819198794187606, + "grad_norm": 0.7073794603347778, + "learning_rate": 1.271491441312752e-06, + "loss": 0.2062, + "step": 17854 + }, + { + "epoch": 1.6820140835119286, + "grad_norm": 0.6268400549888611, + "learning_rate": 1.270754671515323e-06, + "loss": 0.1832, + "step": 17855 + }, + { + "epoch": 1.6821082876050966, + "grad_norm": 0.7445459961891174, + "learning_rate": 1.270018100758188e-06, + "loss": 0.2333, + "step": 17856 + }, + { + "epoch": 1.6822024916982643, + "grad_norm": 0.6499781608581543, + "learning_rate": 1.2692817290581426e-06, + "loss": 0.1848, + "step": 17857 + }, + { + "epoch": 1.682296695791432, + "grad_norm": 0.6811954975128174, + "learning_rate": 1.2685455564319748e-06, + "loss": 0.1799, + "step": 17858 + }, + { + "epoch": 1.6823908998846, + "grad_norm": 0.7441204190254211, + "learning_rate": 1.2678095828964721e-06, + "loss": 0.1992, + "step": 17859 + }, + { + "epoch": 1.682485103977768, + "grad_norm": 0.6719151139259338, + "learning_rate": 1.2670738084684175e-06, + "loss": 0.2269, + "step": 17860 + }, + { + "epoch": 1.6825793080709357, + "grad_norm": 0.681549608707428, + "learning_rate": 1.2663382331645835e-06, + "loss": 0.1719, + "step": 17861 + }, + { + "epoch": 1.6826735121641034, + "grad_norm": 0.6589304804801941, + "learning_rate": 1.2656028570017441e-06, + "loss": 0.1765, + "step": 17862 + }, + { + "epoch": 1.6827677162572714, + "grad_norm": 0.6553161144256592, + "learning_rate": 1.2648676799966697e-06, + "loss": 0.188, + "step": 17863 + }, + { + "epoch": 1.6828619203504394, + "grad_norm": 0.7294104099273682, + "learning_rate": 1.2641327021661222e-06, + "loss": 0.1842, + "step": 17864 + }, + { + "epoch": 1.682956124443607, + "grad_norm": 0.6651660203933716, + "learning_rate": 1.2633979235268569e-06, + "loss": 0.194, + "step": 17865 + }, + { + "epoch": 1.6830503285367748, + "grad_norm": 0.585040807723999, + "learning_rate": 1.2626633440956305e-06, + "loss": 0.1693, + "step": 17866 + }, + { + "epoch": 1.6831445326299428, + "grad_norm": 0.7369555234909058, + "learning_rate": 1.261928963889194e-06, + "loss": 0.2079, + "step": 17867 + }, + { + "epoch": 1.6832387367231108, + "grad_norm": 0.6434837579727173, + "learning_rate": 1.2611947829242887e-06, + "loss": 0.1947, + "step": 17868 + }, + { + "epoch": 1.6833329408162785, + "grad_norm": 0.679571270942688, + "learning_rate": 1.2604608012176579e-06, + "loss": 0.2242, + "step": 17869 + }, + { + "epoch": 1.6834271449094462, + "grad_norm": 0.6934368014335632, + "learning_rate": 1.2597270187860378e-06, + "loss": 0.1976, + "step": 17870 + }, + { + "epoch": 1.6835213490026142, + "grad_norm": 0.5825608968734741, + "learning_rate": 1.2589934356461575e-06, + "loss": 0.1844, + "step": 17871 + }, + { + "epoch": 1.6836155530957821, + "grad_norm": 0.6324750781059265, + "learning_rate": 1.2582600518147448e-06, + "loss": 0.1659, + "step": 17872 + }, + { + "epoch": 1.6837097571889499, + "grad_norm": 0.6621334552764893, + "learning_rate": 1.2575268673085239e-06, + "loss": 0.1759, + "step": 17873 + }, + { + "epoch": 1.6838039612821176, + "grad_norm": 0.7086099982261658, + "learning_rate": 1.2567938821442093e-06, + "loss": 0.2152, + "step": 17874 + }, + { + "epoch": 1.6838981653752856, + "grad_norm": 0.6397452354431152, + "learning_rate": 1.2560610963385189e-06, + "loss": 0.1877, + "step": 17875 + }, + { + "epoch": 1.6839923694684535, + "grad_norm": 0.640308141708374, + "learning_rate": 1.2553285099081547e-06, + "loss": 0.2108, + "step": 17876 + }, + { + "epoch": 1.6840865735616213, + "grad_norm": 0.6639888286590576, + "learning_rate": 1.2545961228698255e-06, + "loss": 0.2011, + "step": 17877 + }, + { + "epoch": 1.684180777654789, + "grad_norm": 0.6414461731910706, + "learning_rate": 1.2538639352402305e-06, + "loss": 0.1865, + "step": 17878 + }, + { + "epoch": 1.684274981747957, + "grad_norm": 0.6911826133728027, + "learning_rate": 1.2531319470360625e-06, + "loss": 0.1992, + "step": 17879 + }, + { + "epoch": 1.684369185841125, + "grad_norm": 0.7354703545570374, + "learning_rate": 1.2524001582740141e-06, + "loss": 0.2222, + "step": 17880 + }, + { + "epoch": 1.6844633899342927, + "grad_norm": 0.6461732387542725, + "learning_rate": 1.2516685689707719e-06, + "loss": 0.1672, + "step": 17881 + }, + { + "epoch": 1.6845575940274604, + "grad_norm": 0.7001529932022095, + "learning_rate": 1.2509371791430135e-06, + "loss": 0.1848, + "step": 17882 + }, + { + "epoch": 1.6846517981206284, + "grad_norm": 0.7101517915725708, + "learning_rate": 1.2502059888074191e-06, + "loss": 0.1973, + "step": 17883 + }, + { + "epoch": 1.6847460022137963, + "grad_norm": 0.6809945702552795, + "learning_rate": 1.2494749979806608e-06, + "loss": 0.2033, + "step": 17884 + }, + { + "epoch": 1.684840206306964, + "grad_norm": 0.6272152066230774, + "learning_rate": 1.2487442066794065e-06, + "loss": 0.1805, + "step": 17885 + }, + { + "epoch": 1.6849344104001318, + "grad_norm": 0.6663166284561157, + "learning_rate": 1.2480136149203138e-06, + "loss": 0.2151, + "step": 17886 + }, + { + "epoch": 1.6850286144932998, + "grad_norm": 0.6448500752449036, + "learning_rate": 1.2472832227200505e-06, + "loss": 0.1785, + "step": 17887 + }, + { + "epoch": 1.6851228185864677, + "grad_norm": 0.6031123995780945, + "learning_rate": 1.2465530300952655e-06, + "loss": 0.2118, + "step": 17888 + }, + { + "epoch": 1.6852170226796355, + "grad_norm": 0.6824389696121216, + "learning_rate": 1.2458230370626068e-06, + "loss": 0.2117, + "step": 17889 + }, + { + "epoch": 1.6853112267728032, + "grad_norm": 0.633580207824707, + "learning_rate": 1.2450932436387208e-06, + "loss": 0.194, + "step": 17890 + }, + { + "epoch": 1.6854054308659712, + "grad_norm": 0.702239990234375, + "learning_rate": 1.2443636498402523e-06, + "loss": 0.1787, + "step": 17891 + }, + { + "epoch": 1.6854996349591391, + "grad_norm": 0.6519957184791565, + "learning_rate": 1.24363425568383e-06, + "loss": 0.1931, + "step": 17892 + }, + { + "epoch": 1.6855938390523069, + "grad_norm": 0.6838902235031128, + "learning_rate": 1.2429050611860894e-06, + "loss": 0.2116, + "step": 17893 + }, + { + "epoch": 1.6856880431454746, + "grad_norm": 0.6391249299049377, + "learning_rate": 1.2421760663636583e-06, + "loss": 0.2014, + "step": 17894 + }, + { + "epoch": 1.6857822472386426, + "grad_norm": 0.6512613892555237, + "learning_rate": 1.2414472712331549e-06, + "loss": 0.2059, + "step": 17895 + }, + { + "epoch": 1.6858764513318105, + "grad_norm": 0.6511378288269043, + "learning_rate": 1.240718675811199e-06, + "loss": 0.2034, + "step": 17896 + }, + { + "epoch": 1.6859706554249783, + "grad_norm": 0.6198116540908813, + "learning_rate": 1.239990280114407e-06, + "loss": 0.2015, + "step": 17897 + }, + { + "epoch": 1.686064859518146, + "grad_norm": 0.6795098185539246, + "learning_rate": 1.2392620841593805e-06, + "loss": 0.2042, + "step": 17898 + }, + { + "epoch": 1.686159063611314, + "grad_norm": 0.6171090006828308, + "learning_rate": 1.2385340879627283e-06, + "loss": 0.1858, + "step": 17899 + }, + { + "epoch": 1.686253267704482, + "grad_norm": 0.6320787668228149, + "learning_rate": 1.2378062915410504e-06, + "loss": 0.1771, + "step": 17900 + }, + { + "epoch": 1.6863474717976497, + "grad_norm": 0.7158340811729431, + "learning_rate": 1.2370786949109382e-06, + "loss": 0.2007, + "step": 17901 + }, + { + "epoch": 1.6864416758908174, + "grad_norm": 0.6730626225471497, + "learning_rate": 1.2363512980889836e-06, + "loss": 0.2055, + "step": 17902 + }, + { + "epoch": 1.6865358799839854, + "grad_norm": 0.6145924925804138, + "learning_rate": 1.2356241010917747e-06, + "loss": 0.188, + "step": 17903 + }, + { + "epoch": 1.6866300840771533, + "grad_norm": 0.6048918962478638, + "learning_rate": 1.2348971039358914e-06, + "loss": 0.1969, + "step": 17904 + }, + { + "epoch": 1.686724288170321, + "grad_norm": 0.6175239086151123, + "learning_rate": 1.2341703066379073e-06, + "loss": 0.1836, + "step": 17905 + }, + { + "epoch": 1.6868184922634888, + "grad_norm": 0.6972577571868896, + "learning_rate": 1.2334437092143959e-06, + "loss": 0.1855, + "step": 17906 + }, + { + "epoch": 1.6869126963566567, + "grad_norm": 0.7666021585464478, + "learning_rate": 1.2327173116819292e-06, + "loss": 0.2043, + "step": 17907 + }, + { + "epoch": 1.6870069004498247, + "grad_norm": 0.6751794219017029, + "learning_rate": 1.2319911140570629e-06, + "loss": 0.1868, + "step": 17908 + }, + { + "epoch": 1.6871011045429924, + "grad_norm": 0.6828281283378601, + "learning_rate": 1.2312651163563606e-06, + "loss": 0.2155, + "step": 17909 + }, + { + "epoch": 1.6871953086361602, + "grad_norm": 0.6635129451751709, + "learning_rate": 1.2305393185963754e-06, + "loss": 0.1727, + "step": 17910 + }, + { + "epoch": 1.6872895127293281, + "grad_norm": 0.5928038358688354, + "learning_rate": 1.2298137207936555e-06, + "loss": 0.165, + "step": 17911 + }, + { + "epoch": 1.6873837168224959, + "grad_norm": 0.6852033734321594, + "learning_rate": 1.2290883229647455e-06, + "loss": 0.1853, + "step": 17912 + }, + { + "epoch": 1.6874779209156636, + "grad_norm": 0.6938362717628479, + "learning_rate": 1.2283631251261873e-06, + "loss": 0.2052, + "step": 17913 + }, + { + "epoch": 1.6875721250088316, + "grad_norm": 0.6766852140426636, + "learning_rate": 1.2276381272945149e-06, + "loss": 0.2295, + "step": 17914 + }, + { + "epoch": 1.6876663291019995, + "grad_norm": 0.6537317633628845, + "learning_rate": 1.226913329486261e-06, + "loss": 0.1989, + "step": 17915 + }, + { + "epoch": 1.6877605331951673, + "grad_norm": 0.6622162461280823, + "learning_rate": 1.2261887317179499e-06, + "loss": 0.1808, + "step": 17916 + }, + { + "epoch": 1.687854737288335, + "grad_norm": 0.558259129524231, + "learning_rate": 1.2254643340061035e-06, + "loss": 0.1624, + "step": 17917 + }, + { + "epoch": 1.687948941381503, + "grad_norm": 0.6519166827201843, + "learning_rate": 1.2247401363672428e-06, + "loss": 0.1964, + "step": 17918 + }, + { + "epoch": 1.688043145474671, + "grad_norm": 0.6809887886047363, + "learning_rate": 1.224016138817876e-06, + "loss": 0.2071, + "step": 17919 + }, + { + "epoch": 1.6881373495678387, + "grad_norm": 0.624958336353302, + "learning_rate": 1.2232923413745146e-06, + "loss": 0.1705, + "step": 17920 + }, + { + "epoch": 1.6882315536610064, + "grad_norm": 0.6159089207649231, + "learning_rate": 1.2225687440536626e-06, + "loss": 0.1744, + "step": 17921 + }, + { + "epoch": 1.6883257577541744, + "grad_norm": 0.6885070204734802, + "learning_rate": 1.2218453468718161e-06, + "loss": 0.2124, + "step": 17922 + }, + { + "epoch": 1.6884199618473423, + "grad_norm": 1.4782142639160156, + "learning_rate": 1.2211221498454706e-06, + "loss": 0.1916, + "step": 17923 + }, + { + "epoch": 1.68851416594051, + "grad_norm": 0.6769928932189941, + "learning_rate": 1.2203991529911197e-06, + "loss": 0.1957, + "step": 17924 + }, + { + "epoch": 1.6886083700336778, + "grad_norm": 0.7010208368301392, + "learning_rate": 1.2196763563252456e-06, + "loss": 0.174, + "step": 17925 + }, + { + "epoch": 1.6887025741268458, + "grad_norm": 0.6963124871253967, + "learning_rate": 1.2189537598643253e-06, + "loss": 0.1942, + "step": 17926 + }, + { + "epoch": 1.6887967782200137, + "grad_norm": 0.6661259531974792, + "learning_rate": 1.2182313636248433e-06, + "loss": 0.1786, + "step": 17927 + }, + { + "epoch": 1.6888909823131815, + "grad_norm": 0.6243340373039246, + "learning_rate": 1.2175091676232685e-06, + "loss": 0.1821, + "step": 17928 + }, + { + "epoch": 1.6889851864063492, + "grad_norm": 0.6905919313430786, + "learning_rate": 1.2167871718760638e-06, + "loss": 0.1847, + "step": 17929 + }, + { + "epoch": 1.6890793904995172, + "grad_norm": 0.6800353527069092, + "learning_rate": 1.2160653763996965e-06, + "loss": 0.2098, + "step": 17930 + }, + { + "epoch": 1.6891735945926851, + "grad_norm": 0.6478415131568909, + "learning_rate": 1.215343781210624e-06, + "loss": 0.2011, + "step": 17931 + }, + { + "epoch": 1.6892677986858529, + "grad_norm": 0.6425968408584595, + "learning_rate": 1.2146223863252982e-06, + "loss": 0.1726, + "step": 17932 + }, + { + "epoch": 1.6893620027790206, + "grad_norm": 0.6161893606185913, + "learning_rate": 1.2139011917601673e-06, + "loss": 0.19, + "step": 17933 + }, + { + "epoch": 1.6894562068721886, + "grad_norm": 0.6595200896263123, + "learning_rate": 1.2131801975316804e-06, + "loss": 0.2129, + "step": 17934 + }, + { + "epoch": 1.6895504109653565, + "grad_norm": 0.7028770446777344, + "learning_rate": 1.212459403656272e-06, + "loss": 0.1903, + "step": 17935 + }, + { + "epoch": 1.6896446150585243, + "grad_norm": 0.6268740296363831, + "learning_rate": 1.211738810150379e-06, + "loss": 0.1834, + "step": 17936 + }, + { + "epoch": 1.689738819151692, + "grad_norm": 0.6336575746536255, + "learning_rate": 1.211018417030434e-06, + "loss": 0.1636, + "step": 17937 + }, + { + "epoch": 1.68983302324486, + "grad_norm": 0.6407804489135742, + "learning_rate": 1.2102982243128603e-06, + "loss": 0.204, + "step": 17938 + }, + { + "epoch": 1.689927227338028, + "grad_norm": 0.6109455823898315, + "learning_rate": 1.2095782320140803e-06, + "loss": 0.1753, + "step": 17939 + }, + { + "epoch": 1.6900214314311957, + "grad_norm": 0.6444677114486694, + "learning_rate": 1.2088584401505133e-06, + "loss": 0.1659, + "step": 17940 + }, + { + "epoch": 1.6901156355243634, + "grad_norm": 0.7086964845657349, + "learning_rate": 1.2081388487385704e-06, + "loss": 0.1933, + "step": 17941 + }, + { + "epoch": 1.6902098396175314, + "grad_norm": 0.6854089498519897, + "learning_rate": 1.2074194577946529e-06, + "loss": 0.2159, + "step": 17942 + }, + { + "epoch": 1.6903040437106993, + "grad_norm": 0.6481835842132568, + "learning_rate": 1.2067002673351758e-06, + "loss": 0.191, + "step": 17943 + }, + { + "epoch": 1.690398247803867, + "grad_norm": 0.6236292123794556, + "learning_rate": 1.20598127737653e-06, + "loss": 0.1907, + "step": 17944 + }, + { + "epoch": 1.6904924518970348, + "grad_norm": 0.6148814558982849, + "learning_rate": 1.2052624879351105e-06, + "loss": 0.2072, + "step": 17945 + }, + { + "epoch": 1.6905866559902027, + "grad_norm": 0.6329357624053955, + "learning_rate": 1.2045438990273062e-06, + "loss": 0.1835, + "step": 17946 + }, + { + "epoch": 1.6906808600833707, + "grad_norm": 0.6403841376304626, + "learning_rate": 1.2038255106695074e-06, + "loss": 0.1743, + "step": 17947 + }, + { + "epoch": 1.6907750641765384, + "grad_norm": 0.6254767179489136, + "learning_rate": 1.2031073228780865e-06, + "loss": 0.1779, + "step": 17948 + }, + { + "epoch": 1.6908692682697062, + "grad_norm": 0.5949995517730713, + "learning_rate": 1.202389335669425e-06, + "loss": 0.1763, + "step": 17949 + }, + { + "epoch": 1.6909634723628741, + "grad_norm": 0.709050714969635, + "learning_rate": 1.201671549059893e-06, + "loss": 0.1913, + "step": 17950 + }, + { + "epoch": 1.691057676456042, + "grad_norm": 0.6388276815414429, + "learning_rate": 1.2009539630658551e-06, + "loss": 0.1645, + "step": 17951 + }, + { + "epoch": 1.6911518805492098, + "grad_norm": 0.623392641544342, + "learning_rate": 1.2002365777036751e-06, + "loss": 0.1997, + "step": 17952 + }, + { + "epoch": 1.6912460846423776, + "grad_norm": 0.704318642616272, + "learning_rate": 1.199519392989712e-06, + "loss": 0.1975, + "step": 17953 + }, + { + "epoch": 1.6913402887355455, + "grad_norm": 0.6862435340881348, + "learning_rate": 1.198802408940315e-06, + "loss": 0.2066, + "step": 17954 + }, + { + "epoch": 1.6914344928287135, + "grad_norm": 0.5987230539321899, + "learning_rate": 1.1980856255718365e-06, + "loss": 0.1924, + "step": 17955 + }, + { + "epoch": 1.6915286969218812, + "grad_norm": 0.6653030514717102, + "learning_rate": 1.1973690429006157e-06, + "loss": 0.1938, + "step": 17956 + }, + { + "epoch": 1.691622901015049, + "grad_norm": 0.7109711766242981, + "learning_rate": 1.196652660942994e-06, + "loss": 0.2102, + "step": 17957 + }, + { + "epoch": 1.691717105108217, + "grad_norm": 0.6443082690238953, + "learning_rate": 1.1959364797153095e-06, + "loss": 0.1971, + "step": 17958 + }, + { + "epoch": 1.691811309201385, + "grad_norm": 0.6614705920219421, + "learning_rate": 1.1952204992338856e-06, + "loss": 0.2021, + "step": 17959 + }, + { + "epoch": 1.6919055132945526, + "grad_norm": 0.6767356991767883, + "learning_rate": 1.1945047195150528e-06, + "loss": 0.1916, + "step": 17960 + }, + { + "epoch": 1.6919997173877204, + "grad_norm": 0.6969180107116699, + "learning_rate": 1.1937891405751312e-06, + "loss": 0.1918, + "step": 17961 + }, + { + "epoch": 1.6920939214808883, + "grad_norm": 1.0817089080810547, + "learning_rate": 1.1930737624304346e-06, + "loss": 0.2153, + "step": 17962 + }, + { + "epoch": 1.6921881255740563, + "grad_norm": 0.6809571385383606, + "learning_rate": 1.1923585850972762e-06, + "loss": 0.1885, + "step": 17963 + }, + { + "epoch": 1.692282329667224, + "grad_norm": 1.001025915145874, + "learning_rate": 1.1916436085919648e-06, + "loss": 0.2007, + "step": 17964 + }, + { + "epoch": 1.6923765337603918, + "grad_norm": 0.8061049580574036, + "learning_rate": 1.1909288329308023e-06, + "loss": 0.205, + "step": 17965 + }, + { + "epoch": 1.6924707378535597, + "grad_norm": 0.6809053421020508, + "learning_rate": 1.1902142581300836e-06, + "loss": 0.1944, + "step": 17966 + }, + { + "epoch": 1.6925649419467277, + "grad_norm": 0.7150509357452393, + "learning_rate": 1.1894998842061045e-06, + "loss": 0.1893, + "step": 17967 + }, + { + "epoch": 1.6926591460398954, + "grad_norm": 0.70624178647995, + "learning_rate": 1.1887857111751566e-06, + "loss": 0.2108, + "step": 17968 + }, + { + "epoch": 1.6927533501330632, + "grad_norm": 0.6196686029434204, + "learning_rate": 1.1880717390535178e-06, + "loss": 0.214, + "step": 17969 + }, + { + "epoch": 1.6928475542262311, + "grad_norm": 0.7185632586479187, + "learning_rate": 1.187357967857472e-06, + "loss": 0.1987, + "step": 17970 + }, + { + "epoch": 1.692941758319399, + "grad_norm": 0.5882018804550171, + "learning_rate": 1.186644397603296e-06, + "loss": 0.1782, + "step": 17971 + }, + { + "epoch": 1.6930359624125668, + "grad_norm": 0.720920741558075, + "learning_rate": 1.1859310283072545e-06, + "loss": 0.2091, + "step": 17972 + }, + { + "epoch": 1.6931301665057346, + "grad_norm": 0.6742173433303833, + "learning_rate": 1.185217859985618e-06, + "loss": 0.1777, + "step": 17973 + }, + { + "epoch": 1.6932243705989025, + "grad_norm": 0.7380731105804443, + "learning_rate": 1.1845048926546487e-06, + "loss": 0.2349, + "step": 17974 + }, + { + "epoch": 1.6933185746920705, + "grad_norm": 0.7536550760269165, + "learning_rate": 1.1837921263305985e-06, + "loss": 0.1984, + "step": 17975 + }, + { + "epoch": 1.6934127787852382, + "grad_norm": 0.6372663378715515, + "learning_rate": 1.183079561029723e-06, + "loss": 0.1832, + "step": 17976 + }, + { + "epoch": 1.693506982878406, + "grad_norm": 0.6015812158584595, + "learning_rate": 1.1823671967682704e-06, + "loss": 0.144, + "step": 17977 + }, + { + "epoch": 1.693601186971574, + "grad_norm": 0.6695241332054138, + "learning_rate": 1.181655033562481e-06, + "loss": 0.192, + "step": 17978 + }, + { + "epoch": 1.6936953910647419, + "grad_norm": 0.6968777775764465, + "learning_rate": 1.180943071428594e-06, + "loss": 0.1887, + "step": 17979 + }, + { + "epoch": 1.6937895951579096, + "grad_norm": 0.6087028980255127, + "learning_rate": 1.1802313103828466e-06, + "loss": 0.177, + "step": 17980 + }, + { + "epoch": 1.6938837992510773, + "grad_norm": 0.6628785133361816, + "learning_rate": 1.1795197504414657e-06, + "loss": 0.2069, + "step": 17981 + }, + { + "epoch": 1.6939780033442453, + "grad_norm": 0.6341779828071594, + "learning_rate": 1.1788083916206705e-06, + "loss": 0.1866, + "step": 17982 + }, + { + "epoch": 1.6940722074374133, + "grad_norm": 0.7151958346366882, + "learning_rate": 1.1780972339366914e-06, + "loss": 0.2168, + "step": 17983 + }, + { + "epoch": 1.694166411530581, + "grad_norm": 0.7450733780860901, + "learning_rate": 1.1773862774057377e-06, + "loss": 0.2009, + "step": 17984 + }, + { + "epoch": 1.6942606156237487, + "grad_norm": 0.6015254259109497, + "learning_rate": 1.1766755220440195e-06, + "loss": 0.1629, + "step": 17985 + }, + { + "epoch": 1.6943548197169167, + "grad_norm": 0.7582864761352539, + "learning_rate": 1.1759649678677454e-06, + "loss": 0.2035, + "step": 17986 + }, + { + "epoch": 1.6944490238100847, + "grad_norm": 0.7008610963821411, + "learning_rate": 1.1752546148931188e-06, + "loss": 0.1909, + "step": 17987 + }, + { + "epoch": 1.6945432279032524, + "grad_norm": 0.6471156477928162, + "learning_rate": 1.1745444631363312e-06, + "loss": 0.1963, + "step": 17988 + }, + { + "epoch": 1.6946374319964201, + "grad_norm": 0.6539919972419739, + "learning_rate": 1.1738345126135798e-06, + "loss": 0.2085, + "step": 17989 + }, + { + "epoch": 1.694731636089588, + "grad_norm": 0.6169882416725159, + "learning_rate": 1.1731247633410536e-06, + "loss": 0.1976, + "step": 17990 + }, + { + "epoch": 1.694825840182756, + "grad_norm": 0.808809220790863, + "learning_rate": 1.172415215334932e-06, + "loss": 0.1951, + "step": 17991 + }, + { + "epoch": 1.6949200442759238, + "grad_norm": 0.6773445010185242, + "learning_rate": 1.1717058686113948e-06, + "loss": 0.194, + "step": 17992 + }, + { + "epoch": 1.6950142483690915, + "grad_norm": 0.7250120043754578, + "learning_rate": 1.1709967231866204e-06, + "loss": 0.1944, + "step": 17993 + }, + { + "epoch": 1.6951084524622595, + "grad_norm": 0.6955695152282715, + "learning_rate": 1.170287779076772e-06, + "loss": 0.2072, + "step": 17994 + }, + { + "epoch": 1.6952026565554275, + "grad_norm": 0.7111067771911621, + "learning_rate": 1.1695790362980198e-06, + "loss": 0.1859, + "step": 17995 + }, + { + "epoch": 1.6952968606485952, + "grad_norm": 0.7299478650093079, + "learning_rate": 1.1688704948665196e-06, + "loss": 0.198, + "step": 17996 + }, + { + "epoch": 1.695391064741763, + "grad_norm": 0.578731119632721, + "learning_rate": 1.16816215479843e-06, + "loss": 0.1586, + "step": 17997 + }, + { + "epoch": 1.695485268834931, + "grad_norm": 0.6320825219154358, + "learning_rate": 1.1674540161099045e-06, + "loss": 0.1719, + "step": 17998 + }, + { + "epoch": 1.6955794729280989, + "grad_norm": 0.6582252383232117, + "learning_rate": 1.1667460788170849e-06, + "loss": 0.2082, + "step": 17999 + }, + { + "epoch": 1.6956736770212666, + "grad_norm": 0.6311085820198059, + "learning_rate": 1.1660383429361155e-06, + "loss": 0.1753, + "step": 18000 + }, + { + "epoch": 1.6957678811144343, + "grad_norm": 0.7495169043540955, + "learning_rate": 1.165330808483136e-06, + "loss": 0.2064, + "step": 18001 + }, + { + "epoch": 1.6958620852076023, + "grad_norm": 0.6125212907791138, + "learning_rate": 1.1646234754742747e-06, + "loss": 0.1708, + "step": 18002 + }, + { + "epoch": 1.6959562893007702, + "grad_norm": 0.651889979839325, + "learning_rate": 1.1639163439256629e-06, + "loss": 0.1872, + "step": 18003 + }, + { + "epoch": 1.696050493393938, + "grad_norm": 0.7018201947212219, + "learning_rate": 1.1632094138534256e-06, + "loss": 0.1775, + "step": 18004 + }, + { + "epoch": 1.6961446974871057, + "grad_norm": 0.6348908543586731, + "learning_rate": 1.1625026852736797e-06, + "loss": 0.1991, + "step": 18005 + }, + { + "epoch": 1.6962389015802737, + "grad_norm": 0.5990427732467651, + "learning_rate": 1.1617961582025384e-06, + "loss": 0.1693, + "step": 18006 + }, + { + "epoch": 1.6963331056734416, + "grad_norm": 0.6148661375045776, + "learning_rate": 1.1610898326561127e-06, + "loss": 0.1561, + "step": 18007 + }, + { + "epoch": 1.6964273097666094, + "grad_norm": 0.7107434868812561, + "learning_rate": 1.160383708650511e-06, + "loss": 0.2055, + "step": 18008 + }, + { + "epoch": 1.6965215138597771, + "grad_norm": 0.8598737120628357, + "learning_rate": 1.1596777862018293e-06, + "loss": 0.2075, + "step": 18009 + }, + { + "epoch": 1.696615717952945, + "grad_norm": 0.6815014481544495, + "learning_rate": 1.158972065326166e-06, + "loss": 0.1966, + "step": 18010 + }, + { + "epoch": 1.696709922046113, + "grad_norm": 0.6880999207496643, + "learning_rate": 1.1582665460396146e-06, + "loss": 0.198, + "step": 18011 + }, + { + "epoch": 1.6968041261392808, + "grad_norm": 0.721699595451355, + "learning_rate": 1.1575612283582571e-06, + "loss": 0.1838, + "step": 18012 + }, + { + "epoch": 1.6968983302324485, + "grad_norm": 0.6525470018386841, + "learning_rate": 1.1568561122981792e-06, + "loss": 0.207, + "step": 18013 + }, + { + "epoch": 1.6969925343256165, + "grad_norm": 0.6364315152168274, + "learning_rate": 1.1561511978754604e-06, + "loss": 0.1903, + "step": 18014 + }, + { + "epoch": 1.6970867384187844, + "grad_norm": 0.631808876991272, + "learning_rate": 1.1554464851061698e-06, + "loss": 0.1961, + "step": 18015 + }, + { + "epoch": 1.6971809425119522, + "grad_norm": 0.646543025970459, + "learning_rate": 1.154741974006377e-06, + "loss": 0.1896, + "step": 18016 + }, + { + "epoch": 1.69727514660512, + "grad_norm": 0.7164186835289001, + "learning_rate": 1.1540376645921492e-06, + "loss": 0.2018, + "step": 18017 + }, + { + "epoch": 1.6973693506982879, + "grad_norm": 0.6542921662330627, + "learning_rate": 1.1533335568795412e-06, + "loss": 0.199, + "step": 18018 + }, + { + "epoch": 1.6974635547914558, + "grad_norm": 0.6426371335983276, + "learning_rate": 1.1526296508846092e-06, + "loss": 0.2056, + "step": 18019 + }, + { + "epoch": 1.6975577588846236, + "grad_norm": 0.6970218420028687, + "learning_rate": 1.1519259466234068e-06, + "loss": 0.2002, + "step": 18020 + }, + { + "epoch": 1.6976519629777913, + "grad_norm": 0.6598572134971619, + "learning_rate": 1.1512224441119756e-06, + "loss": 0.2262, + "step": 18021 + }, + { + "epoch": 1.6977461670709593, + "grad_norm": 0.7594718933105469, + "learning_rate": 1.1505191433663544e-06, + "loss": 0.2121, + "step": 18022 + }, + { + "epoch": 1.6978403711641272, + "grad_norm": 0.6339885592460632, + "learning_rate": 1.1498160444025862e-06, + "loss": 0.2117, + "step": 18023 + }, + { + "epoch": 1.697934575257295, + "grad_norm": 0.8442044854164124, + "learning_rate": 1.149113147236699e-06, + "loss": 0.1847, + "step": 18024 + }, + { + "epoch": 1.6980287793504627, + "grad_norm": 0.7565668225288391, + "learning_rate": 1.1484104518847193e-06, + "loss": 0.2016, + "step": 18025 + }, + { + "epoch": 1.6981229834436307, + "grad_norm": 0.7381719350814819, + "learning_rate": 1.1477079583626693e-06, + "loss": 0.2391, + "step": 18026 + }, + { + "epoch": 1.6982171875367986, + "grad_norm": 0.638436496257782, + "learning_rate": 1.1470056666865714e-06, + "loss": 0.183, + "step": 18027 + }, + { + "epoch": 1.6983113916299664, + "grad_norm": 0.6921818852424622, + "learning_rate": 1.146303576872433e-06, + "loss": 0.1773, + "step": 18028 + }, + { + "epoch": 1.698405595723134, + "grad_norm": 0.6457545757293701, + "learning_rate": 1.1456016889362654e-06, + "loss": 0.2013, + "step": 18029 + }, + { + "epoch": 1.698499799816302, + "grad_norm": 0.6686806082725525, + "learning_rate": 1.1449000028940749e-06, + "loss": 0.2066, + "step": 18030 + }, + { + "epoch": 1.69859400390947, + "grad_norm": 0.721925675868988, + "learning_rate": 1.1441985187618576e-06, + "loss": 0.1792, + "step": 18031 + }, + { + "epoch": 1.6986882080026378, + "grad_norm": 0.6582679748535156, + "learning_rate": 1.1434972365556086e-06, + "loss": 0.2222, + "step": 18032 + }, + { + "epoch": 1.6987824120958055, + "grad_norm": 0.6933612823486328, + "learning_rate": 1.1427961562913225e-06, + "loss": 0.2038, + "step": 18033 + }, + { + "epoch": 1.6988766161889735, + "grad_norm": 0.6419912576675415, + "learning_rate": 1.1420952779849803e-06, + "loss": 0.1892, + "step": 18034 + }, + { + "epoch": 1.6989708202821414, + "grad_norm": 0.6878123879432678, + "learning_rate": 1.1413946016525656e-06, + "loss": 0.1916, + "step": 18035 + }, + { + "epoch": 1.6990650243753092, + "grad_norm": 0.6557255387306213, + "learning_rate": 1.1406941273100525e-06, + "loss": 0.1866, + "step": 18036 + }, + { + "epoch": 1.699159228468477, + "grad_norm": 0.7365891337394714, + "learning_rate": 1.1399938549734145e-06, + "loss": 0.193, + "step": 18037 + }, + { + "epoch": 1.6992534325616448, + "grad_norm": 0.6552425622940063, + "learning_rate": 1.1392937846586216e-06, + "loss": 0.1859, + "step": 18038 + }, + { + "epoch": 1.6993476366548128, + "grad_norm": 0.8205013275146484, + "learning_rate": 1.1385939163816306e-06, + "loss": 0.2262, + "step": 18039 + }, + { + "epoch": 1.6994418407479805, + "grad_norm": 0.6691708564758301, + "learning_rate": 1.1378942501584035e-06, + "loss": 0.2103, + "step": 18040 + }, + { + "epoch": 1.6995360448411483, + "grad_norm": 0.6229633092880249, + "learning_rate": 1.1371947860048948e-06, + "loss": 0.1636, + "step": 18041 + }, + { + "epoch": 1.6996302489343162, + "grad_norm": 0.6062520742416382, + "learning_rate": 1.1364955239370501e-06, + "loss": 0.197, + "step": 18042 + }, + { + "epoch": 1.6997244530274842, + "grad_norm": 0.8412054777145386, + "learning_rate": 1.135796463970814e-06, + "loss": 0.1724, + "step": 18043 + }, + { + "epoch": 1.699818657120652, + "grad_norm": 0.7708158493041992, + "learning_rate": 1.1350976061221309e-06, + "loss": 0.2009, + "step": 18044 + }, + { + "epoch": 1.6999128612138197, + "grad_norm": 0.7285130620002747, + "learning_rate": 1.1343989504069308e-06, + "loss": 0.215, + "step": 18045 + }, + { + "epoch": 1.7000070653069876, + "grad_norm": 0.6278975605964661, + "learning_rate": 1.1337004968411437e-06, + "loss": 0.1816, + "step": 18046 + }, + { + "epoch": 1.7001012694001556, + "grad_norm": 0.619029700756073, + "learning_rate": 1.1330022454406975e-06, + "loss": 0.1763, + "step": 18047 + }, + { + "epoch": 1.7001954734933233, + "grad_norm": 0.8753061890602112, + "learning_rate": 1.1323041962215153e-06, + "loss": 0.2082, + "step": 18048 + }, + { + "epoch": 1.700289677586491, + "grad_norm": 0.6890900731086731, + "learning_rate": 1.1316063491995099e-06, + "loss": 0.207, + "step": 18049 + }, + { + "epoch": 1.700383881679659, + "grad_norm": 0.6700932383537292, + "learning_rate": 1.1309087043905943e-06, + "loss": 0.1982, + "step": 18050 + }, + { + "epoch": 1.7004780857728268, + "grad_norm": 0.6805413961410522, + "learning_rate": 1.1302112618106786e-06, + "loss": 0.2113, + "step": 18051 + }, + { + "epoch": 1.7005722898659945, + "grad_norm": 0.5929723978042603, + "learning_rate": 1.1295140214756616e-06, + "loss": 0.1503, + "step": 18052 + }, + { + "epoch": 1.7006664939591625, + "grad_norm": 0.6638513803482056, + "learning_rate": 1.128816983401444e-06, + "loss": 0.1946, + "step": 18053 + }, + { + "epoch": 1.7007606980523304, + "grad_norm": 0.6715105772018433, + "learning_rate": 1.1281201476039205e-06, + "loss": 0.1761, + "step": 18054 + }, + { + "epoch": 1.7008549021454982, + "grad_norm": 0.6856271028518677, + "learning_rate": 1.1274235140989753e-06, + "loss": 0.2068, + "step": 18055 + }, + { + "epoch": 1.700949106238666, + "grad_norm": 0.6468895673751831, + "learning_rate": 1.1267270829024968e-06, + "loss": 0.1663, + "step": 18056 + }, + { + "epoch": 1.7010433103318339, + "grad_norm": 0.7396970987319946, + "learning_rate": 1.1260308540303655e-06, + "loss": 0.1849, + "step": 18057 + }, + { + "epoch": 1.7011375144250018, + "grad_norm": 0.6991780996322632, + "learning_rate": 1.1253348274984544e-06, + "loss": 0.1959, + "step": 18058 + }, + { + "epoch": 1.7012317185181696, + "grad_norm": 0.730093240737915, + "learning_rate": 1.1246390033226295e-06, + "loss": 0.2105, + "step": 18059 + }, + { + "epoch": 1.7013259226113373, + "grad_norm": 0.63701331615448, + "learning_rate": 1.123943381518766e-06, + "loss": 0.1744, + "step": 18060 + }, + { + "epoch": 1.7014201267045053, + "grad_norm": 0.5293937921524048, + "learning_rate": 1.1232479621027203e-06, + "loss": 0.1643, + "step": 18061 + }, + { + "epoch": 1.7015143307976732, + "grad_norm": 0.6485114693641663, + "learning_rate": 1.122552745090345e-06, + "loss": 0.1888, + "step": 18062 + }, + { + "epoch": 1.701608534890841, + "grad_norm": 0.7461566925048828, + "learning_rate": 1.121857730497501e-06, + "loss": 0.1954, + "step": 18063 + }, + { + "epoch": 1.7017027389840087, + "grad_norm": 0.7191745638847351, + "learning_rate": 1.1211629183400307e-06, + "loss": 0.1896, + "step": 18064 + }, + { + "epoch": 1.7017969430771767, + "grad_norm": 0.7366457581520081, + "learning_rate": 1.1204683086337754e-06, + "loss": 0.2272, + "step": 18065 + }, + { + "epoch": 1.7018911471703446, + "grad_norm": 0.5956347584724426, + "learning_rate": 1.1197739013945752e-06, + "loss": 0.1957, + "step": 18066 + }, + { + "epoch": 1.7019853512635124, + "grad_norm": 0.7121246457099915, + "learning_rate": 1.1190796966382644e-06, + "loss": 0.2042, + "step": 18067 + }, + { + "epoch": 1.70207955535668, + "grad_norm": 0.7454216480255127, + "learning_rate": 1.1183856943806703e-06, + "loss": 0.1915, + "step": 18068 + }, + { + "epoch": 1.702173759449848, + "grad_norm": 0.6396015882492065, + "learning_rate": 1.1176918946376182e-06, + "loss": 0.176, + "step": 18069 + }, + { + "epoch": 1.702267963543016, + "grad_norm": 0.6907809376716614, + "learning_rate": 1.1169982974249295e-06, + "loss": 0.1794, + "step": 18070 + }, + { + "epoch": 1.7023621676361838, + "grad_norm": 0.7384300231933594, + "learning_rate": 1.1163049027584149e-06, + "loss": 0.1911, + "step": 18071 + }, + { + "epoch": 1.7024563717293515, + "grad_norm": 0.6600772738456726, + "learning_rate": 1.1156117106538877e-06, + "loss": 0.173, + "step": 18072 + }, + { + "epoch": 1.7025505758225195, + "grad_norm": 0.6174034476280212, + "learning_rate": 1.114918721127155e-06, + "loss": 0.1808, + "step": 18073 + }, + { + "epoch": 1.7026447799156874, + "grad_norm": 0.6251533031463623, + "learning_rate": 1.1142259341940153e-06, + "loss": 0.185, + "step": 18074 + }, + { + "epoch": 1.7027389840088551, + "grad_norm": 0.5912071466445923, + "learning_rate": 1.1135333498702683e-06, + "loss": 0.155, + "step": 18075 + }, + { + "epoch": 1.7028331881020229, + "grad_norm": 0.6848728060722351, + "learning_rate": 1.1128409681717024e-06, + "loss": 0.1908, + "step": 18076 + }, + { + "epoch": 1.7029273921951908, + "grad_norm": 0.654954195022583, + "learning_rate": 1.1121487891141059e-06, + "loss": 0.1887, + "step": 18077 + }, + { + "epoch": 1.7030215962883588, + "grad_norm": 0.7168022394180298, + "learning_rate": 1.1114568127132641e-06, + "loss": 0.2023, + "step": 18078 + }, + { + "epoch": 1.7031158003815265, + "grad_norm": 0.6656650900840759, + "learning_rate": 1.1107650389849522e-06, + "loss": 0.2134, + "step": 18079 + }, + { + "epoch": 1.7032100044746943, + "grad_norm": 0.6357574462890625, + "learning_rate": 1.1100734679449442e-06, + "loss": 0.2161, + "step": 18080 + }, + { + "epoch": 1.7033042085678622, + "grad_norm": 1.0112943649291992, + "learning_rate": 1.1093820996090121e-06, + "loss": 0.1967, + "step": 18081 + }, + { + "epoch": 1.7033984126610302, + "grad_norm": 0.6312639117240906, + "learning_rate": 1.1086909339929165e-06, + "loss": 0.2089, + "step": 18082 + }, + { + "epoch": 1.703492616754198, + "grad_norm": 0.6553789377212524, + "learning_rate": 1.1079999711124189e-06, + "loss": 0.1786, + "step": 18083 + }, + { + "epoch": 1.7035868208473657, + "grad_norm": 0.6632435321807861, + "learning_rate": 1.1073092109832718e-06, + "loss": 0.1785, + "step": 18084 + }, + { + "epoch": 1.7036810249405336, + "grad_norm": 0.7690532207489014, + "learning_rate": 1.1066186536212308e-06, + "loss": 0.1872, + "step": 18085 + }, + { + "epoch": 1.7037752290337016, + "grad_norm": 0.7200743556022644, + "learning_rate": 1.105928299042035e-06, + "loss": 0.2289, + "step": 18086 + }, + { + "epoch": 1.7038694331268693, + "grad_norm": 0.6458424925804138, + "learning_rate": 1.1052381472614293e-06, + "loss": 0.1758, + "step": 18087 + }, + { + "epoch": 1.703963637220037, + "grad_norm": 0.7137967348098755, + "learning_rate": 1.1045481982951523e-06, + "loss": 0.2087, + "step": 18088 + }, + { + "epoch": 1.704057841313205, + "grad_norm": 0.7425330877304077, + "learning_rate": 1.1038584521589313e-06, + "loss": 0.1972, + "step": 18089 + }, + { + "epoch": 1.704152045406373, + "grad_norm": 0.5867472290992737, + "learning_rate": 1.103168908868495e-06, + "loss": 0.1698, + "step": 18090 + }, + { + "epoch": 1.7042462494995407, + "grad_norm": 0.6742032766342163, + "learning_rate": 1.1024795684395695e-06, + "loss": 0.1824, + "step": 18091 + }, + { + "epoch": 1.7043404535927085, + "grad_norm": 0.6082066893577576, + "learning_rate": 1.101790430887868e-06, + "loss": 0.1746, + "step": 18092 + }, + { + "epoch": 1.7044346576858764, + "grad_norm": 0.6216294169425964, + "learning_rate": 1.1011014962291066e-06, + "loss": 0.1856, + "step": 18093 + }, + { + "epoch": 1.7045288617790444, + "grad_norm": 0.702573299407959, + "learning_rate": 1.1004127644789952e-06, + "loss": 0.2118, + "step": 18094 + }, + { + "epoch": 1.7046230658722121, + "grad_norm": 0.6047021746635437, + "learning_rate": 1.0997242356532335e-06, + "loss": 0.1749, + "step": 18095 + }, + { + "epoch": 1.7047172699653799, + "grad_norm": 0.68147212266922, + "learning_rate": 1.099035909767524e-06, + "loss": 0.2011, + "step": 18096 + }, + { + "epoch": 1.7048114740585478, + "grad_norm": 0.5945382118225098, + "learning_rate": 1.0983477868375636e-06, + "loss": 0.1743, + "step": 18097 + }, + { + "epoch": 1.7049056781517158, + "grad_norm": 0.6400014758110046, + "learning_rate": 1.0976598668790406e-06, + "loss": 0.2174, + "step": 18098 + }, + { + "epoch": 1.7049998822448835, + "grad_norm": 0.7022170424461365, + "learning_rate": 1.0969721499076357e-06, + "loss": 0.2149, + "step": 18099 + }, + { + "epoch": 1.7050940863380513, + "grad_norm": 0.6862964630126953, + "learning_rate": 1.09628463593904e-06, + "loss": 0.1827, + "step": 18100 + }, + { + "epoch": 1.7051882904312192, + "grad_norm": 1.012186884880066, + "learning_rate": 1.095597324988923e-06, + "loss": 0.1928, + "step": 18101 + }, + { + "epoch": 1.7052824945243872, + "grad_norm": 0.6898453235626221, + "learning_rate": 1.094910217072954e-06, + "loss": 0.1603, + "step": 18102 + }, + { + "epoch": 1.705376698617555, + "grad_norm": 0.6907361745834351, + "learning_rate": 1.0942233122068091e-06, + "loss": 0.1886, + "step": 18103 + }, + { + "epoch": 1.7054709027107227, + "grad_norm": 0.5983191728591919, + "learning_rate": 1.0935366104061451e-06, + "loss": 0.1772, + "step": 18104 + }, + { + "epoch": 1.7055651068038906, + "grad_norm": 0.7511815428733826, + "learning_rate": 1.0928501116866175e-06, + "loss": 0.2346, + "step": 18105 + }, + { + "epoch": 1.7056593108970586, + "grad_norm": 0.7115627527236938, + "learning_rate": 1.0921638160638836e-06, + "loss": 0.2029, + "step": 18106 + }, + { + "epoch": 1.7057535149902263, + "grad_norm": 0.7544174194335938, + "learning_rate": 1.0914777235535934e-06, + "loss": 0.2098, + "step": 18107 + }, + { + "epoch": 1.705847719083394, + "grad_norm": 0.616888701915741, + "learning_rate": 1.0907918341713853e-06, + "loss": 0.2089, + "step": 18108 + }, + { + "epoch": 1.705941923176562, + "grad_norm": 0.6795505881309509, + "learning_rate": 1.0901061479329023e-06, + "loss": 0.2163, + "step": 18109 + }, + { + "epoch": 1.70603612726973, + "grad_norm": 0.698206901550293, + "learning_rate": 1.089420664853781e-06, + "loss": 0.1752, + "step": 18110 + }, + { + "epoch": 1.7061303313628977, + "grad_norm": 0.6527726054191589, + "learning_rate": 1.0887353849496462e-06, + "loss": 0.1839, + "step": 18111 + }, + { + "epoch": 1.7062245354560654, + "grad_norm": 0.6876722574234009, + "learning_rate": 1.088050308236126e-06, + "loss": 0.2122, + "step": 18112 + }, + { + "epoch": 1.7063187395492334, + "grad_norm": 0.6527280211448669, + "learning_rate": 1.0873654347288443e-06, + "loss": 0.1718, + "step": 18113 + }, + { + "epoch": 1.7064129436424014, + "grad_norm": 0.6006307005882263, + "learning_rate": 1.0866807644434118e-06, + "loss": 0.1745, + "step": 18114 + }, + { + "epoch": 1.706507147735569, + "grad_norm": 0.7178559303283691, + "learning_rate": 1.0859962973954442e-06, + "loss": 0.2063, + "step": 18115 + }, + { + "epoch": 1.7066013518287368, + "grad_norm": 0.6288301348686218, + "learning_rate": 1.0853120336005452e-06, + "loss": 0.2015, + "step": 18116 + }, + { + "epoch": 1.7066955559219048, + "grad_norm": 0.6841304302215576, + "learning_rate": 1.0846279730743192e-06, + "loss": 0.1845, + "step": 18117 + }, + { + "epoch": 1.7067897600150728, + "grad_norm": 0.6126241087913513, + "learning_rate": 1.0839441158323648e-06, + "loss": 0.1839, + "step": 18118 + }, + { + "epoch": 1.7068839641082405, + "grad_norm": 0.687613844871521, + "learning_rate": 1.083260461890271e-06, + "loss": 0.2175, + "step": 18119 + }, + { + "epoch": 1.7069781682014082, + "grad_norm": 0.6671021580696106, + "learning_rate": 1.0825770112636292e-06, + "loss": 0.1971, + "step": 18120 + }, + { + "epoch": 1.7070723722945762, + "grad_norm": 0.6684998273849487, + "learning_rate": 1.081893763968025e-06, + "loss": 0.1991, + "step": 18121 + }, + { + "epoch": 1.7071665763877442, + "grad_norm": 0.6642913222312927, + "learning_rate": 1.0812107200190325e-06, + "loss": 0.1927, + "step": 18122 + }, + { + "epoch": 1.707260780480912, + "grad_norm": 0.6435067057609558, + "learning_rate": 1.0805278794322304e-06, + "loss": 0.1944, + "step": 18123 + }, + { + "epoch": 1.7073549845740796, + "grad_norm": 0.6502464413642883, + "learning_rate": 1.0798452422231864e-06, + "loss": 0.1953, + "step": 18124 + }, + { + "epoch": 1.7074491886672476, + "grad_norm": 0.6137212514877319, + "learning_rate": 1.0791628084074668e-06, + "loss": 0.1827, + "step": 18125 + }, + { + "epoch": 1.7075433927604156, + "grad_norm": 0.6438696980476379, + "learning_rate": 1.0784805780006302e-06, + "loss": 0.1859, + "step": 18126 + }, + { + "epoch": 1.7076375968535833, + "grad_norm": 0.6703856587409973, + "learning_rate": 1.0777985510182332e-06, + "loss": 0.1902, + "step": 18127 + }, + { + "epoch": 1.707731800946751, + "grad_norm": 0.6582531332969666, + "learning_rate": 1.07711672747583e-06, + "loss": 0.1943, + "step": 18128 + }, + { + "epoch": 1.707826005039919, + "grad_norm": 0.851245641708374, + "learning_rate": 1.0764351073889624e-06, + "loss": 0.1973, + "step": 18129 + }, + { + "epoch": 1.707920209133087, + "grad_norm": 0.8826058506965637, + "learning_rate": 1.0757536907731758e-06, + "loss": 0.2247, + "step": 18130 + }, + { + "epoch": 1.7080144132262547, + "grad_norm": 0.6787847876548767, + "learning_rate": 1.075072477644008e-06, + "loss": 0.231, + "step": 18131 + }, + { + "epoch": 1.7081086173194224, + "grad_norm": 0.6661331057548523, + "learning_rate": 1.0743914680169887e-06, + "loss": 0.2095, + "step": 18132 + }, + { + "epoch": 1.7082028214125904, + "grad_norm": 0.6378578543663025, + "learning_rate": 1.0737106619076477e-06, + "loss": 0.1976, + "step": 18133 + }, + { + "epoch": 1.7082970255057583, + "grad_norm": 0.71204674243927, + "learning_rate": 1.07303005933151e-06, + "loss": 0.2129, + "step": 18134 + }, + { + "epoch": 1.708391229598926, + "grad_norm": 0.5995174050331116, + "learning_rate": 1.072349660304093e-06, + "loss": 0.1967, + "step": 18135 + }, + { + "epoch": 1.7084854336920938, + "grad_norm": 0.6255856156349182, + "learning_rate": 1.0716694648409066e-06, + "loss": 0.1882, + "step": 18136 + }, + { + "epoch": 1.7085796377852618, + "grad_norm": 0.6656293869018555, + "learning_rate": 1.0709894729574688e-06, + "loss": 0.2014, + "step": 18137 + }, + { + "epoch": 1.7086738418784297, + "grad_norm": 0.6369428038597107, + "learning_rate": 1.0703096846692796e-06, + "loss": 0.1806, + "step": 18138 + }, + { + "epoch": 1.7087680459715975, + "grad_norm": 0.6439520120620728, + "learning_rate": 1.0696300999918353e-06, + "loss": 0.1806, + "step": 18139 + }, + { + "epoch": 1.7088622500647652, + "grad_norm": 0.6167713403701782, + "learning_rate": 1.06895071894064e-06, + "loss": 0.1976, + "step": 18140 + }, + { + "epoch": 1.7089564541579332, + "grad_norm": 1.0032925605773926, + "learning_rate": 1.0682715415311807e-06, + "loss": 0.1637, + "step": 18141 + }, + { + "epoch": 1.7090506582511011, + "grad_norm": 0.6650711894035339, + "learning_rate": 1.067592567778939e-06, + "loss": 0.1915, + "step": 18142 + }, + { + "epoch": 1.7091448623442689, + "grad_norm": 0.7527432441711426, + "learning_rate": 1.066913797699406e-06, + "loss": 0.1813, + "step": 18143 + }, + { + "epoch": 1.7092390664374366, + "grad_norm": 0.8132700324058533, + "learning_rate": 1.0662352313080526e-06, + "loss": 0.1962, + "step": 18144 + }, + { + "epoch": 1.7093332705306046, + "grad_norm": 0.6715097427368164, + "learning_rate": 1.065556868620351e-06, + "loss": 0.2228, + "step": 18145 + }, + { + "epoch": 1.7094274746237725, + "grad_norm": 0.6167192459106445, + "learning_rate": 1.06487870965177e-06, + "loss": 0.1796, + "step": 18146 + }, + { + "epoch": 1.7095216787169403, + "grad_norm": 0.6042233109474182, + "learning_rate": 1.0642007544177745e-06, + "loss": 0.1978, + "step": 18147 + }, + { + "epoch": 1.709615882810108, + "grad_norm": 0.6343446969985962, + "learning_rate": 1.0635230029338196e-06, + "loss": 0.1799, + "step": 18148 + }, + { + "epoch": 1.709710086903276, + "grad_norm": 0.7236276865005493, + "learning_rate": 1.06284545521536e-06, + "loss": 0.2177, + "step": 18149 + }, + { + "epoch": 1.709804290996444, + "grad_norm": 0.5993223190307617, + "learning_rate": 1.0621681112778482e-06, + "loss": 0.1743, + "step": 18150 + }, + { + "epoch": 1.7098984950896117, + "grad_norm": 0.6236757040023804, + "learning_rate": 1.061490971136724e-06, + "loss": 0.1815, + "step": 18151 + }, + { + "epoch": 1.7099926991827794, + "grad_norm": 0.6627745628356934, + "learning_rate": 1.0608140348074292e-06, + "loss": 0.2088, + "step": 18152 + }, + { + "epoch": 1.7100869032759474, + "grad_norm": 0.652539074420929, + "learning_rate": 1.0601373023054018e-06, + "loss": 0.167, + "step": 18153 + }, + { + "epoch": 1.7101811073691153, + "grad_norm": 0.628743588924408, + "learning_rate": 1.059460773646067e-06, + "loss": 0.1709, + "step": 18154 + }, + { + "epoch": 1.710275311462283, + "grad_norm": 0.6140427589416504, + "learning_rate": 1.058784448844855e-06, + "loss": 0.179, + "step": 18155 + }, + { + "epoch": 1.7103695155554508, + "grad_norm": 2.1912851333618164, + "learning_rate": 1.0581083279171843e-06, + "loss": 0.1854, + "step": 18156 + }, + { + "epoch": 1.7104637196486188, + "grad_norm": 0.6636489629745483, + "learning_rate": 1.0574324108784728e-06, + "loss": 0.1941, + "step": 18157 + }, + { + "epoch": 1.7105579237417867, + "grad_norm": 0.7394426465034485, + "learning_rate": 1.0567566977441334e-06, + "loss": 0.2125, + "step": 18158 + }, + { + "epoch": 1.7106521278349545, + "grad_norm": 0.6625232696533203, + "learning_rate": 1.0560811885295708e-06, + "loss": 0.201, + "step": 18159 + }, + { + "epoch": 1.7107463319281222, + "grad_norm": 0.6909732222557068, + "learning_rate": 1.055405883250189e-06, + "loss": 0.1804, + "step": 18160 + }, + { + "epoch": 1.7108405360212902, + "grad_norm": 0.6699437499046326, + "learning_rate": 1.054730781921388e-06, + "loss": 0.2196, + "step": 18161 + }, + { + "epoch": 1.7109347401144581, + "grad_norm": 0.6777843832969666, + "learning_rate": 1.054055884558558e-06, + "loss": 0.194, + "step": 18162 + }, + { + "epoch": 1.7110289442076259, + "grad_norm": 0.6522235870361328, + "learning_rate": 1.0533811911770896e-06, + "loss": 0.1768, + "step": 18163 + }, + { + "epoch": 1.7111231483007936, + "grad_norm": 0.659884512424469, + "learning_rate": 1.0527067017923654e-06, + "loss": 0.188, + "step": 18164 + }, + { + "epoch": 1.7112173523939616, + "grad_norm": 0.6575126647949219, + "learning_rate": 1.0520324164197683e-06, + "loss": 0.1951, + "step": 18165 + }, + { + "epoch": 1.7113115564871295, + "grad_norm": 0.6973704099655151, + "learning_rate": 1.051358335074667e-06, + "loss": 0.1987, + "step": 18166 + }, + { + "epoch": 1.7114057605802973, + "grad_norm": 0.6405927538871765, + "learning_rate": 1.0506844577724352e-06, + "loss": 0.1963, + "step": 18167 + }, + { + "epoch": 1.711499964673465, + "grad_norm": 0.6451680064201355, + "learning_rate": 1.0500107845284402e-06, + "loss": 0.1817, + "step": 18168 + }, + { + "epoch": 1.711594168766633, + "grad_norm": 0.5515987277030945, + "learning_rate": 1.049337315358039e-06, + "loss": 0.1632, + "step": 18169 + }, + { + "epoch": 1.711688372859801, + "grad_norm": 0.6623426079750061, + "learning_rate": 1.04866405027659e-06, + "loss": 0.1984, + "step": 18170 + }, + { + "epoch": 1.7117825769529686, + "grad_norm": 0.6332948207855225, + "learning_rate": 1.0479909892994456e-06, + "loss": 0.1891, + "step": 18171 + }, + { + "epoch": 1.7118767810461364, + "grad_norm": 0.6678027510643005, + "learning_rate": 1.047318132441949e-06, + "loss": 0.1899, + "step": 18172 + }, + { + "epoch": 1.7119709851393043, + "grad_norm": 0.7194743752479553, + "learning_rate": 1.0466454797194448e-06, + "loss": 0.2106, + "step": 18173 + }, + { + "epoch": 1.7120651892324723, + "grad_norm": 0.6241485476493835, + "learning_rate": 1.0459730311472716e-06, + "loss": 0.1709, + "step": 18174 + }, + { + "epoch": 1.71215939332564, + "grad_norm": 0.6804275512695312, + "learning_rate": 1.0453007867407617e-06, + "loss": 0.1752, + "step": 18175 + }, + { + "epoch": 1.7122535974188078, + "grad_norm": 0.7246103882789612, + "learning_rate": 1.0446287465152383e-06, + "loss": 0.1967, + "step": 18176 + }, + { + "epoch": 1.7123478015119757, + "grad_norm": 0.7021299600601196, + "learning_rate": 1.0439569104860337e-06, + "loss": 0.2153, + "step": 18177 + }, + { + "epoch": 1.7124420056051437, + "grad_norm": 0.6220550537109375, + "learning_rate": 1.043285278668462e-06, + "loss": 0.1616, + "step": 18178 + }, + { + "epoch": 1.7125362096983114, + "grad_norm": 0.6525019407272339, + "learning_rate": 1.0426138510778338e-06, + "loss": 0.1964, + "step": 18179 + }, + { + "epoch": 1.7126304137914792, + "grad_norm": 0.6876130700111389, + "learning_rate": 1.0419426277294663e-06, + "loss": 0.1733, + "step": 18180 + }, + { + "epoch": 1.7127246178846471, + "grad_norm": 0.6705190539360046, + "learning_rate": 1.041271608638662e-06, + "loss": 0.2052, + "step": 18181 + }, + { + "epoch": 1.712818821977815, + "grad_norm": 0.6083057522773743, + "learning_rate": 1.0406007938207153e-06, + "loss": 0.1654, + "step": 18182 + }, + { + "epoch": 1.7129130260709828, + "grad_norm": 0.6081562638282776, + "learning_rate": 1.0399301832909303e-06, + "loss": 0.1769, + "step": 18183 + }, + { + "epoch": 1.7130072301641506, + "grad_norm": 0.650029718875885, + "learning_rate": 1.039259777064594e-06, + "loss": 0.1716, + "step": 18184 + }, + { + "epoch": 1.7131014342573185, + "grad_norm": 0.6393464207649231, + "learning_rate": 1.0385895751569909e-06, + "loss": 0.1655, + "step": 18185 + }, + { + "epoch": 1.7131956383504865, + "grad_norm": 0.6619433164596558, + "learning_rate": 1.037919577583404e-06, + "loss": 0.2049, + "step": 18186 + }, + { + "epoch": 1.7132898424436542, + "grad_norm": 0.7716788649559021, + "learning_rate": 1.0372497843591133e-06, + "loss": 0.2108, + "step": 18187 + }, + { + "epoch": 1.713384046536822, + "grad_norm": 0.6516677737236023, + "learning_rate": 1.0365801954993871e-06, + "loss": 0.1917, + "step": 18188 + }, + { + "epoch": 1.71347825062999, + "grad_norm": 0.6674495935440063, + "learning_rate": 1.035910811019495e-06, + "loss": 0.1888, + "step": 18189 + }, + { + "epoch": 1.7135724547231577, + "grad_norm": 0.6783060431480408, + "learning_rate": 1.0352416309347003e-06, + "loss": 0.2232, + "step": 18190 + }, + { + "epoch": 1.7136666588163254, + "grad_norm": 0.7408477067947388, + "learning_rate": 1.0345726552602598e-06, + "loss": 0.2063, + "step": 18191 + }, + { + "epoch": 1.7137608629094934, + "grad_norm": 0.6930561661720276, + "learning_rate": 1.033903884011428e-06, + "loss": 0.1881, + "step": 18192 + }, + { + "epoch": 1.7138550670026613, + "grad_norm": 0.6963168382644653, + "learning_rate": 1.0332353172034548e-06, + "loss": 0.2045, + "step": 18193 + }, + { + "epoch": 1.713949271095829, + "grad_norm": 0.6521729826927185, + "learning_rate": 1.0325669548515826e-06, + "loss": 0.1841, + "step": 18194 + }, + { + "epoch": 1.7140434751889968, + "grad_norm": 0.652824342250824, + "learning_rate": 1.0318987969710548e-06, + "loss": 0.2027, + "step": 18195 + }, + { + "epoch": 1.7141376792821648, + "grad_norm": 0.5659494996070862, + "learning_rate": 1.0312308435771013e-06, + "loss": 0.167, + "step": 18196 + }, + { + "epoch": 1.7142318833753327, + "grad_norm": 0.6974686980247498, + "learning_rate": 1.0305630946849554e-06, + "loss": 0.2141, + "step": 18197 + }, + { + "epoch": 1.7143260874685005, + "grad_norm": 0.6462910771369934, + "learning_rate": 1.029895550309844e-06, + "loss": 0.1891, + "step": 18198 + }, + { + "epoch": 1.7144202915616682, + "grad_norm": 0.7359683513641357, + "learning_rate": 1.0292282104669837e-06, + "loss": 0.1938, + "step": 18199 + }, + { + "epoch": 1.7145144956548362, + "grad_norm": 0.6937127113342285, + "learning_rate": 1.028561075171597e-06, + "loss": 0.2113, + "step": 18200 + }, + { + "epoch": 1.7146086997480041, + "grad_norm": 0.6812256574630737, + "learning_rate": 1.0278941444388902e-06, + "loss": 0.1776, + "step": 18201 + }, + { + "epoch": 1.7147029038411719, + "grad_norm": 0.6547383666038513, + "learning_rate": 1.0272274182840724e-06, + "loss": 0.1916, + "step": 18202 + }, + { + "epoch": 1.7147971079343396, + "grad_norm": 0.7019928693771362, + "learning_rate": 1.0265608967223483e-06, + "loss": 0.2001, + "step": 18203 + }, + { + "epoch": 1.7148913120275076, + "grad_norm": 0.6956671476364136, + "learning_rate": 1.0258945797689112e-06, + "loss": 0.1973, + "step": 18204 + }, + { + "epoch": 1.7149855161206755, + "grad_norm": 0.6771193742752075, + "learning_rate": 1.025228467438959e-06, + "loss": 0.1947, + "step": 18205 + }, + { + "epoch": 1.7150797202138433, + "grad_norm": 0.597726583480835, + "learning_rate": 1.0245625597476748e-06, + "loss": 0.1738, + "step": 18206 + }, + { + "epoch": 1.715173924307011, + "grad_norm": 0.6639793515205383, + "learning_rate": 1.0238968567102448e-06, + "loss": 0.2011, + "step": 18207 + }, + { + "epoch": 1.715268128400179, + "grad_norm": 0.625450074672699, + "learning_rate": 1.0232313583418507e-06, + "loss": 0.2001, + "step": 18208 + }, + { + "epoch": 1.715362332493347, + "grad_norm": 0.6180112361907959, + "learning_rate": 1.022566064657663e-06, + "loss": 0.1849, + "step": 18209 + }, + { + "epoch": 1.7154565365865146, + "grad_norm": 0.6868562698364258, + "learning_rate": 1.0219009756728516e-06, + "loss": 0.2031, + "step": 18210 + }, + { + "epoch": 1.7155507406796824, + "grad_norm": 0.6511270403862, + "learning_rate": 1.0212360914025865e-06, + "loss": 0.1744, + "step": 18211 + }, + { + "epoch": 1.7156449447728503, + "grad_norm": 0.6476146578788757, + "learning_rate": 1.0205714118620214e-06, + "loss": 0.1861, + "step": 18212 + }, + { + "epoch": 1.7157391488660183, + "grad_norm": 0.758589506149292, + "learning_rate": 1.019906937066315e-06, + "loss": 0.1983, + "step": 18213 + }, + { + "epoch": 1.715833352959186, + "grad_norm": 0.6474173665046692, + "learning_rate": 1.019242667030621e-06, + "loss": 0.2005, + "step": 18214 + }, + { + "epoch": 1.7159275570523538, + "grad_norm": 0.6607803106307983, + "learning_rate": 1.0185786017700828e-06, + "loss": 0.1853, + "step": 18215 + }, + { + "epoch": 1.7160217611455217, + "grad_norm": 0.6892468333244324, + "learning_rate": 1.017914741299838e-06, + "loss": 0.1937, + "step": 18216 + }, + { + "epoch": 1.7161159652386897, + "grad_norm": 0.6857177019119263, + "learning_rate": 1.0172510856350326e-06, + "loss": 0.1789, + "step": 18217 + }, + { + "epoch": 1.7162101693318574, + "grad_norm": 0.607350766658783, + "learning_rate": 1.0165876347907944e-06, + "loss": 0.2003, + "step": 18218 + }, + { + "epoch": 1.7163043734250252, + "grad_norm": 0.638554036617279, + "learning_rate": 1.0159243887822479e-06, + "loss": 0.2004, + "step": 18219 + }, + { + "epoch": 1.7163985775181931, + "grad_norm": 0.6638485789299011, + "learning_rate": 1.0152613476245232e-06, + "loss": 0.1718, + "step": 18220 + }, + { + "epoch": 1.716492781611361, + "grad_norm": 0.6935262680053711, + "learning_rate": 1.014598511332735e-06, + "loss": 0.1828, + "step": 18221 + }, + { + "epoch": 1.7165869857045288, + "grad_norm": 0.6868494749069214, + "learning_rate": 1.0139358799219924e-06, + "loss": 0.2059, + "step": 18222 + }, + { + "epoch": 1.7166811897976966, + "grad_norm": 0.7230465412139893, + "learning_rate": 1.013273453407413e-06, + "loss": 0.1789, + "step": 18223 + }, + { + "epoch": 1.7167753938908645, + "grad_norm": 0.7610085606575012, + "learning_rate": 1.0126112318040981e-06, + "loss": 0.1926, + "step": 18224 + }, + { + "epoch": 1.7168695979840325, + "grad_norm": 0.6834355592727661, + "learning_rate": 1.0119492151271438e-06, + "loss": 0.2293, + "step": 18225 + }, + { + "epoch": 1.7169638020772002, + "grad_norm": 0.6361607313156128, + "learning_rate": 1.0112874033916465e-06, + "loss": 0.1826, + "step": 18226 + }, + { + "epoch": 1.717058006170368, + "grad_norm": 0.7722209095954895, + "learning_rate": 1.010625796612701e-06, + "loss": 0.2007, + "step": 18227 + }, + { + "epoch": 1.717152210263536, + "grad_norm": 0.6115467548370361, + "learning_rate": 1.0099643948053872e-06, + "loss": 0.2103, + "step": 18228 + }, + { + "epoch": 1.7172464143567039, + "grad_norm": 0.7447245717048645, + "learning_rate": 1.009303197984789e-06, + "loss": 0.1919, + "step": 18229 + }, + { + "epoch": 1.7173406184498716, + "grad_norm": 0.6729543805122375, + "learning_rate": 1.008642206165984e-06, + "loss": 0.1762, + "step": 18230 + }, + { + "epoch": 1.7174348225430394, + "grad_norm": 0.6252509355545044, + "learning_rate": 1.0079814193640403e-06, + "loss": 0.1947, + "step": 18231 + }, + { + "epoch": 1.7175290266362073, + "grad_norm": 0.6713355183601379, + "learning_rate": 1.007320837594027e-06, + "loss": 0.2338, + "step": 18232 + }, + { + "epoch": 1.7176232307293753, + "grad_norm": 0.6256462931632996, + "learning_rate": 1.0066604608710073e-06, + "loss": 0.2017, + "step": 18233 + }, + { + "epoch": 1.717717434822543, + "grad_norm": 0.6962444186210632, + "learning_rate": 1.0060002892100363e-06, + "loss": 0.1896, + "step": 18234 + }, + { + "epoch": 1.7178116389157108, + "grad_norm": 0.6423612833023071, + "learning_rate": 1.0053403226261694e-06, + "loss": 0.1663, + "step": 18235 + }, + { + "epoch": 1.7179058430088787, + "grad_norm": 0.6608044505119324, + "learning_rate": 1.0046805611344512e-06, + "loss": 0.1918, + "step": 18236 + }, + { + "epoch": 1.7180000471020467, + "grad_norm": 0.7020596861839294, + "learning_rate": 1.0040210047499289e-06, + "loss": 0.1788, + "step": 18237 + }, + { + "epoch": 1.7180942511952144, + "grad_norm": 0.6175522208213806, + "learning_rate": 1.0033616534876423e-06, + "loss": 0.1907, + "step": 18238 + }, + { + "epoch": 1.7181884552883822, + "grad_norm": 0.941880464553833, + "learning_rate": 1.0027025073626206e-06, + "loss": 0.1775, + "step": 18239 + }, + { + "epoch": 1.7182826593815501, + "grad_norm": 0.5877217054367065, + "learning_rate": 1.0020435663898985e-06, + "loss": 0.1795, + "step": 18240 + }, + { + "epoch": 1.718376863474718, + "grad_norm": 0.7008559703826904, + "learning_rate": 1.0013848305844975e-06, + "loss": 0.214, + "step": 18241 + }, + { + "epoch": 1.7184710675678858, + "grad_norm": 0.6821069717407227, + "learning_rate": 1.0007262999614387e-06, + "loss": 0.1908, + "step": 18242 + }, + { + "epoch": 1.7185652716610536, + "grad_norm": 0.6792659163475037, + "learning_rate": 1.00006797453574e-06, + "loss": 0.1775, + "step": 18243 + }, + { + "epoch": 1.7186594757542215, + "grad_norm": 0.6423118114471436, + "learning_rate": 9.994098543224073e-07, + "loss": 0.2076, + "step": 18244 + }, + { + "epoch": 1.7187536798473895, + "grad_norm": 0.6862238645553589, + "learning_rate": 9.98751939336452e-07, + "loss": 0.1977, + "step": 18245 + }, + { + "epoch": 1.7188478839405572, + "grad_norm": 0.7554076313972473, + "learning_rate": 9.98094229592872e-07, + "loss": 0.2046, + "step": 18246 + }, + { + "epoch": 1.718942088033725, + "grad_norm": 0.6626954078674316, + "learning_rate": 9.974367251066642e-07, + "loss": 0.2038, + "step": 18247 + }, + { + "epoch": 1.719036292126893, + "grad_norm": 0.7395123839378357, + "learning_rate": 9.967794258928243e-07, + "loss": 0.1841, + "step": 18248 + }, + { + "epoch": 1.7191304962200609, + "grad_norm": 0.6694244742393494, + "learning_rate": 9.961223319663349e-07, + "loss": 0.1823, + "step": 18249 + }, + { + "epoch": 1.7192247003132286, + "grad_norm": 0.6248368620872498, + "learning_rate": 9.954654433421818e-07, + "loss": 0.216, + "step": 18250 + }, + { + "epoch": 1.7193189044063963, + "grad_norm": 0.6329765319824219, + "learning_rate": 9.94808760035344e-07, + "loss": 0.1958, + "step": 18251 + }, + { + "epoch": 1.7194131084995643, + "grad_norm": 0.6825846433639526, + "learning_rate": 9.941522820607908e-07, + "loss": 0.1955, + "step": 18252 + }, + { + "epoch": 1.7195073125927323, + "grad_norm": 0.6598239541053772, + "learning_rate": 9.934960094334934e-07, + "loss": 0.178, + "step": 18253 + }, + { + "epoch": 1.7196015166859, + "grad_norm": 0.6834250688552856, + "learning_rate": 9.928399421684186e-07, + "loss": 0.2026, + "step": 18254 + }, + { + "epoch": 1.7196957207790677, + "grad_norm": 0.6410261392593384, + "learning_rate": 9.921840802805228e-07, + "loss": 0.1953, + "step": 18255 + }, + { + "epoch": 1.7197899248722357, + "grad_norm": 0.6028664112091064, + "learning_rate": 9.915284237847566e-07, + "loss": 0.1866, + "step": 18256 + }, + { + "epoch": 1.7198841289654037, + "grad_norm": 0.6074325442314148, + "learning_rate": 9.908729726960776e-07, + "loss": 0.1796, + "step": 18257 + }, + { + "epoch": 1.7199783330585714, + "grad_norm": 0.700425922870636, + "learning_rate": 9.902177270294288e-07, + "loss": 0.1744, + "step": 18258 + }, + { + "epoch": 1.7200725371517391, + "grad_norm": 0.6485363841056824, + "learning_rate": 9.895626867997454e-07, + "loss": 0.1787, + "step": 18259 + }, + { + "epoch": 1.720166741244907, + "grad_norm": 0.6814277172088623, + "learning_rate": 9.889078520219708e-07, + "loss": 0.187, + "step": 18260 + }, + { + "epoch": 1.720260945338075, + "grad_norm": 0.6854941248893738, + "learning_rate": 9.882532227110343e-07, + "loss": 0.1849, + "step": 18261 + }, + { + "epoch": 1.7203551494312428, + "grad_norm": 0.5841566324234009, + "learning_rate": 9.87598798881857e-07, + "loss": 0.1937, + "step": 18262 + }, + { + "epoch": 1.7204493535244105, + "grad_norm": 0.6688234806060791, + "learning_rate": 9.869445805493682e-07, + "loss": 0.2034, + "step": 18263 + }, + { + "epoch": 1.7205435576175785, + "grad_norm": 0.6713355183601379, + "learning_rate": 9.862905677284828e-07, + "loss": 0.1775, + "step": 18264 + }, + { + "epoch": 1.7206377617107464, + "grad_norm": 0.633851945400238, + "learning_rate": 9.8563676043411e-07, + "loss": 0.1961, + "step": 18265 + }, + { + "epoch": 1.7207319658039142, + "grad_norm": 0.6884269714355469, + "learning_rate": 9.849831586811597e-07, + "loss": 0.1894, + "step": 18266 + }, + { + "epoch": 1.720826169897082, + "grad_norm": 0.6791149973869324, + "learning_rate": 9.843297624845382e-07, + "loss": 0.2027, + "step": 18267 + }, + { + "epoch": 1.7209203739902499, + "grad_norm": 0.6391034126281738, + "learning_rate": 9.83676571859138e-07, + "loss": 0.1858, + "step": 18268 + }, + { + "epoch": 1.7210145780834178, + "grad_norm": 0.7244287729263306, + "learning_rate": 9.830235868198567e-07, + "loss": 0.204, + "step": 18269 + }, + { + "epoch": 1.7211087821765856, + "grad_norm": 0.6550241708755493, + "learning_rate": 9.823708073815852e-07, + "loss": 0.1966, + "step": 18270 + }, + { + "epoch": 1.7212029862697533, + "grad_norm": 0.7238351702690125, + "learning_rate": 9.817182335592023e-07, + "loss": 0.2001, + "step": 18271 + }, + { + "epoch": 1.7212971903629213, + "grad_norm": 0.6201068162918091, + "learning_rate": 9.81065865367592e-07, + "loss": 0.1891, + "step": 18272 + }, + { + "epoch": 1.7213913944560892, + "grad_norm": 0.690910816192627, + "learning_rate": 9.804137028216286e-07, + "loss": 0.1918, + "step": 18273 + }, + { + "epoch": 1.721485598549257, + "grad_norm": 0.5405566096305847, + "learning_rate": 9.797617459361808e-07, + "loss": 0.1959, + "step": 18274 + }, + { + "epoch": 1.7215798026424247, + "grad_norm": 0.6876025199890137, + "learning_rate": 9.791099947261162e-07, + "loss": 0.2096, + "step": 18275 + }, + { + "epoch": 1.7216740067355927, + "grad_norm": 0.6582169532775879, + "learning_rate": 9.784584492062942e-07, + "loss": 0.2002, + "step": 18276 + }, + { + "epoch": 1.7217682108287606, + "grad_norm": 0.6868323683738708, + "learning_rate": 9.778071093915709e-07, + "loss": 0.225, + "step": 18277 + }, + { + "epoch": 1.7218624149219284, + "grad_norm": 0.7047544717788696, + "learning_rate": 9.771559752968008e-07, + "loss": 0.2121, + "step": 18278 + }, + { + "epoch": 1.7219566190150961, + "grad_norm": 0.646683931350708, + "learning_rate": 9.765050469368254e-07, + "loss": 0.174, + "step": 18279 + }, + { + "epoch": 1.722050823108264, + "grad_norm": 0.6365481615066528, + "learning_rate": 9.758543243264939e-07, + "loss": 0.1659, + "step": 18280 + }, + { + "epoch": 1.722145027201432, + "grad_norm": 0.6406670212745667, + "learning_rate": 9.75203807480637e-07, + "loss": 0.1995, + "step": 18281 + }, + { + "epoch": 1.7222392312945998, + "grad_norm": 0.6405190229415894, + "learning_rate": 9.7455349641409e-07, + "loss": 0.2014, + "step": 18282 + }, + { + "epoch": 1.7223334353877675, + "grad_norm": 0.7335394620895386, + "learning_rate": 9.73903391141684e-07, + "loss": 0.1992, + "step": 18283 + }, + { + "epoch": 1.7224276394809355, + "grad_norm": 0.6978424191474915, + "learning_rate": 9.732534916782377e-07, + "loss": 0.1933, + "step": 18284 + }, + { + "epoch": 1.7225218435741034, + "grad_norm": 0.6657301783561707, + "learning_rate": 9.72603798038574e-07, + "loss": 0.1853, + "step": 18285 + }, + { + "epoch": 1.7226160476672712, + "grad_norm": 0.6643604040145874, + "learning_rate": 9.719543102375028e-07, + "loss": 0.205, + "step": 18286 + }, + { + "epoch": 1.722710251760439, + "grad_norm": 0.6385934352874756, + "learning_rate": 9.713050282898351e-07, + "loss": 0.1801, + "step": 18287 + }, + { + "epoch": 1.7228044558536069, + "grad_norm": 0.7379485368728638, + "learning_rate": 9.706559522103775e-07, + "loss": 0.2111, + "step": 18288 + }, + { + "epoch": 1.7228986599467748, + "grad_norm": 0.6756138801574707, + "learning_rate": 9.700070820139274e-07, + "loss": 0.1954, + "step": 18289 + }, + { + "epoch": 1.7229928640399426, + "grad_norm": 0.7081589102745056, + "learning_rate": 9.693584177152804e-07, + "loss": 0.1977, + "step": 18290 + }, + { + "epoch": 1.7230870681331103, + "grad_norm": 0.7241060137748718, + "learning_rate": 9.687099593292304e-07, + "loss": 0.2215, + "step": 18291 + }, + { + "epoch": 1.7231812722262783, + "grad_norm": 0.7350836992263794, + "learning_rate": 9.680617068705577e-07, + "loss": 0.212, + "step": 18292 + }, + { + "epoch": 1.7232754763194462, + "grad_norm": 0.6153767108917236, + "learning_rate": 9.674136603540463e-07, + "loss": 0.1926, + "step": 18293 + }, + { + "epoch": 1.723369680412614, + "grad_norm": 0.7060182690620422, + "learning_rate": 9.667658197944752e-07, + "loss": 0.2125, + "step": 18294 + }, + { + "epoch": 1.7234638845057817, + "grad_norm": 0.6536703109741211, + "learning_rate": 9.661181852066127e-07, + "loss": 0.1823, + "step": 18295 + }, + { + "epoch": 1.7235580885989497, + "grad_norm": 0.6886441111564636, + "learning_rate": 9.654707566052236e-07, + "loss": 0.2147, + "step": 18296 + }, + { + "epoch": 1.7236522926921176, + "grad_norm": 0.6352055668830872, + "learning_rate": 9.648235340050772e-07, + "loss": 0.2042, + "step": 18297 + }, + { + "epoch": 1.7237464967852854, + "grad_norm": 0.76617032289505, + "learning_rate": 9.64176517420926e-07, + "loss": 0.1771, + "step": 18298 + }, + { + "epoch": 1.723840700878453, + "grad_norm": 0.6597750782966614, + "learning_rate": 9.63529706867522e-07, + "loss": 0.1948, + "step": 18299 + }, + { + "epoch": 1.723934904971621, + "grad_norm": 0.7410958409309387, + "learning_rate": 9.628831023596197e-07, + "loss": 0.2078, + "step": 18300 + }, + { + "epoch": 1.724029109064789, + "grad_norm": 0.6253803968429565, + "learning_rate": 9.622367039119584e-07, + "loss": 0.1765, + "step": 18301 + }, + { + "epoch": 1.7241233131579567, + "grad_norm": 0.8138961791992188, + "learning_rate": 9.615905115392733e-07, + "loss": 0.1848, + "step": 18302 + }, + { + "epoch": 1.7242175172511245, + "grad_norm": 0.6228686571121216, + "learning_rate": 9.609445252563078e-07, + "loss": 0.1879, + "step": 18303 + }, + { + "epoch": 1.7243117213442924, + "grad_norm": 0.6464802026748657, + "learning_rate": 9.602987450777845e-07, + "loss": 0.2094, + "step": 18304 + }, + { + "epoch": 1.7244059254374604, + "grad_norm": 0.5831804871559143, + "learning_rate": 9.5965317101843e-07, + "loss": 0.1748, + "step": 18305 + }, + { + "epoch": 1.7245001295306281, + "grad_norm": 0.6228283643722534, + "learning_rate": 9.590078030929628e-07, + "loss": 0.1865, + "step": 18306 + }, + { + "epoch": 1.7245943336237959, + "grad_norm": 0.8141707181930542, + "learning_rate": 9.583626413161018e-07, + "loss": 0.2085, + "step": 18307 + }, + { + "epoch": 1.7246885377169638, + "grad_norm": 0.7365865707397461, + "learning_rate": 9.57717685702555e-07, + "loss": 0.2034, + "step": 18308 + }, + { + "epoch": 1.7247827418101318, + "grad_norm": 0.7254769802093506, + "learning_rate": 9.570729362670284e-07, + "loss": 0.2196, + "step": 18309 + }, + { + "epoch": 1.7248769459032995, + "grad_norm": 0.6105891466140747, + "learning_rate": 9.564283930242258e-07, + "loss": 0.1968, + "step": 18310 + }, + { + "epoch": 1.7249711499964673, + "grad_norm": 0.7820233106613159, + "learning_rate": 9.5578405598884e-07, + "loss": 0.1708, + "step": 18311 + }, + { + "epoch": 1.7250653540896352, + "grad_norm": 0.6825641393661499, + "learning_rate": 9.551399251755654e-07, + "loss": 0.2058, + "step": 18312 + }, + { + "epoch": 1.7251595581828032, + "grad_norm": 0.6899803876876831, + "learning_rate": 9.5449600059909e-07, + "loss": 0.1909, + "step": 18313 + }, + { + "epoch": 1.725253762275971, + "grad_norm": 0.6918448805809021, + "learning_rate": 9.538522822740937e-07, + "loss": 0.2049, + "step": 18314 + }, + { + "epoch": 1.7253479663691387, + "grad_norm": 0.6330221891403198, + "learning_rate": 9.53208770215257e-07, + "loss": 0.172, + "step": 18315 + }, + { + "epoch": 1.7254421704623066, + "grad_norm": 0.6738264560699463, + "learning_rate": 9.525654644372495e-07, + "loss": 0.2003, + "step": 18316 + }, + { + "epoch": 1.7255363745554746, + "grad_norm": 0.6461614370346069, + "learning_rate": 9.519223649547437e-07, + "loss": 0.2075, + "step": 18317 + }, + { + "epoch": 1.7256305786486423, + "grad_norm": 0.7237948775291443, + "learning_rate": 9.512794717823992e-07, + "loss": 0.1925, + "step": 18318 + }, + { + "epoch": 1.72572478274181, + "grad_norm": 0.7411685585975647, + "learning_rate": 9.506367849348763e-07, + "loss": 0.1935, + "step": 18319 + }, + { + "epoch": 1.725818986834978, + "grad_norm": 0.6511257886886597, + "learning_rate": 9.499943044268323e-07, + "loss": 0.1934, + "step": 18320 + }, + { + "epoch": 1.725913190928146, + "grad_norm": 0.6301068067550659, + "learning_rate": 9.49352030272912e-07, + "loss": 0.1771, + "step": 18321 + }, + { + "epoch": 1.7260073950213137, + "grad_norm": 0.6631643772125244, + "learning_rate": 9.487099624877627e-07, + "loss": 0.2078, + "step": 18322 + }, + { + "epoch": 1.7261015991144815, + "grad_norm": 0.6139720678329468, + "learning_rate": 9.48068101086026e-07, + "loss": 0.1854, + "step": 18323 + }, + { + "epoch": 1.7261958032076494, + "grad_norm": 0.6714940667152405, + "learning_rate": 9.474264460823346e-07, + "loss": 0.1943, + "step": 18324 + }, + { + "epoch": 1.7262900073008174, + "grad_norm": 0.6282404065132141, + "learning_rate": 9.467849974913212e-07, + "loss": 0.2068, + "step": 18325 + }, + { + "epoch": 1.7263842113939851, + "grad_norm": 0.6319966912269592, + "learning_rate": 9.461437553276098e-07, + "loss": 0.1847, + "step": 18326 + }, + { + "epoch": 1.7264784154871529, + "grad_norm": 0.6349130868911743, + "learning_rate": 9.45502719605822e-07, + "loss": 0.1986, + "step": 18327 + }, + { + "epoch": 1.7265726195803208, + "grad_norm": 0.646722137928009, + "learning_rate": 9.448618903405782e-07, + "loss": 0.1895, + "step": 18328 + }, + { + "epoch": 1.7266668236734886, + "grad_norm": 0.6951996088027954, + "learning_rate": 9.442212675464845e-07, + "loss": 0.2064, + "step": 18329 + }, + { + "epoch": 1.7267610277666563, + "grad_norm": 0.6779601573944092, + "learning_rate": 9.435808512381506e-07, + "loss": 0.2039, + "step": 18330 + }, + { + "epoch": 1.7268552318598243, + "grad_norm": 0.756580114364624, + "learning_rate": 9.429406414301822e-07, + "loss": 0.178, + "step": 18331 + }, + { + "epoch": 1.7269494359529922, + "grad_norm": 0.6476389765739441, + "learning_rate": 9.423006381371724e-07, + "loss": 0.1985, + "step": 18332 + }, + { + "epoch": 1.72704364004616, + "grad_norm": 0.653612494468689, + "learning_rate": 9.416608413737149e-07, + "loss": 0.18, + "step": 18333 + }, + { + "epoch": 1.7271378441393277, + "grad_norm": 0.6533164978027344, + "learning_rate": 9.410212511544025e-07, + "loss": 0.192, + "step": 18334 + }, + { + "epoch": 1.7272320482324957, + "grad_norm": 0.6046565771102905, + "learning_rate": 9.403818674938148e-07, + "loss": 0.1901, + "step": 18335 + }, + { + "epoch": 1.7273262523256636, + "grad_norm": 0.7406439185142517, + "learning_rate": 9.397426904065277e-07, + "loss": 0.1853, + "step": 18336 + }, + { + "epoch": 1.7274204564188314, + "grad_norm": 0.6120457649230957, + "learning_rate": 9.391037199071229e-07, + "loss": 0.1881, + "step": 18337 + }, + { + "epoch": 1.727514660511999, + "grad_norm": 0.6178550124168396, + "learning_rate": 9.384649560101667e-07, + "loss": 0.1781, + "step": 18338 + }, + { + "epoch": 1.727608864605167, + "grad_norm": 0.7009128332138062, + "learning_rate": 9.378263987302194e-07, + "loss": 0.2391, + "step": 18339 + }, + { + "epoch": 1.727703068698335, + "grad_norm": 0.6016954183578491, + "learning_rate": 9.371880480818485e-07, + "loss": 0.1631, + "step": 18340 + }, + { + "epoch": 1.7277972727915027, + "grad_norm": 0.6001085042953491, + "learning_rate": 9.365499040796066e-07, + "loss": 0.1927, + "step": 18341 + }, + { + "epoch": 1.7278914768846705, + "grad_norm": 0.6183971762657166, + "learning_rate": 9.359119667380412e-07, + "loss": 0.1741, + "step": 18342 + }, + { + "epoch": 1.7279856809778384, + "grad_norm": 0.7459295392036438, + "learning_rate": 9.352742360717015e-07, + "loss": 0.2034, + "step": 18343 + }, + { + "epoch": 1.7280798850710064, + "grad_norm": 0.6946557760238647, + "learning_rate": 9.346367120951305e-07, + "loss": 0.1828, + "step": 18344 + }, + { + "epoch": 1.7281740891641741, + "grad_norm": 0.6636278033256531, + "learning_rate": 9.339993948228587e-07, + "loss": 0.1718, + "step": 18345 + }, + { + "epoch": 1.7282682932573419, + "grad_norm": 0.6898250579833984, + "learning_rate": 9.333622842694234e-07, + "loss": 0.1958, + "step": 18346 + }, + { + "epoch": 1.7283624973505098, + "grad_norm": 0.6806836128234863, + "learning_rate": 9.327253804493508e-07, + "loss": 0.2064, + "step": 18347 + }, + { + "epoch": 1.7284567014436778, + "grad_norm": 0.7299802899360657, + "learning_rate": 9.320886833771603e-07, + "loss": 0.2029, + "step": 18348 + }, + { + "epoch": 1.7285509055368455, + "grad_norm": 0.6397003531455994, + "learning_rate": 9.314521930673714e-07, + "loss": 0.1931, + "step": 18349 + }, + { + "epoch": 1.7286451096300133, + "grad_norm": 0.6581284999847412, + "learning_rate": 9.308159095345004e-07, + "loss": 0.1888, + "step": 18350 + }, + { + "epoch": 1.7287393137231812, + "grad_norm": 0.6814366579055786, + "learning_rate": 9.301798327930489e-07, + "loss": 0.1968, + "step": 18351 + }, + { + "epoch": 1.7288335178163492, + "grad_norm": 0.7398734092712402, + "learning_rate": 9.295439628575253e-07, + "loss": 0.2177, + "step": 18352 + }, + { + "epoch": 1.728927721909517, + "grad_norm": 0.640472948551178, + "learning_rate": 9.289082997424281e-07, + "loss": 0.1842, + "step": 18353 + }, + { + "epoch": 1.7290219260026847, + "grad_norm": 0.6734275817871094, + "learning_rate": 9.28272843462249e-07, + "loss": 0.23, + "step": 18354 + }, + { + "epoch": 1.7291161300958526, + "grad_norm": 0.6754468083381653, + "learning_rate": 9.276375940314807e-07, + "loss": 0.1917, + "step": 18355 + }, + { + "epoch": 1.7292103341890206, + "grad_norm": 0.6099188923835754, + "learning_rate": 9.270025514646042e-07, + "loss": 0.168, + "step": 18356 + }, + { + "epoch": 1.7293045382821883, + "grad_norm": 0.651532769203186, + "learning_rate": 9.26367715776102e-07, + "loss": 0.1998, + "step": 18357 + }, + { + "epoch": 1.729398742375356, + "grad_norm": 0.6025399565696716, + "learning_rate": 9.257330869804482e-07, + "loss": 0.1894, + "step": 18358 + }, + { + "epoch": 1.729492946468524, + "grad_norm": 0.6744096875190735, + "learning_rate": 9.250986650921124e-07, + "loss": 0.2052, + "step": 18359 + }, + { + "epoch": 1.729587150561692, + "grad_norm": 0.5932868719100952, + "learning_rate": 9.24464450125564e-07, + "loss": 0.1838, + "step": 18360 + }, + { + "epoch": 1.7296813546548597, + "grad_norm": 0.627146303653717, + "learning_rate": 9.238304420952593e-07, + "loss": 0.1953, + "step": 18361 + }, + { + "epoch": 1.7297755587480275, + "grad_norm": 0.6654533743858337, + "learning_rate": 9.231966410156578e-07, + "loss": 0.1779, + "step": 18362 + }, + { + "epoch": 1.7298697628411954, + "grad_norm": 0.6812465786933899, + "learning_rate": 9.225630469012125e-07, + "loss": 0.2031, + "step": 18363 + }, + { + "epoch": 1.7299639669343634, + "grad_norm": 0.5945623517036438, + "learning_rate": 9.219296597663663e-07, + "loss": 0.1651, + "step": 18364 + }, + { + "epoch": 1.7300581710275311, + "grad_norm": 0.6883321404457092, + "learning_rate": 9.212964796255641e-07, + "loss": 0.1889, + "step": 18365 + }, + { + "epoch": 1.7301523751206989, + "grad_norm": 0.7249394059181213, + "learning_rate": 9.206635064932423e-07, + "loss": 0.2093, + "step": 18366 + }, + { + "epoch": 1.7302465792138668, + "grad_norm": 0.6443730592727661, + "learning_rate": 9.200307403838327e-07, + "loss": 0.1948, + "step": 18367 + }, + { + "epoch": 1.7303407833070348, + "grad_norm": 0.6195492744445801, + "learning_rate": 9.193981813117669e-07, + "loss": 0.2017, + "step": 18368 + }, + { + "epoch": 1.7304349874002025, + "grad_norm": 0.6574223041534424, + "learning_rate": 9.187658292914647e-07, + "loss": 0.1864, + "step": 18369 + }, + { + "epoch": 1.7305291914933703, + "grad_norm": 0.6537573337554932, + "learning_rate": 9.181336843373456e-07, + "loss": 0.1752, + "step": 18370 + }, + { + "epoch": 1.7306233955865382, + "grad_norm": 0.6777458786964417, + "learning_rate": 9.175017464638258e-07, + "loss": 0.1912, + "step": 18371 + }, + { + "epoch": 1.7307175996797062, + "grad_norm": 0.6786985397338867, + "learning_rate": 9.168700156853106e-07, + "loss": 0.192, + "step": 18372 + }, + { + "epoch": 1.730811803772874, + "grad_norm": 0.7072835564613342, + "learning_rate": 9.16238492016206e-07, + "loss": 0.2029, + "step": 18373 + }, + { + "epoch": 1.7309060078660417, + "grad_norm": 0.6088541746139526, + "learning_rate": 9.15607175470915e-07, + "loss": 0.1665, + "step": 18374 + }, + { + "epoch": 1.7310002119592096, + "grad_norm": 0.6614930033683777, + "learning_rate": 9.149760660638285e-07, + "loss": 0.185, + "step": 18375 + }, + { + "epoch": 1.7310944160523776, + "grad_norm": 0.6818583011627197, + "learning_rate": 9.143451638093348e-07, + "loss": 0.1911, + "step": 18376 + }, + { + "epoch": 1.7311886201455453, + "grad_norm": 0.5903543829917908, + "learning_rate": 9.137144687218269e-07, + "loss": 0.1596, + "step": 18377 + }, + { + "epoch": 1.731282824238713, + "grad_norm": 0.7084120512008667, + "learning_rate": 9.130839808156799e-07, + "loss": 0.1981, + "step": 18378 + }, + { + "epoch": 1.731377028331881, + "grad_norm": 0.6994187831878662, + "learning_rate": 9.124537001052692e-07, + "loss": 0.1805, + "step": 18379 + }, + { + "epoch": 1.731471232425049, + "grad_norm": 0.6367105841636658, + "learning_rate": 9.118236266049707e-07, + "loss": 0.1806, + "step": 18380 + }, + { + "epoch": 1.7315654365182167, + "grad_norm": 0.7205740809440613, + "learning_rate": 9.111937603291499e-07, + "loss": 0.2346, + "step": 18381 + }, + { + "epoch": 1.7316596406113844, + "grad_norm": 0.6373416781425476, + "learning_rate": 9.10564101292164e-07, + "loss": 0.1923, + "step": 18382 + }, + { + "epoch": 1.7317538447045524, + "grad_norm": 0.6876963973045349, + "learning_rate": 9.09934649508375e-07, + "loss": 0.2063, + "step": 18383 + }, + { + "epoch": 1.7318480487977204, + "grad_norm": 0.6859636902809143, + "learning_rate": 9.093054049921357e-07, + "loss": 0.1931, + "step": 18384 + }, + { + "epoch": 1.731942252890888, + "grad_norm": 0.649297833442688, + "learning_rate": 9.086763677577903e-07, + "loss": 0.2143, + "step": 18385 + }, + { + "epoch": 1.7320364569840558, + "grad_norm": 0.680216372013092, + "learning_rate": 9.080475378196829e-07, + "loss": 0.1921, + "step": 18386 + }, + { + "epoch": 1.7321306610772238, + "grad_norm": 0.6525050401687622, + "learning_rate": 9.074189151921553e-07, + "loss": 0.1993, + "step": 18387 + }, + { + "epoch": 1.7322248651703918, + "grad_norm": 0.6437455415725708, + "learning_rate": 9.06790499889536e-07, + "loss": 0.1816, + "step": 18388 + }, + { + "epoch": 1.7323190692635595, + "grad_norm": 0.7058247327804565, + "learning_rate": 9.061622919261571e-07, + "loss": 0.1948, + "step": 18389 + }, + { + "epoch": 1.7324132733567272, + "grad_norm": 0.740924596786499, + "learning_rate": 9.055342913163434e-07, + "loss": 0.2043, + "step": 18390 + }, + { + "epoch": 1.7325074774498952, + "grad_norm": 0.6546656489372253, + "learning_rate": 9.049064980744104e-07, + "loss": 0.1964, + "step": 18391 + }, + { + "epoch": 1.7326016815430632, + "grad_norm": 0.7722698450088501, + "learning_rate": 9.042789122146755e-07, + "loss": 0.2137, + "step": 18392 + }, + { + "epoch": 1.732695885636231, + "grad_norm": 1.0166223049163818, + "learning_rate": 9.036515337514496e-07, + "loss": 0.2079, + "step": 18393 + }, + { + "epoch": 1.7327900897293986, + "grad_norm": 0.6798054575920105, + "learning_rate": 9.030243626990343e-07, + "loss": 0.2092, + "step": 18394 + }, + { + "epoch": 1.7328842938225666, + "grad_norm": 0.6436615586280823, + "learning_rate": 9.023973990717349e-07, + "loss": 0.1765, + "step": 18395 + }, + { + "epoch": 1.7329784979157346, + "grad_norm": 0.6995458006858826, + "learning_rate": 9.017706428838425e-07, + "loss": 0.1997, + "step": 18396 + }, + { + "epoch": 1.7330727020089023, + "grad_norm": 0.6393980383872986, + "learning_rate": 9.011440941496519e-07, + "loss": 0.1835, + "step": 18397 + }, + { + "epoch": 1.73316690610207, + "grad_norm": 0.7130224108695984, + "learning_rate": 9.005177528834464e-07, + "loss": 0.1693, + "step": 18398 + }, + { + "epoch": 1.733261110195238, + "grad_norm": 0.7812324166297913, + "learning_rate": 8.998916190995078e-07, + "loss": 0.2229, + "step": 18399 + }, + { + "epoch": 1.733355314288406, + "grad_norm": 0.5957273840904236, + "learning_rate": 8.992656928121158e-07, + "loss": 0.169, + "step": 18400 + }, + { + "epoch": 1.7334495183815737, + "grad_norm": 0.5725364089012146, + "learning_rate": 8.98639974035539e-07, + "loss": 0.1582, + "step": 18401 + }, + { + "epoch": 1.7335437224747414, + "grad_norm": 0.5999626517295837, + "learning_rate": 8.98014462784047e-07, + "loss": 0.1781, + "step": 18402 + }, + { + "epoch": 1.7336379265679094, + "grad_norm": 0.6020220518112183, + "learning_rate": 8.973891590719031e-07, + "loss": 0.1918, + "step": 18403 + }, + { + "epoch": 1.7337321306610773, + "grad_norm": 0.6308332085609436, + "learning_rate": 8.967640629133611e-07, + "loss": 0.1724, + "step": 18404 + }, + { + "epoch": 1.733826334754245, + "grad_norm": 0.6978043913841248, + "learning_rate": 8.9613917432268e-07, + "loss": 0.1987, + "step": 18405 + }, + { + "epoch": 1.7339205388474128, + "grad_norm": 0.6582713723182678, + "learning_rate": 8.955144933141025e-07, + "loss": 0.1973, + "step": 18406 + }, + { + "epoch": 1.7340147429405808, + "grad_norm": 0.704949676990509, + "learning_rate": 8.94890019901875e-07, + "loss": 0.1914, + "step": 18407 + }, + { + "epoch": 1.7341089470337487, + "grad_norm": 0.6758086681365967, + "learning_rate": 8.942657541002386e-07, + "loss": 0.2316, + "step": 18408 + }, + { + "epoch": 1.7342031511269165, + "grad_norm": 0.5849722027778625, + "learning_rate": 8.936416959234229e-07, + "loss": 0.186, + "step": 18409 + }, + { + "epoch": 1.7342973552200842, + "grad_norm": 0.6267807483673096, + "learning_rate": 8.930178453856597e-07, + "loss": 0.177, + "step": 18410 + }, + { + "epoch": 1.7343915593132522, + "grad_norm": 0.6762515902519226, + "learning_rate": 8.923942025011768e-07, + "loss": 0.1808, + "step": 18411 + }, + { + "epoch": 1.7344857634064201, + "grad_norm": 0.6666058897972107, + "learning_rate": 8.917707672841879e-07, + "loss": 0.2017, + "step": 18412 + }, + { + "epoch": 1.7345799674995879, + "grad_norm": 0.7240074872970581, + "learning_rate": 8.911475397489122e-07, + "loss": 0.2095, + "step": 18413 + }, + { + "epoch": 1.7346741715927556, + "grad_norm": 0.955720841884613, + "learning_rate": 8.905245199095625e-07, + "loss": 0.1926, + "step": 18414 + }, + { + "epoch": 1.7347683756859236, + "grad_norm": 0.7881523966789246, + "learning_rate": 8.899017077803406e-07, + "loss": 0.1949, + "step": 18415 + }, + { + "epoch": 1.7348625797790915, + "grad_norm": 0.6623057723045349, + "learning_rate": 8.892791033754456e-07, + "loss": 0.2009, + "step": 18416 + }, + { + "epoch": 1.7349567838722593, + "grad_norm": 0.6543082594871521, + "learning_rate": 8.886567067090812e-07, + "loss": 0.1869, + "step": 18417 + }, + { + "epoch": 1.735050987965427, + "grad_norm": 0.608604371547699, + "learning_rate": 8.880345177954341e-07, + "loss": 0.1577, + "step": 18418 + }, + { + "epoch": 1.735145192058595, + "grad_norm": 0.6755335330963135, + "learning_rate": 8.874125366486886e-07, + "loss": 0.1895, + "step": 18419 + }, + { + "epoch": 1.735239396151763, + "grad_norm": 0.6955905556678772, + "learning_rate": 8.867907632830341e-07, + "loss": 0.1924, + "step": 18420 + }, + { + "epoch": 1.7353336002449307, + "grad_norm": 0.6710397005081177, + "learning_rate": 8.86169197712643e-07, + "loss": 0.1898, + "step": 18421 + }, + { + "epoch": 1.7354278043380984, + "grad_norm": 0.7040006518363953, + "learning_rate": 8.855478399516881e-07, + "loss": 0.2304, + "step": 18422 + }, + { + "epoch": 1.7355220084312664, + "grad_norm": 0.6744738221168518, + "learning_rate": 8.849266900143383e-07, + "loss": 0.1962, + "step": 18423 + }, + { + "epoch": 1.7356162125244343, + "grad_norm": 0.6585153937339783, + "learning_rate": 8.843057479147576e-07, + "loss": 0.1639, + "step": 18424 + }, + { + "epoch": 1.735710416617602, + "grad_norm": 0.6540251970291138, + "learning_rate": 8.836850136671027e-07, + "loss": 0.1934, + "step": 18425 + }, + { + "epoch": 1.7358046207107698, + "grad_norm": 0.705668568611145, + "learning_rate": 8.830644872855276e-07, + "loss": 0.2126, + "step": 18426 + }, + { + "epoch": 1.7358988248039378, + "grad_norm": 0.6492533683776855, + "learning_rate": 8.824441687841834e-07, + "loss": 0.1717, + "step": 18427 + }, + { + "epoch": 1.7359930288971057, + "grad_norm": 0.631546139717102, + "learning_rate": 8.81824058177212e-07, + "loss": 0.2, + "step": 18428 + }, + { + "epoch": 1.7360872329902735, + "grad_norm": 0.6409475207328796, + "learning_rate": 8.812041554787521e-07, + "loss": 0.2104, + "step": 18429 + }, + { + "epoch": 1.7361814370834412, + "grad_norm": 0.6669663786888123, + "learning_rate": 8.805844607029435e-07, + "loss": 0.1864, + "step": 18430 + }, + { + "epoch": 1.7362756411766092, + "grad_norm": 0.6519623398780823, + "learning_rate": 8.799649738639094e-07, + "loss": 0.1992, + "step": 18431 + }, + { + "epoch": 1.7363698452697771, + "grad_norm": 0.6021997928619385, + "learning_rate": 8.793456949757784e-07, + "loss": 0.1898, + "step": 18432 + }, + { + "epoch": 1.7364640493629449, + "grad_norm": 0.7147196531295776, + "learning_rate": 8.787266240526738e-07, + "loss": 0.2135, + "step": 18433 + }, + { + "epoch": 1.7365582534561126, + "grad_norm": 0.7053173780441284, + "learning_rate": 8.781077611087075e-07, + "loss": 0.2086, + "step": 18434 + }, + { + "epoch": 1.7366524575492805, + "grad_norm": 0.681104838848114, + "learning_rate": 8.774891061579904e-07, + "loss": 0.2365, + "step": 18435 + }, + { + "epoch": 1.7367466616424485, + "grad_norm": 0.640718400478363, + "learning_rate": 8.768706592146293e-07, + "loss": 0.1864, + "step": 18436 + }, + { + "epoch": 1.7368408657356162, + "grad_norm": 0.609404444694519, + "learning_rate": 8.762524202927281e-07, + "loss": 0.1594, + "step": 18437 + }, + { + "epoch": 1.736935069828784, + "grad_norm": 0.684023380279541, + "learning_rate": 8.756343894063801e-07, + "loss": 0.2187, + "step": 18438 + }, + { + "epoch": 1.737029273921952, + "grad_norm": 0.6988298892974854, + "learning_rate": 8.750165665696797e-07, + "loss": 0.1995, + "step": 18439 + }, + { + "epoch": 1.73712347801512, + "grad_norm": 0.7350949048995972, + "learning_rate": 8.743989517967155e-07, + "loss": 0.1903, + "step": 18440 + }, + { + "epoch": 1.7372176821082876, + "grad_norm": 0.5716772079467773, + "learning_rate": 8.737815451015663e-07, + "loss": 0.1826, + "step": 18441 + }, + { + "epoch": 1.7373118862014554, + "grad_norm": 0.9348627328872681, + "learning_rate": 8.731643464983109e-07, + "loss": 0.1691, + "step": 18442 + }, + { + "epoch": 1.7374060902946233, + "grad_norm": 0.7201164364814758, + "learning_rate": 8.725473560010256e-07, + "loss": 0.2413, + "step": 18443 + }, + { + "epoch": 1.7375002943877913, + "grad_norm": 0.7446562647819519, + "learning_rate": 8.719305736237749e-07, + "loss": 0.1959, + "step": 18444 + }, + { + "epoch": 1.737594498480959, + "grad_norm": 0.6912760734558105, + "learning_rate": 8.713139993806263e-07, + "loss": 0.1982, + "step": 18445 + }, + { + "epoch": 1.7376887025741268, + "grad_norm": 0.6774753928184509, + "learning_rate": 8.706976332856331e-07, + "loss": 0.1638, + "step": 18446 + }, + { + "epoch": 1.7377829066672947, + "grad_norm": 0.6914879083633423, + "learning_rate": 8.700814753528541e-07, + "loss": 0.1588, + "step": 18447 + }, + { + "epoch": 1.7378771107604627, + "grad_norm": 0.63276606798172, + "learning_rate": 8.69465525596338e-07, + "loss": 0.1755, + "step": 18448 + }, + { + "epoch": 1.7379713148536304, + "grad_norm": 0.6125465631484985, + "learning_rate": 8.688497840301269e-07, + "loss": 0.1786, + "step": 18449 + }, + { + "epoch": 1.7380655189467982, + "grad_norm": 1.0072364807128906, + "learning_rate": 8.682342506682629e-07, + "loss": 0.1974, + "step": 18450 + }, + { + "epoch": 1.7381597230399661, + "grad_norm": 0.6694815158843994, + "learning_rate": 8.676189255247814e-07, + "loss": 0.1786, + "step": 18451 + }, + { + "epoch": 1.738253927133134, + "grad_norm": 0.705101728439331, + "learning_rate": 8.670038086137111e-07, + "loss": 0.2359, + "step": 18452 + }, + { + "epoch": 1.7383481312263018, + "grad_norm": 0.6535754203796387, + "learning_rate": 8.663888999490777e-07, + "loss": 0.2002, + "step": 18453 + }, + { + "epoch": 1.7384423353194696, + "grad_norm": 0.7289561629295349, + "learning_rate": 8.657741995449043e-07, + "loss": 0.1815, + "step": 18454 + }, + { + "epoch": 1.7385365394126375, + "grad_norm": 0.6739023327827454, + "learning_rate": 8.651597074152063e-07, + "loss": 0.182, + "step": 18455 + }, + { + "epoch": 1.7386307435058055, + "grad_norm": 0.6043060421943665, + "learning_rate": 8.645454235739903e-07, + "loss": 0.1956, + "step": 18456 + }, + { + "epoch": 1.7387249475989732, + "grad_norm": 0.659800112247467, + "learning_rate": 8.639313480352707e-07, + "loss": 0.1681, + "step": 18457 + }, + { + "epoch": 1.738819151692141, + "grad_norm": 0.6852256059646606, + "learning_rate": 8.633174808130452e-07, + "loss": 0.2143, + "step": 18458 + }, + { + "epoch": 1.738913355785309, + "grad_norm": 0.6718716025352478, + "learning_rate": 8.627038219213102e-07, + "loss": 0.2027, + "step": 18459 + }, + { + "epoch": 1.7390075598784769, + "grad_norm": 0.662808895111084, + "learning_rate": 8.620903713740581e-07, + "loss": 0.1907, + "step": 18460 + }, + { + "epoch": 1.7391017639716446, + "grad_norm": 0.6268913745880127, + "learning_rate": 8.614771291852797e-07, + "loss": 0.1957, + "step": 18461 + }, + { + "epoch": 1.7391959680648124, + "grad_norm": 0.9465031027793884, + "learning_rate": 8.60864095368954e-07, + "loss": 0.2138, + "step": 18462 + }, + { + "epoch": 1.7392901721579803, + "grad_norm": 0.6139705777168274, + "learning_rate": 8.602512699390619e-07, + "loss": 0.1908, + "step": 18463 + }, + { + "epoch": 1.7393843762511483, + "grad_norm": 0.7043342590332031, + "learning_rate": 8.596386529095768e-07, + "loss": 0.1746, + "step": 18464 + }, + { + "epoch": 1.7394785803443158, + "grad_norm": 0.6834427118301392, + "learning_rate": 8.590262442944641e-07, + "loss": 0.1912, + "step": 18465 + }, + { + "epoch": 1.7395727844374838, + "grad_norm": 0.6388500332832336, + "learning_rate": 8.584140441076894e-07, + "loss": 0.1793, + "step": 18466 + }, + { + "epoch": 1.7396669885306517, + "grad_norm": 0.5999411940574646, + "learning_rate": 8.578020523632147e-07, + "loss": 0.158, + "step": 18467 + }, + { + "epoch": 1.7397611926238195, + "grad_norm": 0.6816447377204895, + "learning_rate": 8.5719026907499e-07, + "loss": 0.2044, + "step": 18468 + }, + { + "epoch": 1.7398553967169872, + "grad_norm": 0.6604149341583252, + "learning_rate": 8.565786942569677e-07, + "loss": 0.1973, + "step": 18469 + }, + { + "epoch": 1.7399496008101552, + "grad_norm": 0.7390600442886353, + "learning_rate": 8.559673279230929e-07, + "loss": 0.2278, + "step": 18470 + }, + { + "epoch": 1.740043804903323, + "grad_norm": 0.6293715238571167, + "learning_rate": 8.553561700873026e-07, + "loss": 0.2093, + "step": 18471 + }, + { + "epoch": 1.7401380089964908, + "grad_norm": 0.7075802087783813, + "learning_rate": 8.547452207635332e-07, + "loss": 0.1985, + "step": 18472 + }, + { + "epoch": 1.7402322130896586, + "grad_norm": 0.7181007266044617, + "learning_rate": 8.541344799657192e-07, + "loss": 0.2036, + "step": 18473 + }, + { + "epoch": 1.7403264171828265, + "grad_norm": 0.7272784113883972, + "learning_rate": 8.535239477077827e-07, + "loss": 0.2118, + "step": 18474 + }, + { + "epoch": 1.7404206212759945, + "grad_norm": 0.6046308279037476, + "learning_rate": 8.529136240036439e-07, + "loss": 0.1628, + "step": 18475 + }, + { + "epoch": 1.7405148253691622, + "grad_norm": 0.6928007006645203, + "learning_rate": 8.523035088672215e-07, + "loss": 0.174, + "step": 18476 + }, + { + "epoch": 1.74060902946233, + "grad_norm": 0.66817706823349, + "learning_rate": 8.516936023124267e-07, + "loss": 0.2126, + "step": 18477 + }, + { + "epoch": 1.740703233555498, + "grad_norm": 0.6573317646980286, + "learning_rate": 8.510839043531649e-07, + "loss": 0.2082, + "step": 18478 + }, + { + "epoch": 1.740797437648666, + "grad_norm": 0.674961268901825, + "learning_rate": 8.504744150033395e-07, + "loss": 0.184, + "step": 18479 + }, + { + "epoch": 1.7408916417418336, + "grad_norm": 0.6107397675514221, + "learning_rate": 8.498651342768482e-07, + "loss": 0.1752, + "step": 18480 + }, + { + "epoch": 1.7409858458350014, + "grad_norm": 0.7066890597343445, + "learning_rate": 8.492560621875823e-07, + "loss": 0.1866, + "step": 18481 + }, + { + "epoch": 1.7410800499281693, + "grad_norm": 0.6572678685188293, + "learning_rate": 8.486471987494294e-07, + "loss": 0.1799, + "step": 18482 + }, + { + "epoch": 1.7411742540213373, + "grad_norm": 0.6278140544891357, + "learning_rate": 8.480385439762751e-07, + "loss": 0.1752, + "step": 18483 + }, + { + "epoch": 1.741268458114505, + "grad_norm": 0.7136510014533997, + "learning_rate": 8.474300978819939e-07, + "loss": 0.195, + "step": 18484 + }, + { + "epoch": 1.7413626622076728, + "grad_norm": 0.6753626465797424, + "learning_rate": 8.468218604804624e-07, + "loss": 0.1834, + "step": 18485 + }, + { + "epoch": 1.7414568663008407, + "grad_norm": 0.687878429889679, + "learning_rate": 8.462138317855473e-07, + "loss": 0.1936, + "step": 18486 + }, + { + "epoch": 1.7415510703940087, + "grad_norm": 0.6868911981582642, + "learning_rate": 8.456060118111131e-07, + "loss": 0.1889, + "step": 18487 + }, + { + "epoch": 1.7416452744871764, + "grad_norm": 0.6118377447128296, + "learning_rate": 8.44998400571021e-07, + "loss": 0.1729, + "step": 18488 + }, + { + "epoch": 1.7417394785803442, + "grad_norm": 0.7410522699356079, + "learning_rate": 8.443909980791221e-07, + "loss": 0.1922, + "step": 18489 + }, + { + "epoch": 1.7418336826735121, + "grad_norm": 0.6565917730331421, + "learning_rate": 8.437838043492675e-07, + "loss": 0.1734, + "step": 18490 + }, + { + "epoch": 1.74192788676668, + "grad_norm": 0.6755452752113342, + "learning_rate": 8.431768193953049e-07, + "loss": 0.2046, + "step": 18491 + }, + { + "epoch": 1.7420220908598478, + "grad_norm": 0.6946268081665039, + "learning_rate": 8.425700432310701e-07, + "loss": 0.1965, + "step": 18492 + }, + { + "epoch": 1.7421162949530156, + "grad_norm": 0.6437829732894897, + "learning_rate": 8.419634758704009e-07, + "loss": 0.1781, + "step": 18493 + }, + { + "epoch": 1.7422104990461835, + "grad_norm": 0.7048546671867371, + "learning_rate": 8.413571173271295e-07, + "loss": 0.2046, + "step": 18494 + }, + { + "epoch": 1.7423047031393515, + "grad_norm": 0.6869180202484131, + "learning_rate": 8.407509676150794e-07, + "loss": 0.2087, + "step": 18495 + }, + { + "epoch": 1.7423989072325192, + "grad_norm": 0.6828222274780273, + "learning_rate": 8.401450267480682e-07, + "loss": 0.1926, + "step": 18496 + }, + { + "epoch": 1.742493111325687, + "grad_norm": 0.6646389365196228, + "learning_rate": 8.395392947399205e-07, + "loss": 0.2123, + "step": 18497 + }, + { + "epoch": 1.742587315418855, + "grad_norm": 0.7597160339355469, + "learning_rate": 8.389337716044443e-07, + "loss": 0.1776, + "step": 18498 + }, + { + "epoch": 1.7426815195120229, + "grad_norm": 0.835570216178894, + "learning_rate": 8.38328457355444e-07, + "loss": 0.2219, + "step": 18499 + }, + { + "epoch": 1.7427757236051906, + "grad_norm": 0.661425769329071, + "learning_rate": 8.37723352006724e-07, + "loss": 0.1839, + "step": 18500 + }, + { + "epoch": 1.7428699276983584, + "grad_norm": 0.5716953277587891, + "learning_rate": 8.371184555720824e-07, + "loss": 0.1825, + "step": 18501 + }, + { + "epoch": 1.7429641317915263, + "grad_norm": 0.6040512919425964, + "learning_rate": 8.36513768065309e-07, + "loss": 0.1688, + "step": 18502 + }, + { + "epoch": 1.7430583358846943, + "grad_norm": 0.6377480030059814, + "learning_rate": 8.35909289500193e-07, + "loss": 0.1837, + "step": 18503 + }, + { + "epoch": 1.743152539977862, + "grad_norm": 0.6187005639076233, + "learning_rate": 8.353050198905199e-07, + "loss": 0.1992, + "step": 18504 + }, + { + "epoch": 1.7432467440710298, + "grad_norm": 0.6267953515052795, + "learning_rate": 8.347009592500644e-07, + "loss": 0.153, + "step": 18505 + }, + { + "epoch": 1.7433409481641977, + "grad_norm": 0.6637890934944153, + "learning_rate": 8.340971075926007e-07, + "loss": 0.2132, + "step": 18506 + }, + { + "epoch": 1.7434351522573657, + "grad_norm": 0.6383355259895325, + "learning_rate": 8.334934649319004e-07, + "loss": 0.1964, + "step": 18507 + }, + { + "epoch": 1.7435293563505334, + "grad_norm": 0.6890792846679688, + "learning_rate": 8.328900312817234e-07, + "loss": 0.1932, + "step": 18508 + }, + { + "epoch": 1.7436235604437011, + "grad_norm": 0.7252713441848755, + "learning_rate": 8.32286806655832e-07, + "loss": 0.197, + "step": 18509 + }, + { + "epoch": 1.743717764536869, + "grad_norm": 0.6572889685630798, + "learning_rate": 8.316837910679798e-07, + "loss": 0.1913, + "step": 18510 + }, + { + "epoch": 1.743811968630037, + "grad_norm": 0.6808494925498962, + "learning_rate": 8.310809845319156e-07, + "loss": 0.1929, + "step": 18511 + }, + { + "epoch": 1.7439061727232048, + "grad_norm": 0.6998562812805176, + "learning_rate": 8.304783870613841e-07, + "loss": 0.2131, + "step": 18512 + }, + { + "epoch": 1.7440003768163725, + "grad_norm": 0.6793148517608643, + "learning_rate": 8.298759986701288e-07, + "loss": 0.1951, + "step": 18513 + }, + { + "epoch": 1.7440945809095405, + "grad_norm": 0.693396806716919, + "learning_rate": 8.292738193718819e-07, + "loss": 0.1897, + "step": 18514 + }, + { + "epoch": 1.7441887850027085, + "grad_norm": 0.6428536772727966, + "learning_rate": 8.286718491803736e-07, + "loss": 0.178, + "step": 18515 + }, + { + "epoch": 1.7442829890958762, + "grad_norm": 0.6498225927352905, + "learning_rate": 8.280700881093306e-07, + "loss": 0.2167, + "step": 18516 + }, + { + "epoch": 1.744377193189044, + "grad_norm": 0.7418729662895203, + "learning_rate": 8.274685361724755e-07, + "loss": 0.2288, + "step": 18517 + }, + { + "epoch": 1.744471397282212, + "grad_norm": 0.6538975834846497, + "learning_rate": 8.268671933835226e-07, + "loss": 0.1811, + "step": 18518 + }, + { + "epoch": 1.7445656013753799, + "grad_norm": 0.5824927687644958, + "learning_rate": 8.262660597561833e-07, + "loss": 0.2025, + "step": 18519 + }, + { + "epoch": 1.7446598054685476, + "grad_norm": 0.7858545184135437, + "learning_rate": 8.256651353041678e-07, + "loss": 0.2035, + "step": 18520 + }, + { + "epoch": 1.7447540095617153, + "grad_norm": 0.6475818753242493, + "learning_rate": 8.250644200411739e-07, + "loss": 0.1631, + "step": 18521 + }, + { + "epoch": 1.7448482136548833, + "grad_norm": 0.6249920129776001, + "learning_rate": 8.244639139808997e-07, + "loss": 0.1771, + "step": 18522 + }, + { + "epoch": 1.7449424177480513, + "grad_norm": 0.738472580909729, + "learning_rate": 8.238636171370406e-07, + "loss": 0.1844, + "step": 18523 + }, + { + "epoch": 1.745036621841219, + "grad_norm": 0.7863634824752808, + "learning_rate": 8.232635295232805e-07, + "loss": 0.1848, + "step": 18524 + }, + { + "epoch": 1.7451308259343867, + "grad_norm": 0.606253981590271, + "learning_rate": 8.226636511533059e-07, + "loss": 0.1793, + "step": 18525 + }, + { + "epoch": 1.7452250300275547, + "grad_norm": 0.7103930115699768, + "learning_rate": 8.220639820407917e-07, + "loss": 0.2121, + "step": 18526 + }, + { + "epoch": 1.7453192341207227, + "grad_norm": 0.6445847749710083, + "learning_rate": 8.214645221994122e-07, + "loss": 0.1909, + "step": 18527 + }, + { + "epoch": 1.7454134382138904, + "grad_norm": 0.6096863746643066, + "learning_rate": 8.208652716428378e-07, + "loss": 0.1886, + "step": 18528 + }, + { + "epoch": 1.7455076423070581, + "grad_norm": 0.5972059965133667, + "learning_rate": 8.202662303847298e-07, + "loss": 0.1718, + "step": 18529 + }, + { + "epoch": 1.745601846400226, + "grad_norm": 1.0268930196762085, + "learning_rate": 8.196673984387482e-07, + "loss": 0.2109, + "step": 18530 + }, + { + "epoch": 1.745696050493394, + "grad_norm": 0.6883505582809448, + "learning_rate": 8.1906877581855e-07, + "loss": 0.2144, + "step": 18531 + }, + { + "epoch": 1.7457902545865618, + "grad_norm": 0.6555224657058716, + "learning_rate": 8.184703625377799e-07, + "loss": 0.2136, + "step": 18532 + }, + { + "epoch": 1.7458844586797295, + "grad_norm": 0.6272273659706116, + "learning_rate": 8.178721586100846e-07, + "loss": 0.1943, + "step": 18533 + }, + { + "epoch": 1.7459786627728975, + "grad_norm": 0.6227652430534363, + "learning_rate": 8.172741640491066e-07, + "loss": 0.1994, + "step": 18534 + }, + { + "epoch": 1.7460728668660654, + "grad_norm": 0.6621069312095642, + "learning_rate": 8.166763788684795e-07, + "loss": 0.2011, + "step": 18535 + }, + { + "epoch": 1.7461670709592332, + "grad_norm": 0.6356806755065918, + "learning_rate": 8.1607880308183e-07, + "loss": 0.1858, + "step": 18536 + }, + { + "epoch": 1.746261275052401, + "grad_norm": 0.6337907910346985, + "learning_rate": 8.154814367027897e-07, + "loss": 0.1949, + "step": 18537 + }, + { + "epoch": 1.7463554791455689, + "grad_norm": 0.7005070447921753, + "learning_rate": 8.148842797449774e-07, + "loss": 0.2107, + "step": 18538 + }, + { + "epoch": 1.7464496832387368, + "grad_norm": 0.6095236539840698, + "learning_rate": 8.142873322220057e-07, + "loss": 0.1855, + "step": 18539 + }, + { + "epoch": 1.7465438873319046, + "grad_norm": 0.5911375880241394, + "learning_rate": 8.136905941474904e-07, + "loss": 0.1703, + "step": 18540 + }, + { + "epoch": 1.7466380914250723, + "grad_norm": 0.8228804469108582, + "learning_rate": 8.130940655350372e-07, + "loss": 0.2276, + "step": 18541 + }, + { + "epoch": 1.7467322955182403, + "grad_norm": 0.636143684387207, + "learning_rate": 8.124977463982453e-07, + "loss": 0.1966, + "step": 18542 + }, + { + "epoch": 1.7468264996114082, + "grad_norm": 0.6936721205711365, + "learning_rate": 8.119016367507138e-07, + "loss": 0.1829, + "step": 18543 + }, + { + "epoch": 1.746920703704576, + "grad_norm": 0.707241952419281, + "learning_rate": 8.113057366060362e-07, + "loss": 0.191, + "step": 18544 + }, + { + "epoch": 1.7470149077977437, + "grad_norm": 0.7110002636909485, + "learning_rate": 8.107100459777972e-07, + "loss": 0.1824, + "step": 18545 + }, + { + "epoch": 1.7471091118909117, + "grad_norm": 0.6631872057914734, + "learning_rate": 8.101145648795805e-07, + "loss": 0.1857, + "step": 18546 + }, + { + "epoch": 1.7472033159840796, + "grad_norm": 0.6365458369255066, + "learning_rate": 8.095192933249652e-07, + "loss": 0.2156, + "step": 18547 + }, + { + "epoch": 1.7472975200772474, + "grad_norm": 0.7477833032608032, + "learning_rate": 8.089242313275226e-07, + "loss": 0.1915, + "step": 18548 + }, + { + "epoch": 1.747391724170415, + "grad_norm": 0.728469967842102, + "learning_rate": 8.083293789008218e-07, + "loss": 0.168, + "step": 18549 + }, + { + "epoch": 1.747485928263583, + "grad_norm": 0.6091916561126709, + "learning_rate": 8.077347360584275e-07, + "loss": 0.1767, + "step": 18550 + }, + { + "epoch": 1.747580132356751, + "grad_norm": 0.6055939197540283, + "learning_rate": 8.071403028138969e-07, + "loss": 0.1793, + "step": 18551 + }, + { + "epoch": 1.7476743364499188, + "grad_norm": 0.6508654952049255, + "learning_rate": 8.065460791807822e-07, + "loss": 0.1848, + "step": 18552 + }, + { + "epoch": 1.7477685405430865, + "grad_norm": 0.604834794998169, + "learning_rate": 8.059520651726371e-07, + "loss": 0.1698, + "step": 18553 + }, + { + "epoch": 1.7478627446362545, + "grad_norm": 0.6898211240768433, + "learning_rate": 8.053582608030041e-07, + "loss": 0.1983, + "step": 18554 + }, + { + "epoch": 1.7479569487294224, + "grad_norm": 0.5613889694213867, + "learning_rate": 8.047646660854213e-07, + "loss": 0.185, + "step": 18555 + }, + { + "epoch": 1.7480511528225902, + "grad_norm": 0.6882143020629883, + "learning_rate": 8.041712810334245e-07, + "loss": 0.1821, + "step": 18556 + }, + { + "epoch": 1.748145356915758, + "grad_norm": 0.6715866327285767, + "learning_rate": 8.035781056605463e-07, + "loss": 0.1931, + "step": 18557 + }, + { + "epoch": 1.7482395610089259, + "grad_norm": 0.6297674775123596, + "learning_rate": 8.029851399803068e-07, + "loss": 0.1757, + "step": 18558 + }, + { + "epoch": 1.7483337651020938, + "grad_norm": 0.6968896389007568, + "learning_rate": 8.023923840062309e-07, + "loss": 0.1816, + "step": 18559 + }, + { + "epoch": 1.7484279691952616, + "grad_norm": 0.6847019195556641, + "learning_rate": 8.017998377518343e-07, + "loss": 0.1755, + "step": 18560 + }, + { + "epoch": 1.7485221732884293, + "grad_norm": 0.6652508974075317, + "learning_rate": 8.012075012306253e-07, + "loss": 0.1884, + "step": 18561 + }, + { + "epoch": 1.7486163773815973, + "grad_norm": 0.7000791430473328, + "learning_rate": 8.006153744561107e-07, + "loss": 0.1869, + "step": 18562 + }, + { + "epoch": 1.7487105814747652, + "grad_norm": 0.6731979846954346, + "learning_rate": 8.000234574417954e-07, + "loss": 0.1896, + "step": 18563 + }, + { + "epoch": 1.748804785567933, + "grad_norm": 0.6780688166618347, + "learning_rate": 7.994317502011705e-07, + "loss": 0.1945, + "step": 18564 + }, + { + "epoch": 1.7488989896611007, + "grad_norm": 0.6443861722946167, + "learning_rate": 7.988402527477335e-07, + "loss": 0.1815, + "step": 18565 + }, + { + "epoch": 1.7489931937542686, + "grad_norm": 0.6712549924850464, + "learning_rate": 7.982489650949654e-07, + "loss": 0.2053, + "step": 18566 + }, + { + "epoch": 1.7490873978474366, + "grad_norm": 0.6732940077781677, + "learning_rate": 7.976578872563534e-07, + "loss": 0.2093, + "step": 18567 + }, + { + "epoch": 1.7491816019406043, + "grad_norm": 0.6036709547042847, + "learning_rate": 7.970670192453733e-07, + "loss": 0.1926, + "step": 18568 + }, + { + "epoch": 1.749275806033772, + "grad_norm": 0.8076704740524292, + "learning_rate": 7.964763610754978e-07, + "loss": 0.1927, + "step": 18569 + }, + { + "epoch": 1.74937001012694, + "grad_norm": 0.831139326095581, + "learning_rate": 7.958859127601937e-07, + "loss": 0.1764, + "step": 18570 + }, + { + "epoch": 1.749464214220108, + "grad_norm": 0.9794051051139832, + "learning_rate": 7.95295674312927e-07, + "loss": 0.1869, + "step": 18571 + }, + { + "epoch": 1.7495584183132757, + "grad_norm": 0.6633066534996033, + "learning_rate": 7.947056457471524e-07, + "loss": 0.1751, + "step": 18572 + }, + { + "epoch": 1.7496526224064435, + "grad_norm": 0.6702530980110168, + "learning_rate": 7.941158270763261e-07, + "loss": 0.2074, + "step": 18573 + }, + { + "epoch": 1.7497468264996114, + "grad_norm": 0.6720075011253357, + "learning_rate": 7.93526218313897e-07, + "loss": 0.1741, + "step": 18574 + }, + { + "epoch": 1.7498410305927794, + "grad_norm": 0.6878265738487244, + "learning_rate": 7.929368194733089e-07, + "loss": 0.2038, + "step": 18575 + }, + { + "epoch": 1.7499352346859471, + "grad_norm": 0.6586441397666931, + "learning_rate": 7.923476305679977e-07, + "loss": 0.1867, + "step": 18576 + }, + { + "epoch": 1.7500294387791149, + "grad_norm": 0.8241437077522278, + "learning_rate": 7.917586516114007e-07, + "loss": 0.1837, + "step": 18577 + }, + { + "epoch": 1.7501236428722828, + "grad_norm": 0.6873905062675476, + "learning_rate": 7.911698826169501e-07, + "loss": 0.195, + "step": 18578 + }, + { + "epoch": 1.7502178469654508, + "grad_norm": 0.5969401597976685, + "learning_rate": 7.905813235980653e-07, + "loss": 0.1788, + "step": 18579 + }, + { + "epoch": 1.7503120510586185, + "grad_norm": 0.8423927426338196, + "learning_rate": 7.899929745681689e-07, + "loss": 0.2348, + "step": 18580 + }, + { + "epoch": 1.7504062551517863, + "grad_norm": 0.6558899283409119, + "learning_rate": 7.89404835540678e-07, + "loss": 0.2, + "step": 18581 + }, + { + "epoch": 1.7505004592449542, + "grad_norm": 0.6926812529563904, + "learning_rate": 7.888169065289997e-07, + "loss": 0.2119, + "step": 18582 + }, + { + "epoch": 1.7505946633381222, + "grad_norm": 0.6233304738998413, + "learning_rate": 7.882291875465408e-07, + "loss": 0.1665, + "step": 18583 + }, + { + "epoch": 1.75068886743129, + "grad_norm": 0.6532228589057922, + "learning_rate": 7.876416786067053e-07, + "loss": 0.2214, + "step": 18584 + }, + { + "epoch": 1.7507830715244577, + "grad_norm": 0.7344880700111389, + "learning_rate": 7.870543797228847e-07, + "loss": 0.2136, + "step": 18585 + }, + { + "epoch": 1.7508772756176256, + "grad_norm": 0.6469510197639465, + "learning_rate": 7.864672909084714e-07, + "loss": 0.1973, + "step": 18586 + }, + { + "epoch": 1.7509714797107936, + "grad_norm": 0.5995925068855286, + "learning_rate": 7.85880412176856e-07, + "loss": 0.1886, + "step": 18587 + }, + { + "epoch": 1.7510656838039613, + "grad_norm": 0.6568569540977478, + "learning_rate": 7.852937435414143e-07, + "loss": 0.1921, + "step": 18588 + }, + { + "epoch": 1.751159887897129, + "grad_norm": 0.6559253931045532, + "learning_rate": 7.847072850155268e-07, + "loss": 0.1909, + "step": 18589 + }, + { + "epoch": 1.751254091990297, + "grad_norm": 0.6460072994232178, + "learning_rate": 7.841210366125662e-07, + "loss": 0.1973, + "step": 18590 + }, + { + "epoch": 1.751348296083465, + "grad_norm": 0.7013749480247498, + "learning_rate": 7.835349983458996e-07, + "loss": 0.2206, + "step": 18591 + }, + { + "epoch": 1.7514425001766327, + "grad_norm": 0.6947945356369019, + "learning_rate": 7.829491702288839e-07, + "loss": 0.2032, + "step": 18592 + }, + { + "epoch": 1.7515367042698005, + "grad_norm": 0.6729248762130737, + "learning_rate": 7.823635522748851e-07, + "loss": 0.222, + "step": 18593 + }, + { + "epoch": 1.7516309083629684, + "grad_norm": 0.7111519575119019, + "learning_rate": 7.817781444972528e-07, + "loss": 0.2069, + "step": 18594 + }, + { + "epoch": 1.7517251124561364, + "grad_norm": 0.6821576952934265, + "learning_rate": 7.811929469093338e-07, + "loss": 0.21, + "step": 18595 + }, + { + "epoch": 1.7518193165493041, + "grad_norm": 0.6710083484649658, + "learning_rate": 7.806079595244731e-07, + "loss": 0.1811, + "step": 18596 + }, + { + "epoch": 1.7519135206424719, + "grad_norm": 0.6114735007286072, + "learning_rate": 7.8002318235601e-07, + "loss": 0.1854, + "step": 18597 + }, + { + "epoch": 1.7520077247356398, + "grad_norm": 0.759047269821167, + "learning_rate": 7.79438615417275e-07, + "loss": 0.2022, + "step": 18598 + }, + { + "epoch": 1.7521019288288078, + "grad_norm": 0.6617277264595032, + "learning_rate": 7.788542587216008e-07, + "loss": 0.1865, + "step": 18599 + }, + { + "epoch": 1.7521961329219755, + "grad_norm": 0.6306414008140564, + "learning_rate": 7.782701122823111e-07, + "loss": 0.2017, + "step": 18600 + }, + { + "epoch": 1.7522903370151433, + "grad_norm": 0.7386486530303955, + "learning_rate": 7.776861761127231e-07, + "loss": 0.1761, + "step": 18601 + }, + { + "epoch": 1.7523845411083112, + "grad_norm": 0.6602556109428406, + "learning_rate": 7.771024502261526e-07, + "loss": 0.2193, + "step": 18602 + }, + { + "epoch": 1.752478745201479, + "grad_norm": 0.7331061363220215, + "learning_rate": 7.765189346359114e-07, + "loss": 0.2026, + "step": 18603 + }, + { + "epoch": 1.7525729492946467, + "grad_norm": 0.7011824250221252, + "learning_rate": 7.759356293553011e-07, + "loss": 0.1867, + "step": 18604 + }, + { + "epoch": 1.7526671533878146, + "grad_norm": 0.8126789927482605, + "learning_rate": 7.753525343976265e-07, + "loss": 0.1952, + "step": 18605 + }, + { + "epoch": 1.7527613574809826, + "grad_norm": 0.6619715094566345, + "learning_rate": 7.747696497761781e-07, + "loss": 0.1993, + "step": 18606 + }, + { + "epoch": 1.7528555615741503, + "grad_norm": 0.6641839146614075, + "learning_rate": 7.741869755042486e-07, + "loss": 0.2053, + "step": 18607 + }, + { + "epoch": 1.752949765667318, + "grad_norm": 0.7459710836410522, + "learning_rate": 7.736045115951252e-07, + "loss": 0.2031, + "step": 18608 + }, + { + "epoch": 1.753043969760486, + "grad_norm": 0.6663244366645813, + "learning_rate": 7.73022258062086e-07, + "loss": 0.1704, + "step": 18609 + }, + { + "epoch": 1.753138173853654, + "grad_norm": 0.6225571036338806, + "learning_rate": 7.724402149184107e-07, + "loss": 0.1868, + "step": 18610 + }, + { + "epoch": 1.7532323779468217, + "grad_norm": 0.572202205657959, + "learning_rate": 7.718583821773695e-07, + "loss": 0.1762, + "step": 18611 + }, + { + "epoch": 1.7533265820399895, + "grad_norm": 0.6788207292556763, + "learning_rate": 7.712767598522275e-07, + "loss": 0.184, + "step": 18612 + }, + { + "epoch": 1.7534207861331574, + "grad_norm": 0.6373607516288757, + "learning_rate": 7.706953479562473e-07, + "loss": 0.1987, + "step": 18613 + }, + { + "epoch": 1.7535149902263254, + "grad_norm": 0.6726533770561218, + "learning_rate": 7.701141465026896e-07, + "loss": 0.1733, + "step": 18614 + }, + { + "epoch": 1.7536091943194931, + "grad_norm": 0.762245237827301, + "learning_rate": 7.695331555048024e-07, + "loss": 0.2021, + "step": 18615 + }, + { + "epoch": 1.7537033984126609, + "grad_norm": 0.6375834941864014, + "learning_rate": 7.689523749758332e-07, + "loss": 0.1786, + "step": 18616 + }, + { + "epoch": 1.7537976025058288, + "grad_norm": 0.6688347458839417, + "learning_rate": 7.683718049290267e-07, + "loss": 0.2117, + "step": 18617 + }, + { + "epoch": 1.7538918065989968, + "grad_norm": 0.6413112878799438, + "learning_rate": 7.677914453776203e-07, + "loss": 0.1768, + "step": 18618 + }, + { + "epoch": 1.7539860106921645, + "grad_norm": 0.5684576034545898, + "learning_rate": 7.672112963348466e-07, + "loss": 0.1615, + "step": 18619 + }, + { + "epoch": 1.7540802147853323, + "grad_norm": 0.6422943472862244, + "learning_rate": 7.666313578139328e-07, + "loss": 0.1704, + "step": 18620 + }, + { + "epoch": 1.7541744188785002, + "grad_norm": 0.6456118226051331, + "learning_rate": 7.660516298281062e-07, + "loss": 0.1861, + "step": 18621 + }, + { + "epoch": 1.7542686229716682, + "grad_norm": 0.6674378514289856, + "learning_rate": 7.654721123905806e-07, + "loss": 0.203, + "step": 18622 + }, + { + "epoch": 1.754362827064836, + "grad_norm": 0.6868914365768433, + "learning_rate": 7.648928055145733e-07, + "loss": 0.2268, + "step": 18623 + }, + { + "epoch": 1.7544570311580037, + "grad_norm": 0.7327835559844971, + "learning_rate": 7.643137092132935e-07, + "loss": 0.203, + "step": 18624 + }, + { + "epoch": 1.7545512352511716, + "grad_norm": 0.7301385998725891, + "learning_rate": 7.637348234999431e-07, + "loss": 0.2093, + "step": 18625 + }, + { + "epoch": 1.7546454393443396, + "grad_norm": 0.6672301888465881, + "learning_rate": 7.631561483877226e-07, + "loss": 0.1895, + "step": 18626 + }, + { + "epoch": 1.7547396434375073, + "grad_norm": 0.7064858675003052, + "learning_rate": 7.62577683889828e-07, + "loss": 0.1912, + "step": 18627 + }, + { + "epoch": 1.754833847530675, + "grad_norm": 0.6245591044425964, + "learning_rate": 7.619994300194478e-07, + "loss": 0.1825, + "step": 18628 + }, + { + "epoch": 1.754928051623843, + "grad_norm": 0.6343905925750732, + "learning_rate": 7.614213867897668e-07, + "loss": 0.2124, + "step": 18629 + }, + { + "epoch": 1.755022255717011, + "grad_norm": 0.646212100982666, + "learning_rate": 7.60843554213967e-07, + "loss": 0.1878, + "step": 18630 + }, + { + "epoch": 1.7551164598101787, + "grad_norm": 0.6522904634475708, + "learning_rate": 7.602659323052231e-07, + "loss": 0.1724, + "step": 18631 + }, + { + "epoch": 1.7552106639033465, + "grad_norm": 0.6088020205497742, + "learning_rate": 7.596885210767024e-07, + "loss": 0.1728, + "step": 18632 + }, + { + "epoch": 1.7553048679965144, + "grad_norm": 0.6309208869934082, + "learning_rate": 7.591113205415779e-07, + "loss": 0.1772, + "step": 18633 + }, + { + "epoch": 1.7553990720896824, + "grad_norm": 0.6297697424888611, + "learning_rate": 7.585343307130055e-07, + "loss": 0.176, + "step": 18634 + }, + { + "epoch": 1.7554932761828501, + "grad_norm": 0.5904176235198975, + "learning_rate": 7.579575516041415e-07, + "loss": 0.1702, + "step": 18635 + }, + { + "epoch": 1.7555874802760179, + "grad_norm": 0.610162079334259, + "learning_rate": 7.573809832281376e-07, + "loss": 0.1641, + "step": 18636 + }, + { + "epoch": 1.7556816843691858, + "grad_norm": 0.662539541721344, + "learning_rate": 7.568046255981432e-07, + "loss": 0.1811, + "step": 18637 + }, + { + "epoch": 1.7557758884623538, + "grad_norm": 0.6001244187355042, + "learning_rate": 7.562284787272978e-07, + "loss": 0.1746, + "step": 18638 + }, + { + "epoch": 1.7558700925555215, + "grad_norm": 0.6731216907501221, + "learning_rate": 7.556525426287376e-07, + "loss": 0.2095, + "step": 18639 + }, + { + "epoch": 1.7559642966486892, + "grad_norm": 0.6648710370063782, + "learning_rate": 7.550768173155975e-07, + "loss": 0.1788, + "step": 18640 + }, + { + "epoch": 1.7560585007418572, + "grad_norm": 0.6438722610473633, + "learning_rate": 7.545013028010028e-07, + "loss": 0.1609, + "step": 18641 + }, + { + "epoch": 1.7561527048350252, + "grad_norm": 0.7448234558105469, + "learning_rate": 7.539259990980763e-07, + "loss": 0.2226, + "step": 18642 + }, + { + "epoch": 1.756246908928193, + "grad_norm": 0.6625121831893921, + "learning_rate": 7.533509062199384e-07, + "loss": 0.1822, + "step": 18643 + }, + { + "epoch": 1.7563411130213606, + "grad_norm": 0.8268623352050781, + "learning_rate": 7.527760241796978e-07, + "loss": 0.2109, + "step": 18644 + }, + { + "epoch": 1.7564353171145286, + "grad_norm": 0.677430272102356, + "learning_rate": 7.522013529904671e-07, + "loss": 0.2259, + "step": 18645 + }, + { + "epoch": 1.7565295212076966, + "grad_norm": 0.6224123239517212, + "learning_rate": 7.516268926653458e-07, + "loss": 0.1739, + "step": 18646 + }, + { + "epoch": 1.7566237253008643, + "grad_norm": 0.6237516403198242, + "learning_rate": 7.510526432174336e-07, + "loss": 0.1794, + "step": 18647 + }, + { + "epoch": 1.756717929394032, + "grad_norm": 0.611696183681488, + "learning_rate": 7.504786046598278e-07, + "loss": 0.1806, + "step": 18648 + }, + { + "epoch": 1.7568121334872, + "grad_norm": 0.7928467988967896, + "learning_rate": 7.499047770056123e-07, + "loss": 0.2075, + "step": 18649 + }, + { + "epoch": 1.756906337580368, + "grad_norm": 0.6605842709541321, + "learning_rate": 7.493311602678732e-07, + "loss": 0.2053, + "step": 18650 + }, + { + "epoch": 1.7570005416735357, + "grad_norm": 0.6715100407600403, + "learning_rate": 7.487577544596913e-07, + "loss": 0.19, + "step": 18651 + }, + { + "epoch": 1.7570947457667034, + "grad_norm": 0.741716742515564, + "learning_rate": 7.481845595941383e-07, + "loss": 0.2028, + "step": 18652 + }, + { + "epoch": 1.7571889498598714, + "grad_norm": 0.6929696798324585, + "learning_rate": 7.476115756842861e-07, + "loss": 0.2194, + "step": 18653 + }, + { + "epoch": 1.7572831539530394, + "grad_norm": 0.6631684303283691, + "learning_rate": 7.470388027432007e-07, + "loss": 0.2077, + "step": 18654 + }, + { + "epoch": 1.757377358046207, + "grad_norm": 0.6561174392700195, + "learning_rate": 7.464662407839408e-07, + "loss": 0.1861, + "step": 18655 + }, + { + "epoch": 1.7574715621393748, + "grad_norm": 0.6470728516578674, + "learning_rate": 7.458938898195601e-07, + "loss": 0.1884, + "step": 18656 + }, + { + "epoch": 1.7575657662325428, + "grad_norm": 0.6355688571929932, + "learning_rate": 7.453217498631093e-07, + "loss": 0.2058, + "step": 18657 + }, + { + "epoch": 1.7576599703257108, + "grad_norm": 0.7344101667404175, + "learning_rate": 7.447498209276382e-07, + "loss": 0.1986, + "step": 18658 + }, + { + "epoch": 1.7577541744188785, + "grad_norm": 0.6645457148551941, + "learning_rate": 7.441781030261819e-07, + "loss": 0.1789, + "step": 18659 + }, + { + "epoch": 1.7578483785120462, + "grad_norm": 0.6591945886611938, + "learning_rate": 7.436065961717797e-07, + "loss": 0.2197, + "step": 18660 + }, + { + "epoch": 1.7579425826052142, + "grad_norm": 0.6482409238815308, + "learning_rate": 7.430353003774638e-07, + "loss": 0.1952, + "step": 18661 + }, + { + "epoch": 1.7580367866983821, + "grad_norm": 0.7033007144927979, + "learning_rate": 7.424642156562578e-07, + "loss": 0.1828, + "step": 18662 + }, + { + "epoch": 1.7581309907915499, + "grad_norm": 0.6775185465812683, + "learning_rate": 7.418933420211849e-07, + "loss": 0.2068, + "step": 18663 + }, + { + "epoch": 1.7582251948847176, + "grad_norm": 0.6388871669769287, + "learning_rate": 7.413226794852635e-07, + "loss": 0.1761, + "step": 18664 + }, + { + "epoch": 1.7583193989778856, + "grad_norm": 0.626383900642395, + "learning_rate": 7.40752228061502e-07, + "loss": 0.187, + "step": 18665 + }, + { + "epoch": 1.7584136030710535, + "grad_norm": 0.6561257243156433, + "learning_rate": 7.40181987762909e-07, + "loss": 0.1919, + "step": 18666 + }, + { + "epoch": 1.7585078071642213, + "grad_norm": 0.6641181111335754, + "learning_rate": 7.396119586024897e-07, + "loss": 0.2131, + "step": 18667 + }, + { + "epoch": 1.758602011257389, + "grad_norm": 0.5611432790756226, + "learning_rate": 7.39042140593238e-07, + "loss": 0.1754, + "step": 18668 + }, + { + "epoch": 1.758696215350557, + "grad_norm": 0.6419809460639954, + "learning_rate": 7.384725337481458e-07, + "loss": 0.2138, + "step": 18669 + }, + { + "epoch": 1.758790419443725, + "grad_norm": 0.6619876623153687, + "learning_rate": 7.37903138080206e-07, + "loss": 0.2042, + "step": 18670 + }, + { + "epoch": 1.7588846235368927, + "grad_norm": 0.6841108202934265, + "learning_rate": 7.373339536023982e-07, + "loss": 0.1991, + "step": 18671 + }, + { + "epoch": 1.7589788276300604, + "grad_norm": 0.6848726272583008, + "learning_rate": 7.367649803276988e-07, + "loss": 0.1862, + "step": 18672 + }, + { + "epoch": 1.7590730317232284, + "grad_norm": 0.6747719645500183, + "learning_rate": 7.361962182690863e-07, + "loss": 0.1852, + "step": 18673 + }, + { + "epoch": 1.7591672358163963, + "grad_norm": 0.6555435657501221, + "learning_rate": 7.356276674395269e-07, + "loss": 0.1659, + "step": 18674 + }, + { + "epoch": 1.759261439909564, + "grad_norm": 0.7199277281761169, + "learning_rate": 7.350593278519824e-07, + "loss": 0.2054, + "step": 18675 + }, + { + "epoch": 1.7593556440027318, + "grad_norm": 0.6618119478225708, + "learning_rate": 7.344911995194149e-07, + "loss": 0.1674, + "step": 18676 + }, + { + "epoch": 1.7594498480958998, + "grad_norm": 0.7032886147499084, + "learning_rate": 7.339232824547782e-07, + "loss": 0.2016, + "step": 18677 + }, + { + "epoch": 1.7595440521890677, + "grad_norm": 0.692939281463623, + "learning_rate": 7.333555766710188e-07, + "loss": 0.2168, + "step": 18678 + }, + { + "epoch": 1.7596382562822355, + "grad_norm": 0.6232210397720337, + "learning_rate": 7.32788082181084e-07, + "loss": 0.1994, + "step": 18679 + }, + { + "epoch": 1.7597324603754032, + "grad_norm": 0.6903019547462463, + "learning_rate": 7.322207989979146e-07, + "loss": 0.2051, + "step": 18680 + }, + { + "epoch": 1.7598266644685712, + "grad_norm": 0.8852872252464294, + "learning_rate": 7.316537271344426e-07, + "loss": 0.1994, + "step": 18681 + }, + { + "epoch": 1.7599208685617391, + "grad_norm": 0.638813316822052, + "learning_rate": 7.310868666035986e-07, + "loss": 0.1615, + "step": 18682 + }, + { + "epoch": 1.7600150726549069, + "grad_norm": 0.6534923911094666, + "learning_rate": 7.305202174183112e-07, + "loss": 0.1899, + "step": 18683 + }, + { + "epoch": 1.7601092767480746, + "grad_norm": 0.7521393299102783, + "learning_rate": 7.299537795914958e-07, + "loss": 0.2198, + "step": 18684 + }, + { + "epoch": 1.7602034808412426, + "grad_norm": 0.6901976466178894, + "learning_rate": 7.293875531360728e-07, + "loss": 0.1765, + "step": 18685 + }, + { + "epoch": 1.7602976849344105, + "grad_norm": 0.6153367161750793, + "learning_rate": 7.2882153806495e-07, + "loss": 0.1736, + "step": 18686 + }, + { + "epoch": 1.7603918890275783, + "grad_norm": 0.7112525701522827, + "learning_rate": 7.282557343910335e-07, + "loss": 0.2419, + "step": 18687 + }, + { + "epoch": 1.760486093120746, + "grad_norm": 0.6666744947433472, + "learning_rate": 7.276901421272264e-07, + "loss": 0.1875, + "step": 18688 + }, + { + "epoch": 1.760580297213914, + "grad_norm": 0.5716570019721985, + "learning_rate": 7.27124761286423e-07, + "loss": 0.1697, + "step": 18689 + }, + { + "epoch": 1.760674501307082, + "grad_norm": 0.6441033482551575, + "learning_rate": 7.265595918815149e-07, + "loss": 0.1911, + "step": 18690 + }, + { + "epoch": 1.7607687054002497, + "grad_norm": 0.7376765608787537, + "learning_rate": 7.25994633925392e-07, + "loss": 0.1789, + "step": 18691 + }, + { + "epoch": 1.7608629094934174, + "grad_norm": 0.5842069387435913, + "learning_rate": 7.254298874309328e-07, + "loss": 0.1747, + "step": 18692 + }, + { + "epoch": 1.7609571135865854, + "grad_norm": 0.7579582333564758, + "learning_rate": 7.248653524110172e-07, + "loss": 0.2121, + "step": 18693 + }, + { + "epoch": 1.7610513176797533, + "grad_norm": 0.6068270802497864, + "learning_rate": 7.243010288785135e-07, + "loss": 0.1719, + "step": 18694 + }, + { + "epoch": 1.761145521772921, + "grad_norm": 0.6542767882347107, + "learning_rate": 7.237369168462937e-07, + "loss": 0.1911, + "step": 18695 + }, + { + "epoch": 1.7612397258660888, + "grad_norm": 0.6807528734207153, + "learning_rate": 7.231730163272166e-07, + "loss": 0.1853, + "step": 18696 + }, + { + "epoch": 1.7613339299592567, + "grad_norm": 0.6296195387840271, + "learning_rate": 7.226093273341406e-07, + "loss": 0.1906, + "step": 18697 + }, + { + "epoch": 1.7614281340524247, + "grad_norm": 0.6691365242004395, + "learning_rate": 7.220458498799221e-07, + "loss": 0.1941, + "step": 18698 + }, + { + "epoch": 1.7615223381455924, + "grad_norm": 0.6610446572303772, + "learning_rate": 7.214825839774053e-07, + "loss": 0.1996, + "step": 18699 + }, + { + "epoch": 1.7616165422387602, + "grad_norm": 0.8709343671798706, + "learning_rate": 7.209195296394356e-07, + "loss": 0.2024, + "step": 18700 + }, + { + "epoch": 1.7617107463319281, + "grad_norm": 0.7031115293502808, + "learning_rate": 7.203566868788514e-07, + "loss": 0.1927, + "step": 18701 + }, + { + "epoch": 1.761804950425096, + "grad_norm": 0.662255585193634, + "learning_rate": 7.197940557084848e-07, + "loss": 0.2048, + "step": 18702 + }, + { + "epoch": 1.7618991545182638, + "grad_norm": 0.7295622825622559, + "learning_rate": 7.192316361411666e-07, + "loss": 0.2029, + "step": 18703 + }, + { + "epoch": 1.7619933586114316, + "grad_norm": 0.5789169073104858, + "learning_rate": 7.18669428189721e-07, + "loss": 0.159, + "step": 18704 + }, + { + "epoch": 1.7620875627045995, + "grad_norm": 0.627537190914154, + "learning_rate": 7.181074318669645e-07, + "loss": 0.1804, + "step": 18705 + }, + { + "epoch": 1.7621817667977675, + "grad_norm": 0.6640955209732056, + "learning_rate": 7.175456471857134e-07, + "loss": 0.1793, + "step": 18706 + }, + { + "epoch": 1.7622759708909352, + "grad_norm": 0.6473426818847656, + "learning_rate": 7.169840741587797e-07, + "loss": 0.2012, + "step": 18707 + }, + { + "epoch": 1.762370174984103, + "grad_norm": 0.6217076182365417, + "learning_rate": 7.164227127989643e-07, + "loss": 0.1752, + "step": 18708 + }, + { + "epoch": 1.762464379077271, + "grad_norm": 0.6201004385948181, + "learning_rate": 7.158615631190657e-07, + "loss": 0.1701, + "step": 18709 + }, + { + "epoch": 1.762558583170439, + "grad_norm": 0.6264468431472778, + "learning_rate": 7.15300625131885e-07, + "loss": 0.1692, + "step": 18710 + }, + { + "epoch": 1.7626527872636066, + "grad_norm": 0.6493064165115356, + "learning_rate": 7.147398988502086e-07, + "loss": 0.1864, + "step": 18711 + }, + { + "epoch": 1.7627469913567744, + "grad_norm": 0.8641001582145691, + "learning_rate": 7.141793842868194e-07, + "loss": 0.1627, + "step": 18712 + }, + { + "epoch": 1.7628411954499423, + "grad_norm": 0.6677326560020447, + "learning_rate": 7.136190814545052e-07, + "loss": 0.2123, + "step": 18713 + }, + { + "epoch": 1.7629353995431103, + "grad_norm": 0.5769965648651123, + "learning_rate": 7.130589903660368e-07, + "loss": 0.1812, + "step": 18714 + }, + { + "epoch": 1.763029603636278, + "grad_norm": 0.663173258304596, + "learning_rate": 7.124991110341839e-07, + "loss": 0.1851, + "step": 18715 + }, + { + "epoch": 1.7631238077294458, + "grad_norm": 0.6828328371047974, + "learning_rate": 7.119394434717152e-07, + "loss": 0.2102, + "step": 18716 + }, + { + "epoch": 1.7632180118226137, + "grad_norm": 0.7221760749816895, + "learning_rate": 7.11379987691393e-07, + "loss": 0.1936, + "step": 18717 + }, + { + "epoch": 1.7633122159157817, + "grad_norm": 0.6994985342025757, + "learning_rate": 7.10820743705971e-07, + "loss": 0.2159, + "step": 18718 + }, + { + "epoch": 1.7634064200089494, + "grad_norm": 0.663030743598938, + "learning_rate": 7.102617115282018e-07, + "loss": 0.1965, + "step": 18719 + }, + { + "epoch": 1.7635006241021172, + "grad_norm": 0.6177852749824524, + "learning_rate": 7.097028911708337e-07, + "loss": 0.1795, + "step": 18720 + }, + { + "epoch": 1.7635948281952851, + "grad_norm": 0.6149967312812805, + "learning_rate": 7.091442826466055e-07, + "loss": 0.2113, + "step": 18721 + }, + { + "epoch": 1.763689032288453, + "grad_norm": 0.6708802580833435, + "learning_rate": 7.08585885968257e-07, + "loss": 0.2216, + "step": 18722 + }, + { + "epoch": 1.7637832363816208, + "grad_norm": 0.7028321623802185, + "learning_rate": 7.080277011485204e-07, + "loss": 0.1943, + "step": 18723 + }, + { + "epoch": 1.7638774404747886, + "grad_norm": 0.6543398499488831, + "learning_rate": 7.074697282001219e-07, + "loss": 0.2065, + "step": 18724 + }, + { + "epoch": 1.7639716445679565, + "grad_norm": 0.6498114466667175, + "learning_rate": 7.06911967135786e-07, + "loss": 0.2022, + "step": 18725 + }, + { + "epoch": 1.7640658486611245, + "grad_norm": 0.6824131011962891, + "learning_rate": 7.06354417968228e-07, + "loss": 0.1957, + "step": 18726 + }, + { + "epoch": 1.7641600527542922, + "grad_norm": 0.5951253771781921, + "learning_rate": 7.057970807101621e-07, + "loss": 0.1615, + "step": 18727 + }, + { + "epoch": 1.76425425684746, + "grad_norm": 0.6944735050201416, + "learning_rate": 7.052399553742972e-07, + "loss": 0.1955, + "step": 18728 + }, + { + "epoch": 1.764348460940628, + "grad_norm": 0.684751570224762, + "learning_rate": 7.046830419733353e-07, + "loss": 0.1782, + "step": 18729 + }, + { + "epoch": 1.7644426650337959, + "grad_norm": 0.7559695839881897, + "learning_rate": 7.04126340519975e-07, + "loss": 0.2014, + "step": 18730 + }, + { + "epoch": 1.7645368691269636, + "grad_norm": 0.5982239246368408, + "learning_rate": 7.035698510269129e-07, + "loss": 0.1676, + "step": 18731 + }, + { + "epoch": 1.7646310732201314, + "grad_norm": 0.6184801459312439, + "learning_rate": 7.030135735068333e-07, + "loss": 0.1653, + "step": 18732 + }, + { + "epoch": 1.7647252773132993, + "grad_norm": 0.6522563099861145, + "learning_rate": 7.024575079724239e-07, + "loss": 0.188, + "step": 18733 + }, + { + "epoch": 1.7648194814064673, + "grad_norm": 0.6570615768432617, + "learning_rate": 7.019016544363599e-07, + "loss": 0.1918, + "step": 18734 + }, + { + "epoch": 1.764913685499635, + "grad_norm": 0.661007821559906, + "learning_rate": 7.013460129113203e-07, + "loss": 0.1744, + "step": 18735 + }, + { + "epoch": 1.7650078895928027, + "grad_norm": 0.679739773273468, + "learning_rate": 7.007905834099715e-07, + "loss": 0.1797, + "step": 18736 + }, + { + "epoch": 1.7651020936859707, + "grad_norm": 0.7122992873191833, + "learning_rate": 7.002353659449779e-07, + "loss": 0.1935, + "step": 18737 + }, + { + "epoch": 1.7651962977791387, + "grad_norm": 0.6649571657180786, + "learning_rate": 6.996803605290015e-07, + "loss": 0.2127, + "step": 18738 + }, + { + "epoch": 1.7652905018723064, + "grad_norm": 0.6347363591194153, + "learning_rate": 6.991255671746955e-07, + "loss": 0.1867, + "step": 18739 + }, + { + "epoch": 1.7653847059654741, + "grad_norm": 0.6770117878913879, + "learning_rate": 6.985709858947099e-07, + "loss": 0.1727, + "step": 18740 + }, + { + "epoch": 1.765478910058642, + "grad_norm": 0.722261369228363, + "learning_rate": 6.980166167016922e-07, + "loss": 0.2064, + "step": 18741 + }, + { + "epoch": 1.7655731141518098, + "grad_norm": 0.6550199389457703, + "learning_rate": 6.974624596082802e-07, + "loss": 0.1881, + "step": 18742 + }, + { + "epoch": 1.7656673182449776, + "grad_norm": 0.6317087411880493, + "learning_rate": 6.969085146271116e-07, + "loss": 0.1966, + "step": 18743 + }, + { + "epoch": 1.7657615223381455, + "grad_norm": 0.6629270315170288, + "learning_rate": 6.963547817708171e-07, + "loss": 0.2014, + "step": 18744 + }, + { + "epoch": 1.7658557264313135, + "grad_norm": 0.6230559349060059, + "learning_rate": 6.958012610520215e-07, + "loss": 0.1863, + "step": 18745 + }, + { + "epoch": 1.7659499305244812, + "grad_norm": 0.618025004863739, + "learning_rate": 6.952479524833444e-07, + "loss": 0.1758, + "step": 18746 + }, + { + "epoch": 1.766044134617649, + "grad_norm": 0.6751699447631836, + "learning_rate": 6.946948560774059e-07, + "loss": 0.2253, + "step": 18747 + }, + { + "epoch": 1.766138338710817, + "grad_norm": 0.6590555310249329, + "learning_rate": 6.941419718468168e-07, + "loss": 0.2017, + "step": 18748 + }, + { + "epoch": 1.766232542803985, + "grad_norm": 0.6804190874099731, + "learning_rate": 6.935892998041782e-07, + "loss": 0.1949, + "step": 18749 + }, + { + "epoch": 1.7663267468971526, + "grad_norm": 0.6995640397071838, + "learning_rate": 6.930368399621001e-07, + "loss": 0.2027, + "step": 18750 + }, + { + "epoch": 1.7664209509903204, + "grad_norm": 0.6769449710845947, + "learning_rate": 6.924845923331758e-07, + "loss": 0.2004, + "step": 18751 + }, + { + "epoch": 1.7665151550834883, + "grad_norm": 0.6506038904190063, + "learning_rate": 6.919325569299939e-07, + "loss": 0.176, + "step": 18752 + }, + { + "epoch": 1.7666093591766563, + "grad_norm": 0.6735564470291138, + "learning_rate": 6.913807337651479e-07, + "loss": 0.2084, + "step": 18753 + }, + { + "epoch": 1.766703563269824, + "grad_norm": 2.2290399074554443, + "learning_rate": 6.908291228512165e-07, + "loss": 0.1817, + "step": 18754 + }, + { + "epoch": 1.7667977673629918, + "grad_norm": 0.7361657023429871, + "learning_rate": 6.902777242007775e-07, + "loss": 0.2233, + "step": 18755 + }, + { + "epoch": 1.7668919714561597, + "grad_norm": 0.7112666368484497, + "learning_rate": 6.897265378264039e-07, + "loss": 0.2168, + "step": 18756 + }, + { + "epoch": 1.7669861755493277, + "grad_norm": 1.0875725746154785, + "learning_rate": 6.89175563740665e-07, + "loss": 0.2086, + "step": 18757 + }, + { + "epoch": 1.7670803796424954, + "grad_norm": 0.736074686050415, + "learning_rate": 6.886248019561215e-07, + "loss": 0.1983, + "step": 18758 + }, + { + "epoch": 1.7671745837356632, + "grad_norm": 0.6421671509742737, + "learning_rate": 6.880742524853323e-07, + "loss": 0.2058, + "step": 18759 + }, + { + "epoch": 1.7672687878288311, + "grad_norm": 0.8324503898620605, + "learning_rate": 6.875239153408541e-07, + "loss": 0.1933, + "step": 18760 + }, + { + "epoch": 1.767362991921999, + "grad_norm": 0.6461583375930786, + "learning_rate": 6.869737905352303e-07, + "loss": 0.2001, + "step": 18761 + }, + { + "epoch": 1.7674571960151668, + "grad_norm": 0.6334694623947144, + "learning_rate": 6.864238780810062e-07, + "loss": 0.2183, + "step": 18762 + }, + { + "epoch": 1.7675514001083346, + "grad_norm": 0.6211568713188171, + "learning_rate": 6.85874177990724e-07, + "loss": 0.1882, + "step": 18763 + }, + { + "epoch": 1.7676456042015025, + "grad_norm": 0.7000827193260193, + "learning_rate": 6.853246902769129e-07, + "loss": 0.1908, + "step": 18764 + }, + { + "epoch": 1.7677398082946705, + "grad_norm": 0.6585289835929871, + "learning_rate": 6.847754149521069e-07, + "loss": 0.202, + "step": 18765 + }, + { + "epoch": 1.7678340123878382, + "grad_norm": 0.6150330901145935, + "learning_rate": 6.84226352028825e-07, + "loss": 0.1888, + "step": 18766 + }, + { + "epoch": 1.767928216481006, + "grad_norm": 0.8438021540641785, + "learning_rate": 6.836775015195895e-07, + "loss": 0.1794, + "step": 18767 + }, + { + "epoch": 1.768022420574174, + "grad_norm": 0.6473048329353333, + "learning_rate": 6.831288634369171e-07, + "loss": 0.2077, + "step": 18768 + }, + { + "epoch": 1.7681166246673419, + "grad_norm": 0.6201759576797485, + "learning_rate": 6.825804377933142e-07, + "loss": 0.1901, + "step": 18769 + }, + { + "epoch": 1.7682108287605096, + "grad_norm": 0.6857398152351379, + "learning_rate": 6.820322246012856e-07, + "loss": 0.2226, + "step": 18770 + }, + { + "epoch": 1.7683050328536773, + "grad_norm": 0.661655068397522, + "learning_rate": 6.814842238733354e-07, + "loss": 0.2176, + "step": 18771 + }, + { + "epoch": 1.7683992369468453, + "grad_norm": 0.67503422498703, + "learning_rate": 6.80936435621955e-07, + "loss": 0.1948, + "step": 18772 + }, + { + "epoch": 1.7684934410400133, + "grad_norm": 0.6564956903457642, + "learning_rate": 6.803888598596364e-07, + "loss": 0.1822, + "step": 18773 + }, + { + "epoch": 1.768587645133181, + "grad_norm": 0.693681001663208, + "learning_rate": 6.798414965988643e-07, + "loss": 0.1914, + "step": 18774 + }, + { + "epoch": 1.7686818492263487, + "grad_norm": 0.6001759171485901, + "learning_rate": 6.792943458521206e-07, + "loss": 0.192, + "step": 18775 + }, + { + "epoch": 1.7687760533195167, + "grad_norm": 0.6142124533653259, + "learning_rate": 6.787474076318801e-07, + "loss": 0.1932, + "step": 18776 + }, + { + "epoch": 1.7688702574126847, + "grad_norm": 0.6608695983886719, + "learning_rate": 6.782006819506137e-07, + "loss": 0.169, + "step": 18777 + }, + { + "epoch": 1.7689644615058524, + "grad_norm": 1.1633796691894531, + "learning_rate": 6.776541688207905e-07, + "loss": 0.1945, + "step": 18778 + }, + { + "epoch": 1.7690586655990201, + "grad_norm": 0.6649760007858276, + "learning_rate": 6.77107868254867e-07, + "loss": 0.1766, + "step": 18779 + }, + { + "epoch": 1.769152869692188, + "grad_norm": 0.6783766150474548, + "learning_rate": 6.765617802653024e-07, + "loss": 0.1836, + "step": 18780 + }, + { + "epoch": 1.769247073785356, + "grad_norm": 0.701332151889801, + "learning_rate": 6.760159048645499e-07, + "loss": 0.1972, + "step": 18781 + }, + { + "epoch": 1.7693412778785238, + "grad_norm": 0.6640893816947937, + "learning_rate": 6.754702420650527e-07, + "loss": 0.1672, + "step": 18782 + }, + { + "epoch": 1.7694354819716915, + "grad_norm": 0.7054677605628967, + "learning_rate": 6.749247918792556e-07, + "loss": 0.1914, + "step": 18783 + }, + { + "epoch": 1.7695296860648595, + "grad_norm": 0.6651020050048828, + "learning_rate": 6.743795543195953e-07, + "loss": 0.1852, + "step": 18784 + }, + { + "epoch": 1.7696238901580275, + "grad_norm": 0.7026653289794922, + "learning_rate": 6.738345293985038e-07, + "loss": 0.1886, + "step": 18785 + }, + { + "epoch": 1.7697180942511952, + "grad_norm": 0.6611322164535522, + "learning_rate": 6.732897171284059e-07, + "loss": 0.1805, + "step": 18786 + }, + { + "epoch": 1.769812298344363, + "grad_norm": 0.6668574810028076, + "learning_rate": 6.727451175217282e-07, + "loss": 0.1885, + "step": 18787 + }, + { + "epoch": 1.769906502437531, + "grad_norm": 0.6399896144866943, + "learning_rate": 6.722007305908873e-07, + "loss": 0.1729, + "step": 18788 + }, + { + "epoch": 1.7700007065306989, + "grad_norm": 0.6232643127441406, + "learning_rate": 6.716565563482925e-07, + "loss": 0.186, + "step": 18789 + }, + { + "epoch": 1.7700949106238666, + "grad_norm": 0.6482470035552979, + "learning_rate": 6.71112594806358e-07, + "loss": 0.1938, + "step": 18790 + }, + { + "epoch": 1.7701891147170343, + "grad_norm": 0.6848767399787903, + "learning_rate": 6.705688459774839e-07, + "loss": 0.1835, + "step": 18791 + }, + { + "epoch": 1.7702833188102023, + "grad_norm": 0.8514116406440735, + "learning_rate": 6.70025309874065e-07, + "loss": 0.1972, + "step": 18792 + }, + { + "epoch": 1.7703775229033702, + "grad_norm": 0.6496803760528564, + "learning_rate": 6.694819865085012e-07, + "loss": 0.1751, + "step": 18793 + }, + { + "epoch": 1.770471726996538, + "grad_norm": 0.699759304523468, + "learning_rate": 6.689388758931781e-07, + "loss": 0.1821, + "step": 18794 + }, + { + "epoch": 1.7705659310897057, + "grad_norm": 0.6508598327636719, + "learning_rate": 6.68395978040477e-07, + "loss": 0.1952, + "step": 18795 + }, + { + "epoch": 1.7706601351828737, + "grad_norm": 0.6039749979972839, + "learning_rate": 6.678532929627801e-07, + "loss": 0.1792, + "step": 18796 + }, + { + "epoch": 1.7707543392760416, + "grad_norm": 0.6619493365287781, + "learning_rate": 6.673108206724621e-07, + "loss": 0.1999, + "step": 18797 + }, + { + "epoch": 1.7708485433692094, + "grad_norm": 0.5428429841995239, + "learning_rate": 6.667685611818886e-07, + "loss": 0.1648, + "step": 18798 + }, + { + "epoch": 1.7709427474623771, + "grad_norm": 0.6618406772613525, + "learning_rate": 6.662265145034263e-07, + "loss": 0.1957, + "step": 18799 + }, + { + "epoch": 1.771036951555545, + "grad_norm": 0.6611621975898743, + "learning_rate": 6.656846806494366e-07, + "loss": 0.2116, + "step": 18800 + }, + { + "epoch": 1.771131155648713, + "grad_norm": 0.6667648553848267, + "learning_rate": 6.651430596322695e-07, + "loss": 0.2, + "step": 18801 + }, + { + "epoch": 1.7712253597418808, + "grad_norm": 0.6235458254814148, + "learning_rate": 6.646016514642784e-07, + "loss": 0.1848, + "step": 18802 + }, + { + "epoch": 1.7713195638350485, + "grad_norm": 0.6254004836082458, + "learning_rate": 6.640604561578079e-07, + "loss": 0.1878, + "step": 18803 + }, + { + "epoch": 1.7714137679282165, + "grad_norm": 0.663865864276886, + "learning_rate": 6.635194737251959e-07, + "loss": 0.2085, + "step": 18804 + }, + { + "epoch": 1.7715079720213844, + "grad_norm": 0.6067942380905151, + "learning_rate": 6.629787041787805e-07, + "loss": 0.177, + "step": 18805 + }, + { + "epoch": 1.7716021761145522, + "grad_norm": 0.6628295183181763, + "learning_rate": 6.624381475308883e-07, + "loss": 0.2121, + "step": 18806 + }, + { + "epoch": 1.77169638020772, + "grad_norm": 0.6948346495628357, + "learning_rate": 6.618978037938484e-07, + "loss": 0.211, + "step": 18807 + }, + { + "epoch": 1.7717905843008879, + "grad_norm": 0.6519178152084351, + "learning_rate": 6.613576729799809e-07, + "loss": 0.2031, + "step": 18808 + }, + { + "epoch": 1.7718847883940558, + "grad_norm": 0.6935162544250488, + "learning_rate": 6.608177551016004e-07, + "loss": 0.1988, + "step": 18809 + }, + { + "epoch": 1.7719789924872236, + "grad_norm": 0.6876966953277588, + "learning_rate": 6.602780501710193e-07, + "loss": 0.1954, + "step": 18810 + }, + { + "epoch": 1.7720731965803913, + "grad_norm": 0.6378141641616821, + "learning_rate": 6.597385582005411e-07, + "loss": 0.1895, + "step": 18811 + }, + { + "epoch": 1.7721674006735593, + "grad_norm": 0.6201403737068176, + "learning_rate": 6.591992792024693e-07, + "loss": 0.1738, + "step": 18812 + }, + { + "epoch": 1.7722616047667272, + "grad_norm": 0.6032953858375549, + "learning_rate": 6.586602131891007e-07, + "loss": 0.2068, + "step": 18813 + }, + { + "epoch": 1.772355808859895, + "grad_norm": 0.6335887908935547, + "learning_rate": 6.581213601727243e-07, + "loss": 0.1885, + "step": 18814 + }, + { + "epoch": 1.7724500129530627, + "grad_norm": 0.6177152991294861, + "learning_rate": 6.575827201656304e-07, + "loss": 0.1835, + "step": 18815 + }, + { + "epoch": 1.7725442170462307, + "grad_norm": 0.6353725790977478, + "learning_rate": 6.570442931800969e-07, + "loss": 0.2003, + "step": 18816 + }, + { + "epoch": 1.7726384211393986, + "grad_norm": 0.6191625595092773, + "learning_rate": 6.565060792284028e-07, + "loss": 0.1756, + "step": 18817 + }, + { + "epoch": 1.7727326252325664, + "grad_norm": 0.6994245052337646, + "learning_rate": 6.559680783228217e-07, + "loss": 0.2076, + "step": 18818 + }, + { + "epoch": 1.772826829325734, + "grad_norm": 0.6597037315368652, + "learning_rate": 6.554302904756182e-07, + "loss": 0.1897, + "step": 18819 + }, + { + "epoch": 1.772921033418902, + "grad_norm": 0.6516751050949097, + "learning_rate": 6.548927156990548e-07, + "loss": 0.2202, + "step": 18820 + }, + { + "epoch": 1.77301523751207, + "grad_norm": 0.6519423127174377, + "learning_rate": 6.543553540053926e-07, + "loss": 0.21, + "step": 18821 + }, + { + "epoch": 1.7731094416052378, + "grad_norm": 0.6038287281990051, + "learning_rate": 6.538182054068787e-07, + "loss": 0.1818, + "step": 18822 + }, + { + "epoch": 1.7732036456984055, + "grad_norm": 0.6104490756988525, + "learning_rate": 6.532812699157653e-07, + "loss": 0.1644, + "step": 18823 + }, + { + "epoch": 1.7732978497915735, + "grad_norm": 0.6544206738471985, + "learning_rate": 6.52744547544295e-07, + "loss": 0.1995, + "step": 18824 + }, + { + "epoch": 1.7733920538847414, + "grad_norm": 0.6738982200622559, + "learning_rate": 6.522080383047047e-07, + "loss": 0.2134, + "step": 18825 + }, + { + "epoch": 1.7734862579779092, + "grad_norm": 0.6365258097648621, + "learning_rate": 6.516717422092245e-07, + "loss": 0.226, + "step": 18826 + }, + { + "epoch": 1.773580462071077, + "grad_norm": 0.6545007824897766, + "learning_rate": 6.51135659270089e-07, + "loss": 0.1955, + "step": 18827 + }, + { + "epoch": 1.7736746661642449, + "grad_norm": 0.8270087838172913, + "learning_rate": 6.505997894995187e-07, + "loss": 0.2051, + "step": 18828 + }, + { + "epoch": 1.7737688702574128, + "grad_norm": 0.674146294593811, + "learning_rate": 6.500641329097302e-07, + "loss": 0.202, + "step": 18829 + }, + { + "epoch": 1.7738630743505805, + "grad_norm": 0.7583029866218567, + "learning_rate": 6.495286895129415e-07, + "loss": 0.1984, + "step": 18830 + }, + { + "epoch": 1.7739572784437483, + "grad_norm": 0.6309941411018372, + "learning_rate": 6.489934593213598e-07, + "loss": 0.1569, + "step": 18831 + }, + { + "epoch": 1.7740514825369162, + "grad_norm": 0.6320933699607849, + "learning_rate": 6.484584423471852e-07, + "loss": 0.1927, + "step": 18832 + }, + { + "epoch": 1.7741456866300842, + "grad_norm": 0.6087074279785156, + "learning_rate": 6.479236386026244e-07, + "loss": 0.1782, + "step": 18833 + }, + { + "epoch": 1.774239890723252, + "grad_norm": 0.691861093044281, + "learning_rate": 6.47389048099868e-07, + "loss": 0.2143, + "step": 18834 + }, + { + "epoch": 1.7743340948164197, + "grad_norm": 0.7514015436172485, + "learning_rate": 6.468546708511037e-07, + "loss": 0.2197, + "step": 18835 + }, + { + "epoch": 1.7744282989095876, + "grad_norm": 0.6492466926574707, + "learning_rate": 6.463205068685174e-07, + "loss": 0.1603, + "step": 18836 + }, + { + "epoch": 1.7745225030027556, + "grad_norm": 0.6837599277496338, + "learning_rate": 6.457865561642907e-07, + "loss": 0.2147, + "step": 18837 + }, + { + "epoch": 1.7746167070959233, + "grad_norm": 0.6451874375343323, + "learning_rate": 6.452528187505969e-07, + "loss": 0.1651, + "step": 18838 + }, + { + "epoch": 1.774710911189091, + "grad_norm": 0.7161141037940979, + "learning_rate": 6.447192946396052e-07, + "loss": 0.2112, + "step": 18839 + }, + { + "epoch": 1.774805115282259, + "grad_norm": 0.6483317613601685, + "learning_rate": 6.441859838434827e-07, + "loss": 0.1829, + "step": 18840 + }, + { + "epoch": 1.774899319375427, + "grad_norm": 0.7365931272506714, + "learning_rate": 6.436528863743885e-07, + "loss": 0.1826, + "step": 18841 + }, + { + "epoch": 1.7749935234685947, + "grad_norm": 0.6644814610481262, + "learning_rate": 6.431200022444773e-07, + "loss": 0.1862, + "step": 18842 + }, + { + "epoch": 1.7750877275617625, + "grad_norm": 0.6445244550704956, + "learning_rate": 6.425873314659037e-07, + "loss": 0.1986, + "step": 18843 + }, + { + "epoch": 1.7751819316549304, + "grad_norm": 0.6263026595115662, + "learning_rate": 6.42054874050807e-07, + "loss": 0.2123, + "step": 18844 + }, + { + "epoch": 1.7752761357480984, + "grad_norm": 0.6531116366386414, + "learning_rate": 6.415226300113342e-07, + "loss": 0.181, + "step": 18845 + }, + { + "epoch": 1.7753703398412661, + "grad_norm": 0.6579196453094482, + "learning_rate": 6.409905993596178e-07, + "loss": 0.2073, + "step": 18846 + }, + { + "epoch": 1.7754645439344339, + "grad_norm": 0.5932646989822388, + "learning_rate": 6.404587821077879e-07, + "loss": 0.1665, + "step": 18847 + }, + { + "epoch": 1.7755587480276018, + "grad_norm": 0.6194208264350891, + "learning_rate": 6.399271782679762e-07, + "loss": 0.1814, + "step": 18848 + }, + { + "epoch": 1.7756529521207698, + "grad_norm": 0.5777454376220703, + "learning_rate": 6.393957878522972e-07, + "loss": 0.1754, + "step": 18849 + }, + { + "epoch": 1.7757471562139375, + "grad_norm": 0.7162359356880188, + "learning_rate": 6.388646108728724e-07, + "loss": 0.1936, + "step": 18850 + }, + { + "epoch": 1.7758413603071053, + "grad_norm": 0.6331944465637207, + "learning_rate": 6.383336473418111e-07, + "loss": 0.1895, + "step": 18851 + }, + { + "epoch": 1.7759355644002732, + "grad_norm": 0.652292013168335, + "learning_rate": 6.378028972712202e-07, + "loss": 0.2272, + "step": 18852 + }, + { + "epoch": 1.7760297684934412, + "grad_norm": 0.7276932001113892, + "learning_rate": 6.372723606732046e-07, + "loss": 0.1998, + "step": 18853 + }, + { + "epoch": 1.776123972586609, + "grad_norm": 0.5779160261154175, + "learning_rate": 6.367420375598554e-07, + "loss": 0.1627, + "step": 18854 + }, + { + "epoch": 1.7762181766797767, + "grad_norm": 0.6181293725967407, + "learning_rate": 6.36211927943271e-07, + "loss": 0.1708, + "step": 18855 + }, + { + "epoch": 1.7763123807729446, + "grad_norm": 0.6790673136711121, + "learning_rate": 6.35682031835535e-07, + "loss": 0.2167, + "step": 18856 + }, + { + "epoch": 1.7764065848661126, + "grad_norm": 0.6449689865112305, + "learning_rate": 6.3515234924873e-07, + "loss": 0.2031, + "step": 18857 + }, + { + "epoch": 1.7765007889592803, + "grad_norm": 0.6832816004753113, + "learning_rate": 6.346228801949372e-07, + "loss": 0.2348, + "step": 18858 + }, + { + "epoch": 1.776594993052448, + "grad_norm": 0.685299277305603, + "learning_rate": 6.340936246862239e-07, + "loss": 0.1738, + "step": 18859 + }, + { + "epoch": 1.776689197145616, + "grad_norm": 0.6643345952033997, + "learning_rate": 6.335645827346604e-07, + "loss": 0.1897, + "step": 18860 + }, + { + "epoch": 1.776783401238784, + "grad_norm": 0.6404387950897217, + "learning_rate": 6.330357543523125e-07, + "loss": 0.1921, + "step": 18861 + }, + { + "epoch": 1.7768776053319517, + "grad_norm": 0.66966313123703, + "learning_rate": 6.325071395512328e-07, + "loss": 0.1945, + "step": 18862 + }, + { + "epoch": 1.7769718094251195, + "grad_norm": 0.6696110963821411, + "learning_rate": 6.319787383434783e-07, + "loss": 0.2189, + "step": 18863 + }, + { + "epoch": 1.7770660135182874, + "grad_norm": 0.695486307144165, + "learning_rate": 6.314505507410984e-07, + "loss": 0.2251, + "step": 18864 + }, + { + "epoch": 1.7771602176114554, + "grad_norm": 0.6277366280555725, + "learning_rate": 6.309225767561345e-07, + "loss": 0.1937, + "step": 18865 + }, + { + "epoch": 1.777254421704623, + "grad_norm": 0.631859302520752, + "learning_rate": 6.303948164006212e-07, + "loss": 0.1868, + "step": 18866 + }, + { + "epoch": 1.7773486257977908, + "grad_norm": 0.7215867042541504, + "learning_rate": 6.298672696866004e-07, + "loss": 0.1877, + "step": 18867 + }, + { + "epoch": 1.7774428298909588, + "grad_norm": 0.6455090641975403, + "learning_rate": 6.293399366260977e-07, + "loss": 0.1855, + "step": 18868 + }, + { + "epoch": 1.7775370339841268, + "grad_norm": 0.6297513842582703, + "learning_rate": 6.288128172311326e-07, + "loss": 0.1885, + "step": 18869 + }, + { + "epoch": 1.7776312380772945, + "grad_norm": 0.7350378632545471, + "learning_rate": 6.282859115137308e-07, + "loss": 0.2221, + "step": 18870 + }, + { + "epoch": 1.7777254421704622, + "grad_norm": 0.62552410364151, + "learning_rate": 6.277592194859039e-07, + "loss": 0.1835, + "step": 18871 + }, + { + "epoch": 1.7778196462636302, + "grad_norm": 0.6194043159484863, + "learning_rate": 6.27232741159659e-07, + "loss": 0.1974, + "step": 18872 + }, + { + "epoch": 1.7779138503567982, + "grad_norm": 0.6858185529708862, + "learning_rate": 6.267064765470055e-07, + "loss": 0.1927, + "step": 18873 + }, + { + "epoch": 1.778008054449966, + "grad_norm": 0.627632200717926, + "learning_rate": 6.261804256599402e-07, + "loss": 0.2018, + "step": 18874 + }, + { + "epoch": 1.7781022585431336, + "grad_norm": 0.6912941336631775, + "learning_rate": 6.25654588510457e-07, + "loss": 0.1982, + "step": 18875 + }, + { + "epoch": 1.7781964626363016, + "grad_norm": 0.6505584120750427, + "learning_rate": 6.251289651105464e-07, + "loss": 0.1927, + "step": 18876 + }, + { + "epoch": 1.7782906667294696, + "grad_norm": 0.7614810466766357, + "learning_rate": 6.246035554721963e-07, + "loss": 0.2277, + "step": 18877 + }, + { + "epoch": 1.7783848708226373, + "grad_norm": 0.6901644468307495, + "learning_rate": 6.240783596073819e-07, + "loss": 0.1836, + "step": 18878 + }, + { + "epoch": 1.778479074915805, + "grad_norm": 0.6384099125862122, + "learning_rate": 6.235533775280811e-07, + "loss": 0.1885, + "step": 18879 + }, + { + "epoch": 1.778573279008973, + "grad_norm": 0.6384451389312744, + "learning_rate": 6.230286092462667e-07, + "loss": 0.1858, + "step": 18880 + }, + { + "epoch": 1.7786674831021407, + "grad_norm": 0.6098473072052002, + "learning_rate": 6.225040547738992e-07, + "loss": 0.1979, + "step": 18881 + }, + { + "epoch": 1.7787616871953085, + "grad_norm": 0.635390043258667, + "learning_rate": 6.219797141229422e-07, + "loss": 0.2048, + "step": 18882 + }, + { + "epoch": 1.7788558912884764, + "grad_norm": 0.6741024851799011, + "learning_rate": 6.21455587305353e-07, + "loss": 0.1782, + "step": 18883 + }, + { + "epoch": 1.7789500953816444, + "grad_norm": 0.7610291242599487, + "learning_rate": 6.209316743330796e-07, + "loss": 0.1835, + "step": 18884 + }, + { + "epoch": 1.7790442994748121, + "grad_norm": 0.7101349234580994, + "learning_rate": 6.204079752180692e-07, + "loss": 0.1999, + "step": 18885 + }, + { + "epoch": 1.7791385035679799, + "grad_norm": 0.641293466091156, + "learning_rate": 6.198844899722623e-07, + "loss": 0.1997, + "step": 18886 + }, + { + "epoch": 1.7792327076611478, + "grad_norm": 0.6606078147888184, + "learning_rate": 6.193612186075948e-07, + "loss": 0.2157, + "step": 18887 + }, + { + "epoch": 1.7793269117543158, + "grad_norm": 0.7069306969642639, + "learning_rate": 6.188381611360006e-07, + "loss": 0.1989, + "step": 18888 + }, + { + "epoch": 1.7794211158474835, + "grad_norm": 0.6216179132461548, + "learning_rate": 6.183153175694034e-07, + "loss": 0.2156, + "step": 18889 + }, + { + "epoch": 1.7795153199406513, + "grad_norm": 0.7265723347663879, + "learning_rate": 6.177926879197271e-07, + "loss": 0.1902, + "step": 18890 + }, + { + "epoch": 1.7796095240338192, + "grad_norm": 0.7844327092170715, + "learning_rate": 6.172702721988866e-07, + "loss": 0.2204, + "step": 18891 + }, + { + "epoch": 1.7797037281269872, + "grad_norm": 0.7905182838439941, + "learning_rate": 6.167480704187944e-07, + "loss": 0.1886, + "step": 18892 + }, + { + "epoch": 1.779797932220155, + "grad_norm": 0.692022442817688, + "learning_rate": 6.162260825913591e-07, + "loss": 0.1888, + "step": 18893 + }, + { + "epoch": 1.7798921363133227, + "grad_norm": 0.6562528014183044, + "learning_rate": 6.157043087284797e-07, + "loss": 0.2017, + "step": 18894 + }, + { + "epoch": 1.7799863404064906, + "grad_norm": 0.7171071171760559, + "learning_rate": 6.15182748842057e-07, + "loss": 0.2045, + "step": 18895 + }, + { + "epoch": 1.7800805444996586, + "grad_norm": 0.6339259743690491, + "learning_rate": 6.14661402943979e-07, + "loss": 0.1747, + "step": 18896 + }, + { + "epoch": 1.7801747485928263, + "grad_norm": 0.6818625926971436, + "learning_rate": 6.141402710461363e-07, + "loss": 0.2319, + "step": 18897 + }, + { + "epoch": 1.780268952685994, + "grad_norm": 0.6656239628791809, + "learning_rate": 6.136193531604128e-07, + "loss": 0.1867, + "step": 18898 + }, + { + "epoch": 1.780363156779162, + "grad_norm": 0.6257752180099487, + "learning_rate": 6.130986492986835e-07, + "loss": 0.1755, + "step": 18899 + }, + { + "epoch": 1.78045736087233, + "grad_norm": 0.647856593132019, + "learning_rate": 6.125781594728208e-07, + "loss": 0.1818, + "step": 18900 + }, + { + "epoch": 1.7805515649654977, + "grad_norm": 0.673330545425415, + "learning_rate": 6.120578836946956e-07, + "loss": 0.2133, + "step": 18901 + }, + { + "epoch": 1.7806457690586655, + "grad_norm": 0.6279948949813843, + "learning_rate": 6.115378219761681e-07, + "loss": 0.1814, + "step": 18902 + }, + { + "epoch": 1.7807399731518334, + "grad_norm": 0.6680616736412048, + "learning_rate": 6.110179743290979e-07, + "loss": 0.1902, + "step": 18903 + }, + { + "epoch": 1.7808341772450014, + "grad_norm": 0.6360419988632202, + "learning_rate": 6.104983407653397e-07, + "loss": 0.1885, + "step": 18904 + }, + { + "epoch": 1.780928381338169, + "grad_norm": 0.6680687069892883, + "learning_rate": 6.099789212967411e-07, + "loss": 0.204, + "step": 18905 + }, + { + "epoch": 1.7810225854313368, + "grad_norm": 0.6434610486030579, + "learning_rate": 6.094597159351412e-07, + "loss": 0.2015, + "step": 18906 + }, + { + "epoch": 1.7811167895245048, + "grad_norm": 0.670667290687561, + "learning_rate": 6.089407246923861e-07, + "loss": 0.1918, + "step": 18907 + }, + { + "epoch": 1.7812109936176728, + "grad_norm": 0.5744823217391968, + "learning_rate": 6.084219475803055e-07, + "loss": 0.1582, + "step": 18908 + }, + { + "epoch": 1.7813051977108405, + "grad_norm": 0.6056269407272339, + "learning_rate": 6.079033846107262e-07, + "loss": 0.1756, + "step": 18909 + }, + { + "epoch": 1.7813994018040082, + "grad_norm": 0.7023749351501465, + "learning_rate": 6.07385035795478e-07, + "loss": 0.1877, + "step": 18910 + }, + { + "epoch": 1.7814936058971762, + "grad_norm": 0.6520842909812927, + "learning_rate": 6.068669011463768e-07, + "loss": 0.1901, + "step": 18911 + }, + { + "epoch": 1.7815878099903442, + "grad_norm": 0.7571308016777039, + "learning_rate": 6.063489806752332e-07, + "loss": 0.2238, + "step": 18912 + }, + { + "epoch": 1.781682014083512, + "grad_norm": 0.6574953198432922, + "learning_rate": 6.058312743938643e-07, + "loss": 0.2013, + "step": 18913 + }, + { + "epoch": 1.7817762181766796, + "grad_norm": 0.670661211013794, + "learning_rate": 6.053137823140709e-07, + "loss": 0.171, + "step": 18914 + }, + { + "epoch": 1.7818704222698476, + "grad_norm": 0.668809711933136, + "learning_rate": 6.047965044476501e-07, + "loss": 0.1783, + "step": 18915 + }, + { + "epoch": 1.7819646263630156, + "grad_norm": 0.630757749080658, + "learning_rate": 6.042794408064001e-07, + "loss": 0.1935, + "step": 18916 + }, + { + "epoch": 1.7820588304561833, + "grad_norm": 0.6640926599502563, + "learning_rate": 6.037625914021106e-07, + "loss": 0.1649, + "step": 18917 + }, + { + "epoch": 1.782153034549351, + "grad_norm": 0.666729211807251, + "learning_rate": 6.032459562465631e-07, + "loss": 0.2147, + "step": 18918 + }, + { + "epoch": 1.782247238642519, + "grad_norm": 0.6913374066352844, + "learning_rate": 6.027295353515417e-07, + "loss": 0.1813, + "step": 18919 + }, + { + "epoch": 1.782341442735687, + "grad_norm": 0.7098759412765503, + "learning_rate": 6.022133287288201e-07, + "loss": 0.2107, + "step": 18920 + }, + { + "epoch": 1.7824356468288547, + "grad_norm": 1.4247428178787231, + "learning_rate": 6.016973363901679e-07, + "loss": 0.2083, + "step": 18921 + }, + { + "epoch": 1.7825298509220224, + "grad_norm": 0.6035720109939575, + "learning_rate": 6.011815583473512e-07, + "loss": 0.1628, + "step": 18922 + }, + { + "epoch": 1.7826240550151904, + "grad_norm": 0.6486483812332153, + "learning_rate": 6.006659946121318e-07, + "loss": 0.2028, + "step": 18923 + }, + { + "epoch": 1.7827182591083583, + "grad_norm": 0.6193046569824219, + "learning_rate": 6.001506451962624e-07, + "loss": 0.1745, + "step": 18924 + }, + { + "epoch": 1.782812463201526, + "grad_norm": 0.6953390836715698, + "learning_rate": 5.996355101114981e-07, + "loss": 0.2003, + "step": 18925 + }, + { + "epoch": 1.7829066672946938, + "grad_norm": 0.6082496047019958, + "learning_rate": 5.991205893695795e-07, + "loss": 0.1648, + "step": 18926 + }, + { + "epoch": 1.7830008713878618, + "grad_norm": 0.6720678210258484, + "learning_rate": 5.986058829822516e-07, + "loss": 0.1852, + "step": 18927 + }, + { + "epoch": 1.7830950754810297, + "grad_norm": 0.6750378608703613, + "learning_rate": 5.980913909612485e-07, + "loss": 0.2034, + "step": 18928 + }, + { + "epoch": 1.7831892795741975, + "grad_norm": 0.7238875031471252, + "learning_rate": 5.975771133183006e-07, + "loss": 0.2009, + "step": 18929 + }, + { + "epoch": 1.7832834836673652, + "grad_norm": 0.6554574966430664, + "learning_rate": 5.970630500651386e-07, + "loss": 0.2014, + "step": 18930 + }, + { + "epoch": 1.7833776877605332, + "grad_norm": 0.6924046874046326, + "learning_rate": 5.965492012134788e-07, + "loss": 0.2403, + "step": 18931 + }, + { + "epoch": 1.7834718918537011, + "grad_norm": 0.6402032375335693, + "learning_rate": 5.960355667750395e-07, + "loss": 0.1738, + "step": 18932 + }, + { + "epoch": 1.7835660959468689, + "grad_norm": 0.7946323752403259, + "learning_rate": 5.955221467615346e-07, + "loss": 0.1985, + "step": 18933 + }, + { + "epoch": 1.7836603000400366, + "grad_norm": 0.7671371102333069, + "learning_rate": 5.950089411846683e-07, + "loss": 0.1942, + "step": 18934 + }, + { + "epoch": 1.7837545041332046, + "grad_norm": 0.7086248993873596, + "learning_rate": 5.944959500561442e-07, + "loss": 0.2018, + "step": 18935 + }, + { + "epoch": 1.7838487082263725, + "grad_norm": 0.6174538135528564, + "learning_rate": 5.939831733876567e-07, + "loss": 0.1913, + "step": 18936 + }, + { + "epoch": 1.7839429123195403, + "grad_norm": 0.6665925979614258, + "learning_rate": 5.934706111908994e-07, + "loss": 0.1845, + "step": 18937 + }, + { + "epoch": 1.784037116412708, + "grad_norm": 0.6362730264663696, + "learning_rate": 5.929582634775611e-07, + "loss": 0.1804, + "step": 18938 + }, + { + "epoch": 1.784131320505876, + "grad_norm": 0.6679801344871521, + "learning_rate": 5.924461302593221e-07, + "loss": 0.1816, + "step": 18939 + }, + { + "epoch": 1.784225524599044, + "grad_norm": 0.8629591464996338, + "learning_rate": 5.919342115478599e-07, + "loss": 0.208, + "step": 18940 + }, + { + "epoch": 1.7843197286922117, + "grad_norm": 0.6771908402442932, + "learning_rate": 5.914225073548485e-07, + "loss": 0.1816, + "step": 18941 + }, + { + "epoch": 1.7844139327853794, + "grad_norm": 0.6483758687973022, + "learning_rate": 5.90911017691953e-07, + "loss": 0.1789, + "step": 18942 + }, + { + "epoch": 1.7845081368785474, + "grad_norm": 0.620595395565033, + "learning_rate": 5.903997425708385e-07, + "loss": 0.1938, + "step": 18943 + }, + { + "epoch": 1.7846023409717153, + "grad_norm": 0.6206614971160889, + "learning_rate": 5.898886820031635e-07, + "loss": 0.1972, + "step": 18944 + }, + { + "epoch": 1.784696545064883, + "grad_norm": 0.6920983791351318, + "learning_rate": 5.893778360005786e-07, + "loss": 0.2124, + "step": 18945 + }, + { + "epoch": 1.7847907491580508, + "grad_norm": 0.5786925554275513, + "learning_rate": 5.888672045747313e-07, + "loss": 0.1786, + "step": 18946 + }, + { + "epoch": 1.7848849532512188, + "grad_norm": 0.621561586856842, + "learning_rate": 5.883567877372687e-07, + "loss": 0.2001, + "step": 18947 + }, + { + "epoch": 1.7849791573443867, + "grad_norm": 0.6603097319602966, + "learning_rate": 5.878465854998261e-07, + "loss": 0.2111, + "step": 18948 + }, + { + "epoch": 1.7850733614375545, + "grad_norm": 0.6399976015090942, + "learning_rate": 5.873365978740353e-07, + "loss": 0.1638, + "step": 18949 + }, + { + "epoch": 1.7851675655307222, + "grad_norm": 0.6775232553482056, + "learning_rate": 5.868268248715292e-07, + "loss": 0.2016, + "step": 18950 + }, + { + "epoch": 1.7852617696238902, + "grad_norm": 0.6444012522697449, + "learning_rate": 5.863172665039308e-07, + "loss": 0.1836, + "step": 18951 + }, + { + "epoch": 1.7853559737170581, + "grad_norm": 0.5827812552452087, + "learning_rate": 5.858079227828539e-07, + "loss": 0.1704, + "step": 18952 + }, + { + "epoch": 1.7854501778102259, + "grad_norm": 0.8426401019096375, + "learning_rate": 5.852987937199172e-07, + "loss": 0.1883, + "step": 18953 + }, + { + "epoch": 1.7855443819033936, + "grad_norm": 0.6850066184997559, + "learning_rate": 5.847898793267292e-07, + "loss": 0.2335, + "step": 18954 + }, + { + "epoch": 1.7856385859965616, + "grad_norm": 0.6605983972549438, + "learning_rate": 5.842811796148906e-07, + "loss": 0.208, + "step": 18955 + }, + { + "epoch": 1.7857327900897295, + "grad_norm": 0.7833412885665894, + "learning_rate": 5.837726945960032e-07, + "loss": 0.1941, + "step": 18956 + }, + { + "epoch": 1.7858269941828973, + "grad_norm": 0.651619017124176, + "learning_rate": 5.832644242816632e-07, + "loss": 0.1953, + "step": 18957 + }, + { + "epoch": 1.785921198276065, + "grad_norm": 0.652336835861206, + "learning_rate": 5.827563686834548e-07, + "loss": 0.1763, + "step": 18958 + }, + { + "epoch": 1.786015402369233, + "grad_norm": 0.6696328520774841, + "learning_rate": 5.822485278129664e-07, + "loss": 0.1847, + "step": 18959 + }, + { + "epoch": 1.786109606462401, + "grad_norm": 0.6166433691978455, + "learning_rate": 5.817409016817765e-07, + "loss": 0.1649, + "step": 18960 + }, + { + "epoch": 1.7862038105555686, + "grad_norm": 0.6885592937469482, + "learning_rate": 5.812334903014593e-07, + "loss": 0.1897, + "step": 18961 + }, + { + "epoch": 1.7862980146487364, + "grad_norm": 0.6394980549812317, + "learning_rate": 5.807262936835845e-07, + "loss": 0.1814, + "step": 18962 + }, + { + "epoch": 1.7863922187419043, + "grad_norm": 0.5924050807952881, + "learning_rate": 5.802193118397193e-07, + "loss": 0.1835, + "step": 18963 + }, + { + "epoch": 1.7864864228350723, + "grad_norm": 0.7013352513313293, + "learning_rate": 5.7971254478142e-07, + "loss": 0.1959, + "step": 18964 + }, + { + "epoch": 1.78658062692824, + "grad_norm": 0.6453135013580322, + "learning_rate": 5.792059925202443e-07, + "loss": 0.1701, + "step": 18965 + }, + { + "epoch": 1.7866748310214078, + "grad_norm": 0.665113091468811, + "learning_rate": 5.786996550677415e-07, + "loss": 0.2228, + "step": 18966 + }, + { + "epoch": 1.7867690351145757, + "grad_norm": 0.6630818247795105, + "learning_rate": 5.781935324354571e-07, + "loss": 0.1952, + "step": 18967 + }, + { + "epoch": 1.7868632392077437, + "grad_norm": 0.6618506908416748, + "learning_rate": 5.776876246349294e-07, + "loss": 0.1958, + "step": 18968 + }, + { + "epoch": 1.7869574433009114, + "grad_norm": 0.7780861854553223, + "learning_rate": 5.771819316776972e-07, + "loss": 0.197, + "step": 18969 + }, + { + "epoch": 1.7870516473940792, + "grad_norm": 0.7015801072120667, + "learning_rate": 5.7667645357529e-07, + "loss": 0.1792, + "step": 18970 + }, + { + "epoch": 1.7871458514872471, + "grad_norm": 0.6162405014038086, + "learning_rate": 5.76171190339232e-07, + "loss": 0.2133, + "step": 18971 + }, + { + "epoch": 1.787240055580415, + "grad_norm": 0.6168376207351685, + "learning_rate": 5.756661419810449e-07, + "loss": 0.1964, + "step": 18972 + }, + { + "epoch": 1.7873342596735828, + "grad_norm": 0.6623192429542542, + "learning_rate": 5.751613085122465e-07, + "loss": 0.1815, + "step": 18973 + }, + { + "epoch": 1.7874284637667506, + "grad_norm": 0.6553171873092651, + "learning_rate": 5.746566899443451e-07, + "loss": 0.1782, + "step": 18974 + }, + { + "epoch": 1.7875226678599185, + "grad_norm": 0.6070336103439331, + "learning_rate": 5.741522862888483e-07, + "loss": 0.1739, + "step": 18975 + }, + { + "epoch": 1.7876168719530865, + "grad_norm": 0.6405748724937439, + "learning_rate": 5.736480975572555e-07, + "loss": 0.1778, + "step": 18976 + }, + { + "epoch": 1.7877110760462542, + "grad_norm": 0.6848527193069458, + "learning_rate": 5.731441237610646e-07, + "loss": 0.2019, + "step": 18977 + }, + { + "epoch": 1.787805280139422, + "grad_norm": 0.6570281982421875, + "learning_rate": 5.726403649117684e-07, + "loss": 0.2136, + "step": 18978 + }, + { + "epoch": 1.78789948423259, + "grad_norm": 0.7297435998916626, + "learning_rate": 5.721368210208489e-07, + "loss": 0.222, + "step": 18979 + }, + { + "epoch": 1.787993688325758, + "grad_norm": 0.6587645411491394, + "learning_rate": 5.716334920997913e-07, + "loss": 0.1749, + "step": 18980 + }, + { + "epoch": 1.7880878924189256, + "grad_norm": 0.6483520269393921, + "learning_rate": 5.711303781600719e-07, + "loss": 0.1995, + "step": 18981 + }, + { + "epoch": 1.7881820965120934, + "grad_norm": 0.6250553131103516, + "learning_rate": 5.706274792131616e-07, + "loss": 0.1928, + "step": 18982 + }, + { + "epoch": 1.7882763006052613, + "grad_norm": 0.7242453694343567, + "learning_rate": 5.701247952705269e-07, + "loss": 0.2132, + "step": 18983 + }, + { + "epoch": 1.7883705046984293, + "grad_norm": 0.67424076795578, + "learning_rate": 5.696223263436317e-07, + "loss": 0.1875, + "step": 18984 + }, + { + "epoch": 1.788464708791597, + "grad_norm": 0.6228921413421631, + "learning_rate": 5.691200724439328e-07, + "loss": 0.2008, + "step": 18985 + }, + { + "epoch": 1.7885589128847648, + "grad_norm": 0.7612162828445435, + "learning_rate": 5.686180335828773e-07, + "loss": 0.1724, + "step": 18986 + }, + { + "epoch": 1.7886531169779327, + "grad_norm": 0.845587432384491, + "learning_rate": 5.681162097719195e-07, + "loss": 0.1946, + "step": 18987 + }, + { + "epoch": 1.7887473210711007, + "grad_norm": 0.6682807207107544, + "learning_rate": 5.676146010224992e-07, + "loss": 0.2019, + "step": 18988 + }, + { + "epoch": 1.7888415251642684, + "grad_norm": 0.6140900254249573, + "learning_rate": 5.671132073460505e-07, + "loss": 0.189, + "step": 18989 + }, + { + "epoch": 1.7889357292574362, + "grad_norm": 0.683236300945282, + "learning_rate": 5.666120287540122e-07, + "loss": 0.19, + "step": 18990 + }, + { + "epoch": 1.7890299333506041, + "grad_norm": 0.6152132749557495, + "learning_rate": 5.661110652578083e-07, + "loss": 0.1844, + "step": 18991 + }, + { + "epoch": 1.789124137443772, + "grad_norm": 0.609509289264679, + "learning_rate": 5.65610316868861e-07, + "loss": 0.1522, + "step": 18992 + }, + { + "epoch": 1.7892183415369398, + "grad_norm": 0.6877182126045227, + "learning_rate": 5.651097835985886e-07, + "loss": 0.1975, + "step": 18993 + }, + { + "epoch": 1.7893125456301076, + "grad_norm": 0.6550241112709045, + "learning_rate": 5.646094654584055e-07, + "loss": 0.1891, + "step": 18994 + }, + { + "epoch": 1.7894067497232755, + "grad_norm": 0.7051395177841187, + "learning_rate": 5.641093624597172e-07, + "loss": 0.2281, + "step": 18995 + }, + { + "epoch": 1.7895009538164435, + "grad_norm": 0.6980713605880737, + "learning_rate": 5.636094746139287e-07, + "loss": 0.2056, + "step": 18996 + }, + { + "epoch": 1.7895951579096112, + "grad_norm": 0.8167104721069336, + "learning_rate": 5.6310980193244e-07, + "loss": 0.202, + "step": 18997 + }, + { + "epoch": 1.789689362002779, + "grad_norm": 0.7286479473114014, + "learning_rate": 5.626103444266395e-07, + "loss": 0.2073, + "step": 18998 + }, + { + "epoch": 1.789783566095947, + "grad_norm": 0.7497352957725525, + "learning_rate": 5.621111021079195e-07, + "loss": 0.2161, + "step": 18999 + }, + { + "epoch": 1.7898777701891149, + "grad_norm": 0.6730028986930847, + "learning_rate": 5.61612074987663e-07, + "loss": 0.1954, + "step": 19000 + }, + { + "epoch": 1.7899719742822826, + "grad_norm": 0.5976526737213135, + "learning_rate": 5.611132630772465e-07, + "loss": 0.1686, + "step": 19001 + }, + { + "epoch": 1.7900661783754503, + "grad_norm": 0.6673054695129395, + "learning_rate": 5.606146663880463e-07, + "loss": 0.1965, + "step": 19002 + }, + { + "epoch": 1.7901603824686183, + "grad_norm": 0.6583447456359863, + "learning_rate": 5.601162849314301e-07, + "loss": 0.1917, + "step": 19003 + }, + { + "epoch": 1.7902545865617863, + "grad_norm": 0.6494748592376709, + "learning_rate": 5.59618118718761e-07, + "loss": 0.1933, + "step": 19004 + }, + { + "epoch": 1.790348790654954, + "grad_norm": 0.6123377680778503, + "learning_rate": 5.591201677613988e-07, + "loss": 0.1653, + "step": 19005 + }, + { + "epoch": 1.7904429947481217, + "grad_norm": 0.6768871545791626, + "learning_rate": 5.586224320706979e-07, + "loss": 0.2026, + "step": 19006 + }, + { + "epoch": 1.7905371988412897, + "grad_norm": 0.6274359226226807, + "learning_rate": 5.581249116580068e-07, + "loss": 0.2034, + "step": 19007 + }, + { + "epoch": 1.7906314029344577, + "grad_norm": 0.6595060229301453, + "learning_rate": 5.576276065346686e-07, + "loss": 0.1899, + "step": 19008 + }, + { + "epoch": 1.7907256070276254, + "grad_norm": 0.6125357151031494, + "learning_rate": 5.571305167120245e-07, + "loss": 0.1715, + "step": 19009 + }, + { + "epoch": 1.7908198111207931, + "grad_norm": 0.6774811148643494, + "learning_rate": 5.566336422014096e-07, + "loss": 0.191, + "step": 19010 + }, + { + "epoch": 1.790914015213961, + "grad_norm": 0.637002170085907, + "learning_rate": 5.561369830141505e-07, + "loss": 0.1784, + "step": 19011 + }, + { + "epoch": 1.791008219307129, + "grad_norm": 0.7424461841583252, + "learning_rate": 5.556405391615738e-07, + "loss": 0.1928, + "step": 19012 + }, + { + "epoch": 1.7911024234002968, + "grad_norm": 0.6591859459877014, + "learning_rate": 5.551443106549991e-07, + "loss": 0.1948, + "step": 19013 + }, + { + "epoch": 1.7911966274934645, + "grad_norm": 0.6124240756034851, + "learning_rate": 5.546482975057399e-07, + "loss": 0.1833, + "step": 19014 + }, + { + "epoch": 1.7912908315866325, + "grad_norm": 0.6331250667572021, + "learning_rate": 5.541524997251091e-07, + "loss": 0.1866, + "step": 19015 + }, + { + "epoch": 1.7913850356798005, + "grad_norm": 0.6631602048873901, + "learning_rate": 5.536569173244078e-07, + "loss": 0.1848, + "step": 19016 + }, + { + "epoch": 1.7914792397729682, + "grad_norm": 0.6510537266731262, + "learning_rate": 5.531615503149379e-07, + "loss": 0.1703, + "step": 19017 + }, + { + "epoch": 1.791573443866136, + "grad_norm": 0.6576735377311707, + "learning_rate": 5.526663987079961e-07, + "loss": 0.1933, + "step": 19018 + }, + { + "epoch": 1.7916676479593039, + "grad_norm": 0.6598816514015198, + "learning_rate": 5.521714625148689e-07, + "loss": 0.1822, + "step": 19019 + }, + { + "epoch": 1.7917618520524716, + "grad_norm": 0.6303108930587769, + "learning_rate": 5.516767417468438e-07, + "loss": 0.1842, + "step": 19020 + }, + { + "epoch": 1.7918560561456394, + "grad_norm": 0.6936563849449158, + "learning_rate": 5.511822364152031e-07, + "loss": 0.207, + "step": 19021 + }, + { + "epoch": 1.7919502602388073, + "grad_norm": 1.0449975728988647, + "learning_rate": 5.506879465312187e-07, + "loss": 0.1773, + "step": 19022 + }, + { + "epoch": 1.7920444643319753, + "grad_norm": 0.6483970880508423, + "learning_rate": 5.501938721061628e-07, + "loss": 0.1649, + "step": 19023 + }, + { + "epoch": 1.792138668425143, + "grad_norm": 0.6974971890449524, + "learning_rate": 5.49700013151302e-07, + "loss": 0.1798, + "step": 19024 + }, + { + "epoch": 1.7922328725183108, + "grad_norm": 0.6729770302772522, + "learning_rate": 5.492063696778971e-07, + "loss": 0.1857, + "step": 19025 + }, + { + "epoch": 1.7923270766114787, + "grad_norm": 0.6208738684654236, + "learning_rate": 5.487129416971992e-07, + "loss": 0.166, + "step": 19026 + }, + { + "epoch": 1.7924212807046467, + "grad_norm": 0.6766891479492188, + "learning_rate": 5.48219729220465e-07, + "loss": 0.1979, + "step": 19027 + }, + { + "epoch": 1.7925154847978144, + "grad_norm": 0.6793447732925415, + "learning_rate": 5.477267322589397e-07, + "loss": 0.2425, + "step": 19028 + }, + { + "epoch": 1.7926096888909822, + "grad_norm": 0.6638621687889099, + "learning_rate": 5.4723395082386e-07, + "loss": 0.1736, + "step": 19029 + }, + { + "epoch": 1.7927038929841501, + "grad_norm": 0.6527950763702393, + "learning_rate": 5.46741384926468e-07, + "loss": 0.1992, + "step": 19030 + }, + { + "epoch": 1.792798097077318, + "grad_norm": 0.6407003998756409, + "learning_rate": 5.462490345779925e-07, + "loss": 0.1792, + "step": 19031 + }, + { + "epoch": 1.7928923011704858, + "grad_norm": 0.7016992568969727, + "learning_rate": 5.457568997896567e-07, + "loss": 0.2017, + "step": 19032 + }, + { + "epoch": 1.7929865052636536, + "grad_norm": 0.6246228218078613, + "learning_rate": 5.452649805726862e-07, + "loss": 0.1738, + "step": 19033 + }, + { + "epoch": 1.7930807093568215, + "grad_norm": 0.6411997079849243, + "learning_rate": 5.447732769382974e-07, + "loss": 0.1899, + "step": 19034 + }, + { + "epoch": 1.7931749134499895, + "grad_norm": 0.65321946144104, + "learning_rate": 5.442817888976992e-07, + "loss": 0.1956, + "step": 19035 + }, + { + "epoch": 1.7932691175431572, + "grad_norm": 0.6064602732658386, + "learning_rate": 5.437905164620993e-07, + "loss": 0.1716, + "step": 19036 + }, + { + "epoch": 1.793363321636325, + "grad_norm": 0.7640340924263, + "learning_rate": 5.432994596427021e-07, + "loss": 0.1923, + "step": 19037 + }, + { + "epoch": 1.793457525729493, + "grad_norm": 0.6219149231910706, + "learning_rate": 5.428086184507009e-07, + "loss": 0.1731, + "step": 19038 + }, + { + "epoch": 1.7935517298226609, + "grad_norm": 0.61652672290802, + "learning_rate": 5.423179928972878e-07, + "loss": 0.1758, + "step": 19039 + }, + { + "epoch": 1.7936459339158286, + "grad_norm": 0.7278186082839966, + "learning_rate": 5.418275829936537e-07, + "loss": 0.1938, + "step": 19040 + }, + { + "epoch": 1.7937401380089963, + "grad_norm": 0.6120370626449585, + "learning_rate": 5.413373887509766e-07, + "loss": 0.1806, + "step": 19041 + }, + { + "epoch": 1.7938343421021643, + "grad_norm": 0.7241223454475403, + "learning_rate": 5.40847410180434e-07, + "loss": 0.1943, + "step": 19042 + }, + { + "epoch": 1.7939285461953323, + "grad_norm": 0.6974479556083679, + "learning_rate": 5.403576472932015e-07, + "loss": 0.1824, + "step": 19043 + }, + { + "epoch": 1.7940227502885, + "grad_norm": 0.6181360483169556, + "learning_rate": 5.398681001004447e-07, + "loss": 0.1747, + "step": 19044 + }, + { + "epoch": 1.7941169543816677, + "grad_norm": 0.6469744443893433, + "learning_rate": 5.393787686133234e-07, + "loss": 0.1992, + "step": 19045 + }, + { + "epoch": 1.7942111584748357, + "grad_norm": 0.6220036745071411, + "learning_rate": 5.388896528429977e-07, + "loss": 0.1961, + "step": 19046 + }, + { + "epoch": 1.7943053625680037, + "grad_norm": 0.6217597723007202, + "learning_rate": 5.384007528006219e-07, + "loss": 0.1751, + "step": 19047 + }, + { + "epoch": 1.7943995666611714, + "grad_norm": 0.6373041272163391, + "learning_rate": 5.379120684973393e-07, + "loss": 0.1729, + "step": 19048 + }, + { + "epoch": 1.7944937707543391, + "grad_norm": 0.6712929010391235, + "learning_rate": 5.374235999442944e-07, + "loss": 0.1873, + "step": 19049 + }, + { + "epoch": 1.794587974847507, + "grad_norm": 0.6496888399124146, + "learning_rate": 5.369353471526285e-07, + "loss": 0.1811, + "step": 19050 + }, + { + "epoch": 1.794682178940675, + "grad_norm": 0.6715037226676941, + "learning_rate": 5.364473101334688e-07, + "loss": 0.1974, + "step": 19051 + }, + { + "epoch": 1.7947763830338428, + "grad_norm": 0.6432616710662842, + "learning_rate": 5.359594888979469e-07, + "loss": 0.1631, + "step": 19052 + }, + { + "epoch": 1.7948705871270105, + "grad_norm": 0.698383629322052, + "learning_rate": 5.354718834571859e-07, + "loss": 0.2134, + "step": 19053 + }, + { + "epoch": 1.7949647912201785, + "grad_norm": 0.6947939991950989, + "learning_rate": 5.349844938223026e-07, + "loss": 0.1871, + "step": 19054 + }, + { + "epoch": 1.7950589953133465, + "grad_norm": 0.654518187046051, + "learning_rate": 5.344973200044112e-07, + "loss": 0.1871, + "step": 19055 + }, + { + "epoch": 1.7951531994065142, + "grad_norm": 0.6151201128959656, + "learning_rate": 5.340103620146186e-07, + "loss": 0.1855, + "step": 19056 + }, + { + "epoch": 1.795247403499682, + "grad_norm": 0.632982075214386, + "learning_rate": 5.335236198640293e-07, + "loss": 0.2119, + "step": 19057 + }, + { + "epoch": 1.7953416075928499, + "grad_norm": 0.7302631139755249, + "learning_rate": 5.33037093563743e-07, + "loss": 0.2168, + "step": 19058 + }, + { + "epoch": 1.7954358116860178, + "grad_norm": 0.6297193765640259, + "learning_rate": 5.3255078312485e-07, + "loss": 0.1909, + "step": 19059 + }, + { + "epoch": 1.7955300157791856, + "grad_norm": 0.6735662221908569, + "learning_rate": 5.320646885584413e-07, + "loss": 0.2023, + "step": 19060 + }, + { + "epoch": 1.7956242198723533, + "grad_norm": 0.6780900359153748, + "learning_rate": 5.315788098756014e-07, + "loss": 0.1866, + "step": 19061 + }, + { + "epoch": 1.7957184239655213, + "grad_norm": 0.6257510781288147, + "learning_rate": 5.31093147087407e-07, + "loss": 0.1768, + "step": 19062 + }, + { + "epoch": 1.7958126280586892, + "grad_norm": 0.6984760761260986, + "learning_rate": 5.306077002049326e-07, + "loss": 0.1998, + "step": 19063 + }, + { + "epoch": 1.795906832151857, + "grad_norm": 0.6464702486991882, + "learning_rate": 5.301224692392482e-07, + "loss": 0.1832, + "step": 19064 + }, + { + "epoch": 1.7960010362450247, + "grad_norm": 0.6231998205184937, + "learning_rate": 5.296374542014171e-07, + "loss": 0.191, + "step": 19065 + }, + { + "epoch": 1.7960952403381927, + "grad_norm": 0.6024487018585205, + "learning_rate": 5.291526551024951e-07, + "loss": 0.18, + "step": 19066 + }, + { + "epoch": 1.7961894444313606, + "grad_norm": 0.6694967150688171, + "learning_rate": 5.286680719535431e-07, + "loss": 0.1978, + "step": 19067 + }, + { + "epoch": 1.7962836485245284, + "grad_norm": 0.6204403042793274, + "learning_rate": 5.281837047656069e-07, + "loss": 0.1856, + "step": 19068 + }, + { + "epoch": 1.7963778526176961, + "grad_norm": 0.6918314695358276, + "learning_rate": 5.276995535497286e-07, + "loss": 0.1882, + "step": 19069 + }, + { + "epoch": 1.796472056710864, + "grad_norm": 0.6221495866775513, + "learning_rate": 5.272156183169497e-07, + "loss": 0.1921, + "step": 19070 + }, + { + "epoch": 1.796566260804032, + "grad_norm": 0.7115456461906433, + "learning_rate": 5.267318990783066e-07, + "loss": 0.1937, + "step": 19071 + }, + { + "epoch": 1.7966604648971998, + "grad_norm": 0.6703652739524841, + "learning_rate": 5.26248395844825e-07, + "loss": 0.1953, + "step": 19072 + }, + { + "epoch": 1.7967546689903675, + "grad_norm": 0.6182885766029358, + "learning_rate": 5.257651086275317e-07, + "loss": 0.1893, + "step": 19073 + }, + { + "epoch": 1.7968488730835355, + "grad_norm": 0.7072000503540039, + "learning_rate": 5.252820374374468e-07, + "loss": 0.1899, + "step": 19074 + }, + { + "epoch": 1.7969430771767034, + "grad_norm": 0.6250677704811096, + "learning_rate": 5.247991822855835e-07, + "loss": 0.1744, + "step": 19075 + }, + { + "epoch": 1.7970372812698712, + "grad_norm": 0.710290789604187, + "learning_rate": 5.243165431829522e-07, + "loss": 0.1847, + "step": 19076 + }, + { + "epoch": 1.797131485363039, + "grad_norm": 0.683290958404541, + "learning_rate": 5.238341201405605e-07, + "loss": 0.1817, + "step": 19077 + }, + { + "epoch": 1.7972256894562069, + "grad_norm": 0.6023078560829163, + "learning_rate": 5.233519131694042e-07, + "loss": 0.1756, + "step": 19078 + }, + { + "epoch": 1.7973198935493748, + "grad_norm": 0.7020680904388428, + "learning_rate": 5.228699222804801e-07, + "loss": 0.2055, + "step": 19079 + }, + { + "epoch": 1.7974140976425426, + "grad_norm": 0.6486931443214417, + "learning_rate": 5.223881474847791e-07, + "loss": 0.174, + "step": 19080 + }, + { + "epoch": 1.7975083017357103, + "grad_norm": 0.6468766331672668, + "learning_rate": 5.21906588793285e-07, + "loss": 0.1969, + "step": 19081 + }, + { + "epoch": 1.7976025058288783, + "grad_norm": 0.6431040167808533, + "learning_rate": 5.214252462169789e-07, + "loss": 0.176, + "step": 19082 + }, + { + "epoch": 1.7976967099220462, + "grad_norm": 0.5801704525947571, + "learning_rate": 5.209441197668375e-07, + "loss": 0.1646, + "step": 19083 + }, + { + "epoch": 1.797790914015214, + "grad_norm": 0.5869934558868408, + "learning_rate": 5.204632094538298e-07, + "loss": 0.1798, + "step": 19084 + }, + { + "epoch": 1.7978851181083817, + "grad_norm": 0.6227966547012329, + "learning_rate": 5.199825152889193e-07, + "loss": 0.1832, + "step": 19085 + }, + { + "epoch": 1.7979793222015497, + "grad_norm": 0.5921785831451416, + "learning_rate": 5.195020372830695e-07, + "loss": 0.1767, + "step": 19086 + }, + { + "epoch": 1.7980735262947176, + "grad_norm": 5.5205302238464355, + "learning_rate": 5.19021775447236e-07, + "loss": 0.1777, + "step": 19087 + }, + { + "epoch": 1.7981677303878854, + "grad_norm": 0.6705521941184998, + "learning_rate": 5.185417297923678e-07, + "loss": 0.188, + "step": 19088 + }, + { + "epoch": 1.798261934481053, + "grad_norm": 0.6479392647743225, + "learning_rate": 5.180619003294107e-07, + "loss": 0.1869, + "step": 19089 + }, + { + "epoch": 1.798356138574221, + "grad_norm": 0.5793355703353882, + "learning_rate": 5.175822870693081e-07, + "loss": 0.1734, + "step": 19090 + }, + { + "epoch": 1.798450342667389, + "grad_norm": 0.5716192126274109, + "learning_rate": 5.171028900229924e-07, + "loss": 0.1715, + "step": 19091 + }, + { + "epoch": 1.7985445467605568, + "grad_norm": 0.7655090689659119, + "learning_rate": 5.16623709201397e-07, + "loss": 0.1876, + "step": 19092 + }, + { + "epoch": 1.7986387508537245, + "grad_norm": 0.6193764209747314, + "learning_rate": 5.161447446154489e-07, + "loss": 0.1659, + "step": 19093 + }, + { + "epoch": 1.7987329549468924, + "grad_norm": 0.7286919355392456, + "learning_rate": 5.156659962760657e-07, + "loss": 0.2241, + "step": 19094 + }, + { + "epoch": 1.7988271590400604, + "grad_norm": 0.6866196990013123, + "learning_rate": 5.151874641941679e-07, + "loss": 0.1928, + "step": 19095 + }, + { + "epoch": 1.7989213631332281, + "grad_norm": 0.6107556223869324, + "learning_rate": 5.147091483806621e-07, + "loss": 0.182, + "step": 19096 + }, + { + "epoch": 1.7990155672263959, + "grad_norm": 0.7106184363365173, + "learning_rate": 5.142310488464575e-07, + "loss": 0.2252, + "step": 19097 + }, + { + "epoch": 1.7991097713195638, + "grad_norm": 0.6771349906921387, + "learning_rate": 5.137531656024563e-07, + "loss": 0.1757, + "step": 19098 + }, + { + "epoch": 1.7992039754127318, + "grad_norm": 0.6582804322242737, + "learning_rate": 5.132754986595522e-07, + "loss": 0.217, + "step": 19099 + }, + { + "epoch": 1.7992981795058995, + "grad_norm": 0.647718071937561, + "learning_rate": 5.127980480286377e-07, + "loss": 0.1662, + "step": 19100 + }, + { + "epoch": 1.7993923835990673, + "grad_norm": 0.6686140894889832, + "learning_rate": 5.123208137206026e-07, + "loss": 0.2037, + "step": 19101 + }, + { + "epoch": 1.7994865876922352, + "grad_norm": 0.7159162163734436, + "learning_rate": 5.118437957463229e-07, + "loss": 0.1787, + "step": 19102 + }, + { + "epoch": 1.7995807917854032, + "grad_norm": 0.6867672204971313, + "learning_rate": 5.113669941166799e-07, + "loss": 0.1727, + "step": 19103 + }, + { + "epoch": 1.799674995878571, + "grad_norm": 0.653090238571167, + "learning_rate": 5.108904088425448e-07, + "loss": 0.2106, + "step": 19104 + }, + { + "epoch": 1.7997691999717387, + "grad_norm": 0.7055180072784424, + "learning_rate": 5.104140399347835e-07, + "loss": 0.2227, + "step": 19105 + }, + { + "epoch": 1.7998634040649066, + "grad_norm": 0.9326152801513672, + "learning_rate": 5.099378874042548e-07, + "loss": 0.2171, + "step": 19106 + }, + { + "epoch": 1.7999576081580746, + "grad_norm": 0.6862870454788208, + "learning_rate": 5.094619512618226e-07, + "loss": 0.2241, + "step": 19107 + }, + { + "epoch": 1.8000518122512423, + "grad_norm": 0.6412129998207092, + "learning_rate": 5.089862315183347e-07, + "loss": 0.1893, + "step": 19108 + }, + { + "epoch": 1.80014601634441, + "grad_norm": 0.6480741500854492, + "learning_rate": 5.085107281846369e-07, + "loss": 0.1951, + "step": 19109 + }, + { + "epoch": 1.800240220437578, + "grad_norm": 0.7141343355178833, + "learning_rate": 5.080354412715749e-07, + "loss": 0.1978, + "step": 19110 + }, + { + "epoch": 1.800334424530746, + "grad_norm": 0.6640894412994385, + "learning_rate": 5.075603707899846e-07, + "loss": 0.2012, + "step": 19111 + }, + { + "epoch": 1.8004286286239137, + "grad_norm": 0.6541938781738281, + "learning_rate": 5.070855167506972e-07, + "loss": 0.1696, + "step": 19112 + }, + { + "epoch": 1.8005228327170815, + "grad_norm": 1.3459975719451904, + "learning_rate": 5.066108791645407e-07, + "loss": 0.197, + "step": 19113 + }, + { + "epoch": 1.8006170368102494, + "grad_norm": 0.6184680461883545, + "learning_rate": 5.0613645804234e-07, + "loss": 0.1677, + "step": 19114 + }, + { + "epoch": 1.8007112409034174, + "grad_norm": 0.6969945430755615, + "learning_rate": 5.056622533949085e-07, + "loss": 0.1861, + "step": 19115 + }, + { + "epoch": 1.8008054449965851, + "grad_norm": 0.6932345628738403, + "learning_rate": 5.05188265233062e-07, + "loss": 0.1817, + "step": 19116 + }, + { + "epoch": 1.8008996490897529, + "grad_norm": 0.6407642364501953, + "learning_rate": 5.047144935676073e-07, + "loss": 0.1787, + "step": 19117 + }, + { + "epoch": 1.8009938531829208, + "grad_norm": 0.6974245309829712, + "learning_rate": 5.042409384093461e-07, + "loss": 0.2094, + "step": 19118 + }, + { + "epoch": 1.8010880572760888, + "grad_norm": 0.7181513905525208, + "learning_rate": 5.037675997690772e-07, + "loss": 0.1934, + "step": 19119 + }, + { + "epoch": 1.8011822613692565, + "grad_norm": 0.6898902654647827, + "learning_rate": 5.032944776575943e-07, + "loss": 0.175, + "step": 19120 + }, + { + "epoch": 1.8012764654624243, + "grad_norm": 0.653157651424408, + "learning_rate": 5.028215720856821e-07, + "loss": 0.2066, + "step": 19121 + }, + { + "epoch": 1.8013706695555922, + "grad_norm": 0.7411891222000122, + "learning_rate": 5.023488830641266e-07, + "loss": 0.1762, + "step": 19122 + }, + { + "epoch": 1.8014648736487602, + "grad_norm": 0.6776013374328613, + "learning_rate": 5.018764106037066e-07, + "loss": 0.2004, + "step": 19123 + }, + { + "epoch": 1.801559077741928, + "grad_norm": 0.750198245048523, + "learning_rate": 5.014041547151927e-07, + "loss": 0.1793, + "step": 19124 + }, + { + "epoch": 1.8016532818350957, + "grad_norm": 0.7108379006385803, + "learning_rate": 5.009321154093538e-07, + "loss": 0.1869, + "step": 19125 + }, + { + "epoch": 1.8017474859282636, + "grad_norm": 0.6518669724464417, + "learning_rate": 5.004602926969515e-07, + "loss": 0.1789, + "step": 19126 + }, + { + "epoch": 1.8018416900214316, + "grad_norm": 0.6811784505844116, + "learning_rate": 4.999886865887483e-07, + "loss": 0.2039, + "step": 19127 + }, + { + "epoch": 1.8019358941145993, + "grad_norm": 0.7192262411117554, + "learning_rate": 4.995172970954943e-07, + "loss": 0.2265, + "step": 19128 + }, + { + "epoch": 1.802030098207767, + "grad_norm": 0.6727216243743896, + "learning_rate": 4.990461242279377e-07, + "loss": 0.2054, + "step": 19129 + }, + { + "epoch": 1.802124302300935, + "grad_norm": 0.6997501850128174, + "learning_rate": 4.985751679968243e-07, + "loss": 0.2029, + "step": 19130 + }, + { + "epoch": 1.802218506394103, + "grad_norm": 0.5826748609542847, + "learning_rate": 4.981044284128911e-07, + "loss": 0.179, + "step": 19131 + }, + { + "epoch": 1.8023127104872707, + "grad_norm": 0.663162112236023, + "learning_rate": 4.976339054868706e-07, + "loss": 0.1897, + "step": 19132 + }, + { + "epoch": 1.8024069145804384, + "grad_norm": 0.6346247792243958, + "learning_rate": 4.971635992294943e-07, + "loss": 0.1799, + "step": 19133 + }, + { + "epoch": 1.8025011186736064, + "grad_norm": 0.6651972532272339, + "learning_rate": 4.966935096514835e-07, + "loss": 0.2241, + "step": 19134 + }, + { + "epoch": 1.8025953227667744, + "grad_norm": 0.6187891364097595, + "learning_rate": 4.962236367635597e-07, + "loss": 0.1736, + "step": 19135 + }, + { + "epoch": 1.802689526859942, + "grad_norm": 0.6713681817054749, + "learning_rate": 4.957539805764322e-07, + "loss": 0.2125, + "step": 19136 + }, + { + "epoch": 1.8027837309531098, + "grad_norm": 0.6716582775115967, + "learning_rate": 4.952845411008133e-07, + "loss": 0.1806, + "step": 19137 + }, + { + "epoch": 1.8028779350462778, + "grad_norm": 0.6291553974151611, + "learning_rate": 4.94815318347408e-07, + "loss": 0.1781, + "step": 19138 + }, + { + "epoch": 1.8029721391394458, + "grad_norm": 0.6568560004234314, + "learning_rate": 4.94346312326911e-07, + "loss": 0.1638, + "step": 19139 + }, + { + "epoch": 1.8030663432326135, + "grad_norm": 0.6333551406860352, + "learning_rate": 4.938775230500192e-07, + "loss": 0.1971, + "step": 19140 + }, + { + "epoch": 1.8031605473257812, + "grad_norm": 0.6884417533874512, + "learning_rate": 4.934089505274231e-07, + "loss": 0.2009, + "step": 19141 + }, + { + "epoch": 1.8032547514189492, + "grad_norm": 0.7167954444885254, + "learning_rate": 4.92940594769804e-07, + "loss": 0.1796, + "step": 19142 + }, + { + "epoch": 1.8033489555121172, + "grad_norm": 0.783322811126709, + "learning_rate": 4.924724557878413e-07, + "loss": 0.2155, + "step": 19143 + }, + { + "epoch": 1.803443159605285, + "grad_norm": 0.7364895939826965, + "learning_rate": 4.920045335922119e-07, + "loss": 0.2048, + "step": 19144 + }, + { + "epoch": 1.8035373636984526, + "grad_norm": 0.6621474027633667, + "learning_rate": 4.915368281935839e-07, + "loss": 0.1866, + "step": 19145 + }, + { + "epoch": 1.8036315677916206, + "grad_norm": 0.6693629622459412, + "learning_rate": 4.910693396026189e-07, + "loss": 0.1939, + "step": 19146 + }, + { + "epoch": 1.8037257718847886, + "grad_norm": 0.7286280989646912, + "learning_rate": 4.906020678299816e-07, + "loss": 0.1841, + "step": 19147 + }, + { + "epoch": 1.8038199759779563, + "grad_norm": 0.5911803245544434, + "learning_rate": 4.901350128863236e-07, + "loss": 0.1701, + "step": 19148 + }, + { + "epoch": 1.803914180071124, + "grad_norm": 0.6541381478309631, + "learning_rate": 4.896681747822929e-07, + "loss": 0.1726, + "step": 19149 + }, + { + "epoch": 1.804008384164292, + "grad_norm": 0.6095507144927979, + "learning_rate": 4.892015535285355e-07, + "loss": 0.1846, + "step": 19150 + }, + { + "epoch": 1.80410258825746, + "grad_norm": 0.6767376661300659, + "learning_rate": 4.887351491356929e-07, + "loss": 0.2344, + "step": 19151 + }, + { + "epoch": 1.8041967923506277, + "grad_norm": 0.6861122250556946, + "learning_rate": 4.882689616143976e-07, + "loss": 0.2194, + "step": 19152 + }, + { + "epoch": 1.8042909964437954, + "grad_norm": 0.7323029637336731, + "learning_rate": 4.878029909752801e-07, + "loss": 0.1884, + "step": 19153 + }, + { + "epoch": 1.8043852005369634, + "grad_norm": 0.6265630125999451, + "learning_rate": 4.873372372289664e-07, + "loss": 0.1771, + "step": 19154 + }, + { + "epoch": 1.8044794046301313, + "grad_norm": 0.5967429876327515, + "learning_rate": 4.868717003860735e-07, + "loss": 0.1783, + "step": 19155 + }, + { + "epoch": 1.8045736087232989, + "grad_norm": 0.6530036330223083, + "learning_rate": 4.864063804572183e-07, + "loss": 0.1992, + "step": 19156 + }, + { + "epoch": 1.8046678128164668, + "grad_norm": 0.7074195742607117, + "learning_rate": 4.859412774530126e-07, + "loss": 0.1943, + "step": 19157 + }, + { + "epoch": 1.8047620169096348, + "grad_norm": 0.6856885552406311, + "learning_rate": 4.854763913840576e-07, + "loss": 0.1971, + "step": 19158 + }, + { + "epoch": 1.8048562210028025, + "grad_norm": 0.6670020818710327, + "learning_rate": 4.850117222609563e-07, + "loss": 0.1848, + "step": 19159 + }, + { + "epoch": 1.8049504250959703, + "grad_norm": 0.6792109608650208, + "learning_rate": 4.845472700943033e-07, + "loss": 0.1978, + "step": 19160 + }, + { + "epoch": 1.8050446291891382, + "grad_norm": 0.5933213233947754, + "learning_rate": 4.84083034894689e-07, + "loss": 0.1635, + "step": 19161 + }, + { + "epoch": 1.8051388332823062, + "grad_norm": 0.6266000270843506, + "learning_rate": 4.836190166726951e-07, + "loss": 0.1722, + "step": 19162 + }, + { + "epoch": 1.805233037375474, + "grad_norm": 0.6405494213104248, + "learning_rate": 4.831552154389074e-07, + "loss": 0.2012, + "step": 19163 + }, + { + "epoch": 1.8053272414686417, + "grad_norm": 0.604314923286438, + "learning_rate": 4.826916312038999e-07, + "loss": 0.1682, + "step": 19164 + }, + { + "epoch": 1.8054214455618096, + "grad_norm": 0.5882378220558167, + "learning_rate": 4.822282639782405e-07, + "loss": 0.1597, + "step": 19165 + }, + { + "epoch": 1.8055156496549776, + "grad_norm": 0.653429388999939, + "learning_rate": 4.817651137724955e-07, + "loss": 0.1651, + "step": 19166 + }, + { + "epoch": 1.8056098537481453, + "grad_norm": 0.7554649710655212, + "learning_rate": 4.813021805972273e-07, + "loss": 0.1771, + "step": 19167 + }, + { + "epoch": 1.805704057841313, + "grad_norm": 0.7071376442909241, + "learning_rate": 4.808394644629899e-07, + "loss": 0.1935, + "step": 19168 + }, + { + "epoch": 1.805798261934481, + "grad_norm": 0.6626911759376526, + "learning_rate": 4.803769653803336e-07, + "loss": 0.1821, + "step": 19169 + }, + { + "epoch": 1.805892466027649, + "grad_norm": 0.6097730994224548, + "learning_rate": 4.799146833598067e-07, + "loss": 0.1864, + "step": 19170 + }, + { + "epoch": 1.8059866701208167, + "grad_norm": 0.7417007684707642, + "learning_rate": 4.794526184119463e-07, + "loss": 0.1902, + "step": 19171 + }, + { + "epoch": 1.8060808742139844, + "grad_norm": 0.6449300050735474, + "learning_rate": 4.789907705472907e-07, + "loss": 0.1854, + "step": 19172 + }, + { + "epoch": 1.8061750783071524, + "grad_norm": 0.6992318034172058, + "learning_rate": 4.785291397763714e-07, + "loss": 0.1829, + "step": 19173 + }, + { + "epoch": 1.8062692824003204, + "grad_norm": 0.682033896446228, + "learning_rate": 4.78067726109711e-07, + "loss": 0.2036, + "step": 19174 + }, + { + "epoch": 1.806363486493488, + "grad_norm": 0.6673874855041504, + "learning_rate": 4.776065295578336e-07, + "loss": 0.1852, + "step": 19175 + }, + { + "epoch": 1.8064576905866558, + "grad_norm": 0.6430282592773438, + "learning_rate": 4.771455501312538e-07, + "loss": 0.1915, + "step": 19176 + }, + { + "epoch": 1.8065518946798238, + "grad_norm": 0.6463133096694946, + "learning_rate": 4.766847878404823e-07, + "loss": 0.1993, + "step": 19177 + }, + { + "epoch": 1.8066460987729918, + "grad_norm": 0.6992751955986023, + "learning_rate": 4.762242426960262e-07, + "loss": 0.1657, + "step": 19178 + }, + { + "epoch": 1.8067403028661595, + "grad_norm": 0.6533843278884888, + "learning_rate": 4.757639147083859e-07, + "loss": 0.168, + "step": 19179 + }, + { + "epoch": 1.8068345069593272, + "grad_norm": 0.6807149648666382, + "learning_rate": 4.753038038880575e-07, + "loss": 0.1938, + "step": 19180 + }, + { + "epoch": 1.8069287110524952, + "grad_norm": 0.6650716066360474, + "learning_rate": 4.7484391024553376e-07, + "loss": 0.1906, + "step": 19181 + }, + { + "epoch": 1.8070229151456632, + "grad_norm": 0.7790847420692444, + "learning_rate": 4.743842337912985e-07, + "loss": 0.2003, + "step": 19182 + }, + { + "epoch": 1.807117119238831, + "grad_norm": 0.7163403034210205, + "learning_rate": 4.739247745358344e-07, + "loss": 0.1867, + "step": 19183 + }, + { + "epoch": 1.8072113233319986, + "grad_norm": 0.6882708072662354, + "learning_rate": 4.7346553248961867e-07, + "loss": 0.1994, + "step": 19184 + }, + { + "epoch": 1.8073055274251666, + "grad_norm": 0.6356422901153564, + "learning_rate": 4.7300650766312294e-07, + "loss": 0.187, + "step": 19185 + }, + { + "epoch": 1.8073997315183346, + "grad_norm": 0.7441783547401428, + "learning_rate": 4.7254770006681105e-07, + "loss": 0.2253, + "step": 19186 + }, + { + "epoch": 1.8074939356115023, + "grad_norm": 0.5874780416488647, + "learning_rate": 4.7208910971114577e-07, + "loss": 0.1716, + "step": 19187 + }, + { + "epoch": 1.80758813970467, + "grad_norm": 0.6484742760658264, + "learning_rate": 4.716307366065853e-07, + "loss": 0.192, + "step": 19188 + }, + { + "epoch": 1.807682343797838, + "grad_norm": 0.6442360877990723, + "learning_rate": 4.7117258076357806e-07, + "loss": 0.1775, + "step": 19189 + }, + { + "epoch": 1.807776547891006, + "grad_norm": 0.7025601267814636, + "learning_rate": 4.7071464219257343e-07, + "loss": 0.2379, + "step": 19190 + }, + { + "epoch": 1.8078707519841737, + "grad_norm": 0.6472790241241455, + "learning_rate": 4.702569209040131e-07, + "loss": 0.1845, + "step": 19191 + }, + { + "epoch": 1.8079649560773414, + "grad_norm": 0.6979325413703918, + "learning_rate": 4.6979941690833196e-07, + "loss": 0.2053, + "step": 19192 + }, + { + "epoch": 1.8080591601705094, + "grad_norm": 0.6204474568367004, + "learning_rate": 4.6934213021596285e-07, + "loss": 0.1803, + "step": 19193 + }, + { + "epoch": 1.8081533642636773, + "grad_norm": 0.7227051258087158, + "learning_rate": 4.688850608373341e-07, + "loss": 0.2019, + "step": 19194 + }, + { + "epoch": 1.808247568356845, + "grad_norm": 0.7145089507102966, + "learning_rate": 4.68428208782864e-07, + "loss": 0.1617, + "step": 19195 + }, + { + "epoch": 1.8083417724500128, + "grad_norm": 0.6264746785163879, + "learning_rate": 4.67971574062972e-07, + "loss": 0.1627, + "step": 19196 + }, + { + "epoch": 1.8084359765431808, + "grad_norm": 0.6440044045448303, + "learning_rate": 4.67515156688072e-07, + "loss": 0.2196, + "step": 19197 + }, + { + "epoch": 1.8085301806363487, + "grad_norm": 0.6561369895935059, + "learning_rate": 4.670589566685657e-07, + "loss": 0.2, + "step": 19198 + }, + { + "epoch": 1.8086243847295165, + "grad_norm": 0.6602401733398438, + "learning_rate": 4.6660297401485923e-07, + "loss": 0.1724, + "step": 19199 + }, + { + "epoch": 1.8087185888226842, + "grad_norm": 0.7573186159133911, + "learning_rate": 4.6614720873734976e-07, + "loss": 0.1782, + "step": 19200 + }, + { + "epoch": 1.8088127929158522, + "grad_norm": 0.6608302593231201, + "learning_rate": 4.6569166084642904e-07, + "loss": 0.1873, + "step": 19201 + }, + { + "epoch": 1.8089069970090201, + "grad_norm": 0.8319967985153198, + "learning_rate": 4.652363303524798e-07, + "loss": 0.2098, + "step": 19202 + }, + { + "epoch": 1.8090012011021879, + "grad_norm": 0.6997202038764954, + "learning_rate": 4.6478121726589056e-07, + "loss": 0.1604, + "step": 19203 + }, + { + "epoch": 1.8090954051953556, + "grad_norm": 0.651648223400116, + "learning_rate": 4.643263215970373e-07, + "loss": 0.1718, + "step": 19204 + }, + { + "epoch": 1.8091896092885236, + "grad_norm": 0.627785861492157, + "learning_rate": 4.6387164335628844e-07, + "loss": 0.1798, + "step": 19205 + }, + { + "epoch": 1.8092838133816915, + "grad_norm": 0.628860354423523, + "learning_rate": 4.634171825540146e-07, + "loss": 0.2039, + "step": 19206 + }, + { + "epoch": 1.8093780174748593, + "grad_norm": 0.6039461493492126, + "learning_rate": 4.6296293920057854e-07, + "loss": 0.166, + "step": 19207 + }, + { + "epoch": 1.809472221568027, + "grad_norm": 0.6618579030036926, + "learning_rate": 4.625089133063365e-07, + "loss": 0.1844, + "step": 19208 + }, + { + "epoch": 1.809566425661195, + "grad_norm": 0.6979915499687195, + "learning_rate": 4.620551048816402e-07, + "loss": 0.1894, + "step": 19209 + }, + { + "epoch": 1.809660629754363, + "grad_norm": 0.7428672909736633, + "learning_rate": 4.616015139368402e-07, + "loss": 0.2169, + "step": 19210 + }, + { + "epoch": 1.8097548338475307, + "grad_norm": 0.622818112373352, + "learning_rate": 4.6114814048227483e-07, + "loss": 0.1859, + "step": 19211 + }, + { + "epoch": 1.8098490379406984, + "grad_norm": 0.6509173512458801, + "learning_rate": 4.6069498452828487e-07, + "loss": 0.199, + "step": 19212 + }, + { + "epoch": 1.8099432420338664, + "grad_norm": 0.6740192174911499, + "learning_rate": 4.6024204608520305e-07, + "loss": 0.2001, + "step": 19213 + }, + { + "epoch": 1.8100374461270343, + "grad_norm": 0.6373996734619141, + "learning_rate": 4.597893251633556e-07, + "loss": 0.2098, + "step": 19214 + }, + { + "epoch": 1.810131650220202, + "grad_norm": 0.7393183708190918, + "learning_rate": 4.593368217730665e-07, + "loss": 0.2019, + "step": 19215 + }, + { + "epoch": 1.8102258543133698, + "grad_norm": 0.7272049188613892, + "learning_rate": 4.588845359246508e-07, + "loss": 0.2151, + "step": 19216 + }, + { + "epoch": 1.8103200584065378, + "grad_norm": 0.6717706322669983, + "learning_rate": 4.5843246762842466e-07, + "loss": 0.1979, + "step": 19217 + }, + { + "epoch": 1.8104142624997057, + "grad_norm": 0.7467896342277527, + "learning_rate": 4.579806168946943e-07, + "loss": 0.2139, + "step": 19218 + }, + { + "epoch": 1.8105084665928735, + "grad_norm": 0.6617717742919922, + "learning_rate": 4.5752898373376263e-07, + "loss": 0.2048, + "step": 19219 + }, + { + "epoch": 1.8106026706860412, + "grad_norm": 0.6223923563957214, + "learning_rate": 4.5707756815592805e-07, + "loss": 0.1697, + "step": 19220 + }, + { + "epoch": 1.8106968747792092, + "grad_norm": 0.6542524099349976, + "learning_rate": 4.5662637017148346e-07, + "loss": 0.218, + "step": 19221 + }, + { + "epoch": 1.8107910788723771, + "grad_norm": 0.6476825475692749, + "learning_rate": 4.5617538979071616e-07, + "loss": 0.1909, + "step": 19222 + }, + { + "epoch": 1.8108852829655449, + "grad_norm": 0.6108734011650085, + "learning_rate": 4.5572462702391016e-07, + "loss": 0.1751, + "step": 19223 + }, + { + "epoch": 1.8109794870587126, + "grad_norm": 0.6277502179145813, + "learning_rate": 4.5527408188134393e-07, + "loss": 0.171, + "step": 19224 + }, + { + "epoch": 1.8110736911518805, + "grad_norm": 0.631671130657196, + "learning_rate": 4.548237543732903e-07, + "loss": 0.184, + "step": 19225 + }, + { + "epoch": 1.8111678952450485, + "grad_norm": 0.6968255639076233, + "learning_rate": 4.543736445100144e-07, + "loss": 0.2269, + "step": 19226 + }, + { + "epoch": 1.8112620993382162, + "grad_norm": 0.6485012173652649, + "learning_rate": 4.5392375230178363e-07, + "loss": 0.1698, + "step": 19227 + }, + { + "epoch": 1.811356303431384, + "grad_norm": 0.6907787322998047, + "learning_rate": 4.534740777588553e-07, + "loss": 0.2169, + "step": 19228 + }, + { + "epoch": 1.811450507524552, + "grad_norm": 0.6859610676765442, + "learning_rate": 4.5302462089148015e-07, + "loss": 0.2113, + "step": 19229 + }, + { + "epoch": 1.81154471161772, + "grad_norm": 0.6284610033035278, + "learning_rate": 4.5257538170990876e-07, + "loss": 0.1978, + "step": 19230 + }, + { + "epoch": 1.8116389157108876, + "grad_norm": 0.6435138583183289, + "learning_rate": 4.5212636022438527e-07, + "loss": 0.2233, + "step": 19231 + }, + { + "epoch": 1.8117331198040554, + "grad_norm": 0.5762256979942322, + "learning_rate": 4.516775564451459e-07, + "loss": 0.1754, + "step": 19232 + }, + { + "epoch": 1.8118273238972233, + "grad_norm": 0.6733455657958984, + "learning_rate": 4.5122897038242465e-07, + "loss": 0.2014, + "step": 19233 + }, + { + "epoch": 1.8119215279903913, + "grad_norm": 0.6698833107948303, + "learning_rate": 4.507806020464522e-07, + "loss": 0.1868, + "step": 19234 + }, + { + "epoch": 1.812015732083559, + "grad_norm": 0.6298755407333374, + "learning_rate": 4.503324514474483e-07, + "loss": 0.1914, + "step": 19235 + }, + { + "epoch": 1.8121099361767268, + "grad_norm": 0.5595676302909851, + "learning_rate": 4.4988451859563355e-07, + "loss": 0.1586, + "step": 19236 + }, + { + "epoch": 1.8122041402698947, + "grad_norm": 0.9319345951080322, + "learning_rate": 4.49436803501222e-07, + "loss": 0.2107, + "step": 19237 + }, + { + "epoch": 1.8122983443630627, + "grad_norm": 0.7476606965065002, + "learning_rate": 4.4898930617442105e-07, + "loss": 0.218, + "step": 19238 + }, + { + "epoch": 1.8123925484562304, + "grad_norm": 0.6579570770263672, + "learning_rate": 4.485420266254348e-07, + "loss": 0.2043, + "step": 19239 + }, + { + "epoch": 1.8124867525493982, + "grad_norm": 0.6271825432777405, + "learning_rate": 4.480949648644628e-07, + "loss": 0.1884, + "step": 19240 + }, + { + "epoch": 1.8125809566425661, + "grad_norm": 0.7584695816040039, + "learning_rate": 4.4764812090169804e-07, + "loss": 0.2018, + "step": 19241 + }, + { + "epoch": 1.812675160735734, + "grad_norm": 0.6255564093589783, + "learning_rate": 4.472014947473269e-07, + "loss": 0.1973, + "step": 19242 + }, + { + "epoch": 1.8127693648289018, + "grad_norm": 0.5978403687477112, + "learning_rate": 4.4675508641153776e-07, + "loss": 0.1982, + "step": 19243 + }, + { + "epoch": 1.8128635689220696, + "grad_norm": 0.695257842540741, + "learning_rate": 4.46308895904507e-07, + "loss": 0.1881, + "step": 19244 + }, + { + "epoch": 1.8129577730152375, + "grad_norm": 0.7909465432167053, + "learning_rate": 4.4586292323640643e-07, + "loss": 0.2134, + "step": 19245 + }, + { + "epoch": 1.8130519771084055, + "grad_norm": 0.6328903436660767, + "learning_rate": 4.45417168417408e-07, + "loss": 0.1916, + "step": 19246 + }, + { + "epoch": 1.8131461812015732, + "grad_norm": 0.6638057827949524, + "learning_rate": 4.4497163145767576e-07, + "loss": 0.1938, + "step": 19247 + }, + { + "epoch": 1.813240385294741, + "grad_norm": 0.7206124663352966, + "learning_rate": 4.44526312367366e-07, + "loss": 0.2141, + "step": 19248 + }, + { + "epoch": 1.813334589387909, + "grad_norm": 0.6205139756202698, + "learning_rate": 4.4408121115663396e-07, + "loss": 0.1834, + "step": 19249 + }, + { + "epoch": 1.8134287934810769, + "grad_norm": 0.667712390422821, + "learning_rate": 4.436363278356304e-07, + "loss": 0.1915, + "step": 19250 + }, + { + "epoch": 1.8135229975742446, + "grad_norm": 1.0919421911239624, + "learning_rate": 4.431916624144961e-07, + "loss": 0.1779, + "step": 19251 + }, + { + "epoch": 1.8136172016674124, + "grad_norm": 0.685817301273346, + "learning_rate": 4.427472149033718e-07, + "loss": 0.1803, + "step": 19252 + }, + { + "epoch": 1.8137114057605803, + "grad_norm": 0.6311653256416321, + "learning_rate": 4.423029853123928e-07, + "loss": 0.2102, + "step": 19253 + }, + { + "epoch": 1.8138056098537483, + "grad_norm": 0.6238813996315002, + "learning_rate": 4.418589736516865e-07, + "loss": 0.2117, + "step": 19254 + }, + { + "epoch": 1.813899813946916, + "grad_norm": 0.9527063369750977, + "learning_rate": 4.414151799313782e-07, + "loss": 0.2, + "step": 19255 + }, + { + "epoch": 1.8139940180400838, + "grad_norm": 0.7247177362442017, + "learning_rate": 4.4097160416158525e-07, + "loss": 0.209, + "step": 19256 + }, + { + "epoch": 1.8140882221332517, + "grad_norm": 0.6345008611679077, + "learning_rate": 4.4052824635242296e-07, + "loss": 0.1965, + "step": 19257 + }, + { + "epoch": 1.8141824262264197, + "grad_norm": 0.6915823221206665, + "learning_rate": 4.4008510651400215e-07, + "loss": 0.2105, + "step": 19258 + }, + { + "epoch": 1.8142766303195874, + "grad_norm": 0.7735414505004883, + "learning_rate": 4.396421846564236e-07, + "loss": 0.2186, + "step": 19259 + }, + { + "epoch": 1.8143708344127552, + "grad_norm": 0.6169028282165527, + "learning_rate": 4.391994807897892e-07, + "loss": 0.1835, + "step": 19260 + }, + { + "epoch": 1.8144650385059231, + "grad_norm": 0.807994544506073, + "learning_rate": 4.387569949241943e-07, + "loss": 0.2006, + "step": 19261 + }, + { + "epoch": 1.814559242599091, + "grad_norm": 0.6707485914230347, + "learning_rate": 4.383147270697252e-07, + "loss": 0.1995, + "step": 19262 + }, + { + "epoch": 1.8146534466922588, + "grad_norm": 0.6820725202560425, + "learning_rate": 4.378726772364672e-07, + "loss": 0.1934, + "step": 19263 + }, + { + "epoch": 1.8147476507854265, + "grad_norm": 0.6363739371299744, + "learning_rate": 4.374308454345022e-07, + "loss": 0.1836, + "step": 19264 + }, + { + "epoch": 1.8148418548785945, + "grad_norm": 0.7199511528015137, + "learning_rate": 4.369892316739022e-07, + "loss": 0.1984, + "step": 19265 + }, + { + "epoch": 1.8149360589717625, + "grad_norm": 0.6855634450912476, + "learning_rate": 4.365478359647368e-07, + "loss": 0.2045, + "step": 19266 + }, + { + "epoch": 1.8150302630649302, + "grad_norm": 0.6977314949035645, + "learning_rate": 4.361066583170703e-07, + "loss": 0.195, + "step": 19267 + }, + { + "epoch": 1.815124467158098, + "grad_norm": 0.7039359211921692, + "learning_rate": 4.3566569874096356e-07, + "loss": 0.2127, + "step": 19268 + }, + { + "epoch": 1.815218671251266, + "grad_norm": 0.6829505562782288, + "learning_rate": 4.352249572464695e-07, + "loss": 0.2056, + "step": 19269 + }, + { + "epoch": 1.8153128753444339, + "grad_norm": 0.7237241268157959, + "learning_rate": 4.347844338436391e-07, + "loss": 0.2045, + "step": 19270 + }, + { + "epoch": 1.8154070794376016, + "grad_norm": 0.7656180262565613, + "learning_rate": 4.343441285425176e-07, + "loss": 0.2182, + "step": 19271 + }, + { + "epoch": 1.8155012835307693, + "grad_norm": 0.6508693695068359, + "learning_rate": 4.339040413531426e-07, + "loss": 0.2191, + "step": 19272 + }, + { + "epoch": 1.8155954876239373, + "grad_norm": 0.5876890420913696, + "learning_rate": 4.3346417228554927e-07, + "loss": 0.1686, + "step": 19273 + }, + { + "epoch": 1.8156896917171053, + "grad_norm": 0.8887391686439514, + "learning_rate": 4.330245213497686e-07, + "loss": 0.1815, + "step": 19274 + }, + { + "epoch": 1.815783895810273, + "grad_norm": 0.674713671207428, + "learning_rate": 4.325850885558236e-07, + "loss": 0.1841, + "step": 19275 + }, + { + "epoch": 1.8158780999034407, + "grad_norm": 0.6085453629493713, + "learning_rate": 4.321458739137352e-07, + "loss": 0.192, + "step": 19276 + }, + { + "epoch": 1.8159723039966087, + "grad_norm": 0.7171904444694519, + "learning_rate": 4.317068774335187e-07, + "loss": 0.2017, + "step": 19277 + }, + { + "epoch": 1.8160665080897767, + "grad_norm": 0.6836258769035339, + "learning_rate": 4.3126809912518274e-07, + "loss": 0.2024, + "step": 19278 + }, + { + "epoch": 1.8161607121829444, + "grad_norm": 0.7146151065826416, + "learning_rate": 4.308295389987305e-07, + "loss": 0.2202, + "step": 19279 + }, + { + "epoch": 1.8162549162761121, + "grad_norm": 0.6684953570365906, + "learning_rate": 4.303911970641661e-07, + "loss": 0.2119, + "step": 19280 + }, + { + "epoch": 1.81634912036928, + "grad_norm": 0.6618233323097229, + "learning_rate": 4.2995307333148273e-07, + "loss": 0.2026, + "step": 19281 + }, + { + "epoch": 1.816443324462448, + "grad_norm": 0.680989146232605, + "learning_rate": 4.2951516781066574e-07, + "loss": 0.1818, + "step": 19282 + }, + { + "epoch": 1.8165375285556158, + "grad_norm": 0.7819607257843018, + "learning_rate": 4.2907748051170816e-07, + "loss": 0.1894, + "step": 19283 + }, + { + "epoch": 1.8166317326487835, + "grad_norm": 0.6956021189689636, + "learning_rate": 4.2864001144458435e-07, + "loss": 0.1937, + "step": 19284 + }, + { + "epoch": 1.8167259367419515, + "grad_norm": 0.6716555953025818, + "learning_rate": 4.282027606192696e-07, + "loss": 0.1866, + "step": 19285 + }, + { + "epoch": 1.8168201408351194, + "grad_norm": 0.6552572846412659, + "learning_rate": 4.277657280457359e-07, + "loss": 0.1902, + "step": 19286 + }, + { + "epoch": 1.8169143449282872, + "grad_norm": 0.702485978603363, + "learning_rate": 4.2732891373394757e-07, + "loss": 0.2184, + "step": 19287 + }, + { + "epoch": 1.817008549021455, + "grad_norm": 0.6234025359153748, + "learning_rate": 4.268923176938633e-07, + "loss": 0.1775, + "step": 19288 + }, + { + "epoch": 1.8171027531146229, + "grad_norm": 0.6936768293380737, + "learning_rate": 4.2645593993543953e-07, + "loss": 0.1994, + "step": 19289 + }, + { + "epoch": 1.8171969572077908, + "grad_norm": 0.6392292380332947, + "learning_rate": 4.2601978046862833e-07, + "loss": 0.1835, + "step": 19290 + }, + { + "epoch": 1.8172911613009586, + "grad_norm": 0.6391674280166626, + "learning_rate": 4.2558383930336954e-07, + "loss": 0.1804, + "step": 19291 + }, + { + "epoch": 1.8173853653941263, + "grad_norm": 0.6619796752929688, + "learning_rate": 4.2514811644960743e-07, + "loss": 0.1971, + "step": 19292 + }, + { + "epoch": 1.8174795694872943, + "grad_norm": 0.6730448603630066, + "learning_rate": 4.2471261191727733e-07, + "loss": 0.188, + "step": 19293 + }, + { + "epoch": 1.8175737735804622, + "grad_norm": 0.7545876502990723, + "learning_rate": 4.242773257163069e-07, + "loss": 0.1949, + "step": 19294 + }, + { + "epoch": 1.8176679776736298, + "grad_norm": 0.6351592540740967, + "learning_rate": 4.238422578566226e-07, + "loss": 0.1849, + "step": 19295 + }, + { + "epoch": 1.8177621817667977, + "grad_norm": 0.7674629092216492, + "learning_rate": 4.2340740834814433e-07, + "loss": 0.1879, + "step": 19296 + }, + { + "epoch": 1.8178563858599657, + "grad_norm": 0.664547860622406, + "learning_rate": 4.229727772007886e-07, + "loss": 0.1951, + "step": 19297 + }, + { + "epoch": 1.8179505899531334, + "grad_norm": 0.6626757979393005, + "learning_rate": 4.2253836442446406e-07, + "loss": 0.1921, + "step": 19298 + }, + { + "epoch": 1.8180447940463011, + "grad_norm": 0.6940608024597168, + "learning_rate": 4.2210417002907736e-07, + "loss": 0.1735, + "step": 19299 + }, + { + "epoch": 1.818138998139469, + "grad_norm": 0.6468650698661804, + "learning_rate": 4.21670194024526e-07, + "loss": 0.1775, + "step": 19300 + }, + { + "epoch": 1.818233202232637, + "grad_norm": 0.650922417640686, + "learning_rate": 4.2123643642071e-07, + "loss": 0.1883, + "step": 19301 + }, + { + "epoch": 1.8183274063258048, + "grad_norm": 0.6870720982551575, + "learning_rate": 4.208028972275158e-07, + "loss": 0.2107, + "step": 19302 + }, + { + "epoch": 1.8184216104189725, + "grad_norm": 0.6382945775985718, + "learning_rate": 4.20369576454831e-07, + "loss": 0.194, + "step": 19303 + }, + { + "epoch": 1.8185158145121405, + "grad_norm": 0.6483957171440125, + "learning_rate": 4.1993647411253336e-07, + "loss": 0.1921, + "step": 19304 + }, + { + "epoch": 1.8186100186053085, + "grad_norm": 0.6496073007583618, + "learning_rate": 4.195035902105027e-07, + "loss": 0.1764, + "step": 19305 + }, + { + "epoch": 1.8187042226984762, + "grad_norm": 0.7171269059181213, + "learning_rate": 4.190709247586044e-07, + "loss": 0.1975, + "step": 19306 + }, + { + "epoch": 1.818798426791644, + "grad_norm": 0.5712437033653259, + "learning_rate": 4.186384777667063e-07, + "loss": 0.196, + "step": 19307 + }, + { + "epoch": 1.818892630884812, + "grad_norm": 0.6560826897621155, + "learning_rate": 4.1820624924466926e-07, + "loss": 0.2195, + "step": 19308 + }, + { + "epoch": 1.8189868349779799, + "grad_norm": 0.6667826175689697, + "learning_rate": 4.1777423920234763e-07, + "loss": 0.1854, + "step": 19309 + }, + { + "epoch": 1.8190810390711476, + "grad_norm": 0.6026217341423035, + "learning_rate": 4.173424476495924e-07, + "loss": 0.2189, + "step": 19310 + }, + { + "epoch": 1.8191752431643153, + "grad_norm": 0.6463965177536011, + "learning_rate": 4.1691087459625136e-07, + "loss": 0.1802, + "step": 19311 + }, + { + "epoch": 1.8192694472574833, + "grad_norm": 0.6469002366065979, + "learning_rate": 4.1647952005216096e-07, + "loss": 0.188, + "step": 19312 + }, + { + "epoch": 1.8193636513506513, + "grad_norm": 0.6489649415016174, + "learning_rate": 4.16048384027159e-07, + "loss": 0.1836, + "step": 19313 + }, + { + "epoch": 1.819457855443819, + "grad_norm": 0.5855406522750854, + "learning_rate": 4.1561746653107635e-07, + "loss": 0.1832, + "step": 19314 + }, + { + "epoch": 1.8195520595369867, + "grad_norm": 0.7545716166496277, + "learning_rate": 4.1518676757373753e-07, + "loss": 0.2094, + "step": 19315 + }, + { + "epoch": 1.8196462636301547, + "grad_norm": 0.6075674295425415, + "learning_rate": 4.147562871649624e-07, + "loss": 0.1663, + "step": 19316 + }, + { + "epoch": 1.8197404677233227, + "grad_norm": 0.5916205644607544, + "learning_rate": 4.143260253145709e-07, + "loss": 0.2141, + "step": 19317 + }, + { + "epoch": 1.8198346718164904, + "grad_norm": 0.7186866402626038, + "learning_rate": 4.138959820323696e-07, + "loss": 0.1988, + "step": 19318 + }, + { + "epoch": 1.8199288759096581, + "grad_norm": 0.6810881495475769, + "learning_rate": 4.1346615732816173e-07, + "loss": 0.1868, + "step": 19319 + }, + { + "epoch": 1.820023080002826, + "grad_norm": 0.6811802387237549, + "learning_rate": 4.1303655121175515e-07, + "loss": 0.1989, + "step": 19320 + }, + { + "epoch": 1.820117284095994, + "grad_norm": 0.6818514466285706, + "learning_rate": 4.1260716369294073e-07, + "loss": 0.187, + "step": 19321 + }, + { + "epoch": 1.8202114881891618, + "grad_norm": 0.6336425542831421, + "learning_rate": 4.1217799478150855e-07, + "loss": 0.1848, + "step": 19322 + }, + { + "epoch": 1.8203056922823295, + "grad_norm": 0.6554447412490845, + "learning_rate": 4.117490444872474e-07, + "loss": 0.1853, + "step": 19323 + }, + { + "epoch": 1.8203998963754975, + "grad_norm": 0.643510639667511, + "learning_rate": 4.1132031281993724e-07, + "loss": 0.1783, + "step": 19324 + }, + { + "epoch": 1.8204941004686654, + "grad_norm": 0.7794122099876404, + "learning_rate": 4.1089179978935245e-07, + "loss": 0.2049, + "step": 19325 + }, + { + "epoch": 1.8205883045618332, + "grad_norm": 0.6050351858139038, + "learning_rate": 4.10463505405263e-07, + "loss": 0.1839, + "step": 19326 + }, + { + "epoch": 1.820682508655001, + "grad_norm": 0.6417489051818848, + "learning_rate": 4.100354296774389e-07, + "loss": 0.2121, + "step": 19327 + }, + { + "epoch": 1.8207767127481689, + "grad_norm": 0.5908655524253845, + "learning_rate": 4.096075726156357e-07, + "loss": 0.1775, + "step": 19328 + }, + { + "epoch": 1.8208709168413368, + "grad_norm": 0.6148104667663574, + "learning_rate": 4.091799342296121e-07, + "loss": 0.1973, + "step": 19329 + }, + { + "epoch": 1.8209651209345046, + "grad_norm": 0.6370846629142761, + "learning_rate": 4.087525145291205e-07, + "loss": 0.2013, + "step": 19330 + }, + { + "epoch": 1.8210593250276723, + "grad_norm": 0.6966099739074707, + "learning_rate": 4.083253135239029e-07, + "loss": 0.2336, + "step": 19331 + }, + { + "epoch": 1.8211535291208403, + "grad_norm": 0.672433078289032, + "learning_rate": 4.078983312237017e-07, + "loss": 0.2297, + "step": 19332 + }, + { + "epoch": 1.8212477332140082, + "grad_norm": 0.6689161062240601, + "learning_rate": 4.074715676382546e-07, + "loss": 0.196, + "step": 19333 + }, + { + "epoch": 1.821341937307176, + "grad_norm": 0.680631160736084, + "learning_rate": 4.070450227772893e-07, + "loss": 0.1977, + "step": 19334 + }, + { + "epoch": 1.8214361414003437, + "grad_norm": 0.6848549842834473, + "learning_rate": 4.0661869665053476e-07, + "loss": 0.1986, + "step": 19335 + }, + { + "epoch": 1.8215303454935117, + "grad_norm": 0.6180917620658875, + "learning_rate": 4.0619258926770877e-07, + "loss": 0.1795, + "step": 19336 + }, + { + "epoch": 1.8216245495866796, + "grad_norm": 0.611783504486084, + "learning_rate": 4.0576670063852907e-07, + "loss": 0.172, + "step": 19337 + }, + { + "epoch": 1.8217187536798474, + "grad_norm": 0.6417288780212402, + "learning_rate": 4.053410307727079e-07, + "loss": 0.2153, + "step": 19338 + }, + { + "epoch": 1.821812957773015, + "grad_norm": 0.6193172931671143, + "learning_rate": 4.0491557967994866e-07, + "loss": 0.1822, + "step": 19339 + }, + { + "epoch": 1.821907161866183, + "grad_norm": 0.7174019813537598, + "learning_rate": 4.044903473699524e-07, + "loss": 0.2116, + "step": 19340 + }, + { + "epoch": 1.822001365959351, + "grad_norm": 0.6933748722076416, + "learning_rate": 4.0406533385241807e-07, + "loss": 0.2008, + "step": 19341 + }, + { + "epoch": 1.8220955700525188, + "grad_norm": 0.6742868423461914, + "learning_rate": 4.036405391370324e-07, + "loss": 0.2109, + "step": 19342 + }, + { + "epoch": 1.8221897741456865, + "grad_norm": 0.6713540554046631, + "learning_rate": 4.032159632334853e-07, + "loss": 0.1746, + "step": 19343 + }, + { + "epoch": 1.8222839782388545, + "grad_norm": 0.6660988926887512, + "learning_rate": 4.0279160615145476e-07, + "loss": 0.2031, + "step": 19344 + }, + { + "epoch": 1.8223781823320224, + "grad_norm": 0.6715826988220215, + "learning_rate": 4.023674679006184e-07, + "loss": 0.1947, + "step": 19345 + }, + { + "epoch": 1.8224723864251902, + "grad_norm": 0.5954000353813171, + "learning_rate": 4.0194354849064645e-07, + "loss": 0.1786, + "step": 19346 + }, + { + "epoch": 1.822566590518358, + "grad_norm": 0.675621509552002, + "learning_rate": 4.0151984793120546e-07, + "loss": 0.2103, + "step": 19347 + }, + { + "epoch": 1.8226607946115259, + "grad_norm": 0.6137824654579163, + "learning_rate": 4.010963662319567e-07, + "loss": 0.1714, + "step": 19348 + }, + { + "epoch": 1.8227549987046938, + "grad_norm": 0.6363803744316101, + "learning_rate": 4.006731034025546e-07, + "loss": 0.1876, + "step": 19349 + }, + { + "epoch": 1.8228492027978616, + "grad_norm": 0.6920458078384399, + "learning_rate": 4.002500594526526e-07, + "loss": 0.201, + "step": 19350 + }, + { + "epoch": 1.8229434068910293, + "grad_norm": 0.7287300229072571, + "learning_rate": 3.9982723439189517e-07, + "loss": 0.2102, + "step": 19351 + }, + { + "epoch": 1.8230376109841973, + "grad_norm": 0.6533631682395935, + "learning_rate": 3.9940462822992354e-07, + "loss": 0.1668, + "step": 19352 + }, + { + "epoch": 1.8231318150773652, + "grad_norm": 0.6046972870826721, + "learning_rate": 3.989822409763733e-07, + "loss": 0.1712, + "step": 19353 + }, + { + "epoch": 1.823226019170533, + "grad_norm": 0.7671160101890564, + "learning_rate": 3.985600726408778e-07, + "loss": 0.2099, + "step": 19354 + }, + { + "epoch": 1.8233202232637007, + "grad_norm": 0.7179263234138489, + "learning_rate": 3.9813812323306166e-07, + "loss": 0.2026, + "step": 19355 + }, + { + "epoch": 1.8234144273568687, + "grad_norm": 0.6337263584136963, + "learning_rate": 3.977163927625438e-07, + "loss": 0.1701, + "step": 19356 + }, + { + "epoch": 1.8235086314500366, + "grad_norm": 0.9400070905685425, + "learning_rate": 3.9729488123894435e-07, + "loss": 0.1918, + "step": 19357 + }, + { + "epoch": 1.8236028355432043, + "grad_norm": 0.716310977935791, + "learning_rate": 3.968735886718722e-07, + "loss": 0.1817, + "step": 19358 + }, + { + "epoch": 1.823697039636372, + "grad_norm": 0.6727654337882996, + "learning_rate": 3.9645251507093197e-07, + "loss": 0.1958, + "step": 19359 + }, + { + "epoch": 1.82379124372954, + "grad_norm": 0.7326470613479614, + "learning_rate": 3.960316604457282e-07, + "loss": 0.2024, + "step": 19360 + }, + { + "epoch": 1.823885447822708, + "grad_norm": 0.5652415752410889, + "learning_rate": 3.9561102480585644e-07, + "loss": 0.1667, + "step": 19361 + }, + { + "epoch": 1.8239796519158757, + "grad_norm": 0.6441262364387512, + "learning_rate": 3.951906081609036e-07, + "loss": 0.1873, + "step": 19362 + }, + { + "epoch": 1.8240738560090435, + "grad_norm": 0.6255739331245422, + "learning_rate": 3.947704105204619e-07, + "loss": 0.172, + "step": 19363 + }, + { + "epoch": 1.8241680601022114, + "grad_norm": 0.6597809195518494, + "learning_rate": 3.9435043189410935e-07, + "loss": 0.1958, + "step": 19364 + }, + { + "epoch": 1.8242622641953794, + "grad_norm": 0.674697995185852, + "learning_rate": 3.9393067229142045e-07, + "loss": 0.1795, + "step": 19365 + }, + { + "epoch": 1.8243564682885471, + "grad_norm": 0.6789885759353638, + "learning_rate": 3.9351113172196976e-07, + "loss": 0.1844, + "step": 19366 + }, + { + "epoch": 1.8244506723817149, + "grad_norm": 0.6318550705909729, + "learning_rate": 3.930918101953218e-07, + "loss": 0.1812, + "step": 19367 + }, + { + "epoch": 1.8245448764748828, + "grad_norm": 0.6167610883712769, + "learning_rate": 3.926727077210379e-07, + "loss": 0.2021, + "step": 19368 + }, + { + "epoch": 1.8246390805680508, + "grad_norm": 0.6036345958709717, + "learning_rate": 3.9225382430867377e-07, + "loss": 0.1781, + "step": 19369 + }, + { + "epoch": 1.8247332846612185, + "grad_norm": 0.6404289603233337, + "learning_rate": 3.9183515996778163e-07, + "loss": 0.168, + "step": 19370 + }, + { + "epoch": 1.8248274887543863, + "grad_norm": 0.6928600668907166, + "learning_rate": 3.914167147079073e-07, + "loss": 0.1899, + "step": 19371 + }, + { + "epoch": 1.8249216928475542, + "grad_norm": 0.6132739186286926, + "learning_rate": 3.909984885385909e-07, + "loss": 0.1929, + "step": 19372 + }, + { + "epoch": 1.8250158969407222, + "grad_norm": 0.6557868123054504, + "learning_rate": 3.9058048146937144e-07, + "loss": 0.1857, + "step": 19373 + }, + { + "epoch": 1.82511010103389, + "grad_norm": 0.6878867149353027, + "learning_rate": 3.9016269350977574e-07, + "loss": 0.2068, + "step": 19374 + }, + { + "epoch": 1.8252043051270577, + "grad_norm": 0.6477445363998413, + "learning_rate": 3.897451246693351e-07, + "loss": 0.229, + "step": 19375 + }, + { + "epoch": 1.8252985092202256, + "grad_norm": 0.6238030195236206, + "learning_rate": 3.8932777495756633e-07, + "loss": 0.1667, + "step": 19376 + }, + { + "epoch": 1.8253927133133936, + "grad_norm": 0.6949501037597656, + "learning_rate": 3.889106443839874e-07, + "loss": 0.1909, + "step": 19377 + }, + { + "epoch": 1.8254869174065613, + "grad_norm": 0.627713143825531, + "learning_rate": 3.884937329581118e-07, + "loss": 0.183, + "step": 19378 + }, + { + "epoch": 1.825581121499729, + "grad_norm": 0.7722302675247192, + "learning_rate": 3.8807704068944075e-07, + "loss": 0.2147, + "step": 19379 + }, + { + "epoch": 1.825675325592897, + "grad_norm": 0.6550974249839783, + "learning_rate": 3.8766056758748004e-07, + "loss": 0.198, + "step": 19380 + }, + { + "epoch": 1.825769529686065, + "grad_norm": 0.7231143116950989, + "learning_rate": 3.872443136617243e-07, + "loss": 0.1825, + "step": 19381 + }, + { + "epoch": 1.8258637337792327, + "grad_norm": 0.6727703213691711, + "learning_rate": 3.8682827892166373e-07, + "loss": 0.1667, + "step": 19382 + }, + { + "epoch": 1.8259579378724005, + "grad_norm": 0.6196810007095337, + "learning_rate": 3.8641246337678627e-07, + "loss": 0.19, + "step": 19383 + }, + { + "epoch": 1.8260521419655684, + "grad_norm": 0.6855287551879883, + "learning_rate": 3.85996867036571e-07, + "loss": 0.1965, + "step": 19384 + }, + { + "epoch": 1.8261463460587364, + "grad_norm": 0.6707298159599304, + "learning_rate": 3.8558148991049704e-07, + "loss": 0.2028, + "step": 19385 + }, + { + "epoch": 1.8262405501519041, + "grad_norm": 0.7413190007209778, + "learning_rate": 3.8516633200803346e-07, + "loss": 0.2143, + "step": 19386 + }, + { + "epoch": 1.8263347542450719, + "grad_norm": 0.6734809875488281, + "learning_rate": 3.8475139333864597e-07, + "loss": 0.1849, + "step": 19387 + }, + { + "epoch": 1.8264289583382398, + "grad_norm": 0.7256351709365845, + "learning_rate": 3.8433667391179927e-07, + "loss": 0.1967, + "step": 19388 + }, + { + "epoch": 1.8265231624314078, + "grad_norm": 0.6640051603317261, + "learning_rate": 3.8392217373694583e-07, + "loss": 0.2081, + "step": 19389 + }, + { + "epoch": 1.8266173665245755, + "grad_norm": 0.7027711272239685, + "learning_rate": 3.8350789282353805e-07, + "loss": 0.2062, + "step": 19390 + }, + { + "epoch": 1.8267115706177433, + "grad_norm": 0.6376315951347351, + "learning_rate": 3.8309383118102396e-07, + "loss": 0.1711, + "step": 19391 + }, + { + "epoch": 1.8268057747109112, + "grad_norm": 0.6850985288619995, + "learning_rate": 3.826799888188426e-07, + "loss": 0.1751, + "step": 19392 + }, + { + "epoch": 1.8268999788040792, + "grad_norm": 0.7391743659973145, + "learning_rate": 3.8226636574642983e-07, + "loss": 0.2025, + "step": 19393 + }, + { + "epoch": 1.826994182897247, + "grad_norm": 0.7282021045684814, + "learning_rate": 3.818529619732203e-07, + "loss": 0.1604, + "step": 19394 + }, + { + "epoch": 1.8270883869904146, + "grad_norm": 0.6102774739265442, + "learning_rate": 3.814397775086376e-07, + "loss": 0.1807, + "step": 19395 + }, + { + "epoch": 1.8271825910835826, + "grad_norm": 0.6688569784164429, + "learning_rate": 3.810268123621008e-07, + "loss": 0.1938, + "step": 19396 + }, + { + "epoch": 1.8272767951767506, + "grad_norm": 0.6612884998321533, + "learning_rate": 3.8061406654303134e-07, + "loss": 0.1826, + "step": 19397 + }, + { + "epoch": 1.8273709992699183, + "grad_norm": 0.667577862739563, + "learning_rate": 3.802015400608372e-07, + "loss": 0.2058, + "step": 19398 + }, + { + "epoch": 1.827465203363086, + "grad_norm": 0.5990484356880188, + "learning_rate": 3.797892329249231e-07, + "loss": 0.167, + "step": 19399 + }, + { + "epoch": 1.827559407456254, + "grad_norm": 0.7349708080291748, + "learning_rate": 3.793771451446948e-07, + "loss": 0.2224, + "step": 19400 + }, + { + "epoch": 1.827653611549422, + "grad_norm": 0.6905130743980408, + "learning_rate": 3.7896527672954707e-07, + "loss": 0.1711, + "step": 19401 + }, + { + "epoch": 1.8277478156425897, + "grad_norm": 0.7009580731391907, + "learning_rate": 3.7855362768886684e-07, + "loss": 0.2093, + "step": 19402 + }, + { + "epoch": 1.8278420197357574, + "grad_norm": 0.6246546506881714, + "learning_rate": 3.7814219803204654e-07, + "loss": 0.1887, + "step": 19403 + }, + { + "epoch": 1.8279362238289254, + "grad_norm": 0.6888798475265503, + "learning_rate": 3.777309877684654e-07, + "loss": 0.1849, + "step": 19404 + }, + { + "epoch": 1.8280304279220934, + "grad_norm": 0.6739806532859802, + "learning_rate": 3.773199969074959e-07, + "loss": 0.1991, + "step": 19405 + }, + { + "epoch": 1.828124632015261, + "grad_norm": 0.6582845449447632, + "learning_rate": 3.769092254585138e-07, + "loss": 0.1862, + "step": 19406 + }, + { + "epoch": 1.8282188361084288, + "grad_norm": 0.6401177048683167, + "learning_rate": 3.76498673430884e-07, + "loss": 0.1936, + "step": 19407 + }, + { + "epoch": 1.8283130402015968, + "grad_norm": 0.7239953279495239, + "learning_rate": 3.760883408339666e-07, + "loss": 0.1838, + "step": 19408 + }, + { + "epoch": 1.8284072442947648, + "grad_norm": 1.277886986732483, + "learning_rate": 3.756782276771187e-07, + "loss": 0.188, + "step": 19409 + }, + { + "epoch": 1.8285014483879325, + "grad_norm": 0.6182141304016113, + "learning_rate": 3.752683339696928e-07, + "loss": 0.1804, + "step": 19410 + }, + { + "epoch": 1.8285956524811002, + "grad_norm": 0.6419548392295837, + "learning_rate": 3.748586597210324e-07, + "loss": 0.2021, + "step": 19411 + }, + { + "epoch": 1.8286898565742682, + "grad_norm": 0.7066452503204346, + "learning_rate": 3.744492049404802e-07, + "loss": 0.2029, + "step": 19412 + }, + { + "epoch": 1.8287840606674362, + "grad_norm": 0.650208055973053, + "learning_rate": 3.740399696373742e-07, + "loss": 0.1919, + "step": 19413 + }, + { + "epoch": 1.8288782647606039, + "grad_norm": 0.6237804889678955, + "learning_rate": 3.7363095382104143e-07, + "loss": 0.1668, + "step": 19414 + }, + { + "epoch": 1.8289724688537716, + "grad_norm": 0.6652894020080566, + "learning_rate": 3.7322215750081214e-07, + "loss": 0.2056, + "step": 19415 + }, + { + "epoch": 1.8290666729469396, + "grad_norm": 0.6470969915390015, + "learning_rate": 3.728135806860045e-07, + "loss": 0.2095, + "step": 19416 + }, + { + "epoch": 1.8291608770401075, + "grad_norm": 0.6608524322509766, + "learning_rate": 3.7240522338593655e-07, + "loss": 0.1808, + "step": 19417 + }, + { + "epoch": 1.8292550811332753, + "grad_norm": 0.6656803488731384, + "learning_rate": 3.7199708560991974e-07, + "loss": 0.199, + "step": 19418 + }, + { + "epoch": 1.829349285226443, + "grad_norm": 0.6752721667289734, + "learning_rate": 3.715891673672578e-07, + "loss": 0.1808, + "step": 19419 + }, + { + "epoch": 1.829443489319611, + "grad_norm": 0.6856604218482971, + "learning_rate": 3.7118146866725433e-07, + "loss": 0.1986, + "step": 19420 + }, + { + "epoch": 1.829537693412779, + "grad_norm": 0.622015655040741, + "learning_rate": 3.707739895192042e-07, + "loss": 0.2018, + "step": 19421 + }, + { + "epoch": 1.8296318975059467, + "grad_norm": 0.7418761849403381, + "learning_rate": 3.703667299323988e-07, + "loss": 0.1864, + "step": 19422 + }, + { + "epoch": 1.8297261015991144, + "grad_norm": 0.8238862156867981, + "learning_rate": 3.6995968991612505e-07, + "loss": 0.1876, + "step": 19423 + }, + { + "epoch": 1.8298203056922824, + "grad_norm": 0.5917040109634399, + "learning_rate": 3.695528694796624e-07, + "loss": 0.1714, + "step": 19424 + }, + { + "epoch": 1.8299145097854503, + "grad_norm": 0.6978359818458557, + "learning_rate": 3.6914626863229e-07, + "loss": 0.1976, + "step": 19425 + }, + { + "epoch": 1.830008713878618, + "grad_norm": 0.6442034840583801, + "learning_rate": 3.687398873832759e-07, + "loss": 0.1762, + "step": 19426 + }, + { + "epoch": 1.8301029179717858, + "grad_norm": 0.6447684168815613, + "learning_rate": 3.683337257418873e-07, + "loss": 0.1844, + "step": 19427 + }, + { + "epoch": 1.8301971220649538, + "grad_norm": 0.7106531262397766, + "learning_rate": 3.679277837173856e-07, + "loss": 0.1863, + "step": 19428 + }, + { + "epoch": 1.8302913261581217, + "grad_norm": 0.6524806618690491, + "learning_rate": 3.6752206131902666e-07, + "loss": 0.2002, + "step": 19429 + }, + { + "epoch": 1.8303855302512895, + "grad_norm": 0.575257420539856, + "learning_rate": 3.67116558556061e-07, + "loss": 0.1731, + "step": 19430 + }, + { + "epoch": 1.8304797343444572, + "grad_norm": 0.7416486144065857, + "learning_rate": 3.6671127543773667e-07, + "loss": 0.1919, + "step": 19431 + }, + { + "epoch": 1.8305739384376252, + "grad_norm": 0.5736696124076843, + "learning_rate": 3.6630621197329297e-07, + "loss": 0.1704, + "step": 19432 + }, + { + "epoch": 1.830668142530793, + "grad_norm": 0.7141389846801758, + "learning_rate": 3.6590136817196585e-07, + "loss": 0.188, + "step": 19433 + }, + { + "epoch": 1.8307623466239606, + "grad_norm": 0.6803520321846008, + "learning_rate": 3.6549674404298796e-07, + "loss": 0.2003, + "step": 19434 + }, + { + "epoch": 1.8308565507171286, + "grad_norm": 0.6170375347137451, + "learning_rate": 3.650923395955841e-07, + "loss": 0.2045, + "step": 19435 + }, + { + "epoch": 1.8309507548102966, + "grad_norm": 0.7466601729393005, + "learning_rate": 3.646881548389736e-07, + "loss": 0.1842, + "step": 19436 + }, + { + "epoch": 1.8310449589034643, + "grad_norm": 0.6154623627662659, + "learning_rate": 3.642841897823768e-07, + "loss": 0.1744, + "step": 19437 + }, + { + "epoch": 1.831139162996632, + "grad_norm": 1.2468785047531128, + "learning_rate": 3.6388044443500306e-07, + "loss": 0.1646, + "step": 19438 + }, + { + "epoch": 1.8312333670898, + "grad_norm": 0.741445779800415, + "learning_rate": 3.6347691880605494e-07, + "loss": 0.2029, + "step": 19439 + }, + { + "epoch": 1.831327571182968, + "grad_norm": 0.6602708101272583, + "learning_rate": 3.630736129047385e-07, + "loss": 0.1977, + "step": 19440 + }, + { + "epoch": 1.8314217752761357, + "grad_norm": 0.6252934336662292, + "learning_rate": 3.626705267402475e-07, + "loss": 0.1898, + "step": 19441 + }, + { + "epoch": 1.8315159793693034, + "grad_norm": 0.720191478729248, + "learning_rate": 3.622676603217701e-07, + "loss": 0.194, + "step": 19442 + }, + { + "epoch": 1.8316101834624714, + "grad_norm": 0.6151571273803711, + "learning_rate": 3.618650136584978e-07, + "loss": 0.1842, + "step": 19443 + }, + { + "epoch": 1.8317043875556394, + "grad_norm": 0.6189284324645996, + "learning_rate": 3.614625867596089e-07, + "loss": 0.1729, + "step": 19444 + }, + { + "epoch": 1.831798591648807, + "grad_norm": 0.6306717991828918, + "learning_rate": 3.610603796342782e-07, + "loss": 0.1664, + "step": 19445 + }, + { + "epoch": 1.8318927957419748, + "grad_norm": 0.6504979133605957, + "learning_rate": 3.606583922916773e-07, + "loss": 0.178, + "step": 19446 + }, + { + "epoch": 1.8319869998351428, + "grad_norm": 0.6690945625305176, + "learning_rate": 3.602566247409744e-07, + "loss": 0.1962, + "step": 19447 + }, + { + "epoch": 1.8320812039283108, + "grad_norm": 0.6376203298568726, + "learning_rate": 3.5985507699132673e-07, + "loss": 0.161, + "step": 19448 + }, + { + "epoch": 1.8321754080214785, + "grad_norm": 0.6145581603050232, + "learning_rate": 3.5945374905189236e-07, + "loss": 0.1756, + "step": 19449 + }, + { + "epoch": 1.8322696121146462, + "grad_norm": 0.7321510314941406, + "learning_rate": 3.5905264093182293e-07, + "loss": 0.1899, + "step": 19450 + }, + { + "epoch": 1.8323638162078142, + "grad_norm": 0.7382429242134094, + "learning_rate": 3.586517526402622e-07, + "loss": 0.2289, + "step": 19451 + }, + { + "epoch": 1.8324580203009821, + "grad_norm": 0.6109058260917664, + "learning_rate": 3.582510841863529e-07, + "loss": 0.1925, + "step": 19452 + }, + { + "epoch": 1.8325522243941499, + "grad_norm": 0.6512452363967896, + "learning_rate": 3.5785063557923106e-07, + "loss": 0.2179, + "step": 19453 + }, + { + "epoch": 1.8326464284873176, + "grad_norm": 0.6336072683334351, + "learning_rate": 3.574504068280249e-07, + "loss": 0.1848, + "step": 19454 + }, + { + "epoch": 1.8327406325804856, + "grad_norm": 0.6218070983886719, + "learning_rate": 3.570503979418627e-07, + "loss": 0.1755, + "step": 19455 + }, + { + "epoch": 1.8328348366736535, + "grad_norm": 0.688758134841919, + "learning_rate": 3.566506089298638e-07, + "loss": 0.1956, + "step": 19456 + }, + { + "epoch": 1.8329290407668213, + "grad_norm": 0.643220067024231, + "learning_rate": 3.562510398011454e-07, + "loss": 0.1834, + "step": 19457 + }, + { + "epoch": 1.833023244859989, + "grad_norm": 0.6620644927024841, + "learning_rate": 3.558516905648179e-07, + "loss": 0.185, + "step": 19458 + }, + { + "epoch": 1.833117448953157, + "grad_norm": 0.7510180473327637, + "learning_rate": 3.554525612299864e-07, + "loss": 0.2435, + "step": 19459 + }, + { + "epoch": 1.833211653046325, + "grad_norm": 0.6576058268547058, + "learning_rate": 3.550536518057535e-07, + "loss": 0.1892, + "step": 19460 + }, + { + "epoch": 1.8333058571394927, + "grad_norm": 0.6471046209335327, + "learning_rate": 3.5465496230121076e-07, + "loss": 0.1922, + "step": 19461 + }, + { + "epoch": 1.8334000612326604, + "grad_norm": 0.6720855832099915, + "learning_rate": 3.5425649272545326e-07, + "loss": 0.1875, + "step": 19462 + }, + { + "epoch": 1.8334942653258284, + "grad_norm": 0.624860942363739, + "learning_rate": 3.538582430875659e-07, + "loss": 0.185, + "step": 19463 + }, + { + "epoch": 1.8335884694189963, + "grad_norm": 0.6520931124687195, + "learning_rate": 3.5346021339662696e-07, + "loss": 0.1967, + "step": 19464 + }, + { + "epoch": 1.833682673512164, + "grad_norm": 0.6448037028312683, + "learning_rate": 3.5306240366171584e-07, + "loss": 0.1984, + "step": 19465 + }, + { + "epoch": 1.8337768776053318, + "grad_norm": 0.6768196225166321, + "learning_rate": 3.526648138918998e-07, + "loss": 0.2234, + "step": 19466 + }, + { + "epoch": 1.8338710816984998, + "grad_norm": 0.6211151480674744, + "learning_rate": 3.522674440962448e-07, + "loss": 0.1543, + "step": 19467 + }, + { + "epoch": 1.8339652857916677, + "grad_norm": 0.66634202003479, + "learning_rate": 3.518702942838148e-07, + "loss": 0.179, + "step": 19468 + }, + { + "epoch": 1.8340594898848355, + "grad_norm": 0.7418509721755981, + "learning_rate": 3.5147336446366254e-07, + "loss": 0.19, + "step": 19469 + }, + { + "epoch": 1.8341536939780032, + "grad_norm": 0.6177558302879333, + "learning_rate": 3.5107665464483854e-07, + "loss": 0.1752, + "step": 19470 + }, + { + "epoch": 1.8342478980711712, + "grad_norm": 0.6559076905250549, + "learning_rate": 3.5068016483639e-07, + "loss": 0.1959, + "step": 19471 + }, + { + "epoch": 1.8343421021643391, + "grad_norm": 0.6523414850234985, + "learning_rate": 3.5028389504735637e-07, + "loss": 0.2168, + "step": 19472 + }, + { + "epoch": 1.8344363062575069, + "grad_norm": 0.6761232614517212, + "learning_rate": 3.498878452867727e-07, + "loss": 0.2221, + "step": 19473 + }, + { + "epoch": 1.8345305103506746, + "grad_norm": 0.6956623792648315, + "learning_rate": 3.494920155636716e-07, + "loss": 0.2094, + "step": 19474 + }, + { + "epoch": 1.8346247144438426, + "grad_norm": 0.6148270964622498, + "learning_rate": 3.490964058870772e-07, + "loss": 0.1809, + "step": 19475 + }, + { + "epoch": 1.8347189185370105, + "grad_norm": 0.7161653637886047, + "learning_rate": 3.487010162660087e-07, + "loss": 0.2107, + "step": 19476 + }, + { + "epoch": 1.8348131226301783, + "grad_norm": 0.6752950549125671, + "learning_rate": 3.4830584670948464e-07, + "loss": 0.197, + "step": 19477 + }, + { + "epoch": 1.834907326723346, + "grad_norm": 0.6317424178123474, + "learning_rate": 3.4791089722651437e-07, + "loss": 0.1743, + "step": 19478 + }, + { + "epoch": 1.835001530816514, + "grad_norm": 0.7135169506072998, + "learning_rate": 3.4751616782610075e-07, + "loss": 0.2119, + "step": 19479 + }, + { + "epoch": 1.835095734909682, + "grad_norm": 0.6575676798820496, + "learning_rate": 3.4712165851724764e-07, + "loss": 0.2017, + "step": 19480 + }, + { + "epoch": 1.8351899390028497, + "grad_norm": 0.6301974058151245, + "learning_rate": 3.467273693089501e-07, + "loss": 0.1984, + "step": 19481 + }, + { + "epoch": 1.8352841430960174, + "grad_norm": 0.6600757837295532, + "learning_rate": 3.463333002101954e-07, + "loss": 0.1607, + "step": 19482 + }, + { + "epoch": 1.8353783471891854, + "grad_norm": 0.6832287311553955, + "learning_rate": 3.4593945122997295e-07, + "loss": 0.1819, + "step": 19483 + }, + { + "epoch": 1.8354725512823533, + "grad_norm": 0.6303820013999939, + "learning_rate": 3.455458223772612e-07, + "loss": 0.1971, + "step": 19484 + }, + { + "epoch": 1.835566755375521, + "grad_norm": 0.6942116618156433, + "learning_rate": 3.4515241366103405e-07, + "loss": 0.2125, + "step": 19485 + }, + { + "epoch": 1.8356609594686888, + "grad_norm": 0.6522420644760132, + "learning_rate": 3.447592250902643e-07, + "loss": 0.195, + "step": 19486 + }, + { + "epoch": 1.8357551635618568, + "grad_norm": 0.6443928480148315, + "learning_rate": 3.4436625667391697e-07, + "loss": 0.1925, + "step": 19487 + }, + { + "epoch": 1.8358493676550247, + "grad_norm": 0.6874093413352966, + "learning_rate": 3.4397350842095054e-07, + "loss": 0.1964, + "step": 19488 + }, + { + "epoch": 1.8359435717481924, + "grad_norm": 0.7090978026390076, + "learning_rate": 3.435809803403223e-07, + "loss": 0.1954, + "step": 19489 + }, + { + "epoch": 1.8360377758413602, + "grad_norm": 0.6059409976005554, + "learning_rate": 3.4318867244098165e-07, + "loss": 0.1786, + "step": 19490 + }, + { + "epoch": 1.8361319799345281, + "grad_norm": 0.6071126461029053, + "learning_rate": 3.427965847318737e-07, + "loss": 0.1904, + "step": 19491 + }, + { + "epoch": 1.836226184027696, + "grad_norm": 0.7138732671737671, + "learning_rate": 3.4240471722193804e-07, + "loss": 0.2023, + "step": 19492 + }, + { + "epoch": 1.8363203881208638, + "grad_norm": 0.7073351144790649, + "learning_rate": 3.4201306992011187e-07, + "loss": 0.1697, + "step": 19493 + }, + { + "epoch": 1.8364145922140316, + "grad_norm": 0.6532337069511414, + "learning_rate": 3.416216428353236e-07, + "loss": 0.1824, + "step": 19494 + }, + { + "epoch": 1.8365087963071995, + "grad_norm": 0.7384734749794006, + "learning_rate": 3.4123043597649953e-07, + "loss": 0.1986, + "step": 19495 + }, + { + "epoch": 1.8366030004003675, + "grad_norm": 0.7011557817459106, + "learning_rate": 3.4083944935255686e-07, + "loss": 0.204, + "step": 19496 + }, + { + "epoch": 1.8366972044935352, + "grad_norm": 0.6641374230384827, + "learning_rate": 3.404486829724141e-07, + "loss": 0.2082, + "step": 19497 + }, + { + "epoch": 1.836791408586703, + "grad_norm": 0.6848839521408081, + "learning_rate": 3.400581368449818e-07, + "loss": 0.1907, + "step": 19498 + }, + { + "epoch": 1.836885612679871, + "grad_norm": 0.6220717430114746, + "learning_rate": 3.396678109791607e-07, + "loss": 0.1855, + "step": 19499 + }, + { + "epoch": 1.836979816773039, + "grad_norm": 0.6117632985115051, + "learning_rate": 3.3927770538385584e-07, + "loss": 0.1911, + "step": 19500 + }, + { + "epoch": 1.8370740208662066, + "grad_norm": 0.6125076413154602, + "learning_rate": 3.3888782006795795e-07, + "loss": 0.17, + "step": 19501 + }, + { + "epoch": 1.8371682249593744, + "grad_norm": 0.7635436654090881, + "learning_rate": 3.384981550403599e-07, + "loss": 0.191, + "step": 19502 + }, + { + "epoch": 1.8372624290525423, + "grad_norm": 0.7081218957901001, + "learning_rate": 3.3810871030994564e-07, + "loss": 0.1809, + "step": 19503 + }, + { + "epoch": 1.8373566331457103, + "grad_norm": 0.6071407794952393, + "learning_rate": 3.377194858855948e-07, + "loss": 0.1706, + "step": 19504 + }, + { + "epoch": 1.837450837238878, + "grad_norm": 0.6723511219024658, + "learning_rate": 3.373304817761835e-07, + "loss": 0.2303, + "step": 19505 + }, + { + "epoch": 1.8375450413320458, + "grad_norm": 0.6099964380264282, + "learning_rate": 3.3694169799058043e-07, + "loss": 0.1877, + "step": 19506 + }, + { + "epoch": 1.8376392454252137, + "grad_norm": 0.629387617111206, + "learning_rate": 3.365531345376505e-07, + "loss": 0.1945, + "step": 19507 + }, + { + "epoch": 1.8377334495183817, + "grad_norm": 0.6301486492156982, + "learning_rate": 3.361647914262545e-07, + "loss": 0.2245, + "step": 19508 + }, + { + "epoch": 1.8378276536115494, + "grad_norm": 0.7205371856689453, + "learning_rate": 3.357766686652464e-07, + "loss": 0.2173, + "step": 19509 + }, + { + "epoch": 1.8379218577047172, + "grad_norm": 0.6755000352859497, + "learning_rate": 3.353887662634758e-07, + "loss": 0.2357, + "step": 19510 + }, + { + "epoch": 1.8380160617978851, + "grad_norm": 0.71434485912323, + "learning_rate": 3.3500108422978906e-07, + "loss": 0.2079, + "step": 19511 + }, + { + "epoch": 1.838110265891053, + "grad_norm": 0.7100639939308167, + "learning_rate": 3.346136225730234e-07, + "loss": 0.2047, + "step": 19512 + }, + { + "epoch": 1.8382044699842208, + "grad_norm": 0.6243544816970825, + "learning_rate": 3.3422638130201526e-07, + "loss": 0.1963, + "step": 19513 + }, + { + "epoch": 1.8382986740773886, + "grad_norm": 0.6586558222770691, + "learning_rate": 3.338393604255952e-07, + "loss": 0.1856, + "step": 19514 + }, + { + "epoch": 1.8383928781705565, + "grad_norm": 0.6588220596313477, + "learning_rate": 3.334525599525862e-07, + "loss": 0.2013, + "step": 19515 + }, + { + "epoch": 1.8384870822637245, + "grad_norm": 0.6576763987541199, + "learning_rate": 3.3306597989180677e-07, + "loss": 0.2188, + "step": 19516 + }, + { + "epoch": 1.8385812863568922, + "grad_norm": 0.682857871055603, + "learning_rate": 3.3267962025207545e-07, + "loss": 0.2009, + "step": 19517 + }, + { + "epoch": 1.83867549045006, + "grad_norm": 0.6771490573883057, + "learning_rate": 3.322934810421996e-07, + "loss": 0.1909, + "step": 19518 + }, + { + "epoch": 1.838769694543228, + "grad_norm": 0.6297563314437866, + "learning_rate": 3.319075622709811e-07, + "loss": 0.1833, + "step": 19519 + }, + { + "epoch": 1.8388638986363959, + "grad_norm": 0.6497674584388733, + "learning_rate": 3.3152186394722506e-07, + "loss": 0.1742, + "step": 19520 + }, + { + "epoch": 1.8389581027295636, + "grad_norm": 0.6712440848350525, + "learning_rate": 3.311363860797223e-07, + "loss": 0.2257, + "step": 19521 + }, + { + "epoch": 1.8390523068227314, + "grad_norm": 0.7431421875953674, + "learning_rate": 3.307511286772613e-07, + "loss": 0.2156, + "step": 19522 + }, + { + "epoch": 1.8391465109158993, + "grad_norm": 0.6532416939735413, + "learning_rate": 3.3036609174863066e-07, + "loss": 0.2286, + "step": 19523 + }, + { + "epoch": 1.8392407150090673, + "grad_norm": 0.6792806386947632, + "learning_rate": 3.299812753026077e-07, + "loss": 0.2126, + "step": 19524 + }, + { + "epoch": 1.839334919102235, + "grad_norm": 0.6522883772850037, + "learning_rate": 3.295966793479655e-07, + "loss": 0.1737, + "step": 19525 + }, + { + "epoch": 1.8394291231954027, + "grad_norm": 0.673279881477356, + "learning_rate": 3.292123038934747e-07, + "loss": 0.1991, + "step": 19526 + }, + { + "epoch": 1.8395233272885707, + "grad_norm": 0.6407334804534912, + "learning_rate": 3.288281489478995e-07, + "loss": 0.1805, + "step": 19527 + }, + { + "epoch": 1.8396175313817387, + "grad_norm": 0.6358546018600464, + "learning_rate": 3.2844421451999954e-07, + "loss": 0.195, + "step": 19528 + }, + { + "epoch": 1.8397117354749064, + "grad_norm": 0.6791768670082092, + "learning_rate": 3.280605006185278e-07, + "loss": 0.1705, + "step": 19529 + }, + { + "epoch": 1.8398059395680741, + "grad_norm": 0.6021818518638611, + "learning_rate": 3.2767700725223617e-07, + "loss": 0.1819, + "step": 19530 + }, + { + "epoch": 1.839900143661242, + "grad_norm": 0.7272765636444092, + "learning_rate": 3.272937344298666e-07, + "loss": 0.1851, + "step": 19531 + }, + { + "epoch": 1.83999434775441, + "grad_norm": 0.6813640594482422, + "learning_rate": 3.269106821601586e-07, + "loss": 0.1815, + "step": 19532 + }, + { + "epoch": 1.8400885518475778, + "grad_norm": 0.685269296169281, + "learning_rate": 3.2652785045184764e-07, + "loss": 0.237, + "step": 19533 + }, + { + "epoch": 1.8401827559407455, + "grad_norm": 0.6834416389465332, + "learning_rate": 3.2614523931366105e-07, + "loss": 0.1951, + "step": 19534 + }, + { + "epoch": 1.8402769600339135, + "grad_norm": 0.6788691282272339, + "learning_rate": 3.2576284875432517e-07, + "loss": 0.2031, + "step": 19535 + }, + { + "epoch": 1.8403711641270815, + "grad_norm": 0.7511125206947327, + "learning_rate": 3.253806787825564e-07, + "loss": 0.2095, + "step": 19536 + }, + { + "epoch": 1.8404653682202492, + "grad_norm": 0.723578929901123, + "learning_rate": 3.249987294070711e-07, + "loss": 0.1858, + "step": 19537 + }, + { + "epoch": 1.840559572313417, + "grad_norm": 0.6681480407714844, + "learning_rate": 3.2461700063657785e-07, + "loss": 0.1839, + "step": 19538 + }, + { + "epoch": 1.840653776406585, + "grad_norm": 0.6754745841026306, + "learning_rate": 3.242354924797786e-07, + "loss": 0.2039, + "step": 19539 + }, + { + "epoch": 1.8407479804997529, + "grad_norm": 0.6273301839828491, + "learning_rate": 3.238542049453763e-07, + "loss": 0.1506, + "step": 19540 + }, + { + "epoch": 1.8408421845929206, + "grad_norm": 0.7100375294685364, + "learning_rate": 3.234731380420608e-07, + "loss": 0.2205, + "step": 19541 + }, + { + "epoch": 1.8409363886860883, + "grad_norm": 0.7500102519989014, + "learning_rate": 3.2309229177852287e-07, + "loss": 0.1939, + "step": 19542 + }, + { + "epoch": 1.8410305927792563, + "grad_norm": 0.6986610889434814, + "learning_rate": 3.227116661634466e-07, + "loss": 0.202, + "step": 19543 + }, + { + "epoch": 1.8411247968724243, + "grad_norm": 0.5962001085281372, + "learning_rate": 3.223312612055107e-07, + "loss": 0.1802, + "step": 19544 + }, + { + "epoch": 1.841219000965592, + "grad_norm": 0.6139877438545227, + "learning_rate": 3.2195107691338933e-07, + "loss": 0.1629, + "step": 19545 + }, + { + "epoch": 1.8413132050587597, + "grad_norm": 0.7145199775695801, + "learning_rate": 3.2157111329574997e-07, + "loss": 0.1788, + "step": 19546 + }, + { + "epoch": 1.8414074091519277, + "grad_norm": 0.6981918811798096, + "learning_rate": 3.211913703612568e-07, + "loss": 0.1853, + "step": 19547 + }, + { + "epoch": 1.8415016132450956, + "grad_norm": 0.6296716332435608, + "learning_rate": 3.208118481185707e-07, + "loss": 0.1902, + "step": 19548 + }, + { + "epoch": 1.8415958173382634, + "grad_norm": 0.7081273198127747, + "learning_rate": 3.204325465763425e-07, + "loss": 0.182, + "step": 19549 + }, + { + "epoch": 1.8416900214314311, + "grad_norm": 0.6704627275466919, + "learning_rate": 3.2005346574322195e-07, + "loss": 0.1726, + "step": 19550 + }, + { + "epoch": 1.841784225524599, + "grad_norm": 0.6365349888801575, + "learning_rate": 3.1967460562785325e-07, + "loss": 0.1836, + "step": 19551 + }, + { + "epoch": 1.841878429617767, + "grad_norm": 0.6927284002304077, + "learning_rate": 3.192959662388739e-07, + "loss": 0.1965, + "step": 19552 + }, + { + "epoch": 1.8419726337109348, + "grad_norm": 0.6339163780212402, + "learning_rate": 3.189175475849171e-07, + "loss": 0.1913, + "step": 19553 + }, + { + "epoch": 1.8420668378041025, + "grad_norm": 0.6798602938652039, + "learning_rate": 3.1853934967461363e-07, + "loss": 0.1805, + "step": 19554 + }, + { + "epoch": 1.8421610418972705, + "grad_norm": 0.5639039874076843, + "learning_rate": 3.1816137251658664e-07, + "loss": 0.1609, + "step": 19555 + }, + { + "epoch": 1.8422552459904384, + "grad_norm": 0.6263942718505859, + "learning_rate": 3.177836161194503e-07, + "loss": 0.1723, + "step": 19556 + }, + { + "epoch": 1.8423494500836062, + "grad_norm": 0.6475573778152466, + "learning_rate": 3.1740608049182444e-07, + "loss": 0.1978, + "step": 19557 + }, + { + "epoch": 1.842443654176774, + "grad_norm": 0.6725324988365173, + "learning_rate": 3.1702876564231434e-07, + "loss": 0.1766, + "step": 19558 + }, + { + "epoch": 1.8425378582699419, + "grad_norm": 0.5613603591918945, + "learning_rate": 3.1665167157952093e-07, + "loss": 0.1604, + "step": 19559 + }, + { + "epoch": 1.8426320623631098, + "grad_norm": 0.6868391036987305, + "learning_rate": 3.162747983120473e-07, + "loss": 0.2246, + "step": 19560 + }, + { + "epoch": 1.8427262664562776, + "grad_norm": 0.6153104305267334, + "learning_rate": 3.1589814584848334e-07, + "loss": 0.1964, + "step": 19561 + }, + { + "epoch": 1.8428204705494453, + "grad_norm": 0.6290621161460876, + "learning_rate": 3.1552171419741874e-07, + "loss": 0.1935, + "step": 19562 + }, + { + "epoch": 1.8429146746426133, + "grad_norm": 0.6391206383705139, + "learning_rate": 3.1514550336743554e-07, + "loss": 0.1899, + "step": 19563 + }, + { + "epoch": 1.8430088787357812, + "grad_norm": 0.7169826030731201, + "learning_rate": 3.147695133671147e-07, + "loss": 0.1895, + "step": 19564 + }, + { + "epoch": 1.843103082828949, + "grad_norm": 0.6721808314323425, + "learning_rate": 3.14393744205026e-07, + "loss": 0.1867, + "step": 19565 + }, + { + "epoch": 1.8431972869221167, + "grad_norm": 0.6748124957084656, + "learning_rate": 3.1401819588973814e-07, + "loss": 0.1858, + "step": 19566 + }, + { + "epoch": 1.8432914910152847, + "grad_norm": 0.7615307569503784, + "learning_rate": 3.1364286842981763e-07, + "loss": 0.2052, + "step": 19567 + }, + { + "epoch": 1.8433856951084526, + "grad_norm": 0.6342546343803406, + "learning_rate": 3.1326776183381757e-07, + "loss": 0.1788, + "step": 19568 + }, + { + "epoch": 1.8434798992016204, + "grad_norm": 0.6604020595550537, + "learning_rate": 3.128928761102945e-07, + "loss": 0.2, + "step": 19569 + }, + { + "epoch": 1.843574103294788, + "grad_norm": 0.6869939565658569, + "learning_rate": 3.1251821126779494e-07, + "loss": 0.1853, + "step": 19570 + }, + { + "epoch": 1.843668307387956, + "grad_norm": 0.6485978364944458, + "learning_rate": 3.1214376731486194e-07, + "loss": 0.2084, + "step": 19571 + }, + { + "epoch": 1.8437625114811238, + "grad_norm": 0.662097692489624, + "learning_rate": 3.1176954426003327e-07, + "loss": 0.1827, + "step": 19572 + }, + { + "epoch": 1.8438567155742915, + "grad_norm": 0.7116630673408508, + "learning_rate": 3.113955421118442e-07, + "loss": 0.1871, + "step": 19573 + }, + { + "epoch": 1.8439509196674595, + "grad_norm": 0.7092744708061218, + "learning_rate": 3.11021760878818e-07, + "loss": 0.1922, + "step": 19574 + }, + { + "epoch": 1.8440451237606275, + "grad_norm": 0.6666938066482544, + "learning_rate": 3.106482005694822e-07, + "loss": 0.1793, + "step": 19575 + }, + { + "epoch": 1.8441393278537952, + "grad_norm": 0.6568140387535095, + "learning_rate": 3.1027486119235117e-07, + "loss": 0.1878, + "step": 19576 + }, + { + "epoch": 1.844233531946963, + "grad_norm": 0.6853905916213989, + "learning_rate": 3.099017427559392e-07, + "loss": 0.208, + "step": 19577 + }, + { + "epoch": 1.844327736040131, + "grad_norm": 0.5811006426811218, + "learning_rate": 3.0952884526875285e-07, + "loss": 0.1856, + "step": 19578 + }, + { + "epoch": 1.8444219401332989, + "grad_norm": 0.6356227993965149, + "learning_rate": 3.091561687392952e-07, + "loss": 0.2054, + "step": 19579 + }, + { + "epoch": 1.8445161442264666, + "grad_norm": 0.6708242893218994, + "learning_rate": 3.0878371317606513e-07, + "loss": 0.1842, + "step": 19580 + }, + { + "epoch": 1.8446103483196343, + "grad_norm": 0.6495194435119629, + "learning_rate": 3.084114785875525e-07, + "loss": 0.1941, + "step": 19581 + }, + { + "epoch": 1.8447045524128023, + "grad_norm": 0.6219097375869751, + "learning_rate": 3.080394649822471e-07, + "loss": 0.1932, + "step": 19582 + }, + { + "epoch": 1.8447987565059702, + "grad_norm": 0.6671397089958191, + "learning_rate": 3.0766767236863226e-07, + "loss": 0.1922, + "step": 19583 + }, + { + "epoch": 1.844892960599138, + "grad_norm": 0.620679497718811, + "learning_rate": 3.072961007551822e-07, + "loss": 0.171, + "step": 19584 + }, + { + "epoch": 1.8449871646923057, + "grad_norm": 0.6596792936325073, + "learning_rate": 3.069247501503725e-07, + "loss": 0.1866, + "step": 19585 + }, + { + "epoch": 1.8450813687854737, + "grad_norm": 5.839211463928223, + "learning_rate": 3.065536205626685e-07, + "loss": 0.1831, + "step": 19586 + }, + { + "epoch": 1.8451755728786416, + "grad_norm": 0.6346095204353333, + "learning_rate": 3.061827120005323e-07, + "loss": 0.1834, + "step": 19587 + }, + { + "epoch": 1.8452697769718094, + "grad_norm": 0.6453737616539001, + "learning_rate": 3.058120244724239e-07, + "loss": 0.1848, + "step": 19588 + }, + { + "epoch": 1.8453639810649771, + "grad_norm": 0.6811696290969849, + "learning_rate": 3.05441557986792e-07, + "loss": 0.2404, + "step": 19589 + }, + { + "epoch": 1.845458185158145, + "grad_norm": 0.7306646108627319, + "learning_rate": 3.0507131255208545e-07, + "loss": 0.2129, + "step": 19590 + }, + { + "epoch": 1.845552389251313, + "grad_norm": 0.6541388630867004, + "learning_rate": 3.047012881767475e-07, + "loss": 0.1988, + "step": 19591 + }, + { + "epoch": 1.8456465933444808, + "grad_norm": 0.614191472530365, + "learning_rate": 3.043314848692136e-07, + "loss": 0.1756, + "step": 19592 + }, + { + "epoch": 1.8457407974376485, + "grad_norm": 0.5694600343704224, + "learning_rate": 3.03961902637917e-07, + "loss": 0.1822, + "step": 19593 + }, + { + "epoch": 1.8458350015308165, + "grad_norm": 0.7498295307159424, + "learning_rate": 3.035925414912844e-07, + "loss": 0.1825, + "step": 19594 + }, + { + "epoch": 1.8459292056239844, + "grad_norm": 0.7187083959579468, + "learning_rate": 3.0322340143773777e-07, + "loss": 0.1976, + "step": 19595 + }, + { + "epoch": 1.8460234097171522, + "grad_norm": 0.6782183647155762, + "learning_rate": 3.028544824856916e-07, + "loss": 0.195, + "step": 19596 + }, + { + "epoch": 1.84611761381032, + "grad_norm": 0.7186533212661743, + "learning_rate": 3.0248578464356246e-07, + "loss": 0.1832, + "step": 19597 + }, + { + "epoch": 1.8462118179034879, + "grad_norm": 0.664090633392334, + "learning_rate": 3.021173079197559e-07, + "loss": 0.2159, + "step": 19598 + }, + { + "epoch": 1.8463060219966558, + "grad_norm": 0.6171144247055054, + "learning_rate": 3.017490523226696e-07, + "loss": 0.1685, + "step": 19599 + }, + { + "epoch": 1.8464002260898236, + "grad_norm": 1.0775052309036255, + "learning_rate": 3.0138101786070575e-07, + "loss": 0.1899, + "step": 19600 + }, + { + "epoch": 1.8464944301829913, + "grad_norm": 0.6579177379608154, + "learning_rate": 3.0101320454225424e-07, + "loss": 0.2263, + "step": 19601 + }, + { + "epoch": 1.8465886342761593, + "grad_norm": 0.5745784640312195, + "learning_rate": 3.0064561237570067e-07, + "loss": 0.1777, + "step": 19602 + }, + { + "epoch": 1.8466828383693272, + "grad_norm": 0.5918304324150085, + "learning_rate": 3.0027824136942607e-07, + "loss": 0.1726, + "step": 19603 + }, + { + "epoch": 1.846777042462495, + "grad_norm": 0.6967973113059998, + "learning_rate": 2.999110915318115e-07, + "loss": 0.2281, + "step": 19604 + }, + { + "epoch": 1.8468712465556627, + "grad_norm": 0.6738758087158203, + "learning_rate": 2.9954416287122257e-07, + "loss": 0.2047, + "step": 19605 + }, + { + "epoch": 1.8469654506488307, + "grad_norm": 0.640472948551178, + "learning_rate": 2.9917745539603024e-07, + "loss": 0.1881, + "step": 19606 + }, + { + "epoch": 1.8470596547419986, + "grad_norm": 0.6335986852645874, + "learning_rate": 2.988109691145946e-07, + "loss": 0.1787, + "step": 19607 + }, + { + "epoch": 1.8471538588351664, + "grad_norm": 0.6743058562278748, + "learning_rate": 2.984447040352712e-07, + "loss": 0.1991, + "step": 19608 + }, + { + "epoch": 1.847248062928334, + "grad_norm": 0.623181939125061, + "learning_rate": 2.9807866016641316e-07, + "loss": 0.1831, + "step": 19609 + }, + { + "epoch": 1.847342267021502, + "grad_norm": 0.6506023406982422, + "learning_rate": 2.9771283751636627e-07, + "loss": 0.1946, + "step": 19610 + }, + { + "epoch": 1.84743647111467, + "grad_norm": 0.7284735441207886, + "learning_rate": 2.973472360934704e-07, + "loss": 0.1806, + "step": 19611 + }, + { + "epoch": 1.8475306752078378, + "grad_norm": 0.7416127324104309, + "learning_rate": 2.969818559060633e-07, + "loss": 0.2006, + "step": 19612 + }, + { + "epoch": 1.8476248793010055, + "grad_norm": 0.6531298756599426, + "learning_rate": 2.966166969624762e-07, + "loss": 0.2042, + "step": 19613 + }, + { + "epoch": 1.8477190833941735, + "grad_norm": 0.6501283049583435, + "learning_rate": 2.9625175927103455e-07, + "loss": 0.2047, + "step": 19614 + }, + { + "epoch": 1.8478132874873414, + "grad_norm": 0.605383574962616, + "learning_rate": 2.9588704284006176e-07, + "loss": 0.1642, + "step": 19615 + }, + { + "epoch": 1.8479074915805092, + "grad_norm": 0.7704370021820068, + "learning_rate": 2.9552254767787005e-07, + "loss": 0.2283, + "step": 19616 + }, + { + "epoch": 1.848001695673677, + "grad_norm": 0.691565752029419, + "learning_rate": 2.9515827379277386e-07, + "loss": 0.2185, + "step": 19617 + }, + { + "epoch": 1.8480958997668449, + "grad_norm": 0.7182550430297852, + "learning_rate": 2.947942211930765e-07, + "loss": 0.2153, + "step": 19618 + }, + { + "epoch": 1.8481901038600128, + "grad_norm": 0.6316009759902954, + "learning_rate": 2.944303898870804e-07, + "loss": 0.188, + "step": 19619 + }, + { + "epoch": 1.8482843079531805, + "grad_norm": 0.6875702738761902, + "learning_rate": 2.940667798830821e-07, + "loss": 0.1976, + "step": 19620 + }, + { + "epoch": 1.8483785120463483, + "grad_norm": 0.830883264541626, + "learning_rate": 2.9370339118937164e-07, + "loss": 0.1762, + "step": 19621 + }, + { + "epoch": 1.8484727161395162, + "grad_norm": 0.7475153207778931, + "learning_rate": 2.933402238142336e-07, + "loss": 0.2076, + "step": 19622 + }, + { + "epoch": 1.8485669202326842, + "grad_norm": 0.6479750871658325, + "learning_rate": 2.929772777659523e-07, + "loss": 0.1797, + "step": 19623 + }, + { + "epoch": 1.848661124325852, + "grad_norm": 0.6192037463188171, + "learning_rate": 2.926145530528002e-07, + "loss": 0.1769, + "step": 19624 + }, + { + "epoch": 1.8487553284190197, + "grad_norm": 0.7229530215263367, + "learning_rate": 2.9225204968304944e-07, + "loss": 0.1947, + "step": 19625 + }, + { + "epoch": 1.8488495325121876, + "grad_norm": 0.8542710542678833, + "learning_rate": 2.918897676649646e-07, + "loss": 0.1902, + "step": 19626 + }, + { + "epoch": 1.8489437366053556, + "grad_norm": 0.5919458866119385, + "learning_rate": 2.9152770700680677e-07, + "loss": 0.1791, + "step": 19627 + }, + { + "epoch": 1.8490379406985233, + "grad_norm": 0.6756226420402527, + "learning_rate": 2.9116586771683273e-07, + "loss": 0.1849, + "step": 19628 + }, + { + "epoch": 1.849132144791691, + "grad_norm": 0.6769475936889648, + "learning_rate": 2.9080424980329147e-07, + "loss": 0.1676, + "step": 19629 + }, + { + "epoch": 1.849226348884859, + "grad_norm": 0.6706231236457825, + "learning_rate": 2.904428532744274e-07, + "loss": 0.1998, + "step": 19630 + }, + { + "epoch": 1.849320552978027, + "grad_norm": 0.6368475556373596, + "learning_rate": 2.900816781384852e-07, + "loss": 0.1771, + "step": 19631 + }, + { + "epoch": 1.8494147570711947, + "grad_norm": 0.662482500076294, + "learning_rate": 2.897207244036948e-07, + "loss": 0.1941, + "step": 19632 + }, + { + "epoch": 1.8495089611643625, + "grad_norm": 0.6530662178993225, + "learning_rate": 2.893599920782908e-07, + "loss": 0.1699, + "step": 19633 + }, + { + "epoch": 1.8496031652575304, + "grad_norm": 0.5605717897415161, + "learning_rate": 2.889994811704966e-07, + "loss": 0.1567, + "step": 19634 + }, + { + "epoch": 1.8496973693506984, + "grad_norm": 0.6888538599014282, + "learning_rate": 2.886391916885323e-07, + "loss": 0.1917, + "step": 19635 + }, + { + "epoch": 1.8497915734438661, + "grad_norm": 0.7064043283462524, + "learning_rate": 2.882791236406124e-07, + "loss": 0.2006, + "step": 19636 + }, + { + "epoch": 1.8498857775370339, + "grad_norm": 0.630302369594574, + "learning_rate": 2.8791927703494924e-07, + "loss": 0.171, + "step": 19637 + }, + { + "epoch": 1.8499799816302018, + "grad_norm": 0.792571485042572, + "learning_rate": 2.8755965187974633e-07, + "loss": 0.1968, + "step": 19638 + }, + { + "epoch": 1.8500741857233698, + "grad_norm": 0.6825298070907593, + "learning_rate": 2.8720024818320256e-07, + "loss": 0.1816, + "step": 19639 + }, + { + "epoch": 1.8501683898165375, + "grad_norm": 0.6284648180007935, + "learning_rate": 2.868410659535159e-07, + "loss": 0.1926, + "step": 19640 + }, + { + "epoch": 1.8502625939097053, + "grad_norm": 0.6546636819839478, + "learning_rate": 2.864821051988753e-07, + "loss": 0.1859, + "step": 19641 + }, + { + "epoch": 1.8503567980028732, + "grad_norm": 0.6364118456840515, + "learning_rate": 2.861233659274632e-07, + "loss": 0.1617, + "step": 19642 + }, + { + "epoch": 1.8504510020960412, + "grad_norm": 0.6821374893188477, + "learning_rate": 2.8576484814746176e-07, + "loss": 0.1985, + "step": 19643 + }, + { + "epoch": 1.850545206189209, + "grad_norm": 0.6279967427253723, + "learning_rate": 2.854065518670457e-07, + "loss": 0.1971, + "step": 19644 + }, + { + "epoch": 1.8506394102823767, + "grad_norm": 0.6143205165863037, + "learning_rate": 2.850484770943829e-07, + "loss": 0.1705, + "step": 19645 + }, + { + "epoch": 1.8507336143755446, + "grad_norm": 0.6796934604644775, + "learning_rate": 2.846906238376401e-07, + "loss": 0.2045, + "step": 19646 + }, + { + "epoch": 1.8508278184687126, + "grad_norm": 0.692689061164856, + "learning_rate": 2.8433299210497645e-07, + "loss": 0.2174, + "step": 19647 + }, + { + "epoch": 1.8509220225618803, + "grad_norm": 0.6097289323806763, + "learning_rate": 2.839755819045453e-07, + "loss": 0.1979, + "step": 19648 + }, + { + "epoch": 1.851016226655048, + "grad_norm": 0.7667192816734314, + "learning_rate": 2.836183932444969e-07, + "loss": 0.187, + "step": 19649 + }, + { + "epoch": 1.851110430748216, + "grad_norm": 0.6377841234207153, + "learning_rate": 2.832614261329769e-07, + "loss": 0.1942, + "step": 19650 + }, + { + "epoch": 1.851204634841384, + "grad_norm": 0.5748652815818787, + "learning_rate": 2.829046805781221e-07, + "loss": 0.1794, + "step": 19651 + }, + { + "epoch": 1.8512988389345517, + "grad_norm": 0.6595532298088074, + "learning_rate": 2.8254815658806944e-07, + "loss": 0.19, + "step": 19652 + }, + { + "epoch": 1.8513930430277195, + "grad_norm": 0.6605759263038635, + "learning_rate": 2.8219185417094784e-07, + "loss": 0.1878, + "step": 19653 + }, + { + "epoch": 1.8514872471208874, + "grad_norm": 0.6056495904922485, + "learning_rate": 2.818357733348798e-07, + "loss": 0.1816, + "step": 19654 + }, + { + "epoch": 1.8515814512140554, + "grad_norm": 0.7514147758483887, + "learning_rate": 2.8147991408798534e-07, + "loss": 0.1719, + "step": 19655 + }, + { + "epoch": 1.8516756553072231, + "grad_norm": 0.5801824331283569, + "learning_rate": 2.8112427643837927e-07, + "loss": 0.1728, + "step": 19656 + }, + { + "epoch": 1.8517698594003908, + "grad_norm": 0.600534200668335, + "learning_rate": 2.8076886039417053e-07, + "loss": 0.1742, + "step": 19657 + }, + { + "epoch": 1.8518640634935588, + "grad_norm": 0.6482451558113098, + "learning_rate": 2.8041366596346267e-07, + "loss": 0.1869, + "step": 19658 + }, + { + "epoch": 1.8519582675867268, + "grad_norm": 0.6890844106674194, + "learning_rate": 2.8005869315435365e-07, + "loss": 0.2116, + "step": 19659 + }, + { + "epoch": 1.8520524716798945, + "grad_norm": 0.605459451675415, + "learning_rate": 2.797039419749403e-07, + "loss": 0.1708, + "step": 19660 + }, + { + "epoch": 1.8521466757730622, + "grad_norm": 0.6100725531578064, + "learning_rate": 2.793494124333085e-07, + "loss": 0.1537, + "step": 19661 + }, + { + "epoch": 1.8522408798662302, + "grad_norm": 0.598700225353241, + "learning_rate": 2.7899510453754386e-07, + "loss": 0.1716, + "step": 19662 + }, + { + "epoch": 1.8523350839593982, + "grad_norm": 0.6506624817848206, + "learning_rate": 2.7864101829572557e-07, + "loss": 0.1815, + "step": 19663 + }, + { + "epoch": 1.852429288052566, + "grad_norm": 0.6575130820274353, + "learning_rate": 2.7828715371592483e-07, + "loss": 0.1943, + "step": 19664 + }, + { + "epoch": 1.8525234921457336, + "grad_norm": 0.7161694765090942, + "learning_rate": 2.7793351080621425e-07, + "loss": 0.1939, + "step": 19665 + }, + { + "epoch": 1.8526176962389016, + "grad_norm": 0.6616843342781067, + "learning_rate": 2.7758008957465275e-07, + "loss": 0.1841, + "step": 19666 + }, + { + "epoch": 1.8527119003320696, + "grad_norm": 0.6931621432304382, + "learning_rate": 2.7722689002930183e-07, + "loss": 0.2313, + "step": 19667 + }, + { + "epoch": 1.8528061044252373, + "grad_norm": 0.6817244291305542, + "learning_rate": 2.768739121782149e-07, + "loss": 0.1765, + "step": 19668 + }, + { + "epoch": 1.852900308518405, + "grad_norm": 0.6396641135215759, + "learning_rate": 2.7652115602943895e-07, + "loss": 0.1903, + "step": 19669 + }, + { + "epoch": 1.852994512611573, + "grad_norm": 0.6513628959655762, + "learning_rate": 2.7616862159101866e-07, + "loss": 0.1911, + "step": 19670 + }, + { + "epoch": 1.853088716704741, + "grad_norm": 0.6474629640579224, + "learning_rate": 2.7581630887099307e-07, + "loss": 0.1885, + "step": 19671 + }, + { + "epoch": 1.8531829207979087, + "grad_norm": 0.7338670492172241, + "learning_rate": 2.7546421787739363e-07, + "loss": 0.1923, + "step": 19672 + }, + { + "epoch": 1.8532771248910764, + "grad_norm": 0.6354264616966248, + "learning_rate": 2.751123486182483e-07, + "loss": 0.1827, + "step": 19673 + }, + { + "epoch": 1.8533713289842444, + "grad_norm": 0.6260570287704468, + "learning_rate": 2.747607011015829e-07, + "loss": 0.1812, + "step": 19674 + }, + { + "epoch": 1.8534655330774124, + "grad_norm": 0.6957045197486877, + "learning_rate": 2.744092753354144e-07, + "loss": 0.2319, + "step": 19675 + }, + { + "epoch": 1.85355973717058, + "grad_norm": 0.7678351998329163, + "learning_rate": 2.740580713277519e-07, + "loss": 0.1946, + "step": 19676 + }, + { + "epoch": 1.8536539412637478, + "grad_norm": 0.5689692497253418, + "learning_rate": 2.7370708908661003e-07, + "loss": 0.1989, + "step": 19677 + }, + { + "epoch": 1.8537481453569158, + "grad_norm": 0.6453081965446472, + "learning_rate": 2.7335632861998807e-07, + "loss": 0.1909, + "step": 19678 + }, + { + "epoch": 1.8538423494500837, + "grad_norm": 0.6650949120521545, + "learning_rate": 2.730057899358829e-07, + "loss": 0.1831, + "step": 19679 + }, + { + "epoch": 1.8539365535432515, + "grad_norm": 0.714195191860199, + "learning_rate": 2.7265547304229034e-07, + "loss": 0.2036, + "step": 19680 + }, + { + "epoch": 1.8540307576364192, + "grad_norm": 0.6691907644271851, + "learning_rate": 2.7230537794719623e-07, + "loss": 0.1933, + "step": 19681 + }, + { + "epoch": 1.8541249617295872, + "grad_norm": 0.5945155024528503, + "learning_rate": 2.719555046585831e-07, + "loss": 0.1849, + "step": 19682 + }, + { + "epoch": 1.8542191658227551, + "grad_norm": 0.7244347929954529, + "learning_rate": 2.71605853184429e-07, + "loss": 0.1836, + "step": 19683 + }, + { + "epoch": 1.8543133699159229, + "grad_norm": 0.6852580904960632, + "learning_rate": 2.7125642353270755e-07, + "loss": 0.2216, + "step": 19684 + }, + { + "epoch": 1.8544075740090906, + "grad_norm": 0.6954776048660278, + "learning_rate": 2.709072157113846e-07, + "loss": 0.2154, + "step": 19685 + }, + { + "epoch": 1.8545017781022586, + "grad_norm": 0.7100127935409546, + "learning_rate": 2.705582297284237e-07, + "loss": 0.2137, + "step": 19686 + }, + { + "epoch": 1.8545959821954265, + "grad_norm": 0.7123499512672424, + "learning_rate": 2.7020946559178306e-07, + "loss": 0.1878, + "step": 19687 + }, + { + "epoch": 1.8546901862885943, + "grad_norm": 0.6751053333282471, + "learning_rate": 2.6986092330941295e-07, + "loss": 0.2135, + "step": 19688 + }, + { + "epoch": 1.854784390381762, + "grad_norm": 0.6273108124732971, + "learning_rate": 2.695126028892614e-07, + "loss": 0.2124, + "step": 19689 + }, + { + "epoch": 1.85487859447493, + "grad_norm": 0.7018774747848511, + "learning_rate": 2.691645043392721e-07, + "loss": 0.1948, + "step": 19690 + }, + { + "epoch": 1.854972798568098, + "grad_norm": 0.6028143763542175, + "learning_rate": 2.688166276673809e-07, + "loss": 0.1724, + "step": 19691 + }, + { + "epoch": 1.8550670026612657, + "grad_norm": 0.7303981781005859, + "learning_rate": 2.684689728815193e-07, + "loss": 0.2126, + "step": 19692 + }, + { + "epoch": 1.8551612067544334, + "grad_norm": 0.6413070559501648, + "learning_rate": 2.681215399896164e-07, + "loss": 0.1758, + "step": 19693 + }, + { + "epoch": 1.8552554108476014, + "grad_norm": 0.5861358642578125, + "learning_rate": 2.677743289995927e-07, + "loss": 0.1699, + "step": 19694 + }, + { + "epoch": 1.8553496149407693, + "grad_norm": 0.7060631513595581, + "learning_rate": 2.67427339919365e-07, + "loss": 0.1697, + "step": 19695 + }, + { + "epoch": 1.855443819033937, + "grad_norm": 0.7234995365142822, + "learning_rate": 2.6708057275684595e-07, + "loss": 0.2168, + "step": 19696 + }, + { + "epoch": 1.8555380231271048, + "grad_norm": 0.6536963582038879, + "learning_rate": 2.667340275199426e-07, + "loss": 0.1915, + "step": 19697 + }, + { + "epoch": 1.8556322272202728, + "grad_norm": 0.7373390197753906, + "learning_rate": 2.663877042165552e-07, + "loss": 0.1955, + "step": 19698 + }, + { + "epoch": 1.8557264313134407, + "grad_norm": 0.8008182644844055, + "learning_rate": 2.6604160285458203e-07, + "loss": 0.178, + "step": 19699 + }, + { + "epoch": 1.8558206354066085, + "grad_norm": 0.8788657188415527, + "learning_rate": 2.656957234419144e-07, + "loss": 0.2104, + "step": 19700 + }, + { + "epoch": 1.8559148394997762, + "grad_norm": 0.6970170140266418, + "learning_rate": 2.653500659864372e-07, + "loss": 0.1984, + "step": 19701 + }, + { + "epoch": 1.8560090435929442, + "grad_norm": 0.6879594922065735, + "learning_rate": 2.650046304960352e-07, + "loss": 0.1762, + "step": 19702 + }, + { + "epoch": 1.8561032476861121, + "grad_norm": 0.643665075302124, + "learning_rate": 2.6465941697858213e-07, + "loss": 0.1909, + "step": 19703 + }, + { + "epoch": 1.8561974517792799, + "grad_norm": 0.7155811786651611, + "learning_rate": 2.6431442544195053e-07, + "loss": 0.2002, + "step": 19704 + }, + { + "epoch": 1.8562916558724476, + "grad_norm": 0.6288726925849915, + "learning_rate": 2.6396965589400746e-07, + "loss": 0.1683, + "step": 19705 + }, + { + "epoch": 1.8563858599656156, + "grad_norm": 0.6954754590988159, + "learning_rate": 2.6362510834261226e-07, + "loss": 0.1861, + "step": 19706 + }, + { + "epoch": 1.8564800640587835, + "grad_norm": 0.6139570474624634, + "learning_rate": 2.6328078279562185e-07, + "loss": 0.1833, + "step": 19707 + }, + { + "epoch": 1.8565742681519513, + "grad_norm": 0.6650378108024597, + "learning_rate": 2.62936679260889e-07, + "loss": 0.2117, + "step": 19708 + }, + { + "epoch": 1.856668472245119, + "grad_norm": 0.625217080116272, + "learning_rate": 2.625927977462572e-07, + "loss": 0.1759, + "step": 19709 + }, + { + "epoch": 1.856762676338287, + "grad_norm": 1.1064122915267944, + "learning_rate": 2.6224913825956933e-07, + "loss": 0.1915, + "step": 19710 + }, + { + "epoch": 1.8568568804314547, + "grad_norm": 0.6624402403831482, + "learning_rate": 2.619057008086612e-07, + "loss": 0.1808, + "step": 19711 + }, + { + "epoch": 1.8569510845246224, + "grad_norm": 0.6218162178993225, + "learning_rate": 2.615624854013632e-07, + "loss": 0.1853, + "step": 19712 + }, + { + "epoch": 1.8570452886177904, + "grad_norm": 0.6406760215759277, + "learning_rate": 2.6121949204550024e-07, + "loss": 0.1946, + "step": 19713 + }, + { + "epoch": 1.8571394927109584, + "grad_norm": 0.6376236081123352, + "learning_rate": 2.6087672074889603e-07, + "loss": 0.1923, + "step": 19714 + }, + { + "epoch": 1.857233696804126, + "grad_norm": 0.6326330304145813, + "learning_rate": 2.6053417151936435e-07, + "loss": 0.1913, + "step": 19715 + }, + { + "epoch": 1.8573279008972938, + "grad_norm": 0.6673585176467896, + "learning_rate": 2.6019184436471335e-07, + "loss": 0.1839, + "step": 19716 + }, + { + "epoch": 1.8574221049904618, + "grad_norm": 0.6535778045654297, + "learning_rate": 2.598497392927535e-07, + "loss": 0.2008, + "step": 19717 + }, + { + "epoch": 1.8575163090836297, + "grad_norm": 0.7765671014785767, + "learning_rate": 2.5950785631128294e-07, + "loss": 0.1733, + "step": 19718 + }, + { + "epoch": 1.8576105131767975, + "grad_norm": 0.6393800377845764, + "learning_rate": 2.591661954280966e-07, + "loss": 0.1653, + "step": 19719 + }, + { + "epoch": 1.8577047172699652, + "grad_norm": 0.6803317070007324, + "learning_rate": 2.5882475665098493e-07, + "loss": 0.2272, + "step": 19720 + }, + { + "epoch": 1.8577989213631332, + "grad_norm": 0.6579533219337463, + "learning_rate": 2.58483539987735e-07, + "loss": 0.2057, + "step": 19721 + }, + { + "epoch": 1.8578931254563011, + "grad_norm": 0.642487108707428, + "learning_rate": 2.5814254544612503e-07, + "loss": 0.1982, + "step": 19722 + }, + { + "epoch": 1.8579873295494689, + "grad_norm": 0.6265017986297607, + "learning_rate": 2.5780177303393105e-07, + "loss": 0.1968, + "step": 19723 + }, + { + "epoch": 1.8580815336426366, + "grad_norm": 0.6324055194854736, + "learning_rate": 2.574612227589246e-07, + "loss": 0.1923, + "step": 19724 + }, + { + "epoch": 1.8581757377358046, + "grad_norm": 0.6792360544204712, + "learning_rate": 2.5712089462886836e-07, + "loss": 0.2093, + "step": 19725 + }, + { + "epoch": 1.8582699418289725, + "grad_norm": 0.6746578812599182, + "learning_rate": 2.5678078865152276e-07, + "loss": 0.2031, + "step": 19726 + }, + { + "epoch": 1.8583641459221403, + "grad_norm": 0.9754471182823181, + "learning_rate": 2.56440904834645e-07, + "loss": 0.1892, + "step": 19727 + }, + { + "epoch": 1.858458350015308, + "grad_norm": 0.6703466176986694, + "learning_rate": 2.5610124318598216e-07, + "loss": 0.2043, + "step": 19728 + }, + { + "epoch": 1.858552554108476, + "grad_norm": 0.663557231426239, + "learning_rate": 2.557618037132803e-07, + "loss": 0.1758, + "step": 19729 + }, + { + "epoch": 1.858646758201644, + "grad_norm": 0.5749669075012207, + "learning_rate": 2.5542258642427987e-07, + "loss": 0.1784, + "step": 19730 + }, + { + "epoch": 1.8587409622948117, + "grad_norm": 0.6767820715904236, + "learning_rate": 2.5508359132671356e-07, + "loss": 0.1933, + "step": 19731 + }, + { + "epoch": 1.8588351663879794, + "grad_norm": 0.7008445262908936, + "learning_rate": 2.5474481842831187e-07, + "loss": 0.2114, + "step": 19732 + }, + { + "epoch": 1.8589293704811474, + "grad_norm": 0.6820916533470154, + "learning_rate": 2.54406267736802e-07, + "loss": 0.1771, + "step": 19733 + }, + { + "epoch": 1.8590235745743153, + "grad_norm": 0.681978702545166, + "learning_rate": 2.540679392598999e-07, + "loss": 0.1864, + "step": 19734 + }, + { + "epoch": 1.859117778667483, + "grad_norm": 0.6696916222572327, + "learning_rate": 2.537298330053206e-07, + "loss": 0.1991, + "step": 19735 + }, + { + "epoch": 1.8592119827606508, + "grad_norm": 0.7833788394927979, + "learning_rate": 2.5339194898077346e-07, + "loss": 0.2168, + "step": 19736 + }, + { + "epoch": 1.8593061868538188, + "grad_norm": 0.6476829648017883, + "learning_rate": 2.530542871939645e-07, + "loss": 0.1839, + "step": 19737 + }, + { + "epoch": 1.8594003909469867, + "grad_norm": 0.7010082006454468, + "learning_rate": 2.52716847652591e-07, + "loss": 0.2018, + "step": 19738 + }, + { + "epoch": 1.8594945950401545, + "grad_norm": 0.6451516151428223, + "learning_rate": 2.5237963036434775e-07, + "loss": 0.176, + "step": 19739 + }, + { + "epoch": 1.8595887991333222, + "grad_norm": 0.7889953255653381, + "learning_rate": 2.520426353369254e-07, + "loss": 0.2127, + "step": 19740 + }, + { + "epoch": 1.8596830032264902, + "grad_norm": 0.6260870099067688, + "learning_rate": 2.517058625780044e-07, + "loss": 0.1932, + "step": 19741 + }, + { + "epoch": 1.8597772073196581, + "grad_norm": 0.594916582107544, + "learning_rate": 2.513693120952665e-07, + "loss": 0.1776, + "step": 19742 + }, + { + "epoch": 1.8598714114128259, + "grad_norm": 0.6291800141334534, + "learning_rate": 2.5103298389638544e-07, + "loss": 0.1819, + "step": 19743 + }, + { + "epoch": 1.8599656155059936, + "grad_norm": 0.7713466882705688, + "learning_rate": 2.5069687798902954e-07, + "loss": 0.2119, + "step": 19744 + }, + { + "epoch": 1.8600598195991616, + "grad_norm": 0.623530924320221, + "learning_rate": 2.5036099438086157e-07, + "loss": 0.2096, + "step": 19745 + }, + { + "epoch": 1.8601540236923295, + "grad_norm": 0.6710311770439148, + "learning_rate": 2.5002533307954103e-07, + "loss": 0.19, + "step": 19746 + }, + { + "epoch": 1.8602482277854973, + "grad_norm": 0.7561832666397095, + "learning_rate": 2.496898940927217e-07, + "loss": 0.2043, + "step": 19747 + }, + { + "epoch": 1.860342431878665, + "grad_norm": 0.6054635643959045, + "learning_rate": 2.493546774280531e-07, + "loss": 0.1888, + "step": 19748 + }, + { + "epoch": 1.860436635971833, + "grad_norm": 0.6866369247436523, + "learning_rate": 2.490196830931757e-07, + "loss": 0.1946, + "step": 19749 + }, + { + "epoch": 1.860530840065001, + "grad_norm": 0.6473537683486938, + "learning_rate": 2.4868491109573013e-07, + "loss": 0.1817, + "step": 19750 + }, + { + "epoch": 1.8606250441581687, + "grad_norm": 0.692039430141449, + "learning_rate": 2.4835036144335024e-07, + "loss": 0.1943, + "step": 19751 + }, + { + "epoch": 1.8607192482513364, + "grad_norm": 0.7021738886833191, + "learning_rate": 2.480160341436633e-07, + "loss": 0.1855, + "step": 19752 + }, + { + "epoch": 1.8608134523445043, + "grad_norm": 0.6827689409255981, + "learning_rate": 2.476819292042909e-07, + "loss": 0.2112, + "step": 19753 + }, + { + "epoch": 1.8609076564376723, + "grad_norm": 0.7296918630599976, + "learning_rate": 2.4734804663285485e-07, + "loss": 0.1787, + "step": 19754 + }, + { + "epoch": 1.86100186053084, + "grad_norm": 0.6645269989967346, + "learning_rate": 2.470143864369656e-07, + "loss": 0.1859, + "step": 19755 + }, + { + "epoch": 1.8610960646240078, + "grad_norm": 0.8175538182258606, + "learning_rate": 2.4668094862422943e-07, + "loss": 0.2661, + "step": 19756 + }, + { + "epoch": 1.8611902687171757, + "grad_norm": 0.6240702867507935, + "learning_rate": 2.463477332022535e-07, + "loss": 0.2444, + "step": 19757 + }, + { + "epoch": 1.8612844728103437, + "grad_norm": 0.6328794360160828, + "learning_rate": 2.460147401786339e-07, + "loss": 0.1992, + "step": 19758 + }, + { + "epoch": 1.8613786769035114, + "grad_norm": 0.6536951065063477, + "learning_rate": 2.4568196956096245e-07, + "loss": 0.2007, + "step": 19759 + }, + { + "epoch": 1.8614728809966792, + "grad_norm": 0.5887195467948914, + "learning_rate": 2.4534942135682637e-07, + "loss": 0.1582, + "step": 19760 + }, + { + "epoch": 1.8615670850898471, + "grad_norm": 0.6196441650390625, + "learning_rate": 2.4501709557381183e-07, + "loss": 0.1794, + "step": 19761 + }, + { + "epoch": 1.861661289183015, + "grad_norm": 0.6211194396018982, + "learning_rate": 2.446849922194916e-07, + "loss": 0.1877, + "step": 19762 + }, + { + "epoch": 1.8617554932761828, + "grad_norm": 0.6575772762298584, + "learning_rate": 2.443531113014408e-07, + "loss": 0.1821, + "step": 19763 + }, + { + "epoch": 1.8618496973693506, + "grad_norm": 0.7031137943267822, + "learning_rate": 2.440214528272278e-07, + "loss": 0.2013, + "step": 19764 + }, + { + "epoch": 1.8619439014625185, + "grad_norm": 0.6978479027748108, + "learning_rate": 2.436900168044121e-07, + "loss": 0.1889, + "step": 19765 + }, + { + "epoch": 1.8620381055556865, + "grad_norm": 0.697313666343689, + "learning_rate": 2.4335880324055204e-07, + "loss": 0.2136, + "step": 19766 + }, + { + "epoch": 1.8621323096488542, + "grad_norm": 0.7061405181884766, + "learning_rate": 2.430278121432017e-07, + "loss": 0.1921, + "step": 19767 + }, + { + "epoch": 1.862226513742022, + "grad_norm": 0.6298143267631531, + "learning_rate": 2.4269704351990606e-07, + "loss": 0.2184, + "step": 19768 + }, + { + "epoch": 1.86232071783519, + "grad_norm": 0.6164076328277588, + "learning_rate": 2.4236649737820695e-07, + "loss": 0.1586, + "step": 19769 + }, + { + "epoch": 1.862414921928358, + "grad_norm": 0.6876676678657532, + "learning_rate": 2.420361737256438e-07, + "loss": 0.1788, + "step": 19770 + }, + { + "epoch": 1.8625091260215256, + "grad_norm": 0.5701471567153931, + "learning_rate": 2.417060725697473e-07, + "loss": 0.1744, + "step": 19771 + }, + { + "epoch": 1.8626033301146934, + "grad_norm": 0.6103629469871521, + "learning_rate": 2.413761939180415e-07, + "loss": 0.1647, + "step": 19772 + }, + { + "epoch": 1.8626975342078613, + "grad_norm": 0.648344099521637, + "learning_rate": 2.410465377780513e-07, + "loss": 0.1946, + "step": 19773 + }, + { + "epoch": 1.8627917383010293, + "grad_norm": 0.6380503177642822, + "learning_rate": 2.407171041572942e-07, + "loss": 0.1682, + "step": 19774 + }, + { + "epoch": 1.862885942394197, + "grad_norm": 0.7133384943008423, + "learning_rate": 2.403878930632786e-07, + "loss": 0.2384, + "step": 19775 + }, + { + "epoch": 1.8629801464873648, + "grad_norm": 0.605835497379303, + "learning_rate": 2.400589045035118e-07, + "loss": 0.1668, + "step": 19776 + }, + { + "epoch": 1.8630743505805327, + "grad_norm": 0.6401779651641846, + "learning_rate": 2.3973013848549775e-07, + "loss": 0.2044, + "step": 19777 + }, + { + "epoch": 1.8631685546737007, + "grad_norm": 0.6554655432701111, + "learning_rate": 2.3940159501673054e-07, + "loss": 0.1887, + "step": 19778 + }, + { + "epoch": 1.8632627587668684, + "grad_norm": 0.7029209136962891, + "learning_rate": 2.3907327410470084e-07, + "loss": 0.1947, + "step": 19779 + }, + { + "epoch": 1.8633569628600362, + "grad_norm": 0.7291501760482788, + "learning_rate": 2.3874517575689817e-07, + "loss": 0.1917, + "step": 19780 + }, + { + "epoch": 1.8634511669532041, + "grad_norm": 0.7364242076873779, + "learning_rate": 2.3841729998079987e-07, + "loss": 0.1828, + "step": 19781 + }, + { + "epoch": 1.863545371046372, + "grad_norm": 0.6417863368988037, + "learning_rate": 2.3808964678388447e-07, + "loss": 0.1812, + "step": 19782 + }, + { + "epoch": 1.8636395751395398, + "grad_norm": 0.6952944993972778, + "learning_rate": 2.3776221617362261e-07, + "loss": 0.2288, + "step": 19783 + }, + { + "epoch": 1.8637337792327076, + "grad_norm": 0.6842653751373291, + "learning_rate": 2.3743500815747834e-07, + "loss": 0.2004, + "step": 19784 + }, + { + "epoch": 1.8638279833258755, + "grad_norm": 0.7092806100845337, + "learning_rate": 2.3710802274291567e-07, + "loss": 0.2084, + "step": 19785 + }, + { + "epoch": 1.8639221874190435, + "grad_norm": 0.7035544514656067, + "learning_rate": 2.3678125993738753e-07, + "loss": 0.1994, + "step": 19786 + }, + { + "epoch": 1.8640163915122112, + "grad_norm": 0.7044257521629333, + "learning_rate": 2.3645471974834577e-07, + "loss": 0.1983, + "step": 19787 + }, + { + "epoch": 1.864110595605379, + "grad_norm": 0.6268710494041443, + "learning_rate": 2.3612840218323664e-07, + "loss": 0.1734, + "step": 19788 + }, + { + "epoch": 1.864204799698547, + "grad_norm": 0.6927345395088196, + "learning_rate": 2.358023072494997e-07, + "loss": 0.1911, + "step": 19789 + }, + { + "epoch": 1.8642990037917149, + "grad_norm": 0.6738446950912476, + "learning_rate": 2.3547643495457018e-07, + "loss": 0.2192, + "step": 19790 + }, + { + "epoch": 1.8643932078848826, + "grad_norm": 0.6546797156333923, + "learning_rate": 2.35150785305881e-07, + "loss": 0.1623, + "step": 19791 + }, + { + "epoch": 1.8644874119780503, + "grad_norm": 0.6611368060112, + "learning_rate": 2.3482535831085396e-07, + "loss": 0.1766, + "step": 19792 + }, + { + "epoch": 1.8645816160712183, + "grad_norm": 0.683911919593811, + "learning_rate": 2.3450015397690985e-07, + "loss": 0.1853, + "step": 19793 + }, + { + "epoch": 1.8646758201643863, + "grad_norm": 0.6592962145805359, + "learning_rate": 2.3417517231146712e-07, + "loss": 0.1974, + "step": 19794 + }, + { + "epoch": 1.864770024257554, + "grad_norm": 0.6624939441680908, + "learning_rate": 2.3385041332193325e-07, + "loss": 0.1924, + "step": 19795 + }, + { + "epoch": 1.8648642283507217, + "grad_norm": 0.7302245497703552, + "learning_rate": 2.335258770157134e-07, + "loss": 0.2298, + "step": 19796 + }, + { + "epoch": 1.8649584324438897, + "grad_norm": 0.6368466019630432, + "learning_rate": 2.3320156340020605e-07, + "loss": 0.1981, + "step": 19797 + }, + { + "epoch": 1.8650526365370577, + "grad_norm": 0.5639004707336426, + "learning_rate": 2.3287747248280978e-07, + "loss": 0.1561, + "step": 19798 + }, + { + "epoch": 1.8651468406302254, + "grad_norm": 0.6973299384117126, + "learning_rate": 2.32553604270912e-07, + "loss": 0.209, + "step": 19799 + }, + { + "epoch": 1.8652410447233931, + "grad_norm": 0.6718546748161316, + "learning_rate": 2.322299587718968e-07, + "loss": 0.1788, + "step": 19800 + }, + { + "epoch": 1.865335248816561, + "grad_norm": 0.6919240355491638, + "learning_rate": 2.3190653599314493e-07, + "loss": 0.1624, + "step": 19801 + }, + { + "epoch": 1.865429452909729, + "grad_norm": 0.7096720933914185, + "learning_rate": 2.3158333594203054e-07, + "loss": 0.1912, + "step": 19802 + }, + { + "epoch": 1.8655236570028968, + "grad_norm": 0.6396719217300415, + "learning_rate": 2.3126035862592322e-07, + "loss": 0.1877, + "step": 19803 + }, + { + "epoch": 1.8656178610960645, + "grad_norm": 0.6330217123031616, + "learning_rate": 2.3093760405218823e-07, + "loss": 0.1923, + "step": 19804 + }, + { + "epoch": 1.8657120651892325, + "grad_norm": 0.633255124092102, + "learning_rate": 2.3061507222818303e-07, + "loss": 0.1969, + "step": 19805 + }, + { + "epoch": 1.8658062692824005, + "grad_norm": 0.7594079971313477, + "learning_rate": 2.3029276316126281e-07, + "loss": 0.1952, + "step": 19806 + }, + { + "epoch": 1.8659004733755682, + "grad_norm": 0.6460544466972351, + "learning_rate": 2.299706768587784e-07, + "loss": 0.2028, + "step": 19807 + }, + { + "epoch": 1.865994677468736, + "grad_norm": 0.7043282985687256, + "learning_rate": 2.2964881332807053e-07, + "loss": 0.1979, + "step": 19808 + }, + { + "epoch": 1.866088881561904, + "grad_norm": 0.6759858131408691, + "learning_rate": 2.2932717257648008e-07, + "loss": 0.1922, + "step": 19809 + }, + { + "epoch": 1.8661830856550718, + "grad_norm": 0.9607840776443481, + "learning_rate": 2.2900575461134112e-07, + "loss": 0.1948, + "step": 19810 + }, + { + "epoch": 1.8662772897482396, + "grad_norm": 0.6503164768218994, + "learning_rate": 2.2868455943998224e-07, + "loss": 0.2198, + "step": 19811 + }, + { + "epoch": 1.8663714938414073, + "grad_norm": 0.7966095805168152, + "learning_rate": 2.283635870697265e-07, + "loss": 0.2345, + "step": 19812 + }, + { + "epoch": 1.8664656979345753, + "grad_norm": 0.6505135893821716, + "learning_rate": 2.2804283750789357e-07, + "loss": 0.201, + "step": 19813 + }, + { + "epoch": 1.8665599020277432, + "grad_norm": 0.706760048866272, + "learning_rate": 2.2772231076179652e-07, + "loss": 0.1887, + "step": 19814 + }, + { + "epoch": 1.866654106120911, + "grad_norm": 0.705281674861908, + "learning_rate": 2.2740200683874392e-07, + "loss": 0.199, + "step": 19815 + }, + { + "epoch": 1.8667483102140787, + "grad_norm": 0.725080668926239, + "learning_rate": 2.270819257460377e-07, + "loss": 0.2273, + "step": 19816 + }, + { + "epoch": 1.8668425143072467, + "grad_norm": 0.6675900220870972, + "learning_rate": 2.2676206749097985e-07, + "loss": 0.1812, + "step": 19817 + }, + { + "epoch": 1.8669367184004146, + "grad_norm": 0.7213043570518494, + "learning_rate": 2.2644243208086003e-07, + "loss": 0.1949, + "step": 19818 + }, + { + "epoch": 1.8670309224935824, + "grad_norm": 0.6684525609016418, + "learning_rate": 2.26123019522968e-07, + "loss": 0.1977, + "step": 19819 + }, + { + "epoch": 1.8671251265867501, + "grad_norm": 0.6752498745918274, + "learning_rate": 2.2580382982458794e-07, + "loss": 0.2049, + "step": 19820 + }, + { + "epoch": 1.867219330679918, + "grad_norm": 0.6144601106643677, + "learning_rate": 2.254848629929951e-07, + "loss": 0.1635, + "step": 19821 + }, + { + "epoch": 1.867313534773086, + "grad_norm": 0.6752665042877197, + "learning_rate": 2.2516611903546482e-07, + "loss": 0.1816, + "step": 19822 + }, + { + "epoch": 1.8674077388662538, + "grad_norm": 0.8126716613769531, + "learning_rate": 2.248475979592646e-07, + "loss": 0.1905, + "step": 19823 + }, + { + "epoch": 1.8675019429594215, + "grad_norm": 0.6823545098304749, + "learning_rate": 2.2452929977165528e-07, + "loss": 0.2016, + "step": 19824 + }, + { + "epoch": 1.8675961470525895, + "grad_norm": 0.673636794090271, + "learning_rate": 2.2421122447989773e-07, + "loss": 0.1951, + "step": 19825 + }, + { + "epoch": 1.8676903511457574, + "grad_norm": 0.6112269759178162, + "learning_rate": 2.2389337209124173e-07, + "loss": 0.1794, + "step": 19826 + }, + { + "epoch": 1.8677845552389252, + "grad_norm": 0.7177988886833191, + "learning_rate": 2.2357574261293592e-07, + "loss": 0.2162, + "step": 19827 + }, + { + "epoch": 1.867878759332093, + "grad_norm": 0.640326201915741, + "learning_rate": 2.2325833605222337e-07, + "loss": 0.1776, + "step": 19828 + }, + { + "epoch": 1.8679729634252609, + "grad_norm": 0.624055802822113, + "learning_rate": 2.229411524163394e-07, + "loss": 0.1667, + "step": 19829 + }, + { + "epoch": 1.8680671675184288, + "grad_norm": 0.6927505135536194, + "learning_rate": 2.2262419171251827e-07, + "loss": 0.2026, + "step": 19830 + }, + { + "epoch": 1.8681613716115966, + "grad_norm": 0.7343767881393433, + "learning_rate": 2.2230745394798748e-07, + "loss": 0.1816, + "step": 19831 + }, + { + "epoch": 1.8682555757047643, + "grad_norm": 0.6705753803253174, + "learning_rate": 2.2199093912996684e-07, + "loss": 0.2015, + "step": 19832 + }, + { + "epoch": 1.8683497797979323, + "grad_norm": 0.6273950338363647, + "learning_rate": 2.21674647265675e-07, + "loss": 0.1789, + "step": 19833 + }, + { + "epoch": 1.8684439838911002, + "grad_norm": 0.6339665055274963, + "learning_rate": 2.2135857836232511e-07, + "loss": 0.1847, + "step": 19834 + }, + { + "epoch": 1.868538187984268, + "grad_norm": 0.769565224647522, + "learning_rate": 2.2104273242712138e-07, + "loss": 0.2013, + "step": 19835 + }, + { + "epoch": 1.8686323920774357, + "grad_norm": 0.7009322047233582, + "learning_rate": 2.2072710946726693e-07, + "loss": 0.1743, + "step": 19836 + }, + { + "epoch": 1.8687265961706037, + "grad_norm": 0.6942918300628662, + "learning_rate": 2.2041170948995827e-07, + "loss": 0.1651, + "step": 19837 + }, + { + "epoch": 1.8688208002637716, + "grad_norm": 0.6284037828445435, + "learning_rate": 2.2009653250238738e-07, + "loss": 0.1899, + "step": 19838 + }, + { + "epoch": 1.8689150043569394, + "grad_norm": 0.6660770773887634, + "learning_rate": 2.1978157851174076e-07, + "loss": 0.1821, + "step": 19839 + }, + { + "epoch": 1.869009208450107, + "grad_norm": 0.626297116279602, + "learning_rate": 2.194668475251982e-07, + "loss": 0.1937, + "step": 19840 + }, + { + "epoch": 1.869103412543275, + "grad_norm": 0.6980593800544739, + "learning_rate": 2.1915233954993843e-07, + "loss": 0.1972, + "step": 19841 + }, + { + "epoch": 1.869197616636443, + "grad_norm": 0.6938313841819763, + "learning_rate": 2.1883805459313122e-07, + "loss": 0.2205, + "step": 19842 + }, + { + "epoch": 1.8692918207296108, + "grad_norm": 0.6362899541854858, + "learning_rate": 2.1852399266194312e-07, + "loss": 0.1878, + "step": 19843 + }, + { + "epoch": 1.8693860248227785, + "grad_norm": 0.850473165512085, + "learning_rate": 2.1821015376353728e-07, + "loss": 0.1885, + "step": 19844 + }, + { + "epoch": 1.8694802289159465, + "grad_norm": 0.5963436365127563, + "learning_rate": 2.178965379050657e-07, + "loss": 0.1531, + "step": 19845 + }, + { + "epoch": 1.8695744330091144, + "grad_norm": 0.7101747989654541, + "learning_rate": 2.1758314509368163e-07, + "loss": 0.1954, + "step": 19846 + }, + { + "epoch": 1.8696686371022821, + "grad_norm": 0.5779017210006714, + "learning_rate": 2.1726997533653149e-07, + "loss": 0.1963, + "step": 19847 + }, + { + "epoch": 1.8697628411954499, + "grad_norm": 0.683344304561615, + "learning_rate": 2.1695702864075408e-07, + "loss": 0.2018, + "step": 19848 + }, + { + "epoch": 1.8698570452886178, + "grad_norm": 0.6608116030693054, + "learning_rate": 2.166443050134859e-07, + "loss": 0.2196, + "step": 19849 + }, + { + "epoch": 1.8699512493817856, + "grad_norm": 0.6488050818443298, + "learning_rate": 2.1633180446185897e-07, + "loss": 0.1804, + "step": 19850 + }, + { + "epoch": 1.8700454534749533, + "grad_norm": 0.6669037938117981, + "learning_rate": 2.1601952699299877e-07, + "loss": 0.2198, + "step": 19851 + }, + { + "epoch": 1.8701396575681213, + "grad_norm": 0.6127009391784668, + "learning_rate": 2.157074726140218e-07, + "loss": 0.1847, + "step": 19852 + }, + { + "epoch": 1.8702338616612892, + "grad_norm": 0.6842362880706787, + "learning_rate": 2.1539564133204793e-07, + "loss": 0.2067, + "step": 19853 + }, + { + "epoch": 1.870328065754457, + "grad_norm": 0.6373916864395142, + "learning_rate": 2.1508403315418592e-07, + "loss": 0.1647, + "step": 19854 + }, + { + "epoch": 1.8704222698476247, + "grad_norm": 0.6885888576507568, + "learning_rate": 2.1477264808753896e-07, + "loss": 0.2138, + "step": 19855 + }, + { + "epoch": 1.8705164739407927, + "grad_norm": 0.6469976305961609, + "learning_rate": 2.1446148613920914e-07, + "loss": 0.1906, + "step": 19856 + }, + { + "epoch": 1.8706106780339606, + "grad_norm": 0.6703843474388123, + "learning_rate": 2.1415054731629192e-07, + "loss": 0.1879, + "step": 19857 + }, + { + "epoch": 1.8707048821271284, + "grad_norm": 0.6307921409606934, + "learning_rate": 2.1383983162587607e-07, + "loss": 0.197, + "step": 19858 + }, + { + "epoch": 1.8707990862202961, + "grad_norm": 0.6355965733528137, + "learning_rate": 2.1352933907504592e-07, + "loss": 0.2153, + "step": 19859 + }, + { + "epoch": 1.870893290313464, + "grad_norm": 0.6926652193069458, + "learning_rate": 2.1321906967088356e-07, + "loss": 0.2309, + "step": 19860 + }, + { + "epoch": 1.870987494406632, + "grad_norm": 0.7311079502105713, + "learning_rate": 2.1290902342046006e-07, + "loss": 0.2102, + "step": 19861 + }, + { + "epoch": 1.8710816984997998, + "grad_norm": 0.6396556496620178, + "learning_rate": 2.1259920033084746e-07, + "loss": 0.1748, + "step": 19862 + }, + { + "epoch": 1.8711759025929675, + "grad_norm": 0.6666446924209595, + "learning_rate": 2.1228960040911128e-07, + "loss": 0.1977, + "step": 19863 + }, + { + "epoch": 1.8712701066861355, + "grad_norm": 0.6239649057388306, + "learning_rate": 2.1198022366230698e-07, + "loss": 0.1886, + "step": 19864 + }, + { + "epoch": 1.8713643107793034, + "grad_norm": 0.5872995257377625, + "learning_rate": 2.1167107009749223e-07, + "loss": 0.1899, + "step": 19865 + }, + { + "epoch": 1.8714585148724712, + "grad_norm": 0.67463219165802, + "learning_rate": 2.113621397217147e-07, + "loss": 0.2007, + "step": 19866 + }, + { + "epoch": 1.871552718965639, + "grad_norm": 0.6818503737449646, + "learning_rate": 2.1105343254201772e-07, + "loss": 0.1918, + "step": 19867 + }, + { + "epoch": 1.8716469230588069, + "grad_norm": 0.5923857092857361, + "learning_rate": 2.1074494856544335e-07, + "loss": 0.1603, + "step": 19868 + }, + { + "epoch": 1.8717411271519748, + "grad_norm": 0.6048617362976074, + "learning_rate": 2.1043668779902272e-07, + "loss": 0.1812, + "step": 19869 + }, + { + "epoch": 1.8718353312451426, + "grad_norm": 0.6945805549621582, + "learning_rate": 2.1012865024978458e-07, + "loss": 0.1966, + "step": 19870 + }, + { + "epoch": 1.8719295353383103, + "grad_norm": 0.6653010249137878, + "learning_rate": 2.0982083592475443e-07, + "loss": 0.1866, + "step": 19871 + }, + { + "epoch": 1.8720237394314783, + "grad_norm": 0.6669188141822815, + "learning_rate": 2.0951324483095003e-07, + "loss": 0.2064, + "step": 19872 + }, + { + "epoch": 1.8721179435246462, + "grad_norm": 0.8476389646530151, + "learning_rate": 2.0920587697538464e-07, + "loss": 0.2292, + "step": 19873 + }, + { + "epoch": 1.872212147617814, + "grad_norm": 0.624190628528595, + "learning_rate": 2.0889873236506707e-07, + "loss": 0.2114, + "step": 19874 + }, + { + "epoch": 1.8723063517109817, + "grad_norm": 0.6130045652389526, + "learning_rate": 2.0859181100700175e-07, + "loss": 0.1713, + "step": 19875 + }, + { + "epoch": 1.8724005558041497, + "grad_norm": 0.7087114453315735, + "learning_rate": 2.082851129081842e-07, + "loss": 0.2217, + "step": 19876 + }, + { + "epoch": 1.8724947598973176, + "grad_norm": 1.2582696676254272, + "learning_rate": 2.0797863807560992e-07, + "loss": 0.2109, + "step": 19877 + }, + { + "epoch": 1.8725889639904854, + "grad_norm": 0.6868299841880798, + "learning_rate": 2.0767238651626664e-07, + "loss": 0.1885, + "step": 19878 + }, + { + "epoch": 1.872683168083653, + "grad_norm": 0.7186660170555115, + "learning_rate": 2.0736635823713657e-07, + "loss": 0.1858, + "step": 19879 + }, + { + "epoch": 1.872777372176821, + "grad_norm": 0.70173180103302, + "learning_rate": 2.0706055324519748e-07, + "loss": 0.193, + "step": 19880 + }, + { + "epoch": 1.872871576269989, + "grad_norm": 0.5818378329277039, + "learning_rate": 2.0675497154742374e-07, + "loss": 0.1686, + "step": 19881 + }, + { + "epoch": 1.8729657803631568, + "grad_norm": 0.6666647791862488, + "learning_rate": 2.0644961315078205e-07, + "loss": 0.1864, + "step": 19882 + }, + { + "epoch": 1.8730599844563245, + "grad_norm": 0.7315630316734314, + "learning_rate": 2.061444780622346e-07, + "loss": 0.1925, + "step": 19883 + }, + { + "epoch": 1.8731541885494924, + "grad_norm": 0.6360408663749695, + "learning_rate": 2.0583956628874137e-07, + "loss": 0.1837, + "step": 19884 + }, + { + "epoch": 1.8732483926426604, + "grad_norm": 0.6370952129364014, + "learning_rate": 2.0553487783725122e-07, + "loss": 0.1782, + "step": 19885 + }, + { + "epoch": 1.8733425967358281, + "grad_norm": 0.6465074419975281, + "learning_rate": 2.0523041271471312e-07, + "loss": 0.184, + "step": 19886 + }, + { + "epoch": 1.8734368008289959, + "grad_norm": 0.6361848711967468, + "learning_rate": 2.0492617092807142e-07, + "loss": 0.1932, + "step": 19887 + }, + { + "epoch": 1.8735310049221638, + "grad_norm": 0.6392167806625366, + "learning_rate": 2.0462215248426066e-07, + "loss": 0.161, + "step": 19888 + }, + { + "epoch": 1.8736252090153318, + "grad_norm": 0.6628153324127197, + "learning_rate": 2.043183573902119e-07, + "loss": 0.1837, + "step": 19889 + }, + { + "epoch": 1.8737194131084995, + "grad_norm": 0.6620554327964783, + "learning_rate": 2.0401478565285626e-07, + "loss": 0.1989, + "step": 19890 + }, + { + "epoch": 1.8738136172016673, + "grad_norm": 0.6533010005950928, + "learning_rate": 2.037114372791127e-07, + "loss": 0.1882, + "step": 19891 + }, + { + "epoch": 1.8739078212948352, + "grad_norm": 0.6901665329933167, + "learning_rate": 2.0340831227589675e-07, + "loss": 0.193, + "step": 19892 + }, + { + "epoch": 1.8740020253880032, + "grad_norm": 0.6867063045501709, + "learning_rate": 2.0310541065012401e-07, + "loss": 0.19, + "step": 19893 + }, + { + "epoch": 1.874096229481171, + "grad_norm": 0.6477521657943726, + "learning_rate": 2.0280273240869896e-07, + "loss": 0.18, + "step": 19894 + }, + { + "epoch": 1.8741904335743387, + "grad_norm": 0.6686272621154785, + "learning_rate": 2.0250027755852274e-07, + "loss": 0.1796, + "step": 19895 + }, + { + "epoch": 1.8742846376675066, + "grad_norm": 0.6758459806442261, + "learning_rate": 2.0219804610649208e-07, + "loss": 0.1974, + "step": 19896 + }, + { + "epoch": 1.8743788417606746, + "grad_norm": 0.6487590670585632, + "learning_rate": 2.018960380594992e-07, + "loss": 0.2006, + "step": 19897 + }, + { + "epoch": 1.8744730458538423, + "grad_norm": 0.6570483446121216, + "learning_rate": 2.0159425342442972e-07, + "loss": 0.1605, + "step": 19898 + }, + { + "epoch": 1.87456724994701, + "grad_norm": 0.6574012041091919, + "learning_rate": 2.0129269220816372e-07, + "loss": 0.2001, + "step": 19899 + }, + { + "epoch": 1.874661454040178, + "grad_norm": 0.6349261999130249, + "learning_rate": 2.00991354417579e-07, + "loss": 0.1656, + "step": 19900 + }, + { + "epoch": 1.874755658133346, + "grad_norm": 0.6339261531829834, + "learning_rate": 2.0069024005954562e-07, + "loss": 0.2064, + "step": 19901 + }, + { + "epoch": 1.8748498622265137, + "grad_norm": 0.6916906833648682, + "learning_rate": 2.0038934914093033e-07, + "loss": 0.2335, + "step": 19902 + }, + { + "epoch": 1.8749440663196815, + "grad_norm": 0.7444424033164978, + "learning_rate": 2.000886816685932e-07, + "loss": 0.2186, + "step": 19903 + }, + { + "epoch": 1.8750382704128494, + "grad_norm": 0.7623520493507385, + "learning_rate": 1.9978823764938983e-07, + "loss": 0.1852, + "step": 19904 + }, + { + "epoch": 1.8751324745060174, + "grad_norm": 0.7236540913581848, + "learning_rate": 1.994880170901714e-07, + "loss": 0.1911, + "step": 19905 + }, + { + "epoch": 1.8752266785991851, + "grad_norm": 0.6486384272575378, + "learning_rate": 1.9918801999778137e-07, + "loss": 0.2081, + "step": 19906 + }, + { + "epoch": 1.8753208826923529, + "grad_norm": 0.6325013041496277, + "learning_rate": 1.9888824637906312e-07, + "loss": 0.1813, + "step": 19907 + }, + { + "epoch": 1.8754150867855208, + "grad_norm": 0.6888840794563293, + "learning_rate": 1.9858869624085007e-07, + "loss": 0.1898, + "step": 19908 + }, + { + "epoch": 1.8755092908786888, + "grad_norm": 0.6658682823181152, + "learning_rate": 1.9828936958997348e-07, + "loss": 0.1689, + "step": 19909 + }, + { + "epoch": 1.8756034949718565, + "grad_norm": 0.684428870677948, + "learning_rate": 1.979902664332567e-07, + "loss": 0.2025, + "step": 19910 + }, + { + "epoch": 1.8756976990650243, + "grad_norm": 0.7188761234283447, + "learning_rate": 1.9769138677752208e-07, + "loss": 0.1983, + "step": 19911 + }, + { + "epoch": 1.8757919031581922, + "grad_norm": 0.6171512007713318, + "learning_rate": 1.9739273062958307e-07, + "loss": 0.1802, + "step": 19912 + }, + { + "epoch": 1.8758861072513602, + "grad_norm": 0.6133853197097778, + "learning_rate": 1.970942979962509e-07, + "loss": 0.1807, + "step": 19913 + }, + { + "epoch": 1.875980311344528, + "grad_norm": 0.6813076138496399, + "learning_rate": 1.9679608888432787e-07, + "loss": 0.1673, + "step": 19914 + }, + { + "epoch": 1.8760745154376957, + "grad_norm": 0.6362394094467163, + "learning_rate": 1.9649810330061524e-07, + "loss": 0.2137, + "step": 19915 + }, + { + "epoch": 1.8761687195308636, + "grad_norm": 0.6668578386306763, + "learning_rate": 1.9620034125190645e-07, + "loss": 0.1992, + "step": 19916 + }, + { + "epoch": 1.8762629236240316, + "grad_norm": 0.6715618968009949, + "learning_rate": 1.9590280274499274e-07, + "loss": 0.2102, + "step": 19917 + }, + { + "epoch": 1.8763571277171993, + "grad_norm": 0.6445623636245728, + "learning_rate": 1.9560548778665755e-07, + "loss": 0.1746, + "step": 19918 + }, + { + "epoch": 1.876451331810367, + "grad_norm": 0.6466159224510193, + "learning_rate": 1.9530839638367993e-07, + "loss": 0.2126, + "step": 19919 + }, + { + "epoch": 1.876545535903535, + "grad_norm": 0.673216700553894, + "learning_rate": 1.9501152854283333e-07, + "loss": 0.2381, + "step": 19920 + }, + { + "epoch": 1.876639739996703, + "grad_norm": 0.6047160029411316, + "learning_rate": 1.9471488427088903e-07, + "loss": 0.1863, + "step": 19921 + }, + { + "epoch": 1.8767339440898707, + "grad_norm": 0.6081609129905701, + "learning_rate": 1.9441846357460825e-07, + "loss": 0.1868, + "step": 19922 + }, + { + "epoch": 1.8768281481830384, + "grad_norm": 0.6987688541412354, + "learning_rate": 1.9412226646075115e-07, + "loss": 0.1822, + "step": 19923 + }, + { + "epoch": 1.8769223522762064, + "grad_norm": 0.7152296304702759, + "learning_rate": 1.9382629293607237e-07, + "loss": 0.19, + "step": 19924 + }, + { + "epoch": 1.8770165563693744, + "grad_norm": 0.6419005393981934, + "learning_rate": 1.9353054300731865e-07, + "loss": 0.1771, + "step": 19925 + }, + { + "epoch": 1.877110760462542, + "grad_norm": 0.7111563682556152, + "learning_rate": 1.9323501668123467e-07, + "loss": 0.2131, + "step": 19926 + }, + { + "epoch": 1.8772049645557098, + "grad_norm": 0.6358315348625183, + "learning_rate": 1.9293971396455945e-07, + "loss": 0.1886, + "step": 19927 + }, + { + "epoch": 1.8772991686488778, + "grad_norm": 0.7015502452850342, + "learning_rate": 1.926446348640265e-07, + "loss": 0.1877, + "step": 19928 + }, + { + "epoch": 1.8773933727420458, + "grad_norm": 0.745125412940979, + "learning_rate": 1.9234977938636046e-07, + "loss": 0.1983, + "step": 19929 + }, + { + "epoch": 1.8774875768352135, + "grad_norm": 0.7070435285568237, + "learning_rate": 1.9205514753828924e-07, + "loss": 0.2048, + "step": 19930 + }, + { + "epoch": 1.8775817809283812, + "grad_norm": 0.6494175791740417, + "learning_rate": 1.9176073932652972e-07, + "loss": 0.2071, + "step": 19931 + }, + { + "epoch": 1.8776759850215492, + "grad_norm": 0.6327586770057678, + "learning_rate": 1.9146655475779207e-07, + "loss": 0.2104, + "step": 19932 + }, + { + "epoch": 1.8777701891147172, + "grad_norm": 0.6279453635215759, + "learning_rate": 1.911725938387865e-07, + "loss": 0.191, + "step": 19933 + }, + { + "epoch": 1.877864393207885, + "grad_norm": 0.6193822026252747, + "learning_rate": 1.908788565762165e-07, + "loss": 0.1915, + "step": 19934 + }, + { + "epoch": 1.8779585973010526, + "grad_norm": 0.6685409545898438, + "learning_rate": 1.9058534297677789e-07, + "loss": 0.1846, + "step": 19935 + }, + { + "epoch": 1.8780528013942206, + "grad_norm": 0.6512874364852905, + "learning_rate": 1.9029205304716415e-07, + "loss": 0.2003, + "step": 19936 + }, + { + "epoch": 1.8781470054873886, + "grad_norm": 0.6568180918693542, + "learning_rate": 1.8999898679406325e-07, + "loss": 0.1967, + "step": 19937 + }, + { + "epoch": 1.8782412095805563, + "grad_norm": 0.6729118824005127, + "learning_rate": 1.8970614422415656e-07, + "loss": 0.1732, + "step": 19938 + }, + { + "epoch": 1.878335413673724, + "grad_norm": 0.6299525499343872, + "learning_rate": 1.8941352534412095e-07, + "loss": 0.1785, + "step": 19939 + }, + { + "epoch": 1.878429617766892, + "grad_norm": 0.6564587950706482, + "learning_rate": 1.8912113016062994e-07, + "loss": 0.2139, + "step": 19940 + }, + { + "epoch": 1.87852382186006, + "grad_norm": 0.5952286124229431, + "learning_rate": 1.8882895868035045e-07, + "loss": 0.1631, + "step": 19941 + }, + { + "epoch": 1.8786180259532277, + "grad_norm": 0.6052758693695068, + "learning_rate": 1.885370109099427e-07, + "loss": 0.185, + "step": 19942 + }, + { + "epoch": 1.8787122300463954, + "grad_norm": 0.6648350358009338, + "learning_rate": 1.882452868560658e-07, + "loss": 0.2097, + "step": 19943 + }, + { + "epoch": 1.8788064341395634, + "grad_norm": 0.6911534070968628, + "learning_rate": 1.8795378652537e-07, + "loss": 0.2143, + "step": 19944 + }, + { + "epoch": 1.8789006382327313, + "grad_norm": 0.9408258199691772, + "learning_rate": 1.876625099245033e-07, + "loss": 0.1842, + "step": 19945 + }, + { + "epoch": 1.878994842325899, + "grad_norm": 0.6716334819793701, + "learning_rate": 1.8737145706010594e-07, + "loss": 0.1649, + "step": 19946 + }, + { + "epoch": 1.8790890464190668, + "grad_norm": 0.7083847522735596, + "learning_rate": 1.8708062793881487e-07, + "loss": 0.2114, + "step": 19947 + }, + { + "epoch": 1.8791832505122348, + "grad_norm": 0.682701587677002, + "learning_rate": 1.8679002256726143e-07, + "loss": 0.1999, + "step": 19948 + }, + { + "epoch": 1.8792774546054027, + "grad_norm": 0.6334688663482666, + "learning_rate": 1.864996409520714e-07, + "loss": 0.1749, + "step": 19949 + }, + { + "epoch": 1.8793716586985705, + "grad_norm": 0.8308340907096863, + "learning_rate": 1.862094830998673e-07, + "loss": 0.2284, + "step": 19950 + }, + { + "epoch": 1.8794658627917382, + "grad_norm": 0.6952033042907715, + "learning_rate": 1.8591954901726385e-07, + "loss": 0.2064, + "step": 19951 + }, + { + "epoch": 1.8795600668849062, + "grad_norm": 0.6250718832015991, + "learning_rate": 1.8562983871087237e-07, + "loss": 0.1716, + "step": 19952 + }, + { + "epoch": 1.8796542709780741, + "grad_norm": 0.6349963545799255, + "learning_rate": 1.8534035218729984e-07, + "loss": 0.191, + "step": 19953 + }, + { + "epoch": 1.8797484750712419, + "grad_norm": 0.6657272577285767, + "learning_rate": 1.8505108945314432e-07, + "loss": 0.1768, + "step": 19954 + }, + { + "epoch": 1.8798426791644096, + "grad_norm": 0.6782479882240295, + "learning_rate": 1.8476205051500495e-07, + "loss": 0.2027, + "step": 19955 + }, + { + "epoch": 1.8799368832575776, + "grad_norm": 0.6128109097480774, + "learning_rate": 1.8447323537946871e-07, + "loss": 0.1593, + "step": 19956 + }, + { + "epoch": 1.8800310873507455, + "grad_norm": 0.8520538210868835, + "learning_rate": 1.8418464405312253e-07, + "loss": 0.2047, + "step": 19957 + }, + { + "epoch": 1.8801252914439133, + "grad_norm": 0.624204695224762, + "learning_rate": 1.8389627654254894e-07, + "loss": 0.1936, + "step": 19958 + }, + { + "epoch": 1.880219495537081, + "grad_norm": 0.6802290678024292, + "learning_rate": 1.8360813285431932e-07, + "loss": 0.195, + "step": 19959 + }, + { + "epoch": 1.880313699630249, + "grad_norm": 0.686010479927063, + "learning_rate": 1.8332021299500513e-07, + "loss": 0.1886, + "step": 19960 + }, + { + "epoch": 1.880407903723417, + "grad_norm": 0.6636178493499756, + "learning_rate": 1.830325169711744e-07, + "loss": 0.2212, + "step": 19961 + }, + { + "epoch": 1.8805021078165847, + "grad_norm": 0.6198559403419495, + "learning_rate": 1.8274504478938194e-07, + "loss": 0.1818, + "step": 19962 + }, + { + "epoch": 1.8805963119097524, + "grad_norm": 0.6393874287605286, + "learning_rate": 1.8245779645618577e-07, + "loss": 0.2007, + "step": 19963 + }, + { + "epoch": 1.8806905160029204, + "grad_norm": 0.6653652787208557, + "learning_rate": 1.8217077197813627e-07, + "loss": 0.2078, + "step": 19964 + }, + { + "epoch": 1.8807847200960883, + "grad_norm": 0.6419366002082825, + "learning_rate": 1.8188397136177594e-07, + "loss": 0.1905, + "step": 19965 + }, + { + "epoch": 1.880878924189256, + "grad_norm": 0.6758688688278198, + "learning_rate": 1.81597394613644e-07, + "loss": 0.1839, + "step": 19966 + }, + { + "epoch": 1.8809731282824238, + "grad_norm": 0.7126138210296631, + "learning_rate": 1.8131104174027746e-07, + "loss": 0.2255, + "step": 19967 + }, + { + "epoch": 1.8810673323755918, + "grad_norm": 0.7680550217628479, + "learning_rate": 1.8102491274820444e-07, + "loss": 0.2362, + "step": 19968 + }, + { + "epoch": 1.8811615364687597, + "grad_norm": 0.6370869874954224, + "learning_rate": 1.8073900764394636e-07, + "loss": 0.1744, + "step": 19969 + }, + { + "epoch": 1.8812557405619275, + "grad_norm": 0.7094757556915283, + "learning_rate": 1.804533264340269e-07, + "loss": 0.1835, + "step": 19970 + }, + { + "epoch": 1.8813499446550952, + "grad_norm": 0.6785231828689575, + "learning_rate": 1.8016786912495753e-07, + "loss": 0.2028, + "step": 19971 + }, + { + "epoch": 1.8814441487482632, + "grad_norm": 0.7784723043441772, + "learning_rate": 1.7988263572324527e-07, + "loss": 0.2338, + "step": 19972 + }, + { + "epoch": 1.8815383528414311, + "grad_norm": 0.651883602142334, + "learning_rate": 1.7959762623539822e-07, + "loss": 0.2161, + "step": 19973 + }, + { + "epoch": 1.8816325569345989, + "grad_norm": 0.5864567756652832, + "learning_rate": 1.7931284066791345e-07, + "loss": 0.1738, + "step": 19974 + }, + { + "epoch": 1.8817267610277666, + "grad_norm": 0.6762986779212952, + "learning_rate": 1.7902827902728236e-07, + "loss": 0.1753, + "step": 19975 + }, + { + "epoch": 1.8818209651209346, + "grad_norm": 0.6702238321304321, + "learning_rate": 1.7874394131999428e-07, + "loss": 0.1954, + "step": 19976 + }, + { + "epoch": 1.8819151692141025, + "grad_norm": 0.663995623588562, + "learning_rate": 1.784598275525351e-07, + "loss": 0.1939, + "step": 19977 + }, + { + "epoch": 1.8820093733072703, + "grad_norm": 0.702885627746582, + "learning_rate": 1.7817593773137965e-07, + "loss": 0.2107, + "step": 19978 + }, + { + "epoch": 1.882103577400438, + "grad_norm": 0.6725521683692932, + "learning_rate": 1.7789227186300274e-07, + "loss": 0.2048, + "step": 19979 + }, + { + "epoch": 1.882197781493606, + "grad_norm": 0.7834375500679016, + "learning_rate": 1.7760882995387363e-07, + "loss": 0.2052, + "step": 19980 + }, + { + "epoch": 1.882291985586774, + "grad_norm": 0.6639535427093506, + "learning_rate": 1.7732561201045274e-07, + "loss": 0.1986, + "step": 19981 + }, + { + "epoch": 1.8823861896799416, + "grad_norm": 0.6540392637252808, + "learning_rate": 1.770426180391982e-07, + "loss": 0.1797, + "step": 19982 + }, + { + "epoch": 1.8824803937731094, + "grad_norm": 0.5807163119316101, + "learning_rate": 1.7675984804656488e-07, + "loss": 0.167, + "step": 19983 + }, + { + "epoch": 1.8825745978662773, + "grad_norm": 0.7048665881156921, + "learning_rate": 1.764773020389987e-07, + "loss": 0.2171, + "step": 19984 + }, + { + "epoch": 1.8826688019594453, + "grad_norm": 0.7323140501976013, + "learning_rate": 1.7619498002294234e-07, + "loss": 0.2208, + "step": 19985 + }, + { + "epoch": 1.8827630060526128, + "grad_norm": 0.6514236330986023, + "learning_rate": 1.7591288200483282e-07, + "loss": 0.2085, + "step": 19986 + }, + { + "epoch": 1.8828572101457808, + "grad_norm": 0.6266919374465942, + "learning_rate": 1.756310079911028e-07, + "loss": 0.1982, + "step": 19987 + }, + { + "epoch": 1.8829514142389487, + "grad_norm": 0.6137820482254028, + "learning_rate": 1.7534935798818042e-07, + "loss": 0.1874, + "step": 19988 + }, + { + "epoch": 1.8830456183321165, + "grad_norm": 0.6909066438674927, + "learning_rate": 1.7506793200248507e-07, + "loss": 0.1914, + "step": 19989 + }, + { + "epoch": 1.8831398224252842, + "grad_norm": 0.5796274542808533, + "learning_rate": 1.74786730040436e-07, + "loss": 0.2058, + "step": 19990 + }, + { + "epoch": 1.8832340265184522, + "grad_norm": 0.6112196445465088, + "learning_rate": 1.7450575210844588e-07, + "loss": 0.1845, + "step": 19991 + }, + { + "epoch": 1.8833282306116201, + "grad_norm": 0.6688932180404663, + "learning_rate": 1.742249982129185e-07, + "loss": 0.202, + "step": 19992 + }, + { + "epoch": 1.8834224347047879, + "grad_norm": 0.9059047698974609, + "learning_rate": 1.7394446836025758e-07, + "loss": 0.1984, + "step": 19993 + }, + { + "epoch": 1.8835166387979556, + "grad_norm": 0.7394698858261108, + "learning_rate": 1.7366416255685914e-07, + "loss": 0.1916, + "step": 19994 + }, + { + "epoch": 1.8836108428911236, + "grad_norm": 0.6489742994308472, + "learning_rate": 1.7338408080911473e-07, + "loss": 0.1968, + "step": 19995 + }, + { + "epoch": 1.8837050469842915, + "grad_norm": 0.6854963302612305, + "learning_rate": 1.7310422312341035e-07, + "loss": 0.2127, + "step": 19996 + }, + { + "epoch": 1.8837992510774593, + "grad_norm": 0.679210901260376, + "learning_rate": 1.7282458950612646e-07, + "loss": 0.1826, + "step": 19997 + }, + { + "epoch": 1.883893455170627, + "grad_norm": 0.7623416185379028, + "learning_rate": 1.7254517996364127e-07, + "loss": 0.2065, + "step": 19998 + }, + { + "epoch": 1.883987659263795, + "grad_norm": 0.7248129844665527, + "learning_rate": 1.7226599450232306e-07, + "loss": 0.1895, + "step": 19999 + }, + { + "epoch": 1.884081863356963, + "grad_norm": 0.6562262177467346, + "learning_rate": 1.7198703312853892e-07, + "loss": 0.1733, + "step": 20000 + }, + { + "epoch": 1.8841760674501307, + "grad_norm": 0.6889053583145142, + "learning_rate": 1.7170829584865155e-07, + "loss": 0.1781, + "step": 20001 + }, + { + "epoch": 1.8842702715432984, + "grad_norm": 0.6888020038604736, + "learning_rate": 1.7142978266901367e-07, + "loss": 0.204, + "step": 20002 + }, + { + "epoch": 1.8843644756364664, + "grad_norm": 0.608525812625885, + "learning_rate": 1.711514935959768e-07, + "loss": 0.1892, + "step": 20003 + }, + { + "epoch": 1.8844586797296343, + "grad_norm": 0.6745103597640991, + "learning_rate": 1.7087342863588708e-07, + "loss": 0.1947, + "step": 20004 + }, + { + "epoch": 1.884552883822802, + "grad_norm": 0.6390058398246765, + "learning_rate": 1.705955877950838e-07, + "loss": 0.1909, + "step": 20005 + }, + { + "epoch": 1.8846470879159698, + "grad_norm": 0.6507443785667419, + "learning_rate": 1.703179710799019e-07, + "loss": 0.1795, + "step": 20006 + }, + { + "epoch": 1.8847412920091378, + "grad_norm": 0.7287372946739197, + "learning_rate": 1.70040578496673e-07, + "loss": 0.1581, + "step": 20007 + }, + { + "epoch": 1.8848354961023057, + "grad_norm": 0.6483942866325378, + "learning_rate": 1.6976341005172204e-07, + "loss": 0.1877, + "step": 20008 + }, + { + "epoch": 1.8849297001954735, + "grad_norm": 0.6140819787979126, + "learning_rate": 1.694864657513662e-07, + "loss": 0.1849, + "step": 20009 + }, + { + "epoch": 1.8850239042886412, + "grad_norm": 0.6788772940635681, + "learning_rate": 1.692097456019226e-07, + "loss": 0.2083, + "step": 20010 + }, + { + "epoch": 1.8851181083818092, + "grad_norm": 0.6177812218666077, + "learning_rate": 1.689332496097018e-07, + "loss": 0.1967, + "step": 20011 + }, + { + "epoch": 1.8852123124749771, + "grad_norm": 0.6948332190513611, + "learning_rate": 1.6865697778100543e-07, + "loss": 0.2067, + "step": 20012 + }, + { + "epoch": 1.8853065165681449, + "grad_norm": 0.6861203908920288, + "learning_rate": 1.6838093012213507e-07, + "loss": 0.1807, + "step": 20013 + }, + { + "epoch": 1.8854007206613126, + "grad_norm": 0.6214278936386108, + "learning_rate": 1.6810510663938462e-07, + "loss": 0.1668, + "step": 20014 + }, + { + "epoch": 1.8854949247544806, + "grad_norm": 0.6081794500350952, + "learning_rate": 1.6782950733904235e-07, + "loss": 0.1663, + "step": 20015 + }, + { + "epoch": 1.8855891288476485, + "grad_norm": 0.6765573620796204, + "learning_rate": 1.6755413222739324e-07, + "loss": 0.2058, + "step": 20016 + }, + { + "epoch": 1.8856833329408162, + "grad_norm": 0.6488627195358276, + "learning_rate": 1.6727898131071673e-07, + "loss": 0.1857, + "step": 20017 + }, + { + "epoch": 1.885777537033984, + "grad_norm": 0.6169323325157166, + "learning_rate": 1.6700405459528556e-07, + "loss": 0.1729, + "step": 20018 + }, + { + "epoch": 1.885871741127152, + "grad_norm": 0.6104135513305664, + "learning_rate": 1.667293520873692e-07, + "loss": 0.1904, + "step": 20019 + }, + { + "epoch": 1.88596594522032, + "grad_norm": 0.7144765853881836, + "learning_rate": 1.6645487379323143e-07, + "loss": 0.2003, + "step": 20020 + }, + { + "epoch": 1.8860601493134876, + "grad_norm": 0.6818016171455383, + "learning_rate": 1.6618061971912848e-07, + "loss": 0.226, + "step": 20021 + }, + { + "epoch": 1.8861543534066554, + "grad_norm": 0.6748316884040833, + "learning_rate": 1.6590658987131746e-07, + "loss": 0.2002, + "step": 20022 + }, + { + "epoch": 1.8862485574998233, + "grad_norm": 0.7445883750915527, + "learning_rate": 1.6563278425604456e-07, + "loss": 0.2012, + "step": 20023 + }, + { + "epoch": 1.8863427615929913, + "grad_norm": 0.6547936797142029, + "learning_rate": 1.6535920287955365e-07, + "loss": 0.1807, + "step": 20024 + }, + { + "epoch": 1.886436965686159, + "grad_norm": 0.7145929932594299, + "learning_rate": 1.6508584574808194e-07, + "loss": 0.2002, + "step": 20025 + }, + { + "epoch": 1.8865311697793268, + "grad_norm": 0.6741235256195068, + "learning_rate": 1.648127128678634e-07, + "loss": 0.1981, + "step": 20026 + }, + { + "epoch": 1.8866253738724947, + "grad_norm": 0.6544743776321411, + "learning_rate": 1.6453980424512405e-07, + "loss": 0.2128, + "step": 20027 + }, + { + "epoch": 1.8867195779656627, + "grad_norm": 0.6516964435577393, + "learning_rate": 1.6426711988609012e-07, + "loss": 0.2026, + "step": 20028 + }, + { + "epoch": 1.8868137820588304, + "grad_norm": 0.6945900917053223, + "learning_rate": 1.6399465979697548e-07, + "loss": 0.195, + "step": 20029 + }, + { + "epoch": 1.8869079861519982, + "grad_norm": 0.6723647117614746, + "learning_rate": 1.6372242398399517e-07, + "loss": 0.1887, + "step": 20030 + }, + { + "epoch": 1.8870021902451661, + "grad_norm": 0.6073116660118103, + "learning_rate": 1.6345041245335536e-07, + "loss": 0.1613, + "step": 20031 + }, + { + "epoch": 1.887096394338334, + "grad_norm": 0.8018856048583984, + "learning_rate": 1.6317862521125883e-07, + "loss": 0.2245, + "step": 20032 + }, + { + "epoch": 1.8871905984315018, + "grad_norm": 0.6451196670532227, + "learning_rate": 1.6290706226390286e-07, + "loss": 0.1764, + "step": 20033 + }, + { + "epoch": 1.8872848025246696, + "grad_norm": 0.6342352032661438, + "learning_rate": 1.6263572361747805e-07, + "loss": 0.1632, + "step": 20034 + }, + { + "epoch": 1.8873790066178375, + "grad_norm": 0.7167083024978638, + "learning_rate": 1.6236460927817387e-07, + "loss": 0.205, + "step": 20035 + }, + { + "epoch": 1.8874732107110055, + "grad_norm": 0.651252269744873, + "learning_rate": 1.6209371925216987e-07, + "loss": 0.1874, + "step": 20036 + }, + { + "epoch": 1.8875674148041732, + "grad_norm": 0.7464094758033752, + "learning_rate": 1.618230535456433e-07, + "loss": 0.1811, + "step": 20037 + }, + { + "epoch": 1.887661618897341, + "grad_norm": 0.6483302116394043, + "learning_rate": 1.6155261216476814e-07, + "loss": 0.1919, + "step": 20038 + }, + { + "epoch": 1.887755822990509, + "grad_norm": 0.6636731028556824, + "learning_rate": 1.612823951157072e-07, + "loss": 0.1963, + "step": 20039 + }, + { + "epoch": 1.8878500270836769, + "grad_norm": 0.6971153616905212, + "learning_rate": 1.6101240240462334e-07, + "loss": 0.1915, + "step": 20040 + }, + { + "epoch": 1.8879442311768446, + "grad_norm": 0.6511744856834412, + "learning_rate": 1.6074263403767388e-07, + "loss": 0.2154, + "step": 20041 + }, + { + "epoch": 1.8880384352700124, + "grad_norm": 0.7304086685180664, + "learning_rate": 1.6047309002100832e-07, + "loss": 0.2097, + "step": 20042 + }, + { + "epoch": 1.8881326393631803, + "grad_norm": 0.6479068398475647, + "learning_rate": 1.6020377036077395e-07, + "loss": 0.1919, + "step": 20043 + }, + { + "epoch": 1.8882268434563483, + "grad_norm": 0.6447890996932983, + "learning_rate": 1.5993467506311144e-07, + "loss": 0.1861, + "step": 20044 + }, + { + "epoch": 1.888321047549516, + "grad_norm": 0.6425917148590088, + "learning_rate": 1.5966580413415588e-07, + "loss": 0.1676, + "step": 20045 + }, + { + "epoch": 1.8884152516426838, + "grad_norm": 0.7210531830787659, + "learning_rate": 1.5939715758003794e-07, + "loss": 0.203, + "step": 20046 + }, + { + "epoch": 1.8885094557358517, + "grad_norm": 0.6781623363494873, + "learning_rate": 1.591287354068849e-07, + "loss": 0.2009, + "step": 20047 + }, + { + "epoch": 1.8886036598290197, + "grad_norm": 0.6199914216995239, + "learning_rate": 1.5886053762081522e-07, + "loss": 0.1801, + "step": 20048 + }, + { + "epoch": 1.8886978639221874, + "grad_norm": 0.6513656973838806, + "learning_rate": 1.585925642279429e-07, + "loss": 0.1806, + "step": 20049 + }, + { + "epoch": 1.8887920680153552, + "grad_norm": 1.187853455543518, + "learning_rate": 1.5832481523438304e-07, + "loss": 0.174, + "step": 20050 + }, + { + "epoch": 1.8888862721085231, + "grad_norm": 0.6098828315734863, + "learning_rate": 1.5805729064623742e-07, + "loss": 0.1808, + "step": 20051 + }, + { + "epoch": 1.888980476201691, + "grad_norm": 0.6046240329742432, + "learning_rate": 1.5778999046960452e-07, + "loss": 0.1749, + "step": 20052 + }, + { + "epoch": 1.8890746802948588, + "grad_norm": 0.6492502093315125, + "learning_rate": 1.5752291471058277e-07, + "loss": 0.1886, + "step": 20053 + }, + { + "epoch": 1.8891688843880265, + "grad_norm": 0.6995798945426941, + "learning_rate": 1.5725606337526067e-07, + "loss": 0.182, + "step": 20054 + }, + { + "epoch": 1.8892630884811945, + "grad_norm": 0.6726314425468445, + "learning_rate": 1.5698943646972109e-07, + "loss": 0.2054, + "step": 20055 + }, + { + "epoch": 1.8893572925743625, + "grad_norm": 0.6404174566268921, + "learning_rate": 1.5672303400004474e-07, + "loss": 0.1767, + "step": 20056 + }, + { + "epoch": 1.8894514966675302, + "grad_norm": 0.6526312232017517, + "learning_rate": 1.5645685597230675e-07, + "loss": 0.2123, + "step": 20057 + }, + { + "epoch": 1.889545700760698, + "grad_norm": 0.671974241733551, + "learning_rate": 1.5619090239257562e-07, + "loss": 0.2004, + "step": 20058 + }, + { + "epoch": 1.889639904853866, + "grad_norm": 1.1866815090179443, + "learning_rate": 1.5592517326691536e-07, + "loss": 0.2233, + "step": 20059 + }, + { + "epoch": 1.8897341089470339, + "grad_norm": 0.7386412024497986, + "learning_rate": 1.5565966860138671e-07, + "loss": 0.2209, + "step": 20060 + }, + { + "epoch": 1.8898283130402016, + "grad_norm": 0.7039284706115723, + "learning_rate": 1.5539438840204146e-07, + "loss": 0.2161, + "step": 20061 + }, + { + "epoch": 1.8899225171333693, + "grad_norm": 0.6055103540420532, + "learning_rate": 1.5512933267492813e-07, + "loss": 0.1669, + "step": 20062 + }, + { + "epoch": 1.8900167212265373, + "grad_norm": 0.650798499584198, + "learning_rate": 1.5486450142609298e-07, + "loss": 0.1972, + "step": 20063 + }, + { + "epoch": 1.8901109253197053, + "grad_norm": 0.6920046806335449, + "learning_rate": 1.5459989466157123e-07, + "loss": 0.1981, + "step": 20064 + }, + { + "epoch": 1.890205129412873, + "grad_norm": 0.6305274963378906, + "learning_rate": 1.543355123874002e-07, + "loss": 0.2177, + "step": 20065 + }, + { + "epoch": 1.8902993335060407, + "grad_norm": 0.6411617994308472, + "learning_rate": 1.54071354609604e-07, + "loss": 0.184, + "step": 20066 + }, + { + "epoch": 1.8903935375992087, + "grad_norm": 0.6737891435623169, + "learning_rate": 1.538074213342089e-07, + "loss": 0.1961, + "step": 20067 + }, + { + "epoch": 1.8904877416923767, + "grad_norm": 0.7235912084579468, + "learning_rate": 1.5354371256723232e-07, + "loss": 0.2219, + "step": 20068 + }, + { + "epoch": 1.8905819457855444, + "grad_norm": 0.6493980288505554, + "learning_rate": 1.5328022831468615e-07, + "loss": 0.174, + "step": 20069 + }, + { + "epoch": 1.8906761498787121, + "grad_norm": 0.6539517045021057, + "learning_rate": 1.5301696858257996e-07, + "loss": 0.1801, + "step": 20070 + }, + { + "epoch": 1.89077035397188, + "grad_norm": 0.6524957418441772, + "learning_rate": 1.527539333769146e-07, + "loss": 0.1695, + "step": 20071 + }, + { + "epoch": 1.890864558065048, + "grad_norm": 0.5983673334121704, + "learning_rate": 1.5249112270368848e-07, + "loss": 0.1808, + "step": 20072 + }, + { + "epoch": 1.8909587621582158, + "grad_norm": 0.6361520290374756, + "learning_rate": 1.522285365688958e-07, + "loss": 0.1751, + "step": 20073 + }, + { + "epoch": 1.8910529662513835, + "grad_norm": 0.5791687369346619, + "learning_rate": 1.5196617497852062e-07, + "loss": 0.1603, + "step": 20074 + }, + { + "epoch": 1.8911471703445515, + "grad_norm": 0.7468518614768982, + "learning_rate": 1.5170403793854816e-07, + "loss": 0.2091, + "step": 20075 + }, + { + "epoch": 1.8912413744377194, + "grad_norm": 0.6266886591911316, + "learning_rate": 1.5144212545495362e-07, + "loss": 0.2033, + "step": 20076 + }, + { + "epoch": 1.8913355785308872, + "grad_norm": 0.6483442783355713, + "learning_rate": 1.5118043753371004e-07, + "loss": 0.1796, + "step": 20077 + }, + { + "epoch": 1.891429782624055, + "grad_norm": 0.6359021067619324, + "learning_rate": 1.5091897418078482e-07, + "loss": 0.2068, + "step": 20078 + }, + { + "epoch": 1.8915239867172229, + "grad_norm": 0.6496128439903259, + "learning_rate": 1.5065773540213658e-07, + "loss": 0.1739, + "step": 20079 + }, + { + "epoch": 1.8916181908103908, + "grad_norm": 0.5661174654960632, + "learning_rate": 1.5039672120372605e-07, + "loss": 0.1427, + "step": 20080 + }, + { + "epoch": 1.8917123949035586, + "grad_norm": 0.6317970752716064, + "learning_rate": 1.5013593159150296e-07, + "loss": 0.1852, + "step": 20081 + }, + { + "epoch": 1.8918065989967263, + "grad_norm": 0.671819806098938, + "learning_rate": 1.4987536657141254e-07, + "loss": 0.2126, + "step": 20082 + }, + { + "epoch": 1.8919008030898943, + "grad_norm": 0.5922439098358154, + "learning_rate": 1.4961502614939781e-07, + "loss": 0.1864, + "step": 20083 + }, + { + "epoch": 1.8919950071830622, + "grad_norm": 0.6616160273551941, + "learning_rate": 1.4935491033139515e-07, + "loss": 0.1833, + "step": 20084 + }, + { + "epoch": 1.89208921127623, + "grad_norm": 0.6308454871177673, + "learning_rate": 1.4909501912333534e-07, + "loss": 0.1751, + "step": 20085 + }, + { + "epoch": 1.8921834153693977, + "grad_norm": 0.6988815665245056, + "learning_rate": 1.4883535253114146e-07, + "loss": 0.1915, + "step": 20086 + }, + { + "epoch": 1.8922776194625657, + "grad_norm": 0.7224921584129333, + "learning_rate": 1.4857591056073873e-07, + "loss": 0.2116, + "step": 20087 + }, + { + "epoch": 1.8923718235557336, + "grad_norm": 0.6897280216217041, + "learning_rate": 1.4831669321804022e-07, + "loss": 0.184, + "step": 20088 + }, + { + "epoch": 1.8924660276489014, + "grad_norm": 0.6268225312232971, + "learning_rate": 1.4805770050895561e-07, + "loss": 0.1891, + "step": 20089 + }, + { + "epoch": 1.892560231742069, + "grad_norm": 0.7011879682540894, + "learning_rate": 1.4779893243939358e-07, + "loss": 0.1736, + "step": 20090 + }, + { + "epoch": 1.892654435835237, + "grad_norm": 0.6202412247657776, + "learning_rate": 1.475403890152527e-07, + "loss": 0.2048, + "step": 20091 + }, + { + "epoch": 1.892748639928405, + "grad_norm": 0.6074902415275574, + "learning_rate": 1.4728207024242601e-07, + "loss": 0.1935, + "step": 20092 + }, + { + "epoch": 1.8928428440215728, + "grad_norm": 0.6898573637008667, + "learning_rate": 1.4702397612680774e-07, + "loss": 0.2253, + "step": 20093 + }, + { + "epoch": 1.8929370481147405, + "grad_norm": 0.5918618440628052, + "learning_rate": 1.4676610667428094e-07, + "loss": 0.1476, + "step": 20094 + }, + { + "epoch": 1.8930312522079085, + "grad_norm": 0.730482280254364, + "learning_rate": 1.465084618907231e-07, + "loss": 0.2003, + "step": 20095 + }, + { + "epoch": 1.8931254563010764, + "grad_norm": 0.7516427040100098, + "learning_rate": 1.462510417820129e-07, + "loss": 0.2024, + "step": 20096 + }, + { + "epoch": 1.8932196603942442, + "grad_norm": 0.6613539457321167, + "learning_rate": 1.4599384635401782e-07, + "loss": 0.182, + "step": 20097 + }, + { + "epoch": 1.893313864487412, + "grad_norm": 0.5953218340873718, + "learning_rate": 1.4573687561260096e-07, + "loss": 0.1722, + "step": 20098 + }, + { + "epoch": 1.8934080685805799, + "grad_norm": 0.6937636137008667, + "learning_rate": 1.4548012956362435e-07, + "loss": 0.2115, + "step": 20099 + }, + { + "epoch": 1.8935022726737478, + "grad_norm": 0.649179995059967, + "learning_rate": 1.4522360821294214e-07, + "loss": 0.1802, + "step": 20100 + }, + { + "epoch": 1.8935964767669156, + "grad_norm": 0.661853015422821, + "learning_rate": 1.4496731156640076e-07, + "loss": 0.1799, + "step": 20101 + }, + { + "epoch": 1.8936906808600833, + "grad_norm": 3.292248249053955, + "learning_rate": 1.4471123962984668e-07, + "loss": 0.209, + "step": 20102 + }, + { + "epoch": 1.8937848849532513, + "grad_norm": 0.6019024848937988, + "learning_rate": 1.4445539240911744e-07, + "loss": 0.1684, + "step": 20103 + }, + { + "epoch": 1.8938790890464192, + "grad_norm": 0.626997709274292, + "learning_rate": 1.4419976991004724e-07, + "loss": 0.1496, + "step": 20104 + }, + { + "epoch": 1.893973293139587, + "grad_norm": 0.708346962928772, + "learning_rate": 1.439443721384648e-07, + "loss": 0.1885, + "step": 20105 + }, + { + "epoch": 1.8940674972327547, + "grad_norm": 0.6549419164657593, + "learning_rate": 1.436891991001932e-07, + "loss": 0.1761, + "step": 20106 + }, + { + "epoch": 1.8941617013259227, + "grad_norm": 0.6317760348320007, + "learning_rate": 1.4343425080105112e-07, + "loss": 0.1765, + "step": 20107 + }, + { + "epoch": 1.8942559054190906, + "grad_norm": 0.739311933517456, + "learning_rate": 1.4317952724685168e-07, + "loss": 0.1923, + "step": 20108 + }, + { + "epoch": 1.8943501095122584, + "grad_norm": 0.6841950416564941, + "learning_rate": 1.429250284434025e-07, + "loss": 0.2034, + "step": 20109 + }, + { + "epoch": 1.894444313605426, + "grad_norm": 0.67605060338974, + "learning_rate": 1.4267075439650669e-07, + "loss": 0.2024, + "step": 20110 + }, + { + "epoch": 1.894538517698594, + "grad_norm": 0.6526362895965576, + "learning_rate": 1.4241670511196292e-07, + "loss": 0.218, + "step": 20111 + }, + { + "epoch": 1.894632721791762, + "grad_norm": 0.5401507616043091, + "learning_rate": 1.4216288059556217e-07, + "loss": 0.1535, + "step": 20112 + }, + { + "epoch": 1.8947269258849297, + "grad_norm": 0.6044568419456482, + "learning_rate": 1.419092808530953e-07, + "loss": 0.1881, + "step": 20113 + }, + { + "epoch": 1.8948211299780975, + "grad_norm": 0.9056061506271362, + "learning_rate": 1.4165590589034106e-07, + "loss": 0.1577, + "step": 20114 + }, + { + "epoch": 1.8949153340712654, + "grad_norm": 0.7809275388717651, + "learning_rate": 1.4140275571307926e-07, + "loss": 0.2021, + "step": 20115 + }, + { + "epoch": 1.8950095381644334, + "grad_norm": 0.8135993480682373, + "learning_rate": 1.4114983032707974e-07, + "loss": 0.213, + "step": 20116 + }, + { + "epoch": 1.8951037422576011, + "grad_norm": 0.7396150827407837, + "learning_rate": 1.408971297381123e-07, + "loss": 0.2063, + "step": 20117 + }, + { + "epoch": 1.8951979463507689, + "grad_norm": 0.6179821491241455, + "learning_rate": 1.4064465395193794e-07, + "loss": 0.1617, + "step": 20118 + }, + { + "epoch": 1.8952921504439368, + "grad_norm": 0.6806744337081909, + "learning_rate": 1.4039240297431202e-07, + "loss": 0.2023, + "step": 20119 + }, + { + "epoch": 1.8953863545371048, + "grad_norm": 0.6412350535392761, + "learning_rate": 1.401403768109888e-07, + "loss": 0.1918, + "step": 20120 + }, + { + "epoch": 1.8954805586302725, + "grad_norm": 0.6875594854354858, + "learning_rate": 1.3988857546771262e-07, + "loss": 0.2108, + "step": 20121 + }, + { + "epoch": 1.8955747627234403, + "grad_norm": 0.6698450446128845, + "learning_rate": 1.3963699895022664e-07, + "loss": 0.196, + "step": 20122 + }, + { + "epoch": 1.8956689668166082, + "grad_norm": 0.9093018174171448, + "learning_rate": 1.3938564726426517e-07, + "loss": 0.1867, + "step": 20123 + }, + { + "epoch": 1.8957631709097762, + "grad_norm": 0.6904942393302917, + "learning_rate": 1.3913452041556253e-07, + "loss": 0.2227, + "step": 20124 + }, + { + "epoch": 1.8958573750029437, + "grad_norm": 0.6031886339187622, + "learning_rate": 1.3888361840984187e-07, + "loss": 0.1764, + "step": 20125 + }, + { + "epoch": 1.8959515790961117, + "grad_norm": 0.6098013520240784, + "learning_rate": 1.3863294125282532e-07, + "loss": 0.192, + "step": 20126 + }, + { + "epoch": 1.8960457831892796, + "grad_norm": 0.6808583736419678, + "learning_rate": 1.3838248895022832e-07, + "loss": 0.1942, + "step": 20127 + }, + { + "epoch": 1.8961399872824474, + "grad_norm": 0.6212560534477234, + "learning_rate": 1.3813226150776293e-07, + "loss": 0.1779, + "step": 20128 + }, + { + "epoch": 1.896234191375615, + "grad_norm": 0.6856077909469604, + "learning_rate": 1.3788225893113239e-07, + "loss": 0.235, + "step": 20129 + }, + { + "epoch": 1.896328395468783, + "grad_norm": 0.6871376037597656, + "learning_rate": 1.3763248122603878e-07, + "loss": 0.2022, + "step": 20130 + }, + { + "epoch": 1.896422599561951, + "grad_norm": 0.5714020133018494, + "learning_rate": 1.3738292839817757e-07, + "loss": 0.1616, + "step": 20131 + }, + { + "epoch": 1.8965168036551188, + "grad_norm": 0.6463074088096619, + "learning_rate": 1.3713360045323753e-07, + "loss": 0.2026, + "step": 20132 + }, + { + "epoch": 1.8966110077482865, + "grad_norm": 0.6515389084815979, + "learning_rate": 1.3688449739690524e-07, + "loss": 0.1741, + "step": 20133 + }, + { + "epoch": 1.8967052118414545, + "grad_norm": 0.6189078688621521, + "learning_rate": 1.3663561923486059e-07, + "loss": 0.1829, + "step": 20134 + }, + { + "epoch": 1.8967994159346224, + "grad_norm": 0.7111350297927856, + "learning_rate": 1.3638696597277678e-07, + "loss": 0.2132, + "step": 20135 + }, + { + "epoch": 1.8968936200277902, + "grad_norm": 0.6568692922592163, + "learning_rate": 1.361385376163238e-07, + "loss": 0.1729, + "step": 20136 + }, + { + "epoch": 1.896987824120958, + "grad_norm": 0.6552120447158813, + "learning_rate": 1.3589033417116927e-07, + "loss": 0.2014, + "step": 20137 + }, + { + "epoch": 1.8970820282141259, + "grad_norm": 0.6305198073387146, + "learning_rate": 1.356423556429687e-07, + "loss": 0.2047, + "step": 20138 + }, + { + "epoch": 1.8971762323072938, + "grad_norm": 0.6298912763595581, + "learning_rate": 1.3539460203737754e-07, + "loss": 0.1524, + "step": 20139 + }, + { + "epoch": 1.8972704364004616, + "grad_norm": 0.682796835899353, + "learning_rate": 1.3514707336004685e-07, + "loss": 0.192, + "step": 20140 + }, + { + "epoch": 1.8973646404936293, + "grad_norm": 0.5889750719070435, + "learning_rate": 1.3489976961661765e-07, + "loss": 0.1597, + "step": 20141 + }, + { + "epoch": 1.8974588445867973, + "grad_norm": 0.6618540287017822, + "learning_rate": 1.3465269081273102e-07, + "loss": 0.1992, + "step": 20142 + }, + { + "epoch": 1.8975530486799652, + "grad_norm": 0.687576413154602, + "learning_rate": 1.344058369540202e-07, + "loss": 0.1907, + "step": 20143 + }, + { + "epoch": 1.897647252773133, + "grad_norm": 0.6274227499961853, + "learning_rate": 1.3415920804611294e-07, + "loss": 0.1833, + "step": 20144 + }, + { + "epoch": 1.8977414568663007, + "grad_norm": 0.7021694183349609, + "learning_rate": 1.339128040946347e-07, + "loss": 0.1872, + "step": 20145 + }, + { + "epoch": 1.8978356609594687, + "grad_norm": 0.6623535752296448, + "learning_rate": 1.3366662510520212e-07, + "loss": 0.21, + "step": 20146 + }, + { + "epoch": 1.8979298650526366, + "grad_norm": 0.6356085538864136, + "learning_rate": 1.334206710834296e-07, + "loss": 0.1825, + "step": 20147 + }, + { + "epoch": 1.8980240691458043, + "grad_norm": 0.7023216485977173, + "learning_rate": 1.3317494203492486e-07, + "loss": 0.1884, + "step": 20148 + }, + { + "epoch": 1.898118273238972, + "grad_norm": 0.5692326426506042, + "learning_rate": 1.329294379652901e-07, + "loss": 0.1502, + "step": 20149 + }, + { + "epoch": 1.89821247733214, + "grad_norm": 0.7072599530220032, + "learning_rate": 1.326841588801242e-07, + "loss": 0.1983, + "step": 20150 + }, + { + "epoch": 1.898306681425308, + "grad_norm": 0.6492814421653748, + "learning_rate": 1.324391047850193e-07, + "loss": 0.203, + "step": 20151 + }, + { + "epoch": 1.8984008855184757, + "grad_norm": 0.6531577706336975, + "learning_rate": 1.321942756855632e-07, + "loss": 0.1945, + "step": 20152 + }, + { + "epoch": 1.8984950896116435, + "grad_norm": 0.6064938902854919, + "learning_rate": 1.3194967158733917e-07, + "loss": 0.2079, + "step": 20153 + }, + { + "epoch": 1.8985892937048114, + "grad_norm": 0.6969150900840759, + "learning_rate": 1.3170529249592278e-07, + "loss": 0.1949, + "step": 20154 + }, + { + "epoch": 1.8986834977979794, + "grad_norm": 0.6392337083816528, + "learning_rate": 1.3146113841688845e-07, + "loss": 0.172, + "step": 20155 + }, + { + "epoch": 1.8987777018911471, + "grad_norm": 0.7279536128044128, + "learning_rate": 1.3121720935580174e-07, + "loss": 0.1923, + "step": 20156 + }, + { + "epoch": 1.8988719059843149, + "grad_norm": 1.4701002836227417, + "learning_rate": 1.3097350531822485e-07, + "loss": 0.1832, + "step": 20157 + }, + { + "epoch": 1.8989661100774828, + "grad_norm": 0.6353821754455566, + "learning_rate": 1.3073002630971444e-07, + "loss": 0.1703, + "step": 20158 + }, + { + "epoch": 1.8990603141706508, + "grad_norm": 0.5935443043708801, + "learning_rate": 1.3048677233582274e-07, + "loss": 0.1731, + "step": 20159 + }, + { + "epoch": 1.8991545182638185, + "grad_norm": 0.6935319304466248, + "learning_rate": 1.3024374340209645e-07, + "loss": 0.2273, + "step": 20160 + }, + { + "epoch": 1.8992487223569863, + "grad_norm": 0.6721147298812866, + "learning_rate": 1.3000093951407667e-07, + "loss": 0.1765, + "step": 20161 + }, + { + "epoch": 1.8993429264501542, + "grad_norm": 0.684522271156311, + "learning_rate": 1.297583606773001e-07, + "loss": 0.2394, + "step": 20162 + }, + { + "epoch": 1.8994371305433222, + "grad_norm": 0.6819477081298828, + "learning_rate": 1.295160068972967e-07, + "loss": 0.2015, + "step": 20163 + }, + { + "epoch": 1.89953133463649, + "grad_norm": 0.8085724711418152, + "learning_rate": 1.2927387817959325e-07, + "loss": 0.2037, + "step": 20164 + }, + { + "epoch": 1.8996255387296577, + "grad_norm": 0.6287370324134827, + "learning_rate": 1.2903197452971196e-07, + "loss": 0.1824, + "step": 20165 + }, + { + "epoch": 1.8997197428228256, + "grad_norm": 0.6373084187507629, + "learning_rate": 1.2879029595316616e-07, + "loss": 0.1777, + "step": 20166 + }, + { + "epoch": 1.8998139469159936, + "grad_norm": 0.6652271151542664, + "learning_rate": 1.2854884245546818e-07, + "loss": 0.167, + "step": 20167 + }, + { + "epoch": 1.8999081510091613, + "grad_norm": 0.6203648447990417, + "learning_rate": 1.2830761404212355e-07, + "loss": 0.1893, + "step": 20168 + }, + { + "epoch": 1.900002355102329, + "grad_norm": 0.6763032674789429, + "learning_rate": 1.2806661071863124e-07, + "loss": 0.2128, + "step": 20169 + }, + { + "epoch": 1.900096559195497, + "grad_norm": 0.7861047983169556, + "learning_rate": 1.2782583249048798e-07, + "loss": 0.2064, + "step": 20170 + }, + { + "epoch": 1.900190763288665, + "grad_norm": 0.8120763897895813, + "learning_rate": 1.2758527936318377e-07, + "loss": 0.182, + "step": 20171 + }, + { + "epoch": 1.9002849673818327, + "grad_norm": 0.8147682547569275, + "learning_rate": 1.2734495134220204e-07, + "loss": 0.2264, + "step": 20172 + }, + { + "epoch": 1.9003791714750005, + "grad_norm": 0.6320006251335144, + "learning_rate": 1.2710484843302506e-07, + "loss": 0.1756, + "step": 20173 + }, + { + "epoch": 1.9004733755681684, + "grad_norm": 0.6364458203315735, + "learning_rate": 1.2686497064112513e-07, + "loss": 0.1835, + "step": 20174 + }, + { + "epoch": 1.9005675796613364, + "grad_norm": 0.6355153918266296, + "learning_rate": 1.2662531797197343e-07, + "loss": 0.1915, + "step": 20175 + }, + { + "epoch": 1.9006617837545041, + "grad_norm": 0.6249171495437622, + "learning_rate": 1.2638589043103445e-07, + "loss": 0.1619, + "step": 20176 + }, + { + "epoch": 1.9007559878476719, + "grad_norm": 0.7112020254135132, + "learning_rate": 1.2614668802376717e-07, + "loss": 0.1869, + "step": 20177 + }, + { + "epoch": 1.9008501919408398, + "grad_norm": 0.7528996467590332, + "learning_rate": 1.259077107556239e-07, + "loss": 0.1981, + "step": 20178 + }, + { + "epoch": 1.9009443960340078, + "grad_norm": 0.6597367525100708, + "learning_rate": 1.2566895863205698e-07, + "loss": 0.192, + "step": 20179 + }, + { + "epoch": 1.9010386001271755, + "grad_norm": 0.6589730978012085, + "learning_rate": 1.2543043165850865e-07, + "loss": 0.1977, + "step": 20180 + }, + { + "epoch": 1.9011328042203433, + "grad_norm": 0.6787749528884888, + "learning_rate": 1.2519212984041684e-07, + "loss": 0.1911, + "step": 20181 + }, + { + "epoch": 1.9012270083135112, + "grad_norm": 0.6698355078697205, + "learning_rate": 1.2495405318321606e-07, + "loss": 0.186, + "step": 20182 + }, + { + "epoch": 1.9013212124066792, + "grad_norm": 0.6706238985061646, + "learning_rate": 1.2471620169233644e-07, + "loss": 0.2028, + "step": 20183 + }, + { + "epoch": 1.901415416499847, + "grad_norm": 0.6361337304115295, + "learning_rate": 1.2447857537319918e-07, + "loss": 0.1627, + "step": 20184 + }, + { + "epoch": 1.9015096205930146, + "grad_norm": 0.6631746888160706, + "learning_rate": 1.242411742312233e-07, + "loss": 0.2048, + "step": 20185 + }, + { + "epoch": 1.9016038246861826, + "grad_norm": 0.7597262859344482, + "learning_rate": 1.2400399827182107e-07, + "loss": 0.182, + "step": 20186 + }, + { + "epoch": 1.9016980287793506, + "grad_norm": 0.7315596342086792, + "learning_rate": 1.2376704750040159e-07, + "loss": 0.1897, + "step": 20187 + }, + { + "epoch": 1.9017922328725183, + "grad_norm": 0.61861652135849, + "learning_rate": 1.2353032192236713e-07, + "loss": 0.1684, + "step": 20188 + }, + { + "epoch": 1.901886436965686, + "grad_norm": 0.7388166785240173, + "learning_rate": 1.2329382154311565e-07, + "loss": 0.216, + "step": 20189 + }, + { + "epoch": 1.901980641058854, + "grad_norm": 0.6822174191474915, + "learning_rate": 1.2305754636804056e-07, + "loss": 0.1772, + "step": 20190 + }, + { + "epoch": 1.902074845152022, + "grad_norm": 0.7317981719970703, + "learning_rate": 1.2282149640252762e-07, + "loss": 0.2467, + "step": 20191 + }, + { + "epoch": 1.9021690492451897, + "grad_norm": 0.6668426990509033, + "learning_rate": 1.2258567165195912e-07, + "loss": 0.206, + "step": 20192 + }, + { + "epoch": 1.9022632533383574, + "grad_norm": 0.6788719296455383, + "learning_rate": 1.2235007212171414e-07, + "loss": 0.1744, + "step": 20193 + }, + { + "epoch": 1.9023574574315254, + "grad_norm": 0.6401882767677307, + "learning_rate": 1.2211469781716388e-07, + "loss": 0.2099, + "step": 20194 + }, + { + "epoch": 1.9024516615246934, + "grad_norm": 0.6083666086196899, + "learning_rate": 1.2187954874367413e-07, + "loss": 0.1973, + "step": 20195 + }, + { + "epoch": 1.902545865617861, + "grad_norm": 0.6650665998458862, + "learning_rate": 1.2164462490660722e-07, + "loss": 0.1755, + "step": 20196 + }, + { + "epoch": 1.9026400697110288, + "grad_norm": 0.7086012363433838, + "learning_rate": 1.2140992631132108e-07, + "loss": 0.2069, + "step": 20197 + }, + { + "epoch": 1.9027342738041968, + "grad_norm": 0.6304386258125305, + "learning_rate": 1.211754529631659e-07, + "loss": 0.2152, + "step": 20198 + }, + { + "epoch": 1.9028284778973648, + "grad_norm": 0.6851261854171753, + "learning_rate": 1.2094120486748739e-07, + "loss": 0.2461, + "step": 20199 + }, + { + "epoch": 1.9029226819905325, + "grad_norm": 0.7031466960906982, + "learning_rate": 1.2070718202962794e-07, + "loss": 0.1928, + "step": 20200 + }, + { + "epoch": 1.9030168860837002, + "grad_norm": 0.676091730594635, + "learning_rate": 1.2047338445492442e-07, + "loss": 0.1927, + "step": 20201 + }, + { + "epoch": 1.9031110901768682, + "grad_norm": 0.6730955243110657, + "learning_rate": 1.2023981214870585e-07, + "loss": 0.1544, + "step": 20202 + }, + { + "epoch": 1.9032052942700362, + "grad_norm": 0.7274423837661743, + "learning_rate": 1.200064651162991e-07, + "loss": 0.1963, + "step": 20203 + }, + { + "epoch": 1.903299498363204, + "grad_norm": 0.6768518686294556, + "learning_rate": 1.1977334336302437e-07, + "loss": 0.2041, + "step": 20204 + }, + { + "epoch": 1.9033937024563716, + "grad_norm": 0.6196874976158142, + "learning_rate": 1.1954044689419852e-07, + "loss": 0.1871, + "step": 20205 + }, + { + "epoch": 1.9034879065495396, + "grad_norm": 0.6369749307632446, + "learning_rate": 1.1930777571512843e-07, + "loss": 0.1709, + "step": 20206 + }, + { + "epoch": 1.9035821106427075, + "grad_norm": 0.7110297083854675, + "learning_rate": 1.1907532983112424e-07, + "loss": 0.1748, + "step": 20207 + }, + { + "epoch": 1.9036763147358753, + "grad_norm": 0.6341861486434937, + "learning_rate": 1.18843109247484e-07, + "loss": 0.2121, + "step": 20208 + }, + { + "epoch": 1.903770518829043, + "grad_norm": 0.6789807081222534, + "learning_rate": 1.186111139695001e-07, + "loss": 0.2006, + "step": 20209 + }, + { + "epoch": 1.903864722922211, + "grad_norm": 0.6980389356613159, + "learning_rate": 1.1837934400246609e-07, + "loss": 0.2074, + "step": 20210 + }, + { + "epoch": 1.903958927015379, + "grad_norm": 0.7165026068687439, + "learning_rate": 1.1814779935166553e-07, + "loss": 0.1957, + "step": 20211 + }, + { + "epoch": 1.9040531311085467, + "grad_norm": 0.6362962126731873, + "learning_rate": 1.1791648002237643e-07, + "loss": 0.1841, + "step": 20212 + }, + { + "epoch": 1.9041473352017144, + "grad_norm": 0.6579410433769226, + "learning_rate": 1.1768538601987566e-07, + "loss": 0.1918, + "step": 20213 + }, + { + "epoch": 1.9042415392948824, + "grad_norm": 0.7375689744949341, + "learning_rate": 1.1745451734943125e-07, + "loss": 0.2069, + "step": 20214 + }, + { + "epoch": 1.9043357433880503, + "grad_norm": 0.8764556050300598, + "learning_rate": 1.1722387401630675e-07, + "loss": 0.1832, + "step": 20215 + }, + { + "epoch": 1.904429947481218, + "grad_norm": 0.6192811727523804, + "learning_rate": 1.169934560257624e-07, + "loss": 0.179, + "step": 20216 + }, + { + "epoch": 1.9045241515743858, + "grad_norm": 0.7142428755760193, + "learning_rate": 1.1676326338305178e-07, + "loss": 0.186, + "step": 20217 + }, + { + "epoch": 1.9046183556675538, + "grad_norm": 0.753334641456604, + "learning_rate": 1.16533296093424e-07, + "loss": 0.2124, + "step": 20218 + }, + { + "epoch": 1.9047125597607217, + "grad_norm": 0.6500793695449829, + "learning_rate": 1.1630355416212047e-07, + "loss": 0.179, + "step": 20219 + }, + { + "epoch": 1.9048067638538895, + "grad_norm": 0.708154022693634, + "learning_rate": 1.1607403759438363e-07, + "loss": 0.1871, + "step": 20220 + }, + { + "epoch": 1.9049009679470572, + "grad_norm": 0.6770479083061218, + "learning_rate": 1.1584474639544263e-07, + "loss": 0.2003, + "step": 20221 + }, + { + "epoch": 1.9049951720402252, + "grad_norm": 0.6528639197349548, + "learning_rate": 1.1561568057052775e-07, + "loss": 0.1887, + "step": 20222 + }, + { + "epoch": 1.9050893761333931, + "grad_norm": 0.650513768196106, + "learning_rate": 1.1538684012486257e-07, + "loss": 0.1968, + "step": 20223 + }, + { + "epoch": 1.9051835802265609, + "grad_norm": 0.6311100125312805, + "learning_rate": 1.1515822506366403e-07, + "loss": 0.1724, + "step": 20224 + }, + { + "epoch": 1.9052777843197286, + "grad_norm": 0.6588751077651978, + "learning_rate": 1.1492983539214576e-07, + "loss": 0.1835, + "step": 20225 + }, + { + "epoch": 1.9053719884128966, + "grad_norm": 0.6212252378463745, + "learning_rate": 1.1470167111551356e-07, + "loss": 0.2025, + "step": 20226 + }, + { + "epoch": 1.9054661925060645, + "grad_norm": 0.6927551031112671, + "learning_rate": 1.1447373223897218e-07, + "loss": 0.2378, + "step": 20227 + }, + { + "epoch": 1.9055603965992323, + "grad_norm": 0.6051120758056641, + "learning_rate": 1.1424601876771746e-07, + "loss": 0.1782, + "step": 20228 + }, + { + "epoch": 1.9056546006924, + "grad_norm": 0.6797753572463989, + "learning_rate": 1.1401853070694192e-07, + "loss": 0.2105, + "step": 20229 + }, + { + "epoch": 1.905748804785568, + "grad_norm": 0.6577433347702026, + "learning_rate": 1.137912680618336e-07, + "loss": 0.2051, + "step": 20230 + }, + { + "epoch": 1.905843008878736, + "grad_norm": 0.7104130983352661, + "learning_rate": 1.1356423083757284e-07, + "loss": 0.1819, + "step": 20231 + }, + { + "epoch": 1.9059372129719037, + "grad_norm": 0.5975115299224854, + "learning_rate": 1.1333741903933771e-07, + "loss": 0.1865, + "step": 20232 + }, + { + "epoch": 1.9060314170650714, + "grad_norm": 0.583385169506073, + "learning_rate": 1.131108326722996e-07, + "loss": 0.1636, + "step": 20233 + }, + { + "epoch": 1.9061256211582394, + "grad_norm": 0.9173849821090698, + "learning_rate": 1.1288447174162443e-07, + "loss": 0.1802, + "step": 20234 + }, + { + "epoch": 1.9062198252514073, + "grad_norm": 0.665015697479248, + "learning_rate": 1.1265833625247469e-07, + "loss": 0.1786, + "step": 20235 + }, + { + "epoch": 1.906314029344575, + "grad_norm": 0.6025359034538269, + "learning_rate": 1.1243242621000516e-07, + "loss": 0.1622, + "step": 20236 + }, + { + "epoch": 1.9064082334377428, + "grad_norm": 0.649183988571167, + "learning_rate": 1.1220674161936839e-07, + "loss": 0.1929, + "step": 20237 + }, + { + "epoch": 1.9065024375309108, + "grad_norm": 0.6834205389022827, + "learning_rate": 1.1198128248570916e-07, + "loss": 0.1898, + "step": 20238 + }, + { + "epoch": 1.9065966416240787, + "grad_norm": 0.6396623253822327, + "learning_rate": 1.117560488141689e-07, + "loss": 0.1844, + "step": 20239 + }, + { + "epoch": 1.9066908457172465, + "grad_norm": 0.6639078259468079, + "learning_rate": 1.1153104060988352e-07, + "loss": 0.224, + "step": 20240 + }, + { + "epoch": 1.9067850498104142, + "grad_norm": 0.5923094749450684, + "learning_rate": 1.1130625787798333e-07, + "loss": 0.1751, + "step": 20241 + }, + { + "epoch": 1.9068792539035821, + "grad_norm": 0.6624804735183716, + "learning_rate": 1.1108170062359313e-07, + "loss": 0.2065, + "step": 20242 + }, + { + "epoch": 1.90697345799675, + "grad_norm": 0.6475779414176941, + "learning_rate": 1.1085736885183329e-07, + "loss": 0.1869, + "step": 20243 + }, + { + "epoch": 1.9070676620899178, + "grad_norm": 0.7120112180709839, + "learning_rate": 1.1063326256782081e-07, + "loss": 0.1913, + "step": 20244 + }, + { + "epoch": 1.9071618661830856, + "grad_norm": 0.671790361404419, + "learning_rate": 1.1040938177666383e-07, + "loss": 0.1788, + "step": 20245 + }, + { + "epoch": 1.9072560702762535, + "grad_norm": 0.6533803939819336, + "learning_rate": 1.1018572648346493e-07, + "loss": 0.1903, + "step": 20246 + }, + { + "epoch": 1.9073502743694215, + "grad_norm": 0.5851737260818481, + "learning_rate": 1.0996229669332892e-07, + "loss": 0.167, + "step": 20247 + }, + { + "epoch": 1.9074444784625892, + "grad_norm": 0.6730858087539673, + "learning_rate": 1.0973909241134839e-07, + "loss": 0.2212, + "step": 20248 + }, + { + "epoch": 1.907538682555757, + "grad_norm": 0.5762879848480225, + "learning_rate": 1.0951611364260927e-07, + "loss": 0.1573, + "step": 20249 + }, + { + "epoch": 1.907632886648925, + "grad_norm": 0.6985439658164978, + "learning_rate": 1.0929336039220084e-07, + "loss": 0.2021, + "step": 20250 + }, + { + "epoch": 1.907727090742093, + "grad_norm": 0.7053725719451904, + "learning_rate": 1.0907083266520013e-07, + "loss": 0.2159, + "step": 20251 + }, + { + "epoch": 1.9078212948352606, + "grad_norm": 0.6184783577919006, + "learning_rate": 1.0884853046667976e-07, + "loss": 0.1779, + "step": 20252 + }, + { + "epoch": 1.9079154989284284, + "grad_norm": 0.6174992322921753, + "learning_rate": 1.086264538017101e-07, + "loss": 0.1519, + "step": 20253 + }, + { + "epoch": 1.9080097030215963, + "grad_norm": 0.7269670963287354, + "learning_rate": 1.0840460267535601e-07, + "loss": 0.2499, + "step": 20254 + }, + { + "epoch": 1.9081039071147643, + "grad_norm": 0.6113536357879639, + "learning_rate": 1.0818297709267344e-07, + "loss": 0.1747, + "step": 20255 + }, + { + "epoch": 1.908198111207932, + "grad_norm": 0.7576474547386169, + "learning_rate": 1.0796157705871724e-07, + "loss": 0.2002, + "step": 20256 + }, + { + "epoch": 1.9082923153010998, + "grad_norm": 0.6527425646781921, + "learning_rate": 1.0774040257853557e-07, + "loss": 0.1962, + "step": 20257 + }, + { + "epoch": 1.9083865193942677, + "grad_norm": 5.325571537017822, + "learning_rate": 1.075194536571711e-07, + "loss": 0.2052, + "step": 20258 + }, + { + "epoch": 1.9084807234874357, + "grad_norm": 0.6798804998397827, + "learning_rate": 1.0729873029966198e-07, + "loss": 0.204, + "step": 20259 + }, + { + "epoch": 1.9085749275806034, + "grad_norm": 0.6803638935089111, + "learning_rate": 1.0707823251104199e-07, + "loss": 0.1818, + "step": 20260 + }, + { + "epoch": 1.9086691316737712, + "grad_norm": 0.572781503200531, + "learning_rate": 1.068579602963371e-07, + "loss": 0.1903, + "step": 20261 + }, + { + "epoch": 1.9087633357669391, + "grad_norm": 0.7031172513961792, + "learning_rate": 1.0663791366056997e-07, + "loss": 0.2152, + "step": 20262 + }, + { + "epoch": 1.9088575398601069, + "grad_norm": 0.5797815322875977, + "learning_rate": 1.0641809260876101e-07, + "loss": 0.1712, + "step": 20263 + }, + { + "epoch": 1.9089517439532746, + "grad_norm": 0.7296099066734314, + "learning_rate": 1.0619849714591957e-07, + "loss": 0.2167, + "step": 20264 + }, + { + "epoch": 1.9090459480464426, + "grad_norm": 0.871605396270752, + "learning_rate": 1.0597912727705273e-07, + "loss": 0.1918, + "step": 20265 + }, + { + "epoch": 1.9091401521396105, + "grad_norm": 0.6410840153694153, + "learning_rate": 1.0575998300716317e-07, + "loss": 0.178, + "step": 20266 + }, + { + "epoch": 1.9092343562327783, + "grad_norm": 0.6435098648071289, + "learning_rate": 1.05541064341248e-07, + "loss": 0.1723, + "step": 20267 + }, + { + "epoch": 1.909328560325946, + "grad_norm": 0.6844145059585571, + "learning_rate": 1.0532237128429879e-07, + "loss": 0.1897, + "step": 20268 + }, + { + "epoch": 1.909422764419114, + "grad_norm": 0.6417859196662903, + "learning_rate": 1.0510390384130153e-07, + "loss": 0.1831, + "step": 20269 + }, + { + "epoch": 1.909516968512282, + "grad_norm": 0.5997002720832825, + "learning_rate": 1.0488566201723782e-07, + "loss": 0.1719, + "step": 20270 + }, + { + "epoch": 1.9096111726054497, + "grad_norm": 0.6502866744995117, + "learning_rate": 1.0466764581708478e-07, + "loss": 0.1949, + "step": 20271 + }, + { + "epoch": 1.9097053766986174, + "grad_norm": 0.6558030247688293, + "learning_rate": 1.0444985524581285e-07, + "loss": 0.2132, + "step": 20272 + }, + { + "epoch": 1.9097995807917854, + "grad_norm": 0.6933730244636536, + "learning_rate": 1.0423229030838811e-07, + "loss": 0.1989, + "step": 20273 + }, + { + "epoch": 1.9098937848849533, + "grad_norm": 0.7598581314086914, + "learning_rate": 1.0401495100976988e-07, + "loss": 0.2297, + "step": 20274 + }, + { + "epoch": 1.909987988978121, + "grad_norm": 0.6667605042457581, + "learning_rate": 1.0379783735491756e-07, + "loss": 0.1992, + "step": 20275 + }, + { + "epoch": 1.9100821930712888, + "grad_norm": 0.7057095170021057, + "learning_rate": 1.0358094934877716e-07, + "loss": 0.1963, + "step": 20276 + }, + { + "epoch": 1.9101763971644568, + "grad_norm": 0.6734713912010193, + "learning_rate": 1.0336428699629696e-07, + "loss": 0.2209, + "step": 20277 + }, + { + "epoch": 1.9102706012576247, + "grad_norm": 0.691481351852417, + "learning_rate": 1.0314785030241636e-07, + "loss": 0.1905, + "step": 20278 + }, + { + "epoch": 1.9103648053507924, + "grad_norm": 0.7067854404449463, + "learning_rate": 1.0293163927207139e-07, + "loss": 0.1922, + "step": 20279 + }, + { + "epoch": 1.9104590094439602, + "grad_norm": 0.6748788952827454, + "learning_rate": 1.0271565391018922e-07, + "loss": 0.1621, + "step": 20280 + }, + { + "epoch": 1.9105532135371281, + "grad_norm": 0.6618409156799316, + "learning_rate": 1.0249989422169926e-07, + "loss": 0.1876, + "step": 20281 + }, + { + "epoch": 1.910647417630296, + "grad_norm": 0.7199574112892151, + "learning_rate": 1.0228436021151645e-07, + "loss": 0.196, + "step": 20282 + }, + { + "epoch": 1.9107416217234638, + "grad_norm": 0.7408638000488281, + "learning_rate": 1.0206905188455796e-07, + "loss": 0.1892, + "step": 20283 + }, + { + "epoch": 1.9108358258166316, + "grad_norm": 0.6736592054367065, + "learning_rate": 1.0185396924573209e-07, + "loss": 0.2049, + "step": 20284 + }, + { + "epoch": 1.9109300299097995, + "grad_norm": 0.6533849239349365, + "learning_rate": 1.0163911229994494e-07, + "loss": 0.1868, + "step": 20285 + }, + { + "epoch": 1.9110242340029675, + "grad_norm": 0.6521468162536621, + "learning_rate": 1.0142448105209257e-07, + "loss": 0.1949, + "step": 20286 + }, + { + "epoch": 1.9111184380961352, + "grad_norm": 0.6551706790924072, + "learning_rate": 1.0121007550707107e-07, + "loss": 0.1896, + "step": 20287 + }, + { + "epoch": 1.911212642189303, + "grad_norm": 0.7433573007583618, + "learning_rate": 1.0099589566976875e-07, + "loss": 0.1899, + "step": 20288 + }, + { + "epoch": 1.911306846282471, + "grad_norm": 0.6351568102836609, + "learning_rate": 1.0078194154506948e-07, + "loss": 0.1748, + "step": 20289 + }, + { + "epoch": 1.911401050375639, + "grad_norm": 0.6436189413070679, + "learning_rate": 1.0056821313785048e-07, + "loss": 0.2004, + "step": 20290 + }, + { + "epoch": 1.9114952544688066, + "grad_norm": 0.6571237444877625, + "learning_rate": 1.0035471045298673e-07, + "loss": 0.1974, + "step": 20291 + }, + { + "epoch": 1.9115894585619744, + "grad_norm": 0.6632843017578125, + "learning_rate": 1.0014143349534433e-07, + "loss": 0.1633, + "step": 20292 + }, + { + "epoch": 1.9116836626551423, + "grad_norm": 0.7167983055114746, + "learning_rate": 9.99283822697894e-08, + "loss": 0.1928, + "step": 20293 + }, + { + "epoch": 1.9117778667483103, + "grad_norm": 0.673212468624115, + "learning_rate": 9.971555678117695e-08, + "loss": 0.195, + "step": 20294 + }, + { + "epoch": 1.911872070841478, + "grad_norm": 0.6878961324691772, + "learning_rate": 9.950295703436086e-08, + "loss": 0.1676, + "step": 20295 + }, + { + "epoch": 1.9119662749346458, + "grad_norm": 0.6188074946403503, + "learning_rate": 9.929058303418947e-08, + "loss": 0.1617, + "step": 20296 + }, + { + "epoch": 1.9120604790278137, + "grad_norm": 0.6131939888000488, + "learning_rate": 9.90784347855045e-08, + "loss": 0.1658, + "step": 20297 + }, + { + "epoch": 1.9121546831209817, + "grad_norm": 0.752884030342102, + "learning_rate": 9.886651229314315e-08, + "loss": 0.1878, + "step": 20298 + }, + { + "epoch": 1.9122488872141494, + "grad_norm": 0.7726864218711853, + "learning_rate": 9.865481556193713e-08, + "loss": 0.2343, + "step": 20299 + }, + { + "epoch": 1.9123430913073172, + "grad_norm": 0.6393436789512634, + "learning_rate": 9.844334459671479e-08, + "loss": 0.1942, + "step": 20300 + }, + { + "epoch": 1.9124372954004851, + "grad_norm": 0.7119874954223633, + "learning_rate": 9.823209940229673e-08, + "loss": 0.1838, + "step": 20301 + }, + { + "epoch": 1.912531499493653, + "grad_norm": 0.6908301711082458, + "learning_rate": 9.80210799835002e-08, + "loss": 0.2055, + "step": 20302 + }, + { + "epoch": 1.9126257035868208, + "grad_norm": 0.8175268173217773, + "learning_rate": 9.7810286345138e-08, + "loss": 0.1851, + "step": 20303 + }, + { + "epoch": 1.9127199076799886, + "grad_norm": 0.6205576658248901, + "learning_rate": 9.75997184920141e-08, + "loss": 0.1817, + "step": 20304 + }, + { + "epoch": 1.9128141117731565, + "grad_norm": 0.656181275844574, + "learning_rate": 9.738937642893132e-08, + "loss": 0.191, + "step": 20305 + }, + { + "epoch": 1.9129083158663245, + "grad_norm": 0.6175089478492737, + "learning_rate": 9.717926016068468e-08, + "loss": 0.1718, + "step": 20306 + }, + { + "epoch": 1.9130025199594922, + "grad_norm": 0.6579227447509766, + "learning_rate": 9.696936969206705e-08, + "loss": 0.182, + "step": 20307 + }, + { + "epoch": 1.91309672405266, + "grad_norm": 0.7359580993652344, + "learning_rate": 9.675970502786236e-08, + "loss": 0.1923, + "step": 20308 + }, + { + "epoch": 1.913190928145828, + "grad_norm": 0.6713550090789795, + "learning_rate": 9.655026617285235e-08, + "loss": 0.1953, + "step": 20309 + }, + { + "epoch": 1.9132851322389959, + "grad_norm": 0.6320589780807495, + "learning_rate": 9.634105313181207e-08, + "loss": 0.1974, + "step": 20310 + }, + { + "epoch": 1.9133793363321636, + "grad_norm": 0.6372619867324829, + "learning_rate": 9.613206590951219e-08, + "loss": 0.1738, + "step": 20311 + }, + { + "epoch": 1.9134735404253314, + "grad_norm": 0.687659740447998, + "learning_rate": 9.592330451071774e-08, + "loss": 0.1917, + "step": 20312 + }, + { + "epoch": 1.9135677445184993, + "grad_norm": 0.7863016724586487, + "learning_rate": 9.571476894018938e-08, + "loss": 0.2445, + "step": 20313 + }, + { + "epoch": 1.9136619486116673, + "grad_norm": 0.7134937644004822, + "learning_rate": 9.550645920268109e-08, + "loss": 0.2446, + "step": 20314 + }, + { + "epoch": 1.913756152704835, + "grad_norm": 0.6415748596191406, + "learning_rate": 9.52983753029435e-08, + "loss": 0.2007, + "step": 20315 + }, + { + "epoch": 1.9138503567980027, + "grad_norm": 0.7357137799263, + "learning_rate": 9.50905172457206e-08, + "loss": 0.212, + "step": 20316 + }, + { + "epoch": 1.9139445608911707, + "grad_norm": 0.6473010778427124, + "learning_rate": 9.488288503575193e-08, + "loss": 0.1914, + "step": 20317 + }, + { + "epoch": 1.9140387649843387, + "grad_norm": 0.6214069128036499, + "learning_rate": 9.467547867777261e-08, + "loss": 0.1621, + "step": 20318 + }, + { + "epoch": 1.9141329690775064, + "grad_norm": 0.7933803796768188, + "learning_rate": 9.446829817650993e-08, + "loss": 0.2149, + "step": 20319 + }, + { + "epoch": 1.9142271731706741, + "grad_norm": 0.5886857509613037, + "learning_rate": 9.426134353669015e-08, + "loss": 0.1725, + "step": 20320 + }, + { + "epoch": 1.914321377263842, + "grad_norm": 0.6440650224685669, + "learning_rate": 9.405461476303168e-08, + "loss": 0.1838, + "step": 20321 + }, + { + "epoch": 1.91441558135701, + "grad_norm": 0.695639431476593, + "learning_rate": 9.384811186024744e-08, + "loss": 0.2212, + "step": 20322 + }, + { + "epoch": 1.9145097854501778, + "grad_norm": 0.6366848349571228, + "learning_rate": 9.36418348330459e-08, + "loss": 0.1734, + "step": 20323 + }, + { + "epoch": 1.9146039895433455, + "grad_norm": 0.6480180025100708, + "learning_rate": 9.343578368613215e-08, + "loss": 0.1764, + "step": 20324 + }, + { + "epoch": 1.9146981936365135, + "grad_norm": 0.6710912585258484, + "learning_rate": 9.322995842420247e-08, + "loss": 0.1866, + "step": 20325 + }, + { + "epoch": 1.9147923977296815, + "grad_norm": 0.6814228892326355, + "learning_rate": 9.302435905195084e-08, + "loss": 0.2081, + "step": 20326 + }, + { + "epoch": 1.9148866018228492, + "grad_norm": 0.6365862488746643, + "learning_rate": 9.281898557406577e-08, + "loss": 0.1719, + "step": 20327 + }, + { + "epoch": 1.914980805916017, + "grad_norm": 0.6735439896583557, + "learning_rate": 9.261383799523016e-08, + "loss": 0.1744, + "step": 20328 + }, + { + "epoch": 1.915075010009185, + "grad_norm": 0.6732083559036255, + "learning_rate": 9.240891632011917e-08, + "loss": 0.2123, + "step": 20329 + }, + { + "epoch": 1.9151692141023529, + "grad_norm": 0.6826969981193542, + "learning_rate": 9.220422055340794e-08, + "loss": 0.2134, + "step": 20330 + }, + { + "epoch": 1.9152634181955206, + "grad_norm": 0.6632513999938965, + "learning_rate": 9.199975069976496e-08, + "loss": 0.2069, + "step": 20331 + }, + { + "epoch": 1.9153576222886883, + "grad_norm": 0.6673405170440674, + "learning_rate": 9.179550676384874e-08, + "loss": 0.1835, + "step": 20332 + }, + { + "epoch": 1.9154518263818563, + "grad_norm": 0.6578190326690674, + "learning_rate": 9.159148875031887e-08, + "loss": 0.1841, + "step": 20333 + }, + { + "epoch": 1.9155460304750243, + "grad_norm": 1.0255050659179688, + "learning_rate": 9.138769666382719e-08, + "loss": 0.1841, + "step": 20334 + }, + { + "epoch": 1.915640234568192, + "grad_norm": 0.6334037184715271, + "learning_rate": 9.118413050901886e-08, + "loss": 0.2038, + "step": 20335 + }, + { + "epoch": 1.9157344386613597, + "grad_norm": 0.70944744348526, + "learning_rate": 9.098079029053686e-08, + "loss": 0.198, + "step": 20336 + }, + { + "epoch": 1.9158286427545277, + "grad_norm": 0.6494820713996887, + "learning_rate": 9.077767601301856e-08, + "loss": 0.18, + "step": 20337 + }, + { + "epoch": 1.9159228468476956, + "grad_norm": 0.6620484590530396, + "learning_rate": 9.057478768109362e-08, + "loss": 0.1865, + "step": 20338 + }, + { + "epoch": 1.9160170509408634, + "grad_norm": 0.7518098950386047, + "learning_rate": 9.037212529938832e-08, + "loss": 0.1946, + "step": 20339 + }, + { + "epoch": 1.9161112550340311, + "grad_norm": 0.6428109407424927, + "learning_rate": 9.016968887252454e-08, + "loss": 0.1787, + "step": 20340 + }, + { + "epoch": 1.916205459127199, + "grad_norm": 0.6537749171257019, + "learning_rate": 8.996747840511744e-08, + "loss": 0.1849, + "step": 20341 + }, + { + "epoch": 1.916299663220367, + "grad_norm": 0.7021124362945557, + "learning_rate": 8.97654939017778e-08, + "loss": 0.183, + "step": 20342 + }, + { + "epoch": 1.9163938673135348, + "grad_norm": 0.634307861328125, + "learning_rate": 8.956373536711194e-08, + "loss": 0.162, + "step": 20343 + }, + { + "epoch": 1.9164880714067025, + "grad_norm": 0.6356215476989746, + "learning_rate": 8.93622028057195e-08, + "loss": 0.1747, + "step": 20344 + }, + { + "epoch": 1.9165822754998705, + "grad_norm": 0.7076974511146545, + "learning_rate": 8.91608962221957e-08, + "loss": 0.2075, + "step": 20345 + }, + { + "epoch": 1.9166764795930384, + "grad_norm": 0.7133541107177734, + "learning_rate": 8.895981562113021e-08, + "loss": 0.2031, + "step": 20346 + }, + { + "epoch": 1.9167706836862062, + "grad_norm": 0.6232772469520569, + "learning_rate": 8.875896100711046e-08, + "loss": 0.1717, + "step": 20347 + }, + { + "epoch": 1.916864887779374, + "grad_norm": 0.7322257161140442, + "learning_rate": 8.855833238471279e-08, + "loss": 0.2104, + "step": 20348 + }, + { + "epoch": 1.9169590918725419, + "grad_norm": 0.6773502230644226, + "learning_rate": 8.835792975851354e-08, + "loss": 0.201, + "step": 20349 + }, + { + "epoch": 1.9170532959657098, + "grad_norm": 0.6514344215393066, + "learning_rate": 8.81577531330835e-08, + "loss": 0.1896, + "step": 20350 + }, + { + "epoch": 1.9171475000588776, + "grad_norm": 0.6321004033088684, + "learning_rate": 8.795780251298458e-08, + "loss": 0.1911, + "step": 20351 + }, + { + "epoch": 1.9172417041520453, + "grad_norm": 0.7514435052871704, + "learning_rate": 8.775807790277757e-08, + "loss": 0.1937, + "step": 20352 + }, + { + "epoch": 1.9173359082452133, + "grad_norm": 0.605535626411438, + "learning_rate": 8.75585793070155e-08, + "loss": 0.1696, + "step": 20353 + }, + { + "epoch": 1.9174301123383812, + "grad_norm": 0.6275140047073364, + "learning_rate": 8.735930673024806e-08, + "loss": 0.1816, + "step": 20354 + }, + { + "epoch": 1.917524316431549, + "grad_norm": 0.6267510056495667, + "learning_rate": 8.716026017701829e-08, + "loss": 0.1861, + "step": 20355 + }, + { + "epoch": 1.9176185205247167, + "grad_norm": 0.6523311138153076, + "learning_rate": 8.696143965186587e-08, + "loss": 0.2062, + "step": 20356 + }, + { + "epoch": 1.9177127246178847, + "grad_norm": 0.6325419545173645, + "learning_rate": 8.676284515932166e-08, + "loss": 0.177, + "step": 20357 + }, + { + "epoch": 1.9178069287110526, + "grad_norm": 0.6407454609870911, + "learning_rate": 8.656447670391755e-08, + "loss": 0.2011, + "step": 20358 + }, + { + "epoch": 1.9179011328042204, + "grad_norm": 0.6525038480758667, + "learning_rate": 8.636633429017438e-08, + "loss": 0.2089, + "step": 20359 + }, + { + "epoch": 1.917995336897388, + "grad_norm": 0.6729236245155334, + "learning_rate": 8.616841792260966e-08, + "loss": 0.1705, + "step": 20360 + }, + { + "epoch": 1.918089540990556, + "grad_norm": 0.6049158573150635, + "learning_rate": 8.597072760573754e-08, + "loss": 0.185, + "step": 20361 + }, + { + "epoch": 1.918183745083724, + "grad_norm": 0.6646718382835388, + "learning_rate": 8.577326334406555e-08, + "loss": 0.1724, + "step": 20362 + }, + { + "epoch": 1.9182779491768918, + "grad_norm": 0.6387299299240112, + "learning_rate": 8.557602514209562e-08, + "loss": 0.1698, + "step": 20363 + }, + { + "epoch": 1.9183721532700595, + "grad_norm": 0.6570069789886475, + "learning_rate": 8.537901300432528e-08, + "loss": 0.2044, + "step": 20364 + }, + { + "epoch": 1.9184663573632275, + "grad_norm": 0.651914119720459, + "learning_rate": 8.518222693524647e-08, + "loss": 0.1818, + "step": 20365 + }, + { + "epoch": 1.9185605614563954, + "grad_norm": 0.7072866559028625, + "learning_rate": 8.498566693934563e-08, + "loss": 0.1767, + "step": 20366 + }, + { + "epoch": 1.9186547655495632, + "grad_norm": 0.5826490521430969, + "learning_rate": 8.478933302110692e-08, + "loss": 0.1895, + "step": 20367 + }, + { + "epoch": 1.918748969642731, + "grad_norm": 0.6720185875892639, + "learning_rate": 8.459322518500568e-08, + "loss": 0.1924, + "step": 20368 + }, + { + "epoch": 1.9188431737358989, + "grad_norm": 0.6119192242622375, + "learning_rate": 8.439734343551276e-08, + "loss": 0.178, + "step": 20369 + }, + { + "epoch": 1.9189373778290668, + "grad_norm": 0.6786589026451111, + "learning_rate": 8.420168777709459e-08, + "loss": 0.1727, + "step": 20370 + }, + { + "epoch": 1.9190315819222346, + "grad_norm": 0.6455722451210022, + "learning_rate": 8.400625821421426e-08, + "loss": 0.1887, + "step": 20371 + }, + { + "epoch": 1.9191257860154023, + "grad_norm": 0.721203625202179, + "learning_rate": 8.38110547513249e-08, + "loss": 0.2199, + "step": 20372 + }, + { + "epoch": 1.9192199901085703, + "grad_norm": 0.6263687014579773, + "learning_rate": 8.36160773928807e-08, + "loss": 0.1862, + "step": 20373 + }, + { + "epoch": 1.9193141942017382, + "grad_norm": 0.6503720283508301, + "learning_rate": 8.342132614332587e-08, + "loss": 0.1976, + "step": 20374 + }, + { + "epoch": 1.919408398294906, + "grad_norm": 0.6442580223083496, + "learning_rate": 8.322680100710023e-08, + "loss": 0.1629, + "step": 20375 + }, + { + "epoch": 1.9195026023880737, + "grad_norm": 0.6746324896812439, + "learning_rate": 8.303250198864021e-08, + "loss": 0.2157, + "step": 20376 + }, + { + "epoch": 1.9195968064812416, + "grad_norm": 0.7027804255485535, + "learning_rate": 8.283842909237671e-08, + "loss": 0.1923, + "step": 20377 + }, + { + "epoch": 1.9196910105744096, + "grad_norm": 0.7159023284912109, + "learning_rate": 8.264458232273398e-08, + "loss": 0.2157, + "step": 20378 + }, + { + "epoch": 1.9197852146675773, + "grad_norm": 0.6947713494300842, + "learning_rate": 8.24509616841318e-08, + "loss": 0.2016, + "step": 20379 + }, + { + "epoch": 1.919879418760745, + "grad_norm": 0.694394052028656, + "learning_rate": 8.225756718098554e-08, + "loss": 0.2118, + "step": 20380 + }, + { + "epoch": 1.919973622853913, + "grad_norm": 0.7258616089820862, + "learning_rate": 8.206439881770611e-08, + "loss": 0.204, + "step": 20381 + }, + { + "epoch": 1.920067826947081, + "grad_norm": 0.6162565350532532, + "learning_rate": 8.187145659869445e-08, + "loss": 0.1816, + "step": 20382 + }, + { + "epoch": 1.9201620310402487, + "grad_norm": 0.7104194164276123, + "learning_rate": 8.167874052835367e-08, + "loss": 0.1873, + "step": 20383 + }, + { + "epoch": 1.9202562351334165, + "grad_norm": 0.699333667755127, + "learning_rate": 8.148625061107695e-08, + "loss": 0.2319, + "step": 20384 + }, + { + "epoch": 1.9203504392265844, + "grad_norm": 0.629817008972168, + "learning_rate": 8.1293986851253e-08, + "loss": 0.1998, + "step": 20385 + }, + { + "epoch": 1.9204446433197524, + "grad_norm": 0.7063592672348022, + "learning_rate": 8.110194925326498e-08, + "loss": 0.1816, + "step": 20386 + }, + { + "epoch": 1.9205388474129201, + "grad_norm": 0.6157312989234924, + "learning_rate": 8.09101378214927e-08, + "loss": 0.1853, + "step": 20387 + }, + { + "epoch": 1.9206330515060879, + "grad_norm": 0.6738056540489197, + "learning_rate": 8.071855256030936e-08, + "loss": 0.2251, + "step": 20388 + }, + { + "epoch": 1.9207272555992558, + "grad_norm": 0.636600136756897, + "learning_rate": 8.052719347408367e-08, + "loss": 0.1744, + "step": 20389 + }, + { + "epoch": 1.9208214596924238, + "grad_norm": 0.6762627363204956, + "learning_rate": 8.03360605671788e-08, + "loss": 0.191, + "step": 20390 + }, + { + "epoch": 1.9209156637855915, + "grad_norm": 0.6872976422309875, + "learning_rate": 8.014515384395239e-08, + "loss": 0.1777, + "step": 20391 + }, + { + "epoch": 1.9210098678787593, + "grad_norm": 0.6908997893333435, + "learning_rate": 7.99544733087576e-08, + "loss": 0.1986, + "step": 20392 + }, + { + "epoch": 1.9211040719719272, + "grad_norm": 0.6895566582679749, + "learning_rate": 7.976401896594322e-08, + "loss": 0.1888, + "step": 20393 + }, + { + "epoch": 1.9211982760650952, + "grad_norm": 0.5949784517288208, + "learning_rate": 7.957379081985017e-08, + "loss": 0.1556, + "step": 20394 + }, + { + "epoch": 1.921292480158263, + "grad_norm": 0.6485186219215393, + "learning_rate": 7.938378887481834e-08, + "loss": 0.1813, + "step": 20395 + }, + { + "epoch": 1.9213866842514307, + "grad_norm": 0.6753706336021423, + "learning_rate": 7.91940131351776e-08, + "loss": 0.1891, + "step": 20396 + }, + { + "epoch": 1.9214808883445986, + "grad_norm": 0.73382967710495, + "learning_rate": 7.90044636052556e-08, + "loss": 0.219, + "step": 20397 + }, + { + "epoch": 1.9215750924377666, + "grad_norm": 0.6741048097610474, + "learning_rate": 7.881514028937664e-08, + "loss": 0.158, + "step": 20398 + }, + { + "epoch": 1.9216692965309343, + "grad_norm": 0.7196323871612549, + "learning_rate": 7.862604319185507e-08, + "loss": 0.2049, + "step": 20399 + }, + { + "epoch": 1.921763500624102, + "grad_norm": 0.6244286298751831, + "learning_rate": 7.843717231700299e-08, + "loss": 0.194, + "step": 20400 + }, + { + "epoch": 1.92185770471727, + "grad_norm": 0.7138739228248596, + "learning_rate": 7.824852766912806e-08, + "loss": 0.1892, + "step": 20401 + }, + { + "epoch": 1.9219519088104378, + "grad_norm": 0.6458199620246887, + "learning_rate": 7.806010925253016e-08, + "loss": 0.2042, + "step": 20402 + }, + { + "epoch": 1.9220461129036055, + "grad_norm": 0.6492854952812195, + "learning_rate": 7.787191707150699e-08, + "loss": 0.2186, + "step": 20403 + }, + { + "epoch": 1.9221403169967735, + "grad_norm": 0.6758908033370972, + "learning_rate": 7.768395113034955e-08, + "loss": 0.1919, + "step": 20404 + }, + { + "epoch": 1.9222345210899414, + "grad_norm": 0.7675185799598694, + "learning_rate": 7.749621143334218e-08, + "loss": 0.2034, + "step": 20405 + }, + { + "epoch": 1.9223287251831092, + "grad_norm": 0.6330274939537048, + "learning_rate": 7.730869798476704e-08, + "loss": 0.18, + "step": 20406 + }, + { + "epoch": 1.922422929276277, + "grad_norm": 0.7554860711097717, + "learning_rate": 7.712141078889957e-08, + "loss": 0.1995, + "step": 20407 + }, + { + "epoch": 1.9225171333694449, + "grad_norm": 0.7081100940704346, + "learning_rate": 7.693434985000969e-08, + "loss": 0.1805, + "step": 20408 + }, + { + "epoch": 1.9226113374626128, + "grad_norm": 0.6767376661300659, + "learning_rate": 7.67475151723629e-08, + "loss": 0.2089, + "step": 20409 + }, + { + "epoch": 1.9227055415557806, + "grad_norm": 0.7426806092262268, + "learning_rate": 7.65609067602191e-08, + "loss": 0.1946, + "step": 20410 + }, + { + "epoch": 1.9227997456489483, + "grad_norm": 0.6279316544532776, + "learning_rate": 7.637452461783379e-08, + "loss": 0.1672, + "step": 20411 + }, + { + "epoch": 1.9228939497421162, + "grad_norm": 0.6278228163719177, + "learning_rate": 7.618836874945579e-08, + "loss": 0.1907, + "step": 20412 + }, + { + "epoch": 1.9229881538352842, + "grad_norm": 0.7160715460777283, + "learning_rate": 7.60024391593317e-08, + "loss": 0.2292, + "step": 20413 + }, + { + "epoch": 1.923082357928452, + "grad_norm": 0.6022506356239319, + "learning_rate": 7.581673585169924e-08, + "loss": 0.1743, + "step": 20414 + }, + { + "epoch": 1.9231765620216197, + "grad_norm": 0.630576491355896, + "learning_rate": 7.56312588307928e-08, + "loss": 0.1912, + "step": 20415 + }, + { + "epoch": 1.9232707661147876, + "grad_norm": 0.6564637422561646, + "learning_rate": 7.544600810084234e-08, + "loss": 0.1843, + "step": 20416 + }, + { + "epoch": 1.9233649702079556, + "grad_norm": 0.6240977644920349, + "learning_rate": 7.526098366607227e-08, + "loss": 0.1947, + "step": 20417 + }, + { + "epoch": 1.9234591743011233, + "grad_norm": 0.6668100953102112, + "learning_rate": 7.507618553069918e-08, + "loss": 0.1758, + "step": 20418 + }, + { + "epoch": 1.923553378394291, + "grad_norm": 0.6493514180183411, + "learning_rate": 7.489161369893971e-08, + "loss": 0.1886, + "step": 20419 + }, + { + "epoch": 1.923647582487459, + "grad_norm": 0.6922727823257446, + "learning_rate": 7.470726817500052e-08, + "loss": 0.2163, + "step": 20420 + }, + { + "epoch": 1.923741786580627, + "grad_norm": 0.6386302709579468, + "learning_rate": 7.452314896308488e-08, + "loss": 0.1744, + "step": 20421 + }, + { + "epoch": 1.9238359906737947, + "grad_norm": 0.6456319689750671, + "learning_rate": 7.433925606739168e-08, + "loss": 0.1942, + "step": 20422 + }, + { + "epoch": 1.9239301947669625, + "grad_norm": 0.655189573764801, + "learning_rate": 7.41555894921131e-08, + "loss": 0.2088, + "step": 20423 + }, + { + "epoch": 1.9240243988601304, + "grad_norm": 0.7557071447372437, + "learning_rate": 7.397214924143914e-08, + "loss": 0.2351, + "step": 20424 + }, + { + "epoch": 1.9241186029532984, + "grad_norm": 0.6467447876930237, + "learning_rate": 7.378893531954979e-08, + "loss": 0.1937, + "step": 20425 + }, + { + "epoch": 1.9242128070464661, + "grad_norm": 0.6445953845977783, + "learning_rate": 7.360594773062502e-08, + "loss": 0.1799, + "step": 20426 + }, + { + "epoch": 1.9243070111396339, + "grad_norm": 0.7056514620780945, + "learning_rate": 7.342318647883595e-08, + "loss": 0.1942, + "step": 20427 + }, + { + "epoch": 1.9244012152328018, + "grad_norm": 0.6664270162582397, + "learning_rate": 7.324065156834925e-08, + "loss": 0.1935, + "step": 20428 + }, + { + "epoch": 1.9244954193259698, + "grad_norm": 0.6955601572990417, + "learning_rate": 7.305834300332715e-08, + "loss": 0.1971, + "step": 20429 + }, + { + "epoch": 1.9245896234191375, + "grad_norm": 0.6699842214584351, + "learning_rate": 7.287626078792854e-08, + "loss": 0.1635, + "step": 20430 + }, + { + "epoch": 1.9246838275123053, + "grad_norm": 0.6588364839553833, + "learning_rate": 7.269440492630342e-08, + "loss": 0.1974, + "step": 20431 + }, + { + "epoch": 1.9247780316054732, + "grad_norm": 0.629204273223877, + "learning_rate": 7.251277542259849e-08, + "loss": 0.1776, + "step": 20432 + }, + { + "epoch": 1.9248722356986412, + "grad_norm": 0.6657838225364685, + "learning_rate": 7.233137228095599e-08, + "loss": 0.1848, + "step": 20433 + }, + { + "epoch": 1.924966439791809, + "grad_norm": 0.6298301219940186, + "learning_rate": 7.215019550551039e-08, + "loss": 0.1993, + "step": 20434 + }, + { + "epoch": 1.9250606438849767, + "grad_norm": 0.982568085193634, + "learning_rate": 7.196924510039505e-08, + "loss": 0.1842, + "step": 20435 + }, + { + "epoch": 1.9251548479781446, + "grad_norm": 0.7430713772773743, + "learning_rate": 7.178852106973444e-08, + "loss": 0.2153, + "step": 20436 + }, + { + "epoch": 1.9252490520713126, + "grad_norm": 0.6544826030731201, + "learning_rate": 7.160802341765083e-08, + "loss": 0.1877, + "step": 20437 + }, + { + "epoch": 1.9253432561644803, + "grad_norm": 0.8141215443611145, + "learning_rate": 7.142775214825759e-08, + "loss": 0.1938, + "step": 20438 + }, + { + "epoch": 1.925437460257648, + "grad_norm": 0.6867548823356628, + "learning_rate": 7.124770726566699e-08, + "loss": 0.2055, + "step": 20439 + }, + { + "epoch": 1.925531664350816, + "grad_norm": 0.7175918817520142, + "learning_rate": 7.106788877398352e-08, + "loss": 0.1829, + "step": 20440 + }, + { + "epoch": 1.925625868443984, + "grad_norm": 0.6427897810935974, + "learning_rate": 7.088829667730834e-08, + "loss": 0.2073, + "step": 20441 + }, + { + "epoch": 1.9257200725371517, + "grad_norm": 0.6684300303459167, + "learning_rate": 7.070893097973486e-08, + "loss": 0.2149, + "step": 20442 + }, + { + "epoch": 1.9258142766303195, + "grad_norm": 0.6904497742652893, + "learning_rate": 7.052979168535312e-08, + "loss": 0.19, + "step": 20443 + }, + { + "epoch": 1.9259084807234874, + "grad_norm": 0.657542884349823, + "learning_rate": 7.035087879824987e-08, + "loss": 0.211, + "step": 20444 + }, + { + "epoch": 1.9260026848166554, + "grad_norm": 0.7140212059020996, + "learning_rate": 7.017219232250295e-08, + "loss": 0.2022, + "step": 20445 + }, + { + "epoch": 1.9260968889098231, + "grad_norm": 0.6623939871788025, + "learning_rate": 6.999373226218575e-08, + "loss": 0.1808, + "step": 20446 + }, + { + "epoch": 1.9261910930029909, + "grad_norm": 0.6854307055473328, + "learning_rate": 6.981549862136839e-08, + "loss": 0.2012, + "step": 20447 + }, + { + "epoch": 1.9262852970961588, + "grad_norm": 0.668220043182373, + "learning_rate": 6.963749140411647e-08, + "loss": 0.1721, + "step": 20448 + }, + { + "epoch": 1.9263795011893268, + "grad_norm": 0.6678497195243835, + "learning_rate": 6.945971061448676e-08, + "loss": 0.1829, + "step": 20449 + }, + { + "epoch": 1.9264737052824945, + "grad_norm": 0.6869219541549683, + "learning_rate": 6.928215625653267e-08, + "loss": 0.206, + "step": 20450 + }, + { + "epoch": 1.9265679093756622, + "grad_norm": 0.7499255537986755, + "learning_rate": 6.910482833430432e-08, + "loss": 0.2119, + "step": 20451 + }, + { + "epoch": 1.9266621134688302, + "grad_norm": 0.667402982711792, + "learning_rate": 6.892772685184401e-08, + "loss": 0.1866, + "step": 20452 + }, + { + "epoch": 1.9267563175619982, + "grad_norm": 0.6190139651298523, + "learning_rate": 6.875085181318963e-08, + "loss": 0.1795, + "step": 20453 + }, + { + "epoch": 1.926850521655166, + "grad_norm": 0.7362930178642273, + "learning_rate": 6.857420322237574e-08, + "loss": 0.1715, + "step": 20454 + }, + { + "epoch": 1.9269447257483336, + "grad_norm": 0.6166073083877563, + "learning_rate": 6.83977810834291e-08, + "loss": 0.1835, + "step": 20455 + }, + { + "epoch": 1.9270389298415016, + "grad_norm": 0.7105567455291748, + "learning_rate": 6.822158540037204e-08, + "loss": 0.1931, + "step": 20456 + }, + { + "epoch": 1.9271331339346696, + "grad_norm": 0.6946137547492981, + "learning_rate": 6.80456161772236e-08, + "loss": 0.1973, + "step": 20457 + }, + { + "epoch": 1.9272273380278373, + "grad_norm": 0.6640714406967163, + "learning_rate": 6.7869873417995e-08, + "loss": 0.1764, + "step": 20458 + }, + { + "epoch": 1.927321542121005, + "grad_norm": 0.6363579630851746, + "learning_rate": 6.769435712669303e-08, + "loss": 0.2156, + "step": 20459 + }, + { + "epoch": 1.927415746214173, + "grad_norm": 0.6204269528388977, + "learning_rate": 6.751906730732116e-08, + "loss": 0.1818, + "step": 20460 + }, + { + "epoch": 1.927509950307341, + "grad_norm": 0.6275210380554199, + "learning_rate": 6.734400396387508e-08, + "loss": 0.1657, + "step": 20461 + }, + { + "epoch": 1.9276041544005087, + "grad_norm": 0.7040166258811951, + "learning_rate": 6.716916710034604e-08, + "loss": 0.2073, + "step": 20462 + }, + { + "epoch": 1.9276983584936764, + "grad_norm": 0.6579363346099854, + "learning_rate": 6.699455672072197e-08, + "loss": 0.1735, + "step": 20463 + }, + { + "epoch": 1.9277925625868444, + "grad_norm": 0.6550668478012085, + "learning_rate": 6.682017282898412e-08, + "loss": 0.183, + "step": 20464 + }, + { + "epoch": 1.9278867666800124, + "grad_norm": 0.6513266563415527, + "learning_rate": 6.664601542910709e-08, + "loss": 0.2005, + "step": 20465 + }, + { + "epoch": 1.92798097077318, + "grad_norm": 0.6602237224578857, + "learning_rate": 6.647208452506437e-08, + "loss": 0.206, + "step": 20466 + }, + { + "epoch": 1.9280751748663478, + "grad_norm": 0.6728419065475464, + "learning_rate": 6.629838012082057e-08, + "loss": 0.2028, + "step": 20467 + }, + { + "epoch": 1.9281693789595158, + "grad_norm": 0.6689519882202148, + "learning_rate": 6.612490222033585e-08, + "loss": 0.1919, + "step": 20468 + }, + { + "epoch": 1.9282635830526837, + "grad_norm": 0.675183892250061, + "learning_rate": 6.595165082756594e-08, + "loss": 0.1951, + "step": 20469 + }, + { + "epoch": 1.9283577871458515, + "grad_norm": 0.6705566644668579, + "learning_rate": 6.577862594646323e-08, + "loss": 0.2098, + "step": 20470 + }, + { + "epoch": 1.9284519912390192, + "grad_norm": 0.6774200201034546, + "learning_rate": 6.560582758097011e-08, + "loss": 0.2222, + "step": 20471 + }, + { + "epoch": 1.9285461953321872, + "grad_norm": 0.7202482223510742, + "learning_rate": 6.54332557350279e-08, + "loss": 0.2014, + "step": 20472 + }, + { + "epoch": 1.9286403994253551, + "grad_norm": 0.6623703241348267, + "learning_rate": 6.526091041257231e-08, + "loss": 0.2471, + "step": 20473 + }, + { + "epoch": 1.9287346035185229, + "grad_norm": 0.7550646662712097, + "learning_rate": 6.508879161753246e-08, + "loss": 0.2039, + "step": 20474 + }, + { + "epoch": 1.9288288076116906, + "grad_norm": 0.6356313824653625, + "learning_rate": 6.491689935383294e-08, + "loss": 0.1916, + "step": 20475 + }, + { + "epoch": 1.9289230117048586, + "grad_norm": 0.5859925746917725, + "learning_rate": 6.474523362539175e-08, + "loss": 0.1876, + "step": 20476 + }, + { + "epoch": 1.9290172157980265, + "grad_norm": 0.604534924030304, + "learning_rate": 6.457379443612467e-08, + "loss": 0.1855, + "step": 20477 + }, + { + "epoch": 1.9291114198911943, + "grad_norm": 0.6435129046440125, + "learning_rate": 6.440258178994185e-08, + "loss": 0.1972, + "step": 20478 + }, + { + "epoch": 1.929205623984362, + "grad_norm": 0.6097612380981445, + "learning_rate": 6.423159569074466e-08, + "loss": 0.1838, + "step": 20479 + }, + { + "epoch": 1.92929982807753, + "grad_norm": 0.8696154952049255, + "learning_rate": 6.40608361424333e-08, + "loss": 0.2129, + "step": 20480 + }, + { + "epoch": 1.929394032170698, + "grad_norm": 0.669417679309845, + "learning_rate": 6.389030314890132e-08, + "loss": 0.1839, + "step": 20481 + }, + { + "epoch": 1.9294882362638657, + "grad_norm": 0.6988387107849121, + "learning_rate": 6.371999671403672e-08, + "loss": 0.2303, + "step": 20482 + }, + { + "epoch": 1.9295824403570334, + "grad_norm": 0.6801834106445312, + "learning_rate": 6.354991684172307e-08, + "loss": 0.2104, + "step": 20483 + }, + { + "epoch": 1.9296766444502014, + "grad_norm": 0.6566222906112671, + "learning_rate": 6.338006353583837e-08, + "loss": 0.1936, + "step": 20484 + }, + { + "epoch": 1.9297708485433693, + "grad_norm": 0.6295554041862488, + "learning_rate": 6.321043680025506e-08, + "loss": 0.1843, + "step": 20485 + }, + { + "epoch": 1.929865052636537, + "grad_norm": 0.6945067048072815, + "learning_rate": 6.304103663884231e-08, + "loss": 0.1984, + "step": 20486 + }, + { + "epoch": 1.9299592567297048, + "grad_norm": 0.6206005811691284, + "learning_rate": 6.287186305546033e-08, + "loss": 0.1999, + "step": 20487 + }, + { + "epoch": 1.9300534608228728, + "grad_norm": 0.6710313558578491, + "learning_rate": 6.270291605396938e-08, + "loss": 0.1945, + "step": 20488 + }, + { + "epoch": 1.9301476649160407, + "grad_norm": 0.666602611541748, + "learning_rate": 6.253419563821971e-08, + "loss": 0.1935, + "step": 20489 + }, + { + "epoch": 1.9302418690092085, + "grad_norm": 0.6301133632659912, + "learning_rate": 6.236570181205826e-08, + "loss": 0.1839, + "step": 20490 + }, + { + "epoch": 1.9303360731023762, + "grad_norm": 0.5872913002967834, + "learning_rate": 6.219743457932859e-08, + "loss": 0.1781, + "step": 20491 + }, + { + "epoch": 1.9304302771955442, + "grad_norm": 0.7004070281982422, + "learning_rate": 6.202939394386542e-08, + "loss": 0.2036, + "step": 20492 + }, + { + "epoch": 1.9305244812887121, + "grad_norm": 0.657778799533844, + "learning_rate": 6.186157990950236e-08, + "loss": 0.1629, + "step": 20493 + }, + { + "epoch": 1.9306186853818799, + "grad_norm": 0.7054680585861206, + "learning_rate": 6.169399248006525e-08, + "loss": 0.2054, + "step": 20494 + }, + { + "epoch": 1.9307128894750476, + "grad_norm": 0.7021805644035339, + "learning_rate": 6.152663165937434e-08, + "loss": 0.1837, + "step": 20495 + }, + { + "epoch": 1.9308070935682156, + "grad_norm": 0.6459454894065857, + "learning_rate": 6.135949745124659e-08, + "loss": 0.1786, + "step": 20496 + }, + { + "epoch": 1.9309012976613835, + "grad_norm": 0.6974847912788391, + "learning_rate": 6.119258985949227e-08, + "loss": 0.1964, + "step": 20497 + }, + { + "epoch": 1.9309955017545513, + "grad_norm": 0.6011584997177124, + "learning_rate": 6.102590888791837e-08, + "loss": 0.1849, + "step": 20498 + }, + { + "epoch": 1.931089705847719, + "grad_norm": 0.6021872758865356, + "learning_rate": 6.085945454032405e-08, + "loss": 0.1708, + "step": 20499 + }, + { + "epoch": 1.931183909940887, + "grad_norm": 0.649183452129364, + "learning_rate": 6.069322682050516e-08, + "loss": 0.1923, + "step": 20500 + }, + { + "epoch": 1.931278114034055, + "grad_norm": 0.6922840476036072, + "learning_rate": 6.052722573225312e-08, + "loss": 0.18, + "step": 20501 + }, + { + "epoch": 1.9313723181272227, + "grad_norm": 0.6575815677642822, + "learning_rate": 6.036145127935045e-08, + "loss": 0.2021, + "step": 20502 + }, + { + "epoch": 1.9314665222203904, + "grad_norm": 0.6628938317298889, + "learning_rate": 6.019590346557969e-08, + "loss": 0.1705, + "step": 20503 + }, + { + "epoch": 1.9315607263135584, + "grad_norm": 0.6496869921684265, + "learning_rate": 6.003058229471448e-08, + "loss": 0.1946, + "step": 20504 + }, + { + "epoch": 1.9316549304067263, + "grad_norm": 0.6441403031349182, + "learning_rate": 5.986548777052514e-08, + "loss": 0.183, + "step": 20505 + }, + { + "epoch": 1.931749134499894, + "grad_norm": 0.6747954487800598, + "learning_rate": 5.970061989677422e-08, + "loss": 0.1668, + "step": 20506 + }, + { + "epoch": 1.9318433385930618, + "grad_norm": 0.6658079028129578, + "learning_rate": 5.953597867722316e-08, + "loss": 0.2094, + "step": 20507 + }, + { + "epoch": 1.9319375426862297, + "grad_norm": 0.6353333592414856, + "learning_rate": 5.937156411562339e-08, + "loss": 0.1758, + "step": 20508 + }, + { + "epoch": 1.9320317467793977, + "grad_norm": 0.6521235108375549, + "learning_rate": 5.9207376215726364e-08, + "loss": 0.1817, + "step": 20509 + }, + { + "epoch": 1.9321259508725654, + "grad_norm": 0.6600897908210754, + "learning_rate": 5.9043414981275746e-08, + "loss": 0.2001, + "step": 20510 + }, + { + "epoch": 1.9322201549657332, + "grad_norm": 0.7067880630493164, + "learning_rate": 5.8879680416007446e-08, + "loss": 0.2525, + "step": 20511 + }, + { + "epoch": 1.9323143590589011, + "grad_norm": 0.6741988658905029, + "learning_rate": 5.871617252365847e-08, + "loss": 0.2186, + "step": 20512 + }, + { + "epoch": 1.932408563152069, + "grad_norm": 0.6814460158348083, + "learning_rate": 5.855289130795472e-08, + "loss": 0.1737, + "step": 20513 + }, + { + "epoch": 1.9325027672452368, + "grad_norm": 0.6088449954986572, + "learning_rate": 5.838983677261878e-08, + "loss": 0.1628, + "step": 20514 + }, + { + "epoch": 1.9325969713384046, + "grad_norm": 0.6774396896362305, + "learning_rate": 5.8227008921371006e-08, + "loss": 0.1886, + "step": 20515 + }, + { + "epoch": 1.9326911754315725, + "grad_norm": 0.6780672073364258, + "learning_rate": 5.806440775792177e-08, + "loss": 0.1772, + "step": 20516 + }, + { + "epoch": 1.9327853795247405, + "grad_norm": 0.6692124605178833, + "learning_rate": 5.7902033285979206e-08, + "loss": 0.2049, + "step": 20517 + }, + { + "epoch": 1.9328795836179082, + "grad_norm": 0.6149314045906067, + "learning_rate": 5.773988550924703e-08, + "loss": 0.1849, + "step": 20518 + }, + { + "epoch": 1.932973787711076, + "grad_norm": 0.695295512676239, + "learning_rate": 5.757796443142116e-08, + "loss": 0.2055, + "step": 20519 + }, + { + "epoch": 1.933067991804244, + "grad_norm": 0.7034395933151245, + "learning_rate": 5.74162700561931e-08, + "loss": 0.1683, + "step": 20520 + }, + { + "epoch": 1.933162195897412, + "grad_norm": 0.667819619178772, + "learning_rate": 5.7254802387252116e-08, + "loss": 0.1928, + "step": 20521 + }, + { + "epoch": 1.9332563999905796, + "grad_norm": 0.6549688577651978, + "learning_rate": 5.709356142827749e-08, + "loss": 0.1894, + "step": 20522 + }, + { + "epoch": 1.9333506040837474, + "grad_norm": 0.6827957630157471, + "learning_rate": 5.6932547182946274e-08, + "loss": 0.2203, + "step": 20523 + }, + { + "epoch": 1.9334448081769153, + "grad_norm": 0.6484357118606567, + "learning_rate": 5.677175965493109e-08, + "loss": 0.1796, + "step": 20524 + }, + { + "epoch": 1.9335390122700833, + "grad_norm": 0.6589913368225098, + "learning_rate": 5.661119884789679e-08, + "loss": 0.2064, + "step": 20525 + }, + { + "epoch": 1.933633216363251, + "grad_norm": 0.7631707191467285, + "learning_rate": 5.645086476550488e-08, + "loss": 0.2103, + "step": 20526 + }, + { + "epoch": 1.9337274204564188, + "grad_norm": 0.6485490202903748, + "learning_rate": 5.6290757411411324e-08, + "loss": 0.1945, + "step": 20527 + }, + { + "epoch": 1.9338216245495867, + "grad_norm": 0.6263374090194702, + "learning_rate": 5.613087678926765e-08, + "loss": 0.176, + "step": 20528 + }, + { + "epoch": 1.9339158286427547, + "grad_norm": 0.6242257952690125, + "learning_rate": 5.5971222902716506e-08, + "loss": 0.1901, + "step": 20529 + }, + { + "epoch": 1.9340100327359224, + "grad_norm": 0.652399480342865, + "learning_rate": 5.581179575540163e-08, + "loss": 0.1974, + "step": 20530 + }, + { + "epoch": 1.9341042368290902, + "grad_norm": 0.6437662243843079, + "learning_rate": 5.5652595350956795e-08, + "loss": 0.1727, + "step": 20531 + }, + { + "epoch": 1.9341984409222581, + "grad_norm": 0.587958812713623, + "learning_rate": 5.549362169301131e-08, + "loss": 0.1661, + "step": 20532 + }, + { + "epoch": 1.934292645015426, + "grad_norm": 0.6266226172447205, + "learning_rate": 5.533487478519117e-08, + "loss": 0.1831, + "step": 20533 + }, + { + "epoch": 1.9343868491085938, + "grad_norm": 0.7345966100692749, + "learning_rate": 5.5176354631115703e-08, + "loss": 0.1631, + "step": 20534 + }, + { + "epoch": 1.9344810532017616, + "grad_norm": 0.6884997487068176, + "learning_rate": 5.501806123439868e-08, + "loss": 0.182, + "step": 20535 + }, + { + "epoch": 1.9345752572949295, + "grad_norm": 0.6600018739700317, + "learning_rate": 5.485999459865055e-08, + "loss": 0.2295, + "step": 20536 + }, + { + "epoch": 1.9346694613880975, + "grad_norm": 0.6569324135780334, + "learning_rate": 5.47021547274762e-08, + "loss": 0.1773, + "step": 20537 + }, + { + "epoch": 1.9347636654812652, + "grad_norm": 0.7247839570045471, + "learning_rate": 5.4544541624471645e-08, + "loss": 0.2064, + "step": 20538 + }, + { + "epoch": 1.934857869574433, + "grad_norm": 0.6159790754318237, + "learning_rate": 5.438715529323291e-08, + "loss": 0.1731, + "step": 20539 + }, + { + "epoch": 1.934952073667601, + "grad_norm": 0.6411365270614624, + "learning_rate": 5.422999573734822e-08, + "loss": 0.183, + "step": 20540 + }, + { + "epoch": 1.9350462777607687, + "grad_norm": 0.7017724514007568, + "learning_rate": 5.4073062960401376e-08, + "loss": 0.1574, + "step": 20541 + }, + { + "epoch": 1.9351404818539364, + "grad_norm": 0.7132136821746826, + "learning_rate": 5.391635696596953e-08, + "loss": 0.2154, + "step": 20542 + }, + { + "epoch": 1.9352346859471043, + "grad_norm": 0.6471320390701294, + "learning_rate": 5.3759877757628696e-08, + "loss": 0.1964, + "step": 20543 + }, + { + "epoch": 1.9353288900402723, + "grad_norm": 0.6615820527076721, + "learning_rate": 5.36036253389427e-08, + "loss": 0.1993, + "step": 20544 + }, + { + "epoch": 1.93542309413344, + "grad_norm": 0.6011489033699036, + "learning_rate": 5.3447599713477574e-08, + "loss": 0.1711, + "step": 20545 + }, + { + "epoch": 1.9355172982266078, + "grad_norm": 0.6503063440322876, + "learning_rate": 5.3291800884789356e-08, + "loss": 0.2259, + "step": 20546 + }, + { + "epoch": 1.9356115023197757, + "grad_norm": 0.7811630964279175, + "learning_rate": 5.313622885643077e-08, + "loss": 0.2019, + "step": 20547 + }, + { + "epoch": 1.9357057064129437, + "grad_norm": 0.6372681856155396, + "learning_rate": 5.298088363195009e-08, + "loss": 0.1767, + "step": 20548 + }, + { + "epoch": 1.9357999105061114, + "grad_norm": 0.6439894437789917, + "learning_rate": 5.2825765214887806e-08, + "loss": 0.1816, + "step": 20549 + }, + { + "epoch": 1.9358941145992792, + "grad_norm": 0.6761665344238281, + "learning_rate": 5.2670873608782205e-08, + "loss": 0.1906, + "step": 20550 + }, + { + "epoch": 1.9359883186924471, + "grad_norm": 0.6620761156082153, + "learning_rate": 5.251620881716379e-08, + "loss": 0.1579, + "step": 20551 + }, + { + "epoch": 1.936082522785615, + "grad_norm": 0.7432847619056702, + "learning_rate": 5.236177084356087e-08, + "loss": 0.2337, + "step": 20552 + }, + { + "epoch": 1.9361767268787828, + "grad_norm": 0.6455764770507812, + "learning_rate": 5.2207559691492825e-08, + "loss": 0.1861, + "step": 20553 + }, + { + "epoch": 1.9362709309719506, + "grad_norm": 0.6577297449111938, + "learning_rate": 5.205357536447797e-08, + "loss": 0.188, + "step": 20554 + }, + { + "epoch": 1.9363651350651185, + "grad_norm": 0.6162768006324768, + "learning_rate": 5.1899817866025715e-08, + "loss": 0.193, + "step": 20555 + }, + { + "epoch": 1.9364593391582865, + "grad_norm": 0.7234073877334595, + "learning_rate": 5.174628719964325e-08, + "loss": 0.2097, + "step": 20556 + }, + { + "epoch": 1.9365535432514542, + "grad_norm": 0.59145188331604, + "learning_rate": 5.159298336883001e-08, + "loss": 0.1879, + "step": 20557 + }, + { + "epoch": 1.936647747344622, + "grad_norm": 0.6481671929359436, + "learning_rate": 5.143990637708318e-08, + "loss": 0.1887, + "step": 20558 + }, + { + "epoch": 1.93674195143779, + "grad_norm": 0.7127054929733276, + "learning_rate": 5.12870562278911e-08, + "loss": 0.2034, + "step": 20559 + }, + { + "epoch": 1.936836155530958, + "grad_norm": 0.7050976753234863, + "learning_rate": 5.113443292474096e-08, + "loss": 0.231, + "step": 20560 + }, + { + "epoch": 1.9369303596241256, + "grad_norm": 0.6505662798881531, + "learning_rate": 5.098203647111111e-08, + "loss": 0.185, + "step": 20561 + }, + { + "epoch": 1.9370245637172934, + "grad_norm": 0.6301153898239136, + "learning_rate": 5.082986687047764e-08, + "loss": 0.194, + "step": 20562 + }, + { + "epoch": 1.9371187678104613, + "grad_norm": 0.6156437397003174, + "learning_rate": 5.0677924126311116e-08, + "loss": 0.1698, + "step": 20563 + }, + { + "epoch": 1.9372129719036293, + "grad_norm": 0.6495509743690491, + "learning_rate": 5.0526208242073214e-08, + "loss": 0.1731, + "step": 20564 + }, + { + "epoch": 1.937307175996797, + "grad_norm": 0.6313520073890686, + "learning_rate": 5.037471922122561e-08, + "loss": 0.1806, + "step": 20565 + }, + { + "epoch": 1.9374013800899648, + "grad_norm": 0.6380580067634583, + "learning_rate": 5.0223457067222205e-08, + "loss": 0.1929, + "step": 20566 + }, + { + "epoch": 1.9374955841831327, + "grad_norm": 0.6428207159042358, + "learning_rate": 5.007242178351024e-08, + "loss": 0.1943, + "step": 20567 + }, + { + "epoch": 1.9375897882763007, + "grad_norm": 0.657741129398346, + "learning_rate": 4.992161337353696e-08, + "loss": 0.1745, + "step": 20568 + }, + { + "epoch": 1.9376839923694684, + "grad_norm": 0.6035696268081665, + "learning_rate": 4.977103184073739e-08, + "loss": 0.1974, + "step": 20569 + }, + { + "epoch": 1.9377781964626362, + "grad_norm": 0.6318914890289307, + "learning_rate": 4.9620677188546574e-08, + "loss": 0.2102, + "step": 20570 + }, + { + "epoch": 1.9378724005558041, + "grad_norm": 0.6037071943283081, + "learning_rate": 4.947054942039398e-08, + "loss": 0.1714, + "step": 20571 + }, + { + "epoch": 1.937966604648972, + "grad_norm": 0.575206458568573, + "learning_rate": 4.93206485397002e-08, + "loss": 0.1661, + "step": 20572 + }, + { + "epoch": 1.9380608087421398, + "grad_norm": 0.6761251091957092, + "learning_rate": 4.9170974549885844e-08, + "loss": 0.1891, + "step": 20573 + }, + { + "epoch": 1.9381550128353076, + "grad_norm": 0.6738649010658264, + "learning_rate": 4.902152745436261e-08, + "loss": 0.1772, + "step": 20574 + }, + { + "epoch": 1.9382492169284755, + "grad_norm": 0.7871718406677246, + "learning_rate": 4.8872307256537796e-08, + "loss": 0.1654, + "step": 20575 + }, + { + "epoch": 1.9383434210216435, + "grad_norm": 0.6799382567405701, + "learning_rate": 4.872331395981311e-08, + "loss": 0.1969, + "step": 20576 + }, + { + "epoch": 1.9384376251148112, + "grad_norm": 0.7592743039131165, + "learning_rate": 4.857454756758806e-08, + "loss": 0.1738, + "step": 20577 + }, + { + "epoch": 1.938531829207979, + "grad_norm": 0.6246234178543091, + "learning_rate": 4.842600808325326e-08, + "loss": 0.1758, + "step": 20578 + }, + { + "epoch": 1.938626033301147, + "grad_norm": 0.6319633722305298, + "learning_rate": 4.82776955101949e-08, + "loss": 0.1651, + "step": 20579 + }, + { + "epoch": 1.9387202373943149, + "grad_norm": 0.632641077041626, + "learning_rate": 4.812960985179693e-08, + "loss": 0.1751, + "step": 20580 + }, + { + "epoch": 1.9388144414874826, + "grad_norm": 0.6395126581192017, + "learning_rate": 4.798175111143444e-08, + "loss": 0.1976, + "step": 20581 + }, + { + "epoch": 1.9389086455806503, + "grad_norm": 0.7053261995315552, + "learning_rate": 4.783411929247805e-08, + "loss": 0.2402, + "step": 20582 + }, + { + "epoch": 1.9390028496738183, + "grad_norm": 0.6206017136573792, + "learning_rate": 4.76867143982962e-08, + "loss": 0.1891, + "step": 20583 + }, + { + "epoch": 1.9390970537669863, + "grad_norm": 0.5933031439781189, + "learning_rate": 4.7539536432249514e-08, + "loss": 0.1845, + "step": 20584 + }, + { + "epoch": 1.939191257860154, + "grad_norm": 0.6479809284210205, + "learning_rate": 4.7392585397691984e-08, + "loss": 0.1945, + "step": 20585 + }, + { + "epoch": 1.9392854619533217, + "grad_norm": 0.7193289995193481, + "learning_rate": 4.724586129797537e-08, + "loss": 0.2043, + "step": 20586 + }, + { + "epoch": 1.9393796660464897, + "grad_norm": 0.6792411804199219, + "learning_rate": 4.709936413644589e-08, + "loss": 0.1795, + "step": 20587 + }, + { + "epoch": 1.9394738701396577, + "grad_norm": 0.6745784878730774, + "learning_rate": 4.695309391644309e-08, + "loss": 0.1985, + "step": 20588 + }, + { + "epoch": 1.9395680742328254, + "grad_norm": 0.6432570219039917, + "learning_rate": 4.680705064130209e-08, + "loss": 0.1558, + "step": 20589 + }, + { + "epoch": 1.9396622783259931, + "grad_norm": 0.5894613265991211, + "learning_rate": 4.6661234314353546e-08, + "loss": 0.1871, + "step": 20590 + }, + { + "epoch": 1.939756482419161, + "grad_norm": 0.6259870529174805, + "learning_rate": 4.651564493892258e-08, + "loss": 0.1642, + "step": 20591 + }, + { + "epoch": 1.939850686512329, + "grad_norm": 0.70440274477005, + "learning_rate": 4.6370282518327646e-08, + "loss": 0.1899, + "step": 20592 + }, + { + "epoch": 1.9399448906054968, + "grad_norm": 0.6017360687255859, + "learning_rate": 4.622514705588388e-08, + "loss": 0.1562, + "step": 20593 + }, + { + "epoch": 1.9400390946986645, + "grad_norm": 0.6458675265312195, + "learning_rate": 4.608023855490085e-08, + "loss": 0.1943, + "step": 20594 + }, + { + "epoch": 1.9401332987918325, + "grad_norm": 0.6895346641540527, + "learning_rate": 4.593555701868257e-08, + "loss": 0.2013, + "step": 20595 + }, + { + "epoch": 1.9402275028850005, + "grad_norm": 0.7026508450508118, + "learning_rate": 4.5791102450527536e-08, + "loss": 0.2151, + "step": 20596 + }, + { + "epoch": 1.9403217069781682, + "grad_norm": 0.6759473085403442, + "learning_rate": 4.564687485372976e-08, + "loss": 0.1829, + "step": 20597 + }, + { + "epoch": 1.940415911071336, + "grad_norm": 0.6939036846160889, + "learning_rate": 4.550287423157773e-08, + "loss": 0.1713, + "step": 20598 + }, + { + "epoch": 1.940510115164504, + "grad_norm": 0.6667701601982117, + "learning_rate": 4.5359100587355484e-08, + "loss": 0.1779, + "step": 20599 + }, + { + "epoch": 1.9406043192576719, + "grad_norm": 0.7237780690193176, + "learning_rate": 4.5215553924340406e-08, + "loss": 0.2004, + "step": 20600 + }, + { + "epoch": 1.9406985233508396, + "grad_norm": 0.635470986366272, + "learning_rate": 4.507223424580765e-08, + "loss": 0.1792, + "step": 20601 + }, + { + "epoch": 1.9407927274440073, + "grad_norm": 0.5838043689727783, + "learning_rate": 4.4929141555021264e-08, + "loss": 0.17, + "step": 20602 + }, + { + "epoch": 1.9408869315371753, + "grad_norm": 0.7381105422973633, + "learning_rate": 4.478627585524753e-08, + "loss": 0.1825, + "step": 20603 + }, + { + "epoch": 1.9409811356303432, + "grad_norm": 0.6283450126647949, + "learning_rate": 4.464363714974274e-08, + "loss": 0.1676, + "step": 20604 + }, + { + "epoch": 1.941075339723511, + "grad_norm": 0.6756641268730164, + "learning_rate": 4.4501225441759834e-08, + "loss": 0.2005, + "step": 20605 + }, + { + "epoch": 1.9411695438166787, + "grad_norm": 0.6628778576850891, + "learning_rate": 4.4359040734544e-08, + "loss": 0.1685, + "step": 20606 + }, + { + "epoch": 1.9412637479098467, + "grad_norm": 0.6249292492866516, + "learning_rate": 4.4217083031339314e-08, + "loss": 0.1731, + "step": 20607 + }, + { + "epoch": 1.9413579520030146, + "grad_norm": 0.5770609378814697, + "learning_rate": 4.407535233538318e-08, + "loss": 0.1637, + "step": 20608 + }, + { + "epoch": 1.9414521560961824, + "grad_norm": 0.6798595190048218, + "learning_rate": 4.3933848649904133e-08, + "loss": 0.1909, + "step": 20609 + }, + { + "epoch": 1.9415463601893501, + "grad_norm": 0.6528865098953247, + "learning_rate": 4.37925719781318e-08, + "loss": 0.1786, + "step": 20610 + }, + { + "epoch": 1.941640564282518, + "grad_norm": 0.7164531946182251, + "learning_rate": 4.365152232328695e-08, + "loss": 0.1838, + "step": 20611 + }, + { + "epoch": 1.941734768375686, + "grad_norm": 0.5923558473587036, + "learning_rate": 4.351069968858479e-08, + "loss": 0.189, + "step": 20612 + }, + { + "epoch": 1.9418289724688538, + "grad_norm": 0.645929217338562, + "learning_rate": 4.3370104077236075e-08, + "loss": 0.2127, + "step": 20613 + }, + { + "epoch": 1.9419231765620215, + "grad_norm": 0.6221653819084167, + "learning_rate": 4.322973549244713e-08, + "loss": 0.1982, + "step": 20614 + }, + { + "epoch": 1.9420173806551895, + "grad_norm": 0.7193864583969116, + "learning_rate": 4.308959393741985e-08, + "loss": 0.1985, + "step": 20615 + }, + { + "epoch": 1.9421115847483574, + "grad_norm": 0.6187189221382141, + "learning_rate": 4.294967941534722e-08, + "loss": 0.2, + "step": 20616 + }, + { + "epoch": 1.9422057888415252, + "grad_norm": 0.7417852878570557, + "learning_rate": 4.280999192942115e-08, + "loss": 0.1993, + "step": 20617 + }, + { + "epoch": 1.942299992934693, + "grad_norm": 0.6823968291282654, + "learning_rate": 4.267053148282685e-08, + "loss": 0.2025, + "step": 20618 + }, + { + "epoch": 1.9423941970278609, + "grad_norm": 0.6772221922874451, + "learning_rate": 4.25312980787429e-08, + "loss": 0.1873, + "step": 20619 + }, + { + "epoch": 1.9424884011210288, + "grad_norm": 0.6777254939079285, + "learning_rate": 4.239229172034565e-08, + "loss": 0.2373, + "step": 20620 + }, + { + "epoch": 1.9425826052141966, + "grad_norm": 0.6470945477485657, + "learning_rate": 4.2253512410803666e-08, + "loss": 0.187, + "step": 20621 + }, + { + "epoch": 1.9426768093073643, + "grad_norm": 0.6564001441001892, + "learning_rate": 4.211496015328109e-08, + "loss": 0.1966, + "step": 20622 + }, + { + "epoch": 1.9427710134005323, + "grad_norm": 0.6375804543495178, + "learning_rate": 4.197663495093873e-08, + "loss": 0.2003, + "step": 20623 + }, + { + "epoch": 1.9428652174937002, + "grad_norm": 0.644978404045105, + "learning_rate": 4.183853680692851e-08, + "loss": 0.1998, + "step": 20624 + }, + { + "epoch": 1.942959421586868, + "grad_norm": 0.6153957843780518, + "learning_rate": 4.170066572440124e-08, + "loss": 0.1617, + "step": 20625 + }, + { + "epoch": 1.9430536256800357, + "grad_norm": 0.6638757586479187, + "learning_rate": 4.156302170649773e-08, + "loss": 0.1924, + "step": 20626 + }, + { + "epoch": 1.9431478297732037, + "grad_norm": 0.6751947402954102, + "learning_rate": 4.1425604756359926e-08, + "loss": 0.1853, + "step": 20627 + }, + { + "epoch": 1.9432420338663716, + "grad_norm": 0.6615750789642334, + "learning_rate": 4.1288414877118656e-08, + "loss": 0.2287, + "step": 20628 + }, + { + "epoch": 1.9433362379595394, + "grad_norm": 0.7551138401031494, + "learning_rate": 4.115145207190363e-08, + "loss": 0.1796, + "step": 20629 + }, + { + "epoch": 1.943430442052707, + "grad_norm": 0.7317015528678894, + "learning_rate": 4.10147163438368e-08, + "loss": 0.1991, + "step": 20630 + }, + { + "epoch": 1.943524646145875, + "grad_norm": 0.656338632106781, + "learning_rate": 4.0878207696036785e-08, + "loss": 0.2065, + "step": 20631 + }, + { + "epoch": 1.943618850239043, + "grad_norm": 0.6661907434463501, + "learning_rate": 4.074192613161554e-08, + "loss": 0.1673, + "step": 20632 + }, + { + "epoch": 1.9437130543322108, + "grad_norm": 0.6182055473327637, + "learning_rate": 4.060587165368057e-08, + "loss": 0.1812, + "step": 20633 + }, + { + "epoch": 1.9438072584253785, + "grad_norm": 0.5980770587921143, + "learning_rate": 4.047004426533385e-08, + "loss": 0.1686, + "step": 20634 + }, + { + "epoch": 1.9439014625185465, + "grad_norm": 0.6584701538085938, + "learning_rate": 4.03344439696729e-08, + "loss": 0.197, + "step": 20635 + }, + { + "epoch": 1.9439956666117144, + "grad_norm": 0.5787577033042908, + "learning_rate": 4.019907076978968e-08, + "loss": 0.1736, + "step": 20636 + }, + { + "epoch": 1.9440898707048822, + "grad_norm": 0.6624363660812378, + "learning_rate": 4.0063924668770625e-08, + "loss": 0.1777, + "step": 20637 + }, + { + "epoch": 1.9441840747980499, + "grad_norm": 0.7424578666687012, + "learning_rate": 3.9929005669697704e-08, + "loss": 0.2047, + "step": 20638 + }, + { + "epoch": 1.9442782788912178, + "grad_norm": 0.7298041582107544, + "learning_rate": 3.9794313775647356e-08, + "loss": 0.2012, + "step": 20639 + }, + { + "epoch": 1.9443724829843858, + "grad_norm": 0.6549528241157532, + "learning_rate": 3.965984898969044e-08, + "loss": 0.1871, + "step": 20640 + }, + { + "epoch": 1.9444666870775535, + "grad_norm": 0.7583874464035034, + "learning_rate": 3.95256113148923e-08, + "loss": 0.1971, + "step": 20641 + }, + { + "epoch": 1.9445608911707213, + "grad_norm": 0.6117969155311584, + "learning_rate": 3.939160075431381e-08, + "loss": 0.1929, + "step": 20642 + }, + { + "epoch": 1.9446550952638892, + "grad_norm": 0.6076126098632812, + "learning_rate": 3.925781731101252e-08, + "loss": 0.1872, + "step": 20643 + }, + { + "epoch": 1.9447492993570572, + "grad_norm": 0.6362821459770203, + "learning_rate": 3.912426098803712e-08, + "loss": 0.1621, + "step": 20644 + }, + { + "epoch": 1.944843503450225, + "grad_norm": 0.8712957501411438, + "learning_rate": 3.899093178843294e-08, + "loss": 0.1825, + "step": 20645 + }, + { + "epoch": 1.9449377075433927, + "grad_norm": 0.5873278379440308, + "learning_rate": 3.885782971524088e-08, + "loss": 0.1802, + "step": 20646 + }, + { + "epoch": 1.9450319116365606, + "grad_norm": 0.6395159959793091, + "learning_rate": 3.872495477149518e-08, + "loss": 0.236, + "step": 20647 + }, + { + "epoch": 1.9451261157297286, + "grad_norm": 0.6261362433433533, + "learning_rate": 3.8592306960226755e-08, + "loss": 0.1899, + "step": 20648 + }, + { + "epoch": 1.9452203198228963, + "grad_norm": 0.6366989016532898, + "learning_rate": 3.8459886284458736e-08, + "loss": 0.1882, + "step": 20649 + }, + { + "epoch": 1.945314523916064, + "grad_norm": 0.6599946022033691, + "learning_rate": 3.8327692747210934e-08, + "loss": 0.1902, + "step": 20650 + }, + { + "epoch": 1.945408728009232, + "grad_norm": 0.6210522651672363, + "learning_rate": 3.819572635149871e-08, + "loss": 0.166, + "step": 20651 + }, + { + "epoch": 1.9455029321024, + "grad_norm": 0.6272507905960083, + "learning_rate": 3.8063987100328546e-08, + "loss": 0.1754, + "step": 20652 + }, + { + "epoch": 1.9455971361955677, + "grad_norm": 0.6985970735549927, + "learning_rate": 3.7932474996706935e-08, + "loss": 0.213, + "step": 20653 + }, + { + "epoch": 1.9456913402887355, + "grad_norm": 0.6704853177070618, + "learning_rate": 3.780119004363148e-08, + "loss": 0.2075, + "step": 20654 + }, + { + "epoch": 1.9457855443819034, + "grad_norm": 0.668319046497345, + "learning_rate": 3.7670132244096434e-08, + "loss": 0.1871, + "step": 20655 + }, + { + "epoch": 1.9458797484750714, + "grad_norm": 0.675618052482605, + "learning_rate": 3.753930160108832e-08, + "loss": 0.1683, + "step": 20656 + }, + { + "epoch": 1.9459739525682391, + "grad_norm": 0.6533845663070679, + "learning_rate": 3.7408698117591404e-08, + "loss": 0.1827, + "step": 20657 + }, + { + "epoch": 1.9460681566614069, + "grad_norm": 0.6358233094215393, + "learning_rate": 3.727832179658442e-08, + "loss": 0.2002, + "step": 20658 + }, + { + "epoch": 1.9461623607545748, + "grad_norm": 0.6991185545921326, + "learning_rate": 3.714817264103832e-08, + "loss": 0.1645, + "step": 20659 + }, + { + "epoch": 1.9462565648477428, + "grad_norm": 0.6090744733810425, + "learning_rate": 3.701825065392184e-08, + "loss": 0.1733, + "step": 20660 + }, + { + "epoch": 1.9463507689409105, + "grad_norm": 0.6858947277069092, + "learning_rate": 3.688855583819817e-08, + "loss": 0.2196, + "step": 20661 + }, + { + "epoch": 1.9464449730340783, + "grad_norm": 0.6599785089492798, + "learning_rate": 3.675908819682272e-08, + "loss": 0.1772, + "step": 20662 + }, + { + "epoch": 1.9465391771272462, + "grad_norm": 0.6226392388343811, + "learning_rate": 3.66298477327498e-08, + "loss": 0.1916, + "step": 20663 + }, + { + "epoch": 1.9466333812204142, + "grad_norm": 0.6897684931755066, + "learning_rate": 3.6500834448923714e-08, + "loss": 0.1855, + "step": 20664 + }, + { + "epoch": 1.946727585313582, + "grad_norm": 0.6366028189659119, + "learning_rate": 3.637204834828767e-08, + "loss": 0.1716, + "step": 20665 + }, + { + "epoch": 1.9468217894067497, + "grad_norm": 0.6105790734291077, + "learning_rate": 3.62434894337782e-08, + "loss": 0.1825, + "step": 20666 + }, + { + "epoch": 1.9469159934999176, + "grad_norm": 0.7614659070968628, + "learning_rate": 3.611515770832741e-08, + "loss": 0.1832, + "step": 20667 + }, + { + "epoch": 1.9470101975930856, + "grad_norm": 0.699749767780304, + "learning_rate": 3.598705317485851e-08, + "loss": 0.2074, + "step": 20668 + }, + { + "epoch": 1.9471044016862533, + "grad_norm": 0.5799646973609924, + "learning_rate": 3.5859175836295835e-08, + "loss": 0.1649, + "step": 20669 + }, + { + "epoch": 1.947198605779421, + "grad_norm": 0.6293885707855225, + "learning_rate": 3.5731525695553714e-08, + "loss": 0.1942, + "step": 20670 + }, + { + "epoch": 1.947292809872589, + "grad_norm": 0.5930148363113403, + "learning_rate": 3.5604102755542045e-08, + "loss": 0.1958, + "step": 20671 + }, + { + "epoch": 1.947387013965757, + "grad_norm": 0.6413487792015076, + "learning_rate": 3.54769070191674e-08, + "loss": 0.2138, + "step": 20672 + }, + { + "epoch": 1.9474812180589247, + "grad_norm": 0.6529505252838135, + "learning_rate": 3.534993848932966e-08, + "loss": 0.144, + "step": 20673 + }, + { + "epoch": 1.9475754221520925, + "grad_norm": 0.6703647971153259, + "learning_rate": 3.522319716892431e-08, + "loss": 0.1825, + "step": 20674 + }, + { + "epoch": 1.9476696262452604, + "grad_norm": 1.0214293003082275, + "learning_rate": 3.509668306084124e-08, + "loss": 0.1857, + "step": 20675 + }, + { + "epoch": 1.9477638303384284, + "grad_norm": 0.6616676449775696, + "learning_rate": 3.497039616796372e-08, + "loss": 0.1599, + "step": 20676 + }, + { + "epoch": 1.947858034431596, + "grad_norm": 0.6677464246749878, + "learning_rate": 3.4844336493172756e-08, + "loss": 0.1868, + "step": 20677 + }, + { + "epoch": 1.9479522385247638, + "grad_norm": 0.6382472515106201, + "learning_rate": 3.471850403934274e-08, + "loss": 0.19, + "step": 20678 + }, + { + "epoch": 1.9480464426179318, + "grad_norm": 0.6883176565170288, + "learning_rate": 3.459289880934247e-08, + "loss": 0.182, + "step": 20679 + }, + { + "epoch": 1.9481406467110995, + "grad_norm": 0.6284949779510498, + "learning_rate": 3.4467520806035216e-08, + "loss": 0.1463, + "step": 20680 + }, + { + "epoch": 1.9482348508042673, + "grad_norm": 0.6195784211158752, + "learning_rate": 3.434237003228091e-08, + "loss": 0.1756, + "step": 20681 + }, + { + "epoch": 1.9483290548974352, + "grad_norm": 1.060126781463623, + "learning_rate": 3.4217446490932836e-08, + "loss": 0.1801, + "step": 20682 + }, + { + "epoch": 1.9484232589906032, + "grad_norm": 0.6482633948326111, + "learning_rate": 3.409275018483982e-08, + "loss": 0.1984, + "step": 20683 + }, + { + "epoch": 1.948517463083771, + "grad_norm": 0.6856955885887146, + "learning_rate": 3.396828111684402e-08, + "loss": 0.1791, + "step": 20684 + }, + { + "epoch": 1.9486116671769387, + "grad_norm": 0.6002383828163147, + "learning_rate": 3.38440392897843e-08, + "loss": 0.1688, + "step": 20685 + }, + { + "epoch": 1.9487058712701066, + "grad_norm": 0.614564836025238, + "learning_rate": 3.3720024706492825e-08, + "loss": 0.1931, + "step": 20686 + }, + { + "epoch": 1.9488000753632746, + "grad_norm": 0.68077152967453, + "learning_rate": 3.359623736979844e-08, + "loss": 0.2129, + "step": 20687 + }, + { + "epoch": 1.9488942794564423, + "grad_norm": 0.6167360544204712, + "learning_rate": 3.3472677282523344e-08, + "loss": 0.1796, + "step": 20688 + }, + { + "epoch": 1.94898848354961, + "grad_norm": 0.5975761413574219, + "learning_rate": 3.3349344447485276e-08, + "loss": 0.1715, + "step": 20689 + }, + { + "epoch": 1.949082687642778, + "grad_norm": 0.7060175538063049, + "learning_rate": 3.322623886749532e-08, + "loss": 0.1988, + "step": 20690 + }, + { + "epoch": 1.949176891735946, + "grad_norm": 0.6285139322280884, + "learning_rate": 3.310336054536123e-08, + "loss": 0.1556, + "step": 20691 + }, + { + "epoch": 1.9492710958291137, + "grad_norm": 0.6609798669815063, + "learning_rate": 3.29807094838841e-08, + "loss": 0.2013, + "step": 20692 + }, + { + "epoch": 1.9493652999222815, + "grad_norm": 0.6288373470306396, + "learning_rate": 3.285828568586058e-08, + "loss": 0.1625, + "step": 20693 + }, + { + "epoch": 1.9494595040154494, + "grad_norm": 0.6585246324539185, + "learning_rate": 3.2736089154083996e-08, + "loss": 0.1839, + "step": 20694 + }, + { + "epoch": 1.9495537081086174, + "grad_norm": 0.6595421433448792, + "learning_rate": 3.261411989133878e-08, + "loss": 0.1927, + "step": 20695 + }, + { + "epoch": 1.9496479122017851, + "grad_norm": 0.8907338976860046, + "learning_rate": 3.249237790040605e-08, + "loss": 0.2283, + "step": 20696 + }, + { + "epoch": 1.9497421162949529, + "grad_norm": 0.5782504081726074, + "learning_rate": 3.237086318406246e-08, + "loss": 0.1581, + "step": 20697 + }, + { + "epoch": 1.9498363203881208, + "grad_norm": 0.7235888838768005, + "learning_rate": 3.224957574507914e-08, + "loss": 0.197, + "step": 20698 + }, + { + "epoch": 1.9499305244812888, + "grad_norm": 0.6389434337615967, + "learning_rate": 3.2128515586219436e-08, + "loss": 0.1921, + "step": 20699 + }, + { + "epoch": 1.9500247285744565, + "grad_norm": 0.7259790897369385, + "learning_rate": 3.2007682710245566e-08, + "loss": 0.1944, + "step": 20700 + }, + { + "epoch": 1.9501189326676243, + "grad_norm": 0.7283763885498047, + "learning_rate": 3.1887077119913125e-08, + "loss": 0.1937, + "step": 20701 + }, + { + "epoch": 1.9502131367607922, + "grad_norm": 0.6539479494094849, + "learning_rate": 3.176669881797101e-08, + "loss": 0.2125, + "step": 20702 + }, + { + "epoch": 1.9503073408539602, + "grad_norm": 0.693548321723938, + "learning_rate": 3.16465478071637e-08, + "loss": 0.2007, + "step": 20703 + }, + { + "epoch": 1.950401544947128, + "grad_norm": 0.6632748246192932, + "learning_rate": 3.152662409023233e-08, + "loss": 0.1843, + "step": 20704 + }, + { + "epoch": 1.9504957490402957, + "grad_norm": 0.6446796655654907, + "learning_rate": 3.140692766991027e-08, + "loss": 0.1897, + "step": 20705 + }, + { + "epoch": 1.9505899531334636, + "grad_norm": 0.7004801630973816, + "learning_rate": 3.128745854892645e-08, + "loss": 0.1994, + "step": 20706 + }, + { + "epoch": 1.9506841572266316, + "grad_norm": 0.6569026708602905, + "learning_rate": 3.116821673000647e-08, + "loss": 0.1799, + "step": 20707 + }, + { + "epoch": 1.9507783613197993, + "grad_norm": 0.6596437096595764, + "learning_rate": 3.104920221586705e-08, + "loss": 0.201, + "step": 20708 + }, + { + "epoch": 1.950872565412967, + "grad_norm": 0.6498751640319824, + "learning_rate": 3.093041500922378e-08, + "loss": 0.1701, + "step": 20709 + }, + { + "epoch": 1.950966769506135, + "grad_norm": 1.0099226236343384, + "learning_rate": 3.08118551127845e-08, + "loss": 0.1956, + "step": 20710 + }, + { + "epoch": 1.951060973599303, + "grad_norm": 0.5959270596504211, + "learning_rate": 3.069352252925262e-08, + "loss": 0.1547, + "step": 20711 + }, + { + "epoch": 1.9511551776924707, + "grad_norm": 0.6641378998756409, + "learning_rate": 3.0575417261325954e-08, + "loss": 0.2029, + "step": 20712 + }, + { + "epoch": 1.9512493817856384, + "grad_norm": 0.5687611699104309, + "learning_rate": 3.045753931169792e-08, + "loss": 0.1604, + "step": 20713 + }, + { + "epoch": 1.9513435858788064, + "grad_norm": 0.6910865306854248, + "learning_rate": 3.033988868305637e-08, + "loss": 0.2156, + "step": 20714 + }, + { + "epoch": 1.9514377899719744, + "grad_norm": 0.7515230774879456, + "learning_rate": 3.02224653780836e-08, + "loss": 0.1985, + "step": 20715 + }, + { + "epoch": 1.951531994065142, + "grad_norm": 0.695780336856842, + "learning_rate": 3.010526939945746e-08, + "loss": 0.2037, + "step": 20716 + }, + { + "epoch": 1.9516261981583098, + "grad_norm": 0.9847862720489502, + "learning_rate": 2.998830074984915e-08, + "loss": 0.2012, + "step": 20717 + }, + { + "epoch": 1.9517204022514778, + "grad_norm": 0.6138686537742615, + "learning_rate": 2.9871559431927656e-08, + "loss": 0.1473, + "step": 20718 + }, + { + "epoch": 1.9518146063446458, + "grad_norm": 0.7259178757667542, + "learning_rate": 2.9755045448351948e-08, + "loss": 0.2157, + "step": 20719 + }, + { + "epoch": 1.9519088104378135, + "grad_norm": 0.6341254711151123, + "learning_rate": 2.963875880178213e-08, + "loss": 0.2033, + "step": 20720 + }, + { + "epoch": 1.9520030145309812, + "grad_norm": 0.6881905794143677, + "learning_rate": 2.952269949486719e-08, + "loss": 0.1853, + "step": 20721 + }, + { + "epoch": 1.9520972186241492, + "grad_norm": 0.6890899538993835, + "learning_rate": 2.9406867530255013e-08, + "loss": 0.1895, + "step": 20722 + }, + { + "epoch": 1.9521914227173172, + "grad_norm": 0.6870761513710022, + "learning_rate": 2.9291262910585705e-08, + "loss": 0.1887, + "step": 20723 + }, + { + "epoch": 1.952285626810485, + "grad_norm": 0.6434525847434998, + "learning_rate": 2.9175885638494937e-08, + "loss": 0.2036, + "step": 20724 + }, + { + "epoch": 1.9523798309036526, + "grad_norm": 0.6391791105270386, + "learning_rate": 2.9060735716615052e-08, + "loss": 0.1865, + "step": 20725 + }, + { + "epoch": 1.9524740349968206, + "grad_norm": 0.6555296778678894, + "learning_rate": 2.8945813147570613e-08, + "loss": 0.1823, + "step": 20726 + }, + { + "epoch": 1.9525682390899886, + "grad_norm": 0.6324348449707031, + "learning_rate": 2.8831117933981746e-08, + "loss": 0.2056, + "step": 20727 + }, + { + "epoch": 1.9526624431831563, + "grad_norm": 0.6991801857948303, + "learning_rate": 2.8716650078465247e-08, + "loss": 0.1755, + "step": 20728 + }, + { + "epoch": 1.952756647276324, + "grad_norm": 0.6337845921516418, + "learning_rate": 2.8602409583629032e-08, + "loss": 0.1886, + "step": 20729 + }, + { + "epoch": 1.952850851369492, + "grad_norm": 0.6326943039894104, + "learning_rate": 2.8488396452078792e-08, + "loss": 0.1732, + "step": 20730 + }, + { + "epoch": 1.95294505546266, + "grad_norm": 2.937892436981201, + "learning_rate": 2.837461068641467e-08, + "loss": 0.1972, + "step": 20731 + }, + { + "epoch": 1.9530392595558277, + "grad_norm": 0.6691240668296814, + "learning_rate": 2.8261052289231262e-08, + "loss": 0.2107, + "step": 20732 + }, + { + "epoch": 1.9531334636489954, + "grad_norm": 0.6853199005126953, + "learning_rate": 2.814772126311649e-08, + "loss": 0.174, + "step": 20733 + }, + { + "epoch": 1.9532276677421634, + "grad_norm": 0.769709587097168, + "learning_rate": 2.8034617610656068e-08, + "loss": 0.2015, + "step": 20734 + }, + { + "epoch": 1.9533218718353313, + "grad_norm": 0.7242408990859985, + "learning_rate": 2.7921741334429043e-08, + "loss": 0.1769, + "step": 20735 + }, + { + "epoch": 1.953416075928499, + "grad_norm": 0.5964789390563965, + "learning_rate": 2.780909243700669e-08, + "loss": 0.2053, + "step": 20736 + }, + { + "epoch": 1.9535102800216668, + "grad_norm": 0.597507894039154, + "learning_rate": 2.769667092096029e-08, + "loss": 0.1854, + "step": 20737 + }, + { + "epoch": 1.9536044841148348, + "grad_norm": 0.6764325499534607, + "learning_rate": 2.7584476788852233e-08, + "loss": 0.1898, + "step": 20738 + }, + { + "epoch": 1.9536986882080027, + "grad_norm": 0.6390020251274109, + "learning_rate": 2.7472510043240476e-08, + "loss": 0.1711, + "step": 20739 + }, + { + "epoch": 1.9537928923011705, + "grad_norm": 0.6687731146812439, + "learning_rate": 2.7360770686678528e-08, + "loss": 0.1789, + "step": 20740 + }, + { + "epoch": 1.9538870963943382, + "grad_norm": 0.6528924107551575, + "learning_rate": 2.7249258721714354e-08, + "loss": 0.1725, + "step": 20741 + }, + { + "epoch": 1.9539813004875062, + "grad_norm": 0.6461551785469055, + "learning_rate": 2.7137974150889258e-08, + "loss": 0.1745, + "step": 20742 + }, + { + "epoch": 1.9540755045806741, + "grad_norm": 0.6037712693214417, + "learning_rate": 2.7026916976742313e-08, + "loss": 0.1654, + "step": 20743 + }, + { + "epoch": 1.9541697086738419, + "grad_norm": 0.6182787418365479, + "learning_rate": 2.691608720180594e-08, + "loss": 0.1831, + "step": 20744 + }, + { + "epoch": 1.9542639127670096, + "grad_norm": 0.668842077255249, + "learning_rate": 2.680548482860479e-08, + "loss": 0.2006, + "step": 20745 + }, + { + "epoch": 1.9543581168601776, + "grad_norm": 0.7709841728210449, + "learning_rate": 2.6695109859663502e-08, + "loss": 0.1965, + "step": 20746 + }, + { + "epoch": 1.9544523209533455, + "grad_norm": 0.6828837394714355, + "learning_rate": 2.658496229749785e-08, + "loss": 0.1648, + "step": 20747 + }, + { + "epoch": 1.9545465250465133, + "grad_norm": 0.6607949733734131, + "learning_rate": 2.6475042144619157e-08, + "loss": 0.1985, + "step": 20748 + }, + { + "epoch": 1.954640729139681, + "grad_norm": 0.6904767155647278, + "learning_rate": 2.636534940353319e-08, + "loss": 0.2011, + "step": 20749 + }, + { + "epoch": 1.954734933232849, + "grad_norm": 0.6577705144882202, + "learning_rate": 2.62558840767424e-08, + "loss": 0.1972, + "step": 20750 + }, + { + "epoch": 1.954829137326017, + "grad_norm": 0.6928842067718506, + "learning_rate": 2.6146646166741452e-08, + "loss": 0.1792, + "step": 20751 + }, + { + "epoch": 1.9549233414191847, + "grad_norm": 0.7457311749458313, + "learning_rate": 2.60376356760228e-08, + "loss": 0.182, + "step": 20752 + }, + { + "epoch": 1.9550175455123524, + "grad_norm": 0.6536678671836853, + "learning_rate": 2.5928852607070008e-08, + "loss": 0.2133, + "step": 20753 + }, + { + "epoch": 1.9551117496055204, + "grad_norm": 0.6196486949920654, + "learning_rate": 2.5820296962365543e-08, + "loss": 0.1561, + "step": 20754 + }, + { + "epoch": 1.9552059536986883, + "grad_norm": 0.6116530299186707, + "learning_rate": 2.5711968744382975e-08, + "loss": 0.1589, + "step": 20755 + }, + { + "epoch": 1.955300157791856, + "grad_norm": 0.5360685586929321, + "learning_rate": 2.5603867955593666e-08, + "loss": 0.1532, + "step": 20756 + }, + { + "epoch": 1.9553943618850238, + "grad_norm": 0.6683377027511597, + "learning_rate": 2.5495994598461193e-08, + "loss": 0.1899, + "step": 20757 + }, + { + "epoch": 1.9554885659781918, + "grad_norm": 0.6415463089942932, + "learning_rate": 2.538834867544693e-08, + "loss": 0.1595, + "step": 20758 + }, + { + "epoch": 1.9555827700713597, + "grad_norm": 0.6860426664352417, + "learning_rate": 2.528093018900335e-08, + "loss": 0.1714, + "step": 20759 + }, + { + "epoch": 1.9556769741645275, + "grad_norm": 0.684950590133667, + "learning_rate": 2.517373914158072e-08, + "loss": 0.1818, + "step": 20760 + }, + { + "epoch": 1.9557711782576952, + "grad_norm": 0.7248852252960205, + "learning_rate": 2.5066775535623756e-08, + "loss": 0.1866, + "step": 20761 + }, + { + "epoch": 1.9558653823508632, + "grad_norm": 0.6961067318916321, + "learning_rate": 2.49600393735705e-08, + "loss": 0.2025, + "step": 20762 + }, + { + "epoch": 1.9559595864440311, + "grad_norm": 0.624409556388855, + "learning_rate": 2.4853530657855674e-08, + "loss": 0.1796, + "step": 20763 + }, + { + "epoch": 1.9560537905371989, + "grad_norm": 0.6325263977050781, + "learning_rate": 2.4747249390906224e-08, + "loss": 0.1957, + "step": 20764 + }, + { + "epoch": 1.9561479946303666, + "grad_norm": 0.6427236795425415, + "learning_rate": 2.4641195575147992e-08, + "loss": 0.1819, + "step": 20765 + }, + { + "epoch": 1.9562421987235346, + "grad_norm": 0.6469488143920898, + "learning_rate": 2.453536921299571e-08, + "loss": 0.1907, + "step": 20766 + }, + { + "epoch": 1.9563364028167025, + "grad_norm": 0.6345981955528259, + "learning_rate": 2.442977030686522e-08, + "loss": 0.1878, + "step": 20767 + }, + { + "epoch": 1.9564306069098703, + "grad_norm": 0.6015830039978027, + "learning_rate": 2.43243988591646e-08, + "loss": 0.1661, + "step": 20768 + }, + { + "epoch": 1.956524811003038, + "grad_norm": 0.7506802678108215, + "learning_rate": 2.421925487229415e-08, + "loss": 0.2052, + "step": 20769 + }, + { + "epoch": 1.956619015096206, + "grad_norm": 0.6623401641845703, + "learning_rate": 2.4114338348653067e-08, + "loss": 0.19, + "step": 20770 + }, + { + "epoch": 1.956713219189374, + "grad_norm": 0.5909781455993652, + "learning_rate": 2.4009649290632762e-08, + "loss": 0.1666, + "step": 20771 + }, + { + "epoch": 1.9568074232825416, + "grad_norm": 0.7276968359947205, + "learning_rate": 2.3905187700620223e-08, + "loss": 0.1946, + "step": 20772 + }, + { + "epoch": 1.9569016273757094, + "grad_norm": 0.7656787037849426, + "learning_rate": 2.3800953580997988e-08, + "loss": 0.2245, + "step": 20773 + }, + { + "epoch": 1.9569958314688773, + "grad_norm": 0.6558153629302979, + "learning_rate": 2.369694693414304e-08, + "loss": 0.2119, + "step": 20774 + }, + { + "epoch": 1.9570900355620453, + "grad_norm": 0.7375321984291077, + "learning_rate": 2.3593167762425707e-08, + "loss": 0.201, + "step": 20775 + }, + { + "epoch": 1.957184239655213, + "grad_norm": 0.6460342407226562, + "learning_rate": 2.3489616068212983e-08, + "loss": 0.182, + "step": 20776 + }, + { + "epoch": 1.9572784437483808, + "grad_norm": 0.6745608448982239, + "learning_rate": 2.3386291853866318e-08, + "loss": 0.2549, + "step": 20777 + }, + { + "epoch": 1.9573726478415487, + "grad_norm": 0.6970850229263306, + "learning_rate": 2.3283195121741597e-08, + "loss": 0.2054, + "step": 20778 + }, + { + "epoch": 1.9574668519347167, + "grad_norm": 0.6697490811347961, + "learning_rate": 2.318032587418917e-08, + "loss": 0.1944, + "step": 20779 + }, + { + "epoch": 1.9575610560278844, + "grad_norm": 0.7169244289398193, + "learning_rate": 2.3077684113554933e-08, + "loss": 0.2082, + "step": 20780 + }, + { + "epoch": 1.9576552601210522, + "grad_norm": 0.6706309914588928, + "learning_rate": 2.2975269842178127e-08, + "loss": 0.2004, + "step": 20781 + }, + { + "epoch": 1.9577494642142201, + "grad_norm": 0.6824950575828552, + "learning_rate": 2.2873083062395775e-08, + "loss": 0.1927, + "step": 20782 + }, + { + "epoch": 1.957843668307388, + "grad_norm": 0.625842273235321, + "learning_rate": 2.2771123776537117e-08, + "loss": 0.1686, + "step": 20783 + }, + { + "epoch": 1.9579378724005558, + "grad_norm": 0.6121344566345215, + "learning_rate": 2.266939198692586e-08, + "loss": 0.1839, + "step": 20784 + }, + { + "epoch": 1.9580320764937236, + "grad_norm": 0.6271312832832336, + "learning_rate": 2.2567887695883472e-08, + "loss": 0.1969, + "step": 20785 + }, + { + "epoch": 1.9581262805868915, + "grad_norm": 0.6655808687210083, + "learning_rate": 2.2466610905723662e-08, + "loss": 0.1727, + "step": 20786 + }, + { + "epoch": 1.9582204846800595, + "grad_norm": 0.7358273267745972, + "learning_rate": 2.2365561618755694e-08, + "loss": 0.1868, + "step": 20787 + }, + { + "epoch": 1.9583146887732272, + "grad_norm": 0.6705983281135559, + "learning_rate": 2.2264739837283278e-08, + "loss": 0.1884, + "step": 20788 + }, + { + "epoch": 1.958408892866395, + "grad_norm": 1.703456163406372, + "learning_rate": 2.21641455636068e-08, + "loss": 0.2058, + "step": 20789 + }, + { + "epoch": 1.958503096959563, + "grad_norm": 0.7661097049713135, + "learning_rate": 2.2063778800017755e-08, + "loss": 0.1642, + "step": 20790 + }, + { + "epoch": 1.9585973010527309, + "grad_norm": 0.6239246726036072, + "learning_rate": 2.1963639548805426e-08, + "loss": 0.1883, + "step": 20791 + }, + { + "epoch": 1.9586915051458986, + "grad_norm": 0.7588548064231873, + "learning_rate": 2.1863727812254653e-08, + "loss": 0.2189, + "step": 20792 + }, + { + "epoch": 1.9587857092390664, + "grad_norm": 0.6334348917007446, + "learning_rate": 2.1764043592641394e-08, + "loss": 0.1825, + "step": 20793 + }, + { + "epoch": 1.9588799133322343, + "grad_norm": 0.6758701205253601, + "learning_rate": 2.166458689223938e-08, + "loss": 0.1957, + "step": 20794 + }, + { + "epoch": 1.9589741174254023, + "grad_norm": 0.65650874376297, + "learning_rate": 2.1565357713317914e-08, + "loss": 0.1863, + "step": 20795 + }, + { + "epoch": 1.95906832151857, + "grad_norm": 0.6460846662521362, + "learning_rate": 2.1466356058137406e-08, + "loss": 0.1894, + "step": 20796 + }, + { + "epoch": 1.9591625256117378, + "grad_norm": 0.7715196013450623, + "learning_rate": 2.136758192895605e-08, + "loss": 0.2193, + "step": 20797 + }, + { + "epoch": 1.9592567297049057, + "grad_norm": 0.72203129529953, + "learning_rate": 2.126903532802538e-08, + "loss": 0.2003, + "step": 20798 + }, + { + "epoch": 1.9593509337980737, + "grad_norm": 0.5925700664520264, + "learning_rate": 2.1170716257594704e-08, + "loss": 0.163, + "step": 20799 + }, + { + "epoch": 1.9594451378912414, + "grad_norm": 0.6946857571601868, + "learning_rate": 2.1072624719903346e-08, + "loss": 0.211, + "step": 20800 + }, + { + "epoch": 1.9595393419844092, + "grad_norm": 0.6126140356063843, + "learning_rate": 2.0974760717188402e-08, + "loss": 0.1883, + "step": 20801 + }, + { + "epoch": 1.9596335460775771, + "grad_norm": 0.6870433688163757, + "learning_rate": 2.0877124251682535e-08, + "loss": 0.2168, + "step": 20802 + }, + { + "epoch": 1.959727750170745, + "grad_norm": 0.6843072175979614, + "learning_rate": 2.077971532561063e-08, + "loss": 0.1801, + "step": 20803 + }, + { + "epoch": 1.9598219542639128, + "grad_norm": 0.6433764696121216, + "learning_rate": 2.0682533941195347e-08, + "loss": 0.1753, + "step": 20804 + }, + { + "epoch": 1.9599161583570806, + "grad_norm": 0.7131536602973938, + "learning_rate": 2.058558010065048e-08, + "loss": 0.1776, + "step": 20805 + }, + { + "epoch": 1.9600103624502485, + "grad_norm": 0.60561603307724, + "learning_rate": 2.0488853806188703e-08, + "loss": 0.188, + "step": 20806 + }, + { + "epoch": 1.9601045665434165, + "grad_norm": 0.6701720952987671, + "learning_rate": 2.0392355060013802e-08, + "loss": 0.197, + "step": 20807 + }, + { + "epoch": 1.9601987706365842, + "grad_norm": 0.7826750874519348, + "learning_rate": 2.0296083864327354e-08, + "loss": 0.2015, + "step": 20808 + }, + { + "epoch": 1.960292974729752, + "grad_norm": 0.6032291054725647, + "learning_rate": 2.0200040221324268e-08, + "loss": 0.1881, + "step": 20809 + }, + { + "epoch": 1.96038717882292, + "grad_norm": 0.6667314767837524, + "learning_rate": 2.0104224133193907e-08, + "loss": 0.1953, + "step": 20810 + }, + { + "epoch": 1.9604813829160879, + "grad_norm": 0.6437525153160095, + "learning_rate": 2.0008635602122294e-08, + "loss": 0.1878, + "step": 20811 + }, + { + "epoch": 1.9605755870092556, + "grad_norm": 0.6329096555709839, + "learning_rate": 1.9913274630287694e-08, + "loss": 0.1737, + "step": 20812 + }, + { + "epoch": 1.9606697911024233, + "grad_norm": 0.7089539766311646, + "learning_rate": 1.9818141219865028e-08, + "loss": 0.1867, + "step": 20813 + }, + { + "epoch": 1.9607639951955913, + "grad_norm": 0.6241235136985779, + "learning_rate": 1.9723235373023674e-08, + "loss": 0.1948, + "step": 20814 + }, + { + "epoch": 1.9608581992887593, + "grad_norm": 0.6769440770149231, + "learning_rate": 1.962855709192635e-08, + "loss": 0.2273, + "step": 20815 + }, + { + "epoch": 1.9609524033819268, + "grad_norm": 0.7112860083580017, + "learning_rate": 1.9534106378733543e-08, + "loss": 0.185, + "step": 20816 + }, + { + "epoch": 1.9610466074750947, + "grad_norm": 0.6273424625396729, + "learning_rate": 1.943988323559798e-08, + "loss": 0.219, + "step": 20817 + }, + { + "epoch": 1.9611408115682627, + "grad_norm": 0.6284955739974976, + "learning_rate": 1.934588766466905e-08, + "loss": 0.1928, + "step": 20818 + }, + { + "epoch": 1.9612350156614304, + "grad_norm": 0.6763734817504883, + "learning_rate": 1.9252119668088377e-08, + "loss": 0.2021, + "step": 20819 + }, + { + "epoch": 1.9613292197545982, + "grad_norm": 0.6680343151092529, + "learning_rate": 1.9158579247995356e-08, + "loss": 0.1989, + "step": 20820 + }, + { + "epoch": 1.9614234238477661, + "grad_norm": 0.7202074527740479, + "learning_rate": 1.9065266406521622e-08, + "loss": 0.1871, + "step": 20821 + }, + { + "epoch": 1.961517627940934, + "grad_norm": 0.7383487224578857, + "learning_rate": 1.8972181145796576e-08, + "loss": 0.2086, + "step": 20822 + }, + { + "epoch": 1.9616118320341018, + "grad_norm": 0.6500657796859741, + "learning_rate": 1.8879323467940748e-08, + "loss": 0.1911, + "step": 20823 + }, + { + "epoch": 1.9617060361272696, + "grad_norm": 0.6608936190605164, + "learning_rate": 1.878669337507355e-08, + "loss": 0.2075, + "step": 20824 + }, + { + "epoch": 1.9618002402204375, + "grad_norm": 0.6508350372314453, + "learning_rate": 1.869429086930552e-08, + "loss": 0.1741, + "step": 20825 + }, + { + "epoch": 1.9618944443136055, + "grad_norm": 0.8364340662956238, + "learning_rate": 1.860211595274386e-08, + "loss": 0.1714, + "step": 20826 + }, + { + "epoch": 1.9619886484067732, + "grad_norm": 0.6984458565711975, + "learning_rate": 1.8510168627490222e-08, + "loss": 0.1909, + "step": 20827 + }, + { + "epoch": 1.962082852499941, + "grad_norm": 0.7252766489982605, + "learning_rate": 1.8418448895641816e-08, + "loss": 0.2306, + "step": 20828 + }, + { + "epoch": 1.962177056593109, + "grad_norm": 0.7223662734031677, + "learning_rate": 1.8326956759290305e-08, + "loss": 0.1873, + "step": 20829 + }, + { + "epoch": 1.9622712606862769, + "grad_norm": 0.6001616716384888, + "learning_rate": 1.8235692220520683e-08, + "loss": 0.1931, + "step": 20830 + }, + { + "epoch": 1.9623654647794446, + "grad_norm": 0.704746425151825, + "learning_rate": 1.8144655281413515e-08, + "loss": 0.2057, + "step": 20831 + }, + { + "epoch": 1.9624596688726124, + "grad_norm": 0.6368359923362732, + "learning_rate": 1.8053845944046024e-08, + "loss": 0.1926, + "step": 20832 + }, + { + "epoch": 1.9625538729657803, + "grad_norm": 0.8043307065963745, + "learning_rate": 1.7963264210488774e-08, + "loss": 0.1829, + "step": 20833 + }, + { + "epoch": 1.9626480770589483, + "grad_norm": 0.6016947031021118, + "learning_rate": 1.787291008280567e-08, + "loss": 0.1647, + "step": 20834 + }, + { + "epoch": 1.962742281152116, + "grad_norm": 0.677052915096283, + "learning_rate": 1.7782783563058403e-08, + "loss": 0.1874, + "step": 20835 + }, + { + "epoch": 1.9628364852452838, + "grad_norm": 0.7198638319969177, + "learning_rate": 1.7692884653301988e-08, + "loss": 0.2044, + "step": 20836 + }, + { + "epoch": 1.9629306893384517, + "grad_norm": 0.7062585353851318, + "learning_rate": 1.7603213355584793e-08, + "loss": 0.1919, + "step": 20837 + }, + { + "epoch": 1.9630248934316197, + "grad_norm": 0.5616207122802734, + "learning_rate": 1.751376967195295e-08, + "loss": 0.1785, + "step": 20838 + }, + { + "epoch": 1.9631190975247874, + "grad_norm": 0.6764761209487915, + "learning_rate": 1.7424553604444837e-08, + "loss": 0.1881, + "step": 20839 + }, + { + "epoch": 1.9632133016179552, + "grad_norm": 0.6514866352081299, + "learning_rate": 1.7335565155095492e-08, + "loss": 0.2154, + "step": 20840 + }, + { + "epoch": 1.9633075057111231, + "grad_norm": 0.6551622748374939, + "learning_rate": 1.7246804325934396e-08, + "loss": 0.1762, + "step": 20841 + }, + { + "epoch": 1.963401709804291, + "grad_norm": 0.7031304240226746, + "learning_rate": 1.715827111898327e-08, + "loss": 0.1962, + "step": 20842 + }, + { + "epoch": 1.9634959138974588, + "grad_norm": 0.6236386299133301, + "learning_rate": 1.7069965536263834e-08, + "loss": 0.1811, + "step": 20843 + }, + { + "epoch": 1.9635901179906265, + "grad_norm": 0.5879573822021484, + "learning_rate": 1.6981887579787802e-08, + "loss": 0.1586, + "step": 20844 + }, + { + "epoch": 1.9636843220837945, + "grad_norm": 0.6167858839035034, + "learning_rate": 1.6894037251563578e-08, + "loss": 0.1576, + "step": 20845 + }, + { + "epoch": 1.9637785261769625, + "grad_norm": 0.692031979560852, + "learning_rate": 1.6806414553593998e-08, + "loss": 0.1806, + "step": 20846 + }, + { + "epoch": 1.9638727302701302, + "grad_norm": 0.6623846292495728, + "learning_rate": 1.6719019487877463e-08, + "loss": 0.1965, + "step": 20847 + }, + { + "epoch": 1.963966934363298, + "grad_norm": 0.8027023673057556, + "learning_rate": 1.6631852056407936e-08, + "loss": 0.1857, + "step": 20848 + }, + { + "epoch": 1.964061138456466, + "grad_norm": 0.6101537942886353, + "learning_rate": 1.6544912261170498e-08, + "loss": 0.2053, + "step": 20849 + }, + { + "epoch": 1.9641553425496339, + "grad_norm": 0.6344097852706909, + "learning_rate": 1.6458200104149115e-08, + "loss": 0.2072, + "step": 20850 + }, + { + "epoch": 1.9642495466428016, + "grad_norm": 0.6655710935592651, + "learning_rate": 1.637171558732109e-08, + "loss": 0.1984, + "step": 20851 + }, + { + "epoch": 1.9643437507359693, + "grad_norm": 0.6916816234588623, + "learning_rate": 1.6285458712657077e-08, + "loss": 0.1997, + "step": 20852 + }, + { + "epoch": 1.9644379548291373, + "grad_norm": 0.7174298167228699, + "learning_rate": 1.6199429482125495e-08, + "loss": 0.1941, + "step": 20853 + }, + { + "epoch": 1.9645321589223053, + "grad_norm": 0.5899234414100647, + "learning_rate": 1.6113627897687e-08, + "loss": 0.2045, + "step": 20854 + }, + { + "epoch": 1.964626363015473, + "grad_norm": 0.6875238418579102, + "learning_rate": 1.6028053961297808e-08, + "loss": 0.1827, + "step": 20855 + }, + { + "epoch": 1.9647205671086407, + "grad_norm": 0.7000439167022705, + "learning_rate": 1.5942707674909687e-08, + "loss": 0.2122, + "step": 20856 + }, + { + "epoch": 1.9648147712018087, + "grad_norm": 0.6685363054275513, + "learning_rate": 1.585758904046886e-08, + "loss": 0.2103, + "step": 20857 + }, + { + "epoch": 1.9649089752949767, + "grad_norm": 0.7183793783187866, + "learning_rate": 1.577269805991488e-08, + "loss": 0.2031, + "step": 20858 + }, + { + "epoch": 1.9650031793881444, + "grad_norm": 0.6927871108055115, + "learning_rate": 1.5688034735185098e-08, + "loss": 0.1904, + "step": 20859 + }, + { + "epoch": 1.9650973834813121, + "grad_norm": 0.733188807964325, + "learning_rate": 1.5603599068209075e-08, + "loss": 0.2567, + "step": 20860 + }, + { + "epoch": 1.96519158757448, + "grad_norm": 0.6338531374931335, + "learning_rate": 1.5519391060911936e-08, + "loss": 0.1739, + "step": 20861 + }, + { + "epoch": 1.965285791667648, + "grad_norm": 0.6898400187492371, + "learning_rate": 1.543541071521326e-08, + "loss": 0.2261, + "step": 20862 + }, + { + "epoch": 1.9653799957608158, + "grad_norm": 0.7352140545845032, + "learning_rate": 1.5351658033029293e-08, + "loss": 0.184, + "step": 20863 + }, + { + "epoch": 1.9654741998539835, + "grad_norm": 0.703982949256897, + "learning_rate": 1.5268133016269614e-08, + "loss": 0.2074, + "step": 20864 + }, + { + "epoch": 1.9655684039471515, + "grad_norm": 0.6581400036811829, + "learning_rate": 1.518483566683826e-08, + "loss": 0.2005, + "step": 20865 + }, + { + "epoch": 1.9656626080403194, + "grad_norm": 0.7103128433227539, + "learning_rate": 1.510176598663371e-08, + "loss": 0.2084, + "step": 20866 + }, + { + "epoch": 1.9657568121334872, + "grad_norm": 0.6426986455917358, + "learning_rate": 1.5018923977551115e-08, + "loss": 0.1778, + "step": 20867 + }, + { + "epoch": 1.965851016226655, + "grad_norm": 0.6864650845527649, + "learning_rate": 1.4936309641478964e-08, + "loss": 0.1784, + "step": 20868 + }, + { + "epoch": 1.9659452203198229, + "grad_norm": 0.5860040187835693, + "learning_rate": 1.4853922980301305e-08, + "loss": 0.1818, + "step": 20869 + }, + { + "epoch": 1.9660394244129908, + "grad_norm": 0.6675563454627991, + "learning_rate": 1.4771763995896637e-08, + "loss": 0.1914, + "step": 20870 + }, + { + "epoch": 1.9661336285061586, + "grad_norm": 0.6752128601074219, + "learning_rate": 1.4689832690137907e-08, + "loss": 0.1751, + "step": 20871 + }, + { + "epoch": 1.9662278325993263, + "grad_norm": 0.680773138999939, + "learning_rate": 1.4608129064893617e-08, + "loss": 0.1913, + "step": 20872 + }, + { + "epoch": 1.9663220366924943, + "grad_norm": 0.7888768315315247, + "learning_rate": 1.4526653122026723e-08, + "loss": 0.1837, + "step": 20873 + }, + { + "epoch": 1.9664162407856622, + "grad_norm": 0.6779602766036987, + "learning_rate": 1.444540486339574e-08, + "loss": 0.1974, + "step": 20874 + }, + { + "epoch": 1.96651044487883, + "grad_norm": 0.6856969594955444, + "learning_rate": 1.4364384290851408e-08, + "loss": 0.2144, + "step": 20875 + }, + { + "epoch": 1.9666046489719977, + "grad_norm": 0.8639304041862488, + "learning_rate": 1.4283591406242247e-08, + "loss": 0.2132, + "step": 20876 + }, + { + "epoch": 1.9666988530651657, + "grad_norm": 0.6300287842750549, + "learning_rate": 1.4203026211411231e-08, + "loss": 0.1926, + "step": 20877 + }, + { + "epoch": 1.9667930571583336, + "grad_norm": 0.65023273229599, + "learning_rate": 1.4122688708193555e-08, + "loss": 0.1996, + "step": 20878 + }, + { + "epoch": 1.9668872612515014, + "grad_norm": 0.6050977110862732, + "learning_rate": 1.40425788984222e-08, + "loss": 0.1718, + "step": 20879 + }, + { + "epoch": 1.9669814653446691, + "grad_norm": 0.7740190625190735, + "learning_rate": 1.396269678392459e-08, + "loss": 0.2018, + "step": 20880 + }, + { + "epoch": 1.967075669437837, + "grad_norm": 0.6520513892173767, + "learning_rate": 1.3883042366520382e-08, + "loss": 0.1751, + "step": 20881 + }, + { + "epoch": 1.967169873531005, + "grad_norm": 0.6170967221260071, + "learning_rate": 1.3803615648027014e-08, + "loss": 0.1862, + "step": 20882 + }, + { + "epoch": 1.9672640776241728, + "grad_norm": 0.6485597491264343, + "learning_rate": 1.3724416630255256e-08, + "loss": 0.1849, + "step": 20883 + }, + { + "epoch": 1.9673582817173405, + "grad_norm": 0.6519567370414734, + "learning_rate": 1.3645445315010331e-08, + "loss": 0.2085, + "step": 20884 + }, + { + "epoch": 1.9674524858105085, + "grad_norm": 0.5942420363426208, + "learning_rate": 1.3566701704094131e-08, + "loss": 0.1782, + "step": 20885 + }, + { + "epoch": 1.9675466899036764, + "grad_norm": 0.7131619453430176, + "learning_rate": 1.3488185799300779e-08, + "loss": 0.2051, + "step": 20886 + }, + { + "epoch": 1.9676408939968442, + "grad_norm": 0.6562275886535645, + "learning_rate": 1.3409897602421063e-08, + "loss": 0.2094, + "step": 20887 + }, + { + "epoch": 1.967735098090012, + "grad_norm": 0.6430373787879944, + "learning_rate": 1.3331837115241331e-08, + "loss": 0.1886, + "step": 20888 + }, + { + "epoch": 1.9678293021831799, + "grad_norm": 0.7399052977561951, + "learning_rate": 1.325400433953905e-08, + "loss": 0.2365, + "step": 20889 + }, + { + "epoch": 1.9679235062763478, + "grad_norm": 0.7590723633766174, + "learning_rate": 1.317639927709058e-08, + "loss": 0.189, + "step": 20890 + }, + { + "epoch": 1.9680177103695156, + "grad_norm": 0.689531683921814, + "learning_rate": 1.3099021929665612e-08, + "loss": 0.1898, + "step": 20891 + }, + { + "epoch": 1.9681119144626833, + "grad_norm": 0.664011538028717, + "learning_rate": 1.3021872299028292e-08, + "loss": 0.196, + "step": 20892 + }, + { + "epoch": 1.9682061185558513, + "grad_norm": 0.6723976731300354, + "learning_rate": 1.2944950386937217e-08, + "loss": 0.1864, + "step": 20893 + }, + { + "epoch": 1.9683003226490192, + "grad_norm": 0.7510846853256226, + "learning_rate": 1.2868256195145423e-08, + "loss": 0.2106, + "step": 20894 + }, + { + "epoch": 1.968394526742187, + "grad_norm": 0.7858664989471436, + "learning_rate": 1.2791789725404846e-08, + "loss": 0.208, + "step": 20895 + }, + { + "epoch": 1.9684887308353547, + "grad_norm": 0.6809086799621582, + "learning_rate": 1.2715550979455205e-08, + "loss": 0.1966, + "step": 20896 + }, + { + "epoch": 1.9685829349285227, + "grad_norm": 0.6665279865264893, + "learning_rate": 1.2639539959037328e-08, + "loss": 0.2033, + "step": 20897 + }, + { + "epoch": 1.9686771390216906, + "grad_norm": 0.6726946830749512, + "learning_rate": 1.2563756665883165e-08, + "loss": 0.1964, + "step": 20898 + }, + { + "epoch": 1.9687713431148584, + "grad_norm": 0.8123178482055664, + "learning_rate": 1.2488201101722442e-08, + "loss": 0.2193, + "step": 20899 + }, + { + "epoch": 1.968865547208026, + "grad_norm": 0.610478937625885, + "learning_rate": 1.2412873268274895e-08, + "loss": 0.1741, + "step": 20900 + }, + { + "epoch": 1.968959751301194, + "grad_norm": 0.6494256258010864, + "learning_rate": 1.2337773167261369e-08, + "loss": 0.1874, + "step": 20901 + }, + { + "epoch": 1.969053955394362, + "grad_norm": 0.7286932468414307, + "learning_rate": 1.2262900800392719e-08, + "loss": 0.196, + "step": 20902 + }, + { + "epoch": 1.9691481594875297, + "grad_norm": 0.6259111166000366, + "learning_rate": 1.2188256169375357e-08, + "loss": 0.204, + "step": 20903 + }, + { + "epoch": 1.9692423635806975, + "grad_norm": 0.6284198760986328, + "learning_rate": 1.2113839275913475e-08, + "loss": 0.1742, + "step": 20904 + }, + { + "epoch": 1.9693365676738654, + "grad_norm": 0.6095848679542542, + "learning_rate": 1.2039650121702384e-08, + "loss": 0.1963, + "step": 20905 + }, + { + "epoch": 1.9694307717670334, + "grad_norm": 0.6099793314933777, + "learning_rate": 1.1965688708432954e-08, + "loss": 0.1802, + "step": 20906 + }, + { + "epoch": 1.9695249758602011, + "grad_norm": 0.6415167450904846, + "learning_rate": 1.1891955037793835e-08, + "loss": 0.1839, + "step": 20907 + }, + { + "epoch": 1.9696191799533689, + "grad_norm": 0.6364742517471313, + "learning_rate": 1.181844911146368e-08, + "loss": 0.1616, + "step": 20908 + }, + { + "epoch": 1.9697133840465368, + "grad_norm": 0.6914396286010742, + "learning_rate": 1.1745170931121152e-08, + "loss": 0.2044, + "step": 20909 + }, + { + "epoch": 1.9698075881397048, + "grad_norm": 0.5957686305046082, + "learning_rate": 1.1672120498434914e-08, + "loss": 0.1871, + "step": 20910 + }, + { + "epoch": 1.9699017922328725, + "grad_norm": 0.6073902249336243, + "learning_rate": 1.1599297815072519e-08, + "loss": 0.1727, + "step": 20911 + }, + { + "epoch": 1.9699959963260403, + "grad_norm": 0.7050167918205261, + "learning_rate": 1.1526702882692641e-08, + "loss": 0.1733, + "step": 20912 + }, + { + "epoch": 1.9700902004192082, + "grad_norm": 0.6207622289657593, + "learning_rate": 1.1454335702951735e-08, + "loss": 0.1791, + "step": 20913 + }, + { + "epoch": 1.9701844045123762, + "grad_norm": 0.6318159103393555, + "learning_rate": 1.138219627749959e-08, + "loss": 0.1814, + "step": 20914 + }, + { + "epoch": 1.970278608605544, + "grad_norm": 0.6200817823410034, + "learning_rate": 1.1310284607980448e-08, + "loss": 0.1966, + "step": 20915 + }, + { + "epoch": 1.9703728126987117, + "grad_norm": 0.7505404353141785, + "learning_rate": 1.1238600696035218e-08, + "loss": 0.2249, + "step": 20916 + }, + { + "epoch": 1.9704670167918796, + "grad_norm": 0.6427599191665649, + "learning_rate": 1.1167144543298147e-08, + "loss": 0.1887, + "step": 20917 + }, + { + "epoch": 1.9705612208850476, + "grad_norm": 0.6350158452987671, + "learning_rate": 1.1095916151397935e-08, + "loss": 0.2103, + "step": 20918 + }, + { + "epoch": 1.9706554249782153, + "grad_norm": 0.6772736310958862, + "learning_rate": 1.1024915521958835e-08, + "loss": 0.2059, + "step": 20919 + }, + { + "epoch": 1.970749629071383, + "grad_norm": 0.6636703014373779, + "learning_rate": 1.0954142656600663e-08, + "loss": 0.2083, + "step": 20920 + }, + { + "epoch": 1.970843833164551, + "grad_norm": 0.6341129541397095, + "learning_rate": 1.0883597556935466e-08, + "loss": 0.207, + "step": 20921 + }, + { + "epoch": 1.970938037257719, + "grad_norm": 0.6010454297065735, + "learning_rate": 1.0813280224573063e-08, + "loss": 0.1776, + "step": 20922 + }, + { + "epoch": 1.9710322413508867, + "grad_norm": 0.6927230954170227, + "learning_rate": 1.074319066111551e-08, + "loss": 0.2501, + "step": 20923 + }, + { + "epoch": 1.9711264454440545, + "grad_norm": 0.6573917865753174, + "learning_rate": 1.0673328868162635e-08, + "loss": 0.1835, + "step": 20924 + }, + { + "epoch": 1.9712206495372224, + "grad_norm": 0.8325679898262024, + "learning_rate": 1.0603694847307611e-08, + "loss": 0.1972, + "step": 20925 + }, + { + "epoch": 1.9713148536303904, + "grad_norm": 0.6681457161903381, + "learning_rate": 1.0534288600135834e-08, + "loss": 0.2064, + "step": 20926 + }, + { + "epoch": 1.9714090577235581, + "grad_norm": 0.6150078773498535, + "learning_rate": 1.0465110128231593e-08, + "loss": 0.1748, + "step": 20927 + }, + { + "epoch": 1.9715032618167259, + "grad_norm": 0.6080055832862854, + "learning_rate": 1.0396159433172514e-08, + "loss": 0.1718, + "step": 20928 + }, + { + "epoch": 1.9715974659098938, + "grad_norm": 0.6420427560806274, + "learning_rate": 1.0327436516529566e-08, + "loss": 0.1793, + "step": 20929 + }, + { + "epoch": 1.9716916700030618, + "grad_norm": 0.6000234484672546, + "learning_rate": 1.025894137987149e-08, + "loss": 0.192, + "step": 20930 + }, + { + "epoch": 1.9717858740962295, + "grad_norm": 0.6501505970954895, + "learning_rate": 1.0190674024758151e-08, + "loss": 0.1859, + "step": 20931 + }, + { + "epoch": 1.9718800781893973, + "grad_norm": 0.6825153231620789, + "learning_rate": 1.0122634452747193e-08, + "loss": 0.2155, + "step": 20932 + }, + { + "epoch": 1.9719742822825652, + "grad_norm": 0.740336000919342, + "learning_rate": 1.0054822665388486e-08, + "loss": 0.1839, + "step": 20933 + }, + { + "epoch": 1.9720684863757332, + "grad_norm": 0.6732248663902283, + "learning_rate": 9.987238664231902e-09, + "loss": 0.1905, + "step": 20934 + }, + { + "epoch": 1.972162690468901, + "grad_norm": 0.6618796586990356, + "learning_rate": 9.919882450815099e-09, + "loss": 0.2039, + "step": 20935 + }, + { + "epoch": 1.9722568945620687, + "grad_norm": 0.6390891671180725, + "learning_rate": 9.852754026673517e-09, + "loss": 0.1539, + "step": 20936 + }, + { + "epoch": 1.9723510986552366, + "grad_norm": 0.681266188621521, + "learning_rate": 9.785853393341483e-09, + "loss": 0.2135, + "step": 20937 + }, + { + "epoch": 1.9724453027484046, + "grad_norm": 0.6857741475105286, + "learning_rate": 9.719180552341113e-09, + "loss": 0.1801, + "step": 20938 + }, + { + "epoch": 1.9725395068415723, + "grad_norm": 0.7095636129379272, + "learning_rate": 9.652735505193411e-09, + "loss": 0.1985, + "step": 20939 + }, + { + "epoch": 1.97263371093474, + "grad_norm": 0.7109867334365845, + "learning_rate": 9.586518253413835e-09, + "loss": 0.2128, + "step": 20940 + }, + { + "epoch": 1.972727915027908, + "grad_norm": 0.6484083533287048, + "learning_rate": 9.520528798512286e-09, + "loss": 0.1798, + "step": 20941 + }, + { + "epoch": 1.972822119121076, + "grad_norm": 0.7068780660629272, + "learning_rate": 9.454767141993115e-09, + "loss": 0.2016, + "step": 20942 + }, + { + "epoch": 1.9729163232142437, + "grad_norm": 0.7174715399742126, + "learning_rate": 9.389233285356237e-09, + "loss": 0.2383, + "step": 20943 + }, + { + "epoch": 1.9730105273074114, + "grad_norm": 0.6550261378288269, + "learning_rate": 9.323927230096008e-09, + "loss": 0.1767, + "step": 20944 + }, + { + "epoch": 1.9731047314005794, + "grad_norm": 0.727986752986908, + "learning_rate": 9.25884897770013e-09, + "loss": 0.1937, + "step": 20945 + }, + { + "epoch": 1.9731989354937474, + "grad_norm": 0.6893591284751892, + "learning_rate": 9.19399852965408e-09, + "loss": 0.1766, + "step": 20946 + }, + { + "epoch": 1.973293139586915, + "grad_norm": 0.6319138407707214, + "learning_rate": 9.129375887435566e-09, + "loss": 0.1713, + "step": 20947 + }, + { + "epoch": 1.9733873436800828, + "grad_norm": 0.7045475244522095, + "learning_rate": 9.064981052518962e-09, + "loss": 0.2057, + "step": 20948 + }, + { + "epoch": 1.9734815477732508, + "grad_norm": 0.6276645064353943, + "learning_rate": 9.000814026371985e-09, + "loss": 0.1659, + "step": 20949 + }, + { + "epoch": 1.9735757518664188, + "grad_norm": 0.6116898655891418, + "learning_rate": 8.936874810456798e-09, + "loss": 0.1723, + "step": 20950 + }, + { + "epoch": 1.9736699559595865, + "grad_norm": 0.6246646046638489, + "learning_rate": 8.873163406233342e-09, + "loss": 0.1769, + "step": 20951 + }, + { + "epoch": 1.9737641600527542, + "grad_norm": 0.6728521585464478, + "learning_rate": 8.809679815153793e-09, + "loss": 0.1988, + "step": 20952 + }, + { + "epoch": 1.9738583641459222, + "grad_norm": 0.6070953011512756, + "learning_rate": 8.746424038663658e-09, + "loss": 0.1755, + "step": 20953 + }, + { + "epoch": 1.97395256823909, + "grad_norm": 0.6520624756813049, + "learning_rate": 8.68339607820845e-09, + "loss": 0.1921, + "step": 20954 + }, + { + "epoch": 1.9740467723322577, + "grad_norm": 0.5930506587028503, + "learning_rate": 8.620595935222575e-09, + "loss": 0.2102, + "step": 20955 + }, + { + "epoch": 1.9741409764254256, + "grad_norm": 0.632493257522583, + "learning_rate": 8.558023611139332e-09, + "loss": 0.1945, + "step": 20956 + }, + { + "epoch": 1.9742351805185936, + "grad_norm": 0.6277579665184021, + "learning_rate": 8.495679107385357e-09, + "loss": 0.195, + "step": 20957 + }, + { + "epoch": 1.9743293846117613, + "grad_norm": 0.614940345287323, + "learning_rate": 8.433562425382847e-09, + "loss": 0.1631, + "step": 20958 + }, + { + "epoch": 1.974423588704929, + "grad_norm": 0.6482104659080505, + "learning_rate": 8.371673566546223e-09, + "loss": 0.2037, + "step": 20959 + }, + { + "epoch": 1.974517792798097, + "grad_norm": 0.6362460255622864, + "learning_rate": 8.310012532287692e-09, + "loss": 0.1875, + "step": 20960 + }, + { + "epoch": 1.974611996891265, + "grad_norm": 0.6358654499053955, + "learning_rate": 8.248579324013905e-09, + "loss": 0.1679, + "step": 20961 + }, + { + "epoch": 1.9747062009844327, + "grad_norm": 0.6750145554542542, + "learning_rate": 8.187373943124854e-09, + "loss": 0.1883, + "step": 20962 + }, + { + "epoch": 1.9748004050776005, + "grad_norm": 0.7022234201431274, + "learning_rate": 8.126396391017199e-09, + "loss": 0.199, + "step": 20963 + }, + { + "epoch": 1.9748946091707684, + "grad_norm": 0.6213939189910889, + "learning_rate": 8.065646669078719e-09, + "loss": 0.1867, + "step": 20964 + }, + { + "epoch": 1.9749888132639364, + "grad_norm": 0.6233702898025513, + "learning_rate": 8.005124778698304e-09, + "loss": 0.187, + "step": 20965 + }, + { + "epoch": 1.9750830173571041, + "grad_norm": 0.7024995684623718, + "learning_rate": 7.944830721252627e-09, + "loss": 0.192, + "step": 20966 + }, + { + "epoch": 1.9751772214502719, + "grad_norm": 0.569704532623291, + "learning_rate": 7.88476449811726e-09, + "loss": 0.1759, + "step": 20967 + }, + { + "epoch": 1.9752714255434398, + "grad_norm": 0.8393198847770691, + "learning_rate": 7.824926110663323e-09, + "loss": 0.1922, + "step": 20968 + }, + { + "epoch": 1.9753656296366078, + "grad_norm": 0.8141598105430603, + "learning_rate": 7.765315560254172e-09, + "loss": 0.1702, + "step": 20969 + }, + { + "epoch": 1.9754598337297755, + "grad_norm": 0.6242630481719971, + "learning_rate": 7.70593284824872e-09, + "loss": 0.1508, + "step": 20970 + }, + { + "epoch": 1.9755540378229433, + "grad_norm": 0.6809794902801514, + "learning_rate": 7.64677797600255e-09, + "loss": 0.2107, + "step": 20971 + }, + { + "epoch": 1.9756482419161112, + "grad_norm": 0.675658106803894, + "learning_rate": 7.587850944862363e-09, + "loss": 0.2127, + "step": 20972 + }, + { + "epoch": 1.9757424460092792, + "grad_norm": 0.6515462398529053, + "learning_rate": 7.529151756172637e-09, + "loss": 0.1931, + "step": 20973 + }, + { + "epoch": 1.975836650102447, + "grad_norm": 0.6538036465644836, + "learning_rate": 7.470680411272301e-09, + "loss": 0.1893, + "step": 20974 + }, + { + "epoch": 1.9759308541956146, + "grad_norm": 0.6760832071304321, + "learning_rate": 7.412436911493626e-09, + "loss": 0.204, + "step": 20975 + }, + { + "epoch": 1.9760250582887826, + "grad_norm": 0.6080470085144043, + "learning_rate": 7.354421258165545e-09, + "loss": 0.1816, + "step": 20976 + }, + { + "epoch": 1.9761192623819506, + "grad_norm": 0.6867570281028748, + "learning_rate": 7.296633452611446e-09, + "loss": 0.19, + "step": 20977 + }, + { + "epoch": 1.9762134664751183, + "grad_norm": 0.6577616333961487, + "learning_rate": 7.239073496146942e-09, + "loss": 0.1888, + "step": 20978 + }, + { + "epoch": 1.976307670568286, + "grad_norm": 0.7067936658859253, + "learning_rate": 7.181741390086539e-09, + "loss": 0.2001, + "step": 20979 + }, + { + "epoch": 1.976401874661454, + "grad_norm": 0.7678654789924622, + "learning_rate": 7.124637135735857e-09, + "loss": 0.1906, + "step": 20980 + }, + { + "epoch": 1.976496078754622, + "grad_norm": 0.7363203167915344, + "learning_rate": 7.067760734398299e-09, + "loss": 0.1853, + "step": 20981 + }, + { + "epoch": 1.9765902828477897, + "grad_norm": 0.6460639238357544, + "learning_rate": 7.0111121873694956e-09, + "loss": 0.1728, + "step": 20982 + }, + { + "epoch": 1.9766844869409574, + "grad_norm": 0.6732394695281982, + "learning_rate": 6.954691495941745e-09, + "loss": 0.19, + "step": 20983 + }, + { + "epoch": 1.9767786910341254, + "grad_norm": 0.7039426565170288, + "learning_rate": 6.898498661401798e-09, + "loss": 0.2023, + "step": 20984 + }, + { + "epoch": 1.9768728951272934, + "grad_norm": 0.7103269100189209, + "learning_rate": 6.84253368503085e-09, + "loss": 0.1967, + "step": 20985 + }, + { + "epoch": 1.976967099220461, + "grad_norm": 0.6651984453201294, + "learning_rate": 6.786796568105658e-09, + "loss": 0.1835, + "step": 20986 + }, + { + "epoch": 1.9770613033136288, + "grad_norm": 0.6574163436889648, + "learning_rate": 6.731287311895207e-09, + "loss": 0.2106, + "step": 20987 + }, + { + "epoch": 1.9771555074067968, + "grad_norm": 0.6663548946380615, + "learning_rate": 6.676005917666262e-09, + "loss": 0.1886, + "step": 20988 + }, + { + "epoch": 1.9772497114999648, + "grad_norm": 0.5763439536094666, + "learning_rate": 6.620952386680035e-09, + "loss": 0.1604, + "step": 20989 + }, + { + "epoch": 1.9773439155931325, + "grad_norm": 0.7055424451828003, + "learning_rate": 6.56612672019108e-09, + "loss": 0.2046, + "step": 20990 + }, + { + "epoch": 1.9774381196863002, + "grad_norm": 0.6616795659065247, + "learning_rate": 6.511528919449505e-09, + "loss": 0.1805, + "step": 20991 + }, + { + "epoch": 1.9775323237794682, + "grad_norm": 0.7356581091880798, + "learning_rate": 6.457158985699874e-09, + "loss": 0.1928, + "step": 20992 + }, + { + "epoch": 1.9776265278726362, + "grad_norm": 0.7133409976959229, + "learning_rate": 6.403016920182304e-09, + "loss": 0.2264, + "step": 20993 + }, + { + "epoch": 1.977720731965804, + "grad_norm": 0.7125285267829895, + "learning_rate": 6.3491027241313615e-09, + "loss": 0.174, + "step": 20994 + }, + { + "epoch": 1.9778149360589716, + "grad_norm": 0.6706700325012207, + "learning_rate": 6.295416398777176e-09, + "loss": 0.2053, + "step": 20995 + }, + { + "epoch": 1.9779091401521396, + "grad_norm": 0.6878630518913269, + "learning_rate": 6.241957945342103e-09, + "loss": 0.2206, + "step": 20996 + }, + { + "epoch": 1.9780033442453075, + "grad_norm": 0.671504557132721, + "learning_rate": 6.188727365046276e-09, + "loss": 0.227, + "step": 20997 + }, + { + "epoch": 1.9780975483384753, + "grad_norm": 0.6460820436477661, + "learning_rate": 6.135724659103171e-09, + "loss": 0.1885, + "step": 20998 + }, + { + "epoch": 1.978191752431643, + "grad_norm": 0.6574946045875549, + "learning_rate": 6.082949828721818e-09, + "loss": 0.1643, + "step": 20999 + }, + { + "epoch": 1.978285956524811, + "grad_norm": 0.6905536651611328, + "learning_rate": 6.030402875104591e-09, + "loss": 0.2162, + "step": 21000 + }, + { + "epoch": 1.978380160617979, + "grad_norm": 0.7331179976463318, + "learning_rate": 5.978083799450529e-09, + "loss": 0.1888, + "step": 21001 + }, + { + "epoch": 1.9784743647111467, + "grad_norm": 0.6354816555976868, + "learning_rate": 5.925992602952013e-09, + "loss": 0.187, + "step": 21002 + }, + { + "epoch": 1.9785685688043144, + "grad_norm": 0.6240286231040955, + "learning_rate": 5.8741292867969815e-09, + "loss": 0.1625, + "step": 21003 + }, + { + "epoch": 1.9786627728974824, + "grad_norm": 0.7908602952957153, + "learning_rate": 5.8224938521678206e-09, + "loss": 0.198, + "step": 21004 + }, + { + "epoch": 1.9787569769906503, + "grad_norm": 0.598463773727417, + "learning_rate": 5.771086300242479e-09, + "loss": 0.1771, + "step": 21005 + }, + { + "epoch": 1.978851181083818, + "grad_norm": 0.7266849875450134, + "learning_rate": 5.719906632193351e-09, + "loss": 0.203, + "step": 21006 + }, + { + "epoch": 1.9789453851769858, + "grad_norm": 0.6000388264656067, + "learning_rate": 5.66895484918617e-09, + "loss": 0.1789, + "step": 21007 + }, + { + "epoch": 1.9790395892701538, + "grad_norm": 0.6529139280319214, + "learning_rate": 5.618230952382231e-09, + "loss": 0.2196, + "step": 21008 + }, + { + "epoch": 1.9791337933633217, + "grad_norm": 0.7088423371315002, + "learning_rate": 5.567734942940605e-09, + "loss": 0.2085, + "step": 21009 + }, + { + "epoch": 1.9792279974564895, + "grad_norm": 0.6654103994369507, + "learning_rate": 5.517466822011486e-09, + "loss": 0.1979, + "step": 21010 + }, + { + "epoch": 1.9793222015496572, + "grad_norm": 0.6654288172721863, + "learning_rate": 5.467426590739511e-09, + "loss": 0.2016, + "step": 21011 + }, + { + "epoch": 1.9794164056428252, + "grad_norm": 0.6455620527267456, + "learning_rate": 5.41761425026821e-09, + "loss": 0.1829, + "step": 21012 + }, + { + "epoch": 1.9795106097359931, + "grad_norm": 0.7251503467559814, + "learning_rate": 5.368029801732233e-09, + "loss": 0.1934, + "step": 21013 + }, + { + "epoch": 1.9796048138291609, + "grad_norm": 0.6838080883026123, + "learning_rate": 5.318673246261785e-09, + "loss": 0.2239, + "step": 21014 + }, + { + "epoch": 1.9796990179223286, + "grad_norm": 0.644503116607666, + "learning_rate": 5.269544584982633e-09, + "loss": 0.1811, + "step": 21015 + }, + { + "epoch": 1.9797932220154966, + "grad_norm": 0.7098379135131836, + "learning_rate": 5.220643819014992e-09, + "loss": 0.2072, + "step": 21016 + }, + { + "epoch": 1.9798874261086645, + "grad_norm": 0.6474086046218872, + "learning_rate": 5.171970949473526e-09, + "loss": 0.1932, + "step": 21017 + }, + { + "epoch": 1.9799816302018323, + "grad_norm": 0.7026640772819519, + "learning_rate": 5.1235259774695675e-09, + "loss": 0.1896, + "step": 21018 + }, + { + "epoch": 1.980075834295, + "grad_norm": 0.6126406788825989, + "learning_rate": 5.0753089041055695e-09, + "loss": 0.2013, + "step": 21019 + }, + { + "epoch": 1.980170038388168, + "grad_norm": 0.7585750818252563, + "learning_rate": 5.027319730481761e-09, + "loss": 0.1909, + "step": 21020 + }, + { + "epoch": 1.980264242481336, + "grad_norm": 0.6204404830932617, + "learning_rate": 4.979558457693934e-09, + "loss": 0.1926, + "step": 21021 + }, + { + "epoch": 1.9803584465745037, + "grad_norm": 0.6853671073913574, + "learning_rate": 4.9320250868289955e-09, + "loss": 0.1982, + "step": 21022 + }, + { + "epoch": 1.9804526506676714, + "grad_norm": 0.5925233364105225, + "learning_rate": 4.884719618971634e-09, + "loss": 0.1758, + "step": 21023 + }, + { + "epoch": 1.9805468547608394, + "grad_norm": 0.6932353973388672, + "learning_rate": 4.837642055199876e-09, + "loss": 0.2079, + "step": 21024 + }, + { + "epoch": 1.9806410588540073, + "grad_norm": 0.7288113236427307, + "learning_rate": 4.790792396588417e-09, + "loss": 0.211, + "step": 21025 + }, + { + "epoch": 1.980735262947175, + "grad_norm": 0.6650190353393555, + "learning_rate": 4.744170644204182e-09, + "loss": 0.2163, + "step": 21026 + }, + { + "epoch": 1.9808294670403428, + "grad_norm": 0.6971545815467834, + "learning_rate": 4.697776799110764e-09, + "loss": 0.195, + "step": 21027 + }, + { + "epoch": 1.9809236711335108, + "grad_norm": 0.6654476523399353, + "learning_rate": 4.651610862366207e-09, + "loss": 0.1858, + "step": 21028 + }, + { + "epoch": 1.9810178752266787, + "grad_norm": 0.6698181629180908, + "learning_rate": 4.605672835024111e-09, + "loss": 0.1772, + "step": 21029 + }, + { + "epoch": 1.9811120793198465, + "grad_norm": 0.6747551560401917, + "learning_rate": 4.559962718129196e-09, + "loss": 0.1806, + "step": 21030 + }, + { + "epoch": 1.9812062834130142, + "grad_norm": 0.6727416515350342, + "learning_rate": 4.514480512726183e-09, + "loss": 0.2066, + "step": 21031 + }, + { + "epoch": 1.9813004875061822, + "grad_norm": 0.6404261589050293, + "learning_rate": 4.469226219852019e-09, + "loss": 0.1907, + "step": 21032 + }, + { + "epoch": 1.98139469159935, + "grad_norm": 0.6485568284988403, + "learning_rate": 4.424199840536991e-09, + "loss": 0.1919, + "step": 21033 + }, + { + "epoch": 1.9814888956925178, + "grad_norm": 0.7125912308692932, + "learning_rate": 4.379401375809167e-09, + "loss": 0.2046, + "step": 21034 + }, + { + "epoch": 1.9815830997856856, + "grad_norm": 0.6653649806976318, + "learning_rate": 4.3348308266888405e-09, + "loss": 0.1825, + "step": 21035 + }, + { + "epoch": 1.9816773038788535, + "grad_norm": 0.6265192627906799, + "learning_rate": 4.290488194192976e-09, + "loss": 0.1716, + "step": 21036 + }, + { + "epoch": 1.9817715079720215, + "grad_norm": 0.735074520111084, + "learning_rate": 4.246373479332988e-09, + "loss": 0.221, + "step": 21037 + }, + { + "epoch": 1.9818657120651892, + "grad_norm": 0.6242669224739075, + "learning_rate": 4.202486683114737e-09, + "loss": 0.1945, + "step": 21038 + }, + { + "epoch": 1.981959916158357, + "grad_norm": 0.6551904678344727, + "learning_rate": 4.158827806538534e-09, + "loss": 0.2021, + "step": 21039 + }, + { + "epoch": 1.982054120251525, + "grad_norm": 0.7333979606628418, + "learning_rate": 4.11539685059914e-09, + "loss": 0.2229, + "step": 21040 + }, + { + "epoch": 1.982148324344693, + "grad_norm": 0.6325864195823669, + "learning_rate": 4.072193816287983e-09, + "loss": 0.1741, + "step": 21041 + }, + { + "epoch": 1.9822425284378606, + "grad_norm": 0.6482536196708679, + "learning_rate": 4.029218704589832e-09, + "loss": 0.1838, + "step": 21042 + }, + { + "epoch": 1.9823367325310284, + "grad_norm": 0.7280809879302979, + "learning_rate": 3.986471516485013e-09, + "loss": 0.1788, + "step": 21043 + }, + { + "epoch": 1.9824309366241963, + "grad_norm": 0.6607115268707275, + "learning_rate": 3.943952252947192e-09, + "loss": 0.2105, + "step": 21044 + }, + { + "epoch": 1.9825251407173643, + "grad_norm": 0.6494821310043335, + "learning_rate": 3.901660914946703e-09, + "loss": 0.1802, + "step": 21045 + }, + { + "epoch": 1.982619344810532, + "grad_norm": 0.7037579417228699, + "learning_rate": 3.859597503448331e-09, + "loss": 0.1895, + "step": 21046 + }, + { + "epoch": 1.9827135489036998, + "grad_norm": 0.639551043510437, + "learning_rate": 3.817762019409088e-09, + "loss": 0.1785, + "step": 21047 + }, + { + "epoch": 1.9828077529968677, + "grad_norm": 0.7694389820098877, + "learning_rate": 3.776154463784875e-09, + "loss": 0.2015, + "step": 21048 + }, + { + "epoch": 1.9829019570900357, + "grad_norm": 0.6747332811355591, + "learning_rate": 3.7347748375238245e-09, + "loss": 0.1784, + "step": 21049 + }, + { + "epoch": 1.9829961611832034, + "grad_norm": 0.623313844203949, + "learning_rate": 3.6936231415696243e-09, + "loss": 0.1946, + "step": 21050 + }, + { + "epoch": 1.9830903652763712, + "grad_norm": 0.5907295346260071, + "learning_rate": 3.6526993768604134e-09, + "loss": 0.1746, + "step": 21051 + }, + { + "epoch": 1.9831845693695391, + "grad_norm": 0.6866522431373596, + "learning_rate": 3.6120035443287792e-09, + "loss": 0.1809, + "step": 21052 + }, + { + "epoch": 1.983278773462707, + "grad_norm": 0.6459100842475891, + "learning_rate": 3.5715356449039786e-09, + "loss": 0.1776, + "step": 21053 + }, + { + "epoch": 1.9833729775558748, + "grad_norm": 0.6088927388191223, + "learning_rate": 3.531295679507496e-09, + "loss": 0.185, + "step": 21054 + }, + { + "epoch": 1.9834671816490426, + "grad_norm": 0.6171404123306274, + "learning_rate": 3.4912836490574863e-09, + "loss": 0.1779, + "step": 21055 + }, + { + "epoch": 1.9835613857422105, + "grad_norm": 0.5789969563484192, + "learning_rate": 3.451499554465443e-09, + "loss": 0.1727, + "step": 21056 + }, + { + "epoch": 1.9836555898353785, + "grad_norm": 0.6608253717422485, + "learning_rate": 3.411943396639528e-09, + "loss": 0.2095, + "step": 21057 + }, + { + "epoch": 1.9837497939285462, + "grad_norm": 0.701216995716095, + "learning_rate": 3.3726151764812422e-09, + "loss": 0.1964, + "step": 21058 + }, + { + "epoch": 1.983843998021714, + "grad_norm": 0.6264675259590149, + "learning_rate": 3.333514894887646e-09, + "loss": 0.1879, + "step": 21059 + }, + { + "epoch": 1.983938202114882, + "grad_norm": 0.6653956770896912, + "learning_rate": 3.294642552750249e-09, + "loss": 0.2088, + "step": 21060 + }, + { + "epoch": 1.9840324062080499, + "grad_norm": 0.6880512237548828, + "learning_rate": 3.2559981509550087e-09, + "loss": 0.2115, + "step": 21061 + }, + { + "epoch": 1.9841266103012176, + "grad_norm": 0.6396224498748779, + "learning_rate": 3.2175816903834423e-09, + "loss": 0.1788, + "step": 21062 + }, + { + "epoch": 1.9842208143943854, + "grad_norm": 0.6308459043502808, + "learning_rate": 3.1793931719104054e-09, + "loss": 0.1938, + "step": 21063 + }, + { + "epoch": 1.9843150184875533, + "grad_norm": 0.6442133784294128, + "learning_rate": 3.1414325964085335e-09, + "loss": 0.1815, + "step": 21064 + }, + { + "epoch": 1.9844092225807213, + "grad_norm": 0.6585400700569153, + "learning_rate": 3.1036999647426903e-09, + "loss": 0.1702, + "step": 21065 + }, + { + "epoch": 1.984503426673889, + "grad_norm": 0.6935811042785645, + "learning_rate": 3.066195277772188e-09, + "loss": 0.1974, + "step": 21066 + }, + { + "epoch": 1.9845976307670568, + "grad_norm": 0.6294695734977722, + "learning_rate": 3.0289185363530095e-09, + "loss": 0.1808, + "step": 21067 + }, + { + "epoch": 1.9846918348602247, + "grad_norm": 0.7599447965621948, + "learning_rate": 2.9918697413355844e-09, + "loss": 0.2071, + "step": 21068 + }, + { + "epoch": 1.9847860389533927, + "grad_norm": 0.6241219639778137, + "learning_rate": 2.9550488935636834e-09, + "loss": 0.1812, + "step": 21069 + }, + { + "epoch": 1.9848802430465604, + "grad_norm": 0.6321325302124023, + "learning_rate": 2.9184559938777445e-09, + "loss": 0.1707, + "step": 21070 + }, + { + "epoch": 1.9849744471397281, + "grad_norm": 0.6703804731369019, + "learning_rate": 2.882091043112656e-09, + "loss": 0.1764, + "step": 21071 + }, + { + "epoch": 1.985068651232896, + "grad_norm": 0.6230263710021973, + "learning_rate": 2.8459540420955333e-09, + "loss": 0.1956, + "step": 21072 + }, + { + "epoch": 1.985162855326064, + "grad_norm": 0.6744650602340698, + "learning_rate": 2.810044991651273e-09, + "loss": 0.1939, + "step": 21073 + }, + { + "epoch": 1.9852570594192318, + "grad_norm": 0.62555992603302, + "learning_rate": 2.774363892600329e-09, + "loss": 0.1987, + "step": 21074 + }, + { + "epoch": 1.9853512635123995, + "grad_norm": 0.6785553693771362, + "learning_rate": 2.7389107457542753e-09, + "loss": 0.1902, + "step": 21075 + }, + { + "epoch": 1.9854454676055675, + "grad_norm": 0.6026608347892761, + "learning_rate": 2.7036855519213532e-09, + "loss": 0.1864, + "step": 21076 + }, + { + "epoch": 1.9855396716987355, + "grad_norm": 0.7131816744804382, + "learning_rate": 2.6686883119064755e-09, + "loss": 0.2175, + "step": 21077 + }, + { + "epoch": 1.9856338757919032, + "grad_norm": 0.6569606065750122, + "learning_rate": 2.6339190265056714e-09, + "loss": 0.1861, + "step": 21078 + }, + { + "epoch": 1.985728079885071, + "grad_norm": 0.6766698956489563, + "learning_rate": 2.599377696513861e-09, + "loss": 0.1809, + "step": 21079 + }, + { + "epoch": 1.985822283978239, + "grad_norm": 0.6847541928291321, + "learning_rate": 2.565064322717081e-09, + "loss": 0.1806, + "step": 21080 + }, + { + "epoch": 1.9859164880714069, + "grad_norm": 0.6673032641410828, + "learning_rate": 2.5309789058980405e-09, + "loss": 0.1815, + "step": 21081 + }, + { + "epoch": 1.9860106921645746, + "grad_norm": 0.6046951413154602, + "learning_rate": 2.4971214468338945e-09, + "loss": 0.177, + "step": 21082 + }, + { + "epoch": 1.9861048962577423, + "grad_norm": 0.6393413543701172, + "learning_rate": 2.4634919462973584e-09, + "loss": 0.1649, + "step": 21083 + }, + { + "epoch": 1.9861991003509103, + "grad_norm": 0.6841180324554443, + "learning_rate": 2.430090405054486e-09, + "loss": 0.1977, + "step": 21084 + }, + { + "epoch": 1.9862933044440783, + "grad_norm": 0.6634268164634705, + "learning_rate": 2.3969168238668906e-09, + "loss": 0.201, + "step": 21085 + }, + { + "epoch": 1.986387508537246, + "grad_norm": 0.7379873394966125, + "learning_rate": 2.3639712034906336e-09, + "loss": 0.231, + "step": 21086 + }, + { + "epoch": 1.9864817126304137, + "grad_norm": 0.6023994088172913, + "learning_rate": 2.331253544678447e-09, + "loss": 0.1829, + "step": 21087 + }, + { + "epoch": 1.9865759167235817, + "grad_norm": 0.6844269633293152, + "learning_rate": 2.2987638481752892e-09, + "loss": 0.1836, + "step": 21088 + }, + { + "epoch": 1.9866701208167497, + "grad_norm": 0.6246742010116577, + "learning_rate": 2.26650211472168e-09, + "loss": 0.1885, + "step": 21089 + }, + { + "epoch": 1.9867643249099174, + "grad_norm": 0.7646008729934692, + "learning_rate": 2.2344683450536977e-09, + "loss": 0.2014, + "step": 21090 + }, + { + "epoch": 1.9868585290030851, + "grad_norm": 0.585985004901886, + "learning_rate": 2.2026625399018675e-09, + "loss": 0.1718, + "step": 21091 + }, + { + "epoch": 1.986952733096253, + "grad_norm": 0.6901452541351318, + "learning_rate": 2.1710846999911663e-09, + "loss": 0.202, + "step": 21092 + }, + { + "epoch": 1.9870469371894208, + "grad_norm": 0.6707181334495544, + "learning_rate": 2.139734826041018e-09, + "loss": 0.2082, + "step": 21093 + }, + { + "epoch": 1.9871411412825886, + "grad_norm": 0.636265218257904, + "learning_rate": 2.108612918767516e-09, + "loss": 0.1917, + "step": 21094 + }, + { + "epoch": 1.9872353453757565, + "grad_norm": 0.6749941110610962, + "learning_rate": 2.0777189788800944e-09, + "loss": 0.207, + "step": 21095 + }, + { + "epoch": 1.9873295494689245, + "grad_norm": 0.652577817440033, + "learning_rate": 2.047053007081523e-09, + "loss": 0.1886, + "step": 21096 + }, + { + "epoch": 1.9874237535620922, + "grad_norm": 0.6920233964920044, + "learning_rate": 2.0166150040734634e-09, + "loss": 0.1792, + "step": 21097 + }, + { + "epoch": 1.98751795765526, + "grad_norm": 0.5810310244560242, + "learning_rate": 1.9864049705486942e-09, + "loss": 0.1831, + "step": 21098 + }, + { + "epoch": 1.987612161748428, + "grad_norm": 0.6675482988357544, + "learning_rate": 1.9564229071955542e-09, + "loss": 0.1858, + "step": 21099 + }, + { + "epoch": 1.9877063658415959, + "grad_norm": 0.7007507085800171, + "learning_rate": 1.9266688146979406e-09, + "loss": 0.1904, + "step": 21100 + }, + { + "epoch": 1.9878005699347636, + "grad_norm": 0.7041182518005371, + "learning_rate": 1.89714269373531e-09, + "loss": 0.2054, + "step": 21101 + }, + { + "epoch": 1.9878947740279314, + "grad_norm": 0.6133896708488464, + "learning_rate": 1.8678445449804574e-09, + "loss": 0.1738, + "step": 21102 + }, + { + "epoch": 1.9879889781210993, + "grad_norm": 0.598605751991272, + "learning_rate": 1.8387743691006267e-09, + "loss": 0.1745, + "step": 21103 + }, + { + "epoch": 1.9880831822142673, + "grad_norm": 0.6418994665145874, + "learning_rate": 1.8099321667586211e-09, + "loss": 0.1743, + "step": 21104 + }, + { + "epoch": 1.988177386307435, + "grad_norm": 0.6561501026153564, + "learning_rate": 1.7813179386139134e-09, + "loss": 0.1717, + "step": 21105 + }, + { + "epoch": 1.9882715904006028, + "grad_norm": 0.667719841003418, + "learning_rate": 1.7529316853170941e-09, + "loss": 0.1967, + "step": 21106 + }, + { + "epoch": 1.9883657944937707, + "grad_norm": 0.799945592880249, + "learning_rate": 1.7247734075154232e-09, + "loss": 0.1945, + "step": 21107 + }, + { + "epoch": 1.9884599985869387, + "grad_norm": 0.6253148913383484, + "learning_rate": 1.69684310585283e-09, + "loss": 0.2079, + "step": 21108 + }, + { + "epoch": 1.9885542026801064, + "grad_norm": 0.6322131156921387, + "learning_rate": 1.6691407809643624e-09, + "loss": 0.2096, + "step": 21109 + }, + { + "epoch": 1.9886484067732741, + "grad_norm": 0.6566372513771057, + "learning_rate": 1.6416664334817368e-09, + "loss": 0.1995, + "step": 21110 + }, + { + "epoch": 1.988742610866442, + "grad_norm": 0.6330196261405945, + "learning_rate": 1.6144200640322294e-09, + "loss": 0.1688, + "step": 21111 + }, + { + "epoch": 1.98883681495961, + "grad_norm": 0.6636334657669067, + "learning_rate": 1.587401673236455e-09, + "loss": 0.1712, + "step": 21112 + }, + { + "epoch": 1.9889310190527778, + "grad_norm": 0.6503570079803467, + "learning_rate": 1.5606112617105873e-09, + "loss": 0.184, + "step": 21113 + }, + { + "epoch": 1.9890252231459455, + "grad_norm": 0.6788511872291565, + "learning_rate": 1.5340488300663592e-09, + "loss": 0.2023, + "step": 21114 + }, + { + "epoch": 1.9891194272391135, + "grad_norm": 0.6875736713409424, + "learning_rate": 1.5077143789088423e-09, + "loss": 0.204, + "step": 21115 + }, + { + "epoch": 1.9892136313322815, + "grad_norm": 0.6112891435623169, + "learning_rate": 1.4816079088375567e-09, + "loss": 0.1792, + "step": 21116 + }, + { + "epoch": 1.9893078354254492, + "grad_norm": 0.623278796672821, + "learning_rate": 1.4557294204498028e-09, + "loss": 0.2012, + "step": 21117 + }, + { + "epoch": 1.989402039518617, + "grad_norm": 0.622858464717865, + "learning_rate": 1.4300789143328887e-09, + "loss": 0.1846, + "step": 21118 + }, + { + "epoch": 1.989496243611785, + "grad_norm": 0.6334249377250671, + "learning_rate": 1.4046563910741218e-09, + "loss": 0.1719, + "step": 21119 + }, + { + "epoch": 1.9895904477049529, + "grad_norm": 0.6933590173721313, + "learning_rate": 1.379461851253039e-09, + "loss": 0.2181, + "step": 21120 + }, + { + "epoch": 1.9896846517981206, + "grad_norm": 0.6153637170791626, + "learning_rate": 1.354495295442515e-09, + "loss": 0.1806, + "step": 21121 + }, + { + "epoch": 1.9897788558912883, + "grad_norm": 1.0160863399505615, + "learning_rate": 1.3297567242120945e-09, + "loss": 0.1945, + "step": 21122 + }, + { + "epoch": 1.9898730599844563, + "grad_norm": 0.7127150893211365, + "learning_rate": 1.3052461381279912e-09, + "loss": 0.1917, + "step": 21123 + }, + { + "epoch": 1.9899672640776243, + "grad_norm": 0.6873974204063416, + "learning_rate": 1.2809635377464268e-09, + "loss": 0.1915, + "step": 21124 + }, + { + "epoch": 1.990061468170792, + "grad_norm": 0.6947431564331055, + "learning_rate": 1.256908923622513e-09, + "loss": 0.2103, + "step": 21125 + }, + { + "epoch": 1.9901556722639597, + "grad_norm": 0.6591554284095764, + "learning_rate": 1.2330822963046996e-09, + "loss": 0.1967, + "step": 21126 + }, + { + "epoch": 1.9902498763571277, + "grad_norm": 0.6387174129486084, + "learning_rate": 1.2094836563358858e-09, + "loss": 0.1994, + "step": 21127 + }, + { + "epoch": 1.9903440804502956, + "grad_norm": 0.6355052590370178, + "learning_rate": 1.18611300425453e-09, + "loss": 0.2133, + "step": 21128 + }, + { + "epoch": 1.9904382845434634, + "grad_norm": 0.6713987588882446, + "learning_rate": 1.1629703405924286e-09, + "loss": 0.2125, + "step": 21129 + }, + { + "epoch": 1.9905324886366311, + "grad_norm": 0.7112892866134644, + "learning_rate": 1.1400556658780482e-09, + "loss": 0.211, + "step": 21130 + }, + { + "epoch": 1.990626692729799, + "grad_norm": 0.6970196962356567, + "learning_rate": 1.1173689806354137e-09, + "loss": 0.2274, + "step": 21131 + }, + { + "epoch": 1.990720896822967, + "grad_norm": 0.7993209362030029, + "learning_rate": 1.0949102853785586e-09, + "loss": 0.1852, + "step": 21132 + }, + { + "epoch": 1.9908151009161348, + "grad_norm": 0.6768918633460999, + "learning_rate": 1.0726795806226263e-09, + "loss": 0.1814, + "step": 21133 + }, + { + "epoch": 1.9909093050093025, + "grad_norm": 0.6346969604492188, + "learning_rate": 1.0506768668727685e-09, + "loss": 0.1727, + "step": 21134 + }, + { + "epoch": 1.9910035091024705, + "grad_norm": 0.6714830994606018, + "learning_rate": 1.0289021446308057e-09, + "loss": 0.2042, + "step": 21135 + }, + { + "epoch": 1.9910977131956384, + "grad_norm": 0.6177438497543335, + "learning_rate": 1.0073554143941178e-09, + "loss": 0.1928, + "step": 21136 + }, + { + "epoch": 1.9911919172888062, + "grad_norm": 0.8300781846046448, + "learning_rate": 9.860366766534234e-10, + "loss": 0.2055, + "step": 21137 + }, + { + "epoch": 1.991286121381974, + "grad_norm": 0.6905630826950073, + "learning_rate": 9.649459318950006e-10, + "loss": 0.1829, + "step": 21138 + }, + { + "epoch": 1.9913803254751419, + "grad_norm": 0.5799639225006104, + "learning_rate": 9.440831805984652e-10, + "loss": 0.1669, + "step": 21139 + }, + { + "epoch": 1.9914745295683098, + "grad_norm": 0.7242265939712524, + "learning_rate": 9.234484232423236e-10, + "loss": 0.1898, + "step": 21140 + }, + { + "epoch": 1.9915687336614776, + "grad_norm": 0.6649330854415894, + "learning_rate": 9.030416602939795e-10, + "loss": 0.1789, + "step": 21141 + }, + { + "epoch": 1.9916629377546453, + "grad_norm": 0.6277757287025452, + "learning_rate": 8.828628922208371e-10, + "loss": 0.1725, + "step": 21142 + }, + { + "epoch": 1.9917571418478133, + "grad_norm": 0.6722846031188965, + "learning_rate": 8.629121194825285e-10, + "loss": 0.1899, + "step": 21143 + }, + { + "epoch": 1.9918513459409812, + "grad_norm": 0.8008500933647156, + "learning_rate": 8.431893425342453e-10, + "loss": 0.1756, + "step": 21144 + }, + { + "epoch": 1.991945550034149, + "grad_norm": 0.5988976359367371, + "learning_rate": 8.236945618245174e-10, + "loss": 0.1752, + "step": 21145 + }, + { + "epoch": 1.9920397541273167, + "grad_norm": 0.6233307719230652, + "learning_rate": 8.044277777985443e-10, + "loss": 0.1696, + "step": 21146 + }, + { + "epoch": 1.9921339582204847, + "grad_norm": 0.7080352306365967, + "learning_rate": 7.853889908959744e-10, + "loss": 0.2124, + "step": 21147 + }, + { + "epoch": 1.9922281623136526, + "grad_norm": 0.6927669048309326, + "learning_rate": 7.665782015497946e-10, + "loss": 0.2214, + "step": 21148 + }, + { + "epoch": 1.9923223664068204, + "grad_norm": 0.6810639500617981, + "learning_rate": 7.479954101907716e-10, + "loss": 0.2008, + "step": 21149 + }, + { + "epoch": 1.992416570499988, + "grad_norm": 0.6719371676445007, + "learning_rate": 7.296406172407899e-10, + "loss": 0.1887, + "step": 21150 + }, + { + "epoch": 1.992510774593156, + "grad_norm": 0.6678481698036194, + "learning_rate": 7.11513823119514e-10, + "loss": 0.1964, + "step": 21151 + }, + { + "epoch": 1.992604978686324, + "grad_norm": 0.7267085909843445, + "learning_rate": 6.936150282399467e-10, + "loss": 0.1981, + "step": 21152 + }, + { + "epoch": 1.9926991827794918, + "grad_norm": 0.6565226316452026, + "learning_rate": 6.759442330106503e-10, + "loss": 0.2088, + "step": 21153 + }, + { + "epoch": 1.9927933868726595, + "grad_norm": 0.6537086963653564, + "learning_rate": 6.585014378335253e-10, + "loss": 0.1796, + "step": 21154 + }, + { + "epoch": 1.9928875909658275, + "grad_norm": 0.7065827250480652, + "learning_rate": 6.412866431071418e-10, + "loss": 0.1854, + "step": 21155 + }, + { + "epoch": 1.9929817950589954, + "grad_norm": 0.6767070293426514, + "learning_rate": 6.242998492234087e-10, + "loss": 0.2037, + "step": 21156 + }, + { + "epoch": 1.9930759991521632, + "grad_norm": 0.6987535357475281, + "learning_rate": 6.075410565697937e-10, + "loss": 0.1934, + "step": 21157 + }, + { + "epoch": 1.993170203245331, + "grad_norm": 0.6832001805305481, + "learning_rate": 5.910102655293237e-10, + "loss": 0.1842, + "step": 21158 + }, + { + "epoch": 1.9932644073384989, + "grad_norm": 0.6096380949020386, + "learning_rate": 5.747074764783645e-10, + "loss": 0.1772, + "step": 21159 + }, + { + "epoch": 1.9933586114316668, + "grad_norm": 0.6690545678138733, + "learning_rate": 5.586326897877303e-10, + "loss": 0.2058, + "step": 21160 + }, + { + "epoch": 1.9934528155248346, + "grad_norm": 0.6548620462417603, + "learning_rate": 5.427859058249052e-10, + "loss": 0.1867, + "step": 21161 + }, + { + "epoch": 1.9935470196180023, + "grad_norm": 0.6081674695014954, + "learning_rate": 5.271671249507116e-10, + "loss": 0.1811, + "step": 21162 + }, + { + "epoch": 1.9936412237111703, + "grad_norm": 0.6763384938240051, + "learning_rate": 5.11776347521531e-10, + "loss": 0.1969, + "step": 21163 + }, + { + "epoch": 1.9937354278043382, + "grad_norm": 0.621370792388916, + "learning_rate": 4.966135738893041e-10, + "loss": 0.189, + "step": 21164 + }, + { + "epoch": 1.993829631897506, + "grad_norm": 0.6853028535842896, + "learning_rate": 4.816788043982001e-10, + "loss": 0.2095, + "step": 21165 + }, + { + "epoch": 1.9939238359906737, + "grad_norm": 0.674619734287262, + "learning_rate": 4.669720393890575e-10, + "loss": 0.189, + "step": 21166 + }, + { + "epoch": 1.9940180400838416, + "grad_norm": 0.7634488344192505, + "learning_rate": 4.524932791982739e-10, + "loss": 0.1928, + "step": 21167 + }, + { + "epoch": 1.9941122441770096, + "grad_norm": 0.6809353232383728, + "learning_rate": 4.382425241544752e-10, + "loss": 0.1871, + "step": 21168 + }, + { + "epoch": 1.9942064482701773, + "grad_norm": 0.7955659031867981, + "learning_rate": 4.2421977458406706e-10, + "loss": 0.1707, + "step": 21169 + }, + { + "epoch": 1.994300652363345, + "grad_norm": 0.6505053043365479, + "learning_rate": 4.104250308056834e-10, + "loss": 0.2144, + "step": 21170 + }, + { + "epoch": 1.994394856456513, + "grad_norm": 0.6431297659873962, + "learning_rate": 3.9685829313462765e-10, + "loss": 0.1708, + "step": 21171 + }, + { + "epoch": 1.994489060549681, + "grad_norm": 0.641369640827179, + "learning_rate": 3.8351956187954175e-10, + "loss": 0.201, + "step": 21172 + }, + { + "epoch": 1.9945832646428487, + "grad_norm": 0.6753309369087219, + "learning_rate": 3.7040883734462685e-10, + "loss": 0.1774, + "step": 21173 + }, + { + "epoch": 1.9946774687360165, + "grad_norm": 0.6169741153717041, + "learning_rate": 3.575261198296431e-10, + "loss": 0.1898, + "step": 21174 + }, + { + "epoch": 1.9947716728291844, + "grad_norm": 0.7171098589897156, + "learning_rate": 3.448714096276895e-10, + "loss": 0.1973, + "step": 21175 + }, + { + "epoch": 1.9948658769223524, + "grad_norm": 0.6104955077171326, + "learning_rate": 3.3244470702742393e-10, + "loss": 0.1667, + "step": 21176 + }, + { + "epoch": 1.9949600810155201, + "grad_norm": 0.6183255314826965, + "learning_rate": 3.202460123119533e-10, + "loss": 0.1809, + "step": 21177 + }, + { + "epoch": 1.9950542851086879, + "grad_norm": 0.6417047381401062, + "learning_rate": 3.0827532575994355e-10, + "loss": 0.1965, + "step": 21178 + }, + { + "epoch": 1.9951484892018558, + "grad_norm": 0.6793981790542603, + "learning_rate": 2.965326476445096e-10, + "loss": 0.1922, + "step": 21179 + }, + { + "epoch": 1.9952426932950238, + "grad_norm": 0.6903060674667358, + "learning_rate": 2.85017978232105e-10, + "loss": 0.1954, + "step": 21180 + }, + { + "epoch": 1.9953368973881915, + "grad_norm": 0.618532657623291, + "learning_rate": 2.7373131778696273e-10, + "loss": 0.1777, + "step": 21181 + }, + { + "epoch": 1.9954311014813593, + "grad_norm": 0.6335455775260925, + "learning_rate": 2.6267266656443415e-10, + "loss": 0.226, + "step": 21182 + }, + { + "epoch": 1.9955253055745272, + "grad_norm": 0.6843485832214355, + "learning_rate": 2.518420248187603e-10, + "loss": 0.2062, + "step": 21183 + }, + { + "epoch": 1.9956195096676952, + "grad_norm": 0.6743312478065491, + "learning_rate": 2.4123939279641073e-10, + "loss": 0.2091, + "step": 21184 + }, + { + "epoch": 1.995713713760863, + "grad_norm": 0.6582971811294556, + "learning_rate": 2.308647707371936e-10, + "loss": 0.1927, + "step": 21185 + }, + { + "epoch": 1.9958079178540307, + "grad_norm": 0.6203919053077698, + "learning_rate": 2.2071815888091708e-10, + "loss": 0.1995, + "step": 21186 + }, + { + "epoch": 1.9959021219471986, + "grad_norm": 0.6721940040588379, + "learning_rate": 2.107995574562871e-10, + "loss": 0.2133, + "step": 21187 + }, + { + "epoch": 1.9959963260403666, + "grad_norm": 0.6331708431243896, + "learning_rate": 2.011089666908994e-10, + "loss": 0.2051, + "step": 21188 + }, + { + "epoch": 1.9960905301335343, + "grad_norm": 0.6765003204345703, + "learning_rate": 1.9164638680457813e-10, + "loss": 0.1733, + "step": 21189 + }, + { + "epoch": 1.996184734226702, + "grad_norm": 0.6852294206619263, + "learning_rate": 1.824118180138168e-10, + "loss": 0.219, + "step": 21190 + }, + { + "epoch": 1.99627893831987, + "grad_norm": 0.6871711015701294, + "learning_rate": 1.7340526052955775e-10, + "loss": 0.2076, + "step": 21191 + }, + { + "epoch": 1.996373142413038, + "grad_norm": 0.6813973784446716, + "learning_rate": 1.6462671455608202e-10, + "loss": 0.1989, + "step": 21192 + }, + { + "epoch": 1.9964673465062057, + "grad_norm": 0.5848715901374817, + "learning_rate": 1.5607618029434003e-10, + "loss": 0.176, + "step": 21193 + }, + { + "epoch": 1.9965615505993735, + "grad_norm": 0.6575416922569275, + "learning_rate": 1.4775365793973096e-10, + "loss": 0.194, + "step": 21194 + }, + { + "epoch": 1.9966557546925414, + "grad_norm": 0.7851409912109375, + "learning_rate": 1.3965914768099277e-10, + "loss": 0.1856, + "step": 21195 + }, + { + "epoch": 1.9967499587857094, + "grad_norm": 0.6446042656898499, + "learning_rate": 1.3179264970242246e-10, + "loss": 0.172, + "step": 21196 + }, + { + "epoch": 1.9968441628788771, + "grad_norm": 0.6077773571014404, + "learning_rate": 1.2415416418498639e-10, + "loss": 0.1735, + "step": 21197 + }, + { + "epoch": 1.9969383669720449, + "grad_norm": 0.6568995118141174, + "learning_rate": 1.1674369130187935e-10, + "loss": 0.1956, + "step": 21198 + }, + { + "epoch": 1.9970325710652128, + "grad_norm": 0.6406089663505554, + "learning_rate": 1.0956123122185524e-10, + "loss": 0.2155, + "step": 21199 + }, + { + "epoch": 1.9971267751583808, + "grad_norm": 0.6521788239479065, + "learning_rate": 1.0260678410922709e-10, + "loss": 0.2187, + "step": 21200 + }, + { + "epoch": 1.9972209792515485, + "grad_norm": 1.0811222791671753, + "learning_rate": 9.588035012164654e-11, + "loss": 0.1717, + "step": 21201 + }, + { + "epoch": 1.9973151833447162, + "grad_norm": 0.7579187750816345, + "learning_rate": 8.93819294134346e-11, + "loss": 0.1841, + "step": 21202 + }, + { + "epoch": 1.9974093874378842, + "grad_norm": 0.6129136681556702, + "learning_rate": 8.311152213336116e-11, + "loss": 0.1583, + "step": 21203 + }, + { + "epoch": 1.9975035915310522, + "grad_norm": 0.6253575086593628, + "learning_rate": 7.706912842242453e-11, + "loss": 0.1878, + "step": 21204 + }, + { + "epoch": 1.99759779562422, + "grad_norm": 0.6372220516204834, + "learning_rate": 7.125474842051283e-11, + "loss": 0.2136, + "step": 21205 + }, + { + "epoch": 1.9976919997173876, + "grad_norm": 0.6634848713874817, + "learning_rate": 6.566838225752215e-11, + "loss": 0.1663, + "step": 21206 + }, + { + "epoch": 1.9977862038105556, + "grad_norm": 0.6424389481544495, + "learning_rate": 6.031003006445879e-11, + "loss": 0.2077, + "step": 21207 + }, + { + "epoch": 1.9978804079037236, + "grad_norm": 0.710203230381012, + "learning_rate": 5.5179691960116634e-11, + "loss": 0.2175, + "step": 21208 + }, + { + "epoch": 1.9979746119968913, + "grad_norm": 0.8055575489997864, + "learning_rate": 5.027736806328953e-11, + "loss": 0.194, + "step": 21209 + }, + { + "epoch": 1.998068816090059, + "grad_norm": 0.5930612683296204, + "learning_rate": 4.56030584849998e-11, + "loss": 0.1555, + "step": 21210 + }, + { + "epoch": 1.998163020183227, + "grad_norm": 0.6169185042381287, + "learning_rate": 4.1156763332939055e-11, + "loss": 0.1783, + "step": 21211 + }, + { + "epoch": 1.998257224276395, + "grad_norm": 0.6444560885429382, + "learning_rate": 3.693848270702738e-11, + "loss": 0.1945, + "step": 21212 + }, + { + "epoch": 1.9983514283695627, + "grad_norm": 0.6154327392578125, + "learning_rate": 3.29482167049644e-11, + "loss": 0.1705, + "step": 21213 + }, + { + "epoch": 1.9984456324627304, + "grad_norm": 0.6865419745445251, + "learning_rate": 2.9185965416678173e-11, + "loss": 0.1827, + "step": 21214 + }, + { + "epoch": 1.9985398365558984, + "grad_norm": 0.7709320187568665, + "learning_rate": 2.5651728927655882e-11, + "loss": 0.1861, + "step": 21215 + }, + { + "epoch": 1.9986340406490664, + "grad_norm": 0.6527905464172363, + "learning_rate": 2.2345507320054028e-11, + "loss": 0.2061, + "step": 21216 + }, + { + "epoch": 1.998728244742234, + "grad_norm": 0.696857213973999, + "learning_rate": 1.9267300667147327e-11, + "loss": 0.1955, + "step": 21217 + }, + { + "epoch": 1.9988224488354018, + "grad_norm": 0.6352835297584534, + "learning_rate": 1.6417109041100277e-11, + "loss": 0.1845, + "step": 21218 + }, + { + "epoch": 1.9989166529285698, + "grad_norm": 0.6665422916412354, + "learning_rate": 1.379493250519559e-11, + "loss": 0.173, + "step": 21219 + }, + { + "epoch": 1.9990108570217378, + "grad_norm": 0.6305170655250549, + "learning_rate": 1.1400771120495535e-11, + "loss": 0.1696, + "step": 21220 + }, + { + "epoch": 1.9991050611149055, + "grad_norm": 0.656869649887085, + "learning_rate": 9.234624940290815e-12, + "loss": 0.1995, + "step": 21221 + }, + { + "epoch": 1.9991992652080732, + "grad_norm": 0.6680158972740173, + "learning_rate": 7.296494015651689e-12, + "loss": 0.1975, + "step": 21222 + }, + { + "epoch": 1.9992934693012412, + "grad_norm": 0.6655610799789429, + "learning_rate": 5.586378389876856e-12, + "loss": 0.2161, + "step": 21223 + }, + { + "epoch": 1.9993876733944091, + "grad_norm": 0.6311565637588501, + "learning_rate": 4.104278102934345e-12, + "loss": 0.1804, + "step": 21224 + }, + { + "epoch": 1.9994818774875769, + "grad_norm": 0.6253184676170349, + "learning_rate": 2.8501931859103994e-12, + "loss": 0.1869, + "step": 21225 + }, + { + "epoch": 1.9995760815807446, + "grad_norm": 0.6699874401092529, + "learning_rate": 1.8241236698912645e-12, + "loss": 0.1773, + "step": 21226 + }, + { + "epoch": 1.9996702856739126, + "grad_norm": 0.697054922580719, + "learning_rate": 1.0260695781916241e-12, + "loss": 0.2091, + "step": 21227 + }, + { + "epoch": 1.9997644897670805, + "grad_norm": 0.6569732427597046, + "learning_rate": 4.560309274648234e-13, + "loss": 0.1943, + "step": 21228 + }, + { + "epoch": 1.9998586938602483, + "grad_norm": 0.660923182964325, + "learning_rate": 1.140077321437616e-13, + "loss": 0.1805, + "step": 21229 + }, + { + "epoch": 1.999952897953416, + "grad_norm": 0.5622615218162537, + "learning_rate": 0.0, + "loss": 0.1563, + "step": 21230 + }, + { + "epoch": 1.999952897953416, + "step": 21230, + "total_flos": 8.859100111364424e+17, + "train_loss": 0.27731341837475, + "train_runtime": 55817.536, + "train_samples_per_second": 36.514, + "train_steps_per_second": 0.38 + } + ], + "logging_steps": 1.0, + "max_steps": 21230, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.859100111364424e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}